this repo has no description
2
fork

Configure Feed

Select the types of activity you want to include in your feed.

Store full byte sample unversioned in memory

garrison c014f0d6 55783096

+163 -71
+119
lib/kv/byte_sample.ex
··· 1 + defmodule Hobbes.KV.ByteSample do 2 + alias Hobbes.Utils 3 + import Hobbes.Utils 4 + 5 + # Byte sample should be 1/250th the size of k/v data 6 + @byte_sample_factor 250 7 + # TODO: 250 is to rare for tests, buggify 8 + #@byte_sample_factor 2 9 + 10 + # Approximate overhead per sample (other than key size) 11 + # (currently just 8 bytes to store the size value as a string) 12 + @byte_sample_overhead_bytes 8 13 + 14 + @type t :: :ets.table 15 + 16 + @spec new :: t 17 + def new do 18 + :ets.new(__MODULE__, [:ordered_set, :private]) 19 + end 20 + 21 + @spec load(t, [{binary, binary}]) :: :ok 22 + def load(table, pairs) when is_list(pairs) do 23 + Enum.each(pairs, fn {k, v} when is_binary(k) and is_binary(v) -> 24 + bytes = decode_float(v) 25 + :ets.insert(table, {k, bytes}) 26 + end) 27 + :ok 28 + end 29 + 30 + @spec scan(t, binary, binary) :: [{binary, float}] 31 + def scan(table, start_key, end_key) do 32 + acc = 33 + case :ets.lookup(table, start_key) do 34 + [{^start_key, _size}] = result -> result 35 + [] -> [] 36 + end 37 + 38 + do_scan(table, end_key, start_key, acc) 39 + |> Enum.reverse() 40 + end 41 + 42 + defp do_scan(table, end_key, prev_key, acc) do 43 + case :ets.next_lookup(table, prev_key) do 44 + {_key, [{key, _size} = pair]} -> 45 + case key < end_key do 46 + true -> do_scan(table, end_key, key, [pair | acc]) 47 + false -> acc 48 + end 49 + 50 + :"$end_of_table" -> acc 51 + end 52 + end 53 + 54 + @spec apply_batch(t, [Utils.mutation]) :: [Utils.mutation] 55 + def apply_batch(table, mutations) when is_list(mutations) do 56 + mutations 57 + |> Enum.reduce([], fn 58 + {:write, k, v}, acc -> 59 + key_size = byte_size(k) 60 + pair_size = key_size + byte_size(v) 61 + probability = byte_sample_probability(key_size, pair_size) 62 + 63 + case (:erlang.phash2(k, 1000) / 1000) < probability do 64 + true -> 65 + # Correct for sampling probability (see comments in byte_sample_probability/2) 66 + sampled_size = pair_size / min(probability, 1) 67 + 68 + :ets.insert(table, {k, sampled_size}) 69 + mut = {:write, special_byte_sample_prefix() <> k, encode_float(sampled_size)} 70 + [mut | acc] 71 + 72 + false -> acc 73 + end 74 + 75 + {:clear, k}, acc -> 76 + case :ets.member(table, k) do 77 + true -> 78 + :ets.delete(table, k) 79 + mut = {:clear, special_byte_sample_prefix() <> k} 80 + [mut | acc] 81 + 82 + false -> acc 83 + end 84 + 85 + {:clear_range, _k, _v}, _acc -> 86 + raise "Not implemented" 87 + end) 88 + |> Enum.reverse() 89 + end 90 + 91 + defp byte_sample_probability(key_size, pair_size) do 92 + # Probability that a key/value pair of this size belongs in the byte sample 93 + # This is a function of the size rather than a percentage of keys so that 94 + # we can maintain the byte sample as a fraction of *total KV size* 95 + # 96 + # Intuitively: the byte sample only stores keys, so if values are larger than 97 + # the overhead we can afford to store more samples while staying under the limit 98 + # Therefore, if the value is large, probability should increase 99 + # 100 + # We then correct out the probability factor by dividing size to get sampled_size 101 + # at the end, so that the sample is not biased by the larger pairs 102 + # 103 + # This algorithm is borrowed directly from FDB (storageserver isKeyValueInSample) 104 + (pair_size / (key_size + @byte_sample_overhead_bytes)) / @byte_sample_factor 105 + end 106 + 107 + defp encode_float(float) when is_number(float) do 108 + Integer.to_string(round(float * 1000)) 109 + end 110 + 111 + defp decode_float(string) when is_binary(string) do 112 + String.to_integer(string) / 1000 113 + end 114 + 115 + @doc false 116 + def dump(table) do 117 + :ets.tab2list(table) 118 + end 119 + end
+17 -65
lib/servers/storage.ex
··· 5 5 import ExUnit.Assertions, only: [assert: 1] 6 6 7 7 alias Hobbes.{HybridKV, MetaStore, SparseShardMap} 8 - alias Hobbes.KV.MutationLog 8 + alias Hobbes.KV.{MutationLog, ByteSample} 9 9 alias Hobbes.Structs.{Cluster, TLogGeneration, Server, PeekResult, RangeResult, ShardStats} 10 10 alias Hobbes.Servers.{CommitBuffer, TLog, Storage, Distributor} 11 11 ··· 48 48 kv: term, 49 49 meta_store: term, 50 50 shard_map: SparseShardMap.t, 51 + byte_sample: ByteSample.t, 51 52 52 53 imports: [ShardImport.t], 53 54 shard_clears: [ShardClear.t], ··· 67 68 :kv, 68 69 :meta_store, 69 70 :shard_map, 71 + :byte_sample, 70 72 71 73 :imports, 72 74 :shard_clears, ··· 79 81 @flush_interval_ms 250 80 82 @import_interval_ms 10 81 83 82 - # Byte sample should be 1/250th the size of k/v data 83 - @byte_sample_factor 250 84 - # Approximate overhead per sample (other than key size) 85 - # (currently just 8 bytes to store the size value as a string) 86 - @byte_sample_overhead_bytes 8 87 - # Prefix under which to store byte sample data 88 - @byte_sample_prefix "\xFF\xFF\xFE" 89 - 90 84 def start_link(arg), do: SimServer.start_link(__MODULE__, arg) 91 85 92 86 # TODO: get rid of single-key reads entirely and just send a multi-key read from Transaction ··· 179 173 kv: kv, 180 174 meta_store: MetaStore.new(kv.mem_kv), 181 175 shard_map: shard_map, 176 + byte_sample: ByteSample.new(), 182 177 183 178 imports: %{}, 184 179 shard_clears: [], 185 180 } 181 + 182 + byte_sample_pairs = HybridKV.scan(kv, 1, special_byte_sample_prefix(), special_byte_sample_end()).pairs 183 + ByteSample.load(state.byte_sample, byte_sample_pairs) 186 184 187 185 SimServer.send_after(self(), :tick_ping, 0) 188 186 SimServer.send_after(self(), :flush, @flush_interval_ms) ··· 218 216 end 219 217 220 218 def handle_call({:get_shard_stats, start_key, end_key}, _from, %State{} = state) do 221 - read_version = state.data_version 222 - 223 - bs_sk = @byte_sample_prefix <> start_key 224 - bs_ek = @byte_sample_prefix <> end_key 225 - %RangeResult{pairs: pairs, more: false} = HybridKV.scan(state.kv, read_version, bs_sk, bs_ek) 219 + pairs = ByteSample.scan(state.byte_sample, start_key, end_key) 226 220 227 221 size = 228 222 pairs 229 - |> Enum.reduce(0, fn {_k, v}, acc -> acc + decode_float(v) end) 223 + |> Enum.reduce(0, fn {_k, bytes}, acc -> acc + bytes end) 230 224 |> round() 231 225 232 226 half_size = div(size, 2) 233 227 # TODO: we use the full midpoint key, but we could instead use the shortest key which 234 228 # separates the midpoint and the next key (which would make the shard map smaller) 235 229 midpoint = 236 - Enum.reduce_while(pairs, 0, fn {k, v}, acc -> 237 - acc = acc + decode_float(v) 230 + Enum.reduce_while(pairs, 0, fn {k, bytes}, acc -> 231 + acc = acc + bytes 238 232 case acc > half_size do 239 233 true -> {:halt, k} 240 234 false -> {:cont, acc} 241 235 end 242 236 end) 243 237 |> case do 244 - @byte_sample_prefix <> midpoint -> midpoint 238 + midpoint when is_binary(midpoint) -> midpoint 245 239 # If the shard is too small to have any byte sample keys 246 240 0 -> start_key 247 241 end ··· 367 361 HybridKV.flush(kv, adv) 368 362 369 363 HybridKV.delete_range_storage(kv, start_key, end_key) 370 - HybridKV.delete_range_storage(kv, @byte_sample_prefix <> start_key, @byte_sample_prefix <> end_key) 364 + # TODO: clear byte sample 371 365 end) 372 366 state = %State{state | shard_clears: remaining_clears} 373 367 ··· 511 505 end 512 506 513 507 defp apply_data_mutations(%State{kv: kv} = state, version, data_mutations) when is_list(data_mutations) do 514 - # TODO: append byte sample mutations to batch 515 508 kv = HybridKV.apply_batch(kv, version, data_mutations) 516 - %{state | kv: kv} 517 - end 518 - 519 - defp byte_sample_key(key), do: @byte_sample_prefix <> key 520 509 521 - @spec byte_sample_pair(binary, binary) :: {:in_sample, binary, binary} | {:not_in_sample, binary} 522 - defp byte_sample_pair(key, value) when is_binary(key) and is_binary(value) do 523 - # Note: this byte sample algorithm is borrowed directly from FDB 524 - # See isKeyValueInSample in storageserver 525 - key_size = byte_size(key) 526 - pair_size = key_size + byte_size(value) 510 + byte_sample_mutations = ByteSample.apply_batch(state.byte_sample, data_mutations) 511 + MutationLog.append(state.kv.mutation_log, version, byte_sample_mutations) 527 512 528 - # Probability that a key/value pair of this size belongs in the byte sample 529 - # This is a function of the size rather than a percentage of keys so that 530 - # we can maintain the byte sample as a fraction of *total KV size* 531 - # 532 - # Intuitively: the byte sample only stores keys, so if values are larger than 533 - # the overhead we can afford to store more samples while staying under the limit 534 - # Therefore, if the value is large, probability should increase 535 - # 536 - # We then correct out the probability factor by dividing size to get sampled_size 537 - # at the end, so that the sample is not biased by the larger pairs 538 - probability = (pair_size / (key_size + @byte_sample_overhead_bytes)) / @byte_sample_factor 539 - 540 - hash = :erlang.phash2(key, 1000) / 1000 541 - case hash < probability do 542 - true -> 543 - sampled_size = pair_size / min(probability, 1) 544 - 545 - s_key = @byte_sample_prefix <> key 546 - s_value = encode_float(sampled_size) 547 - 548 - {:in_sample, s_key, s_value} 549 - 550 - false -> 551 - {:not_in_sample, @byte_sample_prefix <> key} 552 - end 513 + %{state | kv: kv} 553 514 end 554 515 555 516 defp apply_special_mutations(%State{} = state, version, mutations) do ··· 633 594 if cancelled? do 634 595 %ShardImport{start_key: start_key, end_key: end_key} = si 635 596 HybridKV.nuke_range(state.kv, start_key, end_key) 636 - HybridKV.nuke_range(state.kv, @byte_sample_prefix <> start_key, @byte_sample_prefix <> end_key) 597 + # TODO: clear byte sample 637 598 end 638 599 639 600 %State{state | imports: Map.delete(state.imports, si.ref)} ··· 738 699 true -> :ok 739 700 false -> {:error, :wrong_server} 740 701 end 741 - end 742 - 743 - # TODO: switch to a real encoding, obviously 744 - defp encode_float(float) when is_number(float) do 745 - Integer.to_string(round(float * 1000)) 746 - end 747 - 748 - defp decode_float(string) when is_binary(string) do 749 - String.to_integer(string) / 1000 750 702 end 751 703 end
+3
lib/utils.ex
··· 35 35 defmacro special_server_keys_prefix, do: "\xFF\xFF/sk/" 36 36 defmacro special_server_keys_end, do: "\xFF\xFF/sk0" 37 37 38 + defmacro special_byte_sample_prefix, do: "\xFF\xFF/bs/" 39 + defmacro special_byte_sample_end, do: "\xFF\xFF/bs0" 40 + 38 41 def meta_tag, do: -1 39 42 40 43 def mvcc_window, do: 5_000_000
+2 -2
test/hobbes_test.exs
··· 117 117 keys: 80, 118 118 clients: 20, 119 119 client_tick_ms: 100, 120 - duration_ms: 10_000, 120 + duration_ms: 14_000, 121 121 ]}, 122 122 {Workloads.KillServers, [ 123 - delay_ms: 3_000, 123 + delay_ms: 7_000, 124 124 duration_ms: 0, 125 125 ]}, 126 126 ], HobbesTest.SimOpts.sim_opts(name: test, cluster_opts: [
-4
test/hybrid_kv_test.exs
··· 148 148 end 149 149 end 150 150 151 - defp inc_version(%Verifier{version: version} = verifier) do 152 - %Verifier{verifier | version: version + rand(3)} 153 - end 154 - 155 151 defp rand(s \\ 0, e), do: Enum.random(s..e) 156 152 157 153 defp rand_read_version(%Verifier{} = verifier) do
+22
test/kv/byte_sample_test.exs
··· 1 + defmodule Hobbes.KV.ByteSampleTest do 2 + use ExUnit.Case, async: true 3 + 4 + alias Hobbes.KV.ByteSample 5 + 6 + @moduletag :byte_sample 7 + 8 + setup do 9 + %{bs: ByteSample.new()} 10 + end 11 + 12 + describe "ByteSample" do 13 + test "samples", %{bs: bs} do 14 + mutations = Enum.map(1..4000, fn i -> {:write, "k#{String.pad_leading(to_string(i), 4, "0")}", "v#{i}"} end) 15 + ByteSample.apply_batch(bs, mutations) 16 + 17 + pairs = ByteSample.scan(bs, "k1000", "k2000") 18 + # Anything else will flake if we change the parameters 19 + assert is_list(pairs) 20 + end 21 + end 22 + end