···6161 - [X] Implement clears in KVs
6262 - [ ] Implement clears in the database
6363 - [ ] Range clears
6464- - [ ] Implement range clears in KVs
6464+ - [X] Implement range deletes in HybridKV
6565 - [X] Add persistent tree (RangeForest) to hold deleted ranges
6666 - [X] Handle deleted ranges in HybridKV.get/put
6767 - [X] Handle deleted ranges in HybridKV.flush
6868- - [ ] Handle deleted ranges in HybridKV.scan
6868+ - [X] Handle deleted ranges in HybridKV.scan
6969 - [ ] Implement range clears in the database
7070- [ ] Recovery
7171 - [ ] Monitor all transaction processes in Manager to detect failures
+84-31
lib/hybrid_kv.ex
···6868 limit = Keyword.get(opts, :limit, :infinity)
6969 reverse = Keyword.get(opts, :reverse, false)
70707171+ deleted_ranges =
7272+ kv.deleted_forest
7373+ |> RangeForest.tree_at(version)
7474+ |> RangeTree.intersect_range(kv.flushed_version, start_key, end_key)
7575+7176 read_limit = case limit do
7277 :infinity -> :infinity
7378 limit -> limit + 1
···75807681 {pairs, count} =
7782 case reverse do
7878- false -> do_scan(:forward, kv, version, start_key, end_key, read_limit, 0)
7979- true -> do_scan(:backward, kv, version, start_key, end_key, read_limit, 0)
8383+ false -> do_scan(:forward, kv, deleted_ranges, version, start_key, end_key, read_limit, 0)
8484+ true -> do_scan(:backward, kv, Enum.reverse(deleted_ranges), version, start_key, end_key, read_limit, 0)
8085 end
81868287 # We over-read by 1 and then use the extra read
···9095 end
91969297 # This exists purely as a sanity check and should be unreachable except for bugs
9393- defp do_scan(_direction, _kv, _version, _start_key, _end_key, _limit, 1000), do: raise "Scan caught in loop!"
9898+ defp do_scan(_direction, _kv, _deleted_ranges, _version, _start_key, _end_key, _limit, 1000), do: raise "Scan caught in loop!"
949995100 # Abandon hope all ye who enter here
9696- defp do_scan(:forward, %HybridKV{} = kv, version, start_key, end_key, limit, scan_count) do
9797- storage_result = FlatKV.scan(kv.storage_kv, start_key, end_key, limit: limit)
9898- {_start, storage_end_key} = storage_result.range
101101+ defp do_scan(:forward, %HybridKV{} = kv, deleted_ranges, version, start_key, end_key, limit, scan_count) do
102102+ {merged_pairs, scanned_end_key, deleted_ranges} =
103103+ case deleted_ranges do
104104+ [{sk, ek, del_v} | rest] when sk <= start_key ->
105105+ # We are "inside" a range clear, so we scan mem only to the end of min(ek, end_key)
106106+ mem_result = MemKV.scan(kv.mem_kv, version, del_v + 1, start_key, min(ek, end_key), limit: limit)
107107+ {_start, mem_end_key} = mem_result.range
108108+ # Merge just to clear :deleted
109109+ pairs = merge([], mem_result.pairs, false)
99110100100- # Read mem only up to the end of the range scanned by storage
101101- mem_result = MemKV.scan(kv.mem_kv, version, start_key, storage_end_key, limit: limit)
102102- {_start, mem_end_key} = mem_result.range
111111+ # Note: it would be wrong to return "rest" here if we are still inside
112112+ # the deleted range (due to hitting the limit). However, if a mem scan
113113+ # hits the limit then we are done scanning, because there is nothing
114114+ # for the tombstones to clear out, so it does not matter.
115115+ {pairs, mem_end_key, rest}
103116104104- # Both KVs were scanned up to this key
105105- # Anything past this key was only scanned by storage and must be discarded
106106- scanned_end_key = min(storage_end_key, mem_end_key)
117117+ _ ->
118118+ # If there is a range delete ahead, we want to stop this hybrid scan before that
119119+ # delete's start_key so that the next iteration will scan it mem only (the previous clause)
120120+ stop_key = case deleted_ranges do
121121+ [{sk, _ek, _del_v} | _] -> sk
122122+ [] -> end_key
123123+ end
124124+125125+ storage_result = FlatKV.scan(kv.storage_kv, start_key, stop_key, limit: limit)
126126+ {_start, storage_end_key} = storage_result.range
127127+128128+ # Read mem only up to the end of the range scanned by storage
129129+ mem_result = MemKV.scan(kv.mem_kv, version, 0, start_key, storage_end_key, limit: limit)
130130+ {_start, mem_end_key} = mem_result.range
107131108108- merged_pairs =
109109- merge(storage_result.pairs, mem_result.pairs, false)
110110- # TODO: more efficient to do this in merge/3
111111- |> Enum.take_while(fn {k, _v} -> k < scanned_end_key end)
132132+ # Both KVs were scanned up to this key
133133+ # Anything past this key was only scanned by storage and must be discarded
134134+ scanned_end_key = min(storage_end_key, mem_end_key)
135135+136136+ merged_pairs =
137137+ merge(storage_result.pairs, mem_result.pairs, false)
138138+ # TODO: more efficient to do this in merge/3
139139+ |> Enum.take_while(fn {k, _v} -> k < scanned_end_key end)
140140+141141+ {merged_pairs, scanned_end_key, deleted_ranges}
142142+ end
112143113144 # TODO: compute count in merge/3
114145 count = length(merged_pairs)
···129160 # We got <limit pairs but we really did scan the full range
130161 {merged_pairs, count}
131162 false ->
132132- # We got <limit pairs and we did not scan the full range, meaning
133133- # the KVs hit the limit but then the :deleted tombstones cleared
134134- # out enough keys to bring us back under the limit, so we must
135135- # keep scanning
136136- {next_pairs, next_count} = do_scan(:forward, kv, version, scanned_end_key, end_key, limit - count, scan_count + 1)
163163+ # We got <limit pairs and we have not yet scanned the full range,
164164+ # so we need to keep scanning
165165+ #
166166+ # This could be because we hit the limit and then the :deleted
167167+ # tombstones cleared out enough pairs to bring us back under,
168168+ # or because we ran into a range delete and had to skip over it
169169+ {next_pairs, next_count} = do_scan(:forward, kv, deleted_ranges, version, scanned_end_key, end_key, subtract_limit(limit, count), scan_count + 1)
137170 {merged_pairs ++ next_pairs, count + next_count}
138171 end
139172 end
···141174142175 # See :forward for comments, the :backward version is the same except key logic
143176 # is inverted (deals with start_key instead of end_key)
144144- defp do_scan(:backward, %HybridKV{} = kv, version, start_key, end_key, limit, scan_count) do
145145- storage_result = FlatKV.scan(kv.storage_kv, start_key, end_key, limit: limit, reverse: true)
146146- {storage_start_key, _end_key} = storage_result.range
177177+ defp do_scan(:backward, %HybridKV{} = kv, deleted_ranges, version, start_key, end_key, limit, scan_count) do
178178+ {merged_pairs, scanned_start_key, deleted_ranges} =
179179+ case deleted_ranges do
180180+ [{sk, ek, del_v} | rest] when ek >= end_key ->
181181+ mem_result = MemKV.scan(kv.mem_kv, version, del_v + 1, max(sk, start_key), end_key, limit: limit, reverse: true)
182182+ {mem_start_key, _end_key} = mem_result.range
183183+ pairs = merge([], mem_result.pairs, true)
184184+ {pairs, mem_start_key, rest}
185185+186186+ _ ->
187187+ stop_key = case deleted_ranges do
188188+ [{_sk, ek, _del_v} | _] -> ek
189189+ [] -> start_key
190190+ end
147191148148- mem_result = MemKV.scan(kv.mem_kv, version, storage_start_key, end_key, limit: limit, reverse: true)
149149- {mem_start_key, _end_key} = mem_result.range
192192+ storage_result = FlatKV.scan(kv.storage_kv, stop_key, end_key, limit: limit, reverse: true)
193193+ {storage_start_key, _end_key} = storage_result.range
150194151151- scanned_start_key = max(storage_start_key, mem_start_key)
195195+ mem_result = MemKV.scan(kv.mem_kv, version, 0, storage_start_key, end_key, limit: limit, reverse: true)
196196+ {mem_start_key, _end_key} = mem_result.range
152197153153- merged_pairs =
154154- merge(storage_result.pairs, mem_result.pairs, true)
155155- |> Enum.take_while(fn {k, _v} -> k >= scanned_start_key end)
198198+ scanned_start_key = max(storage_start_key, mem_start_key)
199199+200200+ merged_pairs =
201201+ merge(storage_result.pairs, mem_result.pairs, true)
202202+ |> Enum.take_while(fn {k, _v} -> k >= scanned_start_key end)
203203+204204+ {merged_pairs, scanned_start_key, deleted_ranges}
205205+ end
156206157207 count = length(merged_pairs)
158208···168218 true ->
169219 {merged_pairs, count}
170220 false ->
171171- {next_pairs, next_count} = do_scan(:backward, kv, version, start_key, scanned_start_key, limit - count, scan_count + 1)
221221+ {next_pairs, next_count} = do_scan(:backward, kv, deleted_ranges, version, start_key, scanned_start_key, subtract_limit(limit, count), scan_count + 1)
172222 {merged_pairs ++ next_pairs, count + next_count}
173223 end
174224 end
175225 end
226226+227227+ defp subtract_limit(:infinity, _n), do: :infinity
228228+ defp subtract_limit(limit, n), do: limit - n
176229177230 @spec merge([{binary, binary}], [{binary, binary}], boolean) :: [{binary, binary}]
178231 defp merge(list1, list2, reverse), do: do_merge(reverse, list1, list2, []) |> Enum.reverse()
+19-18
lib/mem_kv.ex
···3939 end
4040 end
41414242- @spec scan(:ets.table, non_neg_integer, binary, binary, keyword) :: RangeResult.t
4343- def scan(table, version, start_key, end_key, opts \\ []) do
4242+ @spec scan(:ets.table, non_neg_integer, non_neg_integer, binary, binary, keyword) :: RangeResult.t
4343+ def scan(table, version, floor_version, start_key, end_key, opts \\ [])
4444+ when is_integer(version) and is_integer(floor_version) and is_binary(start_key) and is_binary(end_key) do
4445 limit = Keyword.get(opts, :limit, :infinity)
4546 reverse = Keyword.get(opts, :reverse, false)
4647···51525253 # Note: atoms (e.g. :infinity) always sort larger than numbers
5354 {pairs, count} = case reverse do
5454- false -> do_scan(:forward, table, version, end_key, read_limit, {start_key, -1}, [], 0)
5555- true -> do_scan(:backward, table, version, start_key, read_limit, {end_key, -1}, [], 0)
5555+ false -> do_scan(:forward, table, version, floor_version, end_key, read_limit, {start_key, -1}, [], 0)
5656+ true -> do_scan(:backward, table, version, floor_version, start_key, read_limit, {end_key, -1}, [], 0)
5657 end
57585859 # We over-read by 1 (read_limit) and then use the extra key
···9293 end
9394 end
94959595- defp do_scan(:forward, table, version, end_key, limit, {_, _} = prev, acc, count) do
9696+ defp do_scan(:forward, table, version, floor_version, end_key, limit, {_, _} = prev, acc, count) do
9697 case :ets.next(table, prev) do
9798 {key, ver} = full_key when key < end_key ->
9898- case ver <= version do
9999+ case ver >= floor_version and ver <= version do
99100 true ->
100101 [{^full_key, value}] = :ets.lookup(table, full_key)
101102102103 case acc do
103104 [] ->
104105 acc = [{key, value} | acc]
105105- do_scan(:forward, table, version, end_key, limit, full_key, acc, count)
106106+ do_scan(:forward, table, version, floor_version, end_key, limit, full_key, acc, count)
106107107108 [{^key, _value} | rest] ->
108109 # Same key, overwrite lower version
109110 acc = [{key, value} | rest]
110110- do_scan(:forward, table, version, end_key, limit, full_key, acc, count)
111111+ do_scan(:forward, table, version, floor_version, end_key, limit, full_key, acc, count)
111112112113 [{_key, :deleted} | _rest] ->
113114 # Key boundary, last pair was :deleted
114115 acc = [{key, value} | acc]
115115- do_scan(:forward, table, version, end_key, limit, full_key, acc, count)
116116+ do_scan(:forward, table, version, floor_version, end_key, limit, full_key, acc, count)
116117117118 [{_key, _value} | _rest] ->
118119 # Key boundary, last pair was *not* :deleted
···120121 case count < limit do
121122 true ->
122123 acc = [{key, value} | acc]
123123- do_scan(:forward, table, version, end_key, limit, full_key, acc, count)
124124+ do_scan(:forward, table, version, floor_version, end_key, limit, full_key, acc, count)
124125 false ->
125126 {acc, count}
126127 end
···128129129130 false ->
130131 # Version is unreadable, skip
131131- do_scan(:forward, table, version, end_key, limit, full_key, acc, count)
132132+ do_scan(:forward, table, version, floor_version, end_key, limit, full_key, acc, count)
132133 end
133134134135 _ ->
···143144 end
144145 end
145146146146- defp do_scan(:backward, table, version, start_key, limit, {_, _} = prev, acc, count) do
147147+ defp do_scan(:backward, table, version, floor_version, start_key, limit, {_, _} = prev, acc, count) do
147148 case :ets.prev(table, prev) do
148149 {key, ver} = full_key when key >= start_key ->
149149- case ver <= version do
150150+ case ver >= floor_version and ver <= version do
150151 true ->
151152 [{^full_key, value}] = :ets.lookup(table, full_key)
152153153154 case acc do
154155 [] ->
155156 acc = [{key, value}]
156156- do_scan(:backward, table, version, start_key, limit, full_key, acc, count)
157157+ do_scan(:backward, table, version, floor_version, start_key, limit, full_key, acc, count)
157158158159 [{^key, _value} | _rest] ->
159160 # Same key, keep higher version
160160- do_scan(:backward, table, version, start_key, limit, full_key, acc, count)
161161+ do_scan(:backward, table, version, floor_version, start_key, limit, full_key, acc, count)
161162162163 [{_key, :deleted} | _rest] ->
163164 # Key boundary, last pair was :deleted
164165 acc = [{key, value} | acc]
165165- do_scan(:backward, table, version, start_key, limit, full_key, acc, count)
166166+ do_scan(:backward, table, version, floor_version, start_key, limit, full_key, acc, count)
166167167168 [{_key, _value} | _rest] ->
168169 # Key boundary, last pair was *not* :deleted
···170171 case count < limit do
171172 true ->
172173 acc = [{key, value} | acc]
173173- do_scan(:backward, table, version, start_key, limit, full_key, acc, count)
174174+ do_scan(:backward, table, version, floor_version, start_key, limit, full_key, acc, count)
174175 false ->
175176 {acc, count}
176177 end
···178179179180 false ->
180181 # Version is unreadable, skip
181181- do_scan(:backward, table, version, start_key, limit, full_key, acc, count)
182182+ do_scan(:backward, table, version, floor_version, start_key, limit, full_key, acc, count)
182183 end
183184184185 _ ->
+1-1
lib/meta_store.ex
···8484 end
85858686 defp scan(kv, version, start_key, end_key, opts \\ []) do
8787- MemKV.scan(kv, version, @special_prefix <> start_key, @special_prefix <> end_key, opts)
8787+ MemKV.scan(kv, version, 0, @special_prefix <> start_key, @special_prefix <> end_key, opts)
8888 # TODO: remove this and return the real result
8989 |> case do
9090 %RangeResult{pairs: pairs} -> pairs
+8-4
lib/range_forest.ex
···113113114114 @spec intersect_range(RangeTree.t, non_neg_integer, binary, binary) :: [{binary, binary, non_neg_integer}]
115115 def intersect_range(tree, min_version, start_key, end_key) do
116116- case :gb_trees.smaller(start_key, tree) do
117117- {sk, {_ek, _v}} -> :gb_trees.iterator_from(sk, tree)
118118- :none -> :gb_trees.iterator_from(start_key, tree)
116116+ acc = case :gb_trees.smaller(start_key, tree) do
117117+ {sk, {ek, v}} when start_key < ek and v >= min_version ->
118118+ [{sk, ek, v}]
119119+ _ ->
120120+ []
119121 end
120120- |> scan_intersect_range(end_key, min_version, [])
122122+123123+ :gb_trees.iterator_from(start_key, tree)
124124+ |> scan_intersect_range(end_key, min_version, acc)
121125 |> Enum.reverse()
122126 end
123127
+1-1
test/hybrid_kv_test.exs
···6464 ], limit: :infinity
6565 end
66666767- @ops [:put, :delete, :delete_range, :get]
6767+ @ops [:put, :delete, :delete_range, :get, :scan]
6868 defp random_op do
6969 case Enum.random(1..100) do
7070 1 -> :flush