Skip to content

Commit 7f70eeb

Browse files
committed
Add prefix bloom filter support
1 parent 1486943 commit 7f70eeb

File tree

13 files changed

+1777
-14
lines changed

13 files changed

+1777
-14
lines changed

src/compaction/worker.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -364,6 +364,7 @@ fn merge_segments(
364364
opts.tree_id,
365365
opts.config.cache.clone(),
366366
opts.config.descriptor_table.clone(),
367+
opts.config.prefix_extractor.clone(),
367368
payload.dest_level <= 1, // TODO: look at configuration
368369
payload.dest_level <= 2, // TODO: look at configuration
369370
#[cfg(feature = "metrics")]

src/config.rs

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,10 @@
22
// This source code is licensed under both the Apache 2.0 and MIT License
33
// (found in the LICENSE-* files in the repository)
44

5-
use crate::{path::absolute_path, BlobTree, Cache, CompressionType, DescriptorTable, Tree};
5+
use crate::{
6+
path::absolute_path, prefix::SharedPrefixExtractor, BlobTree, Cache, CompressionType,
7+
DescriptorTable, Tree,
8+
};
69
use std::{
710
path::{Path, PathBuf},
811
sync::Arc,
@@ -94,6 +97,10 @@ pub struct Config {
9497
/// Descriptor table to use
9598
#[doc(hidden)]
9699
pub descriptor_table: Arc<DescriptorTable>,
100+
101+
/// Prefix extractor for bloom filters
102+
#[doc(hidden)]
103+
pub prefix_extractor: Option<SharedPrefixExtractor>,
97104
}
98105

99106
impl Default for Config {
@@ -115,6 +122,7 @@ impl Default for Config {
115122
compression: CompressionType::None,
116123
blob_compression: CompressionType::None,
117124
bloom_bits_per_key: 10,
125+
prefix_extractor: None,
118126

119127
blob_file_target_size: /* 64 MiB */ 64 * 1_024 * 1_024,
120128
blob_file_separation_threshold: /* 4 KiB */ 4 * 1_024,
@@ -312,6 +320,30 @@ impl Config {
312320
self
313321
}
314322

323+
/// Sets the prefix extractor for bloom filters.
324+
///
325+
/// A prefix extractor allows bloom filters to index prefixes of keys
326+
/// instead of (or in addition to) the full keys. This enables efficient
327+
/// filtering for prefix-based queries.
///
/// The extractor is stored as `Some(extractor)`, replacing any extractor set
/// by an earlier call on the same builder. A default `Config` has no
/// extractor configured (`None`).
328+
///
329+
/// # Example
330+
///
331+
/// ```
332+
/// # use lsm_tree::Config;
333+
/// use lsm_tree::prefix::FixedPrefixExtractor;
334+
/// use std::sync::Arc;
335+
///
336+
/// # let path = tempfile::tempdir()?;
337+
/// let config = Config::new(path)
338+
/// .prefix_extractor(Arc::new(FixedPrefixExtractor::new(8)));
339+
/// # Ok::<(), Box<dyn std::error::Error>>(())
340+
/// ```
341+
#[must_use]
342+
pub fn prefix_extractor(mut self, extractor: SharedPrefixExtractor) -> Self {
343+
// Builder style: takes `self` by value and hands it back so calls can be chained.
self.prefix_extractor = Some(extractor);
344+
self
345+
}
346+
315347
/// Opens a tree using the config.
316348
///
317349
/// # Errors

src/lib.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -182,6 +182,9 @@ pub mod mvcc_stream;
182182

183183
mod path;
184184

185+
/// Prefix extraction for bloom filters
186+
pub mod prefix;
187+
185188
#[doc(hidden)]
186189
pub mod range;
187190

src/metrics.rs

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,4 +45,14 @@ impl Metrics {
4545
let hits = self.bloom_filter_hits.load(Relaxed) as f64;
4646
hits / queries
4747
}
48+
49+
/// Number of bloom filter queries performed.
50+
pub fn bloom_filter_queries(&self) -> usize {
51+
self.bloom_filter_queries.load(Relaxed)
52+
}
53+
54+
/// Number of bloom filter hits (queries that avoided disk I/O).
55+
pub fn bloom_filter_hits(&self) -> usize {
56+
self.bloom_filter_hits.load(Relaxed)
57+
}
4858
}

src/prefix.rs

Lines changed: 300 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,300 @@
1+
// Copyright (c) 2024-present, fjall-rs
2+
// This source code is licensed under both the Apache 2.0 and MIT License
3+
// (found in the LICENSE-* files in the repository)
4+
5+
use std::sync::Arc;
6+
7+
/// Trait for extracting prefixes from keys for prefix bloom filters.
8+
///
9+
/// A prefix extractor allows the bloom filter to index prefixes of keys
10+
/// instead of (or in addition to) the full keys. This enables efficient
11+
/// filtering for prefix-based queries.
12+
///
13+
/// # Examples
14+
///
15+
/// ## Simple fixed-length prefix:
16+
/// ```
17+
/// use lsm_tree::prefix::PrefixExtractor;
18+
///
19+
/// struct FixedPrefixExtractor(usize);
20+
///
21+
/// impl PrefixExtractor for FixedPrefixExtractor {
22+
/// fn extract<'a>(&self, key: &'a [u8]) -> Box<dyn Iterator<Item = &'a [u8]> + 'a> {
23+
/// Box::new(std::iter::once(key.get(0..self.0).unwrap_or(key)))
24+
/// }
25+
///
26+
/// fn name(&self) -> &str {
27+
/// "fixed_prefix"
28+
/// }
29+
/// }
30+
/// ```
31+
///
32+
/// ## Segmented prefixes (e.g., `account_id#user_id`):
33+
/// ```
34+
/// use lsm_tree::prefix::PrefixExtractor;
35+
///
36+
/// struct SegmentedPrefixExtractor;
37+
///
38+
/// impl PrefixExtractor for SegmentedPrefixExtractor {
39+
/// fn extract<'a>(&self, key: &'a [u8]) -> Box<dyn Iterator<Item = &'a [u8]> + 'a> {
40+
/// let mut prefixes = vec![];
41+
/// let mut end = 0;
42+
/// for (i, &byte) in key.iter().enumerate() {
43+
/// if byte == b'#' {
44+
/// prefixes.push(&key[0..i]);
45+
/// end = i;
46+
/// }
47+
/// }
48+
/// if end < key.len() {
49+
/// prefixes.push(key);
50+
/// }
51+
/// Box::new(prefixes.into_iter())
52+
/// }
53+
///
54+
/// fn name(&self) -> &str {
55+
/// "segmented_prefix"
56+
/// }
57+
/// }
58+
/// ```
59+
pub trait PrefixExtractor: Send + Sync {
// NOTE(review): the `Send + Sync` bound is presumably what lets one extractor be
// shared across threads behind an `Arc` (see `SharedPrefixExtractor`) — confirm.
60+
/// Extracts zero or more prefixes from a key.
61+
///
62+
/// All prefixes will be added to the bloom filter during segment construction.
63+
///
64+
/// An empty iterator means the key is "out of domain" and won't be added to the bloom filter.
///
/// Every yielded slice carries the lifetime of `key`, so implementations can
/// return sub-slices of the input without copying. The iterator is boxed, which
/// lets implementations yield a different number of prefixes per key.
65+
fn extract<'a>(&self, key: &'a [u8]) -> Box<dyn Iterator<Item = &'a [u8]> + 'a>;
66+
67+
/// Returns a unique name for this prefix extractor.
///
/// NOTE(review): nothing in this trait enforces uniqueness; presumably the name
/// is used to tell extractors apart — confirm against the callers.
68+
fn name(&self) -> &str;
69+
}
70+
71+
/// A prefix extractor that returns the full key.
72+
///
73+
/// This is the default behavior if no prefix extractor is specified.
74+
pub struct FullKeyExtractor;
75+
76+
impl PrefixExtractor for FullKeyExtractor {
77+
fn extract<'a>(&self, key: &'a [u8]) -> Box<dyn Iterator<Item = &'a [u8]> + 'a> {
78+
Box::new(std::iter::once(key))
79+
}
80+
81+
fn name(&self) -> &'static str {
82+
"full_key"
83+
}
84+
}
85+
86+
/// A prefix extractor that returns a fixed-length prefix.
87+
///
88+
/// If the key is shorter than the prefix length, returns the full key.
89+
pub struct FixedPrefixExtractor {
90+
length: usize,
91+
}
92+
93+
impl FixedPrefixExtractor {
94+
/// Creates a new fixed-length prefix extractor.
95+
#[must_use]
96+
pub fn new(length: usize) -> Self {
97+
Self { length }
98+
}
99+
}
100+
101+
impl PrefixExtractor for FixedPrefixExtractor {
102+
fn extract<'a>(&self, key: &'a [u8]) -> Box<dyn Iterator<Item = &'a [u8]> + 'a> {
103+
if key.len() <= self.length {
104+
Box::new(std::iter::once(key))
105+
} else if let Some(prefix) = key.get(0..self.length) {
106+
Box::new(std::iter::once(prefix))
107+
} else {
108+
Box::new(std::iter::empty())
109+
}
110+
}
111+
112+
fn name(&self) -> &'static str {
113+
"fixed_prefix"
114+
}
115+
}
116+
117+
/// A prefix extractor that requires keys to be at least a certain length.
118+
///
119+
/// Keys shorter than the required length are considered "out of domain"
120+
/// and won't be added to the bloom filter. This matches `RocksDB`'s behavior.
121+
pub struct FixedLengthExtractor {
122+
length: usize,
123+
}
124+
125+
impl FixedLengthExtractor {
126+
/// Creates a new fixed-length extractor.
127+
#[must_use]
128+
pub fn new(length: usize) -> Self {
129+
Self { length }
130+
}
131+
}
132+
133+
impl PrefixExtractor for FixedLengthExtractor {
134+
fn extract<'a>(&self, key: &'a [u8]) -> Box<dyn Iterator<Item = &'a [u8]> + 'a> {
135+
if key.len() < self.length {
136+
// Key is too short - out of domain
137+
Box::new(std::iter::empty())
138+
} else if let Some(prefix) = key.get(0..self.length) {
139+
Box::new(std::iter::once(prefix))
140+
} else {
141+
Box::new(std::iter::empty())
142+
}
143+
}
144+
145+
fn name(&self) -> &'static str {
146+
"fixed_length"
147+
}
148+
}
149+
150+
/// A shared [`PrefixExtractor`] handle, with examples of custom multi-prefix extractors.
151+
///
152+
/// Users can implement their own prefix extractors that return multiple prefixes.
153+
/// The bloom filter will include all returned prefixes.
154+
///
155+
/// # Example
156+
///
157+
/// ```
158+
/// use lsm_tree::prefix::PrefixExtractor;
159+
/// use std::sync::Arc;
160+
///
161+
/// // Example 1: Hierarchical prefix extractor based on delimiter
162+
/// // For key "user/123/data" with delimiter '/', generates:
163+
/// // - "user"
164+
/// // - "user/123"
165+
/// // - "user/123/data" (full key)
166+
/// struct HierarchicalPrefixExtractor {
167+
/// delimiter: u8,
168+
/// }
169+
///
170+
/// impl PrefixExtractor for HierarchicalPrefixExtractor {
171+
/// fn extract<'a>(&self, key: &'a [u8]) -> Box<dyn Iterator<Item = &'a [u8]> + 'a> {
172+
/// let delimiter = self.delimiter;
173+
/// let mut prefixes = Vec::new();
174+
///
175+
/// // Generate all prefixes up to each delimiter
176+
/// for (i, &byte) in key.iter().enumerate() {
177+
/// if byte == delimiter {
178+
/// prefixes.push(&key[0..i]);
179+
/// }
180+
/// }
181+
///
182+
/// // Always include the full key
183+
/// prefixes.push(key);
184+
///
185+
/// Box::new(prefixes.into_iter())
186+
/// }
187+
///
188+
/// fn name(&self) -> &str {
189+
/// "hierarchical_prefix"
190+
/// }
191+
/// }
192+
///
193+
/// // Example 2: Domain-based extractor for email-like keys
194+
/// // For "user@example.com", generates:
195+
/// // - "example.com" (domain)
196+
/// // - "user@example.com" (full key)
197+
/// struct EmailDomainExtractor;
198+
///
199+
/// impl PrefixExtractor for EmailDomainExtractor {
200+
/// fn extract<'a>(&self, key: &'a [u8]) -> Box<dyn Iterator<Item = &'a [u8]> + 'a> {
201+
/// if let Ok(key_str) = std::str::from_utf8(key) {
202+
/// if let Some(at_pos) = key_str.find('@') {
203+
/// // Return both domain and full email
204+
/// let domain = &key[at_pos + 1..];
205+
/// return Box::new(vec![domain, key].into_iter());
206+
/// }
207+
/// }
208+
/// // If not an email format, just return the full key
209+
/// Box::new(std::iter::once(key))
210+
/// }
211+
///
212+
/// fn name(&self) -> &str {
213+
/// "email_domain"
214+
/// }
215+
/// }
216+
///
217+
/// // Usage:
218+
/// # let path = tempfile::tempdir()?;
219+
/// let tree = lsm_tree::Config::new(path)
220+
/// .prefix_extractor(Arc::new(HierarchicalPrefixExtractor { delimiter: b'/' }))
221+
/// .open()?;
222+
/// # Ok::<(), Box<dyn std::error::Error>>(())
223+
/// ```
224+
///
/// Type alias for a shared prefix extractor: a [`PrefixExtractor`] trait object
/// behind an [`Arc`], so one extractor instance can be handed to several owners.
225+
pub type SharedPrefixExtractor = Arc<dyn PrefixExtractor>;
226+
227+
#[cfg(test)]
mod tests {
    use super::*;

    /// Collects every prefix `extractor` yields for `key`.
    fn prefixes_of<'a>(extractor: &dyn PrefixExtractor, key: &'a [u8]) -> Vec<&'a [u8]> {
        extractor.extract(key).collect()
    }

    #[test]
    fn test_full_key_extractor() {
        assert_eq!(
            prefixes_of(&FullKeyExtractor, b"test_key"),
            [&b"test_key"[..]]
        );
    }

    #[test]
    fn test_fixed_prefix_extractor() {
        let extractor = FixedPrefixExtractor::new(5);

        // Key longer than prefix
        assert_eq!(prefixes_of(&extractor, b"longer_key"), [&b"longe"[..]]);

        // Key shorter than prefix
        assert_eq!(prefixes_of(&extractor, b"key"), [&b"key"[..]]);

        // Key exactly prefix length
        assert_eq!(prefixes_of(&extractor, b"exact"), [&b"exact"[..]]);
    }

    #[test]
    fn test_empty_key() {
        let key = b"";

        assert_eq!(prefixes_of(&FullKeyExtractor, key), [&b""[..]]);
        assert_eq!(prefixes_of(&FixedPrefixExtractor::new(5), key), [&b""[..]]);

        // An empty key is shorter than any non-zero required length - out of domain
        assert!(prefixes_of(&FixedLengthExtractor::new(5), key).is_empty());
    }

    #[test]
    fn test_fixed_length_extractor() {
        let extractor = FixedLengthExtractor::new(5);

        // Key shorter than required length - out of domain
        assert!(prefixes_of(&extractor, b"abc").is_empty());

        // Key exactly required length
        assert_eq!(prefixes_of(&extractor, b"exact"), [&b"exact"[..]]);

        // Key longer than required length
        assert_eq!(prefixes_of(&extractor, b"longer_key"), [&b"longe"[..]]);
    }

    #[test]
    fn test_zero_length() {
        // A zero-length prefix is the empty slice, and every key is in domain
        assert_eq!(prefixes_of(&FixedPrefixExtractor::new(0), b"abc"), [&b""[..]]);
        assert_eq!(prefixes_of(&FixedLengthExtractor::new(0), b"abc"), [&b""[..]]);
        assert_eq!(prefixes_of(&FixedLengthExtractor::new(0), b""), [&b""[..]]);
    }

    #[test]
    fn test_names() {
        assert_eq!(FullKeyExtractor.name(), "full_key");
        assert_eq!(FixedPrefixExtractor::new(1).name(), "fixed_prefix");
        assert_eq!(FixedLengthExtractor::new(1).name(), "fixed_length");
    }
}

0 commit comments

Comments
 (0)