Skip to content

Commit 5c502c3

Browse files
committed
Add prefix bloom filter support
1 parent 1486943 commit 5c502c3

File tree

13 files changed

+1774
-14
lines changed

13 files changed

+1774
-14
lines changed

src/compaction/worker.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -364,6 +364,7 @@ fn merge_segments(
364364
opts.tree_id,
365365
opts.config.cache.clone(),
366366
opts.config.descriptor_table.clone(),
367+
opts.config.prefix_extractor.clone(),
367368
payload.dest_level <= 1, // TODO: look at configuration
368369
payload.dest_level <= 2, // TODO: look at configuration
369370
#[cfg(feature = "metrics")]

src/config.rs

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,10 @@
22
// This source code is licensed under both the Apache 2.0 and MIT License
33
// (found in the LICENSE-* files in the repository)
44

5-
use crate::{path::absolute_path, BlobTree, Cache, CompressionType, DescriptorTable, Tree};
5+
use crate::{
6+
path::absolute_path, prefix::SharedPrefixExtractor, BlobTree, Cache, CompressionType,
7+
DescriptorTable, Tree,
8+
};
69
use std::{
710
path::{Path, PathBuf},
811
sync::Arc,
@@ -94,6 +97,10 @@ pub struct Config {
9497
/// Descriptor table to use
9598
#[doc(hidden)]
9699
pub descriptor_table: Arc<DescriptorTable>,
100+
101+
/// Prefix extractor for bloom filters
102+
#[doc(hidden)]
103+
pub prefix_extractor: Option<SharedPrefixExtractor>,
97104
}
98105

99106
impl Default for Config {
@@ -115,6 +122,7 @@ impl Default for Config {
115122
compression: CompressionType::None,
116123
blob_compression: CompressionType::None,
117124
bloom_bits_per_key: 10,
125+
prefix_extractor: None,
118126

119127
blob_file_target_size: /* 64 MiB */ 64 * 1_024 * 1_024,
120128
blob_file_separation_threshold: /* 4 KiB */ 4 * 1_024,
@@ -312,6 +320,30 @@ impl Config {
312320
self
313321
}
314322

323+
/// Sets the prefix extractor for bloom filters.
324+
///
325+
/// A prefix extractor allows bloom filters to index prefixes of keys
326+
/// instead of (or in addition to) the full keys. This enables efficient
327+
/// filtering for prefix-based queries.
328+
///
329+
/// # Example
330+
///
331+
/// ```
332+
/// # use lsm_tree::Config;
333+
/// use lsm_tree::prefix::FixedPrefixExtractor;
334+
/// use std::sync::Arc;
335+
///
336+
/// # let path = tempfile::tempdir()?;
337+
/// let config = Config::new(path)
338+
/// .prefix_extractor(Arc::new(FixedPrefixExtractor::new(8)));
339+
/// # Ok::<(), Box<dyn std::error::Error>>(())
340+
/// ```
341+
#[must_use]
342+
pub fn prefix_extractor(mut self, extractor: SharedPrefixExtractor) -> Self {
343+
self.prefix_extractor = Some(extractor);
344+
self
345+
}
346+
315347
/// Opens a tree using the config.
316348
///
317349
/// # Errors

src/lib.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -182,6 +182,9 @@ pub mod mvcc_stream;
182182

183183
mod path;
184184

185+
/// Prefix extraction for bloom filters
186+
pub mod prefix;
187+
185188
#[doc(hidden)]
186189
pub mod range;
187190

src/metrics.rs

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,4 +45,14 @@ impl Metrics {
4545
let hits = self.bloom_filter_hits.load(Relaxed) as f64;
4646
hits / queries
4747
}
48+
49+
/// Number of bloom filter queries performed.
50+
pub fn bloom_filter_queries(&self) -> usize {
51+
self.bloom_filter_queries.load(Relaxed)
52+
}
53+
54+
/// Number of bloom filter hits (queries that avoided disk I/O).
55+
pub fn bloom_filter_hits(&self) -> usize {
56+
self.bloom_filter_hits.load(Relaxed)
57+
}
4858
}

src/prefix.rs

Lines changed: 297 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,297 @@
1+
// Copyright (c) 2024-present, fjall-rs
2+
// This source code is licensed under both the Apache 2.0 and MIT License
3+
// (found in the LICENSE-* files in the repository)
4+
5+
use std::sync::Arc;
6+
7+
/// Trait for extracting prefixes from keys for prefix bloom filters.
8+
///
9+
/// A prefix extractor allows the bloom filter to index prefixes of keys
10+
/// instead of (or in addition to) the full keys. This enables efficient
11+
/// filtering for prefix-based queries.
12+
///
13+
/// # Examples
14+
///
15+
/// ## Simple fixed-length prefix:
16+
/// ```
17+
/// use lsm_tree::prefix::PrefixExtractor;
18+
///
19+
/// struct FixedPrefixExtractor(usize);
20+
///
21+
/// impl PrefixExtractor for FixedPrefixExtractor {
22+
/// fn extract<'a>(&self, key: &'a [u8]) -> Box<dyn Iterator<Item = &'a [u8]> + 'a> {
23+
/// Box::new(std::iter::once(key.get(0..self.0).unwrap_or(key)))
24+
/// }
25+
///
26+
/// fn name(&self) -> &str {
27+
/// "fixed_prefix"
28+
/// }
29+
/// }
30+
/// ```
31+
///
32+
/// ## Segmented prefixes (e.g., `account_id#user_id`):
33+
/// ```
34+
/// use lsm_tree::prefix::PrefixExtractor;
35+
///
36+
/// struct SegmentedPrefixExtractor;
37+
///
38+
/// impl PrefixExtractor for SegmentedPrefixExtractor {
39+
/// fn extract<'a>(&self, key: &'a [u8]) -> Box<dyn Iterator<Item = &'a [u8]> + 'a> {
40+
/// let mut prefixes = vec![];
41+
/// let mut end = 0;
42+
/// for (i, &byte) in key.iter().enumerate() {
43+
/// if byte == b'#' {
44+
/// prefixes.push(&key[0..i]);
45+
/// end = i;
46+
/// }
47+
/// }
48+
/// if end < key.len() {
49+
/// prefixes.push(key);
50+
/// }
51+
/// Box::new(prefixes.into_iter())
52+
/// }
53+
///
54+
/// fn name(&self) -> &str {
55+
/// "segmented_prefix"
56+
/// }
57+
/// }
58+
/// ```
59+
pub trait PrefixExtractor: Send + Sync {
60+
/// Extracts zero or more prefixes from a key.
61+
///
62+
/// All prefixes will be added to the bloom filter during segment construction.
63+
///
64+
/// An empty iterator means the key is "out of domain" and won't be added to the bloom filter.
65+
fn extract<'a>(&self, key: &'a [u8]) -> Box<dyn Iterator<Item = &'a [u8]> + 'a>;
66+
67+
/// Returns a unique name for this prefix extractor.
68+
fn name(&self) -> &str;
69+
}
70+
71+
/// A prefix extractor that returns the full key.
72+
///
73+
/// This is the default behavior if no prefix extractor is specified.
74+
pub struct FullKeyExtractor;
75+
76+
impl PrefixExtractor for FullKeyExtractor {
77+
fn extract<'a>(&self, key: &'a [u8]) -> Box<dyn Iterator<Item = &'a [u8]> + 'a> {
78+
Box::new(std::iter::once(key))
79+
}
80+
81+
fn name(&self) -> &'static str {
82+
"full_key"
83+
}
84+
}
85+
86+
/// A prefix extractor that returns a fixed-length prefix.
87+
///
88+
/// If the key is shorter than the prefix length, returns the full key.
89+
pub struct FixedPrefixExtractor {
90+
length: usize,
91+
}
92+
93+
impl FixedPrefixExtractor {
94+
/// Creates a new fixed-length prefix extractor.
95+
#[must_use]
96+
pub fn new(length: usize) -> Self {
97+
Self { length }
98+
}
99+
}
100+
101+
impl PrefixExtractor for FixedPrefixExtractor {
102+
fn extract<'a>(&self, key: &'a [u8]) -> Box<dyn Iterator<Item = &'a [u8]> + 'a> {
103+
if key.len() <= self.length {
104+
Box::new(std::iter::once(key))
105+
} else {
106+
Box::new(std::iter::once(&key[0..self.length]))
107+
}
108+
}
109+
110+
fn name(&self) -> &'static str {
111+
"fixed_prefix"
112+
}
113+
}
114+
115+
/// A prefix extractor that requires keys to be at least a certain length.
116+
///
117+
/// Keys shorter than the required length are considered "out of domain"
118+
/// and won't be added to the bloom filter. This matches `RocksDB`'s behavior.
119+
pub struct FixedLengthExtractor {
120+
length: usize,
121+
}
122+
123+
impl FixedLengthExtractor {
124+
/// Creates a new fixed-length extractor.
125+
#[must_use]
126+
pub fn new(length: usize) -> Self {
127+
Self { length }
128+
}
129+
}
130+
131+
impl PrefixExtractor for FixedLengthExtractor {
132+
fn extract<'a>(&self, key: &'a [u8]) -> Box<dyn Iterator<Item = &'a [u8]> + 'a> {
133+
if key.len() < self.length {
134+
// Key is too short - out of domain
135+
Box::new(std::iter::empty())
136+
} else {
137+
Box::new(std::iter::once(&key[0..self.length]))
138+
}
139+
}
140+
141+
fn name(&self) -> &'static str {
142+
"fixed_length"
143+
}
144+
}
145+
146+
/// Examples of custom multi-prefix extractors.
147+
///
148+
/// Users can implement their own prefix extractors that return multiple prefixes.
149+
/// The bloom filter will include all returned prefixes.
150+
///
151+
/// # Example
152+
///
153+
/// ```
154+
/// use lsm_tree::prefix::PrefixExtractor;
155+
/// use std::sync::Arc;
156+
///
157+
/// // Example 1: Hierarchical prefix extractor based on delimiter
158+
/// // For key "user/123/data" with delimiter '/', generates:
159+
/// // - "user"
160+
/// // - "user/123"
161+
/// // - "user/123/data" (full key)
162+
/// struct HierarchicalPrefixExtractor {
163+
/// delimiter: u8,
164+
/// }
165+
///
166+
/// impl PrefixExtractor for HierarchicalPrefixExtractor {
167+
/// fn extract<'a>(&self, key: &'a [u8]) -> Box<dyn Iterator<Item = &'a [u8]> + 'a> {
168+
/// let delimiter = self.delimiter;
169+
/// let mut prefixes = Vec::new();
170+
///
171+
/// // Generate all prefixes up to each delimiter
172+
/// for (i, &byte) in key.iter().enumerate() {
173+
/// if byte == delimiter {
174+
/// prefixes.push(&key[0..i]);
175+
/// }
176+
/// }
177+
///
178+
/// // Always include the full key
179+
/// prefixes.push(key);
180+
///
181+
/// Box::new(prefixes.into_iter())
182+
/// }
183+
///
184+
/// fn name(&self) -> &str {
185+
/// "hierarchical_prefix"
186+
/// }
187+
/// }
188+
///
189+
/// // Example 2: Domain-based extractor for email-like keys
190+
/// // For "user@example.com", generates:
191+
/// // - "example.com" (domain)
192+
/// // - "user@example.com" (full key)
193+
/// struct EmailDomainExtractor;
194+
///
195+
/// impl PrefixExtractor for EmailDomainExtractor {
196+
/// fn extract<'a>(&self, key: &'a [u8]) -> Box<dyn Iterator<Item = &'a [u8]> + 'a> {
197+
/// if let Ok(key_str) = std::str::from_utf8(key) {
198+
/// if let Some(at_pos) = key_str.find('@') {
199+
/// // Return both domain and full email
200+
/// let domain = &key[at_pos + 1..];
201+
/// return Box::new(vec![domain, key].into_iter());
202+
/// }
203+
/// }
204+
/// // If not an email format, just return the full key
205+
/// Box::new(std::iter::once(key))
206+
/// }
207+
///
208+
/// fn name(&self) -> &str {
209+
/// "email_domain"
210+
/// }
211+
/// }
212+
///
213+
/// // Usage:
214+
/// # let path = tempfile::tempdir()?;
215+
/// let tree = lsm_tree::Config::new(path)
216+
/// .prefix_extractor(Arc::new(HierarchicalPrefixExtractor { delimiter: b'/' }))
217+
/// .open()?;
218+
/// # Ok::<(), Box<dyn std::error::Error>>(())
219+
/// ```
220+
221+
/// Type alias for a shared prefix extractor
222+
pub type SharedPrefixExtractor = Arc<dyn PrefixExtractor>;
223+
224+
#[cfg(test)]
225+
mod tests {
226+
use super::*;
227+
228+
#[test]
229+
fn test_full_key_extractor() {
230+
let extractor = FullKeyExtractor;
231+
let key = b"test_key";
232+
let prefixes: Vec<_> = extractor.extract(key).collect();
233+
assert_eq!(prefixes.len(), 1);
234+
assert_eq!(prefixes[0], b"test_key");
235+
}
236+
237+
#[test]
238+
fn test_fixed_prefix_extractor() {
239+
let extractor = FixedPrefixExtractor::new(5);
240+
241+
// Key longer than prefix
242+
let key = b"longer_key";
243+
let prefixes: Vec<_> = extractor.extract(key).collect();
244+
assert_eq!(prefixes.len(), 1);
245+
assert_eq!(prefixes[0], b"longe");
246+
247+
// Key shorter than prefix
248+
let key = b"key";
249+
let prefixes: Vec<_> = extractor.extract(key).collect();
250+
assert_eq!(prefixes.len(), 1);
251+
assert_eq!(prefixes[0], b"key");
252+
253+
// Key exactly prefix length
254+
let key = b"exact";
255+
let prefixes: Vec<_> = extractor.extract(key).collect();
256+
assert_eq!(prefixes.len(), 1);
257+
assert_eq!(prefixes[0], b"exact");
258+
}
259+
260+
#[test]
261+
fn test_empty_key() {
262+
let full_key = FullKeyExtractor;
263+
let fixed = FixedPrefixExtractor::new(5);
264+
265+
let key = b"";
266+
267+
let prefixes: Vec<_> = full_key.extract(key).collect();
268+
assert_eq!(prefixes.len(), 1);
269+
assert_eq!(prefixes[0], b"");
270+
271+
let prefixes: Vec<_> = fixed.extract(key).collect();
272+
assert_eq!(prefixes.len(), 1);
273+
assert_eq!(prefixes[0], b"");
274+
}
275+
276+
#[test]
277+
fn test_fixed_length_extractor() {
278+
let extractor = FixedLengthExtractor::new(5);
279+
280+
// Key shorter than required length - out of domain
281+
let key = b"abc";
282+
let prefixes: Vec<_> = extractor.extract(key).collect();
283+
assert_eq!(prefixes.len(), 0); // Empty iterator
284+
285+
// Key exactly required length
286+
let key = b"exact";
287+
let prefixes: Vec<_> = extractor.extract(key).collect();
288+
assert_eq!(prefixes.len(), 1);
289+
assert_eq!(prefixes[0], b"exact");
290+
291+
// Key longer than required length
292+
let key = b"longer_key";
293+
let prefixes: Vec<_> = extractor.extract(key).collect();
294+
assert_eq!(prefixes.len(), 1);
295+
assert_eq!(prefixes[0], b"longe");
296+
}
297+
}

0 commit comments

Comments
 (0)