Skip to content

Commit e13f35b

Browse files
committed
feat: experimental unicode-enabled string matching for avx2
1 parent b96bb31 commit e13f35b

40 files changed

+238445
-559
lines changed

Cargo.lock

Lines changed: 171 additions & 134 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

crates/rsonpath-lib/src/classification/memmem.rs

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
use crate::{
44
input::{error::InputError, Input},
55
result::InputRecorder,
6-
string_pattern::StringPattern,
6+
string_pattern::{matcher::StringPatternMatcher, StringPattern},
77
BLOCK_SIZE,
88
};
99

@@ -16,14 +16,19 @@ pub trait Memmem<'i, 'b, 'r, I: Input, const N: usize> {
1616
/// - `start_idx` &ndash; index of the start of search, either falling inside `first_block`,
1717
/// or at the start of the next block.
1818
///
19+
/// # Returns
20+
/// `None` if there was no match.
21+
/// Otherwise, `Some((i, j, block))` where `i` and `j` delimit the match exactly,
22+
/// and `block` is the input block in which the start of the match occurred.
23+
///
1924
/// # Errors
2025
/// Errors when reading the underlying [`Input`] are propagated.
2126
fn find_label(
2227
&mut self,
2328
first_block: Option<I::Block<'i, N>>,
2429
start_idx: usize,
2530
label: &StringPattern,
26-
) -> Result<Option<(usize, I::Block<'i, N>)>, InputError>;
31+
) -> Result<Option<(usize, usize, I::Block<'i, N>)>, InputError>;
2732
}
2833

2934
pub(crate) mod nosimd;
@@ -39,19 +44,21 @@ pub(crate) mod sse2_32;
3944
pub(crate) mod sse2_64;
4045

4146
pub(crate) trait MemmemImpl {
42-
type Classifier<'i, 'b, 'r, I, R>: Memmem<'i, 'b, 'r, I, BLOCK_SIZE>
47+
type Classifier<'i, 'b, 'r, I, SM, R>: Memmem<'i, 'b, 'r, I, BLOCK_SIZE>
4348
where
4449
I: Input + 'i,
50+
SM: StringPatternMatcher,
4551
<I as Input>::BlockIterator<'i, 'r, R, BLOCK_SIZE>: 'b,
4652
R: InputRecorder<<I as Input>::Block<'i, BLOCK_SIZE>> + 'r,
4753
'i: 'r;
4854

49-
fn memmem<'i, 'b, 'r, I, R>(
55+
fn memmem<'i, 'b, 'r, I, SM, R>(
5056
input: &'i I,
5157
iter: &'b mut <I as Input>::BlockIterator<'i, 'r, R, BLOCK_SIZE>,
52-
) -> Self::Classifier<'i, 'b, 'r, I, R>
58+
) -> Self::Classifier<'i, 'b, 'r, I, SM, R>
5359
where
5460
I: Input,
61+
SM: StringPatternMatcher,
5562
R: InputRecorder<<I as Input>::Block<'i, BLOCK_SIZE>>,
5663
'i: 'r;
5764
}
Lines changed: 65 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -1,59 +1,68 @@
11
use super::{shared::mask_32, shared::vector_256, *};
22
use crate::input::{error::InputErrorConvertible, InputBlockIterator};
3+
use std::marker::PhantomData;
34

45
const SIZE: usize = 32;
56

67
pub(crate) struct Constructor;
78

89
impl MemmemImpl for Constructor {
9-
type Classifier<'i, 'b, 'r, I, R>
10-
= Avx2MemmemClassifier32<'i, 'b, 'r, I, R>
10+
type Classifier<'i, 'b, 'r, I, SM, R>
11+
= Avx2MemmemClassifier32<'i, 'b, 'r, I, SM, R>
1112
where
1213
I: Input + 'i,
14+
SM: StringPatternMatcher,
1315
<I as Input>::BlockIterator<'i, 'r, R, BLOCK_SIZE>: 'b,
1416
R: InputRecorder<<I as Input>::Block<'i, BLOCK_SIZE>> + 'r,
1517
'i: 'r;
1618

17-
fn memmem<'i, 'b, 'r, I, R>(
19+
fn memmem<'i, 'b, 'r, I, SM, R>(
1820
input: &'i I,
1921
iter: &'b mut <I as Input>::BlockIterator<'i, 'r, R, BLOCK_SIZE>,
20-
) -> Self::Classifier<'i, 'b, 'r, I, R>
22+
) -> Self::Classifier<'i, 'b, 'r, I, SM, R>
2123
where
2224
I: Input,
25+
SM: StringPatternMatcher,
2326
R: InputRecorder<<I as Input>::Block<'i, BLOCK_SIZE>>,
2427
'i: 'r,
2528
{
26-
Self::Classifier { input, iter }
29+
Self::Classifier::new(input, iter)
2730
}
2831
}
2932

30-
pub(crate) struct Avx2MemmemClassifier32<'i, 'b, 'r, I, R>
33+
pub(crate) struct Avx2MemmemClassifier32<'i, 'b, 'r, I, SM, R>
3134
where
3235
I: Input,
3336
R: InputRecorder<I::Block<'i, SIZE>> + 'r,
3437
{
3538
input: &'i I,
3639
iter: &'b mut I::BlockIterator<'i, 'r, R, SIZE>,
40+
phantom_data: PhantomData<SM>,
3741
}
3842

39-
impl<'i, 'b, 'r, I, R> Avx2MemmemClassifier32<'i, 'b, 'r, I, R>
43+
impl<'i, 'b, 'r, I, SM, R> Avx2MemmemClassifier32<'i, 'b, 'r, I, SM, R>
4044
where
4145
I: Input,
46+
SM: StringPatternMatcher,
4247
R: InputRecorder<I::Block<'i, SIZE>>,
4348
'i: 'r,
4449
{
4550
#[inline]
4651
#[allow(dead_code)]
4752
pub(crate) fn new(input: &'i I, iter: &'b mut I::BlockIterator<'i, 'r, R, SIZE>) -> Self {
48-
Self { input, iter }
53+
Self {
54+
input,
55+
iter,
56+
phantom_data: PhantomData,
57+
}
4958
}
5059

5160
#[inline(always)]
5261
unsafe fn find_empty(
5362
&mut self,
54-
label: &StringPattern,
63+
pattern: &StringPattern,
5564
mut offset: usize,
56-
) -> Result<Option<(usize, I::Block<'i, SIZE>)>, InputError> {
65+
) -> Result<Option<(usize, usize, I::Block<'i, SIZE>)>, InputError> {
5766
let classifier = vector_256::BlockClassifier256::new(b'"', b'"');
5867
let mut previous_block: u32 = 0;
5968

@@ -63,12 +72,8 @@ where
6372
let mut result = (previous_block | (classified.first << 1)) & classified.second;
6473
while result != 0 {
6574
let idx = result.trailing_zeros() as usize;
66-
if self
67-
.input
68-
.is_member_match(offset + idx - 1, offset + idx + 1, label)
69-
.e()?
70-
{
71-
return Ok(Some((offset + idx - 1, block)));
75+
if let Some(to) = self.input.pattern_match_from::<SM>(offset + idx - 1, pattern).e()? {
76+
return Ok(Some((offset + idx - 1, to, block)));
7277
}
7378
result &= !(1 << idx);
7479
}
@@ -86,28 +91,36 @@ where
8691
#[inline(always)]
8792
unsafe fn find_letter(
8893
&mut self,
89-
label: &StringPattern,
94+
pattern: &StringPattern,
9095
mut offset: usize,
91-
) -> Result<Option<(usize, I::Block<'i, SIZE>)>, InputError> {
92-
let classifier = vector_256::BlockClassifier256::new(label.unquoted()[0], b'"');
93-
let mut previous_block: u32 = 0;
96+
) -> Result<Option<(usize, usize, I::Block<'i, SIZE>)>, InputError> {
97+
let classifier = vector_256::BlockClassifier256::new(pattern.unquoted()[0], b'"');
98+
let mut previous_slash: u32 = 0;
99+
let mut previous_first: u32 = 0;
100+
let mut previous_quote: u32 = 0;
94101

95102
while let Some(block) = self.iter.next().e()? {
96103
let classified = classifier.classify_block(&block);
97104

98-
if let Some(res) = mask_32::find_in_mask(
105+
if let Some((from, to)) = mask_32::find_in_mask::<_, SM>(
99106
self.input,
100-
label,
101-
previous_block,
107+
pattern,
108+
previous_slash,
109+
previous_quote,
110+
previous_first,
102111
classified.first,
103112
classified.second,
113+
classified.slashes,
114+
classified.quotes,
104115
offset,
105116
)? {
106-
return Ok(Some((res, block)));
117+
return Ok(Some((from, to, block)));
107118
}
108119

109120
offset += SIZE;
110-
previous_block = classified.first >> (SIZE - 1);
121+
previous_slash = classified.slashes >> (SIZE - 1);
122+
previous_first = classified.first >> (SIZE - 1);
123+
previous_quote = classified.quotes >> (SIZE - 2);
111124
}
112125

113126
Ok(None)
@@ -116,43 +129,52 @@ where
116129
#[inline(always)]
117130
unsafe fn find_label_avx2(
118131
&mut self,
119-
label: &StringPattern,
132+
pattern: &StringPattern,
120133
mut offset: usize,
121-
) -> Result<Option<(usize, I::Block<'i, SIZE>)>, InputError> {
122-
if label.unquoted().is_empty() {
123-
return self.find_empty(label, offset);
124-
} else if label.unquoted().len() == 1 {
125-
return self.find_letter(label, offset);
134+
) -> Result<Option<(usize, usize, I::Block<'i, SIZE>)>, InputError> {
135+
if pattern.unquoted().is_empty() {
136+
return self.find_empty(pattern, offset);
137+
} else if pattern.unquoted().len() == 1 {
138+
return self.find_letter(pattern, offset);
126139
}
127140

128-
let classifier = vector_256::BlockClassifier256::new(label.unquoted()[0], label.unquoted()[1]);
129-
let mut previous_block: u32 = 0;
141+
let classifier = vector_256::BlockClassifier256::new(pattern.unquoted()[0], pattern.unquoted()[1]);
142+
let mut previous_slash: u32 = 0;
143+
let mut previous_first: u32 = 0;
144+
let mut previous_quote: u32 = 0;
130145

131146
while let Some(block) = self.iter.next().e()? {
132147
let classified = classifier.classify_block(&block);
133148

134-
if let Some(res) = mask_32::find_in_mask(
149+
if let Some((from, to)) = mask_32::find_in_mask::<_, SM>(
135150
self.input,
136-
label,
137-
previous_block,
151+
pattern,
152+
previous_slash,
153+
previous_quote,
154+
previous_first,
138155
classified.first,
139156
classified.second,
157+
classified.slashes,
158+
classified.quotes,
140159
offset,
141160
)? {
142-
return Ok(Some((res, block)));
161+
return Ok(Some((from, to, block)));
143162
}
144163

145164
offset += SIZE;
146-
previous_block = classified.first >> (SIZE - 1);
165+
previous_slash = classified.slashes >> (SIZE - 1);
166+
previous_first = classified.first >> (SIZE - 1);
167+
previous_quote = classified.quotes >> (SIZE - 2);
147168
}
148169

149170
Ok(None)
150171
}
151172
}
152173

153-
impl<'i, 'b, 'r, I, R> Memmem<'i, 'b, 'r, I, SIZE> for Avx2MemmemClassifier32<'i, 'b, 'r, I, R>
174+
impl<'i, 'b, 'r, I, SM, R> Memmem<'i, 'b, 'r, I, SIZE> for Avx2MemmemClassifier32<'i, 'b, 'r, I, SM, R>
154175
where
155176
I: Input,
177+
SM: StringPatternMatcher,
156178
R: InputRecorder<I::Block<'i, SIZE>>,
157179
'i: 'r,
158180
{
@@ -161,15 +183,15 @@ where
161183
&mut self,
162184
first_block: Option<I::Block<'i, SIZE>>,
163185
start_idx: usize,
164-
label: &StringPattern,
165-
) -> Result<Option<(usize, I::Block<'i, SIZE>)>, InputError> {
186+
pattern: &StringPattern,
187+
) -> Result<Option<(usize, usize, I::Block<'i, SIZE>)>, InputError> {
166188
if let Some(b) = first_block {
167-
if let Some(res) = shared::find_label_in_first_block(self.input, b, start_idx, label)? {
189+
if let Some(res) = shared::find_pattern_in_first_block::<_, SM, SIZE>(self.input, b, start_idx, pattern)? {
168190
return Ok(Some(res));
169191
}
170192
}
171193
let next_block_offset = self.iter.get_offset();
172194
// SAFETY: target feature invariant
173-
unsafe { self.find_label_avx2(label, next_block_offset) }
195+
unsafe { self.find_label_avx2(pattern, next_block_offset) }
174196
}
175197
}

0 commit comments

Comments
 (0)