Skip to content

Commit ae30798

Browse files
committed
Optimize backreference handling
1 parent 439ca98 commit ae30798

File tree

3 files changed

+45
-39
lines changed

3 files changed

+45
-39
lines changed

regex_enumerator/regex_enumerator.py

Lines changed: 6 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -4,7 +4,12 @@
44

55
class RegexEnumerator:
66
def __init__(self, regex: str, additional_charset: str | list[str] = None) -> None:
7-
default_charset = [chr(c) for c in range(32, 127)]
7+
default_charset = [' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/',
8+
'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '=', '>', '?',
9+
'@', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O',
10+
'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', '\\', ']', '^', '_',
11+
'`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
12+
'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '|', '}', '~']
813

914
if additional_charset is None:
1015
additional = []

regex_enumerator/regex_parser.py

Lines changed: 8 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -82,17 +82,16 @@ def _parseRegex(self, to_close: bool) -> RegexTree:
8282
elements.append(
8383
CharClass(chars, min_len, max_len))
8484
case '.':
85-
chars = list(self.charset)
8685
min_len, max_len = self._parseQuantifier()
8786
elements.append(
88-
CharClass(chars, min_len, max_len))
87+
CharClass(self.charset, min_len, max_len))
8988
case '\\':
9089
reference = self._parseBackReferenceLookahead()
9190
if reference is None:
9291
chars = self._parseEscapeChar()
9392
min_len, max_len = self._parseQuantifier()
9493
elements.append(
95-
CharClass([chars], min_len, max_len))
94+
CharClass(chars, min_len, max_len))
9695
continue
9796
if isinstance(reference, str):
9897
if reference not in named_groups:
@@ -110,7 +109,7 @@ def _parseRegex(self, to_close: bool) -> RegexTree:
110109
case _:
111110
min_len, max_len = self._parseQuantifier()
112111
elements.append(
113-
CharClass([char], min_len, max_len))
112+
CharClass(char, min_len, max_len))
114113

115114
if to_close:
116115
self._raise_error("Unmatched opening parenthesis")
@@ -191,7 +190,7 @@ def _parseEscapeChar(self) -> str:
191190
self._raise_error("Unicode property not supported")
192191
case _: return char
193192

194-
def _parseCharClass(self) -> list[str]:
193+
def _parseCharClass(self) -> str:
195194
chars_list: list[str] = []
196195
first_char = None
197196
range_divider = False
@@ -249,11 +248,12 @@ def _parseCharClass(self) -> list[str]:
249248
elif first_char is not None:
250249
chars_list.append(first_char)
251250

251+
charset = ''.join(sorted(set(''.join(chars_list))))
252+
252253
if negated:
253-
chars_list = [
254-
c for c in self.charset if c not in ''.join(chars_list)]
254+
return ''.join(c for c in self.charset if c not in charset)
255255

256-
return chars_list
256+
return charset
257257

258258
def _parseQuantifier(self) -> tuple[int, int | None]:
259259

regex_enumerator/regex_tree.py

Lines changed: 31 additions & 30 deletions
Original file line number | Diff line number | Diff line change
@@ -3,12 +3,12 @@ class RegexTree:
33

44

55
class CharClass:
6-
def __init__(self, chars_list: list[str], min_len: int, max_len: int | None):
6+
def __init__(self, charset: str, min_len: int, max_len: int | None):
77
self._index = 0
8-
self._chars: str = ''.join(sorted(set(''.join(chars_list))))
8+
self._charset = charset
99
self._min_len = min_len
1010
self._max_len = max_len
11-
self._base = len(self._chars)
11+
self._base = len(charset)
1212
self.done = self._base == 0 or self._max_len == 0
1313
self.current: list[str] = self._first()
1414

@@ -18,7 +18,7 @@ def _first(self) -> list[str]:
1818

1919
if self._base == 1 and self._max_len is not None:
2020
self.done = True
21-
result = [self._chars *
21+
result = [self._charset *
2222
i for i in range(self._min_len, self._max_len + 1)]
2323
return result
2424

@@ -27,7 +27,7 @@ def _first(self) -> list[str]:
2727

2828
result = ['']
2929
for _ in range(self._min_len):
30-
result = [pfx + sfx for pfx in self._chars for sfx in result]
30+
result = [pfx + sfx for pfx in self._charset for sfx in result]
3131

3232
self._last = result
3333
return result
@@ -39,7 +39,7 @@ def next(self) -> list[str]:
3939
if self._max_len is not None and self._index + self._min_len == self._max_len:
4040
self.done = True
4141

42-
result = [pfx + sfx for pfx in self._last for sfx in self._chars]
42+
result = [pfx + sfx for pfx in self._last for sfx in self._charset]
4343
self.current.extend(result)
4444
self._last = result
4545
return result
@@ -118,26 +118,18 @@ def next(self) -> set[str]:
118118
self._index = index
119119
result: list[tuple[str, dict[RegexTree, str]]] = []
120120

121-
if isinstance(self._elements[0], CharClass):
121+
if isinstance(self._elements[0], RegexTree) and len(self._elements[0].references):
122122
for string in self._elements[0].next() if index == 0 else self._elements[0].current:
123-
result.append((string, {}))
123+
result.append((string, {self._elements[0]: string}))
124124
else:
125125
for string in self._elements[0].next() if index == 0 else self._elements[0].current:
126-
result.append((string, {self._elements[0]: string}))
126+
result.append((string, {}))
127127

128128
done = self._elements[0].done
129129

130130
for i, element in enumerate(self._elements[1:], start=1):
131131
temp = []
132-
if isinstance(element, CharClass):
133-
for sfx in element.next() if i == index else element.current:
134-
for pfx in result:
135-
temp.append((pfx[0] + sfx, pfx[1]))
136-
elif isinstance(element, RegexTree):
137-
for sfx in element.next() if i == index else element.current:
138-
for pfx in result:
139-
temp.append((pfx[0] + sfx, {**pfx[1], element: sfx}))
140-
else:
132+
if isinstance(element, BackReference):
141133
if i == index:
142134
element.next()
143135
for pfx in result:
@@ -146,6 +138,14 @@ def next(self) -> set[str]:
146138
for sfx in element.current[reference]:
147139
temp.append(
148140
(pfx[0] + sfx, pfx[1]))
141+
elif isinstance(element, RegexTree) and len(element.references):
142+
for sfx in element.next() if i == index else element.current:
143+
for pfx in result:
144+
temp.append((pfx[0] + sfx, {**pfx[1], element: sfx}))
145+
else:
146+
for sfx in element.next() if i == index else element.current:
147+
for pfx in result:
148+
temp.append((pfx[0] + sfx, pfx[1]))
149149
result = temp
150150
done = done and element.done
151151

@@ -204,33 +204,34 @@ def _first(self) -> set[str]:
204204

205205
result: list[tuple[str, dict[RegexTree, str]]] = []
206206

207-
if isinstance(self._elements[0], CharClass):
207+
if isinstance(self._elements[0], RegexTree) and len(self._elements[0].references):
208208
for char in self._elements[0].current:
209-
result.append((char, {}))
209+
result.append((char, {self._elements[0]: char}))
210210
else:
211211
for char in self._elements[0].current:
212-
result.append((char, {self._elements[0]: char}))
212+
result.append((char, {}))
213213

214214
done = self._elements[0].done
215215

216216
for element in self._elements[1:]:
217217
temp: list[tuple[str, dict[RegexTree, str]]] = []
218218
done = done and element.done
219-
if isinstance(element, CharClass):
220-
for pfx in result:
221-
for sfx in element.current:
222-
temp.append((pfx[0] + sfx, pfx[1]))
223-
elif isinstance(element, RegexTree):
224-
for pfx in result:
225-
for sfx in element.current:
226-
temp.append((pfx[0] + sfx, {**pfx[1], element: sfx}))
227-
else:
219+
if isinstance(element, BackReference):
228220
for pfx in result:
229221
reference = pfx[1][element.reference]
230222
assert reference is not None
231223
for sfx in element.current[reference]:
232224
temp.append(
233225
(pfx[0] + sfx, pfx[1]))
226+
elif isinstance(element, RegexTree) and len(element.references):
227+
for pfx in result:
228+
for sfx in element.current:
229+
temp.append((pfx[0] + sfx, {**pfx[1], element: sfx}))
230+
else:
231+
for pfx in result:
232+
for sfx in element.current:
233+
temp.append((pfx[0] + sfx, pfx[1]))
234+
234235
result = temp
235236

236237
self.done = done

0 commit comments

Comments
 (0)