Skip to content

Commit 61ac29b

Browse files
committed
✨ Add {m} and {n,m} in PythonRegex
1 parent e7214f7 commit 61ac29b

File tree

2 files changed

+92
-4
lines changed

2 files changed

+92
-4
lines changed

Diff for: pyformlang/regular_expression/python_regex.py

+66-3
Original file line numberDiff line numberDiff line change
@@ -176,7 +176,7 @@ def _recombine(regex_to_recombine):
176176
if regex_to_recombine[idx] == "\\x" and idx < len(regex_to_recombine) - 2 \
177177
and regex_to_recombine[idx + 1] in HEXASTRING \
178178
and regex_to_recombine[idx + 2] in HEXASTRING:
179-
next_str = "".join(regex_to_recombine[idx+1:idx+3])
179+
next_str = "".join(regex_to_recombine[idx + 1:idx + 3])
180180
s_trans = chr(int(next_str, 16))
181181
temp.append(TRANSFORMATIONS.get(s_trans, s_trans))
182182
idx += 3
@@ -197,12 +197,12 @@ def _recombine(regex_to_recombine):
197197
temp.append(TRANSFORMATIONS.get(name, name))
198198
idx = idx_end + 1
199199
elif regex_to_recombine[idx] == "\\u":
200-
unicode_str = "".join(regex_to_recombine[idx+1: idx+5])
200+
unicode_str = "".join(regex_to_recombine[idx + 1: idx + 5])
201201
decoded = chr(int(unicode_str, 16))
202202
temp.append(TRANSFORMATIONS.get(decoded, decoded))
203203
idx = idx + 5
204204
elif regex_to_recombine[idx] == "\\U":
205-
unicode_str = "".join(regex_to_recombine[idx+1: idx+9])
205+
unicode_str = "".join(regex_to_recombine[idx + 1: idx + 9])
206206
decoded = chr(int(unicode_str, 16))
207207
temp.append(TRANSFORMATIONS.get(decoded, decoded))
208208
idx = idx + 9
@@ -291,8 +291,71 @@ def _preprocess_positive_closure(self):
291291
for j in range(pos_opening, len(regex_temp)):
292292
regex_temp.append(regex_temp[j])
293293
regex_temp.append("*")
294+
regex_temp = self._add_repetition(regex_temp)
294295
self._python_regex = "".join(regex_temp)
295296

297+
@staticmethod
def _is_repetition(regex_list, idx):
    """Parse a ``{m}`` or ``{n,m}`` quantifier starting at ``idx``.

    Parameters
    ----------
    regex_list : sequence of str
        The tokens of the regex being preprocessed.
    idx : int
        The position to inspect.

    Returns
    -------
    tuple of int or None
        ``(m, end)`` for ``{m}`` and ``(n, m, end)`` for ``{n,m}``, where
        ``end`` is the index of the closing brace. ``None`` when the token
        at ``idx`` is not ``"{"``, when no closing brace follows, or when
        the content between the braces is not one or two comma-separated
        non-negative integers.
    """
    if regex_list[idx] != "{":
        return None
    end = next(
        (i for i in range(idx + 1, len(regex_list)) if regex_list[i] == "}"),
        None,
    )
    if end is None:
        return None

    def is_count(text):
        # str.isdigit() also accepts characters such as "²" that int()
        # cannot convert, so only plain ASCII digits are allowed here.
        return bool(text) and all(char in "0123456789" for char in text)

    bounds = "".join(regex_list[idx + 1:end]).split(",")
    if len(bounds) > 2 or not all(is_count(bound) for bound in bounds):
        return None
    return tuple(int(bound) for bound in bounds) + (end,)
314+
315+
@staticmethod
def _find_repeated_sequence(regex_list):
    """Return the trailing unit of ``regex_list`` that a quantifier repeats.

    Parameters
    ----------
    regex_list : list of str
        The tokens emitted so far, i.e. everything before the quantifier.

    Returns
    -------
    list of str
        The last token alone when it is not ``")"``; otherwise the whole
        parenthesised group that this ``")"`` closes, parentheses
        included. An empty list when ``regex_list`` is empty or when the
        closing parenthesis has no matching opening one.
    """
    if not regex_list:
        # Nothing precedes the quantifier, so there is nothing to repeat.
        return []
    if regex_list[-1] != ")":
        return [regex_list[-1]]
    depth = 0
    # Walk backwards from the final ")" until its matching "(" is reached.
    for start in range(len(regex_list) - 1, -1, -1):
        token = regex_list[start]
        if token == ")":
            depth += 1
        elif token == "(":
            depth -= 1
            if depth == 0:
                return list(regex_list[start:])
    return []
333+
334+
def _add_repetition(self, regex_list):
    """Expand ``{m}`` and ``{n,m}`` quantifiers into explicit copies.

    Each quantifier is replaced by copies of the unit it applies to (the
    previous token or the previous parenthesised group): mandatory copies
    up to the minimum, then copies followed by ``"?"`` up to the maximum.

    Parameters
    ----------
    regex_list : list of str
        The tokens of the regex being preprocessed.

    Returns
    -------
    list of str
        The tokens with every recognised quantifier expanded.
    """
    res = []
    idx = 0
    while idx < len(regex_list):
        # A quantifier with nothing before it has nothing to repeat; its
        # opening brace is then kept as an ordinary token.
        rep = self._is_repetition(regex_list, idx) if res else None
        if rep is None:
            res.append(regex_list[idx])
            idx += 1
            continue
        # rep is (m, end) for {m} and (n, m, end) for {n,m}.
        *bounds, end = rep
        min_rep, max_rep = bounds[0], bounds[-1]
        repeated = self._find_repeated_sequence(res)
        if min_rep == 0:
            if max_rep == 0:
                # Zero copies requested: drop the one already emitted.
                del res[len(res) - len(repeated):]
            else:
                # The copy already emitted becomes optional as well, so
                # the total ranges over 0..max_rep and not 1..max_rep + 1.
                res.append("?")
                for _ in range(max_rep - 1):
                    res.extend(repeated)
                    res.append("?")
        else:
            # One copy is already in res; add the other mandatory ones.
            for _ in range(min_rep - 1):
                res.extend(repeated)
            for _ in range(min_rep, max_rep):
                res.extend(repeated)
                res.append("?")
        idx = end + 1
    return res
358+
296359
def _preprocess_optional(self):
297360
regex_temp = []
298361
for symbol in self._python_regex:

Diff for: pyformlang/regular_expression/tests/test_python_regex.py

+26-1
Original file line numberDiff line numberDiff line change
@@ -199,7 +199,7 @@ def test_shortcut_word(self):
199199
def _test_compare(self, regex, s_test):
    """Check that PythonRegex and ``re`` agree on ``s_test``.

    ``fullmatch`` is used rather than ``match`` because ``match`` only
    anchors at the start of the string, whereas the result of
    ``accepts`` is compared against a match of the whole string.
    """
    r_pyformlang = PythonRegex(regex)
    r_python = re.compile(regex)
    self.assertEqual(r_python.fullmatch(s_test) is not None, r_pyformlang.accepts(s_test))
203203

204204
def test_backslash(self):
205205
self._test_compare(".*", "]")
@@ -283,3 +283,28 @@ def test_dot_harder(self):
283283
self._test_compare(r"\.", ".")
284284
self._test_compare(r"\\\.", "\\a")
285285
self._test_compare(r"\\\.", "\\.")
286+
287+
def test_single_repetition(self):
    """Compare PythonRegex with ``re`` on ``{m}`` quantifiers."""
    cases = (
        (r"\d{3}-\d{3}-\d{4}", "012-876-3789"),
        (r"a{5}b", "ab"),
        (r"a{5}b", "aaaaab"),
        # An unterminated brace is checked as well.
        (r"a{5b", "aaaaab"),
        (r"a{5b", "a{5b"),
        (r"T{4}P{3}", "TTTTTTPPPPPPPPPPPP"),
    )
    for pattern, word in cases:
        self._test_compare(pattern, word)
294+
295+
def test_range_repetition(self):
    """Compare PythonRegex with ``re`` on ``{n,m}`` quantifiers."""
    cases = (
        (r"a{2,5}b", "ab"),
        (r"a{2,5}b", "aab"),
        (r"a{2,5}b", "aaaaab"),
        (r"a{2,5}b", "aaaaaab"),
        # Three comma-separated numbers are checked as well.
        (r"a{2,5,7}b", "aaaaab"),
        (r"a{2,5,7}b", "a{2,5,7}b"),
        (r"ab{2,5}", "ab"),
        (r"ab{2,5}", "abbb"),
        (r"ab{2,5}", "abbbbb"),
        (r"ab{2,5}", "abbbbbbbbb"),
        (r"[a-z]{1,3}", ""),
        (r"[a-z]{1,3}", "d"),
        (r"[a-z]{1,3}", "do"),
        (r"[a-z]{1,3}", "dpo"),
        (r"[a-z]{1,3}", "dpoz"),
    )
    for pattern, word in cases:
        self._test_compare(pattern, word)

0 commit comments

Comments
 (0)