|
151 | 151 | } |
152 | 152 |
|
153 | 153 | TEXT_DATE_PATTERN = re.compile(r"[.:,_/ -]|^\d+$") |
| 154 | +# gate for try_date_expr: a plausible date has a 4-digit year or at least one letter (e.g. a month name) |
| 155 | +FOUR_DIGITS = re.compile(r"\d{4}") |
154 | 156 |
|
155 | 157 | DISCARD_PATTERNS = re.compile( |
156 | 158 | r"^\d{2}:\d{2}(?: |:|$)|" |
|
168 | 170 | TEXT_PATTERNS = re.compile( |
169 | 171 | r'(?:date[^0-9"]{,20}|updated|last-modified|published|posted|on)(?:[ :])*?([0-9]{1,4})[./]([0-9]{1,2})[./]([0-9]{2,4})|' # EN |
170 | 172 | r"(?:Datum|Stand|Veröffentlicht am):? ?([0-9]{1,2})\.([0-9]{1,2})\.([0-9]{2,4})|" # DE |
171 | | - r"(?:güncellen?me|yayı(?:m|n)lan?ma) *?(?:tarihi)? *?:? *?([0-9]{1,2})[./]([0-9]{1,2})[./]([0-9]{2,4})|" |
| 173 | + # bounded space-runs ({0,9}? not *?) to prevent ReDoS |
| 174 | + r"(?:güncellen?me|yayı(?:m|n)lan?ma) {0,9}?(?:tarihi)? {0,9}?:? {0,9}?([0-9]{1,2})[./]([0-9]{1,2})[./]([0-9]{2,4})|" |
172 | 175 | r"([0-9]{1,2})[./]([0-9]{1,2})[./]([0-9]{2,4}) *?(?:'de|'da|'te|'ta|’de|’da|’te|’ta|tarihinde) *(?:güncellendi|yayı(?:m|n)landı)", # TR |
173 | 176 | re.I, |
174 | 177 | ) |
|
182 | 185 |
|
183 | 186 | # extensive search patterns |
184 | 187 | YEAR_PATTERN = re.compile(rf"^\D?({YEAR_RE})") |
| 188 | +# bounded gap (\D{0,99}, not unbounded \D*) to avoid quadratic backtracking (ReDoS) |
185 | 189 | COPYRIGHT_PATTERN = re.compile( |
186 | | - rf"(?:©|\©|Copyright|\(c\))\D*(?:{YEAR_RE})?-?({YEAR_RE})\D" |
| 190 | + rf"(?:©|\©|Copyright|\(c\))\D{{0,99}}(?:{YEAR_RE})?-?({YEAR_RE})\D" |
187 | 191 | ) |
188 | 192 | THREE_PATTERN = re.compile(r"/([0-9]{4}/[0-9]{2}/[0-9]{2})[01/]") |
189 | 193 | THREE_CATCH = re.compile(r"([0-9]{4})/([0-9]{2})/([0-9]{2})") |
|
201 | 205 | ) |
202 | 206 | SLASHES_YEAR = re.compile(r"([0-9]{2})$") |
203 | 207 | YYYYMM_PATTERN = re.compile(r"\D([12][0-9]{3}[/.-](?:1[0-2]|0[1-9]))\D") |
204 | | -YYYYMM_CATCH = re.compile(rf"({YEAR_RE})[/.-](1[0-2]|0[1-9]|)") |
| 208 | +YYYYMM_CATCH = re.compile(rf"({YEAR_RE})[/.-](1[0-2]|0[1-9])") |
205 | 209 | MMYYYY_PATTERN = re.compile(r"\D([01]?[0-9][/.-][12][0-9]{3})\D") |
206 | 210 | MMYYYY_YEAR = re.compile(rf"({YEAR_RE})\D?$") |
207 | 211 | SIMPLE_PATTERN = re.compile(rf"(?<!w3.org)\D({YEAR_RE})\D") |
@@ -409,8 +413,13 @@ def try_date_expr( |
409 | 413 | return customresult |
410 | 414 |
|
411 | 415 | # use slow but extensive search |
412 | | - # additional filters to prevent computational cost |
413 | | - if extensive_search and TEXT_DATE_PATTERN.search(string): |
| 416 | + # additional filters to prevent computational cost: only hand strings that |
| 417 | + # could be a date (4-digit year or a letter) to the slow external parser |
| 418 | + if ( |
| 419 | + extensive_search |
| 420 | + and TEXT_DATE_PATTERN.search(string) |
| 421 | + and (FOUR_DIGITS.search(string) or any(c.isalpha() for c in string)) |
| 422 | + ): |
414 | 423 | # send to date parser |
415 | 424 | dateparser_result = external_date_parser(string, outputformat) |
416 | 425 | if is_valid_date( |
|
0 commit comments