Skip to content

Commit 93b98a6

Browse files
authored
maintenance: harden code and add tests to prevent bugs (#191)
* maintenance: code hardening to prevent bugs
* fix: remove parser calls guard
* remove function
* consolidate tests and CI
1 parent b895282 commit 93b98a6

9 files changed

Lines changed: 273 additions & 37 deletions

File tree

.github/workflows/codeql.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ jobs:
2222

2323
steps:
2424
- name: Checkout
25-
uses: actions/checkout@v4
25+
uses: actions/checkout@v5
2626

2727
- name: Initialize CodeQL
2828
uses: github/codeql-action/init@v3

.github/workflows/tests.yml

Lines changed: 17 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,9 @@ on:
99
pull_request:
1010
branches: [ master ]
1111

12+
permissions:
13+
contents: read
14+
1215
env:
1316
REF_PY_VERSION: "3.13"
1417

@@ -37,25 +40,14 @@ jobs:
3740
with:
3841
python-version: ${{ matrix.python-version }}
3942

40-
- name: Upgrade pip
41-
run: python -m pip install --upgrade pip
42-
43-
- name: Get pip cache dir
44-
id: pip-cache
45-
shell: bash
46-
run: |
47-
echo "dir=$(pip cache dir)" >> $GITHUB_OUTPUT
48-
49-
- name: pip cache
50-
uses: actions/cache@v4
43+
- name: Install uv
44+
uses: astral-sh/setup-uv@v5
5145
with:
52-
path: ${{ steps.pip-cache.outputs.dir }}
53-
key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements.txt') }}
54-
restore-keys: |
55-
${{ runner.os }}-pip-
46+
enable-cache: true
47+
cache-dependency-glob: pyproject.toml
5648

5749
- name: Install dependencies
58-
run: python -m pip install --upgrade -e ".[dev]"
50+
run: uv pip install --system -e ".[dev]"
5951

6052
# tests
6153
- name: Lint with ruff
@@ -64,17 +56,24 @@ jobs:
6456
- name: Code format and type checking
6557
if: ${{ matrix.python-version == env.REF_PY_VERSION }}
6658
run: |
67-
ruff format --check htmldate
59+
ruff format --check --diff htmldate tests
6860
mypy -p htmldate
6961
7062
- name: Install full dependencies
7163
if: ${{ matrix.env.MINIMAL == 'false'}}
72-
run: python -m pip install -e ".[all]"
64+
run: uv pip install --system -e ".[all]"
7365

7466
- name: Test with pytest
7567
run: |
7668
python -m pytest --cov=./ --cov-report=xml
7769
70+
# benchmark regression gate (timezone pinned for a reproducible F1-score)
71+
- name: Evaluation quality gate
72+
if: ${{ matrix.env.MINIMAL == 'false' && matrix.python-version == env.REF_PY_VERSION }}
73+
env:
74+
TZ: Europe/Berlin
75+
run: cd tests && python eval_gate.py
76+
7877
# coverage
7978
- name: Upload coverage to Codecov
8079
if: ${{ matrix.env.MINIMAL == 'false' && matrix.python-version == env.REF_PY_VERSION }}

htmldate/extractors.py

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,8 @@
151151
}
152152

153153
TEXT_DATE_PATTERN = re.compile(r"[.:,_/ -]|^\d+$")
154+
# gate for try_date_expr: a plausible date has a 4-digit year or at least one letter (e.g. a month name)
155+
FOUR_DIGITS = re.compile(r"\d{4}")
154156

155157
DISCARD_PATTERNS = re.compile(
156158
r"^\d{2}:\d{2}(?: |:|$)|"
@@ -168,7 +170,8 @@
168170
TEXT_PATTERNS = re.compile(
169171
r'(?:date[^0-9"]{,20}|updated|last-modified|published|posted|on)(?:[ :])*?([0-9]{1,4})[./]([0-9]{1,2})[./]([0-9]{2,4})|' # EN
170172
r"(?:Datum|Stand|Veröffentlicht am):? ?([0-9]{1,2})\.([0-9]{1,2})\.([0-9]{2,4})|" # DE
171-
r"(?:güncellen?me|yayı(?:m|n)lan?ma) *?(?:tarihi)? *?:? *?([0-9]{1,2})[./]([0-9]{1,2})[./]([0-9]{2,4})|"
173+
# bounded space-runs ({0,9}? not *?) to prevent ReDoS
174+
r"(?:güncellen?me|yayı(?:m|n)lan?ma) {0,9}?(?:tarihi)? {0,9}?:? {0,9}?([0-9]{1,2})[./]([0-9]{1,2})[./]([0-9]{2,4})|"
172175
r"([0-9]{1,2})[./]([0-9]{1,2})[./]([0-9]{2,4}) *?(?:'de|'da|'te|'ta|’de|’da|’te|’ta|tarihinde) *(?:güncellendi|yayı(?:m|n)landı)", # TR
173176
re.I,
174177
)
@@ -182,8 +185,9 @@
182185

183186
# extensive search patterns
184187
YEAR_PATTERN = re.compile(rf"^\D?({YEAR_RE})")
188+
# bounded gap (\D{0,99}, not unbounded \D*) to avoid quadratic backtracking (ReDoS)
185189
COPYRIGHT_PATTERN = re.compile(
186-
rf"(?:©|\©|Copyright|\(c\))\D*(?:{YEAR_RE})?-?({YEAR_RE})\D"
190+
rf"(?:©|\©|Copyright|\(c\))\D{{0,99}}(?:{YEAR_RE})?-?({YEAR_RE})\D"
187191
)
188192
THREE_PATTERN = re.compile(r"/([0-9]{4}/[0-9]{2}/[0-9]{2})[01/]")
189193
THREE_CATCH = re.compile(r"([0-9]{4})/([0-9]{2})/([0-9]{2})")
@@ -201,7 +205,7 @@
201205
)
202206
SLASHES_YEAR = re.compile(r"([0-9]{2})$")
203207
YYYYMM_PATTERN = re.compile(r"\D([12][0-9]{3}[/.-](?:1[0-2]|0[1-9]))\D")
204-
YYYYMM_CATCH = re.compile(rf"({YEAR_RE})[/.-](1[0-2]|0[1-9]|)")
208+
YYYYMM_CATCH = re.compile(rf"({YEAR_RE})[/.-](1[0-2]|0[1-9])")
205209
MMYYYY_PATTERN = re.compile(r"\D([01]?[0-9][/.-][12][0-9]{3})\D")
206210
MMYYYY_YEAR = re.compile(rf"({YEAR_RE})\D?$")
207211
SIMPLE_PATTERN = re.compile(rf"(?<!w3.org)\D({YEAR_RE})\D")
@@ -409,8 +413,13 @@ def try_date_expr(
409413
return customresult
410414

411415
# use slow but extensive search
412-
# additional filters to prevent computational cost
413-
if extensive_search and TEXT_DATE_PATTERN.search(string):
416+
# additional filters to prevent computational cost: only hand strings that
417+
# could be a date (4-digit year or a letter) to the slow external parser
418+
if (
419+
extensive_search
420+
and TEXT_DATE_PATTERN.search(string)
421+
and (FOUR_DIGITS.search(string) or any(c.isalpha() for c in string))
422+
):
414423
# send to date parser
415424
dateparser_result = external_date_parser(string, outputformat)
416425
if is_valid_date(

htmldate/utils.py

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -160,13 +160,12 @@ def repair_faulty_html(htmlstring: str, beginning: str) -> str:
160160
if "doctype" in beginning:
161161
firstline, _, rest = htmlstring.partition("\n")
162162
htmlstring = DOCTYPE_TAG.sub("", firstline, count=1) + "\n" + rest
163-
# other issue with malformed documents: check first three lines
164-
for i, line in enumerate(htmlstring.splitlines()):
165-
if "<html" in line and line.endswith("/>"):
163+
# other issue with malformed documents: check first few lines only
164+
# (split avoids materialising every line of a large document)
165+
for line in htmlstring.split("\n", 4)[:4]:
166+
if "<html" in line and line.rstrip("\r").endswith("/>"):
166167
htmlstring = FAULTY_HTML.sub(r"\1>", htmlstring, count=1)
167168
break
168-
if i > 2:
169-
break
170169
return htmlstring
171170

172171

htmldate/validators.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -180,7 +180,12 @@ def convert_date(datestring: str, inputformat: str, outputformat: str) -> str:
180180
def check_extracted_reference(reference: int, options: Extractor) -> str | None:
181181
"""Test if the extracted reference date can be returned"""
182182
if reference > 0:
183-
dateobject = datetime.fromtimestamp(reference)
183+
# reference (untrusted) may be outside the platform timestamp range
184+
try:
185+
dateobject = datetime.fromtimestamp(reference)
186+
except (OSError, OverflowError, ValueError):
187+
LOGGER.debug("invalid reference timestamp: %s", reference)
188+
return None
184189
converted = dateobject.strftime(options.format)
185190
if is_valid_date(
186191
converted, options.format, earliest=options.min, latest=options.max

tests/comparison.py

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
from evaluation import (
1616
EVAL_PAGES,
1717
evaluate_result,
18+
f1_score,
1819
load_document,
1920
run_htmldate_extensive,
2021
run_htmldate_fast,
@@ -74,12 +75,12 @@ def calculate_scores(name, mydict):
7475
mydict["false_positives"],
7576
mydict["true_negatives"],
7677
)
77-
time1 = f'{mydict["time"] / RESULTS_DICT["htmldate_extensive"]["time"] :.2f}x'
78-
time2 = f'{mydict["time"] / RESULTS_DICT["htmldate_fast"]["time"] :.2f}x'
78+
time1 = f"{mydict['time'] / RESULTS_DICT['htmldate_extensive']['time']:.2f}x"
79+
time2 = f"{mydict['time'] / RESULTS_DICT['htmldate_fast']['time']:.2f}x"
7980
precision = tp / (tp + fp)
8081
recall = tp / (tp + fn)
8182
accuracy = (tp + tn) / (tp + tn + fp + fn)
82-
fscore = (2 * tp) / (2 * tp + fp + fn) # 2*((precision*recall)/(precision+recall))
83+
fscore = f1_score(tp, fp, fn)
8384
return name, precision, recall, accuracy, fscore, mydict["time"], time1, time2
8485

8586

@@ -98,15 +99,15 @@ def run_eval(html, data):
9899

99100

100101
if __name__ == "__main__":
101-
102102
# hack to suppress noise
103103
my_stdout = sys.stdout if ARGS.verbose else None
104104
my_stderr = sys.stderr if ARGS.verbose else None
105105

106106
for item, data in tqdm.tqdm(EVAL_PAGES.items(), total=len(EVAL_PAGES)):
107107
htmlstring = load_document(data["file"])
108-
with contextlib.redirect_stdout(my_stdout), contextlib.redirect_stderr(
109-
my_stderr
108+
with (
109+
contextlib.redirect_stdout(my_stdout),
110+
contextlib.redirect_stderr(my_stderr),
110111
):
111112
run_eval(htmlstring, data)
112113

tests/eval_gate.py

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
"""CI quality gate: fail if the benchmark F1-score regresses below baseline.
2+
3+
Run from the tests directory. Pin ``TZ=Europe/Berlin`` to reproduce the
4+
baseline (F1 depends on local time and the dateparser/lxml versions).
5+
"""
6+
7+
import sys
8+
9+
from evaluation import (
10+
EVAL_PAGES,
11+
evaluate_result,
12+
f1_score,
13+
load_document,
14+
run_htmldate_extensive,
15+
run_htmldate_fast,
16+
)
17+
18+
# current full-corpus baseline; no regression allowed
19+
FLOORS = {"extensive": 0.9490, "fast": 0.9253}
20+
RUNNERS = {"extensive": run_htmldate_extensive, "fast": run_htmldate_fast}
21+
22+
23+
def score_function(func):
    """Return the aggregate F1-score of *func* over the evaluation corpus.

    *func* is called on each loaded document from ``EVAL_PAGES`` and its
    output is graded by ``evaluate_result``; the per-page counts are summed
    before the score is computed, so the result is a single F1 over all
    pages rather than an average of per-page scores.
    """
    tp = fp = fn = 0
    # only the page metadata is needed, the dictionary keys are unused
    for data in EVAL_PAGES.values():
        # the third element of the tuple (presumably the true-negative
        # count -- confirm in evaluation.py) does not enter the F1-score
        true_pos, false_pos, _, false_neg = evaluate_result(
            func(load_document(data["file"])), data
        )
        tp += true_pos
        fp += false_pos
        fn += false_neg
    return f1_score(tp, fp, fn)
31+
32+
33+
def main():
    """Score every runner against its floor and return a process exit code.

    One report line is printed per runner. The return value is 1 if any
    runner's F1-score (rounded to four decimals) is below its floor in
    ``FLOORS``, and 0 otherwise.
    """
    below_floor = False
    for name in RUNNERS:
        # round first so the comparison uses the same precision as the floors
        score = round(score_function(RUNNERS[name]), 4)
        floor = FLOORS[name]
        if score < floor:
            below_floor = True
        print(f"{name:>9}: F1={score:.4f} (floor {floor:.4f})")
    return 1 if below_floor else 0
40+
41+
42+
if __name__ == "__main__":
43+
sys.exit(main())

tests/evaluation.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -161,3 +161,8 @@ def evaluate_result(result, data):
161161
if result is None:
162162
return (0, 0, 1, 1) if datereference is None else (0, 0, 0, 1)
163163
return (1, 0, 0, 0) if result == datereference else (0, 1, 0, 0)
164+
165+
166+
def f1_score(tp, fp, fn):
    """Compute the F1-score from the confusion-matrix counts.

    Args:
        tp: number of true positives.
        fp: number of false positives.
        fn: number of false negatives.

    Returns:
        ``2*tp / (2*tp + fp + fn)`` as a float, or ``0.0`` when all three
        counts are zero (the score is undefined there; returning zero avoids
        a ``ZeroDivisionError`` and makes a quality gate fail instead of crash).
    """
    denominator = 2 * tp + fp + fn
    if denominator == 0:
        return 0.0
    return (2 * tp) / denominator

0 commit comments

Comments
 (0)