maintenance: harden code and add tests to prevent bugs (#191)

adbar · web-flow · commit 93b98a6a82df · 2026-06-15T18:23:15.000+02:00
* maintenance: code hardening to prevent bugs

* fix: remove parser calls guard

* remove function

* consolidate tests and CI
diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml
@@ -22,7 +22,7 @@ jobs:
 
     steps:
       - name: Checkout
-        uses: actions/checkout@v4
+        uses: actions/checkout@v5
 
       - name: Initialize CodeQL
         uses: github/codeql-action/init@v3
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -9,6 +9,9 @@ on:
   pull_request:
     branches: [ master ]
 
+permissions:
+  contents: read
+
 env:
   REF_PY_VERSION: "3.13"
 
@@ -37,25 +40,14 @@ jobs:
       with:
         python-version: ${{ matrix.python-version }}
 
-    - name: Upgrade pip
-      run: python -m pip install --upgrade pip
-
-    - name: Get pip cache dir
-      id: pip-cache
-      shell: bash
-      run: |
-        echo "dir=$(pip cache dir)" >> $GITHUB_OUTPUT
-
-    - name: pip cache
-      uses: actions/cache@v4
+    - name: Install uv
+      uses: astral-sh/setup-uv@v5
       with:
-        path: ${{ steps.pip-cache.outputs.dir }}
-        key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements.txt') }}
-        restore-keys: |
-          ${{ runner.os }}-pip-
+        enable-cache: true
+        cache-dependency-glob: pyproject.toml
 
     - name: Install dependencies
-      run: python -m pip install --upgrade -e ".[dev]"
+      run: uv pip install --system -e ".[dev]"
 
     # tests
     - name: Lint with ruff
@@ -64,17 +56,24 @@ jobs:
     - name: Code format and type checking
       if: ${{ matrix.python-version == env.REF_PY_VERSION }}
       run: |
-        ruff format --check htmldate
+        ruff format --check --diff htmldate tests
         mypy -p htmldate
 
     - name: Install full dependencies
       if: ${{ matrix.env.MINIMAL == 'false'}}
-      run: python -m pip install -e ".[all]"
+      run: uv pip install --system -e ".[all]"
 
     - name: Test with pytest
       run: |
         python -m pytest --cov=./ --cov-report=xml
 
+    # benchmark regression gate (timezone pinned for a reproducible F1-score)
+    - name: Evaluation quality gate
+      if: ${{ matrix.env.MINIMAL == 'false' && matrix.python-version == env.REF_PY_VERSION }}
+      env:
+        TZ: Europe/Berlin
+      run: cd tests && python eval_gate.py
+
     # coverage
     - name: Upload coverage to Codecov
       if: ${{ matrix.env.MINIMAL == 'false' && matrix.python-version == env.REF_PY_VERSION }}
diff --git a/htmldate/extractors.py b/htmldate/extractors.py
@@ -151,6 +151,8 @@
 }
 
 TEXT_DATE_PATTERN = re.compile(r"[.:,_/ -]|^\d+$")
+# gate for try_date_expr: a real date has a 4-digit year or letters (e.g. a month name)
+FOUR_DIGITS = re.compile(r"\d{4}")
 
 DISCARD_PATTERNS = re.compile(
     r"^\d{2}:\d{2}(?: |:|$)|"
@@ -168,7 +170,8 @@
 TEXT_PATTERNS = re.compile(
     r'(?:date[^0-9"]{,20}|updated|last-modified|published|posted|on)(?:[ :])*?([0-9]{1,4})[./]([0-9]{1,2})[./]([0-9]{2,4})|'  # EN
     r"(?:Datum|Stand|Veröffentlicht am):? ?([0-9]{1,2})\.([0-9]{1,2})\.([0-9]{2,4})|"  # DE
-    r"(?:güncellen?me|yayı(?:m|n)lan?ma) *?(?:tarihi)? *?:? *?([0-9]{1,2})[./]([0-9]{1,2})[./]([0-9]{2,4})|"
+    # bounded space-runs ({0,9}? not *?) to prevent ReDoS
+    r"(?:güncellen?me|yayı(?:m|n)lan?ma) {0,9}?(?:tarihi)? {0,9}?:? {0,9}?([0-9]{1,2})[./]([0-9]{1,2})[./]([0-9]{2,4})|"
     r"([0-9]{1,2})[./]([0-9]{1,2})[./]([0-9]{2,4}) *?(?:'de|'da|'te|'ta|’de|’da|’te|’ta|tarihinde) *(?:güncellendi|yayı(?:m|n)landı)",  # TR
     re.I,
 )
@@ -182,8 +185,9 @@
 
 # extensive search patterns
 YEAR_PATTERN = re.compile(rf"^\D?({YEAR_RE})")
+# bounded gap (\D{0,99}, not unbounded \D*) to avoid quadratic backtracking (ReDoS)
 COPYRIGHT_PATTERN = re.compile(
-    rf"(?:©|\&copy;|Copyright|\(c\))\D*(?:{YEAR_RE})?-?({YEAR_RE})\D"
+    rf"(?:©|\&copy;|Copyright|\(c\))\D{{0,99}}(?:{YEAR_RE})?-?({YEAR_RE})\D"
 )
 THREE_PATTERN = re.compile(r"/([0-9]{4}/[0-9]{2}/[0-9]{2})[01/]")
 THREE_CATCH = re.compile(r"([0-9]{4})/([0-9]{2})/([0-9]{2})")
@@ -201,7 +205,7 @@
 )
 SLASHES_YEAR = re.compile(r"([0-9]{2})$")
 YYYYMM_PATTERN = re.compile(r"\D([12][0-9]{3}[/.-](?:1[0-2]|0[1-9]))\D")
-YYYYMM_CATCH = re.compile(rf"({YEAR_RE})[/.-](1[0-2]|0[1-9]|)")
+YYYYMM_CATCH = re.compile(rf"({YEAR_RE})[/.-](1[0-2]|0[1-9])")
 MMYYYY_PATTERN = re.compile(r"\D([01]?[0-9][/.-][12][0-9]{3})\D")
 MMYYYY_YEAR = re.compile(rf"({YEAR_RE})\D?$")
 SIMPLE_PATTERN = re.compile(rf"(?<!w3.org)\D({YEAR_RE})\D")
@@ -409,8 +413,13 @@ def try_date_expr(
         return customresult
 
     # use slow but extensive search
-    # additional filters to prevent computational cost
-    if extensive_search and TEXT_DATE_PATTERN.search(string):
+    # additional filters to prevent computational cost: only hand strings that
+    # could be a date (4-digit year or a letter) to the slow external parser
+    if (
+        extensive_search
+        and TEXT_DATE_PATTERN.search(string)
+        and (FOUR_DIGITS.search(string) or any(c.isalpha() for c in string))
+    ):
         # send to date parser
         dateparser_result = external_date_parser(string, outputformat)
         if is_valid_date(
diff --git a/htmldate/utils.py b/htmldate/utils.py
@@ -160,13 +160,12 @@ def repair_faulty_html(htmlstring: str, beginning: str) -> str:
     if "doctype" in beginning:
         firstline, _, rest = htmlstring.partition("\n")
         htmlstring = DOCTYPE_TAG.sub("", firstline, count=1) + "\n" + rest
-    # other issue with malformed documents: check first three lines
-    for i, line in enumerate(htmlstring.splitlines()):
-        if "<html" in line and line.endswith("/>"):
+    # other issue with malformed documents: check the first four lines only
+    # (split avoids materialising every line of a large document)
+    for line in htmlstring.split("\n", 4)[:4]:
+        if "<html" in line and line.rstrip("\r").endswith("/>"):
             htmlstring = FAULTY_HTML.sub(r"\1>", htmlstring, count=1)
             break
-        if i > 2:
-            break
     return htmlstring
 
 
diff --git a/htmldate/validators.py b/htmldate/validators.py
@@ -180,7 +180,12 @@ def convert_date(datestring: str, inputformat: str, outputformat: str) -> str:
 def check_extracted_reference(reference: int, options: Extractor) -> str | None:
     """Test if the extracted reference date can be returned"""
     if reference > 0:
-        dateobject = datetime.fromtimestamp(reference)
+        # reference (untrusted) may be outside the platform timestamp range
+        try:
+            dateobject = datetime.fromtimestamp(reference)
+        except (OSError, OverflowError, ValueError):
+            LOGGER.debug("invalid reference timestamp: %s", reference)
+            return None
         converted = dateobject.strftime(options.format)
         if is_valid_date(
             converted, options.format, earliest=options.min, latest=options.max
diff --git a/tests/comparison.py b/tests/comparison.py
@@ -15,6 +15,7 @@
 from evaluation import (
     EVAL_PAGES,
     evaluate_result,
+    f1_score,
     load_document,
     run_htmldate_extensive,
     run_htmldate_fast,
@@ -74,12 +75,12 @@ def calculate_scores(name, mydict):
         mydict["false_positives"],
         mydict["true_negatives"],
     )
-    time1 = f'{mydict["time"] / RESULTS_DICT["htmldate_extensive"]["time"] :.2f}x'
-    time2 = f'{mydict["time"] / RESULTS_DICT["htmldate_fast"]["time"] :.2f}x'
+    time1 = f"{mydict['time'] / RESULTS_DICT['htmldate_extensive']['time']:.2f}x"
+    time2 = f"{mydict['time'] / RESULTS_DICT['htmldate_fast']['time']:.2f}x"
     precision = tp / (tp + fp)
     recall = tp / (tp + fn)
     accuracy = (tp + tn) / (tp + tn + fp + fn)
-    fscore = (2 * tp) / (2 * tp + fp + fn)  # 2*((precision*recall)/(precision+recall))
+    fscore = f1_score(tp, fp, fn)
     return name, precision, recall, accuracy, fscore, mydict["time"], time1, time2
 
 
@@ -98,15 +99,15 @@ def run_eval(html, data):
 
 
 if __name__ == "__main__":
-
     # hack to suppress noise
     my_stdout = sys.stdout if ARGS.verbose else None
     my_stderr = sys.stderr if ARGS.verbose else None
 
     for item, data in tqdm.tqdm(EVAL_PAGES.items(), total=len(EVAL_PAGES)):
         htmlstring = load_document(data["file"])
-        with contextlib.redirect_stdout(my_stdout), contextlib.redirect_stderr(
-            my_stderr
+        with (
+            contextlib.redirect_stdout(my_stdout),
+            contextlib.redirect_stderr(my_stderr),
         ):
             run_eval(htmlstring, data)
 
diff --git a/tests/eval_gate.py b/tests/eval_gate.py
@@ -0,0 +1,43 @@
+"""CI quality gate: fail if the benchmark F1-score regresses below baseline.
+
+Run from the tests directory. Pin ``TZ=Europe/Berlin`` to reproduce the
+baseline (F1 depends on local time and the dateparser/lxml versions).
+"""
+
+import sys
+
+from evaluation import (
+    EVAL_PAGES,
+    evaluate_result,
+    f1_score,
+    load_document,
+    run_htmldate_extensive,
+    run_htmldate_fast,
+)
+
+# current full-corpus baseline; no regression allowed
+FLOORS = {"extensive": 0.9490, "fast": 0.9253}
+RUNNERS = {"extensive": run_htmldate_extensive, "fast": run_htmldate_fast}
+
+
+def score_function(func):
+    tp = fp = fn = 0
+    for _, data in EVAL_PAGES.items():
+        true_pos, false_pos, _, false_neg = evaluate_result(
+            func(load_document(data["file"])), data
+        )
+        tp, fp, fn = tp + true_pos, fp + false_pos, fn + false_neg
+    return f1_score(tp, fp, fn)
+
+
+def main():
+    regression = False
+    for name, func in RUNNERS.items():
+        score, floor = round(score_function(func), 4), FLOORS[name]
+        regression = regression or score < floor
+        print(f"{name:>9}: F1={score:.4f} (floor {floor:.4f})")
+    return int(regression)
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/tests/evaluation.py b/tests/evaluation.py
@@ -161,3 +161,8 @@ def evaluate_result(result, data):
     if result is None:
         return (0, 0, 1, 1) if datereference is None else (0, 0, 0, 1)
     return (1, 0, 0, 0) if result == datereference else (0, 1, 0, 0)
+
+
+def f1_score(tp, fp, fn):
+    """Compute the F1-score from the confusion-matrix counts"""
+    return (2 * tp) / (2 * tp + fp + fn)
diff --git a/tests/unit_tests.py b/tests/unit_tests.py