adbar
diff --git a/‎.readthedocs.yaml‎
Lines changed: 6 additions & 2 deletions b/‎.readthedocs.yaml‎
Lines changed: 6 additions & 2 deletions
diff --git a/‎CHANGELOG.md‎
Lines changed: 6 additions & 0 deletions b/‎CHANGELOG.md‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎README.md‎
Lines changed: 12 additions & 11 deletions b/‎README.md‎
Lines changed: 12 additions & 11 deletions
diff --git a/‎docs/conf.py‎
Lines changed: 1 addition & 1 deletion b/‎docs/conf.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎docs/evaluation.rst‎
Lines changed: 21 additions & 2 deletions b/‎docs/evaluation.rst‎
Lines changed: 21 additions & 2 deletions
diff --git a/‎docs/index.rst‎
Lines changed: 8 additions & 18 deletions b/‎docs/index.rst‎
Lines changed: 8 additions & 18 deletions
diff --git a/‎docs/options.rst‎
Lines changed: 5 additions & 7 deletions b/‎docs/options.rst‎
Lines changed: 5 additions & 7 deletions
diff --git a/‎docs/requirements.txt‎
Lines changed: 2 additions & 3 deletions b/‎docs/requirements.txt‎
Lines changed: 2 additions & 3 deletions
diff --git a/‎htmldate/__init__.py‎
Lines changed: 1 addition & 1 deletion b/‎htmldate/__init__.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎htmldate/cli.py‎
Lines changed: 2 additions & 2 deletions b/‎htmldate/cli.py‎
Lines changed: 2 additions & 2 deletions
@@ -6,9 +6,9 @@ version: 2
 
 # Set the OS, Python version and other tools you might need
 build:
-  os: ubuntu-22.04
+  os: ubuntu-24.04
   tools:
-    python: "3.11"
+    python: "3.13"
     # You can also specify other tool versions:
     # nodejs: "20"
     # rust: "1.70"
@@ -33,3 +33,7 @@ sphinx:
 python:
   install:
     - requirements: docs/requirements.txt
+    # install the checked-out source so autodoc and the version reflect this
+    # branch/tag rather than the released package from PyPI
+    - method: pip
+      path: .
@@ -1,5 +1,11 @@
 ## Changelog
 
+## 1.10.0
+- maintenance: modernize typing, packaging and code
+- evaluation: review and correct benchmark ground-truth labels, update and speed up alternatives
+- performance: stable day-granular cache key and reduced copying
+- fixes: preserve tails in element cleaning
+
 ## 1.9.4
 - maintenance: remove LXML version constraint (#184)
 
 
@@ -54,7 +54,7 @@ $ htmldate -u http://blog.python.org/2016/12/python-360-is-now-available.html
     YMD](https://en.wikipedia.org/wiki/ISO_8601)).
 -   Detection of both original and updated dates.
 -   Multilingual.
--   Compatible with all recent versions of Python.
+-   Compatible with Python 3.10 and later.
 
 ### How it works
 
@@ -77,31 +77,32 @@ Finally, the output is validated and converted to the chosen format.
 
 ## Performance
 
-1000 web pages containing identifiable dates (as of 2023-11-13 on Python 3.10)
+1000 web pages containing identifiable dates (as of 2026-06-01 on Python 3.13)
 
 | Python Package | Precision | Recall | Accuracy | F-Score | Time |
 | -------------- | --------- | ------ | -------- | ------- | ---- |
-| articleDateExtractor 0.20 | 0.803 | 0.734 | 0.622 | 0.767 | 5x |
-| date_guesser 2.1.4 | 0.781 | 0.600 | 0.514 | 0.679 | 18x |
-| goose3 3.1.17 | 0.869 | 0.532 | 0.493 | 0.660 | 15x |
-| htmldate\[all\] 1.6.0 (fast) | **0.883** | 0.924 | 0.823 | 0.903 | **1x** |
-| htmldate\[all\] 1.6.0 (extensive) | 0.870 | **0.993** | **0.865** | **0.928** | 1.7x |
-| newspaper3k 0.2.8 | 0.769 | 0.667 | 0.556 | 0.715 | 15x |
-| news-please 1.5.35 | 0.801 | 0.768 | 0.645 | 0.784 | 34x |
+| articleDateExtractor 0.20 | 0.846 | 0.745 | 0.656 | 0.792 | 3x |
+| date_guesser 2.1.4 | 0.832 | 0.611 | 0.544 | 0.705 | 11x |
+| goose3 3.1.21 | **0.930** | 0.568 | 0.545 | 0.706 | 14x |
+| htmldate\[all\] 1.10.0 (fast) | 0.924 | 0.927 | 0.861 | 0.925 | **1x** |
+| htmldate\[all\] 1.10.0 (extensive) | 0.908 | **0.993** | **0.903** | **0.949** | 1.8x |
+| newspaper4k 0.9.5 | 0.912 | 0.728 | 0.680 | 0.810 | 2.5x |
+| news-please 1.6.16 | 0.845 | 0.777 | 0.680 | 0.810 | 29x |
 
 For the complete results and explanations see [evaluation
 page](https://htmldate.readthedocs.io/en/latest/evaluation.html).
 
 ## Installation
 
 Htmldate is tested on Linux, macOS and Windows systems; it is compatible
-with Python 3.8 upwards. It can notably be installed with `pip` (`pip3`
+with Python 3.10 upwards. It can notably be installed with `pip` (`pip3`
 where applicable) from the PyPI package repository:
 
 -   `pip install htmldate`
 -   (optionally) `pip install htmldate[speed]`
 
-The last version to support Python 3.6 and 3.7 is `htmldate==1.8.1`.
+The last version to support Python 3.6 and 3.7 is `htmldate==1.8.1`; for
+Python 3.8 and 3.9 use the `1.9.x` series.
 
 ## Documentation
 
 
@@ -21,7 +21,7 @@
 # -- Project information -----------------------------------------------------
 
 project = 'htmldate'
-copyright = '2023, <a href="https://adrien.barbaresi.eu/">Adrien Barbaresi</a>'
+copyright = '2017-2026, <a href="https://adrien.barbaresi.eu/">Adrien Barbaresi</a>'
 author = 'Adrien Barbaresi'
 
 # -- General configuration ---------------------------------------------------
 
@@ -18,7 +18,7 @@ There are comparable software solutions in Python, the following date extraction
 - `date_guesser <https://github.com/mitmedialab/date_guesser>`_ extracts publication dates from web pages along with an accuracy measure (not used here),
 - `goose3 <https://github.com/goose3/goose3>`_ can extract information for embedded content,
 - `htmldate <https://github.com/adbar/htmldate>`_ is the software package described here, it is designed to extract original and updated publication dates of web pages,
-- `newspaper <https://github.com/codelucas/newspaper>`_ is mostly geared towards newspaper texts,
+- `newspaper4k <https://github.com/AndyTheFactory/newspaper4k>`_ (the maintained successor of newspaper3k) is mostly geared towards newspaper texts,
 - `news-please <https://github.com/fhamborg/news-please>`_ is a news crawler that extracts structured information.
 
 Two alternative packages are not tested here but could be used in addition:
@@ -36,7 +36,7 @@ Description
 
 **Time**: the execution time cannot be easily compared in all cases as some solutions perform a whole series of operations which are irrelevant to this task.
 
-**Errors:** *goose3*'s output isn't always meaningful and/or in a standardized format, these cases were discarded. *news-please* seems to have trouble with some encodings (e.g. in Chinese), in which case it leads to an exception.
+**Errors:** *goose3*'s output isn't always meaningful and/or in a standardized format; these cases were discarded.
 
 
 Results
@@ -45,6 +45,23 @@ Results
 The results below show that **date extraction is not a completely solved task** but one for which extractors have to resort to heuristics and guesses. The figures documenting recall and accuracy capture the real-world performance of the tools as the absence of a date output impacts the result.
 
 
+================================ ========= ========= ========= ========= =======
+1000 web pages containing identifiable dates (as of 2026-06-01 on Python 3.13)
+--------------------------------------------------------------------------------
+Python Package                   Precision Recall    Accuracy  F-Score   Time
+================================ ========= ========= ========= ========= =======
+articleDateExtractor 0.20        0.846     0.745     0.656     0.792     3x
+date_guesser 2.1.4               0.832     0.611     0.544     0.705     11x
+goose3 3.1.21                    **0.930** 0.568     0.545     0.706     14x
+htmldate[all] 1.10.0 (fast)      0.924     0.927     0.861     0.925     **1x**
+htmldate[all] 1.10.0 (extensive) 0.908     **0.993** **0.903** **0.949** 1.8x
+newspaper4k 0.9.5                0.912     0.728     0.680     0.810     2.5x
+news-please 1.6.16               0.845     0.777     0.680     0.810     29x
+================================ ========= ========= ========= ========= =======
+
+This run uses a reviewed version of the ground-truth labels (publication-date corrections) and the maintained *newspaper4k* fork in place of the now-unmaintained *newspaper3k*.
+
+
 =============================== ========= ========= ========= ========= =======
 1000 web pages containing identifiable dates (as of 2023-11-13 on Python 3.10)
 -------------------------------------------------------------------------------
@@ -62,6 +79,8 @@ news-please 1.5.35              0.801     0.768     0.645     0.784     34x
 
 Additional data for new pages in English collected by the `Data Culture Group <https://dataculturegroup.org>`_ at Northeastern University.
 
+The discussion below refers to the most recent run (top table), measured against a reviewed version of the publication-date labels.
+
 Precision describes if the dates given as output are correct: *goose3* fares well precision-wise but it fails to extract dates in a large majority of cases (poor recall). The difference in accuracy between *date_guesser* and *newspaper* is consistent with tests described on the `website of the former <https://github.com/mitmedialab/date_guesser>`_.
 
 It turns out that *htmldate* performs better than the other solutions overall. It is also noticeably faster than the strictly comparable packages (*articleDateExtractor* and most certainly *date_guesser*). Despite being measured on a sample, **the higher accuracy and faster processing time are highly significant**. Especially for smaller news outlets, websites and blogs, as well as pages written in languages other than English (in this case mostly but not exclusively German), *htmldate* greatly extends date extraction coverage without sacrificing precision.
 
@@ -80,7 +80,7 @@ Features
 -  URLs, HTML files, or HTML trees are given as input (includes batch processing)
 -  Output as string in any date format (defaults to `ISO 8601 YMD <https://en.wikipedia.org/wiki/ISO_8601>`_)
 -  Detection of both original and updated dates
--  Compatible with all recent versions of Python
+-  Compatible with Python 3.10 and later
 
 
 ``htmldate`` can examine markup and text. It provides the following ways to date an HTML document:
@@ -94,7 +94,7 @@ Features
 
 The output is thoroughly verified in terms of plausibility and adequateness. If a valid date has been found the library outputs a date string corresponding to either the last update or the original publishing statement (the default), in the desired format.
 
-Markup-based extraction is multilingual by nature, text-based refinements for better coverage currently support German, English and Turkish.
+Markup-based extraction is multilingual by nature; text-based refinements for better coverage currently support English, French, German, Indonesian and Turkish.
 
 
 Installation
@@ -103,16 +103,16 @@ Installation
 Main package
 ~~~~~~~~~~~~
 
-This Python package is tested on Linux, macOS and Windows systems; it is compatible with Python 3.8 upwards. It is available on the package repository `PyPI <https://pypi.org/>`_ and can notably be installed with ``pip`` or ``pipenv``:
+This Python package is tested on Linux, macOS and Windows systems; it is compatible with Python 3.10 upwards. It is available on the package repository `PyPI <https://pypi.org/>`_ and can notably be installed with ``pip`` or ``pipenv``:
 
 .. code-block:: bash
 
-    $ pip install htmldate # pip3 install on systems where both Python 2 and 3 are installed
+    $ pip install htmldate
     $ pip install --upgrade htmldate # to make sure you have the latest version
     $ pip install git+https://github.com/adbar/htmldate.git # latest available code (see build status above)
 
 
-The last version to support Python 3.6 and 3.7 is ``htmldate==1.8.1``.
+The last version to support Python 3.6 and 3.7 is ``htmldate==1.8.1``; for Python 3.8 and 3.9 use the ``1.9.x`` series.
 
 
 Optional
@@ -131,16 +131,6 @@ The ``dateparser`` package is noticeably slower in its latest versions, version
 *For information on dependency management of Python packages see* `this discussion thread <https://stackoverflow.com/questions/41573587/what-is-the-difference-between-venv-pyvenv-pyenv-virtualenv-virtualenvwrappe>`_.
 
 
-Experimental
-~~~~~~~~~~~~
-
-Experimental compilation with ``mypyc``, as using pre-compiled library may shorten processing speed:
-
-1. Install ``mypy``: ``pip3 install mypy``
-2. Compile the package: ``python setup.py --use-mypyc bdist_wheel``
-3. Use the newly created wheel: ``pip3 install dist/...``
-
-
 With Python
 -----------
 
@@ -162,7 +152,7 @@ In case the web page features easily readable metadata in the header, the extrac
 .. code-block:: python
 
     >>> find_date('https://creativecommons.org/about/')
-    '2017-08-11' # has been updated since
+    '2017-08-11' # may change
     >>> find_date('https://creativecommons.org/about/', extensive_search=False)
     >>>
 
@@ -189,7 +179,7 @@ Change the output to a format known to Python's ``datetime`` module, the default
 .. code-block:: python
 
     >>> find_date('https://www.gnu.org/licenses/gpl-3.0.en.html', outputformat='%d %B %Y')
-    '18 November 2016'  # may have changed since
+    '18 November 2016'  # may change
 
 
 Original vs. updated dates
@@ -200,7 +190,7 @@ Although the time delta between original publication and "last modified" info is
 .. code-block:: python
 
     >>> find_date('https://netzpolitik.org/2016/die-cider-connection-abmahnungen-gegen-nutzer-von-creative-commons-bildern/', original_date=True)  # modified behavior
-    '2016-06-23'
+    '2016-06-23' # may change
 
 For more information see `options page <options.html>`_.
 
 
@@ -27,15 +27,15 @@ An external module can be used for download, as described in versions anterior t
     >>> import requests
     >>> r = requests.get('https://creativecommons.org/about/')
     >>> find_date(r.text)
-    '2017-11-28' # may have changed since
+    '2017-11-28' # may change
     # using htmldate's own fetch_url function
     >>> from htmldate.utils import fetch_url
     >>> htmldoc = fetch_url('https://blog.wikimedia.org/2018/06/28/interactive-maps-now-in-your-language/')
     >>> find_date(htmldoc)
-    '2018-06-28'
+    '2018-06-28' # may change
     # or simply
     >>> find_date('https://blog.wikimedia.org/2018/06/28/interactive-maps-now-in-your-language/') # URL detected
-    '2018-06-28'
+    '2018-06-28' # may change
 
 
 Date format
@@ -46,7 +46,7 @@ Change the output to a format known to Python's ``datetime`` module, the default
 .. code-block:: python
 
     >>> find_date('https://www.gnu.org/licenses/gpl-3.0.en.html', outputformat='%d %B %Y')
-    '18 November 2016' # may have changed since
+    '18 November 2016' # may change
     >>> find_date('http://blog.python.org/2016/12/python-360-is-now-available.html', outputformat='%Y-%m-%dT%H:%M:%S%z')
     '2016-12-23T05:11:00-0500'
 
@@ -62,7 +62,7 @@ Although the time delta between the original publication and the "last modified"
 .. code-block:: python
 
     >>> find_date('https://netzpolitik.org/2016/die-cider-connection-abmahnungen-gegen-nutzer-von-creative-commons-bildern/') # default setting
-    '2019-06-24'
+    '2019-06-24' # may change
     >>> find_date('https://netzpolitik.org/2016/die-cider-connection-abmahnungen-gegen-nutzer-von-creative-commons-bildern/', original_date=True) # modified behavior
     '2016-06-23'
 
@@ -77,8 +77,6 @@ See ``settings.py`` file:
    :show-inheritance:
    :undoc-members:
 
-The module can then be re-compiled locally to apply changes to the settings.
-
 
 Clearing caches
 ~~~~~~~~~~~~~~~
 
@@ -1,4 +1,3 @@
 # version required
-sphinx>=8.1.3
-# without version specifier
-htmldate
+sphinx>=9.1.0
+# htmldate itself is installed from the repo root (see .readthedocs.yaml)
@@ -7,7 +7,7 @@
 __author__ = "Adrien Barbaresi"
 __license__ = "Apache-2.0"
 __copyright__ = "Copyright 2017-present, Adrien Barbaresi"
-__version__ = "1.9.4"
+__version__ = "1.10.0"
 
 
 import logging
 
@@ -81,13 +81,13 @@ def process_args(args: argparse.Namespace) -> None:
         if args.URL:
             htmlstring = fetch_url(args.URL)
             if htmlstring is None:
-                sys.exit(f"No data for URL: {args.URL}" + "\n")
+                sys.exit(f"No data for URL: {args.URL}\n")
         # unicode check
         else:
             try:
                 htmlstring = sys.stdin.read()
             except UnicodeDecodeError as err:
-                sys.exit(f"Wrong buffer encoding: {str(err)}" + "\n")
+                sys.exit(f"Wrong buffer encoding: {err}\n")
         result = cli_examine(htmlstring, args)
         if result is not None:
             sys.stdout.write(result + "\n")