Skip to content

Commit 964be3c

Browse files
authored
prepare release 1.6.0 (#111)
* prepare release 1.6.0
* fix setup
* update benchmark
* update evaluation
1 parent 93530b1 commit 964be3c

File tree

7 files changed

+52
-31
lines changed

7 files changed

+52
-31
lines changed

CHANGELOG.md

+7
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,12 @@
11
## Changelog
22

3+
4+
## 1.6.0
5+
- focus on precision, stricter extraction patterns (#103, #105, #106, #112)
6+
- simplified code base (#108, #109)
7+
- replaced lxml.html.Cleaner (#104)
8+
- extended evaluation
9+
310
## 1.5.2
411
- fix for missing months keys in custom extractor (#100)
512
- fix for None in `try_date_expr()` (#101)

README.rst

+8-8
Original file line numberDiff line numberDiff line change
@@ -97,17 +97,17 @@ Performance
9797
-----------
9898

9999
=============================== ========= ========= ========= ========= =======
100-
500 web pages containing identifiable dates (as of 2022-11-28 on Python 3.8)
100+
1000 web pages containing identifiable dates (as of 2023-11-13 on Python 3.10)
101101
-------------------------------------------------------------------------------
102102
Python Package Precision Recall Accuracy F-Score Time
103103
=============================== ========= ========= ========= ========= =======
104-
articleDateExtractor 0.20 0.769 0.691 0.572 0.728 4x
105-
date_guesser 2.1.4 0.738 0.544 0.456 0.626 16x
106-
goose3 3.1.12 0.821 0.453 0.412 0.584 14x
107-
htmldate[all] 1.4.0 (fast) **0.856** 0.921 0.798 0.888 **1x**
108-
htmldate[all] 1.4.0 (extensive) 0.847 **0.991** **0.840** **0.913** 2.2x
109-
newspaper3k 0.2.8 0.729 0.630 0.510 0.675 13x
110-
news-please 1.5.22 0.769 0.691 0.572 0.728 38x
104+
articleDateExtractor 0.20 0.803 0.734 0.622 0.767 5x
105+
date_guesser 2.1.4 0.781 0.600 0.514 0.679 18x
106+
goose3 3.1.17 0.869 0.532 0.493 0.660 15x
107+
htmldate[all] 1.6.0 (fast) **0.883** 0.924 0.823 0.903 **1x**
108+
htmldate[all] 1.6.0 (extensive) 0.870 **0.993** **0.865** **0.928** 1.7x
109+
newspaper3k 0.2.8 0.769 0.667 0.556 0.715 15x
110+
news-please 1.5.35 0.801 0.768 0.645 0.784 34x
111111
=============================== ========= ========= ========= ========= =======
112112

113113
For complete results and explanations see the `evaluation page <https://htmldate.readthedocs.io/en/latest/evaluation.html>`_.

docs/evaluation.rst

+23-8
Original file line numberDiff line numberDiff line change
@@ -42,17 +42,17 @@ The results below show that **date extraction is not a completely solved task**
4242

4343

4444
=============================== ========= ========= ========= ========= =======
45-
500 web pages containing identifiable dates (as of 2022-11-28 on Python 3.8)
45+
1000 web pages containing identifiable dates (as of 2023-11-13 on Python 3.10)
4646
-------------------------------------------------------------------------------
4747
Python Package Precision Recall Accuracy F-Score Time
4848
=============================== ========= ========= ========= ========= =======
49-
articleDateExtractor 0.20 0.769 0.691 0.572 0.728 4x
50-
date_guesser 2.1.4 0.738 0.544 0.456 0.626 16x
51-
goose3 3.1.12 0.821 0.453 0.412 0.584 14x
52-
htmldate[all] 1.4.0 (fast) **0.856** 0.921 0.798 0.888 **1x**
53-
htmldate[all] 1.4.0 (extensive) 0.847 **0.991** **0.840** **0.913** 2.2x
54-
newspaper3k 0.2.8 0.729 0.630 0.510 0.675 13x
55-
news-please 1.5.22 0.769 0.691 0.572 0.728 38x
49+
articleDateExtractor 0.20 0.803 0.734 0.622 0.767 5x
50+
date_guesser 2.1.4 0.781 0.600 0.514 0.679 18x
51+
goose3 3.1.17 0.869 0.532 0.493 0.660 15x
52+
htmldate[all] 1.6.0 (fast) **0.883** 0.924 0.823 0.903 **1x**
53+
htmldate[all] 1.6.0 (extensive) 0.870 **0.993** **0.865** **0.928** 1.7x
54+
newspaper3k 0.2.8 0.769 0.667 0.556 0.715 15x
55+
news-please 1.5.35 0.801 0.768 0.645 0.784 34x
5656
=============================== ========= ========= ========= ========= =======
5757

5858

@@ -72,6 +72,21 @@ Note on the different versions:
7272
Older Results
7373
-------------
7474

75+
=============================== ========= ========= ========= ========= =======
76+
500 web pages containing identifiable dates (as of 2022-11-28 on Python 3.8)
77+
-------------------------------------------------------------------------------
78+
Python Package Precision Recall Accuracy F-Score Time
79+
=============================== ========= ========= ========= ========= =======
80+
articleDateExtractor 0.20 0.769 0.691 0.572 0.728 4x
81+
date_guesser 2.1.4 0.738 0.544 0.456 0.626 16x
82+
goose3 3.1.12 0.821 0.453 0.412 0.584 14x
83+
htmldate[all] 1.4.0 (fast) **0.856** 0.921 0.798 0.888 **1x**
84+
htmldate[all] 1.4.0 (extensive) 0.847 **0.991** **0.840** **0.913** 2.2x
85+
newspaper3k 0.2.8 0.729 0.630 0.510 0.675 13x
86+
news-please 1.5.22 0.769 0.691 0.572 0.728 38x
87+
=============================== ========= ========= ========= ========= =======
88+
89+
7590

7691
=============================== ========= ========= ========= ========= =======
7792
500 web pages containing identifiable dates (as of 2022-03-23 on Python 3.8)

htmldate/__init__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
__author__ = "Adrien Barbaresi"
88
__license__ = "GNU GPL v3"
99
__copyright__ = "Copyright 2017-2023, Adrien Barbaresi"
10-
__version__ = "1.5.2"
10+
__version__ = "1.6.0"
1111

1212

1313
import logging

setup.py

+3-4
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,7 @@
1414
extras = {
1515
"speed": [
1616
"backports-datetime-fromisoformat; python_version < '3.11'",
17-
"cchardet >= 2.1.7; python_version < '3.11'", # build issue
18-
"faust-cchardet >= 2.1.19; python_version >= '3.11'", # fix for build
17+
"faust-cchardet >= 2.1.19",
1918
"urllib3[brotli]",
2019
],
2120
}
@@ -34,7 +33,7 @@ def get_long_description():
3433

3534
def get_version(package):
3635
"Return package version as listed in `__version__` in `__init__.py`"
37-
initfile = Path(package, "__init__.py").read_text() # Python >= 3.5
36+
initfile = Path(package, "__init__.py").read_text()
3837
return re.search("__version__ = ['\"]([^'\"]+)['\"]", initfile)[1]
3938

4039

@@ -117,7 +116,7 @@ def get_version(package):
117116
install_requires=[
118117
"backports-datetime-fromisoformat; python_version < '3.7'",
119118
"charset_normalizer >= 3.0.1; python_version < '3.7'",
120-
"charset_normalizer >= 3.3.0; python_version >= '3.7'",
119+
"charset_normalizer >= 3.3.2; python_version >= '3.7'",
121120
"dateparser >= 1.1.2", # 1.1.3+ slower
122121
"lxml >= 4.9.3 ; platform_system != 'Darwin'",
123122
"lxml == 4.9.2 ; platform_system == 'Darwin'",

tests/comparison.py

+9-9
Original file line numberDiff line numberDiff line change
@@ -86,17 +86,14 @@ def run_newspaper(htmlstring):
8686
# throws error on the eval_default dataset
8787
try:
8888
myarticle = Article(htmlstring)
89-
except (TypeError, UnicodeDecodeError):
90-
return None
91-
myarticle.html = htmlstring
92-
myarticle.download_state = ArticleDownloadState.SUCCESS
93-
try:
89+
myarticle.html = htmlstring
90+
myarticle.download_state = ArticleDownloadState.SUCCESS
9491
myarticle.parse()
95-
except UnicodeEncodeError:
92+
except (UnicodeDecodeError, UnicodeEncodeError):
9693
return None
9794
if myarticle.publish_date is None or myarticle.publish_date == "":
9895
return None
99-
return convert_date(myarticle.publish_date, "%Y-%m-%d %H:%M:%S", "%Y-%m-%d")
96+
return str(myarticle.publish_date)[0:10]
10097

10198

10299
def run_newsplease(htmlstring):
@@ -129,11 +126,14 @@ def run_dateguesser(htmlstring):
129126

130127
def run_goose(htmlstring):
131128
"""try with the goose algorithm"""
132-
article = G.extract(raw_html=htmlstring)
129+
try:
130+
article = G.extract(raw_html=htmlstring)
131+
except (AttributeError, UnicodeDecodeError):
132+
return None
133133
if article.publish_date is None:
134134
return None
135-
datematch = re.match(r"[0-9]{4}-[0-9]{2}-[0-9]{2}", article.publish_date)
136135
try:
136+
datematch = re.match(r"[0-9]{4}-[0-9]{2}-[0-9]{2}", article.publish_date)
137137
return datematch[0]
138138
# illogical result
139139
except TypeError:

tests/eval-requirements.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# package
2-
htmldate>=1.5.0
2+
htmldate>=1.6.0
33

44
# alternatives
55
articleDateExtractor==0.20

0 commit comments

Comments
 (0)