Skip to content

Commit 748e48a

Browse files
benbuzbee and Gallaecio authored
Fix date_parser with prefer_month_of_year wrong results (#1224)
* Fix date_parser with prefer_month_of_year wrong results

Fix two problems:

1. The parser would use the current month even if prefer_month_of_year was not "current" when relative_base was not None.
2. The parser would use the current month to derive "what is the last day of this month". For example, with prefer_month=last and prefer_day=last, but current_month=April, it would return December 30th, because it would use April to find that the last day was the 30th, when it should use the preferred month (December).

Additionally, add a test to test_date_parser that uses prefer_month.

* Run pre-commit

* Update test_dates_parse_utc_offset_does_not_throw to expect January

It is parsing "0:4", French, with settings

```
"PREFER_DATES_FROM": "past",
"PREFER_DAY_OF_MONTH": "first",
"PREFER_LOCALE_DATE_ORDER": True,
"PREFER_MONTH_OF_YEAR": "current",
"RELATIVE_BASE": datetime(
    year=1970, month=1, day=1, hour=0, minute=0, second=0
),
```

It used to expect to get `expected_date=datetime(1969, 12, 31, 14, 4)`, but after my change it gets `datetime(1969, 1, 31, 14, 4)`.

I would argue that with PREFER_MONTH_OF_YEAR set to "current", and "current" being January 1st 1970, `datetime(1969, 1, 31, 14, 4)` is a better result.

However, with this particular set of configuration, I am not exactly 100% sure what to expect. These settings were generated by a fuzzer, so perhaps they don't really make a ton of sense together anyway; rather than change the settings (and thus deviate from what the fuzzer caught), I have opted to update the test expectation to accept January.

* Update German test_search_and_parse to accept January for parsing of 'Die'

It is searching a German string for dates and asserting that when it finds the word "Die" in the string, it should be parsed as `datetime.datetime(1999, 12, 28, 0, 0)`.

Similarly, my change makes this `datetime.datetime(1999, 1, 28, 0, 0)` instead.

I don't speak German, but as far as I can tell "Die" just means "The", so I have no idea why it is even matching it.
In my opinion, this could be a bug with the search identifying a non-date word, and so I can't really guess as to what a sensible result would be. For the sake of simplicity, I also just updated this test to accept January.

---------

Co-authored-by: Adrián Chaves <[email protected]>
1 parent f659364 commit 748e48a

File tree

4 files changed

+79
-8
lines changed

4 files changed

+79
-8
lines changed

dateparser/parser.py

+9-4
Original file line numberDiff line numberDiff line change
@@ -598,10 +598,13 @@ def _correct_for_month(self, dateobj):
598598
relative_base_month = (
599599
relative_base.month if hasattr(relative_base, "month") else relative_base
600600
)
601-
if getattr(self, "_token_month", None) or relative_base_month:
601+
602+
if getattr(self, "_token_month", None):
602603
return dateobj
603604

604-
dateobj = set_correct_month_from_settings(dateobj, self.settings)
605+
dateobj = set_correct_month_from_settings(
606+
dateobj, self.settings, relative_base_month
607+
)
605608
return dateobj
606609

607610
@classmethod
@@ -613,11 +616,13 @@ def parse(cls, datestring, settings, tz=None):
613616
# correction for past, future if applicable
614617
dateobj = po._correct_for_time_frame(dateobj, tz)
615618

619+
# correction for preference of month: beginning, current, end
620+
# must happen before day so that day is derived from the correct month
621+
dateobj = po._correct_for_month(dateobj)
622+
616623
# correction for preference of day: beginning, current, end
617624
dateobj = po._correct_for_day(dateobj)
618625

619-
# correction for preference of month: beginning, current, end
620-
dateobj = po._correct_for_month(dateobj)
621626
period = po._get_period()
622627

623628
return dateobj, period

tests/test_clean_api.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -119,7 +119,7 @@ def test_dates_which_match_locales_are_parsed(
119119
languages=["en"],
120120
region="",
121121
date_formats=["%a", "%a", "%a", "%a"],
122-
expected_date=datetime(1969, 12, 31, 14, 4),
122+
expected_date=datetime(1969, 1, 31, 14, 4),
123123
)
124124
]
125125
)

tests/test_date_parser.py

+60
Original file line numberDiff line numberDiff line change
@@ -1265,6 +1265,66 @@ def test_prefer_dates_from_with_timezone(
12651265
self.then_date_was_parsed_by_date_parser()
12661266
self.then_date_obj_exactly_is(expected)
12671267

1268+
@parameterized.expand(
1269+
[
1270+
param(
1271+
"2015",
1272+
prefer_day="current",
1273+
prefer_month="current",
1274+
today=datetime(2010, 2, 10),
1275+
expected=datetime(2015, 2, 10),
1276+
),
1277+
param(
1278+
"2015",
1279+
prefer_day="last",
1280+
prefer_month="current",
1281+
today=datetime(2010, 2, 10),
1282+
expected=datetime(2015, 2, 28),
1283+
),
1284+
param(
1285+
"2015",
1286+
prefer_day="first",
1287+
prefer_month="current",
1288+
today=datetime(2010, 2, 10),
1289+
expected=datetime(2015, 2, 1),
1290+
),
1291+
param(
1292+
"2015",
1293+
prefer_day="current",
1294+
prefer_month="last",
1295+
today=datetime(2010, 2, 10),
1296+
expected=datetime(2015, 12, 10),
1297+
),
1298+
param(
1299+
"2015",
1300+
prefer_day="last",
1301+
prefer_month="last",
1302+
today=datetime(2010, 2, 10),
1303+
expected=datetime(2015, 12, 31),
1304+
),
1305+
param(
1306+
"2020", # Leap year last day test
1307+
prefer_day="last",
1308+
prefer_month="current",
1309+
today=datetime(2010, 2, 10),
1310+
expected=datetime(2020, 2, 29),
1311+
),
1312+
]
1313+
)
1314+
def test_dates_with_no_day_or_month(
1315+
self, date_string, prefer_day, prefer_month, today=None, expected=None
1316+
):
1317+
self.given_parser(
1318+
settings={
1319+
"PREFER_DAY_OF_MONTH": prefer_day,
1320+
"PREFER_MONTH_OF_YEAR": prefer_month,
1321+
"RELATIVE_BASE": today,
1322+
}
1323+
)
1324+
self.when_date_is_parsed(date_string)
1325+
self.then_date_was_parsed_by_date_parser()
1326+
self.then_date_obj_exactly_is(expected)
1327+
12681328
def given_local_tz_offset(self, offset):
12691329
self.add_patch(
12701330
patch.object(

tests/test_search.py

+9-3
Original file line numberDiff line numberDiff line change
@@ -410,7 +410,7 @@ def test_search_date_string(self, shortname, datetime_string):
410410
"Die UdSSR blieb gemäß dem Neutralitätspakt "
411411
"vom 13. April 1941 gegenüber Japan vorerst neutral.",
412412
[
413-
("Die", datetime.datetime(1999, 12, 28, 0, 0)),
413+
("Die", datetime.datetime(1999, 1, 28, 0, 0)),
414414
("13. April 1941", datetime.datetime(1941, 4, 13, 0, 0)),
415415
],
416416
settings={"RELATIVE_BASE": datetime.datetime(2000, 1, 1)},
@@ -825,7 +825,10 @@ def test_splitting_of_not_parsed(self, shortname, string, expected, settings=Non
825825
"бомбардировки срещу Япония, използувайки новозавладените острови като бази.",
826826
),
827827
# Chinese
828-
param("zh", "不過大多數人仍多把第二次世界大戰的爆發定為1939年9月1日德國入侵波蘭開始,2015年04月08日10点05。"),
828+
param(
829+
"zh",
830+
"不過大多數人仍多把第二次世界大戰的爆發定為1939年9月1日德國入侵波蘭開始,2015年04月08日10点05。",
831+
),
829832
# Czech
830833
param(
831834
"cs",
@@ -897,7 +900,10 @@ def test_splitting_of_not_parsed(self, shortname, string, expected, settings=Non
897900
"d'Etiopia. Il 9 maggio 1936 venne proclamato l'Impero. ",
898901
),
899902
# Japanese
900-
param("ja", "1933年(昭和8年)12月23日午前6時39分、宮城(現:皇居)内の産殿にて誕生。"),
903+
param(
904+
"ja",
905+
"1933年(昭和8年)12月23日午前6時39分、宮城(現:皇居)内の産殿にて誕生。",
906+
),
901907
# Persian
902908
param("fa", "نگ جهانی دوم جنگ جدی بین سپتامبر 1939 و 2 سپتامبر 1945 بود."),
903909
# Polish

0 commit comments

Comments
 (0)