Skip to content

Commit 017a5e6

Browse files
authored
Fix datetime type for publication date (#392)
* Fix datetime type for publication date
* Fix date units in test
* Fix datetime comparisons and remove python 3.10 support
* Normalize dates
1 parent 7b62002 commit 017a5e6

File tree

8 files changed

+53
-19
lines changed

8 files changed

+53
-19
lines changed

.github/workflows/tox-pytest.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ jobs:
77
runs-on: ubuntu-latest
88
strategy:
99
matrix:
10-
python-version: ["3.10", "3.11", "3.12", "3.13"]
10+
python-version: ["3.11", "3.12", "3.13"]
1111
fail-fast: false
1212
defaults:
1313
run:

pyproject.toml

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ authors = [
1010
{ name = "Catalyst Cooperative", email = "[email protected]" },
1111
{ name = "Zach Schira", email = "[email protected]" },
1212
]
13-
requires-python = ">=3.10,<3.14.0a0"
13+
requires-python = ">=3.11,<3.14.0a0"
1414
dynamic = ["version"]
1515
license = { file = "LICENSE.txt" }
1616
dependencies = [
@@ -35,7 +35,6 @@ classifiers = [
3535
"Programming Language :: Python",
3636
"Programming Language :: Python :: 3",
3737
"Programming Language :: Python :: 3 :: Only",
38-
"Programming Language :: Python :: 3.10",
3938
"Programming Language :: Python :: 3.11",
4039
"Programming Language :: Python :: 3.12",
4140
"Programming Language :: Python :: 3.13",

src/ferc_xbrl_extractor/datapackage.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
import stringcase
1010
from pydantic import BaseModel
1111

12-
from ferc_xbrl_extractor.helpers import get_logger
12+
from ferc_xbrl_extractor.helpers import get_logger, parse_dates
1313
from ferc_xbrl_extractor.instance import Instance
1414
from ferc_xbrl_extractor.taxonomy import Concept, LinkRole, Taxonomy
1515

@@ -118,7 +118,7 @@ def __hash__(self):
118118
"number": "Float64",
119119
"integer": "Int64",
120120
"boolean": "boolean",
121-
"date": "string",
121+
"date": "datetime64[ms]",
122122
"duration": "string",
123123
"year": "Int64",
124124
}
@@ -133,7 +133,7 @@ def __hash__(self):
133133
"number": float,
134134
"boolean": bool,
135135
"duration": str,
136-
"date": str,
136+
"date": parse_dates,
137137
}
138138
"""
139139
Map callables to schema field type to convert parsed values (Data Package `field.type`).

src/ferc_xbrl_extractor/helpers.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,20 @@
22

33
import logging
44

5+
import pandas as pd
6+
7+
8+
def parse_dates(date_str: str) -> pd.Timestamp:
9+
"""Helper to normalize date strings/parse in a consistent way."""
10+
try:
11+
if "24:00:00" in date_str:
12+
return pd.to_datetime(
13+
date_str.replace("24:00:00", "00:00:00")
14+
) + pd.Timedelta(days=1)
15+
return pd.to_datetime(date_str)
16+
except ValueError:
17+
return pd.NaT
18+
519

620
def get_logger(name: str) -> logging.Logger:
721
"""Helper function to append 'catalystcoop' to logger name and return logger."""

src/ferc_xbrl_extractor/instance.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
from lxml.etree import _Element as Element # nosec: B410
1717
from pydantic import BaseModel, field_validator
1818

19-
from ferc_xbrl_extractor.helpers import get_logger
19+
from ferc_xbrl_extractor.helpers import get_logger, parse_dates
2020

2121
XBRL_INSTANCE = "http://www.xbrl.org/2003/instance"
2222
XBRL_LINK = "http://www.xbrl.org/2003/linkbase"
@@ -179,12 +179,12 @@ def as_primary_key(self, filing_name: str, axes: list[str]) -> dict[str, str]:
179179

180180
# Get date based on period type
181181
if self.period.instant:
182-
date_dict = {"date": self.period.end_date}
182+
date_dict = {"date": parse_dates(self.period.end_date)}
183183
else:
184184
date_dict = {
185185
# Ignore type because start_date will always be str if duration period
186-
"start_date": self.period.start_date,
187-
"end_date": self.period.end_date,
186+
"start_date": parse_dates(self.period.start_date),
187+
"end_date": parse_dates(self.period.end_date),
188188
}
189189

190190
return {
@@ -421,10 +421,14 @@ def instances_from_zip(instance_path: Path | io.BytesIO) -> list[InstanceBuilder
421421
with archive.open("rssfeed") as f:
422422
filings_metadata = json.loads(f.read())
423423

424+
# Publication time is always published as UTC, but just to be safe convert to UTC
425+
# then make timezone naive
424426
publication_times = {
425427
filing["filename"]: datetime.datetime.fromisoformat(
426428
filing["rss_metadata"]["published_parsed"]
427429
)
430+
.astimezone(datetime.UTC)
431+
.replace(tzinfo=None)
428432
for filers_metadata in filings_metadata.values()
429433
for filing in filers_metadata
430434
}

tests/integration/console_scripts_test.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,13 @@ def _find_empty_tables(db_conn, tables: set[str]) -> list[str]:
4141
return empty_tables
4242

4343

44+
def _get_sqlite_df(db_conn, table: str) -> pd.DataFrame:
    """Read ``table`` from ``db_conn`` with datetime columns cast to ms units.

    Args:
        db_conn: Connection object exposing ``.table(name).df()``.
        table: Name of the table to load.

    Returns:
        The table as a dataframe in which every column selected by
        ``select_dtypes(include=["datetime"])`` is cast to ``datetime64[ms]``.
    """
    frame = db_conn.table(table).df()
    datetime_cols = frame.select_dtypes(include=["datetime"]).columns
    return frame.astype({col: "datetime64[ms]" for col in datetime_cols})
49+
50+
4451
@pytest.mark.script_launch_mode("inprocess")
4552
def test_extract_example_filings(script_runner, tmp_path, test_dir):
4653
"""Test the XBRL extraction on the example filings.
@@ -103,7 +110,7 @@ def test_extract_example_filings(script_runner, tmp_path, test_dir):
103110
# SQLite/duckdb have nuanced dtype differences, so ignore types
104111
for table in sorted(sqlite_tables):
105112
pd.testing.assert_frame_equal(
106-
sqlite_conn.table(table).df(),
113+
_get_sqlite_df(sqlite_conn, table),
107114
duckdb_conn.table(table).df(),
108115
check_like=True,
109116
check_dtype=False,

tests/integration/datapackage_test.py

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,7 @@ def _create_schema(instant=True, axes=None):
9797
'cid_4,EID1,filing,2023-01-01T00:00:01,2020-01-01,2020-12-31,"value 3","value 4",\n'
9898
),
9999
dtype="string",
100-
parse_dates=["publication_time"],
100+
parse_dates=["publication_time", "start_date", "end_date"],
101101
),
102102
),
103103
(
@@ -111,7 +111,7 @@ def _create_schema(instant=True, axes=None):
111111
'cid_5,EID1,filing,2023-01-01T00:00:01,2020-01-01,2020-12-31,"Dim 1 Value","value 9","value 10",\n'
112112
),
113113
dtype="string",
114-
parse_dates=["publication_time"],
114+
parse_dates=["publication_time", "start_date", "end_date"],
115115
),
116116
),
117117
(
@@ -124,7 +124,7 @@ def _create_schema(instant=True, axes=None):
124124
'cid_3,EID1,filing,2023-01-01T00:00:01,2021-12-31,"Dim 1 Value","ferc:Dimension2Value","value 7","value 8",\n'
125125
),
126126
dtype="string",
127-
parse_dates=["publication_time"],
127+
parse_dates=["publication_time", "date"],
128128
),
129129
),
130130
],
@@ -142,11 +142,18 @@ def test_construct_dataframe(table_schema, period, df, in_memory_filing):
142142
fact_table = FactTable(table_schema, period)
143143

144144
constructed_df = fact_table.construct_dataframe(instance).reset_index()
145-
constructed_df = constructed_df.astype({"publication_time": "datetime64[s]"})
145+
146146
expected_df = (
147147
df.set_index(table_schema.primary_key)
148148
.drop("c_id", axis="columns")
149149
.reset_index()
150150
)
151-
expected_df = expected_df.astype({"publication_time": "datetime64[s]"})
151+
# Make sure date types use correct units
152+
expected_df = expected_df.astype(
153+
{
154+
col: "datetime64[ms]"
155+
for col in ["publication_time", "start_date", "end_date", "date"]
156+
if col in expected_df.columns
157+
}
158+
)
152159
pd.testing.assert_frame_equal(expected_df, constructed_df)

tests/unit/instance_test.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import logging
55
from collections import Counter
66

7+
import pandas as pd
78
import pytest
89

910
from ferc_xbrl_extractor.instance import (
@@ -79,16 +80,18 @@ def test_context_ids(test_context):
7980

8081
assert context_ids.get("entity_id") == test_context.entity.identifier
8182
assert context_ids.get("filing_name") == "filing_name"
82-
assert context_ids.get("date") == test_context.period.end_date
83+
assert context_ids.get("date") == pd.to_datetime(test_context.period.end_date)
8384

8485
# Change context to have a duration period, then check the primary key again
8586
test_context.period.instant = False
8687
test_context.period.start_date = "2019-01-01"
8788

8889
context_ids = test_context.as_primary_key("filing_name", axes)
8990

90-
assert context_ids.get("start_date") == test_context.period.start_date
91-
assert context_ids.get("end_date") == test_context.period.end_date
91+
assert context_ids.get("start_date") == pd.to_datetime(
92+
test_context.period.start_date
93+
)
94+
assert context_ids.get("end_date") == pd.to_datetime(test_context.period.end_date)
9295

9396

9497
@pytest.mark.parametrize(

0 commit comments

Comments
 (0)