Fix datetime type for publication date (#392)

zschira · web-flow · commit 017a5e6dfbed · 2025-11-26T09:28:53.000-05:00
* Fix datetime type for publication date

* Fix date units in test

* Fix datetime comparisons and remove Python 3.10 support

* Normalize dates
diff --git a/.github/workflows/tox-pytest.yml b/.github/workflows/tox-pytest.yml
@@ -7,7 +7,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ["3.10", "3.11", "3.12", "3.13"]
+        python-version: ["3.11", "3.12", "3.13"]
       fail-fast: false
     defaults:
       run:
diff --git a/pyproject.toml b/pyproject.toml
@@ -10,7 +10,7 @@ authors = [
     { name = "Catalyst Cooperative", email = "pudl@catalyst.coop" },
     { name = "Zach Schira", email = "zach.schira@catalyst.coop" },
 ]
-requires-python = ">=3.10,<3.14.0a0"
+requires-python = ">=3.11,<3.14.0a0"
 dynamic = ["version"]
 license = { file = "LICENSE.txt" }
 dependencies = [
@@ -35,7 +35,6 @@ classifiers = [
     "Programming Language :: Python",
     "Programming Language :: Python :: 3",
     "Programming Language :: Python :: 3 :: Only",
-    "Programming Language :: Python :: 3.10",
     "Programming Language :: Python :: 3.11",
     "Programming Language :: Python :: 3.12",
     "Programming Language :: Python :: 3.13",
diff --git a/src/ferc_xbrl_extractor/datapackage.py b/src/ferc_xbrl_extractor/datapackage.py
@@ -9,7 +9,7 @@
 import stringcase
 from pydantic import BaseModel
 
-from ferc_xbrl_extractor.helpers import get_logger
+from ferc_xbrl_extractor.helpers import get_logger, parse_dates
 from ferc_xbrl_extractor.instance import Instance
 from ferc_xbrl_extractor.taxonomy import Concept, LinkRole, Taxonomy
 
@@ -118,7 +118,7 @@ def __hash__(self):
     "number": "Float64",
     "integer": "Int64",
     "boolean": "boolean",
-    "date": "string",
+    "date": "datetime64[ms]",
     "duration": "string",
     "year": "Int64",
 }
@@ -133,7 +133,7 @@ def __hash__(self):
     "number": float,
     "boolean": bool,
     "duration": str,
-    "date": str,
+    "date": parse_dates,
 }
 """
 Map callables to schema field type to convert parsed values (Data Package `field.type`).
diff --git a/src/ferc_xbrl_extractor/helpers.py b/src/ferc_xbrl_extractor/helpers.py
@@ -2,6 +2,20 @@
 
 import logging
 
+import pandas as pd
+
+
+def parse_dates(date_str: str) -> pd.Timestamp:
+    """Parse a date string to a Timestamp; return NaT if parsing raises ValueError."""
+    try:
+        if "24:00:00" in date_str:
+            return pd.to_datetime(
+                date_str.replace("24:00:00", "00:00:00")
+            ) + pd.Timedelta(days=1)
+        return pd.to_datetime(date_str)
+    except ValueError:
+        return pd.NaT
+
 
 def get_logger(name: str) -> logging.Logger:
     """Helper function to append 'catalystcoop' to logger name and return logger."""
diff --git a/src/ferc_xbrl_extractor/instance.py b/src/ferc_xbrl_extractor/instance.py
@@ -16,7 +16,7 @@
 from lxml.etree import _Element as Element  # nosec: B410
 from pydantic import BaseModel, field_validator
 
-from ferc_xbrl_extractor.helpers import get_logger
+from ferc_xbrl_extractor.helpers import get_logger, parse_dates
 
 XBRL_INSTANCE = "http://www.xbrl.org/2003/instance"
 XBRL_LINK = "http://www.xbrl.org/2003/linkbase"
@@ -179,12 +179,12 @@ def as_primary_key(self, filing_name: str, axes: list[str]) -> dict[str, str]:
 
         # Get date based on period type
         if self.period.instant:
-            date_dict = {"date": self.period.end_date}
+            date_dict = {"date": parse_dates(self.period.end_date)}
         else:
             date_dict = {
                 # Ignore type because start_date will always be str if duration period
-                "start_date": self.period.start_date,
-                "end_date": self.period.end_date,
+                "start_date": parse_dates(self.period.start_date),
+                "end_date": parse_dates(self.period.end_date),
             }
 
         return {
@@ -421,10 +421,14 @@ def instances_from_zip(instance_path: Path | io.BytesIO) -> list[InstanceBuilder
     with archive.open("rssfeed") as f:
         filings_metadata = json.loads(f.read())
 
+    # Publication times are expected to be in UTC already, but convert to UTC
+    # explicitly to be safe, then drop the tzinfo so the values are timezone-naive.
     publication_times = {
         filing["filename"]: datetime.datetime.fromisoformat(
             filing["rss_metadata"]["published_parsed"]
         )
+        .astimezone(datetime.UTC)
+        .replace(tzinfo=None)
         for filers_metadata in filings_metadata.values()
         for filing in filers_metadata
     }
diff --git a/tests/integration/console_scripts_test.py b/tests/integration/console_scripts_test.py
@@ -41,6 +41,13 @@ def _find_empty_tables(db_conn, tables: set[str]) -> list[str]:
     return empty_tables
 
 
+def _get_sqlite_df(db_conn, table: str) -> pd.DataFrame:
+    df = db_conn.table(table).df()
+    return df.astype(
+        dict.fromkeys(df.select_dtypes(include=["datetime"]).columns, "datetime64[ms]")
+    )
+
+
 @pytest.mark.script_launch_mode("inprocess")
 def test_extract_example_filings(script_runner, tmp_path, test_dir):
     """Test the XBRL extraction on the example filings.
@@ -103,7 +110,7 @@ def test_extract_example_filings(script_runner, tmp_path, test_dir):
         # SQLite/duckdb have nuanced dtype differences, so ignore types
         for table in sorted(sqlite_tables):
             pd.testing.assert_frame_equal(
-                sqlite_conn.table(table).df(),
+                _get_sqlite_df(sqlite_conn, table),
                 duckdb_conn.table(table).df(),
                 check_like=True,
                 check_dtype=False,
diff --git a/tests/integration/datapackage_test.py b/tests/integration/datapackage_test.py
@@ -97,7 +97,7 @@ def _create_schema(instant=True, axes=None):
                     'cid_4,EID1,filing,2023-01-01T00:00:01,2020-01-01,2020-12-31,"value 3","value 4",\n'
                 ),
                 dtype="string",
-                parse_dates=["publication_time"],
+                parse_dates=["publication_time", "start_date", "end_date"],
             ),
         ),
         (
@@ -111,7 +111,7 @@ def _create_schema(instant=True, axes=None):
                     'cid_5,EID1,filing,2023-01-01T00:00:01,2020-01-01,2020-12-31,"Dim 1 Value","value 9","value 10",\n'
                 ),
                 dtype="string",
-                parse_dates=["publication_time"],
+                parse_dates=["publication_time", "start_date", "end_date"],
             ),
         ),
         (
@@ -124,7 +124,7 @@ def _create_schema(instant=True, axes=None):
                     'cid_3,EID1,filing,2023-01-01T00:00:01,2021-12-31,"Dim 1 Value","ferc:Dimension2Value","value 7","value 8",\n'
                 ),
                 dtype="string",
-                parse_dates=["publication_time"],
+                parse_dates=["publication_time", "date"],
             ),
         ),
     ],
@@ -142,11 +142,18 @@ def test_construct_dataframe(table_schema, period, df, in_memory_filing):
     fact_table = FactTable(table_schema, period)
 
     constructed_df = fact_table.construct_dataframe(instance).reset_index()
-    constructed_df = constructed_df.astype({"publication_time": "datetime64[s]"})
+
     expected_df = (
         df.set_index(table_schema.primary_key)
         .drop("c_id", axis="columns")
         .reset_index()
     )
-    expected_df = expected_df.astype({"publication_time": "datetime64[s]"})
+    # Cast whichever date columns are present to millisecond units before comparing
+    expected_df = expected_df.astype(
+        {
+            col: "datetime64[ms]"
+            for col in ["publication_time", "start_date", "end_date", "date"]
+            if col in expected_df.columns
+        }
+    )
     pd.testing.assert_frame_equal(expected_df, constructed_df)
diff --git a/tests/unit/instance_test.py b/tests/unit/instance_test.py
@@ -4,6 +4,7 @@
 import logging
 from collections import Counter
 
+import pandas as pd
 import pytest
 
 from ferc_xbrl_extractor.instance import (
@@ -79,16 +80,18 @@ def test_context_ids(test_context):
 
     assert context_ids.get("entity_id") == test_context.entity.identifier
     assert context_ids.get("filing_name") == "filing_name"
-    assert context_ids.get("date") == test_context.period.end_date
+    assert context_ids.get("date") == pd.to_datetime(test_context.period.end_date)
 
     # Change context to have a duration period, then change
     test_context.period.instant = False
     test_context.period.start_date = "2019-01-01"
 
     context_ids = test_context.as_primary_key("filing_name", axes)
 
-    assert context_ids.get("start_date") == test_context.period.start_date
-    assert context_ids.get("end_date") == test_context.period.end_date
+    assert context_ids.get("start_date") == pd.to_datetime(
+        test_context.period.start_date
+    )
+    assert context_ids.get("end_date") == pd.to_datetime(test_context.period.end_date)
 
 
 @pytest.mark.parametrize(