Skip to content

Commit 3608c14

Browse files
authored
Merge pull request #396 from catalyst-cooperative/string-datetimes
Treat datetime columns as strings
2 parents f351942 + f461c40 commit 3608c14

File tree

7 files changed

+25
-39
lines changed

7 files changed

+25
-39
lines changed

.github/release.yml

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
---
2+
changelog:
3+
exclude:
4+
authors:
5+
- catalyst-workflow-triggerer
6+
- dependabot
7+
- pre-commit-ci
8+
- pudlbot
9+
labels:
10+
- dependencies

.pre-commit-config.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ repos:
2828
- repo: https://github.com/astral-sh/ruff-pre-commit
2929
rev: v0.14.7
3030
hooks:
31-
- id: ruff
31+
- id: ruff-check
3232
args: [--fix, --exit-non-zero-on-fix]
3333
- id: ruff-format
3434

src/ferc_xbrl_extractor/datapackage.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
import stringcase
1010
from pydantic import BaseModel
1111

12-
from ferc_xbrl_extractor.helpers import get_logger, parse_dates
12+
from ferc_xbrl_extractor.helpers import get_logger
1313
from ferc_xbrl_extractor.instance import Instance
1414
from ferc_xbrl_extractor.taxonomy import Concept, LinkRole, Taxonomy
1515

@@ -118,7 +118,7 @@ def __hash__(self):
118118
"number": "Float64",
119119
"integer": "Int64",
120120
"boolean": "boolean",
121-
"date": "datetime64[ms]",
121+
"date": "string",
122122
"duration": "string",
123123
"year": "Int64",
124124
}
@@ -133,7 +133,7 @@ def __hash__(self):
133133
"number": float,
134134
"boolean": bool,
135135
"duration": str,
136-
"date": parse_dates,
136+
"date": str,
137137
}
138138
"""
139139
Map callables to schema field type to convert parsed values (Data Package `field.type`).

src/ferc_xbrl_extractor/helpers.py

Lines changed: 0 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -2,20 +2,6 @@
22

33
import logging
44

5-
import pandas as pd
6-
7-
8-
def parse_dates(date_str: str) -> pd.Timestamp:
9-
"""Helper to normalize date strings/parse in a consistent way."""
10-
try:
11-
if "24:00:00" in date_str:
12-
return pd.to_datetime(
13-
date_str.replace("24:00:00", "00:00:00")
14-
) + pd.Timedelta(days=1)
15-
return pd.to_datetime(date_str)
16-
except ValueError:
17-
return pd.NaT
18-
195

206
def get_logger(name: str) -> logging.Logger:
217
"""Helper function to append 'catalystcoop' to logger name and return logger."""

src/ferc_xbrl_extractor/instance.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
from lxml.etree import _Element as Element # nosec: B410
1717
from pydantic import BaseModel, field_validator
1818

19-
from ferc_xbrl_extractor.helpers import get_logger, parse_dates
19+
from ferc_xbrl_extractor.helpers import get_logger
2020

2121
XBRL_INSTANCE = "http://www.xbrl.org/2003/instance"
2222
XBRL_LINK = "http://www.xbrl.org/2003/linkbase"
@@ -179,12 +179,12 @@ def as_primary_key(self, filing_name: str, axes: list[str]) -> dict[str, str]:
179179

180180
# Get date based on period type
181181
if self.period.instant:
182-
date_dict = {"date": parse_dates(self.period.end_date)}
182+
date_dict = {"date": self.period.end_date}
183183
else:
184184
date_dict = {
185185
# Ignore type because start_date will always be str if duration period
186-
"start_date": parse_dates(self.period.start_date),
187-
"end_date": parse_dates(self.period.end_date),
186+
"start_date": self.period.start_date,
187+
"end_date": self.period.end_date,
188188
}
189189

190190
return {

tests/integration/datapackage_test.py

Lines changed: 4 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,7 @@ def _create_schema(instant=True, axes=None):
9797
'cid_4,EID1,filing,2023-01-01T00:00:01,2020-01-01,2020-12-31,"value 3","value 4",\n'
9898
),
9999
dtype="string",
100-
parse_dates=["publication_time", "start_date", "end_date"],
100+
parse_dates=["publication_time"],
101101
),
102102
),
103103
(
@@ -111,7 +111,7 @@ def _create_schema(instant=True, axes=None):
111111
'cid_5,EID1,filing,2023-01-01T00:00:01,2020-01-01,2020-12-31,"Dim 1 Value","value 9","value 10",\n'
112112
),
113113
dtype="string",
114-
parse_dates=["publication_time", "start_date", "end_date"],
114+
parse_dates=["publication_time"],
115115
),
116116
),
117117
(
@@ -124,7 +124,7 @@ def _create_schema(instant=True, axes=None):
124124
'cid_3,EID1,filing,2023-01-01T00:00:01,2021-12-31,"Dim 1 Value","ferc:Dimension2Value","value 7","value 8",\n'
125125
),
126126
dtype="string",
127-
parse_dates=["publication_time", "date"],
127+
parse_dates=["publication_time"],
128128
),
129129
),
130130
],
@@ -148,12 +148,5 @@ def test_construct_dataframe(table_schema, period, df, in_memory_filing):
148148
.drop("c_id", axis="columns")
149149
.reset_index()
150150
)
151-
# Make sure date types use correct units
152-
expected_df = expected_df.astype(
153-
{
154-
col: "datetime64[ms]"
155-
for col in ["publication_time", "start_date", "end_date", "date"]
156-
if col in expected_df.columns
157-
}
158-
)
151+
expected_df = expected_df.astype({"publication_time": "string"})
159152
pd.testing.assert_frame_equal(expected_df, constructed_df)

tests/unit/instance_test.py

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44
import logging
55
from collections import Counter
66

7-
import pandas as pd
87
import pytest
98

109
from ferc_xbrl_extractor.instance import (
@@ -80,18 +79,16 @@ def test_context_ids(test_context):
8079

8180
assert context_ids.get("entity_id") == test_context.entity.identifier
8281
assert context_ids.get("filing_name") == "filing_name"
83-
assert context_ids.get("date") == pd.to_datetime(test_context.period.end_date)
82+
assert context_ids.get("date") == test_context.period.end_date
8483

8584
# Change context to have a duration period, then check the start and end dates
8685
test_context.period.instant = False
8786
test_context.period.start_date = "2019-01-01"
8887

8988
context_ids = test_context.as_primary_key("filing_name", axes)
9089

91-
assert context_ids.get("start_date") == pd.to_datetime(
92-
test_context.period.start_date
93-
)
94-
assert context_ids.get("end_date") == pd.to_datetime(test_context.period.end_date)
90+
assert context_ids.get("start_date") == test_context.period.start_date
91+
assert context_ids.get("end_date") == test_context.period.end_date
9592

9693

9794
@pytest.mark.parametrize(

0 commit comments

Comments
 (0)