Skip to content

Commit 09b3bb1

Browse files
authored
Fix DuckDB outputs and test that they match SQLite
2 parents b9200dc + ffb7343 commit 09b3bb1

File tree

4 files changed

+32
-31
lines changed

4 files changed

+32
-31
lines changed

src/ferc_xbrl_extractor/cli.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,7 @@ def write_to_sqlite(sqlite_engine: Engine, table_name: str, table_data: pd.DataF
110110

111111
def write_to_duckdb(duckdb_path: str, table_name: str, table_data: pd.DataFrame):
112112
"""Write one table to a duckdb database."""
113+
table_data = table_data.reset_index()
113114
with duckdb.connect(duckdb_path) as duckdb_conn:
114115
duckdb_conn.execute(
115116
f"CREATE OR REPLACE TABLE {table_name} AS SELECT * FROM table_data" # noqa: S608

src/ferc_xbrl_extractor/datapackage.py

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -120,7 +120,7 @@ def __hash__(self):
120120
"boolean": "boolean",
121121
"date": "string",
122122
"duration": "string",
123-
"year": "datetime64[ns]",
123+
"year": "Int64",
124124
}
125125
"""
126126
Pandas data type by schema field type (Data Package `field.type`).
@@ -391,9 +391,7 @@ def __init__(self, schema: Schema, period_type: str):
391391
"""Create FactTable and prepare for constructing dataframe."""
392392
self.schema = schema
393393
# Map column names to function to convert parsed values
394-
self.columns = {
395-
field.name: CONVERT_DTYPES[field.type_] for field in schema.fields
396-
}
394+
self.columns = {field.name: field.type_ for field in schema.fields}
397395
self.axes = [name for name in schema.primary_key if name.endswith("axis")]
398396
self.data_columns = [
399397
field.name
@@ -424,7 +422,7 @@ def construct_dataframe(self, instance: Instance) -> pd.DataFrame:
424422
{
425423
"c_id": fact.c_id,
426424
"name": fact.name,
427-
"value": self.columns[fact.name](fact.value),
425+
"value": CONVERT_DTYPES[self.columns[fact.name]](fact.value),
428426
}
429427
for fact in raw_facts
430428
)
@@ -443,7 +441,14 @@ def construct_dataframe(self, instance: Instance) -> pd.DataFrame:
443441
)
444442
)
445443

446-
return contexts.join(facts).set_index(self.schema.primary_key).dropna(how="all")
444+
return (
445+
contexts.join(facts)
446+
.astype(
447+
{name: FIELD_TO_PANDAS[dtype] for name, dtype in self.columns.items()}
448+
)
449+
.set_index(self.schema.primary_key)
450+
.dropna(how="all")
451+
)
447452

448453

449454
class Datapackage(BaseModel):

tests/integration/console_scripts_test.py

Lines changed: 17 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,8 @@
33
from importlib.metadata import entry_points
44

55
import duckdb
6+
import pandas as pd
67
import pytest
7-
from sqlalchemy import create_engine, text
8-
from sqlalchemy.engine import Connection
98

109
# Obtain a list of all deployed entry point scripts to test:
1110
ENTRY_POINTS = [
@@ -26,22 +25,9 @@ def test_extractor_scripts(script_runner, ep):
2625
assert ret.success # nosec: B101
2726

2827

29-
def _get_sqlite_tables(sqlite_conn) -> set[str]:
30-
"""Return set of all tables in SQLITE db."""
31-
tables = sqlite_conn.execute(
32-
text(
33-
"SELECT name FROM sqlite_master "
34-
"WHERE type='table' AND name NOT LIKE 'sqlite_%';"
35-
)
36-
).fetchall()
37-
return {table_name for (table_name,) in tables}
38-
39-
40-
def _get_duckdb_tables(duckdb_conn) -> set[str]:
28+
def _get_tables(conn) -> set[str]:
4129
"""Return set of all tables in duckdb."""
42-
tables = duckdb_conn.execute(
43-
"SELECT table_name FROM information_schema.tables"
44-
).fetchall()
30+
tables = conn.execute("SELECT table_name FROM information_schema.tables").fetchall()
4531
return {table_name for (table_name,) in tables}
4632

4733

@@ -50,9 +36,6 @@ def _find_empty_tables(db_conn, tables: set[str]) -> list[str]:
5036
empty_tables = []
5137
for table_name in tables:
5238
query = f"SELECT COUNT(*) FROM '{table_name}';" # noqa: S608
53-
if isinstance(db_conn, Connection):
54-
query = text(query)
55-
5639
if db_conn.execute(query).fetchone()[0] == 0:
5740
empty_tables.append(table_name)
5841
return empty_tables
@@ -95,15 +78,13 @@ def test_extract_example_filings(script_runner, tmp_path, test_dir):
9578
assert ret.success
9679

9780
# Sanity check the sqlite/duckdb outputs
98-
sqlite_uri = f"sqlite:///{sqlite_path.absolute()}"
99-
sqlite_engine = create_engine(sqlite_uri)
10081
with (
101-
sqlite_engine.begin() as sqlite_conn,
82+
duckdb.connect(sqlite_path) as sqlite_conn,
10283
duckdb.connect(duckdb_path) as duckdb_conn,
10384
):
10485
# Check for tables that only exist in either sqlite/duckdb but not both
105-
sqlite_tables = _get_sqlite_tables(sqlite_conn)
106-
duckdb_tables = _get_duckdb_tables(duckdb_conn)
86+
sqlite_tables = _get_tables(sqlite_conn)
87+
duckdb_tables = _get_tables(duckdb_conn)
10788

10889
extra_sqlite_tables = sqlite_tables - duckdb_tables
10990
extra_duckdb_tables = duckdb_tables - sqlite_tables
@@ -118,6 +99,17 @@ def test_extract_example_filings(script_runner, tmp_path, test_dir):
11899
assert empty_sqlite_tables == []
119100
assert empty_duckdb_tables == []
120101

102+
# Check that tables are identical
103+
# SQLite/duckdb have nuanced dtype differences, so ignore types
104+
for table in sorted(sqlite_tables):
105+
pd.testing.assert_frame_equal(
106+
sqlite_conn.table(table).df(),
107+
duckdb_conn.table(table).df(),
108+
check_like=True,
109+
check_dtype=False,
110+
check_exact=True,
111+
)
112+
121113

122114
@pytest.mark.script_launch_mode("inprocess")
123115
def test_extract_example_filings_bad_form(script_runner, tmp_path, test_dir):

tests/integration/datapackage_test.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,7 @@ def _create_schema(instant=True, axes=None):
9696
'cid_1,EID1,filing,2023-01-01T00:00:01,2021-01-01,2021-12-31,"value 1","value 2",\n'
9797
'cid_4,EID1,filing,2023-01-01T00:00:01,2020-01-01,2020-12-31,"value 3","value 4",\n'
9898
),
99+
dtype="string",
99100
parse_dates=["publication_time"],
100101
),
101102
),
@@ -109,6 +110,7 @@ def _create_schema(instant=True, axes=None):
109110
'cid_4,EID1,filing,2023-01-01T00:00:01,2020-01-01,2020-12-31,total,"value 3","value 4",\n'
110111
'cid_5,EID1,filing,2023-01-01T00:00:01,2020-01-01,2020-12-31,"Dim 1 Value","value 9","value 10",\n'
111112
),
113+
dtype="string",
112114
parse_dates=["publication_time"],
113115
),
114116
),
@@ -121,6 +123,7 @@ def _create_schema(instant=True, axes=None):
121123
'cid_2,EID1,filing,2023-01-01T00:00:01,2021-12-31,total,total,"value 5","value 6",\n'
122124
'cid_3,EID1,filing,2023-01-01T00:00:01,2021-12-31,"Dim 1 Value","ferc:Dimension2Value","value 7","value 8",\n'
123125
),
126+
dtype="string",
124127
parse_dates=["publication_time"],
125128
),
126129
),

0 commit comments

Comments
 (0)