Commit ff29071

Merge pull request #242 from catalyst-cooperative/datapackage_fix

Fix datapackage generation from multiple taxonomies

2 parents: d550b43 + 15ccb89

File tree: 10 files changed, +147 −74 lines


README.rst
Lines changed: 3 additions & 3 deletions

@@ -124,7 +124,7 @@ filings, use the command:

 .. code-block:: console

-   $ xbrl_extract examples/ferc1-2021-sample.zip ./ferc1-2021-sample.sqlite \
+   $ xbrl_extract examples/ferc1-2021-sample.zip --db-path ./ferc1-2021-sample.sqlite \
       --taxonomy examples/ferc1-xbrl-taxonomies.zip

 The tool expects the ``--taxonomy`` option to point to a zipfile containing archived
@@ -144,7 +144,7 @@ batches of 50 filings at a time.

 .. code-block:: console

-   $ xbrl_extract examples/ferc1-2021-sample.zip ./ferc1-2021-sample.sqlite \
+   $ xbrl_extract examples/ferc1-2021-sample.zip --db-path ./ferc1-2021-sample.sqlite \
       --taxonomy examples/ferc1-xbrl-taxonomies.zip \
       --workers 5 \
       --batch-size 50
@@ -160,7 +160,7 @@ filings and taxonomy, run the following command.

 .. code-block:: console

-   $ xbrl_extract examples/ferc1-2021-sample.zip ./ferc1-2021-sample.sqlite \
+   $ xbrl_extract examples/ferc1-2021-sample.zip --db-path ./ferc1-2021-sample.sqlite \
       --taxonomy examples/ferc1-xbrl-taxonomies.zip \
       --metadata-path metadata.json \
       --datapackage-path datapackage.json

examples/ferc1-2021-sample.zip
-14.8 MB (binary file not shown)

src/ferc_xbrl_extractor/cli.py
Lines changed: 1 addition & 1 deletion

@@ -24,7 +24,7 @@ def parse():
     )
     parser.add_argument(
         "-d",
-        "--db_path",
+        "--db-path",
         default="ferc-xbrl.sqlite",
         help="Store data in sqlite database specified in argument",
     )
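One detail worth noting about this rename: argparse derives the attribute name from the long option string by replacing interior dashes with underscores, so code that reads args.db_path keeps working unchanged. A quick standalone check:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("-d", "--db-path", default="ferc-xbrl.sqlite")

# argparse turns "--db-path" into the attribute "db_path" automatically,
# so the CLI flag rename requires no changes to attribute accesses.
args = parser.parse_args(["--db-path", "out.sqlite"])
assert args.db_path == "out.sqlite"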

src/ferc_xbrl_extractor/datapackage.py
Lines changed: 91 additions & 13 deletions

@@ -13,6 +13,8 @@
 from ferc_xbrl_extractor.instance import Instance
 from ferc_xbrl_extractor.taxonomy import Concept, LinkRole, Taxonomy

+logger = get_logger(__name__)
+

 class Field(BaseModel):
     """A generic field descriptor, as per Frictionless Data specs.
@@ -332,6 +334,47 @@ def get_period_type(self):
         period_type = "instant" if "date" in self.schema_.primary_key else "duration"
         return period_type

+    def merge_resources(self, other: "Resource", other_version: str) -> "Resource":
+        """Merge same resource from multiple taxonomies.
+
+        This method attempts to merge resource definitions from multiple taxonomies,
+        creating a unified schema for the table in question. It does this by first
+        comparing the primary keys of the two tables. If the primary keys aren't
+        exactly the same it will raise an error. For the remaining columns, this
+        method will check if there are any that are new or missing in ``other``.
+        New columns will be added to the table's schema, and missing columns will
+        be logged, but remain in the schema.
+        """
+        if self.schema_.primary_key != other.schema_.primary_key:
+            raise RuntimeError(
+                f"Can't merge resource {self.name} when versions have incompatible schemas"
+            )
+        original_fields = {field.name for field in self.schema_.fields}
+        other_fields = {field.name for field in other.schema_.fields}
+
+        if missing_fields := original_fields - other_fields:
+            logger.warning(
+                f"The following fields were removed from table {self.name} "
+                f"in taxonomy version {other_version}: {missing_fields}"
+            )
+
+        fields = self.schema_.fields
+        if new_fields := other_fields - original_fields:
+            logger.warning(
+                f"The following fields were added to table {self.name} "
+                f"in taxonomy version {other_version}: {new_fields}"
+            )
+            # Add new fields to schema
+            fields += [
+                field for field in other.schema_.fields if field.name in new_fields
+            ]
+        # Return resource with updated schema
+        return self.model_copy(
+            update={
+                "schema": Schema(primary_key=self.schema_.primary_key, fields=fields)
+            }
+        )
+

 class FactTable:
     """Class to handle constructing a dataframe from an XBRL fact table.
@@ -355,7 +398,6 @@ def __init__(self, schema: Schema, period_type: str):
             if field.name not in schema.primary_key
         ]
         self.instant = period_type == "instant"
-        self.logger = get_logger(__name__)

     def construct_dataframe(self, instance: Instance) -> pd.DataFrame:
         """Construct dataframe from a parsed XBRL instance.
@@ -413,24 +455,60 @@ class Datapackage(BaseModel):
     resources: list[Resource]

     @classmethod
-    def from_taxonomy(
-        cls, taxonomy: Taxonomy, db_uri: str, form_number: int = 1
+    def from_taxonomies(
+        cls, taxonomies: dict[str, Taxonomy], db_uri: str, form_number: int = 1
     ) -> "Datapackage":
-        """Construct a Datapackage from an XBRL Taxonomy.
+        """Construct a Datapackage from parsed XBRL taxonomies.
+
+        FERC regularly releases new versions of their XBRL taxonomies, meaning
+        data from different years conforms to slightly different structures. This
+        method will attempt to merge these taxonomy versions into a single unified
+        schema defined in a Datapackage descriptor.
+
+        The exact logic for merging taxonomies is as follows. First, the oldest
+        available taxonomy is used to construct a baseline datapackage descriptor.
+        Next, subsequent versions are parsed and their set of tables is compared
+        with the baseline. New tables will be added to the schema, removed tables
+        will simply be logged but remain in the schema, and tables present in both
+        versions will undergo a deeper column-level comparison. For more info on
+        the table comparison, see ``Resource.merge_resources``.

         Args:
-            taxonomy: XBRL taxonomy which defines the structure of the database.
+            taxonomies: Dictionary mapping version labels to the taxonomies to
+                merge into a Datapackage.
             db_uri: Path to database required for a Frictionless resource.
             form_number: FERC form number used for datapackage name.
         """
-        resources = []
-        for role in taxonomy.roles:
-            for period_type in ["duration", "instant"]:
-                resource = Resource.from_link_role(role, period_type, db_uri)
-                if resource:
-                    resources.append(resource)
+        resources = {}
+        logger.info("Attempting to merge taxonomies into a single datapackage.")
+        # Iterate through taxonomies in order of release and attempt to merge
+        for i, (taxonomy_version, taxonomy) in enumerate(sorted(taxonomies.items())):
+            baseline_resources = set(resources.keys())
+            new_resources = set()
+            for role in taxonomy.roles:
+                for period_type in ["duration", "instant"]:
+                    if resource := Resource.from_link_role(role, period_type, db_uri):
+                        new_resources.add(resource.name)
+                        if resource.name not in resources:
+                            # All resources will be new when parsing first taxonomy
+                            if i > 0:
+                                logger.warning(
+                                    f"Resource {resource.name} is new in {taxonomy_version}"
+                                )
+                            # Add new table to schema
+                            resources[resource.name] = resource
+                        else:
+                            # Merge tables in both versions of taxonomy
+                            resources[resource.name] = resources[
+                                resource.name
+                            ].merge_resources(resource, taxonomy_version)
+            if missing_resources := baseline_resources - new_resources:
+                logger.warning(
+                    f"The following resources were removed in {taxonomy_version}: {missing_resources}"
+                )

-        return cls(resources=resources, name=f"ferc{form_number}-extracted-xbrl")
+        return cls(
+            resources=list(resources.values()), name=f"ferc{form_number}-extracted-xbrl"
+        )

     def get_fact_tables(
         self, filter_tables: set[str] | None = None
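Note that the oldest-first ordering here comes from sorted(taxonomies.items()): the version keys used elsewhere in this PR are date-stamped archive names, so lexicographic order coincides with chronological order. A quick illustration of that assumption:

versions = ["form-1-2023-11-01.zip", "form-1-2022-01-01.zip"]
# ISO-style dates embedded in the names make string sorting chronological
assert sorted(versions) == ["form-1-2022-01-01.zip", "form-1-2023-11-01.zip"]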
@@ -439,7 +517,7 @@ def get_fact_tables(

         Args:
             filter_tables: Optionally specify the set of tables to extract.
-            If None, all possible tables will be extracted.
+                If None, all possible tables will be extracted.
         """
         if filter_tables:
             filtered_resources = (r for r in self.resources if r.name in filter_tables)

src/ferc_xbrl_extractor/taxonomy.py
Lines changed: 2 additions & 2 deletions

@@ -262,7 +262,7 @@ def from_source(
     return cls(roles=roles)


-def get_metadata_from_taxonomies(taxonomies: list[Taxonomy]) -> dict:
+def get_metadata_from_taxonomies(taxonomies: dict[str, Taxonomy]) -> dict:
     """Get dictionary of taxonomy metadata.

     XBRL taxonomies contain metadata that can be useful for interpreting reported
@@ -273,7 +273,7 @@ def get_metadata_from_taxonomies(taxonomies: list[Taxonomy]) -> dict:

     duration_metadata = {}
     instant_metadata = {}
-    for taxonomy in taxonomies:
+    for taxonomy in taxonomies.values():
         # Get metadata for duration tables
         duration_metadata.update(
             {

src/ferc_xbrl_extractor/xbrl.py
Lines changed: 17 additions & 23 deletions

@@ -83,7 +83,7 @@ def extract(

 def table_data_from_instances(
     instance_builders: list[InstanceBuilder],
-    table_defs: dict[str, dict[str, FactTable]],
+    table_defs: dict[str, FactTable],
     batch_size: int | None = None,
     workers: int | None = None,
 ) -> tuple[dict[str, pd.DataFrame], dict[str, list]]:
@@ -200,7 +200,7 @@ def process_instance(
     logger.info(f"Extracting {instance.filing_name}")

     dfs = {}
-    for key, table_def in table_defs[instance.taxonomy_version].items():
+    for key, table_def in table_defs.items():
         dfs[key] = table_def.construct_dataframe(instance)

     return dfs
@@ -213,7 +213,7 @@ def get_fact_tables(
     filter_tables: set[str] | None = None,
     datapackage_path: str | None = None,
     metadata_path: str | None = None,
-) -> dict[str, dict[str, FactTable]]:
+) -> dict[str, FactTable]:
     """Parse taxonomy from URL.

     XBRL defines 'fact tables' that group related facts. These fact
@@ -238,7 +238,7 @@ def get_fact_tables(
     Returns:
         Dictionary mapping table names to structure.
     """
-    taxonomies = []
+    taxonomies = {}
     fact_tables = {}
     metadata = {}
     with ZipFile(taxonomy_source, "r") as taxonomy_archive:
@@ -252,30 +252,24 @@ def get_fact_tables(

             taxonomy_entry_point = f"taxonomy/form{form_number}/{taxonomy_date}/form/form{form_number}/form-{form_number}_{taxonomy_date}.xsd"
             taxonomy = Taxonomy.from_source(f, entry_point=taxonomy_entry_point)
-            taxonomies.append(taxonomy)
+            taxonomies[taxonomy_version] = taxonomy

-            datapackage = Datapackage.from_taxonomy(
-                taxonomy, db_uri, form_number=form_number
-            )
+    datapackage = Datapackage.from_taxonomies(
+        taxonomies, db_uri, form_number=form_number
+    )

-            if datapackage_path:
-                # Verify that datapackage descriptor is valid before outputting
-                report = Package.validate_descriptor(
-                    datapackage.model_dump(by_alias=True)
-                )
+    if datapackage_path:
+        # Verify that datapackage descriptor is valid before outputting
+        report = Package.validate_descriptor(datapackage.model_dump(by_alias=True))

-                if not report.valid:
-                    raise RuntimeError(
-                        f"Generated datapackage is invalid - {report.errors}"
-                    )
+        if not report.valid:
+            raise RuntimeError(f"Generated datapackage is invalid - {report.errors}")

-                # Write to JSON file
-                with Path(datapackage_path).open(mode="w") as f:
-                    f.write(datapackage.model_dump_json(by_alias=True))
+        # Write to JSON file
+        with Path(datapackage_path).open(mode="w") as f:
+            f.write(datapackage.model_dump_json(by_alias=True, indent=2))

-            fact_tables[taxonomy_version] = datapackage.get_fact_tables(
-                filter_tables=filter_tables
-            )
+    fact_tables = datapackage.get_fact_tables(filter_tables=filter_tables)

     # Save taxonomy metadata
     metadata = get_metadata_from_taxonomies(taxonomies)
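The net effect on callers: get_fact_tables() now returns one flat mapping of table name to FactTable rather than a per-taxonomy-version mapping. A minimal sketch of the new consumption pattern, with an illustrative stand-in class (not the package's actual FactTable):

from dataclasses import dataclass


@dataclass
class FakeFactTable:
    """Illustrative stand-in for ferc_xbrl_extractor.datapackage.FactTable."""

    name: str


# Before: table_defs[instance.taxonomy_version][table_name]
# After: a single merged mapping, indexed directly by table name.
table_defs: dict[str, FakeFactTable] = {
    "identification_001_duration": FakeFactTable("identification_001_duration"),
}
for key, table_def in table_defs.items():
    print(f"would call construct_dataframe(instance) on {table_def.name}")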

tests/integration/console_scripts_test.py
Lines changed: 1 addition & 1 deletion

@@ -41,7 +41,7 @@ def test_extract_example_filings(script_runner, tmp_path, test_dir):
     [
         "xbrl_extract",
         str(data_dir / "ferc1-xbrl-2021.zip"),
-        "--db_path",
+        "--db-path",
         str(out_db),
         "--taxonomy",
         str(data_dir / "ferc1-xbrl-taxonomies.zip"),
0 Bytes (binary file not shown)

tests/integration/data_quality_test.py
Lines changed: 4 additions & 14 deletions

@@ -35,7 +35,7 @@ def extracted(metadata_dir, data_dir, request) -> ExtractOutput:


 def test_lost_facts_pct(extracted, request):
-    table_defs_map, table_data, stats = extracted
+    table_defs, table_data, stats = extracted
     total_facts = sum(
         instance_stats["total_facts"] for instance_stats in stats.values()
     )
@@ -65,16 +65,8 @@ def test_lost_facts_pct(extracted, request):
     assert instance_used_ratio > per_filing_threshold and instance_used_ratio <= 1


-def _get_relevant_table_defs(table_defs_map: dict):
-    # Note: this just grabs table_defs from a random version of the taxonomy.
-    # The taxonomy versions are close enough that this works for now, but this
-    # could break tests in the future.
-    return list(table_defs_map.values())[0]
-
-
 def test_publication_time(extracted):
-    table_defs_map, table_data, _stats = extracted
-    table_defs = _get_relevant_table_defs(table_defs_map)
+    table_defs, table_data, _stats = extracted

     for table_name, table in table_defs.items():
         assert (
@@ -86,8 +78,7 @@ def test_publication_time(extracted):


 def test_all_data_has_corresponding_id(extracted):
-    table_defs_map, table_data, _stats = extracted
-    table_defs = _get_relevant_table_defs(table_defs_map)
+    table_defs, table_data, _stats = extracted

     [id_table_name] = [
         name
@@ -109,8 +100,7 @@ def test_all_data_has_corresponding_id(extracted):


 def test_null_values(extracted):
-    table_defs_map, table_data, _stats = extracted
-    table_defs = _get_relevant_table_defs(table_defs_map)
+    table_defs, table_data, _stats = extracted

     for table_name, table in table_defs.items():
         dataframe = table_data[table_name]

tests/integration/datapackage_test.py
Lines changed: 28 additions & 17 deletions

@@ -23,28 +23,39 @@

 def test_datapackage_generation(test_dir, data_dir):
     """Test that datapackage descriptor is valid."""
-    with (
-        zipfile.ZipFile(data_dir / "ferc1-xbrl-taxonomies.zip") as archive,
-        archive.open("form-1-2022-01-01.zip", mode="r") as f,
-    ):
-        taxonomy = Taxonomy.from_source(
-            f,
-            entry_point=Path(
-                "taxonomy/form1/2022-01-01/form/form1/form-1_2022-01-01.xsd"
-            ),
-        )
-    datapackage = Datapackage.from_taxonomy(taxonomy, "sqlite:///test_db.sqlite")
-
-    filtered_tables = datapackage.get_fact_tables(
-        filter_tables={"identification_001_duration"}
-    )
-    assert set(filtered_tables.keys()) == {"identification_001_duration"}
+    taxonomies = {}
+    for version, entry_point in [
+        (
+            "form-1-2022-01-01.zip",
+            "taxonomy/form1/2022-01-01/form/form1/form-1_2022-01-01.xsd",
+        ),
+        (
+            "form-1-2023-11-01.zip",
+            "taxonomy/form1/2023-11-01/form/form1/form-1_2023-11-01.xsd",
+        ),
+    ]:
+        with (
+            zipfile.ZipFile(data_dir / "ferc1-xbrl-taxonomies.zip") as archive,
+            archive.open(version, mode="r") as f,
+        ):
+            taxonomies[version] = Taxonomy.from_source(
+                f,
+                entry_point=Path(entry_point),
+            )
+    datapackage = Datapackage.from_taxonomies(taxonomies, "sqlite:///test_db.sqlite")
+
+    filter_tables = {
+        "identification_001_duration",
+        "energy_storage_operations_small_plants_419_duration",
+    }
+    filtered_tables = datapackage.get_fact_tables(filter_tables=filter_tables)
+    assert set(filtered_tables.keys()) == filter_tables

     all_tables = datapackage.get_fact_tables()

     # 366 was just the value we had - this assertion is more of a regression
     # test than a normative statement
-    assert len(all_tables) == 366
+    assert len(all_tables) == 370

     assert Package.validate_descriptor(datapackage.model_dump(by_alias=True))