1313from ferc_xbrl_extractor .instance import Instance
1414from ferc_xbrl_extractor .taxonomy import Concept , LinkRole , Taxonomy
1515
16+ logger = get_logger (__name__ )
17+
1618
1719class Field (BaseModel ):
1820 """A generic field descriptor, as per Frictionless Data specs.
@@ -332,6 +334,47 @@ def get_period_type(self):
332334 period_type = "instant" if "date" in self .schema_ .primary_key else "duration"
333335 return period_type
334336
def merge_resources(self, other: "Resource", other_version: str) -> "Resource":
    """Merge the definitions of one resource taken from two taxonomy versions.

    The primary keys of the two definitions must be exactly equal. For the
    remaining columns, fields that exist only in ``other`` are appended to
    the merged schema, and fields that exist only in ``self`` are kept. Both
    kinds of difference are reported with a warning.

    Args:
        other: Definition of this resource from another taxonomy version.
        other_version: Label of the taxonomy version ``other`` came from.
            It is only used in log messages.

    Returns:
        A copy of this resource whose schema contains the fields of ``self``
        followed by any fields found only in ``other``. The schema of
        ``self`` is left untouched.

    Raises:
        RuntimeError: If the primary keys of the two definitions differ.
    """
    if self.schema_.primary_key != other.schema_.primary_key:
        raise RuntimeError(
            f"Can't merge resource {self.name} when versions have incompatible schemas"
        )
    original_fields = {field.name for field in self.schema_.fields}
    other_fields = {field.name for field in other.schema_.fields}

    if missing_fields := original_fields - other_fields:
        logger.warning(
            f"The following fields were removed from table {self.name} "
            f"in taxonomy version {other_version}: {missing_fields}"
        )

    # Work on a copy: extending ``self.schema_.fields`` directly would
    # silently modify this resource (and anything sharing its schema).
    fields = list(self.schema_.fields)
    if new_fields := other_fields - original_fields:
        logger.warning(
            f"The following fields were added to table {self.name} "
            f"in taxonomy version {other_version}: {new_fields}"
        )
        # Add new fields to schema, preserving their order in ``other``
        fields.extend(
            field for field in other.schema_.fields if field.name in new_fields
        )

    # ``model_copy`` applies ``update`` by attribute name without validation,
    # so the key must be the attribute used throughout this class
    # (``schema_``) for the merged schema to actually replace the old one.
    return self.model_copy(
        update={
            "schema_": Schema(primary_key=self.schema_.primary_key, fields=fields)
        }
    )
377+
335378
336379class FactTable :
337380 """Class to handle constructing a dataframe from an XBRL fact table.
@@ -355,7 +398,6 @@ def __init__(self, schema: Schema, period_type: str):
355398 if field .name not in schema .primary_key
356399 ]
357400 self .instant = period_type == "instant"
358- self .logger = get_logger (__name__ )
359401
360402 def construct_dataframe (self , instance : Instance ) -> pd .DataFrame :
361403 """Construct dataframe from a parsed XBRL instance.
@@ -413,24 +455,60 @@ class Datapackage(BaseModel):
413455 resources : list [Resource ]
414456
@classmethod
def from_taxonomies(
    cls, taxonomies: dict[str, Taxonomy], db_uri: str, form_number: int = 1
) -> "Datapackage":
    """Build a single Datapackage descriptor from several XBRL taxonomies.

    Taxonomy versions can differ slightly in the tables and columns they
    define. This method folds all supplied versions into one unified schema.

    Versions are visited in sorted order of their keys, which is treated as
    release order. For every table produced by a version:

    * a table not seen before is added to the schema (and logged, unless it
      comes from the first version visited);
    * a table already in the schema is merged column by column, see
      ``Resource.merge_resources``.

    Tables that were known before a version was visited but are absent from
    it are logged and kept in the schema.

    Args:
        taxonomies: Mapping of taxonomy version to parsed taxonomy.
        db_uri: Path to database required for a Frictionless resource.
        form_number: FERC form number used for datapackage name.
    """
    merged = {}
    logger.info("Attempting to merge taxonomies into a single datapackage.")

    for position, (version, taxonomy) in enumerate(sorted(taxonomies.items())):
        # Snapshot of table names known before this version is processed
        known_before = set(merged)
        seen_now = set()

        for role in taxonomy.roles:
            for period_type in ("duration", "instant"):
                table = Resource.from_link_role(role, period_type, db_uri)
                if not table:
                    continue

                seen_now.add(table.name)

                if table.name in merged:
                    # Present in an earlier version too: reconcile columns
                    merged[table.name] = merged[table.name].merge_resources(
                        table, version
                    )
                    continue

                # Everything is new for the first version, so only report
                # additions from later versions
                if position > 0:
                    logger.warning(f"Resource {table.name} is new in {version}")
                merged[table.name] = table

        dropped = known_before - seen_now
        if dropped:
            logger.warning(
                f"The following resources were removed in {version}: {dropped}"
            )

    package_name = f"ferc{form_number}-extracted-xbrl"
    return cls(resources=list(merged.values()), name=package_name)
434512
435513 def get_fact_tables (
436514 self , filter_tables : set [str ] | None = None
@@ -439,7 +517,7 @@ def get_fact_tables(
439517
440518 Args:
441519 filter_tables: Optionally specify the set of tables to extract.
442- If None, all possible tables will be extracted.
520+ If None, all possible tables will be extracted.
443521 """
444522 if filter_tables :
445523 filtered_resources = (r for r in self .resources if r .name in filter_tables )
0 commit comments