1+ from __future__ import annotations
2+
13from collections .abc import Iterable
24
35import pandas as pd
@@ -23,48 +25,26 @@ def merge_databases(
2325 concatenated dictionary.
2426
2527 Args:
26- target_db (BedrockGIDatabase): The Bedrock GI database into which the incoming data will be merged.
27- incoming_db (BedrockGIDatabase): The Bedrock GI database containing the data to be merged.
28+ brgi_databases (Iterable[BedrockGIDatabase]): The Bedrock GI databases containing the data to be merged.
2829
2930 Returns:
3031 BedrockGIDatabase: Merged Bedrock GI database.
3132 """
32- # write the body of this function that merges the incoming_db (BedrockGIDatabase) into the target_db (BedrockGIDatabase).
33- # duplicate rows in the incoming_db (BedrockGIDatabase) will be dropped.
34- # After merging tables validate them with the schemas from bedrock_ge.gi.schemas and check that foreign keys are correct.
35- # In case the incoming_db contains tables that are not in the target_db, add them to the target_db.
36- # The function must return a BedrockGIDatabase object.
37-
38- # merged_project = pd.concat(
39- # [target_db.Project, incoming_db.Project], ignore_index=True
40- # )
41- # ProjectSchema.validate(merged_project)
42-
43- # merged_location = pd.concat(
44- # [target_db.Location, incoming_db.Location], ignore_index=True
45- # )
46- # LocationSchema.validate(merged_location)
47- # check_foreign_key("project_uid", merged_project, merged_location)
48-
49- # merged_insitu = {}
50-
51- # Draw inspiration from polars.concat
52- # https://github.com/pola-rs/polars/blob/py-1.30.0/py-polars/polars/functions/eager.py
53-
5433 dbs = list (brgi_databases )
5534
5635 if not dbs :
57- msg = "Cannot merge an empty list of Bedrock GI databases."
58- raise ValueError (msg )
36+ raise ValueError ("Cannot merge an empty list of Bedrock GI databases." )
5937 elif len (dbs ) == 1 and isinstance (dbs [0 ], BedrockGIDatabase ):
6038 return dbs [0 ]
6139
62- merged_project = pd .concat ([db .Project for db in dbs ], ignore_index = True )
63- merged_project .drop_duplicates (inplace = True )
40+ project_dataframes = _filter_dataframes ([db .Project for db in dbs ])
41+ merged_project = pd .concat (project_dataframes , ignore_index = True )
42+ merged_project = merged_project .drop_duplicates ().reset_index (drop = True )
6443 ProjectSchema .validate (merged_project )
6544
66- merged_location = pd .concat ([db .Location for db in dbs ], ignore_index = True )
67- merged_location .drop_duplicates (inplace = True )
45+ location_dataframes = _filter_dataframes ([db .Location for db in dbs ])
46+ merged_location = pd .concat (location_dataframes , ignore_index = True )
47+ merged_location = merged_location .drop_duplicates ().reset_index (drop = True )
6848 LocationSchema .validate (merged_location )
6949 check_foreign_key ("project_uid" , merged_project , merged_location )
7050
@@ -80,19 +60,21 @@ def merge_databases(
8060
8161 merged_insitu : dict [str , pd .DataFrame ] = {}
8262 for insitu_table in insitu_tables :
83- insitu_df = pd . concat (
84- [db .InSituTests .get (insitu_table ) for db in dbs ], ignore_index = True
63+ insitu_dataframes = _filter_dataframes (
64+ [db .InSituTests .get (insitu_table ) for db in dbs ]
8565 )
86- insitu_df .drop_duplicates (inplace = True )
66+ insitu_df = pd .concat (insitu_dataframes , ignore_index = True )
67+ insitu_df = insitu_df .drop_duplicates ().reset_index (drop = True )
8768 InSituTestSchema .validate (insitu_df )
8869 check_foreign_key ("project_uid" , merged_project , insitu_df )
8970 check_foreign_key ("location_uid" , merged_location , insitu_df )
9071 merged_insitu [insitu_table ] = insitu_df
9172
92- sample_dfs = [db .Sample for db in dbs if db .Sample is not None ]
73+ sample_dfs = _filter_dataframes ([db .Sample for db in dbs ])
74+ merged_sample = None
9375 if sample_dfs :
9476 merged_sample = pd .concat (sample_dfs , ignore_index = True )
95- merged_sample .drop_duplicates (inplace = True )
77+ merged_sample = merged_sample .drop_duplicates (). reset_index ( drop = True )
9678 SampleSchema .validate (merged_sample )
9779 check_foreign_key ("project_uid" , merged_project , merged_sample )
9880
@@ -104,7 +86,7 @@ def merge_databases(
10486 if db .LabTests .get (lab_table ) is not None
10587 ]
10688 lab_df = pd .concat (lab_dfs , ignore_index = True )
107- lab_df .drop_duplicates (inplace = True )
89+ lab_df = lab_df .drop_duplicates (). reset_index ( drop = True )
10890 check_foreign_key ("project_uid" , merged_project , lab_df )
10991 check_foreign_key ("sample_uid" , merged_sample , lab_df )
11092 merged_lab [lab_table ] = lab_df
@@ -117,7 +99,7 @@ def merge_databases(
11799 if db .Other .get (other_table ) is not None
118100 ]
119101 other_df = pd .concat (other_dfs , ignore_index = True )
120- other_df .drop_duplicates (inplace = True )
102+ other_df = other_df .drop_duplicates (). reset_index ( drop = True )
121103 check_foreign_key ("project_uid" , merged_project , other_df )
122104 merged_other [other_table ] = other_df
123105
@@ -129,3 +111,19 @@ def merge_databases(
129111 LabTests = merged_lab ,
130112 Other = merged_other ,
131113 )
114+
115+
def _filter_dataframes(dataframes: list[pd.DataFrame | None]) -> list[pd.DataFrame]:
    """Select the DataFrames worth concatenating and strip their all-NA columns.

    Entries that are ``None``, empty, or contain only NA values are skipped,
    because passing such frames to ``pd.concat`` triggers FutureWarnings.
    Columns that are entirely NA are removed from the frames that are kept.

    The input DataFrames are never modified: each returned frame is a new
    object, so the tables of the databases being merged stay intact.

    Args:
        dataframes (list[pd.DataFrame | None]): Candidate DataFrames; ``None``
            entries are allowed and ignored.

    Returns:
        list[pd.DataFrame]: The usable DataFrames, in their original order,
            each without its all-NA columns.

    Raises:
        ValueError: If a usable DataFrame has duplicate column names.
    """
    valid_dfs: list[pd.DataFrame] = []
    for df in dataframes:
        # Skip anything that would contribute no data to the concatenation.
        if df is None or df.empty or df.isna().all().all():
            continue

        if df.columns.duplicated().any():
            raise ValueError(
                f"Duplicate column names found in dataframe:\n{list(df.columns)}"
            )

        # Non-inplace dropna: an inplace drop would silently remove columns
        # from the caller's DataFrame (i.e. from the input database tables).
        valid_dfs.append(df.dropna(axis=1, how="all"))
    return valid_dfs
0 commit comments