
Commit 7e74f19

Filter empty DataFrames or Nones when merging and remove duplicate columns when mapping
1 parent af8aa3a commit 7e74f19

2 files changed: +55 -55 lines changed


src/bedrock_ge/gi/db_operations.py

Lines changed: 35 additions & 37 deletions
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 from collections.abc import Iterable
 
 import pandas as pd
@@ -23,48 +25,26 @@ def merge_databases(
     concatenated dictionary.
 
     Args:
-        target_db (BedrockGIDatabase): The Bedrock GI database into which the incoming data will be merged.
-        incoming_db (BedrockGIDatabase): The Bedrock GI database containing the data to be merged.
+        brgi_databases (Iterable[BedrockGIDatabase]): The Bedrock GI databases containing the data to be merged.
 
     Returns:
         BedrockGIDatabase: Merged Bedrock GI database.
     """
-    # write the body of this function that merges the incoming_db (BedrockGIDatabase) into the target_db (BedrockGIDatabase).
-    # duplicate rows in the incoming_db (BedrockGIDatabase) will be dropped.
-    # After merging tables validate them with the schemas from bedrock_ge.gi.schemas and check that foreign keys are correct.
-    # In case the incoming_db contains tables that are not in the target_db, add them to the target_db.
-    # The function must return a BedrockGIDatabase object.
-
-    # merged_project = pd.concat(
-    #     [target_db.Project, incoming_db.Project], ignore_index=True
-    # )
-    # ProjectSchema.validate(merged_project)
-
-    # merged_location = pd.concat(
-    #     [target_db.Location, incoming_db.Location], ignore_index=True
-    # )
-    # LocationSchema.validate(merged_location)
-    # check_foreign_key("project_uid", merged_project, merged_location)
-
-    # merged_insitu = {}
-
-    # Draw inspiration from polars.concat
-    # https://github.com/pola-rs/polars/blob/py-1.30.0/py-polars/polars/functions/eager.py
-
     dbs = list(brgi_databases)
 
     if not dbs:
-        msg = "Cannot merge an empty list of Bedrock GI databases."
-        raise ValueError(msg)
+        raise ValueError("Cannot merge an empty list of Bedrock GI databases.")
     elif len(dbs) == 1 and isinstance(dbs[0], BedrockGIDatabase):
         return dbs[0]
 
-    merged_project = pd.concat([db.Project for db in dbs], ignore_index=True)
-    merged_project.drop_duplicates(inplace=True)
+    project_dataframes = _filter_dataframes([db.Project for db in dbs])
+    merged_project = pd.concat(project_dataframes, ignore_index=True)
+    merged_project = merged_project.drop_duplicates().reset_index(drop=True)
     ProjectSchema.validate(merged_project)
 
-    merged_location = pd.concat([db.Location for db in dbs], ignore_index=True)
-    merged_location.drop_duplicates(inplace=True)
+    location_dataframes = _filter_dataframes([db.Location for db in dbs])
+    merged_location = pd.concat(location_dataframes, ignore_index=True)
+    merged_location = merged_location.drop_duplicates().reset_index(drop=True)
     LocationSchema.validate(merged_location)
     check_foreign_key("project_uid", merged_project, merged_location)
 
@@ -80,19 +60,21 @@ def merge_databases(
 
     merged_insitu: dict[str, pd.DataFrame] = {}
     for insitu_table in insitu_tables:
-        insitu_df = pd.concat(
-            [db.InSituTests.get(insitu_table) for db in dbs], ignore_index=True
+        insitu_dataframes = _filter_dataframes(
+            [db.InSituTests.get(insitu_table) for db in dbs]
         )
-        insitu_df.drop_duplicates(inplace=True)
+        insitu_df = pd.concat(insitu_dataframes, ignore_index=True)
+        insitu_df = insitu_df.drop_duplicates().reset_index(drop=True)
         InSituTestSchema.validate(insitu_df)
         check_foreign_key("project_uid", merged_project, insitu_df)
         check_foreign_key("location_uid", merged_location, insitu_df)
         merged_insitu[insitu_table] = insitu_df
 
-    sample_dfs = [db.Sample for db in dbs if db.Sample is not None]
+    sample_dfs = _filter_dataframes([db.Sample for db in dbs])
+    merged_sample = None
     if sample_dfs:
         merged_sample = pd.concat(sample_dfs, ignore_index=True)
-        merged_sample.drop_duplicates(inplace=True)
+        merged_sample = merged_sample.drop_duplicates().reset_index(drop=True)
         SampleSchema.validate(merged_sample)
         check_foreign_key("project_uid", merged_project, merged_sample)
 
@@ -104,7 +86,7 @@ def merge_databases(
             if db.LabTests.get(lab_table) is not None
         ]
         lab_df = pd.concat(lab_dfs, ignore_index=True)
-        lab_df.drop_duplicates(inplace=True)
+        lab_df = lab_df.drop_duplicates().reset_index(drop=True)
         check_foreign_key("project_uid", merged_project, lab_df)
         check_foreign_key("sample_uid", merged_sample, lab_df)
         merged_lab[lab_table] = lab_df
@@ -117,7 +99,7 @@ def merge_databases(
             if db.Other.get(other_table) is not None
         ]
         other_df = pd.concat(other_dfs, ignore_index=True)
-        other_df.drop_duplicates(inplace=True)
+        other_df = other_df.drop_duplicates().reset_index(drop=True)
         check_foreign_key("project_uid", merged_project, other_df)
         merged_other[other_table] = other_df
 
@@ -129,3 +111,19 @@
         LabTests=merged_lab,
         Other=merged_other,
     )
+
+
+def _filter_dataframes(dataframes: list[pd.DataFrame | None]) -> list[pd.DataFrame]:
+    """Filter out empty or all-NA DataFrames to avoid FutureWarnings."""
+    valid_dfs = []
+    for df in dataframes:
+        if df is not None and not df.empty and not df.isna().all().all():
+            if df.columns.duplicated().any():
+                raise ValueError(
+                    f"Duplicate column names found in dataframe:\n{list(df.columns)}"
+                )
+
+            df.dropna(axis=1, how="all", inplace=True)
+
+            valid_dfs.append(df)
+    return valid_dfs
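For context on the new _filter_dataframes helper: recent pandas versions emit a FutureWarning when pd.concat receives empty or all-NA entries, which is what filtering the inputs first avoids. Below is a minimal standalone sketch of that behavior; the example data and the direct import of the private helper are illustrative assumptions, not part of this commit.

import pandas as pd

from bedrock_ge.gi.db_operations import _filter_dataframes  # private helper added in this commit

# Three candidate Project tables: one populated, one empty, one all-NA.
# (Example values are made up for illustration.)
populated = pd.DataFrame({"project_uid": ["P1-abc"], "project_source_id": ["P1"]})
empty = pd.DataFrame(columns=["project_uid", "project_source_id"])
all_na = pd.DataFrame({"project_uid": [None], "project_source_id": [None]})

# Filtering first keeps only the populated frame, so pd.concat never sees the
# empty / all-NA / None inputs that would trigger pandas' FutureWarning.
dfs = _filter_dataframes([populated, empty, all_na, None])
merged = pd.concat(dfs, ignore_index=True).drop_duplicates().reset_index(drop=True)
print(merged)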

src/bedrock_ge/gi/mapper.py

Lines changed: 20 additions & 18 deletions
@@ -42,7 +42,7 @@ def map_to_brgi_db(brgi_db_mapping: BedrockGIMapping) -> BedrockGIDatabase:
     project_uid = brgi_db_mapping.Project.project_id + "-" + project_data_b64_hash
 
     # Create the project table
-    brgi_project = pd.DataFrame(
+    project_df = pd.DataFrame(
         {
             "project_uid": project_uid,
             "project_source_id": brgi_db_mapping.Project.project_id,
@@ -54,7 +54,8 @@ def map_to_brgi_db(brgi_db_mapping: BedrockGIMapping) -> BedrockGIDatabase:
         },
         index=[0],
     )
-    ProjectSchema.validate(brgi_project)
+    project_df = project_df.loc[:, ~project_df.columns.duplicated()]
+    ProjectSchema.validate(project_df)
 
     # Create the location table
     location_df = pd.DataFrame(
@@ -82,6 +83,7 @@ def map_to_brgi_db(brgi_db_mapping: BedrockGIMapping) -> BedrockGIDatabase:
         }
     )
     location_df = pd.concat([location_df, brgi_db_mapping.Location.data], axis=1)
+    location_df = location_df.loc[:, ~location_df.columns.duplicated()]
     LocationSchema.validate(location_df)
 
     # Create the in-situ test tables
@@ -100,6 +102,7 @@ def map_to_brgi_db(brgi_db_mapping: BedrockGIMapping) -> BedrockGIDatabase:
             insitu_mapping.depth_to_base_column
         ]
         insitu_df = pd.concat([insitu_df, insitu_mapping.data], axis=1)
+        insitu_df = insitu_df.loc[:, ~insitu_df.columns.duplicated()]
         InSituTestSchema.validate(insitu_df)
         insitu_tests[insitu_mapping.table_name] = insitu_df.copy()
 
@@ -129,10 +132,11 @@ def map_to_brgi_db(brgi_db_mapping: BedrockGIMapping) -> BedrockGIDatabase:
             brgi_db_mapping.Sample.depth_to_top_column
         ]
         sample_df = pd.concat([sample_df, brgi_db_mapping.Sample.data], axis=1)
+        sample_df = sample_df.loc[:, ~sample_df.columns.duplicated()]
         SampleSchema.validate(sample_df)
 
     # Create the lab test tables
-    brgi_lab_tests = {}
+    lab_tests = {}
     if brgi_db_mapping.Lab:
         for lab_mapping in brgi_db_mapping.Lab:
             lab_df = pd.DataFrame(
@@ -146,26 +150,24 @@ def map_to_brgi_db(brgi_db_mapping: BedrockGIMapping) -> BedrockGIDatabase:
             lab_df["location_uid"] = lab_mapping.data[
                 lab_mapping.location_id_column
             ]
-            lab_df = pd.concat(
-                [lab_df, lab_mapping.data.copy()],
-                axis=1,
-            )
+            lab_df = pd.concat([lab_df, lab_mapping.data.copy()], axis=1)
             LabTestSchema.validate(lab_df)
-            brgi_lab_tests[lab_mapping.table_name] = lab_df.copy()
+            lab_tests[lab_mapping.table_name] = lab_df.copy()
+
+    # Create the other tables
+    other_tables = {}
+    if brgi_db_mapping.Other:
+        for other_table_mapping in brgi_db_mapping.Other:
+            other_table_df = other_table_mapping.data
+            other_table_df.insert(0, "project_uid", project_uid)
+            other_tables[other_table_mapping.table_name] = other_table_df
 
     # Create and return the Bedrock GI database
    return BedrockGIDatabase(
-        Project=brgi_project,
+        Project=project_df,
         Location=location_df,
         InSituTests=insitu_tests,
         Sample=sample_df if brgi_db_mapping.Sample else None,
-        LabTests=brgi_lab_tests,
-        Other=(
-            {
-                other_table.table_name: pd.DataFrame(other_table.data)
-                for other_table in brgi_db_mapping.Other
-            }
-            if brgi_db_mapping.Other
-            else {}
-        ),
+        LabTests=lab_tests,
+        Other=other_tables,
     )
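The de-duplication pattern repeated throughout mapper.py, df.loc[:, ~df.columns.duplicated()], keeps the first occurrence of each column label after an axis-1 concat. A small standalone sketch; the column names below are illustrative, not taken from the mapping models.

import pandas as pd

# A freshly built table that already carries the columns the mapper creates ...
base = pd.DataFrame({"project_uid": ["P1-abc"], "location_source_id": ["BH-1"]})
# ... concatenated column-wise with mapping data that repeats one of them.
mapping_data = pd.DataFrame({"location_source_id": ["BH-1"], "ground_level": [4.2]})

combined = pd.concat([base, mapping_data], axis=1)
# columns are now: project_uid, location_source_id, location_source_id, ground_level

# ~combined.columns.duplicated() masks out second and later occurrences of a
# label, so only the first column with each name survives.
deduped = combined.loc[:, ~combined.columns.duplicated()]
print(list(deduped.columns))  # ['project_uid', 'location_source_id', 'ground_level']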
