
Commit 7e74f19

Filter empty DataFrames or Nones when merging and remove duplicate columns when mapping
1 parent af8aa3a commit 7e74f19

2 files changed: +55 -55 lines changed


src/bedrock_ge/gi/db_operations.py

Lines changed: 35 additions & 37 deletions
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 from collections.abc import Iterable
 
 import pandas as pd
@@ -23,48 +25,26 @@ def merge_databases(
     concatenated dictionary.
 
     Args:
-        target_db (BedrockGIDatabase): The Bedrock GI database into which the incoming data will be merged.
-        incoming_db (BedrockGIDatabase): The Bedrock GI database containing the data to be merged.
+        brgi_databases (Iterable[BedrockGIDatabase]): The Bedrock GI databases containing the data to be merged.
 
     Returns:
         BedrockGIDatabase: Merged Bedrock GI database.
     """
-    # write the body of this function that merges the incoming_db (BedrockGIDatabase) into the target_db (BedrockGIDatabase).
-    # duplicate rows in the incoming_db (BedrockGIDatabase) will be dropped.
-    # After merging tables validate them with the schemas from bedrock_ge.gi.schemas and check that foreign keys are correct.
-    # In case the incoming_db contains tables that are not in the target_db, add them to the target_db.
-    # The function must return a BedrockGIDatabase object.
-
-    # merged_project = pd.concat(
-    #     [target_db.Project, incoming_db.Project], ignore_index=True
-    # )
-    # ProjectSchema.validate(merged_project)
-
-    # merged_location = pd.concat(
-    #     [target_db.Location, incoming_db.Location], ignore_index=True
-    # )
-    # LocationSchema.validate(merged_location)
-    # check_foreign_key("project_uid", merged_project, merged_location)
-
-    # merged_insitu = {}
-
-    # Draw inspiration from polars.concat
-    # https://github.com/pola-rs/polars/blob/py-1.30.0/py-polars/polars/functions/eager.py
-
     dbs = list(brgi_databases)
 
     if not dbs:
-        msg = "Cannot merge an empty list of Bedrock GI databases."
-        raise ValueError(msg)
+        raise ValueError("Cannot merge an empty list of Bedrock GI databases.")
     elif len(dbs) == 1 and isinstance(dbs[0], BedrockGIDatabase):
         return dbs[0]
 
-    merged_project = pd.concat([db.Project for db in dbs], ignore_index=True)
-    merged_project.drop_duplicates(inplace=True)
+    project_dataframes = _filter_dataframes([db.Project for db in dbs])
+    merged_project = pd.concat(project_dataframes, ignore_index=True)
+    merged_project = merged_project.drop_duplicates().reset_index(drop=True)
     ProjectSchema.validate(merged_project)
 
-    merged_location = pd.concat([db.Location for db in dbs], ignore_index=True)
-    merged_location.drop_duplicates(inplace=True)
+    location_dataframes = _filter_dataframes([db.Location for db in dbs])
+    merged_location = pd.concat(location_dataframes, ignore_index=True)
+    merged_location = merged_location.drop_duplicates().reset_index(drop=True)
     LocationSchema.validate(merged_location)
     check_foreign_key("project_uid", merged_project, merged_location)
 
@@ -80,19 +60,21 @@ def merge_databases(
 
     merged_insitu: dict[str, pd.DataFrame] = {}
     for insitu_table in insitu_tables:
-        insitu_df = pd.concat(
-            [db.InSituTests.get(insitu_table) for db in dbs], ignore_index=True
+        insitu_dataframes = _filter_dataframes(
+            [db.InSituTests.get(insitu_table) for db in dbs]
         )
-        insitu_df.drop_duplicates(inplace=True)
+        insitu_df = pd.concat(insitu_dataframes, ignore_index=True)
+        insitu_df = insitu_df.drop_duplicates().reset_index(drop=True)
         InSituTestSchema.validate(insitu_df)
         check_foreign_key("project_uid", merged_project, insitu_df)
         check_foreign_key("location_uid", merged_location, insitu_df)
         merged_insitu[insitu_table] = insitu_df
 
-    sample_dfs = [db.Sample for db in dbs if db.Sample is not None]
+    sample_dfs = _filter_dataframes([db.Sample for db in dbs])
+    merged_sample = None
     if sample_dfs:
         merged_sample = pd.concat(sample_dfs, ignore_index=True)
-        merged_sample.drop_duplicates(inplace=True)
+        merged_sample = merged_sample.drop_duplicates().reset_index(drop=True)
         SampleSchema.validate(merged_sample)
         check_foreign_key("project_uid", merged_project, merged_sample)
 
@@ -104,7 +86,7 @@ def merge_databases(
             if db.LabTests.get(lab_table) is not None
         ]
         lab_df = pd.concat(lab_dfs, ignore_index=True)
-        lab_df.drop_duplicates(inplace=True)
+        lab_df = lab_df.drop_duplicates().reset_index(drop=True)
         check_foreign_key("project_uid", merged_project, lab_df)
         check_foreign_key("sample_uid", merged_sample, lab_df)
         merged_lab[lab_table] = lab_df
@@ -117,7 +99,7 @@ def merge_databases(
             if db.Other.get(other_table) is not None
         ]
         other_df = pd.concat(other_dfs, ignore_index=True)
-        other_df.drop_duplicates(inplace=True)
+        other_df = other_df.drop_duplicates().reset_index(drop=True)
         check_foreign_key("project_uid", merged_project, other_df)
         merged_other[other_table] = other_df
 
@@ -129,3 +111,19 @@
         LabTests=merged_lab,
         Other=merged_other,
     )
+
+
+def _filter_dataframes(dataframes: list[pd.DataFrame | None]) -> list[pd.DataFrame]:
+    """Filter out empty or all-NA DataFrames to avoid FutureWarnings."""
+    valid_dfs = []
+    for df in dataframes:
+        if df is not None and not df.empty and not df.isna().all().all():
+            if df.columns.duplicated().any():
+                raise ValueError(
+                    f"Duplicate column names found in dataframe:\n{list(df.columns)}"
+                )
+
+            df.dropna(axis=1, how="all", inplace=True)
+
+            valid_dfs.append(df)
+    return valid_dfs
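For context on the new _filter_dataframes helper: recent pandas versions emit a FutureWarning when pd.concat receives empty or all-NA entries, which is what filtering the inputs first avoids. Below is a minimal standalone sketch of that behavior; the example data and the direct import of the private helper are illustrative assumptions, not part of this commit.

import pandas as pd

from bedrock_ge.gi.db_operations import _filter_dataframes  # private helper added in this commit

# Three candidate Project tables: one populated, one empty, one all-NA.
# (Example values are made up for illustration.)
populated = pd.DataFrame({"project_uid": ["P1-abc"], "project_source_id": ["P1"]})
empty = pd.DataFrame(columns=["project_uid", "project_source_id"])
all_na = pd.DataFrame({"project_uid": [None], "project_source_id": [None]})

# Filtering first keeps only the populated frame, so pd.concat never sees the
# empty / all-NA / None inputs that would trigger pandas' FutureWarning.
dfs = _filter_dataframes([populated, empty, all_na, None])
merged = pd.concat(dfs, ignore_index=True).drop_duplicates().reset_index(drop=True)
print(merged)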

src/bedrock_ge/gi/mapper.py

Lines changed: 20 additions & 18 deletions
@@ -42,7 +42,7 @@ def map_to_brgi_db(brgi_db_mapping: BedrockGIMapping) -> BedrockGIDatabase:
     project_uid = brgi_db_mapping.Project.project_id + "-" + project_data_b64_hash
 
     # Create the project table
-    brgi_project = pd.DataFrame(
+    project_df = pd.DataFrame(
         {
             "project_uid": project_uid,
             "project_source_id": brgi_db_mapping.Project.project_id,
@@ -54,7 +54,8 @@ def map_to_brgi_db(brgi_db_mapping: BedrockGIMapping) -> BedrockGIDatabase:
         },
         index=[0],
     )
-    ProjectSchema.validate(brgi_project)
+    project_df = project_df.loc[:, ~project_df.columns.duplicated()]
+    ProjectSchema.validate(project_df)
 
     # Create the location table
     location_df = pd.DataFrame(
@@ -82,6 +83,7 @@ def map_to_brgi_db(brgi_db_mapping: BedrockGIMapping) -> BedrockGIDatabase:
         }
     )
     location_df = pd.concat([location_df, brgi_db_mapping.Location.data], axis=1)
+    location_df = location_df.loc[:, ~location_df.columns.duplicated()]
     LocationSchema.validate(location_df)
 
     # Create the in-situ test tables
@@ -100,6 +102,7 @@ def map_to_brgi_db(brgi_db_mapping: BedrockGIMapping) -> BedrockGIDatabase:
             insitu_mapping.depth_to_base_column
         ]
         insitu_df = pd.concat([insitu_df, insitu_mapping.data], axis=1)
+        insitu_df = insitu_df.loc[:, ~insitu_df.columns.duplicated()]
         InSituTestSchema.validate(insitu_df)
         insitu_tests[insitu_mapping.table_name] = insitu_df.copy()
 
@@ -129,10 +132,11 @@ def map_to_brgi_db(brgi_db_mapping: BedrockGIMapping) -> BedrockGIDatabase:
             brgi_db_mapping.Sample.depth_to_top_column
         ]
         sample_df = pd.concat([sample_df, brgi_db_mapping.Sample.data], axis=1)
+        sample_df = sample_df.loc[:, ~sample_df.columns.duplicated()]
         SampleSchema.validate(sample_df)
 
     # Create the lab test tables
-    brgi_lab_tests = {}
+    lab_tests = {}
     if brgi_db_mapping.Lab:
         for lab_mapping in brgi_db_mapping.Lab:
             lab_df = pd.DataFrame(
@@ -146,26 +150,24 @@ def map_to_brgi_db(brgi_db_mapping: BedrockGIMapping) -> BedrockGIDatabase:
             lab_df["location_uid"] = lab_mapping.data[
                 lab_mapping.location_id_column
             ]
-            lab_df = pd.concat(
-                [lab_df, lab_mapping.data.copy()],
-                axis=1,
-            )
+            lab_df = pd.concat([lab_df, lab_mapping.data.copy()], axis=1)
             LabTestSchema.validate(lab_df)
-            brgi_lab_tests[lab_mapping.table_name] = lab_df.copy()
+            lab_tests[lab_mapping.table_name] = lab_df.copy()
+
+    # Create the other tables
+    other_tables = {}
+    if brgi_db_mapping.Other:
+        for other_table_mapping in brgi_db_mapping.Other:
+            other_table_df = other_table_mapping.data
+            other_table_df.insert(0, "project_uid", project_uid)
+            other_tables[other_table_mapping.table_name] = other_table_df
 
     # Create and return the Bedrock GI database
    return BedrockGIDatabase(
-        Project=brgi_project,
+        Project=project_df,
         Location=location_df,
         InSituTests=insitu_tests,
         Sample=sample_df if brgi_db_mapping.Sample else None,
-        LabTests=brgi_lab_tests,
-        Other=(
-            {
-                other_table.table_name: pd.DataFrame(other_table.data)
-                for other_table in brgi_db_mapping.Other
-            }
-            if brgi_db_mapping.Other
-            else {}
-        ),
+        LabTests=lab_tests,
+        Other=other_tables,
     )
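The de-duplication pattern repeated throughout mapper.py, df.loc[:, ~df.columns.duplicated()], keeps the first occurrence of each column label after an axis-1 concat. A small standalone sketch; the column names below are illustrative, not taken from the mapping models.

import pandas as pd

# A freshly built table that already carries the columns the mapper creates ...
base = pd.DataFrame({"project_uid": ["P1-abc"], "location_source_id": ["BH-1"]})
# ... concatenated column-wise with mapping data that repeats one of them.
mapping_data = pd.DataFrame({"location_source_id": ["BH-1"], "ground_level": [4.2]})

combined = pd.concat([base, mapping_data], axis=1)
# columns are now: project_uid, location_source_id, location_source_id, ground_level

# ~combined.columns.duplicated() masks out second and later occurrences of a
# label, so only the first column with each name survives.
deduped = combined.loc[:, ~combined.columns.duplicated()]
print(list(deduped.columns))  # ['project_uid', 'location_source_id', 'ground_level']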
