Skip to content

Commit 5380e1d

Browse files
committed
Implement database merge
1 parent 3da669d commit 5380e1d

File tree

2 files changed

+164
-42
lines changed

2 files changed

+164
-42
lines changed

examples/hk_kaitak_ags3/hk_kaitak_ags3_to_brgi_geodb.py

Lines changed: 94 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -225,14 +225,6 @@ def _(mo):
225225
return
226226

227227

228-
@app.cell
229-
def _():
230-
none = None
231-
boolean = not none
232-
boolean
233-
return
234-
235-
236228
@app.cell
237229
def _(CRS, zip, zipfile):
238230
from bedrock_ge.gi.ags3 import ags3_to_brgi_db_mapping
@@ -243,9 +235,7 @@ def _(CRS, zip, zipfile):
243235

244236
projected_crs = CRS("EPSG:2326")
245237
vertrical_crs = CRS("EPSG:5738")
246-
brgi_db_from_1_ags3_file = None
247-
brgi_db_obj = None
248-
238+
brgi_dbs = []
249239
with zipfile.ZipFile(zip) as zip_ref:
250240
# Iterate over files and directories in the .zip archive
251241
for i, file_name in enumerate(zip_ref.namelist()):
@@ -257,14 +247,14 @@ def _(CRS, zip, zipfile):
257247
ags3_mapping = ags_to_brgi_db_mapping(
258248
ags3_file, projected_crs, vertrical_crs
259249
)
260-
brgi_db_from_1_ags3_file = map_to_brgi_db(ags3_mapping)
250+
brgi_dbs.append(map_to_brgi_db(ags3_mapping))
261251

262252
# if not brgi_db_obj:
263253
# brgi_db_obj = brgi_db_from_1_ags3_file
264-
254+
265255
# if brgi_db_obj and brgi_db_from_1_ags3_file:
266256
# brgi_db_obj = merge_databases(brgi_db_obj, brgi_db_from_1_ags3_file)
267-
257+
268258
# print(f"i = {i}: brgi_db = {brgi_db_obj}")
269259

270260
# with zipfile.ZipFile(zip) as zip_ref:
@@ -279,6 +269,96 @@ def _(CRS, zip, zipfile):
279269
# brgi_db_obj = map_to_brgi_db(ags3_mapping_obj)
280270

281271
# brgi_db_obj
272+
return ags3_mapping, brgi_dbs, merge_databases
273+
274+
275+
@app.cell
def _(project_data_jsons):
    # Show the built-in hash of the serialized project data as the cell output.
    # NOTE: Python salts str hashes per process, so this value is not stable
    # across interpreter sessions.
    hash(project_data_jsons)
    return
279+
280+
281+
@app.cell
def _(ags3_mapping, pd):
    import base64
    import hashlib
    import json

    # Prototype: derive a project UID from the project ID plus a short digest
    # of the project data, and build a location table keyed by that UID.
    brgi_db_mapping = ags3_mapping

    # sort_keys=True makes the serialization independent of dict key order,
    # so equal project data always produces the same digest.
    project_data_jsons = json.dumps(brgi_db_mapping.Project.data, sort_keys=True)
    project_data_hash = hashlib.blake2b(
        project_data_jsons.encode("utf-8"), digest_size=9
    ).digest()
    # 9 digest bytes encode to exactly 12 base64 characters (no "=" padding).
    # The URL-safe alphabet is used so the value matches its name: the
    # standard alphabet can emit "+" and "/".
    url_safe_hash = base64.urlsafe_b64encode(project_data_hash).decode()
    project_uid = brgi_db_mapping.Project.project_id + "-" + url_safe_hash

    # NOTE(review): assumes the location ID column holds strings, so that
    # appending the "_<project_uid>" suffix concatenates element-wise; confirm.
    location_df = pd.DataFrame(
        {
            "location_uid": brgi_db_mapping.Location.data[
                brgi_db_mapping.Location.location_id_column
            ]
            + f"_{project_uid}",
            "location_source_id": brgi_db_mapping.Location.data[
                brgi_db_mapping.Location.location_id_column
            ],
            "project_uid": project_uid,
            "easting": brgi_db_mapping.Location.data[
                brgi_db_mapping.Location.easting_column
            ],
            "northing": brgi_db_mapping.Location.data[
                brgi_db_mapping.Location.northing_column
            ],
            "ground_level_elevation": brgi_db_mapping.Location.data[
                brgi_db_mapping.Location.ground_level_elevation_column
            ],
            "depth_to_base": brgi_db_mapping.Location.data[
                brgi_db_mapping.Location.depth_to_base_column
            ],
        }
    )
    return base64, hashlib, project_data_jsons, project_uid, url_safe_hash
319+
320+
321+
@app.cell
def _(url_safe_hash):
    # Print the base64 text of the project-data digest.
    print(url_safe_hash)
    return
326+
327+
328+
@app.cell
def _(base64):
    # Encode three sample bytes with the standard base64 alphabet and print
    # the resulting text.
    _sample = b'\xff\x00\xfa'
    print(base64.b64encode(_sample).decode())
    return
334+
335+
336+
@app.cell
def _(base64, hashlib, project_data_jsons, project_uid):
    print(project_uid)
    # Alternative encoding experiment: an 8-byte BLAKE2b digest of the
    # serialized project data, rendered as base85 text.
    _hasher = hashlib.blake2b(project_data_jsons.encode("utf-8"), digest_size=8)
    safe_hash = base64.b85encode(_hasher.digest()).decode()
    safe_hash
    return
345+
346+
347+
@app.cell
def _(ags3_mapping):
    # Display the Project entry of the AGS3 mapping as the cell output.
    ags3_mapping.Project
    return
351+
352+
353+
@app.cell
def _(brgi_dbs):
    # Display the Project table of the database at index 7 (the eighth
    # database appended to brgi_dbs).
    brgi_dbs[7].Project
    return
357+
358+
359+
@app.cell
def _(brgi_dbs, merge_databases):
    # Merge all databases collected in brgi_dbs into a single database object.
    brgi_db_obj = merge_databases(brgi_dbs)
    return
283363

284364

src/bedrock_ge/gi/db_operations.py

Lines changed: 70 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -59,31 +59,73 @@ def merge_databases(
5959
elif len(dbs) == 1 and isinstance(dbs[0], BedrockGIDatabase):
6060
return dbs[0]
6161

62-
target_db = dbs.pop(0)
63-
64-
merged_db = {
65-
"Project": "merged_project",
66-
"Location": "merged_location",
67-
}
68-
69-
# merged_db = BedrockGIDatabase(
70-
# Project=target_db.Project.append(incoming_db.Project),
71-
# Location=target_db.Location.append(incoming_db.Location),
72-
# InSituTests={
73-
# k: target_db.InSituTests[k].append(incoming_db.InSituTests[k])
74-
# for k in target_db.InSituTests
75-
# if k in incoming_db.InSituTests
76-
# },
77-
# Sample=target_db.Sample.append(incoming_db.Sample),
78-
# LabTests={
79-
# k: target_db.LabTests[k].append(incoming_db.LabTests[k])
80-
# for k in target_db.LabTests
81-
# if k in incoming_db.LabTests
82-
# },
83-
# Other={
84-
# k: target_db.Other[k].append(incoming_db.Other[k])
85-
# for k in target_db.Other
86-
# if k in incoming_db.Other
87-
# },
88-
# )
89-
return merged_db
62+
merged_project = pd.concat([db.Project for db in dbs], ignore_index=True)
63+
merged_project.drop_duplicates(inplace=True)
64+
ProjectSchema.validate(merged_project)
65+
66+
merged_location = pd.concat([db.Location for db in dbs], ignore_index=True)
67+
merged_location.drop_duplicates(inplace=True)
68+
LocationSchema.validate(merged_location)
69+
check_foreign_key("project_uid", merged_project, merged_location)
70+
71+
insitu_tables: set[str] = set()
72+
lab_tables: set[str] = set()
73+
other_tables: set[str] = set()
74+
for db in dbs:
75+
insitu_tables.update(db.InSituTests.keys())
76+
if db.LabTests:
77+
lab_tables.update(db.LabTests.keys())
78+
if db.Other:
79+
other_tables.update(db.Other.keys())
80+
81+
merged_insitu: dict[str, pd.DataFrame] = {}
82+
for insitu_table in insitu_tables:
83+
insitu_df = pd.concat(
84+
[db.InSituTests.get(insitu_table) for db in dbs], ignore_index=True
85+
)
86+
insitu_df.drop_duplicates(inplace=True)
87+
InSituTestSchema.validate(insitu_df)
88+
check_foreign_key("project_uid", merged_project, insitu_df)
89+
check_foreign_key("location_uid", merged_location, insitu_df)
90+
merged_insitu[insitu_table] = insitu_df
91+
92+
sample_dfs = [db.Sample for db in dbs if db.Sample is not None]
93+
if sample_dfs:
94+
merged_sample = pd.concat(sample_dfs, ignore_index=True)
95+
merged_sample.drop_duplicates(inplace=True)
96+
SampleSchema.validate(merged_sample)
97+
check_foreign_key("project_uid", merged_project, merged_sample)
98+
99+
merged_lab: dict[str, pd.DataFrame] = {}
100+
for lab_table in lab_tables:
101+
lab_dfs = [
102+
db.LabTests.get(lab_table)
103+
for db in dbs
104+
if db.LabTests.get(lab_table) is not None
105+
]
106+
lab_df = pd.concat(lab_dfs, ignore_index=True)
107+
lab_df.drop_duplicates(inplace=True)
108+
check_foreign_key("project_uid", merged_project, lab_df)
109+
check_foreign_key("sample_uid", merged_sample, lab_df)
110+
merged_lab[lab_table] = lab_df
111+
112+
merged_other: dict[str, pd.DataFrame] = {}
113+
for other_table in other_tables:
114+
other_dfs = [
115+
db.Other.get(other_table)
116+
for db in dbs
117+
if db.Other.get(other_table) is not None
118+
]
119+
other_df = pd.concat(other_dfs, ignore_index=True)
120+
other_df.drop_duplicates(inplace=True)
121+
check_foreign_key("project_uid", merged_project, other_df)
122+
merged_other[other_table] = other_df
123+
124+
return BedrockGIDatabase(
125+
Project=merged_project,
126+
Location=merged_location,
127+
InSituTests=merged_insitu,
128+
Sample=merged_sample,
129+
LabTests=merged_lab,
130+
Other=merged_other,
131+
)

0 commit comments

Comments
 (0)