Implement ags_to_brgi_db_mapping

JoostGevaert · JoostGevaert · commit 17774575bdb8 · 2025-05-28T16:31:33.000+02:00
diff --git a/examples/hk_kaitak_ags3/hk_kaitak_ags3_to_brgi_geodb.py b/examples/hk_kaitak_ags3/hk_kaitak_ags3_to_brgi_geodb.py
@@ -225,6 +225,36 @@ def _(mo):
     return
 
 
+@app.cell
+def _(CRS, zip, zipfile):
+    from bedrock_ge.gi.ags_parser import ags_to_brgi_db_mapping
+
+    projected_crs = CRS("EPSG:2326")
+    vertical_crs = CRS("EPSG:5738")
+
+    with zipfile.ZipFile(zip) as zip_ref:
+        # Iterate over files and directories in the .zip archive
+        for file_name in zip_ref.namelist():
+            # Only process files that have an .ags or .AGS extension
+            if file_name.lower().endswith(".ags"):
+                print(f"\n🖥️ Processing {file_name} ...")
+                with zip_ref.open(file_name) as ags3_file:
+                    # Map the content of a single AGS 3 file to the Bedrock GI data model
+                    ags3_to_brgi_db_mapping = ags_to_brgi_db_mapping(
+                        ags3_file, projected_crs, vertical_crs
+                    )
+
+    # with zipfile.ZipFile(zip) as zip_ref:
+    #     file_name = "58358/GE201304.18A.ags"
+    #     print(f"\n🖥️ Processing {file_name} ...")
+    #     with zip_ref.open(file_name) as ags3_file:
+    #         # Map the content of a single AGS 3 file to the Bedrock GI data model
+    #         ags3_to_brgi_db_mapping = ags_to_brgi_db_mapping(
+    #             ags3_file, projected_crs, vertical_crs
+    #         )
+    return
+
+
 @app.cell
 def _(CRS, pd, zip, zip_of_ags3s_to_bedrock_gi_database):
     brgi_db = zip_of_ags3s_to_bedrock_gi_database(zip, CRS("EPSG:2326"))
diff --git a/examples/hk_kaitak_ags3/kaitak_gi.gpkg b/examples/hk_kaitak_ags3/kaitak_gi.gpkg
diff --git a/src/bedrock_ge/gi/ags3.py b/src/bedrock_ge/gi/ags3.py
@@ -19,7 +19,7 @@
 from bedrock_ge.gi.io_utils import coerce_string, open_text_data_source
 
 
-def ags3_to_db(
+def ags3_to_dfs(
     source: str | Path | IO[str] | IO[bytes] | bytes, encoding: str
 ) -> dict[str, pd.DataFrame]:
     """Converts AGS 3 data to a dictionary of pandas DataFrames.
@@ -35,7 +35,7 @@ def ags3_to_db(
             a pandas DataFrame containing the data for that group.
     """
     # Initialize dictionary and variables used in the AGS 3 read loop
-    ags3_db = {}
+    ags3_dfs = {}
     line_type = "line_0"
     group = ""
     headers: list[str] = ["", "", ""]
@@ -50,7 +50,7 @@ def ags3_to_db(
             if line.startswith('"**'):
                 line_type = "group_name"
                 if group:
-                    ags3_db[group] = pd.DataFrame(group_data, columns=headers)
+                    ags3_dfs[group] = pd.DataFrame(group_data, columns=headers)
 
                 group = line.strip(' ,"*')
                 group_data = []
@@ -86,7 +86,7 @@ def ags3_to_db(
                     continue
                 elif len(data_row) != len(headers):
                     print(
-                        f"\n🚨 CAUTION: The number of columns on line {i + 1} ({len(data_row)}) doesn't match the number of columns of group {group} ({len(headers)})!",
+                        f"\n🚨 CAUTION: The number of columns ({len(data_row)}) on line {i + 1} doesn't match the number of columns ({len(headers)}) of group {group}!",
                         f"{group} headers: {headers}",
                         f"Line {i + 1}:      {data_row}",
                         sep="\n",
@@ -113,75 +113,84 @@ def ags3_to_db(
                     group_data.append(cleaned_data_row)
 
     # Also add the last group's df to the dictionary of AGS dfs
-    ags3_db[group] = pd.DataFrame(group_data, columns=headers).dropna(axis=1, how="all")
+    ags3_dfs[group] = pd.DataFrame(group_data, columns=headers).dropna(
+        axis=1, how="all"
+    )
 
     if not group:
         print(
             '🚨 ERROR: The provided AGS 3 data does not contain any groups, i.e. lines starting with "**'
         )
 
-    return ags3_db
+    return ags3_dfs
 
 
 # TODO: AGS 3 table validation based on the AGS 3 data dictionary.
 def ags3_to_brgi_db_mapping(
-    ags3_db: dict[str, pd.DataFrame],
+    source: str | Path | IO[str] | IO[bytes] | bytes,
     projected_crs: CRS,
-    vertical_crs: CRS = CRS(3855),
+    vertical_crs: CRS,
+    encoding: str,
 ) -> BedrockGIDatabaseMapping:
-    """Map AGS 3 data to Bedrock GI data model.
+    """Map AGS 3 data to the Bedrock GI data model.
 
     Args:
         ags3_db (dict[str, pd.DataFrame]): A dictionary of pandas DataFrames, i.e. database,
             where each key is an AGS 3 group, and the corresponding value is
             a pandas DataFrame containing the data for that group.
         projected_crs (CRS): Projected coordinate reference system (CRS).
-        vertical_crs (CRS, optional): Vertical CRS.
-            Defaults to the Earth Gravitational Model 2008.
+        vertical_crs (CRS): Vertical CRS. For example EGM2008 height, EPSG:3855,
+            which measures the orthometric height w.r.t. the Earth Gravitational Model 2008.
+        encoding (str): Encoding of the text file or bytes stream.
 
     Returns:
         BedrockGIDatabaseMapping: Object that maps AGS 3 data to Bedrock GI data model.
     """
-    check_ags_proj_group(ags3_db["PROJ"])
+    ags3_dfs = ags3_to_dfs(source, encoding)
+
+    check_ags_proj_group(ags3_dfs["PROJ"])
     ags3_project = ProjectTableMapping(
-        data=ags3_db["PROJ"].to_dict(orient="records")[0],
-        project_uid=ags3_db["PROJ"]["PROJ_ID"][0],
+        data=ags3_dfs["PROJ"].to_dict(orient="records")[0],
+        project_uid=ags3_dfs["PROJ"]["PROJ_ID"].iloc[0],
         horizontal_crs=projected_crs,
         vertical_crs=vertical_crs,
     )
-    del ags3_db["PROJ"]
+    del ags3_dfs["PROJ"]
 
-    Ags3HOLE.validate(ags3_db["HOLE"])
+    Ags3HOLE.validate(ags3_dfs["HOLE"])
     ags3_location = LocationTableMapping(
-        data=ags3_db["HOLE"],
+        data=ags3_dfs["HOLE"],
         location_id_column="HOLE_ID",
         easting_column="HOLE_NATE",
         northing_column="HOLE_NATN",
         ground_level_elevation_column="HOLE_GL",
         depth_to_base_column="HOLE_FDEP",
     )
-    del ags3_db["HOLE"]
+    del ags3_dfs["HOLE"]
 
-    if "SAMP" in ags3_db.keys():
-        Ags3SAMP.validate(ags3_db["SAMP"])
-        samp_df = ags3_db["SAMP"]
+    if "SAMP" in ags3_dfs.keys():
+        Ags3SAMP.validate(ags3_dfs["SAMP"])
+        samp_df = ags3_dfs["SAMP"]
         samp_df = _add_sample_source_id(samp_df)
         ags3_sample = SampleTableMapping(
             data=samp_df,
             location_id_column="HOLE_ID",
             sample_id_column="sample_source_id",
             depth_to_top_column="SAMP_TOP",
         )
-        del ags3_db["SAMP"]
+        del ags3_dfs["SAMP"]
     else:
         print("Your AGS 3 data doesn't contain a SAMP group, i.e. samples.")
+        ags3_sample = None
 
     ags3_lab_tests = []
     ags3_insitu_tests = []
     ags3_other_tables = []
 
-    for group, df in ags3_db.items():
-        if "SAMP_TOP" in df.columns:
+    for group, df in ags3_dfs.items():
+        # Non-standard AGS 3 groups / headings are prefixed with "?", so requiring an exact
+        # "SAMP_TOP" / "HOLE_ID" column is too restrictive => match on a substring instead.
+        if any("SAMP_TOP" in col for col in df.columns):
             df = _add_sample_source_id(df)
             ags3_lab_tests.append(
                 LabTestTableMapping(
@@ -191,7 +200,7 @@ def ags3_to_brgi_db_mapping(
                     sample_id_column="sample_source_id",
                 )
             )
-        elif "HOLE_ID" in df.columns:
+        elif any("HOLE_ID" in col for col in df.columns):
             top_depth, base_depth = _get_depth_columns(group, list(df.columns))
             ags3_insitu_tests.append(
                 InSituTestTableMapping(
diff --git a/src/bedrock_ge/gi/ags_parser.py b/src/bedrock_ge/gi/ags_parser.py
@@ -0,0 +1,84 @@
+from __future__ import annotations
+
+from pathlib import Path
+from typing import IO
+
+import pandas as pd
+from pyproj import CRS
+
+from bedrock_ge.gi.ags3 import ags3_to_brgi_db_mapping
+from bedrock_ge.gi.brgi_db_mapping import BedrockGIDatabaseMapping
+from bedrock_ge.gi.io_utils import detect_encoding, open_text_data_source
+
+
+def ags_to_brgi_db_mapping(
+    source: str | Path | IO[str] | IO[bytes] | bytes,
+    projected_crs: CRS,
+    vertical_crs: CRS = CRS(3855),
+    encoding: str | None = None,
+) -> BedrockGIDatabaseMapping:
+    """Map AGS 3 or AGS 4 data to the Bedrock GI data model.
+
+    Args:
+        source (str | Path | IO[str] | IO[bytes] | bytes): The AGS file (str or Path)
+            or a file-like object that represents the AGS file.
+        projected_crs (CRS): Projected Coordinate Reference System (CRS). For example:
+            - OSGB36 / British National Grid: `pyproj.CRS("EPSG:27700")`
+            - Hong Kong 1980 Grid System: `pyproj.CRS("EPSG:2326")`
+        vertical_crs (CRS, optional): Vertical CRS. Defaults to EGM2008 height, EPSG:3855
+            which measures the orthometric height w.r.t. the Earth Gravitational Model 2008.
+            - Ordnance Datum Newlyn (ODN) Height: `pyproj.CRS("EPSG:5701")`
+            - Hong Kong Principal Datum (HKPD) Height: `pyproj.CRS("EPSG:5738")`
+        encoding (str | None, optional): Encoding of the text file or bytes stream.
+            Defaults to None. An attempt at detecting the encoding will be made if None.
+
+    Raises:
+        ValueError: If the data does not match AGS 3 or AGS 4 format.
+        NotImplementedError: If the data is AGS 4, which cannot be mapped yet.
+
+    Returns:
+        BedrockGIDatabaseMapping: Object that maps AGS 3 or AGS 4 data to Bedrock GI data model.
+    """
+    if not encoding:
+        encoding = detect_encoding(source)
+
+    # Get first non-blank line, None if all lines are blank
+    with open_text_data_source(source, encoding=encoding) as f:
+        first_line = next((line.strip() for line in f if line.strip()), None)
+
+    if not first_line:
+        raise ValueError("The file provided has only blank lines")
+
+    if first_line.startswith('"**'):
+        ags_version = 3
+        brgi_db_mapping = ags3_to_brgi_db_mapping(
+            source, projected_crs, vertical_crs, encoding
+        )
+    elif first_line.startswith('"GROUP"'):
+        # AGS 4 isn't mapped yet; fail here instead of on an unbound `brgi_db_mapping` below.
+        raise NotImplementedError(
+            "Mapping AGS 4 data to the Bedrock GI data model is not implemented yet."
+        )
+    else:
+        raise ValueError("The data provided is not valid AGS 3 or AGS 4 data.")
+
+    # Log information about the mapped AGS 3 or AGS 4 data
+    project_uid = brgi_db_mapping.Project.project_uid
+    n_gi_locations = len(brgi_db_mapping.Location.data)
+    n_samples = len(brgi_db_mapping.Sample.data) if brgi_db_mapping.Sample else 0
+    print_args = [
+        f"AGS {ags_version} data was read for Project {project_uid}",
+        f"This GI data contains {n_gi_locations} GI locations, {n_samples} samples and:",
+        f"  - In-Situ Tests: {[insitu_test.table_name for insitu_test in brgi_db_mapping.InSitu]}",
+    ]
+    if brgi_db_mapping.Lab:
+        print_args.append(
+            f"  - Lab Tests: {[lab_test.table_name for lab_test in brgi_db_mapping.Lab]}"
+        )
+    if brgi_db_mapping.Other:
+        print_args.append(
+            f"  - Other Tables: {[other_table.table_name for other_table in brgi_db_mapping.Other]}"
+        )
+    print(*print_args, sep="\n", end="\n\n")
+
+    return brgi_db_mapping
diff --git a/src/bedrock_ge/gi/ags_schemas.py b/src/bedrock_ge/gi/ags_schemas.py
@@ -81,13 +81,6 @@ class BaseSAMP(pa.DataFrameModel):
 
 
 class Ags3SAMP(BaseSAMP):
-    sample_id: Series[str] = pa.Field(
-        # primary_key=True,
-        unique=True,
-        coerce=True,
-        description="Sample unique identifier",
-        # example="REF_TYPE_TOP_HOLE_ID",
-    )
     HOLE_ID: Series[str] = pa.Field(
         # foreign_key="Ags3HOLE.HOLE_ID",
         description="Exploratory hole or location equivalent",
diff --git a/src/bedrock_ge/gi/brgi_db_mapping.py b/src/bedrock_ge/gi/brgi_db_mapping.py
@@ -68,8 +68,8 @@ class BedrockGIDatabaseMapping(BaseModel):
     Location: LocationTableMapping
     InSitu: list[InSituTestTableMapping]
     Sample: Optional[SampleTableMapping] = None
-    Lab: Optional[list[LabTestTableMapping]] = None
-    Other: Optional[list[OtherTable]] = None
+    Lab: Optional[list[LabTestTableMapping]] = []
+    Other: Optional[list[OtherTable]] = []
 
 
 def map_to_brgi_db(brgi_db_mapping: BedrockGIDatabaseMapping) -> BedrockGIDatabase:
diff --git a/src/bedrock_ge/gi/gis_geometry.py b/src/bedrock_ge/gi/gis_geometry.py
@@ -9,8 +9,6 @@
 from pyproj.crs import CRS
 from shapely.geometry import LineString, Point
 
-# TODO: change function type hints, such that pandera checks the dataframes against the Bedrock schemas
-
 
 def calculate_gis_geometry(
     no_gis_brgi_db: Dict[str, Union[pd.DataFrame, gpd.GeoDataFrame]],
@@ -52,13 +50,13 @@ def calculate_gis_geometry(
         print("Calculating GIS geometry for the Bedrock GI database tables...")
 
     # Check if all projects have the same CRS
-    if not brgi_db["Project"]["crs_wkt"].nunique() == 1:
+    if not brgi_db["Project"]["horizontal_crs_wkt"].nunique() == 1:
         raise ValueError(
             "All projects must have the same CRS (Coordinate Reference System).\n"
-            "Raise an issue on GitHub in case you need to be able to combine GI data that was acquired in multiple different CRS's."
+            "Raise an issue on GitHub in case you need to be able to combine GI data that was acquired in multiple different CRSes."
         )
 
-    crs = CRS.from_wkt(brgi_db["Project"]["crs_wkt"].iloc[0])
+    crs = CRS.from_wkt(brgi_db["Project"]["horizontal_crs_wkt"].iloc[0])
 
     # Calculate GIS geometry for the 'Location' table
     if verbose: