
Commit 7799f8a

Can specify a pipeline for ingest now, and the standard age function now works for two specified columns.
1 parent 9d47ca0 commit 7799f8a

File tree: 3 files changed, +68 −157 lines


py-modules/map-integration/macrostrat/map_integration/__init__.py

Lines changed: 17 additions & 116 deletions
@@ -15,8 +15,7 @@
 from macrostrat.core import app
 from macrostrat.database import Database
 from macrostrat.map_integration.commands.prepare_fields import _prepare_fields
-
-# from macrostrat.map_integration.pipeline import ingest_map
+#from macrostrat.map_integration.pipeline import ingest_map
 from macrostrat.map_integration.process.geometry import create_rgeom, create_webgeom
 from macrostrat.map_integration.utils.ingestion_utils import (
     find_gis_files,
@@ -139,54 +138,14 @@ def delete_sources(
             dict(table=Identifier("sources", table)),
         )

-        ingest_process = db.run_query(
-            """
-            SELECT id FROM maps_metadata.ingest_process
-            JOIN maps.sources ON maps.sources.source_id = maps_metadata.ingest_process.source_id
-            WHERE maps.sources.slug = :slug
-            """,
-            dict(slug=slug),
-        ).fetchone()
-
-        if ingest_process:
-            ingest_process_id = ingest_process[0]
-
-            print("Ingest Process ID", ingest_process_id)
-            if file_name is None:
-                rows = db.run_query(
-                    "select f.object_id from maps_metadata.map_files f where ingest_process_id = :ingest_process_id",
-                    dict(ingest_process_id=ingest_process_id),
-                ).fetchall()
-                object_ids = [r[0] for r in rows]
-                db.run_sql(
-                    "DELETE FROM maps_metadata.map_files WHERE ingest_process_id = :ingest_process_id",
-                    dict(ingest_process_id=ingest_process_id),
-                )
-                if object_ids:
-                    db.run_sql(
-                        """
-                        DELETE FROM storage.object
-                        WHERE id = ANY(:object_ids)
-                        """,
-                        dict(object_ids=object_ids),
-                    )
-
-            db.run_sql(
-                "DELETE FROM maps_metadata.ingest_process_tag WHERE ingest_process_id = :ingest_process_id",
-                dict(ingest_process_id=ingest_process_id),
-            )
-            db.run_sql(
-                "DELETE FROM maps_metadata.ingest_process WHERE id = :ingest_process_id",
-                dict(ingest_process_id=ingest_process_id),
-            )
-
         staging_delete_dir(s, db)

         source_id = db.run_query(
             "SELECT source_id FROM maps.sources WHERE slug = :slug",
             dict(slug=s),
         ).scalar()

+
         # Delete ALL ingest-related rows for this source
         db.run_sql(
             """
@@ -308,6 +267,7 @@ def _run_migrations(database: str = None):
 def staging(
     data_path: str,
     prefix: str = Option(..., help="Slug region prefix to avoid collisions"),
+    pipeline: str = Option("", help="Specify a pipeline to run"),
     merge_key: str = Option(
         "mapunit",
         help="primary key to left join the metadata into the sources polygons/lines/points table",
@@ -343,6 +303,7 @@ def staging(
     ingest_results = ingest_map(
         slug,
         gis_files,
+        pipeline=pipeline,
         if_exists="replace",
         meta_path=data_path,
         merge_key=merge_key,
@@ -434,28 +395,6 @@ def staging(
     create_rgeom(map_info)
     create_webgeom(map_info)

-    # Ingest process assertions
-    if len(object_ids) > 0:
-        ingest_id = db.run_query(
-            """
-            SELECT id
-            FROM maps_metadata.ingest_process
-            WHERE source_id = :source_id
-            ORDER BY id DESC
-            LIMIT 1
-            """,
-            dict(source_id=source_id),
-        ).scalar()
-
-        for object in object_ids:
-            db.run_sql(
-                """
-                INSERT INTO maps_metadata.map_files (ingest_process_id, object_id)
-                VALUES (:ingest_process_id, :object_id)
-                """,
-                dict(ingest_process_id=ingest_id, object_id=object),
-            )
-
     console.print(
         f"[green] \n Finished staging setup for {slug}. "
         f"View map here: https://dev.macrostrat.org/maps/ingestion/{source_id}/ [/green] \n"
@@ -470,12 +409,7 @@ def staging(


 @staging_cli.command("s3-upload")
-def cmd_upload_dir(
-    slug: str = ...,
-    data_path: Path = ...,
-    ext: str = Option(".gdb", help="extension of the data path"),
-    ingest_process_id: int = Option(None),
-):
+def cmd_upload_dir(slug: str = ..., data_path: Path = ..., ext: str = Option(".gdb", help="extension of the data path"), ingest_process_id: int = Option(None)):
     """Upload a local directory to the staging bucket under SLUG/."""
     db = get_database()
     source_id = db.run_query(
@@ -591,7 +525,6 @@ def list_layers(e00_path: Path) -> set[str]:
     def run(cmd):
         p = subprocess.run(cmd, capture_output=True, text=True)
         return p.returncode, p.stdout, p.stderr
-
     created = False
     for f in e00_files:
         base = f.stem
@@ -609,13 +542,9 @@ def run(cmd):
                 # create/overwrite first successful write
                 cmd += ["-overwrite"]
                 cmd += [
-                    str(out_gpkg),
-                    str(f),
-                    lyr,
-                    "-nln",
-                    f"{base}_lines",
-                    "-nlt",
-                    "LINESTRING",
+                    str(out_gpkg), str(f), lyr,
+                    "-nln", f"{base}_lines",
+                    "-nlt", "LINESTRING",
                 ]
                 rc, _, err = run(cmd)
                 if rc == 0:
@@ -628,13 +557,9 @@ def run(cmd):
             else:
                 cmd = ["ogr2ogr", "-f", "GPKG", "-update", "-append"]
                 cmd += [
-                    str(out_gpkg),
-                    str(f),
-                    lyr,
-                    "-nln",
-                    f"{base}_points",
-                    "-nlt",
-                    "POINT",
+                    str(out_gpkg), str(f), lyr,
+                    "-nln", f"{base}_points",
+                    "-nlt", "POINT",
                 ]
                 rc, _, _ = run(cmd)
                 if rc == 0:
@@ -647,13 +572,9 @@ def run(cmd):
             else:
                 cmd = ["ogr2ogr", "-f", "GPKG", "-update", "-append"]
                 cmd += [
-                    str(out_gpkg),
-                    str(f),
-                    lyr,
-                    "-nln",
-                    f"{base}_polygons",
-                    "-nlt",
-                    "POLYGON",
+                    str(out_gpkg), str(f), lyr,
+                    "-nln", f"{base}_polygons",
+                    "-nlt", "POLYGON",
                 ]
                 rc, _, _ = run(cmd)
                 if rc == 0:
@@ -663,7 +584,6 @@ def run(cmd):

     print(f"Done: {out_gpkg}")

-
 # ----------------------------------------------------------------------------------------------------------------------


@@ -795,38 +715,19 @@ def staging_bulk(
             ),
         )

+
         cmd_upload_dir(slug=slug, data_path=region_path, ext=ext)

+
         map_info = get_map_info(db, slug)
         _prepare_fields(map_info)
         create_rgeom(map_info)
         create_webgeom(map_info)
-        # Ingest process assertions
-        if len(object_ids) > 0:
-            ingest_id = db.run_query(
-                """
-                SELECT id
-                FROM maps_metadata.ingest_process
-                WHERE source_id = :source_id
-                ORDER BY id DESC
-                LIMIT 1
-                """,
-                dict(source_id=source_id),
-            ).scalar()
-
-            for object in object_ids:
-                db.run_sql(
-                    """
-                    INSERT INTO maps_metadata.map_files (ingest_process_id, object_id)
-                    VALUES (:ingest_process_id, :object_id)
-                    """,
-                    dict(ingest_process_id=ingest_id, object_id=object),
-                )

         print(
             f"\nFinished staging setup for {slug}. View map here: https://dev.macrostrat.org/maps/ingestion/{source_id}/ \n"
         )
     slug_list_path = parent / f"staged_slugs.txt"
     with open(slug_list_path, "w") as file:
         for slug in staged_slugs:
-            file.write(slug + "\n")
+            file.write(slug + "\n")
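
For context, the new `pipeline` argument added to `staging` is passed straight through to `ingest_map` instead of the pipeline being derived later from the metadata file's extension. A minimal sketch of the new call pattern is below; the slug, file paths, and the import path (taken from the `commands/ingest.py` file shown next in this commit) are assumptions for illustration, not part of the diff.

```python
from pathlib import Path

from macrostrat.map_integration.commands.ingest import ingest_map

# Hypothetical inputs: one GeoPackage that carries both geometry and age columns.
gis_files = [Path("data/test_region/geology.gpkg")]

ingest_results = ingest_map(
    "test_region_geology",       # hypothetical slug
    gis_files,
    pipeline=".gpkg",            # new: force the .gpkg pipeline explicitly
    if_exists="replace",
    meta_path=Path("data/test_region/geology.gpkg"),
    merge_key="mapunit",
)
```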

py-modules/map-integration/macrostrat/map_integration/commands/ingest.py

Lines changed: 15 additions & 12 deletions
@@ -45,7 +45,7 @@ def merge_metadata_polygons(polygon_df, meta_df, join_col) -> G.GeoDataFrame:


 def preprocess_dataframe(
-    poly_line_pt_df: G.GeoDataFrame, meta_path: Path, join_col: str, feature_suffix: str
+    poly_line_pt_df: G.GeoDataFrame, meta_path: Path, join_col: str, feature_suffix: str, pipeline: str
 ) -> Tuple[G.GeoDataFrame, str, str, str]:
     """
     Preprocess a GeoDataFrame by merging in metadata from a local .tsv,
@@ -62,23 +62,25 @@ def preprocess_dataframe(
     ingest_pipeline = ""
     comments = ""
     state = ""
-    ext = meta_path.suffix.lower()
-    print("here is the ext!", ext)
-    if ext == ".tsv":
+    if pipeline == ".tsv":
         meta_df = P.read_csv(meta_path, sep="\t")
         ingest_pipeline = ".tsv pipeline"
         # TODO tsv pipeline for if feature_suffix == "polygons", "lines" OR "points"
-    elif ext == ".csv":
+    elif pipeline == ".csv":
         meta_df = P.read_csv(meta_path)
         ingest_pipeline = ".csv pipeline"
         # TODO csv pipeline for if feature_suffix == "polygons", "lines" OR "points"
-    elif ext in [".xls", ".xlsx"]:
+    elif pipeline in [".xls", ".xlsx"]:
         ingest_pipeline = ".xls pipeline"
         meta_df = P.read_excel(meta_path)
         # TODO xls pipeline for if feature_suffix == "polygons", "lines" OR "points"
-    elif ext == ".gpkg":
-        map_t_b_standard(poly_line_pt_df, "epoch", "period")
-    elif ext == ".gdb":
+    elif pipeline == ".gpkg":
+        meta_df = map_t_b_standard(poly_line_pt_df, "epoch", "period")
+        ingest_pipeline = ".gpkg pipeline"
+        state = "needs review"
+        comments = ""
+        return meta_df, ingest_pipeline, comments, state
+    elif pipeline == ".gdb":
         if feature_suffix == "polygons":
             join_col = "mapunit"
             if join_col not in poly_line_pt_df.columns:
@@ -167,6 +169,7 @@ def ingest_map(
     files: List[Path],
     embed: bool = False,
     crs: str = None,
+    pipeline: str = "",
     if_exists: str = "replace",
     meta_path: str = None,
     # TODO add default key column to the first column in the file
@@ -268,23 +271,23 @@ def ingest_map(
     # concatenate all polygons into a single df, lines, and points as well
     for feature_type, df_list in frames.items():
         # Concatenate all dataframes
-        print("about to concatenate all df's per feature")
         df = G.GeoDataFrame(P.concat(df_list, ignore_index=True))
-        print("about to check for duplicates")
         df = df.loc[:, ~df.columns.duplicated()]
-
         feature_suffix = feature_type.lower() + "s"
         if feature_suffix == "linestrings":
             feature_suffix = "lines"
         # preprocess dataframe will take the concatenated polygons, lines, or points df and see if there are any metadata
         # formatted_filenames to append and map based on whatever integration pipeline is needed (inferred from the meta_path's ext)
         if meta_path:
             df.columns = df.columns.str.lower()
+            if pipeline == "":
+                pipeline = meta_path.suffix.lower()
             df, ingest_pipeline, comments, state = preprocess_dataframe(
                 df,
                 meta_path=meta_path,
                 join_col=join_col.lower(),
                 feature_suffix=feature_suffix,
+                pipeline=pipeline
             )
             if feature_suffix == "polygons":
                 ingest_results["ingest_pipeline"] = ingest_pipeline
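
The dispatch behavior added to `ingest_map` reduces to a small rule: an explicitly supplied pipeline wins, otherwise the metadata file's extension selects the branch taken by `preprocess_dataframe`. A self-contained sketch of that rule (the helper name `choose_pipeline` is made up for illustration and does not exist in this codebase):

```python
from pathlib import Path

def choose_pipeline(pipeline: str, meta_path: Path) -> str:
    """Mirror the fallback added above: an explicit pipeline string wins;
    otherwise the metadata file's extension decides which branch
    preprocess_dataframe takes (".tsv", ".csv", ".xls"/".xlsx", ".gpkg", ".gdb")."""
    if pipeline == "":
        pipeline = meta_path.suffix.lower()
    return pipeline

# Resulting dispatch:
assert choose_pipeline("", Path("legend.tsv")) == ".tsv"     # inferred from extension
assert choose_pipeline(".gpkg", Path("map.gdb")) == ".gpkg"  # explicit option wins
```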

py-modules/map-integration/macrostrat/map_integration/utils/ingestion_utils.py

Lines changed: 36 additions & 29 deletions
@@ -107,39 +107,46 @@ def get_strat_names_df() -> pd.DataFrame:


 # standard map age function. User gets to input their column 1 and a column 2 data to map to our ages.
-def map_t_b_standard(
-    meta_df: G.GeoDataFrame, col_one: str, col_two: str
-) -> G.GeoDataFrame:
-    """Populate the b_interval field using age and name information.
-    The function first tries a direct match between legend_df.age and the
-    canonical interval list. For formations whose age is not explicit, it scans
-    the formation name for any word that appears in the interval list.
-    Parameters:
-    legend_df : G.GeoDataFrame. Legend table with at least age and name columns.
-
-    Returns:
-    G.GeoDataFrame: The input frame with a newly filled/created b_interval column.
-    """
+def map_t_b_standard(meta_df: G.GeoDataFrame, col_one: str, col_two: str) -> G.GeoDataFrame:
     interval_df = get_age_interval_df().reset_index(drop=True)
     interval_lookup = {
-        row["interval_name"].lower(): row["id"] for _, row in interval_df.iterrows()
+        str(row["interval_name"]).strip().lower(): int(row["id"])
+        for _, row in interval_df.iterrows()
     }

-    # map age fields to b/t intervals
-    # must have a match in the macrotrat.intervals dictionary in order to return a valid interval
-    for word in meta_df[col_one]:
-        if word in interval_lookup:
-            meta_df["b_interval"] = interval_lookup[word]
-            meta_df["t_interval"] = interval_lookup[word]
-
-    # for the rest of NA's we will map the name field to b/t intervals
-    needs_fill = meta_df["b_interval"].isna()
-
-    if needs_fill.any():
-        for word in meta_df[col_two]:
-            if word in interval_lookup:
-                meta_df["b_interval"] = interval_lookup[word]
-                meta_df["t_interval"] = interval_lookup[word]
+    # Ensure columns exist (prevents KeyError)
+    if "b_interval" not in meta_df.columns:
+        meta_df["b_interval"] = pd.NA
+    if "t_interval" not in meta_df.columns:
+        meta_df["t_interval"] = pd.NA
+
+    if col_one in meta_df.columns:
+        mapped_col_one = (
+            meta_df[col_one]
+            .astype("string")
+            .str.strip()
+            .str.lower()
+            .replace("", pd.NA)
+            .map(interval_lookup)
+        )
+        meta_df["b_interval"] = mapped_col_one
+        meta_df["t_interval"] = mapped_col_one
+
+    # fallback to map col_two if col_one row is empty
+    if col_two in meta_df.columns:
+        needs_fill = meta_df["b_interval"].isna() | meta_df["t_interval"].isna()
+        if needs_fill.any():
+            mapped_col_two = (
+                meta_df.loc[needs_fill, col_two]
+                .astype("string")
+                .str.strip()
+                .str.lower()
+                .replace("", pd.NA)
+                .map(interval_lookup)
+            )
+            meta_df.loc[needs_fill, "b_interval"] = mapped_col_two
+            meta_df.loc[needs_fill, "t_interval"] = mapped_col_two
+
     return meta_df

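The rewritten `map_t_b_standard` replaces the old row-by-row loops (which overwrote the whole column on every match) with a vectorized normalize-then-map pass over `col_one`, plus a fallback pass over `col_two` for rows still missing an interval. The snippet below illustrates the same pattern in isolation; the hard-coded `interval_lookup` and its IDs are made-up stand-ins for the table returned by `get_age_interval_df()`.

```python
import pandas as pd

# Made-up interval IDs standing in for get_age_interval_df().
interval_lookup = {"jurassic": 1, "cretaceous": 2, "paleogene": 3}

meta_df = pd.DataFrame(
    {
        "epoch": ["Jurassic ", None, "unknown"],          # col_one
        "period": ["Jurassic", "Cretaceous", "Paleogene"],  # col_two
    }
)

# Primary pass: normalize col_one and map it to interval IDs.
mapped = (
    meta_df["epoch"]
    .astype("string")
    .str.strip()
    .str.lower()
    .replace("", pd.NA)
    .map(interval_lookup)
)
meta_df["b_interval"] = mapped
meta_df["t_interval"] = mapped

# Fallback pass: rows with no match in col_one are filled from col_two.
needs_fill = meta_df["b_interval"].isna()
fallback = (
    meta_df.loc[needs_fill, "period"].str.strip().str.lower().map(interval_lookup)
)
meta_df.loc[needs_fill, "b_interval"] = fallback
meta_df.loc[needs_fill, "t_interval"] = fallback

print(meta_df)
# Row 0 maps from "Jurassic " via epoch; rows 1 and 2 fall back to period.
```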
