UW-Macrostrat · amyfromandi · Jan 6, 2026 · Dec 23, 2025 · Dec 23, 2025 · Dec 24, 2025
diff --git a/.gitignore b/.gitignore
@@ -3,3 +3,4 @@ __pycache__
 macrostrat.toml
 *.egg-info
 *.pyc
+.DS_Store
diff --git a/.idea/data_source_mapping.xml b/.idea/data_source_mapping.xml
diff --git a/.idea/sqldialects.xml b/.idea/sqldialects.xml
diff --git a/py-modules/cli/macrostrat/cli/database/__init__.py b/py-modules/cli/macrostrat/cli/database/__init__.py
@@ -343,6 +343,15 @@ def update_permissions():
 keys = ["username", "host", "port", "password", "database"]
 
 
+@db_app.command(name="refresh-postgrest")
+def refresh_postgrest():
+    """Refresh PostgREST API table definitions"""
+    db = get_db()
+    # PostgREST listens on the `pgrst` channel; this payload makes it reload
+    # its schema cache so new or changed tables and views show up in the API.
+    db.run_sql("NOTIFY pgrst, 'reload schema';")
+
+
 @db_app.command(name="credentials", rich_help_panel="Helpers")
 def connection_details():
     """Show PostgreSQL connection credentials"""

diff --git a/py-modules/cli/macrostrat/cli/schema_management/__init__.py b/py-modules/cli/macrostrat/cli/schema_management/__init__.py
@@ -272,7 +272,7 @@ def dump_schema(schema: str):
 
 
 @schema_app.command(rich_help_panel="Utils")
-def provision():
+def provision(pattern: str = Argument("*")):
     """Apply all schema objects to the database
 
     TODO: filter out non-idempotent statements (table creation, etc.)
@@ -282,6 +282,8 @@ def provision():
     environment = settings.env
 
     counter = StatementCounter(safe=True)
-    apply_schema_for_environment(db, environment, statement_filter=counter.filter)
+    apply_schema_for_environment(
+        db, environment, statement_filter=counter.filter, pattern=pattern
+    )
     db.run_sql("NOTIFY pgrst, 'reload schema';")
     counter.print_report()
diff --git a/py-modules/cli/macrostrat/cli/schema_management/defs.py b/py-modules/cli/macrostrat/cli/schema_management/defs.py
@@ -56,14 +56,18 @@ def apply_schema_for_environment(
     *,
     recursive: bool = True,
     statement_filter=lambda s, p: True,
+    pattern: str = "*",
 ):
+    if "*" not in pattern:
+        pattern = f"*{pattern}*"
+
     for env_dir in schema_dirs_for_environment(env):
         schema_dir = env_dir
         if not schema_dir.exists():
             continue
 
         func = schema_dir.rglob if recursive else schema_dir.glob
-        fixtures = sorted(list(func("*.sql")))
+        fixtures = sorted(list(func(pattern + ".sql")))
         fixtures = [f for f in fixtures if not f.name.endswith(".plan.sql")]
 
         if len(fixtures) == 0:

diff --git a/py-modules/core/macrostrat/core/schemas.py b/py-modules/core/macrostrat/core/schemas.py
@@ -193,6 +193,7 @@ class IngestProcess(Base):
     access_group_id: Mapped[int] = mapped_column(
         ForeignKey("macrostrat_auth.group.id"), nullable=True
     )
+    # TODO remove all object_group_id associations
     object_group_id: Mapped[ObjectGroup] = mapped_column(
         ForeignKey("storage.object_group.id")
     )

diff --git a/py-modules/integrations/macrostrat/integrations/__init__.py b/py-modules/integrations/macrostrat/integrations/__init__.py
@@ -1,18 +1,18 @@
-from typer import Argument
+from typer import Argument, Typer
 
+from macrostrat.core.migrations import run_migrations
+
+from .gbdb import app as gbdb_app
+from .gbdb import update_age_model
+from .schema import IntegrationsBaseSchema
 from .strabospot import populate_strabospot
 
 pipelines = {
     "strabospot": populate_strabospot,
+    "update-gbdb-age-model": update_age_model,
 }
 
 
-from typer import Typer
-
-from macrostrat.core.migrations import run_migrations
-
-from .schema import IntegrationsBaseSchema
-
 app = Typer(
     no_args_is_help=True,
     help="StraboSpot structural geology data system",
@@ -45,3 +45,6 @@ def run(pipeline: str = Argument(None)):
         return
 
     pipelines[pipeline]()
+
+
+app.add_typer(gbdb_app, name="gbdb")
diff --git a/py-modules/integrations/macrostrat/integrations/gbdb/__init__.py b/py-modules/integrations/macrostrat/integrations/gbdb/__init__.py
@@ -0,0 +1,53 @@
+import os
+from pathlib import Path
+
+from typer import Typer
+
+from macrostrat.core.database import get_database
+
+# Typer sub-application that groups the GBDB commands defined in this module;
+# no_args_is_help makes a bare invocation print the help text.
+app = Typer(
+    name="gbdb", no_args_is_help=True, short_help="Geologic map database integration"
+)
+
+
+def update_age_model():
+    """
+    Print the row count of macrostrat_gbdb.strata.
+
+    NOTE(review): no age-model update ("stack units by age") happens here yet.
+    """
+    strata_count_sql = "SELECT count(*) FROM macrostrat_gbdb.strata"
+    print(get_database().run_query(strata_count_sql).scalar())
+
+
+pipeline_dir = Path(__file__).parent / "pipeline"
+# Source-data directory; set GBDB_INGEST_DIR to override the developer-local default.
+_default_ingest_dir = "/Users/Daven/Projects/Macrostrat/Datasets/GBDB workshop"
+ingest_dir = Path(os.environ.get("GBDB_INGEST_DIR", _default_ingest_dir))
+
+
+@app.command()
+def run_pipeline():
+    """
+    Run the data ingestion pipeline
+
+    Executes each .py/.sql/.sh file in the pipeline directory in sorted order.
+    """
+    import subprocess
+    patterns = ("*.py", "*.sql", "*.sh")
+    runnables = sorted(p for pat in patterns for p in pipeline_dir.glob(pat))
+    # The shell steps read their input location from $ROOT_DIR; set it once.
+    os.environ["ROOT_DIR"] = str(ingest_dir)
+    for runnable in runnables:
+        print(f"Running {runnable.name}...")
+        if runnable.suffix == ".py":
+            # NOTE(review): exec() runs the script inside this module's globals.
+            exec(runnable.read_text(), globals())
+        elif runnable.suffix == ".sql":
+            get_database().run_sql(runnable)
+        elif runnable.suffix == ".sh":
+            subprocess.run(["bash", str(runnable)], check=True)
+        else:
+            print(f"Unknown file type: {runnable.suffix}")
diff --git a/py-modules/integrations/macrostrat/integrations/gbdb/explore.sql b/py-modules/integrations/macrostrat/integrations/gbdb/explore.sql
@@ -0,0 +1,81 @@
+-- Exploratory queries against the GBDB strata; counts quoted in comments are as recorded by the author.
+SELECT count(*) FROM macrostrat_api.gbdb_strata WHERE (min_ma IS NOT NULL AND max_ma IS NOT NULL);
+
+-- 121903 strata have an age constraint
+SELECT count(*) FROM macrostrat_api.gbdb_strata WHERE has_age_constraint;
+
+-- 153217 strata do not have an age constraint
+SELECT count(*) FROM macrostrat_api.gbdb_strata WHERE NOT has_age_constraint;
+
+-- 153179 strata do not have an age constraint but have a formation name
+SELECT count(*) FROM macrostrat_api.gbdb_strata WHERE NOT has_age_constraint and (formation IS NOT NULL);
+
+
+
+-- 5742 formations mentioned that do not have an age constraint defined
+SELECT count(DISTINCT formation)
+FROM macrostrat_api.gbdb_strata
+WHERE NOT has_age_constraint
+  AND (formation IS NOT NULL);
+
+-- The names of those formations (named, but without an age constraint)
+SELECT DISTINCT (formation) formation
+FROM macrostrat_api.gbdb_strata
+WHERE NOT has_age_constraint
+  AND (formation IS NOT NULL);
+
+-- Interval returned for the age range 100-140 (units presumably Ma; confirm)
+SELECT * FROM macrostrat.intervals WHERE id = macrostrat_api.interval_for_age_range(100, 140);
+
+-- Number of sections that have an age constraint
+SELECT count(*) FROM macrostrat_gbdb.sections WHERE has_age_constraint;
+
+
+-- WITH duplicate_units AS (SELECT unit_id, section_id, COUNT(*)
+--                FROM macrostrat_api.gbdb_strata
+--                GROUP BY unit_id, section_id
+--                HAVING COUNT(*) > 1
+--                ORDER BY count DESC)
+-- SELECT unit_id, array_agg(depth_scale) FROM duplicate_units
+-- JOIN macrostrat_api.gbdb_strata USING (unit_id, section_id)
+-- GROUP BY unit_id;
+--
+-- WITH duplicate_units AS (SELECT unit_id, section_id, COUNT(*)
+--                          FROM macrostrat_api.gbdb_strata
+--                          GROUP BY unit_id, section_id
+--                          HAVING COUNT(*) > 1
+--                          ORDER BY count DESC)
+-- SELECT * FROM duplicate_units
+-- JOIN macrostrat_api.gbdb_strata USING (unit_id, section_id);
+
+-- Preview of the SELECT that pipeline/03-fill-tables.sql inserts into macrostrat_gbdb.summary_units
+WITH col_sections AS (SELECT section_id, sc.id col_id
+                      FROM macrostrat_gbdb.sections s
+                             JOIN macrostrat_gbdb.summary_columns sc
+                                  ON ST_Intersects(ST_SetSRID(ST_MakePoint(lng, lat), 4326), sc.geometry)
+                      WHERE has_age_constraint)
+SELECT
+    row_number() OVER () unit_id,
+    col_id,
+    f.formation unit_name,
+    min_ma t_age,
+    max_ma b_age
+FROM macrostrat_api.gbdb_formations f
+       JOIN col_sections cs ON cs.section_id = f.section_id
+WHERE f.min_ma IS NOT NULL AND f.max_ma IS NOT NULL
+GROUP BY col_id, f.formation, min_ma, max_ma;
+-- Share of strata per age_source; the alias belongs on the ratio column, not inside the scalar subquery
+SELECT age_source, count(*), count(*)::numeric/(SELECT count(*) FROM macrostrat_api.gbdb_strata_with_age_model) proportion FROM macrostrat_api.gbdb_strata_with_age_model GROUP BY age_source;
+
+SELECT * FROM macrostrat_api.gbdb_formations WHERE formation ILIKE '%Fangyan%';
+
+SELECT * FROM macrostrat_api.gbdb_formations WHERE min_ma IS null ORDER BY formation;
+
+SELECT count(*), round(count(*)::numeric/(SELECT count(*) FROM macrostrat_api.gbdb_strata), 2) proportion FROM macrostrat_api.gbdb_strata WHERE min_ma IS NOT NULL AND max_ma IS NOT NULL;
+
+-- NOTE(review): the denominator below counts all strata, not only those in China; confirm this is intended.
+SELECT age_source, count(*), round(count(*)::numeric/(SELECT count(*) FROM macrostrat_gbdb.strata), 2) proportion FROM macrostrat_api.gbdb_strata_with_age_model WHERE country = 'China' GROUP BY age_source ;
+
+-- Age control
+-- DISTINCT ON without ORDER BY keeps an unspecified row per name_clean.
+SELECT DISTINCT ON (name_clean) * FROM macrostrat_gbdb.external_age_control;
diff --git a/py-modules/integrations/macrostrat/integrations/gbdb/export-tables.sh b/py-modules/integrations/macrostrat/integrations/gbdb/export-tables.sh
@@ -0,0 +1,5 @@
+#!/usr/bin/env bash
+# Export macrostrat_api.gbdb_strata_with_age_model as CSV into the current directory.
+set -e  # kept out of the shebang: Linux passes "bash -e" to env as a single argument
+q="SELECT * FROM macrostrat_api.gbdb_strata_with_age_model"
+macrostrat db psql -c "COPY ($q) TO STDOUT WITH CSV DELIMITER ',' HEADER" > gbdb-strata-with-linear-age-model-v2.csv
diff --git a/py-modules/integrations/macrostrat/integrations/gbdb/ingest_strata.py b/py-modules/integrations/macrostrat/integrations/gbdb/ingest_strata.py
@@ -0,0 +1,49 @@
+from pathlib import Path
+
+import numpy as N
+from IPython import embed
+from pandas import DataFrame, read_csv
+
+from macrostrat.core.database import get_database
+
+
+def ingest_strata_next(source: Path):
+    """Read a strata CSV and hand each section's rows to ``ingest_column``.
+
+    Blank strings are normalised to NaN, and the "GBDB" project is looked up
+    (or created) before the per-section loop starts.
+    """
+    strata = read_csv(Path(source))
+    strata.replace("", N.nan, inplace=True)
+
+    unique_sections = strata["section_id"].unique()
+    print(f"Found {len(unique_sections)} unique sections")
+
+    # The result is not used yet, but the call may insert the project row.
+    project_id = get_or_create_project(get_database(), "GBDB")
+
+    for current_section in unique_sections:
+        in_section = strata["section_id"] == current_section
+        ingest_column(strata[in_section])
+
+
+def ingest_column(df: DataFrame):
+    """Ingest the rows of one section from ``df`` (not implemented yet)."""
+    # Fail with an explicit error rather than opening a debug shell and hitting
+    # a bare ``raise``, which outside an ``except`` block is itself a
+    # RuntimeError. NotImplementedError subclasses RuntimeError.
+    raise NotImplementedError("GBDB column ingestion is not implemented yet")
+
+
+def get_or_create_project(db, name: str):
+    """Return the id of project ``name``, inserting the row if it is missing."""
+    params = {"name": name}
+    # .scalar() yields the bare id (or None when no row matches), not a Row.
+    project_id = db.run_query(
+        "SELECT id FROM macrostrat.projects WHERE project = :name", params
+    ).scalar()
+    if project_id is not None:
+        return project_id
+    return db.run_query(
+        "INSERT INTO macrostrat.projects (project) VALUES (:name) RETURNING id", params
+    ).scalar()
diff --git a/py-modules/integrations/macrostrat/integrations/gbdb/pipeline/01-import-lex.sh b/py-modules/integrations/macrostrat/integrations/gbdb/pipeline/01-import-lex.sh
@@ -0,0 +1,11 @@
+#!/usr/bin/env bash
+# Reload the GBDB lexicon tables from the CSV files under $ROOT_DIR.
+set -euo pipefail  # not in the shebang: Linux passes "bash -e" to env as one argument
+root_dir="${ROOT_DIR:?ROOT_DIR must point at the GBDB source data directory}"
+
+macrostrat db psql -c "TRUNCATE TABLE macrostrat_gbdb.chinalex"
+macrostrat db psql -c "COPY macrostrat_gbdb.chinalex FROM STDIN WITH (FORMAT CSV, HEADER true)" \
+  < "$root_dir/2024-08-18_chinalex.csv"
+macrostrat db psql -c "TRUNCATE TABLE macrostrat_gbdb.stratigraphic_dictionary_llm"
+macrostrat db psql -c 'COPY macrostrat_gbdb.stratigraphic_dictionary_llm FROM STDIN WITH (FORMAT CSV, HEADER true)' \
+  < "$root_dir/stratigraphic-dictionary-llm-extraction.csv"
diff --git a/py-modules/integrations/macrostrat/integrations/gbdb/pipeline/02-import-strata.sh b/py-modules/integrations/macrostrat/integrations/gbdb/pipeline/02-import-strata.sh
@@ -0,0 +1,4 @@
+#!/usr/bin/env bash
+set -euo pipefail  # abort on an unset $ROOT_DIR or a failed COPY
+macrostrat db psql -c 'COPY macrostrat_gbdb.strata FROM STDIN WITH (FORMAT CSV, HEADER true, FORCE_NULL (unit_thickness))' \
+  < "$ROOT_DIR/geological_strata.csv"
diff --git a/py-modules/integrations/macrostrat/integrations/gbdb/pipeline/03-fill-tables.sql b/py-modules/integrations/macrostrat/integrations/gbdb/pipeline/03-fill-tables.sql
@@ -0,0 +1,57 @@
+-- Normalize imported strata: store empty-string member/formation/epoch values as NULL
+UPDATE macrostrat_gbdb.strata SET member = null WHERE member = '';
+UPDATE macrostrat_gbdb.strata SET formation = null WHERE formation = '';
+UPDATE macrostrat_gbdb.strata SET epoch = null WHERE epoch = '';
+
+-- Rebuild sections: one row per (section_id, lng, lat), aged from formation-name matches in best_external_age_control
+TRUNCATE TABLE macrostrat_gbdb.sections;
+WITH a AS (SELECT section_id,
+                  lng,
+                  lat,
+                  count(ac.t_age) > 0 has_age_constraint,
+                  MIN(ac.t_age)                 min_ma,
+                  MAX(ac.b_age)                 max_ma
+  FROM macrostrat_gbdb.strata s
+  LEFT JOIN macrostrat_gbdb.best_external_age_control ac
+                            ON lower(s.formation) = lower(ac.name_clean)
+                              AND lower(s.formation) != 'unknown'
+           GROUP BY section_id, lng, lat
+)
+INSERT INTO macrostrat_gbdb.sections  -- NOTE(review): no column list, so this relies on the table's column order
+SELECT *,
+       macrostrat_api.color_for_age_range(a.min_ma, a.max_ma) color
+FROM a;
+-- Rebuild summary columns: size-1 hexagons (SRID 4326 units) that intersect at least one age-constrained section point
+TRUNCATE TABLE macrostrat_gbdb.summary_columns;
+WITH hexgrid AS (
+  SELECT ST_HexagonGrid(1, ST_MakeEnvelope(-180, -90, 180, 90, 4326)) AS hex
+)
+INSERT INTO macrostrat_gbdb.summary_columns
+SELECT
+    row_number() OVER () id,
+    ST_ForceRHR((hex).geom) geometry
+FROM hexgrid
+WHERE ST_Intersects((hex).geom, (
+    SELECT ST_Union(ST_SetSRID(ST_MakePoint(lng, lat), 4326)) FROM macrostrat_gbdb.sections WHERE has_age_constraint
+  )
+);
+
+-- Rebuild summary units: distinct (column, formation, min_ma, max_ma) rows for age-constrained sections in each summary column
+TRUNCATE TABLE macrostrat_gbdb.summary_units;
+WITH col_sections AS (
+  SELECT section_id, sc.id col_id
+  FROM macrostrat_gbdb.sections s
+         JOIN macrostrat_gbdb.summary_columns sc
+              ON ST_Intersects(ST_SetSRID(ST_MakePoint(lng, lat), 4326), sc.geometry)
+  WHERE has_age_constraint)
+INSERT INTO macrostrat_gbdb.summary_units
+SELECT
+    row_number() OVER () unit_id,
+    col_id,
+    f.formation unit_name,
+    min_ma t_age,
+    max_ma b_age
+FROM macrostrat_api.gbdb_formations f
+       JOIN col_sections cs ON cs.section_id = f.section_id
+WHERE f.min_ma IS NOT NULL AND f.max_ma IS NOT NULL
+GROUP BY col_id, f.formation, min_ma, max_ma;