Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
56b4315
updating file management schemas and standardizing routes
amyfromandi Dec 23, 2025
d122701
cli file management working. will refactor for the api
amyfromandi Dec 23, 2025
b6690a2
updating file management
amyfromandi Dec 24, 2025
619bbf5
updated s3 file management process
amyfromandi Jan 5, 2026
fbdc83c
Pull in GBDB strata
davenquinn Sep 24, 2025
de53223
Update SQL files
davenquinn Sep 24, 2025
7d009d1
Colors
davenquinn Sep 24, 2025
509fd7f
Strata with age model
davenquinn Sep 24, 2025
260b4e8
Create some column summary tables
davenquinn Sep 25, 2025
e2ff2dc
Update proportions
davenquinn Sep 25, 2025
5822eba
Add a note
davenquinn Sep 26, 2025
6434287
Update lexicon import
davenquinn Sep 26, 2025
37a2615
Updated age model
davenquinn Sep 26, 2025
a316ac1
Updated table creation code
davenquinn Sep 26, 2025
97ab839
Update table info
davenquinn Sep 26, 2025
fd125f6
Updated SQL scripts
davenquinn Sep 28, 2025
47fd5f4
Starting point for GBDB CLI
davenquinn Sep 29, 2025
7c05048
Updated GBDB ingestion scripts
davenquinn Oct 2, 2025
ac866e5
added convert-e00 pipeline for Doug's maps
amyfromandi Jan 5, 2026
0b2c356
Move files around
davenquinn Jan 5, 2026
d7fa922
Update schema for new file-management approach
davenquinn Jan 5, 2026
a1a5906
Improve GBDB migration capabilities
davenquinn Jan 6, 2026
c6a4776
Updated GBDB ingestion pipeline utils
davenquinn Jan 6, 2026
7dba2d7
Prepare for provisioning in dev
davenquinn Jan 6, 2026
95b18b7
Updated SQL dialects
davenquinn Jan 6, 2026
57533b3
Format code and sort imports
davenquinn Jan 6, 2026
eea9e8a
Merge pull request #246 from UW-Macrostrat/gbdb-integration
davenquinn Jan 6, 2026
115fe2f
updated standard pipeline
amyfromandi Jan 6, 2026
95d962f
Merge branch 'file_management' of github.com:UW-Macrostrat/macrostrat…
amyfromandi Jan 6, 2026
99b9fba
updating code so the cli runs
amyfromandi Jan 6, 2026
57261dd
Format code and sort imports
amyfromandi Jan 6, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@ __pycache__
macrostrat.toml
*.egg-info
*.pyc
.DS_Store
2 changes: 1 addition & 1 deletion .idea/data_source_mapping.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion .idea/sqldialects.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

9 changes: 9 additions & 0 deletions py-modules/cli/macrostrat/cli/database/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -343,6 +343,15 @@ def update_permissions():
keys = ["username", "host", "port", "password", "database"]


@db_app.command(name="refresh-postgrest")
def refresh_postgrest():
    """
    Refresh PostgREST API table definitions

    Issues ``NOTIFY pgrst, 'reload schema'`` on the configured database.
    """
    db = get_db()
    db.run_sql("NOTIFY pgrst, 'reload schema';")


@db_app.command(name="credentials", rich_help_panel="Helpers")
def connection_details():
"""Show PostgreSQL connection credentials"""
Expand Down
6 changes: 4 additions & 2 deletions py-modules/cli/macrostrat/cli/schema_management/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -272,7 +272,7 @@ def dump_schema(schema: str):


@schema_app.command(rich_help_panel="Utils")
def provision():
def provision(pattern: str = Argument("*")):
"""Apply all schema objects to the database

TODO: filter out non-idempotent statements (table creation, etc.)
Expand All @@ -282,6 +282,8 @@ def provision():
environment = settings.env

counter = StatementCounter(safe=True)
apply_schema_for_environment(db, environment, statement_filter=counter.filter)
apply_schema_for_environment(
db, environment, statement_filter=counter.filter, pattern=pattern
)
db.run_sql("NOTIFY pgrst, 'reload schema';")
counter.print_report()
6 changes: 5 additions & 1 deletion py-modules/cli/macrostrat/cli/schema_management/defs.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,14 +56,18 @@ def apply_schema_for_environment(
*,
recursive: bool = True,
statement_filter=lambda s, p: True,
pattern: str = "*",
):
if "*" not in pattern:
pattern = f"*{pattern}*"

for env_dir in schema_dirs_for_environment(env):
schema_dir = env_dir
if not schema_dir.exists():
continue

func = schema_dir.rglob if recursive else schema_dir.glob
fixtures = sorted(list(func("*.sql")))
fixtures = sorted(list(func(pattern + ".sql")))
fixtures = [f for f in fixtures if not f.name.endswith(".plan.sql")]

if len(fixtures) == 0:
Expand Down
1 change: 1 addition & 0 deletions py-modules/core/macrostrat/core/schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,7 @@ class IngestProcess(Base):
access_group_id: Mapped[int] = mapped_column(
ForeignKey("macrostrat_auth.group.id"), nullable=True
)
# TODO remove all object_group_id associations
object_group_id: Mapped[ObjectGroup] = mapped_column(
ForeignKey("storage.object_group.id")
)
Expand Down
17 changes: 10 additions & 7 deletions py-modules/integrations/macrostrat/integrations/__init__.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,18 @@
from typer import Argument
from typer import Argument, Typer

from macrostrat.core.migrations import run_migrations

from .gbdb import app as gbdb_app
from .gbdb import update_age_model
from .schema import IntegrationsBaseSchema
from .strabospot import populate_strabospot

pipelines = {
"strabospot": populate_strabospot,
"update-gbdb-age-model": update_age_model,
}


from typer import Typer

from macrostrat.core.migrations import run_migrations

from .schema import IntegrationsBaseSchema

app = Typer(
no_args_is_help=True,
help="StraboSpot structural geology data system",
Expand Down Expand Up @@ -45,3 +45,6 @@ def run(pipeline: str = Argument(None)):
return

pipelines[pipeline]()


app.add_typer(gbdb_app, name="gbdb")
53 changes: 53 additions & 0 deletions py-modules/integrations/macrostrat/integrations/gbdb/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
import os
from pathlib import Path

from typer import Typer

from macrostrat.core.database import get_database

# Typer sub-application that groups the GBDB commands defined in this module.
# NOTE(review): short_help describes a "Geologic map database"; GBDB presumably
# stands for something else (the data handled here are strata and sections) —
# confirm the wording with the author.
app = Typer(
name="gbdb",
no_args_is_help=True,
short_help="Geologic map database integration",
)


def update_age_model():
    """
    Print the number of rows in ``macrostrat_gbdb.strata``.

    At present this only runs a row count and prints the result; nothing is
    written to the database.
    """
    count_query = "SELECT count(*) FROM macrostrat_gbdb.strata"
    strata_count = get_database().run_query(count_query).scalar()
    print(strata_count)


# Directory holding the pipeline steps, located next to this module.
pipeline_dir = Path(__file__).parent / "pipeline"

# Location of the raw GBDB datasets. The default is a developer-specific path
# kept for backward compatibility; set GBDB_INGEST_DIR to use another location.
ingest_dir = Path(
    os.environ.get(
        "GBDB_INGEST_DIR",
        "/Users/Daven/Projects/Macrostrat/Datasets/GBDB workshop",
    )
)


@app.command()
def run_pipeline():
    """
    Run the data ingestion pipeline

    Every ``.py``, ``.sql`` and ``.sh`` file directly inside ``pipeline_dir``
    is run once, in sorted path order. ``ROOT_DIR`` is exported to the
    process environment first so that the steps can locate the datasets.
    """
    import subprocess

    # Loop-invariant: export the dataset location once for all steps.
    os.environ["ROOT_DIR"] = str(ingest_dir)

    runnables = []
    for ext in [".py", ".sql", ".sh"]:
        runnables.extend(pipeline_dir.glob(f"*{ext}"))

    for runnable in sorted(runnables):
        print(f"Running {runnable.name}...")
        if runnable.suffix == ".py":
            # NOTE: the script runs inside this module's namespace, so only
            # trusted files belong in pipeline_dir. Compiling with the file
            # name makes tracebacks point at the script instead of "<string>".
            code = compile(runnable.read_text(), str(runnable), "exec")
            exec(code, globals())
        elif runnable.suffix == ".sql":
            db = get_database()
            db.run_sql(runnable)
        elif runnable.suffix == ".sh":
            # check=True aborts the pipeline when a shell step fails.
            subprocess.run(["bash", str(runnable)], check=True)
        else:
            # Reached when the glob matched but the suffix differs, e.g. a
            # differently-cased extension on a case-insensitive filesystem.
            print(f"Unknown file type: {runnable.suffix}")
81 changes: 81 additions & 0 deletions py-modules/integrations/macrostrat/integrations/gbdb/explore.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
-- Ad hoc exploratory queries against the GBDB tables and views.

-- Number of strata with both age bounds populated
SELECT count(*) FROM macrostrat_api.gbdb_strata WHERE (min_ma IS NOT NULL AND max_ma IS NOT NULL);


-- 121903 strata have an age constraint
SELECT count(*) FROM macrostrat_api.gbdb_strata WHERE has_age_constraint;

-- 153217 strata do not have an age constraint
SELECT count(*) FROM macrostrat_api.gbdb_strata WHERE NOT has_age_constraint;

-- 153179 strata do not have an age constraint but have a formation name
SELECT count(*) FROM macrostrat_api.gbdb_strata WHERE NOT has_age_constraint and (formation IS NOT NULL);



-- 5742 formations mentioned that do not have an age constraint defined
SELECT count(DISTINCT formation)
FROM macrostrat_api.gbdb_strata
WHERE NOT has_age_constraint
AND (formation IS NOT NULL);


-- List the distinct formation names counted by the previous query
SELECT DISTINCT (formation) formation
FROM macrostrat_api.gbdb_strata
WHERE NOT has_age_constraint
AND (formation IS NOT NULL);


-- Interval whose id is returned by interval_for_age_range for the range 100-140
-- (presumably Ma; confirm against the function definition)
SELECT * FROM macrostrat.intervals WHERE id = macrostrat_api.interval_for_age_range(100, 140);


-- Number of sections flagged as having an age constraint
SELECT count(*) FROM macrostrat_gbdb.sections WHERE has_age_constraint;


-- WITH duplicate_units AS (SELECT unit_id, section_id, COUNT(*)
-- FROM macrostrat_api.gbdb_strata
-- GROUP BY unit_id, section_id
-- HAVING COUNT(*) > 1
-- ORDER BY count DESC)
-- SELECT unit_id, array_agg(depth_scale) FROM duplicate_units
-- JOIN macrostrat_api.gbdb_strata USING (unit_id, section_id)
-- GROUP BY unit_id;
--
-- WITH duplicate_units AS (SELECT unit_id, section_id, COUNT(*)
-- FROM macrostrat_api.gbdb_strata
-- GROUP BY unit_id, section_id
-- HAVING COUNT(*) > 1
-- ORDER BY count DESC)
-- SELECT * FROM duplicate_units
-- JOIN macrostrat_api.gbdb_strata USING (unit_id, section_id);


-- Formation-level units per summary column.
-- col_sections pairs each age-constrained section with every summary column
-- whose geometry intersects the section's (lng, lat) point.
-- The outer query keeps formations that have both age bounds and collapses
-- them to one row per (col_id, formation, min_ma, max_ma); row_number() is
-- evaluated after grouping, so unit_id numbers the grouped rows.
WITH col_sections AS (SELECT section_id, sc.id col_id
FROM macrostrat_gbdb.sections s
JOIN macrostrat_gbdb.summary_columns sc
ON ST_Intersects(ST_SetSRID(ST_MakePoint(lng, lat), 4326), sc.geometry)
WHERE has_age_constraint)
SELECT
row_number() OVER () unit_id,
col_id,
f.formation unit_name,
min_ma t_age,
max_ma b_age
FROM macrostrat_api.gbdb_formations f
JOIN col_sections cs ON cs.section_id = f.section_id
WHERE f.min_ma IS NOT NULL AND f.max_ma IS NOT NULL
GROUP BY col_id, f.formation, min_ma, max_ma;

-- Row count and share of all rows for each age source.
-- The "proportion" alias belongs on the outer expression; inside the scalar
-- subquery it had no effect and left the result column unnamed.
SELECT age_source, count(*), count(*)::numeric/(SELECT count(*) FROM macrostrat_api.gbdb_strata_with_age_model) proportion FROM macrostrat_api.gbdb_strata_with_age_model GROUP BY age_source;

-- Formations whose name contains "Fangyan" (case-insensitive)
SELECT * FROM macrostrat_api.gbdb_formations WHERE formation ILIKE '%Fangyan%';

-- Formations without a minimum age, ordered by name
SELECT * FROM macrostrat_api.gbdb_formations WHERE min_ma IS null ORDER BY formation;

-- Count and share (2 decimals) of strata that have both age bounds
SELECT count(*), round(count(*)::numeric/(SELECT count(*) FROM macrostrat_api.gbdb_strata), 2) proportion FROM macrostrat_api.gbdb_strata WHERE min_ma IS NOT NULL AND max_ma IS NOT NULL;


-- Per-age-source counts for strata in China.
-- NOTE(review): the denominator is the total of macrostrat_gbdb.strata, not
-- only the rows for China; confirm that this is the intended proportion.
SELECT age_source, count(*), round(count(*)::numeric/(SELECT count(*) FROM macrostrat_gbdb.strata), 2) proportion FROM macrostrat_api.gbdb_strata_with_age_model WHERE country = 'China' GROUP BY age_source ;

-- Age control

-- One row per distinct name_clean (which row is kept is unspecified without ORDER BY)
SELECT DISTINCT ON (name_clean) * FROM macrostrat_gbdb.external_age_control;
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
#!/usr/bin/env bash
# Export macrostrat_api.gbdb_strata_with_age_model as a CSV file (with header)
# into the current working directory.
#
# errexit is enabled with `set` rather than in the shebang: on Linux, `env`
# receives "bash -e" as a single program name and fails, and the flag is
# ignored entirely when the script is started as `bash <file>`.
set -e

q="SELECT * FROM macrostrat_api.gbdb_strata_with_age_model"
macrostrat db psql -c "COPY ($q) TO STDOUT WITH CSV DELIMITER ',' HEADER" > gbdb-strata-with-linear-age-model-v2.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
from pathlib import Path

import numpy as N
from IPython import embed
from pandas import DataFrame, read_csv

from macrostrat.core.database import get_database


def ingest_strata_next(source: Path):
    """Read a strata CSV and hand each section's rows to ``ingest_column``.

    Args:
        source: Path of the CSV file; it must contain a ``section_id`` column.

    Rows are grouped by ``section_id`` in a single pass, in order of first
    appearance. Rows without a ``section_id`` do not belong to any group and
    are therefore not passed on.
    """
    df = read_csv(Path(source))
    # Set empty strings to NaN
    df.replace("", N.nan, inplace=True)

    # One grouping pass instead of re-filtering the whole frame per section;
    # sort=False keeps the sections in the order they occur in the file.
    sections = df.groupby("section_id", sort=False)

    print(f"Found {sections.ngroups} unique sections")

    db = get_database()
    # Called so that the GBDB project row exists; the returned value is not
    # used yet.
    get_or_create_project(db, "GBDB")

    for _, section_df in sections:
        ingest_column(section_df)


def ingest_column(df: DataFrame):
    """Ingest the strata rows of a single section (not implemented yet).

    Args:
        df: Rows belonging to one section; must contain at least one row.

    Raises:
        NotImplementedError: Always, after the interactive session ends. It is
            a subclass of ``RuntimeError``, which the previous bare ``raise``
            produced, so existing handlers keep working.
    """
    # Get the section name and other parameters
    first_row = df.iloc[0]

    # Development hook: opens an IPython shell in which ``df`` and
    # ``first_row`` can be inspected.
    embed()
    raise NotImplementedError("GBDB column ingestion is not implemented yet")


def get_or_create_project(db, name: str):
    """Fetch the ``macrostrat.projects`` row for *name*, inserting it if absent.

    Args:
        db: Database handle exposing ``run_query(sql, params)``.
        name: Value matched against (or inserted into) the ``project`` column.

    Returns:
        The object returned by ``fetchone()`` for the ``id`` column, i.e. a
        result row rather than a bare identifier.
    """
    existing = db.run_query(
        "SELECT id FROM macrostrat.projects WHERE project = :name",
        {"name": name},
    ).fetchone()
    if existing is None:
        # No such project yet: create it and hand back the new row.
        return db.run_query(
            "INSERT INTO macrostrat.projects (project) VALUES (:name) RETURNING id",
            {"name": name},
        ).fetchone()
    return existing
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#!/usr/bin/env bash
# Reload macrostrat_gbdb.chinalex and macrostrat_gbdb.stratigraphic_dictionary_llm
# from CSV files located under $ROOT_DIR.
#
# errexit is enabled with `set` rather than in the shebang: on Linux, `env`
# receives "bash -e" as a single program name and fails, and the flag is
# ignored when the script is started as `bash <file>`. pipefail makes a
# failing `cat` (e.g. a missing CSV) fail the step instead of being masked
# by the exit status of psql.
set -eo pipefail

# Stop before truncating anything when the dataset location is not set.
root_dir=${ROOT_DIR:?ROOT_DIR must point at the GBDB dataset directory}

macrostrat db psql -c "TRUNCATE TABLE macrostrat_gbdb.chinalex"
cat "$root_dir/2024-08-18_chinalex.csv" \
  | macrostrat db psql -c "COPY macrostrat_gbdb.chinalex FROM STDIN WITH (FORMAT CSV, HEADER true)"

macrostrat db psql -c "TRUNCATE TABLE macrostrat_gbdb.stratigraphic_dictionary_llm"
cat "$root_dir/stratigraphic-dictionary-llm-extraction.csv" \
  | macrostrat db psql -c 'COPY macrostrat_gbdb.stratigraphic_dictionary_llm FROM STDIN WITH (FORMAT CSV, HEADER true)'
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#!/usr/bin/env bash
# Load geological_strata.csv from $ROOT_DIR into macrostrat_gbdb.strata.
#
# pipefail: a pipeline otherwise reports only the exit status of its last
# command, so a missing or unreadable CSV would go unnoticed.
set -eo pipefail

cat "$ROOT_DIR/geological_strata.csv" \
  | macrostrat db psql -c 'COPY macrostrat_gbdb.strata FROM STDIN WITH (FORMAT CSV, HEADER true, FORCE_NULL (unit_thickness))'
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
-- Some fixes to imported strata:
-- turn empty strings in member, formation and epoch into NULL.
UPDATE macrostrat_gbdb.strata SET member = null WHERE member = '';
UPDATE macrostrat_gbdb.strata SET formation = null WHERE formation = '';
UPDATE macrostrat_gbdb.strata SET epoch = null WHERE epoch = '';


-- Rebuild macrostrat_gbdb.sections: one row per (section_id, lng, lat).
-- Strata are matched to best_external_age_control by case-insensitive
-- formation name; formations named 'unknown' never match. Because this is a
-- LEFT JOIN, sections without any match are kept with NULL ages.
--   has_age_constraint: true when at least one matched row has a t_age
--   min_ma / max_ma:    smallest t_age / largest b_age among the matches
--   color:              macrostrat_api.color_for_age_range(min_ma, max_ma)
-- NOTE(review): INSERT ... SELECT * relies on the column order of the target
-- table matching the CTE output followed by color — confirm on schema changes.
TRUNCATE TABLE macrostrat_gbdb.sections;
WITH a AS (SELECT section_id,
lng,
lat,
count(ac.t_age) > 0 has_age_constraint,
MIN(ac.t_age) min_ma,
MAX(ac.b_age) max_ma
FROM macrostrat_gbdb.strata s
LEFT JOIN macrostrat_gbdb.best_external_age_control ac
ON lower(s.formation) = lower(ac.name_clean)
AND lower(s.formation) != 'unknown'
GROUP BY section_id, lng, lat
)
INSERT INTO macrostrat_gbdb.sections
SELECT *,
macrostrat_api.color_for_age_range(a.min_ma, a.max_ma) color
FROM a;

-- Rebuild macrostrat_gbdb.summary_columns from a hexagonal grid.
-- The grid uses a cell size of 1 over the envelope (-180, -90, 180, 90) in
-- SRID 4326 (presumably degrees, given the SRID — confirm). Only hexagons
-- that intersect the union of the points of age-constrained sections are
-- kept; ids are assigned with row_number().
TRUNCATE TABLE macrostrat_gbdb.summary_columns;
WITH hexgrid AS (
SELECT ST_HexagonGrid(1, ST_MakeEnvelope(-180, -90, 180, 90, 4326)) AS hex
)
INSERT INTO macrostrat_gbdb.summary_columns
SELECT
row_number() OVER () id,
ST_ForceRHR((hex).geom) geometry
FROM hexgrid
WHERE ST_Intersects((hex).geom, (
SELECT ST_Union(ST_SetSRID(ST_MakePoint(lng, lat), 4326)) FROM macrostrat_gbdb.sections WHERE has_age_constraint
)
);


-- Rebuild macrostrat_gbdb.summary_units.
-- col_sections pairs each age-constrained section with every summary column
-- whose geometry intersects the section's (lng, lat) point.
-- Formations from macrostrat_api.gbdb_formations that have both age bounds
-- are then collapsed to one unit per (col_id, formation, min_ma, max_ma);
-- row_number() is evaluated after grouping, so unit_id numbers the grouped rows.
TRUNCATE TABLE macrostrat_gbdb.summary_units;
WITH col_sections AS (
SELECT section_id, sc.id col_id
FROM macrostrat_gbdb.sections s
JOIN macrostrat_gbdb.summary_columns sc
ON ST_Intersects(ST_SetSRID(ST_MakePoint(lng, lat), 4326), sc.geometry)
WHERE has_age_constraint)
INSERT INTO macrostrat_gbdb.summary_units
SELECT
row_number() OVER () unit_id,
col_id,
f.formation unit_name,
min_ma t_age,
max_ma b_age
FROM macrostrat_api.gbdb_formations f
JOIN col_sections cs ON cs.section_id = f.section_id
WHERE f.min_ma IS NOT NULL AND f.max_ma IS NOT NULL
GROUP BY col_id, f.formation, min_ma, max_ma;
Loading