Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .idea/data_source_mapping.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion .idea/sqldialects.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

9 changes: 9 additions & 0 deletions py-modules/cli/macrostrat/cli/database/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -343,6 +343,15 @@ def update_permissions():
keys = ["username", "host", "port", "password", "database"]


@db_app.command(name="refresh-postgrest")
def refresh_postgrest():
    """
    Refresh PostgREST API table definitions
    """
    db = get_db()
    # PostgREST reloads its schema cache when it receives this notification on
    # the `pgrst` channel.
    db.run_sql("NOTIFY pgrst, 'reload schema';")


@db_app.command(name="credentials", rich_help_panel="Helpers")
def connection_details():
"""Show PostgreSQL connection credentials"""
Expand Down
6 changes: 4 additions & 2 deletions py-modules/cli/macrostrat/cli/schema_management/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -272,7 +272,7 @@ def dump_schema(schema: str):


@schema_app.command(rich_help_panel="Utils")
def provision():
def provision(pattern: str = Argument("*")):
"""Apply all schema objects to the database

TODO: filter out non-idempotent statements (table creation, etc.)
Expand All @@ -282,6 +282,8 @@ def provision():
environment = settings.env

counter = StatementCounter(safe=True)
apply_schema_for_environment(db, environment, statement_filter=counter.filter)
apply_schema_for_environment(
db, environment, statement_filter=counter.filter, pattern=pattern
)
db.run_sql("NOTIFY pgrst, 'reload schema';")
counter.print_report()
6 changes: 5 additions & 1 deletion py-modules/cli/macrostrat/cli/schema_management/defs.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,14 +56,18 @@ def apply_schema_for_environment(
*,
recursive: bool = True,
statement_filter=lambda s, p: True,
pattern: str = "*",
):
if "*" not in pattern:
pattern = f"*{pattern}*"

for env_dir in schema_dirs_for_environment(env):
schema_dir = env_dir
if not schema_dir.exists():
continue

func = schema_dir.rglob if recursive else schema_dir.glob
fixtures = sorted(list(func("*.sql")))
fixtures = sorted(list(func(pattern + ".sql")))
fixtures = [f for f in fixtures if not f.name.endswith(".plan.sql")]

if len(fixtures) == 0:
Expand Down
17 changes: 10 additions & 7 deletions py-modules/integrations/macrostrat/integrations/__init__.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,18 @@
from typer import Argument
from typer import Argument, Typer

from macrostrat.core.migrations import run_migrations

from .gbdb import app as gbdb_app
from .gbdb import update_age_model
from .schema import IntegrationsBaseSchema
from .strabospot import populate_strabospot

pipelines = {
"strabospot": populate_strabospot,
"update-gbdb-age-model": update_age_model,
}


from typer import Typer

from macrostrat.core.migrations import run_migrations

from .schema import IntegrationsBaseSchema

app = Typer(
no_args_is_help=True,
help="StraboSpot structural geology data system",
Expand Down Expand Up @@ -45,3 +45,6 @@ def run(pipeline: str = Argument(None)):
return

pipelines[pipeline]()


app.add_typer(gbdb_app, name="gbdb")
53 changes: 53 additions & 0 deletions py-modules/integrations/macrostrat/integrations/gbdb/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
import os
from pathlib import Path

from typer import Typer

from macrostrat.core.database import get_database

# Typer sub-application holding the GBDB integration commands.
# NOTE(review): "gbdb" presumably stands for the Geobiodiversity Database, in
# which case the "Geologic map database" wording of `short_help` looks
# copy-pasted from another integration — confirm and correct the help text.
app = Typer(
    name="gbdb",
    no_args_is_help=True,
    short_help="Geologic map database integration",
)


def update_age_model():
    """
    Stack units by age

    At present this only queries and prints the number of rows in
    ``macrostrat_gbdb.strata``.
    """
    database = get_database()

    strata_count_query = "SELECT count(*) FROM macrostrat_gbdb.strata"
    result = database.run_query(strata_count_query)
    print(result.scalar())


# Directory holding the pipeline steps, located next to this module.
pipeline_dir = Path(__file__).parent / "pipeline"

# Directory holding the raw GBDB input files. Set GBDB_INGEST_DIR to point at a
# local copy of the dataset; the fallback is the original hard-coded developer
# path, kept so that existing setups behave exactly as before.
ingest_dir = Path(
    os.environ.get("GBDB_INGEST_DIR")
    or "/Users/Daven/Projects/Macrostrat/Datasets/GBDB workshop"
)


@app.command()
def run_pipeline():
    """
    Run the data ingestion pipeline
    """
    import subprocess

    # Collect every runnable step; steps execute in sorted path order.
    runnables = sorted(
        path
        for ext in (".py", ".sql", ".sh")
        for path in pipeline_dir.glob(f"*{ext}")
    )
    if not runnables:
        print(f"No pipeline steps found in {pipeline_dir}")
        return

    # Shell steps read the input location from ROOT_DIR. The value is the same
    # for every step, so export it once instead of on each iteration.
    os.environ["ROOT_DIR"] = str(ingest_dir)

    for runnable in runnables:
        print(f"Running {runnable.name}...")
        if runnable.suffix == ".py":
            # NOTE(review): exec() runs the step inside this module's global
            # namespace, so names defined by a step leak into the module and
            # into later steps. Only trusted, in-repo files may live in the
            # pipeline directory.
            # Compiling with the file name makes tracebacks point at the step.
            code = compile(runnable.read_text(), str(runnable), "exec")
            exec(code, globals())
        elif runnable.suffix == ".sql":
            db = get_database()
            db.run_sql(runnable)
        elif runnable.suffix == ".sh":
            # check=True aborts the pipeline when a shell step exits non-zero.
            subprocess.run(["bash", str(runnable)], check=True)
        else:
            print(f"Unknown file type: {runnable.suffix}")
81 changes: 81 additions & 0 deletions py-modules/integrations/macrostrat/integrations/gbdb/explore.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
SELECT count(*) FROM macrostrat_api.gbdb_strata WHERE (min_ma IS NOT NULL AND max_ma IS NOT NULL);


-- 121903 strata have an age constraint
SELECT count(*) FROM macrostrat_api.gbdb_strata WHERE has_age_constraint;

--153217 strata do not have an age constraint
SELECT count(*) FROM macrostrat_api.gbdb_strata WHERE NOT has_age_constraint;

--153179 strata do not have an age constraint but have a formation name
SELECT count(*) FROM macrostrat_api.gbdb_strata WHERE NOT has_age_constraint and (formation IS NOT NULL);



-- 5742 formations mentioned that do not have an age constraint defined
SELECT count(DISTINCT formation)
FROM macrostrat_api.gbdb_strata
WHERE NOT has_age_constraint
AND (formation IS NOT NULL);


SELECT DISTINCT (formation) formation
FROM macrostrat_api.gbdb_strata
WHERE NOT has_age_constraint
AND (formation IS NOT NULL);


SELECT * FROM macrostrat.intervals WHERE id = macrostrat_api.interval_for_age_range(100, 140);


SELECT count(*) FROM macrostrat_gbdb.sections WHERE has_age_constraint;


-- WITH duplicate_units AS (SELECT unit_id, section_id, COUNT(*)
-- FROM macrostrat_api.gbdb_strata
-- GROUP BY unit_id, section_id
-- HAVING COUNT(*) > 1
-- ORDER BY count DESC)
-- SELECT unit_id, array_agg(depth_scale) FROM duplicate_units
-- JOIN macrostrat_api.gbdb_strata USING (unit_id, section_id)
-- GROUP BY unit_id;
--
-- WITH duplicate_units AS (SELECT unit_id, section_id, COUNT(*)
-- FROM macrostrat_api.gbdb_strata
-- GROUP BY unit_id, section_id
-- HAVING COUNT(*) > 1
-- ORDER BY count DESC)
-- SELECT * FROM duplicate_units
-- JOIN macrostrat_api.gbdb_strata USING (unit_id, section_id);


WITH col_sections AS (SELECT section_id, sc.id col_id
FROM macrostrat_gbdb.sections s
JOIN macrostrat_gbdb.summary_columns sc
ON ST_Intersects(ST_SetSRID(ST_MakePoint(lng, lat), 4326), sc.geometry)
WHERE has_age_constraint)
SELECT
row_number() OVER () unit_id,
col_id,
f.formation unit_name,
min_ma t_age,
max_ma b_age
FROM macrostrat_api.gbdb_formations f
JOIN col_sections cs ON cs.section_id = f.section_id
WHERE f.min_ma IS NOT NULL AND f.max_ma IS NOT NULL
GROUP BY col_id, f.formation, min_ma, max_ma;

-- Number and share of strata per age source.
-- The `proportion` alias belongs on the outer expression; inside the scalar
-- subquery it only renamed the inner count(*) and left the result column
-- unnamed.
SELECT
  age_source,
  count(*),
  count(*)::numeric / (SELECT count(*) FROM macrostrat_api.gbdb_strata_with_age_model) proportion
FROM macrostrat_api.gbdb_strata_with_age_model
GROUP BY age_source;

SELECT * FROM macrostrat_api.gbdb_formations WHERE formation ILIKE '%Fangyan%';

SELECT * FROM macrostrat_api.gbdb_formations WHERE min_ma IS null ORDER BY formation;

-- Number and share of strata that have both age bounds.
-- The `proportion` alias is attached to the outer expression so the result
-- column is named; NULLIF avoids a division-by-zero error on an empty table
-- (the proportion is then NULL).
SELECT
  count(*),
  round(
    count(*)::numeric / NULLIF((SELECT count(*) FROM macrostrat_api.gbdb_strata), 0),
    2
  ) proportion
FROM macrostrat_api.gbdb_strata
WHERE min_ma IS NOT NULL AND max_ma IS NOT NULL;


SELECT age_source, count(*), round(count(*)::numeric/(SELECT count(*) FROM macrostrat_gbdb.strata), 2) proportion FROM macrostrat_api.gbdb_strata_with_age_model WHERE country = 'China' GROUP BY age_source ;

-- Age control

SELECT DISTINCT ON (name_clean) * FROM macrostrat_gbdb.external_age_control;
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
#!/usr/bin/env bash
# Export macrostrat_api.gbdb_strata_with_age_model as CSV (with header) into
# the current directory.
#
# errexit is enabled with `set` rather than in the shebang: Linux passes
# "bash -e" to env as a single program name, and the shebang is ignored
# altogether when the script is started as `bash <script>`.
set -e

q="SELECT * FROM macrostrat_api.gbdb_strata_with_age_model"
macrostrat db psql -c "COPY ($q) TO STDOUT WITH CSV DELIMITER ',' HEADER" > gbdb-strata-with-linear-age-model-v2.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
from pathlib import Path

import numpy as N
from IPython import embed
from pandas import DataFrame, read_csv

from macrostrat.core.database import get_database


def ingest_strata_next(source: Path):
    """Read a strata CSV and hand each section's rows to ``ingest_column``.

    Empty strings in the CSV are treated as missing values. The "GBDB" project
    is looked up (or created) before the sections are processed.
    """
    strata = read_csv(Path(source))
    # Set empty strings to NaN
    strata.replace("", N.nan, inplace=True)

    # Unique section IDs, in order of first appearance
    unique_sections = strata["section_id"].unique()
    print(f"Found {len(unique_sections)} unique sections")

    # The returned project id is not used by the ingestion yet.
    project_id = get_or_create_project(get_database(), "GBDB")

    for current_id in unique_sections:
        in_section = strata["section_id"] == current_id
        ingest_column(strata[in_section])


def ingest_column(df: DataFrame):
    """Ingest the strata rows of a single section (not implemented yet).

    Raises:
        NotImplementedError: always, once the interactive shell is closed.
    """
    df2 = df.iloc[0]
    # Get the section name and other parameters

    # NOTE(review): development leftover — this opens an interactive IPython
    # shell (with `df` and `df2` in scope) for every section. Remove it once
    # the ingestion logic is written.
    embed()
    # A bare `raise` outside an `except` block fails with an opaque
    # "No active exception to reraise" RuntimeError. NotImplementedError is a
    # RuntimeError subclass, so existing handlers still match.
    raise NotImplementedError("ingest_column is not implemented yet")


def get_or_create_project(db, name: str):
    """Return the id of the project called *name*, creating the row if needed.

    Args:
        db: Database handle exposing ``run_query(sql, params)``.
        name: Value of the ``project`` column to look up or insert.

    Returns:
        The project's ``id`` value (not the whole result row).
    """
    # .scalar() yields the first column of the first row, or None if no row
    # matched; the previous fetchone() handed the whole row back to the caller.
    existing_id = db.run_query(
        "SELECT id FROM macrostrat.projects WHERE project = :name",
        {"name": name},
    ).scalar()
    if existing_id is not None:
        return existing_id

    # NOTE(review): whether this INSERT is committed depends on how
    # db.run_query manages transactions — confirm.
    return db.run_query(
        "INSERT INTO macrostrat.projects (project) VALUES (:name) RETURNING id",
        {"name": name},
    ).scalar()
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#!/usr/bin/env bash
# Reload macrostrat_gbdb.chinalex and
# macrostrat_gbdb.stratigraphic_dictionary_llm from CSV files under $ROOT_DIR.
#
# errexit is enabled with `set` rather than in the shebang: Linux passes
# "bash -e" to env as a single program name, and the shebang is ignored
# altogether when the script is started as `bash <script>`.
set -e

root_dir=${ROOT_DIR:?ROOT_DIR must point to the directory holding the input CSV files}

chinalex_csv="$root_dir/2024-08-18_chinalex.csv"
dictionary_csv="$root_dir/stratigraphic-dictionary-llm-extraction.csv"

# Check the inputs before truncating anything, so a missing file cannot leave
# a table empty.
for csv_file in "$chinalex_csv" "$dictionary_csv"; do
  if [[ ! -r "$csv_file" ]]; then
    echo "Input file not found or not readable: $csv_file" >&2
    exit 1
  fi
done

macrostrat db psql -c "TRUNCATE TABLE macrostrat_gbdb.chinalex"
macrostrat db psql -c "COPY macrostrat_gbdb.chinalex FROM STDIN WITH (FORMAT CSV, HEADER true)" \
  < "$chinalex_csv"

macrostrat db psql -c "TRUNCATE TABLE macrostrat_gbdb.stratigraphic_dictionary_llm"
macrostrat db psql -c 'COPY macrostrat_gbdb.stratigraphic_dictionary_llm FROM STDIN WITH (FORMAT CSV, HEADER true)' \
  < "$dictionary_csv"
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#!/usr/bin/env bash
# Load $ROOT_DIR/geological_strata.csv into macrostrat_gbdb.strata.
set -e

strata_csv="${ROOT_DIR:?ROOT_DIR must point to the directory holding the input CSV files}/geological_strata.csv"

# Redirecting the file (instead of `cat file | ...`) makes a missing or
# unreadable input fail the script rather than silently copying zero rows.
macrostrat db psql -c 'COPY macrostrat_gbdb.strata FROM STDIN WITH (FORMAT CSV, HEADER true, FORCE_NULL (unit_thickness))' \
  < "$strata_csv"
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
-- Some fixes to imported strata:
-- normalize empty strings in the member, formation and epoch columns to NULL.
UPDATE macrostrat_gbdb.strata SET member = null WHERE member = '';
UPDATE macrostrat_gbdb.strata SET formation = null WHERE formation = '';
UPDATE macrostrat_gbdb.strata SET epoch = null WHERE epoch = '';


-- Rebuild macrostrat_gbdb.sections from scratch: one row per
-- (section_id, lng, lat) found in macrostrat_gbdb.strata.
-- Strata are matched case-insensitively on formation name against
-- best_external_age_control.name_clean; formations named 'unknown' never match.
-- has_age_constraint is true when at least one match has a non-null t_age;
-- min_ma / max_ma are the smallest t_age and largest b_age among the matches.
TRUNCATE TABLE macrostrat_gbdb.sections;
WITH a AS (SELECT section_id,
lng,
lat,
count(ac.t_age) > 0 has_age_constraint,
MIN(ac.t_age) min_ma,
MAX(ac.b_age) max_ma
FROM macrostrat_gbdb.strata s
LEFT JOIN macrostrat_gbdb.best_external_age_control ac
ON lower(s.formation) = lower(ac.name_clean)
AND lower(s.formation) != 'unknown'
GROUP BY section_id, lng, lat
)
-- NOTE(review): the column-less INSERT with SELECT * relies on the sections
-- table having its columns in the order section_id, lng, lat,
-- has_age_constraint, min_ma, max_ma, color — confirm against the table
-- definition.
INSERT INTO macrostrat_gbdb.sections
SELECT *,
macrostrat_api.color_for_age_range(a.min_ma, a.max_ma) color
FROM a;

-- Rebuild macrostrat_gbdb.summary_columns from scratch.
-- A hexagon grid with cell size 1 (in SRID 4326 units) is generated over the
-- envelope (-180, -90, 180, 90); only hexagons intersecting the point of at
-- least one age-constrained section are kept.
-- ids come from row_number() without an ORDER BY, so they are not guaranteed
-- to be stable between rebuilds.
TRUNCATE TABLE macrostrat_gbdb.summary_columns;
WITH hexgrid AS (
SELECT ST_HexagonGrid(1, ST_MakeEnvelope(-180, -90, 180, 90, 4326)) AS hex
)
INSERT INTO macrostrat_gbdb.summary_columns
SELECT
row_number() OVER () id,
ST_ForceRHR((hex).geom) geometry
FROM hexgrid
WHERE ST_Intersects((hex).geom, (
SELECT ST_Union(ST_SetSRID(ST_MakePoint(lng, lat), 4326)) FROM macrostrat_gbdb.sections WHERE has_age_constraint
)
);


-- Rebuild macrostrat_gbdb.summary_units from scratch.
-- col_sections pairs each age-constrained section with every summary column
-- whose geometry intersects the section's point.
-- One unit is emitted per distinct (col_id, formation, min_ma, max_ma) taken
-- from macrostrat_api.gbdb_formations rows that have both age bounds.
-- unit_id comes from row_number() without an ORDER BY, so it is not guaranteed
-- to be stable between rebuilds.
TRUNCATE TABLE macrostrat_gbdb.summary_units;
WITH col_sections AS (
SELECT section_id, sc.id col_id
FROM macrostrat_gbdb.sections s
JOIN macrostrat_gbdb.summary_columns sc
ON ST_Intersects(ST_SetSRID(ST_MakePoint(lng, lat), 4326), sc.geometry)
WHERE has_age_constraint)
INSERT INTO macrostrat_gbdb.summary_units
SELECT
row_number() OVER () unit_id,
col_id,
f.formation unit_name,
min_ma t_age,
max_ma b_age
FROM macrostrat_api.gbdb_formations f
JOIN col_sections cs ON cs.section_id = f.section_id
WHERE f.min_ma IS NOT NULL AND f.max_ma IS NOT NULL
GROUP BY col_id, f.formation, min_ma, max_ma;
Loading