Skip to content

Commit 358b3c7

Browse files
authored
Merge pull request #255 from UW-Macrostrat/stratigraphy-ingestion
Starting point for stratigraphy ingestion from spreadsheet
2 parents e5189d6 + 33492d3 commit 358b3c7

File tree

20 files changed

+1474
-11
lines changed

20 files changed

+1474
-11
lines changed

.gitmodules

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,3 +7,6 @@
77
[submodule "submodules/storage-admin"]
88
path = submodules/storage-admin
99
url = https://github.com/UW-Macrostrat/radosgw_admin_client
10+
[submodule "submodules/column-ingestion"]
11+
path = submodules/column-ingestion
12+
url = https://github.com/Macrostrat/column-ingestion.git

py-modules/cli/macrostrat/cli/entrypoint.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -273,6 +273,18 @@ def update_weaver(db):
273273
except ImportError as err:
274274
pass
275275

276+
try:
277+
from macrostrat.column_ingestion import app as column_app
278+
279+
main.add_typer(
280+
column_app,
281+
name="columns",
282+
rich_help_panel="Subsystems",
283+
short_help="Column data ingestion subsystem",
284+
)
285+
except ImportError as err:
286+
pass
287+
276288

277289
# Get subsystems config
278290
subsystems = getattr(settings, "subsystems", {})

py-modules/cli/pyproject.toml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ dependencies = [
2727
"macrostrat.match-utils",
2828
"criticalmaas.ta1-geopackage>=0.2.0,<0.3",
2929
"mapboard.topology-manager",
30+
"macrostrat.column-ingestion",
3031
"htpheno.radosgw-admin-client",
3132
"numpy>=1.23.4,<2",
3233
"psycopg2-binary>=2.9.4,<3",
@@ -39,7 +40,7 @@ dependencies = [
3940
"spectra>=0.0.11,<0.0.12",
4041
"tiletanic>=1.1.0,<2",
4142
"tqdm>=4.65.0,<5",
42-
"typer>=0.12,<0.13",
43+
"typer>=0.12,<0.24.0",
4344
"click>=8.1.0,<8.2.0",
4445
"docker>=7,<8",
4546
"toml>=0.10.2,<0.11",
@@ -77,6 +78,7 @@ default-groups = [
7778
"mapboard.topology-manager" = { path = "../../submodules/topology-manager", editable = true }
7879
"htpheno.radosgw-admin-client" = { path = "../../submodules/storage-admin", editable = true }
7980
"macrostrat.match-utils" = { path = "../match-utils", editable = true }
81+
"macrostrat.column-ingestion" = { path = "../../py-modules/column-ingestion", editable = true }
8082

8183
[tool.hatch.build.targets.sdist]
8284
include = ["macrostrat"]
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
# Column ingestion
2+
3+
Macrostrat utilities for stratigraphic column ingestion.
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
from pathlib import Path
2+
3+
from typer import Argument, Typer
4+
5+
# Typer sub-application that groups the column ingestion commands.
# With no arguments it prints its help text instead of failing.
app = Typer(
    help="Column ingestion subsystem for Macrostrat",
    no_args_is_help=True,
)
9+
10+
11+
@app.command(name="ingest")
def ingest_columns(
    data_file: Path = Argument(..., help="Path to the data file to ingest"),
):
    """Ingest columns tabular data."""
    # The ingestion module is imported only when the command actually runs,
    # rather than when this package is imported.
    from .ingest import ingest_columns_from_file as _run_ingest

    _run_ingest(data_file)
Lines changed: 166 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,166 @@
1+
from dataclasses import dataclass, field
2+
from datetime import datetime
3+
4+
import polars as pl
5+
from sqlalchemy.dialects.postgresql import insert
6+
7+
from macrostrat.database import Database
8+
9+
from .database import get_macrostrat_table
10+
from .units import Unit
11+
12+
13+
@dataclass
class Column:
    """In-memory description of one column being ingested.

    Every field has a default, so an instance can be built from whatever
    subset of values the input provides.
    """

    # Identifiers. -1 is the default for the two integer IDs
    # (presumably "not yet assigned" -- confirm against the ingestion code).
    id: int = -1
    group_id: int = -1
    # Identifier taken from the input data, kept as a string.
    local_id: str | None = None

    # Descriptive metadata.
    name: str | None = None
    description: str | None = None
    project_id: int | None = None
    status_code: str = "in process"
    col_type: str = "column"

    # Geometry values carried through as strings; their format is not
    # established by this class.
    geom: str | None = None
    rgeom: str | None = None

    # Units belonging to the column; each instance gets its own empty list.
    units: list[Unit] = field(default_factory=list)
26+
27+
28+
def get_or_create_column_group(db: Database, project_id: int, name="Default") -> int:
    """Get or create a column group for a given project ID.

    Looks up a row in ``col_groups`` matching both ``project_id`` and
    ``name`` (stored in the ``col_group`` field). If none exists, a new row
    is inserted under that same name so later lookups find it again.

    Args:
        db: Database whose session is used for the lookup and the insert.
        project_id: ID of the project that owns the column group.
        name: Short name of the column group. Defaults to ``"Default"``.

    Returns:
        The ID of the existing or newly inserted column group.
    """
    col_groups_tbl = get_macrostrat_table(db, "col_groups")

    # TODO: need to add an index on project_id to the col_groups table for this to work properly

    # Find a pre-existing column group for the project, if it exists
    existing_group_id = (
        db.session.query(col_groups_tbl.c.id)
        .filter(col_groups_tbl.c.project_id == project_id)
        .filter(col_groups_tbl.c.col_group == name)
        .scalar()
    )
    if existing_group_id is not None:
        return existing_group_id

    # Insert under the requested name. Hard-coding "Default" here would make
    # the lookup above miss for any other name and add a new row on each call.
    # For the default name this yields the same values as before.
    insert_stmt = (
        insert(col_groups_tbl)
        .values(
            project_id=project_id,
            col_group=name,
            col_group_long=f"{name} column group",
        )
        .returning(col_groups_tbl.c.id)
    )
    return db.session.execute(insert_stmt).scalar()
54+
55+
56+
def get_or_create_column(db: Database, col: Column) -> int:
    """Get or create a column in the database.

    A row in ``cols`` is matched on ``col_name`` and ``project_id``. When a
    match exists, its ``status_code``, ``col_type`` and ``col_group_id`` are
    updated from ``col``. Otherwise a new row is inserted with those values
    plus placeholder values for the remaining fields set below.

    Returns:
        The ID of the updated or inserted row.
    """
    cols_tbl = get_macrostrat_table(db, "cols")

    # TODO: Use insert-on-conflict to get or create the column
    # Requires an index on (col_name, project_id) to work properly

    # Values written on both the update and the insert path.
    shared_vals = {
        "status_code": col.status_code,
        "col_type": col.col_type,
        "col_group_id": col.group_id,
    }

    # Values written only when inserting a new row.
    # TODO: figure out how to handle these fields
    placeholder_vals = {
        "col_position": "",
        "col_area": 0,
        "created": datetime.now(),
        "col": 0,
        "lat": 0,
        "lng": 0,
    }

    # Get an existing column by name and project_id, if it exists
    lookup = db.session.query(cols_tbl.c.id).filter(
        cols_tbl.c.col_name == col.name,
        cols_tbl.c.project_id == col.project_id,
    )
    existing_id = lookup.scalar()

    if existing_id is None:
        stmt = insert(cols_tbl).values(
            col_name=col.name,
            project_id=col.project_id,
            **placeholder_vals,
            **shared_vals,
        )
    else:
        # Update the existing column with any new values
        print("Updating existing column with ID", existing_id)
        stmt = (
            cols_tbl.update()
            .where(cols_tbl.c.id == existing_id)
            .values(**shared_vals)
        )

    # Both statement kinds return the affected row's ID.
    return db.session.execute(stmt.returning(cols_tbl.c.id)).scalar()
108+
109+
110+
def get_or_create_section(db: Database, col_id: int) -> int:
    """Return the ID of the section for a column, inserting one if absent.

    Only a single section per column is handled for now; multiple sections
    are not supported yet. A newly inserted section gets -1 for ``fo``,
    ``fo_h``, ``lo`` and ``lo_h``.
    """
    sections_tbl = get_macrostrat_table(db, "sections")

    # Look for a section already attached to this column.
    lookup = db.session.query(sections_tbl.c.id).filter(
        sections_tbl.c.col_id == col_id
    )
    found_id = lookup.scalar()

    if found_id is None:
        new_section = (
            insert(sections_tbl)
            .values(col_id=col_id, fo=-1, fo_h=-1, lo=-1, lo_h=-1)
            .returning(sections_tbl.c.id)
        )
        return db.session.execute(new_section).scalar()

    return found_id
132+
133+
134+
def _first_not_none(*candidates):
    """Return the first candidate that is not None, or None if all are."""
    return next((value for value in candidates if value is not None), None)


def get_column_data(data_file, meta) -> list[Column]:
    """Read the ``columns`` sheet of a spreadsheet into ``Column`` objects.

    The ``name``, ``id`` and ``type`` headers are renamed to ``col_name``,
    ``col_id`` and ``col_type`` (non-strict rename). For ``status_code``,
    ``col_type`` and ``rgeom`` the value is resolved in this order: the row's
    cell, then the same-named attribute on ``meta``, then the built-in
    default. An empty (null) cell counts as "not provided".

    Args:
        data_file: Spreadsheet to read, passed to ``polars.read_excel``.
        meta: Object that may carry ``status_code``, ``col_type`` and
            ``rgeom`` attributes used as sheet-wide fallbacks.

    Returns:
        One ``Column`` per row of the sheet, in sheet order.
    """
    df = pl.read_excel(data_file, sheet_name="columns")

    df = df.rename(
        {
            "name": "col_name",
            "id": "col_id",
            "type": "col_type",
        },
        strict=False,
    )

    print(df.head())

    columns = []
    for row in df.iter_rows(named=True):
        # A missing ID must stay None; str(None) would store the text "None".
        raw_id = row.get("col_id")

        col = Column(
            # TODO: implement ID upgrading to handle existing columns
            local_id=None if raw_id is None else str(raw_id),
            name=row.get("col_name"),
            description=row.get("description"),
            # dict.get() only falls back when the key is absent. A column
            # that is present with a null cell would otherwise yield None.
            status_code=_first_not_none(
                row.get("status_code"),
                getattr(meta, "status_code", None),
                "in process",
            ),
            col_type=_first_not_none(
                row.get("col_type"),
                getattr(meta, "col_type", None),
                "column",
            ),
            geom=row.get("geom"),
            rgeom=_first_not_none(row.get("rgeom"), getattr(meta, "rgeom", None)),
        )
        columns.append(col)
    return columns
Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
import re
2+
from typing import Optional
3+
4+
from pydantic import BaseModel
5+
6+
from macrostrat.core.database import get_database
7+
8+
9+
class ProjectIdentifier(BaseModel):
    """Partial reference to a project by ID, slug, and/or name.

    All three fields are optional individually, but construction fails
    unless at least one of them is set to a truthy value.
    """

    id: Optional[int] = None
    slug: Optional[str] = None
    name: Optional[str] = None

    def __init__(self, **data):
        """Validate the fields, then require at least one identifier.

        Raises:
            ValueError: If none of ``id``, ``slug`` or ``name`` is truthy.
        """
        super().__init__(**data)
        # Truthiness is used here, so 0 and "" count as "not provided".
        if any((self.id, self.slug, self.name)):
            return
        raise ValueError("At least one of id, slug, or name must be provided")
19+
20+
21+
class ProjectData(ProjectIdentifier):
    """Fully resolved project: the same fields as the identifier, all required."""

    # Redeclared without defaults, so each one must be supplied.
    id: int
    slug: str
    name: str
25+
26+
27+
def get_macrostrat_model(db, table_name: str):
    """Return the SQLAlchemy model for a table in the ``macrostrat`` schema.

    The model is read from ``db.model`` under the name
    ``macrostrat_<table_name>``. ``db.automap`` is run for the schema only
    when that attribute is not present yet.
    """
    attr_name = "macrostrat_" + table_name
    already_mapped = hasattr(db.model, attr_name)
    if not already_mapped:
        db.automap(schemas=["macrostrat"])
    # Still raises AttributeError if automapping did not provide the model.
    return getattr(db.model, attr_name)
33+
34+
35+
def get_macrostrat_table(db, table_name: str):
    """Return the SQLAlchemy table for a table in the ``macrostrat`` schema.

    The table is read from ``db.table`` under the name
    ``macrostrat_<table_name>``. ``db.automap`` is run for the schema only
    when that attribute is not present yet.
    """
    attr_name = "macrostrat_" + table_name
    already_mapped = hasattr(db.table, attr_name)
    if not already_mapped:
        db.automap(schemas=["macrostrat"])
    # Still raises AttributeError if automapping did not provide the table.
    return getattr(db.table, attr_name)
41+
42+
43+
def _slugify_project_name(name: str) -> str:
    """Derive a lowercase, hyphen-separated slug from a project name."""
    # Replace parentheticals with a space rather than nothing, so the words
    # on either side stay separate ("Foo (x) Bar" -> "foo-bar").
    simple_name = re.sub(r"\s*\(.*?\)\s*", " ", name)
    simple_name = re.sub(r"\s+", " ", simple_name).strip()
    return simple_name.lower().replace(" ", "-")


def get_or_create_project(
    db, project: ProjectIdentifier, create_if_not_exists: bool = True
) -> Optional[ProjectData]:
    """Get or create a project in the database.

    The project is looked up by the first identifier that is set, in the
    order ``id``, ``slug``, ``name``. If no row matches and
    ``create_if_not_exists`` is true, a new project is added to the session
    and flushed so that its ID is available.

    Args:
        db: Database providing the session and the automapped models.
        project: Identifier with at least one of id, slug, or name.
        create_if_not_exists: Whether to create the project when missing.

    Returns:
        The existing or newly created project, or ``None`` when the project
        does not exist and creation is disabled.

    Raises:
        ValueError: If no identifier is set, or if a project has to be
            created but no name was given.
    """
    # map the project table
    Project = get_macrostrat_model(db, table_name="projects")

    # Try to find the project by id, slug, or name
    query = db.session.query(Project)
    if project.id is not None:
        query = query.filter(Project.id == project.id)
    elif project.slug is not None:
        query = query.filter(Project.slug == project.slug)
    elif project.name is not None:
        query = query.filter(Project.project == project.name)
    else:
        raise ValueError("At least one of id, slug, or name must be provided")

    existing_project = query.first()
    if existing_project:
        return ProjectData(
            id=existing_project.id,
            slug=existing_project.slug,
            name=existing_project.project,
        )

    if not create_if_not_exists:
        return None

    # ProjectData requires a name, so fail with a clear message up front.
    if project.name is None:
        raise ValueError("A project name is required to create a new project")

    # Keep a caller-supplied slug so a later lookup by that slug finds the
    # row; only derive one from the name when none was given.
    slug = project.slug
    if slug is None:
        slug = _slugify_project_name(project.name)

    new_project = Project(
        id=project.id,
        slug=slug,
        project=project.name,
        # Placeholder text carried over unchanged.
        descrip="A random description",
        timescale_id=1,  # TODO: this should be set to a valid timescale ID
    )
    db.session.add(new_project)
    # Flush so the database assigns the primary key; without it
    # new_project.id is still None when no explicit ID was supplied.
    db.session.flush()
    return ProjectData(
        id=new_project.id,
        slug=new_project.slug,
        name=new_project.project,
    )
94+
95+
96+
def get_all_liths():
    """Fetch all rows of ``macrostrat.liths`` as ``id`` and ``name``.

    The ``lith`` column is returned under the alias ``name``.
    """
    query = "SELECT id, lith name FROM macrostrat.liths"
    result = get_database().run_query(query)
    return result.fetchall()
100+
101+
102+
def get_all_lith_attributes():
    """Fetch all rows of ``macrostrat.lith_atts`` as ``id`` and ``name``.

    The ``lith_att`` column is returned under the alias ``name``.
    """
    query = "SELECT id, lith_att name FROM macrostrat.lith_atts"
    result = get_database().run_query(query)
    return result.fetchall()

0 commit comments

Comments
 (0)