Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions pipeline/sources/pleiades/downloader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
import os
from pipeline.process.base.downloader import BaseDownloader

class PleiadesDownloader(BaseDownloader):
    """Downloader for the Pleiades place-type and place dump files.

    Types url: https://atlantides.org/downloads/pleiades/rdf/place-types.ttl
    Places url: https://atlantides.org/downloads/pleiades/json/pleiades-places-latest.json.gz
    """

    def get_urls(self):
        """Return the download targets for the first two configured records.

        The URLs are read from ``config['input_files']['records'][0]`` (place
        types) and ``[1]`` (places).  Each target is written into
        ``config['dumps_dir']`` under the last path segment of its URL.

        Returns:
            A two-element list of ``{"url": ..., "path": ...}`` dicts, place
            types first, places second.
        """
        records = self.config['input_files']["records"]
        # Resolve both URLs before touching dumps_dir, place types first.
        source_urls = [records[index]['url'] for index in (0, 1)]
        dumps_dir = self.config['dumps_dir']

        targets = []
        for source_url in source_urls:
            file_name = source_url.rsplit('/')[-1]
            targets.append({"url": source_url, "path": os.path.join(dumps_dir, file_name)})
        return targets
214 changes: 214 additions & 0 deletions pipeline/sources/pleiades/mapper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,214 @@
# from lux_pipeline.process.base.mapper import Mapper
from pipeline.process.base.mapper import Mapper
from cromulent import model, vocab
import re

class PleiadesMapper(Mapper):
def __init__(self, config):
    """Initialise the base mapper, then disable automatic id assignment.

    Args:
        config: Mapper configuration, handed unchanged to the base class.
    """
    super().__init__(config)
    # Turn off automatic identifier assignment on this mapper's factory.
    self.factory.auto_assign_id = False

def guess_type(self, data):
    """Choose the model class for a record from its ``@type`` field.

    Args:
        data: Mapping that may carry an ``"@type"`` key.

    Returns:
        ``model.Type`` when ``@type`` is ``"Concept"``; ``model.Place`` for
        everything else, including records that carry placeTypeURIs.
    """
    is_concept = data.get("@type") == "Concept"
    return model.Type if is_concept else model.Place

def geojson_to_wkt(self, geom):
    """Convert a GeoJSON geometry dict to a WKT string.

    Supported types are ``Point``, ``Polygon`` and ``MultiPolygon``.  For
    polygons only the exterior (first) ring is written; interior rings are
    ignored, as before.

    Args:
        geom: GeoJSON geometry mapping with ``"type"`` and ``"coordinates"``.

    Returns:
        The WKT text, or ``None`` when the geometry is missing, has no usable
        coordinates, or is of an unsupported type.
    """

    def ring_to_text(ring):
        # Index instead of unpacking: a GeoJSON position may carry an
        # optional third (altitude) member, which WKT output here drops.
        return ", ".join(f"{position[0]} {position[1]}" for position in ring)

    if not geom:
        return None
    geom_type = geom.get("type")
    coords = geom.get("coordinates")
    if not coords:
        return None

    if geom_type == "Point":
        # GeoJSON positions are [longitude, latitude] and WKT writes "x y"
        # (longitude first), so the order is kept as-is.  This was raised in
        # review ("could be [1] [0]?") and confirmed correct by the author.
        # Example: { "type": "Point", "coordinates": [ 31.18, 36.9355 ] }
        if len(coords) < 2:
            return None
        return f"POINT ({coords[0]} {coords[1]})"

    if geom_type == "Polygon":
        # Polygon: list of linear rings (first is exterior).
        exterior = coords[0]
        if not exterior:
            return None
        return f"POLYGON (({ring_to_text(exterior)}))"

    if geom_type == "MultiPolygon":
        # One exterior ring per member polygon; empty members are skipped.
        polys = [
            f"(({ring_to_text(poly[0])}))"
            for poly in coords
            if poly and poly[0]
        ]
        if not polys:
            return None
        return f"MULTIPOLYGON ({', '.join(polys)})"

    # Add more types as needed
    return None

def bbox_to_wkt(self, bbox):
    """Turn a ``[minLon, minLat, maxLon, maxLat]`` bounding box into a WKT polygon.

    Args:
        bbox: Four-element sequence; unpacking fails for any other length.

    Returns:
        A closed ``POLYGON`` ring that walks lower-left, lower-right,
        upper-right, upper-left and back to lower-left.
    """
    west, south, east, north = bbox
    corners = [
        (west, south),
        (east, south),
        (east, north),
        (west, north),
        (west, south),  # repeat the first corner to close the ring
    ]
    ring = ", ".join(f"{x} {y}" for x, y in corners)
    return f"POLYGON (({ring}))"

def parse_types(self, ttl_section):
    """Build a Linked Art Type record from one place-type section of a TTL file.

    The section is scanned with regular expressions for the concept URI, its
    ``skos:prefLabel``, an optional ``skos:scopeNote`` and the URIs listed
    under ``owl:sameAs``.

    Args:
        ttl_section: Text of a single concept definition.

    Returns:
        ``{"identifier", "data", "source"}`` for the concept, or ``None`` when
        the section has no Pleiades vocabulary URI or no prefLabel.
    """
    found_uri = re.search(r'<https://pleiades\.stoa\.org/vocabularies/([^>]+)>', ttl_section)
    if found_uri is None:
        return None

    found_label = re.search(r'skos:prefLabel "([^"]+)"(?:@([a-z]{2}))?', ttl_section)
    if found_label is None:
        return None

    concept_id = found_uri.group(1)
    uri = f"https://pleiades.stoa.org/vocabularies/{concept_id}"

    # An untagged literal is treated as English.
    label_text, label_tag = found_label.groups()
    label_tag = label_tag or 'en'

    note_text, note_tag = None, 'en'
    found_note = re.search(r'skos:scopeNote "([^"]+)"(?:@([a-z]{2}))?', ttl_section)
    if found_note:
        note_text = found_note.group(1)
        note_tag = found_note.group(2) or 'en'

    # owl:sameAs may span several lines; take everything up to the next ';'
    # and pull every <...> URI out of it.
    found_same_as = re.search(r'owl:sameAs\s+(.+?);', ttl_section, re.DOTALL)
    equivalent_uris = re.findall(r'<([^>]+)>', found_same_as.group(1)) if found_same_as else []

    concept = model.Type(ident=uri)

    name = vocab.PrimaryName(content=label_text)
    if label_tag in self.process_langs:
        name.language = self.process_langs[label_tag]
    concept.identified_by = name

    if note_text:
        note = vocab.Description(content=note_text)
        if note_tag in self.process_langs:
            note.language = self.process_langs[note_tag]
        concept.referred_to_by = note

    for other_uri in equivalent_uris:
        if other_uri == uri:
            # A concept is not recorded as equivalent to itself.
            continue
        concept.equivalent = model.Type(ident=other_uri)

    return {"identifier": concept_id, "data": model.factory.toJSON(concept), "source": "pleiades"}

def parse_place(self, record):
    """Map one Pleiades place record to a Linked Art Place.

    Names come from ``title`` (tagged English) followed by every entry of
    ``names`` that has a non-empty ``attested`` value; the first becomes the
    primary name and the rest alternate names.  Description, place types, a
    WKT footprint, part-of relationships and Wikidata equivalents are added
    when the record provides them.

    Args:
        record: Mapping with ``"identifier"`` and ``"data"``; ``data`` must
            contain ``"uri"``.

    Returns:
        ``{"identifier", "data", "source"}`` for the place, or ``None`` when
        the record yields no name at all.
    """
    rec = record["data"]
    recid = record["identifier"]

    def language_for(code):
        """Return the configured language for *code*, or None if unknown."""
        # Missing, None or otherwise unusable tags simply mean "no language".
        if not code or not isinstance(code, str):
            return None
        if len(code) == 3:
            code = self.lang_three_to_two.get(code, code)
        if code in self.process_langs:
            return self.process_langs[code]
        return None

    # Handle places
    top = model.Place(ident=rec["uri"])

    # Collect all available names as (content, language code) pairs.
    all_names = []
    title = rec.get("title")
    if title:
        all_names.append((title, "en"))
    for entry in rec.get("names") or []:
        attested = entry.get("attested")
        if attested:
            # Only a missing key defaults to English; an explicit empty or
            # null tag is kept as "no language".
            all_names.append((attested, entry.get("language", "en")))

    if not all_names:
        return None

    # First name is primary, the remainder are alternates.
    for position, (content, lang_code) in enumerate(all_names):
        name_cls = vocab.PrimaryName if position == 0 else vocab.AlternateName
        name = name_cls(content=content)
        language = language_for(lang_code)
        if language is not None:
            name.language = language
        top.identified_by = name

    # Add description (skipped when absent or empty), tagged English.
    description = rec.get("description")
    if description:
        desc = vocab.Description(content=description)
        if 'en' in self.process_langs:
            desc.language = self.process_langs['en']
        top.referred_to_by = desc

    # Add place types using placeTypeURIs directly
    for pt_uri in rec.get("placeTypeURIs") or []:
        if pt_uri:
            top.classified_as = model.Type(ident=pt_uri)

    # Add geospatial data: geometry, bbox, reprPoint (in that order).  Each
    # source is tried only if the previous one produced nothing, so an
    # unsupported geometry type still falls back to the bounding box.
    wkt = None
    geometry = rec.get("geometry")
    if geometry:
        wkt = self.geojson_to_wkt(geometry)
    if not wkt:
        bbox = rec.get("bbox") or rec.get("boundingBox")
        if bbox and len(bbox) == 4:
            wkt = self.bbox_to_wkt(bbox)
    if not wkt:
        repr_point = rec.get("reprPoint")
        if repr_point and len(repr_point) >= 2:
            wkt = f"POINT ({repr_point[0]} {repr_point[1]})"
    if wkt:
        top.defined_by = wkt

    # Add part_of relationships
    # https://pleiades.stoa.org/vocabularies/relationship-types
    part_of_types = (
        "part_of_physical",
        "part_of_admin",
        "part_of_regional",
        "located_in",
        "in_territory_of",
        "located_at",
        "port_of",
        "member_of",
        "part_of_analytical",
        "capital_of",
    )
    for conn in rec.get("connections") or []:
        if conn.get("connectionType") not in part_of_types:
            continue
        target = conn.get("connectsTo")
        if not target:
            # Nothing to point at; skip rather than fail the whole record.
            continue
        related = model.Place(ident=target)
        if "title" in conn:
            related._label = conn["title"]
        top.part_of = related

    # References whose access URI points at Wikidata become equivalents.
    for ref in rec.get("references") or []:
        access_uri = ref.get("accessURI") or ""
        if "https://www.wikidata.org/wiki" in access_uri:
            top.equivalent = model.Place(ident=access_uri)

    data = model.factory.toJSON(top)
    return {"identifier": recid, "data": data, "source": "pleiades"}

def transform(self, record, rectype, reference=False):
    """Dispatch a record to the parser that matches its type.

    Args:
        record: The record to transform.
        rectype: ``"Place"`` or ``"Type"`` (or a class with that name).  When
            falsy, the type is taken from ``guess_type``.
        reference: Accepted for interface compatibility; not used here.

    Returns:
        The parser's result, or ``None`` for any other record type.
    """
    if not rectype:
        # NOTE(review): guess_type reads "@type" from what it is given, while
        # parse_place reads record["data"] -- confirm whether the wrapper or
        # its "data" payload should be passed here.
        rectype = self.guess_type(record)

    # guess_type returns model classes, but the dispatch below is keyed on
    # names; without this step a guessed type never matched and every such
    # record was silently dropped.
    if not isinstance(rectype, str):
        rectype = getattr(rectype, "__name__", rectype)

    if rectype == "Place":
        return self.parse_place(record)
    if rectype == "Type":
        # NOTE(review): parse_concept is not defined in this class as shown;
        # presumably inherited or meant to be parse_types -- verify.
        return self.parse_concept(record)
    return None