-
Notifications
You must be signed in to change notification settings - Fork 2
Pleiades as a Source #241
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
wjbmattingly
wants to merge
7
commits into
main
Choose a base branch
from
pleiades-test
base: main
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Open
Pleiades as a Source #241
Changes from all commits
Commits
Show all changes
7 commits
Select commit
Hold shift + click to select a range
9a60cf4
pleiades mapper loaded
wjbmattingly 84b0cd8
Rename pipeline/sources/mapper.py to pipeline/sources/pleiades/mapper.py
wjbmattingly f1dab78
updated connection types for mapper for part of
wjbmattingly ef7dc71
double checked lat/long and fixed rectype to guess_type
wjbmattingly ef2cfc1
added support for two_to_three mapper
wjbmattingly ae797d5
added further support for langs
wjbmattingly 3b12edb
addressed broader language support issue
wjbmattingly File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,15 @@ | ||
| import os | ||
| from pipeline.process.base.downloader import BaseDownloader | ||
|
|
||
class PleiadesDownloader(BaseDownloader):
    """Describe the two Pleiades dump files to download.

    Types url: https://atlantides.org/downloads/pleiades/rdf/place-types.ttl
    Places url: https://atlantides.org/downloads/pleiades/json/pleiades-places-latest.json.gz
    """

    def get_urls(self):
        """Return one ``{"url": ..., "path": ...}`` dict per configured dump.

        The first two entries of ``config['input_files']['records']`` are
        read, in that order (the class docstring lists them as the place
        types file followed by the places file). Each local path is
        ``config['dumps_dir']`` joined with the last ``/``-separated
        segment of the URL.
        """
        configured = self.config['input_files']["records"]
        source_urls = (configured[0]['url'], configured[1]['url'])
        target_dir = self.config['dumps_dir']
        return [
            {"url": source, "path": os.path.join(target_dir, source.rsplit('/')[-1])}
            for source in source_urls
        ]
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,214 @@ | ||
| # from lux_pipeline.process.base.mapper import Mapper | ||
| from pipeline.process.base.mapper import Mapper | ||
| from cromulent import model, vocab | ||
| import re | ||
|
|
||
class PleiadesMapper(Mapper):
    """Map Pleiades places and place-type concepts to Linked Art JSON.

    Place records are handled by ``parse_place`` and Turtle sections that
    describe a place-type concept by ``parse_types``; ``transform`` picks
    between them.

    The class reads ``self.process_langs``, ``self.lang_three_to_two`` and
    ``self.factory`` without defining them (presumably they are set up by
    ``Mapper.__init__`` -- confirm against the base class).
    """

    # Pleiades connection types that are expressed as ``part_of``.
    # https://pleiades.stoa.org/vocabularies/relationship-types
    PART_OF_CONNECTION_TYPES = frozenset({
        "part_of_physical",
        "part_of_admin",
        "part_of_regional",
        "located_in",
        "in_territory_of",
        "located_at",
        "port_of",
        "member_of",
        "part_of_analytical",
        "capital_of",
    })

    # Compiled once; parse_types runs once per concept section.
    _VOCAB_URI_RE = re.compile(r'<https://pleiades\.stoa\.org/vocabularies/([^>]+)>')
    _PREF_LABEL_RE = re.compile(r'skos:prefLabel "([^"]+)"(?:@([a-z]{2}))?')
    _SCOPE_NOTE_RE = re.compile(r'skos:scopeNote "([^"]+)"(?:@([a-z]{2}))?')
    # Matches the object list itself, so the statement may end in ';' or '.'.
    _SAME_AS_RE = re.compile(r'owl:sameAs\s+((?:<[^>]+>[\s,]*)+)')
    _ANGLE_URI_RE = re.compile(r'<([^>]+)>')

    def __init__(self, config):
        """Initialise the base mapper and turn off automatic id assignment."""
        super().__init__(config)
        self.factory.auto_assign_id = False

    def guess_type(self, data):
        """Return ``model.Type`` for ``@type == "Concept"``, else ``model.Place``.

        Anything that is not a dict is treated as a Place, as is every
        record with another (or no) ``@type``, including records that carry
        ``placeTypeURIs``.
        """
        if isinstance(data, dict) and data.get("@type") == "Concept":
            return model.Type
        return model.Place

    @staticmethod
    def _ring_to_wkt(ring):
        """Format one linear ring as ``x y, x y, ...``.

        Only the first two values of each position are used, so positions
        carrying a third (elevation) value no longer fail to unpack.
        """
        return ", ".join(f"{position[0]} {position[1]}" for position in ring)

    @classmethod
    def _polygon_to_wkt(cls, rings):
        """Format every ring of a polygon as ``((exterior), (hole), ...)``."""
        return "(" + ", ".join(f"({cls._ring_to_wkt(ring)})" for ring in rings) + ")"

    def geojson_to_wkt(self, geom):
        """Convert a GeoJSON geometry dict to a WKT string.

        Supports ``Point``, ``Polygon`` and ``MultiPolygon``. Interior rings
        (holes) are kept. Returns ``None`` for any other geometry type and
        for geometries whose ``coordinates`` are missing or empty.
        """
        geom_type = geom.get("type")
        coords = geom.get("coordinates")
        if not coords:
            return None
        if geom_type == "Point":
            # GeoJSON positions are in (long, lat) order, e.g.
            # { "type": "Point", "coordinates": [ 31.18, 36.935499999999998 ] }
            if len(coords) < 2:
                return None
            return f"POINT ({coords[0]} {coords[1]})"
        if geom_type == "Polygon":
            return f"POLYGON {self._polygon_to_wkt(coords)}"
        if geom_type == "MultiPolygon":
            polygons = ", ".join(self._polygon_to_wkt(rings) for rings in coords)
            return f"MULTIPOLYGON ({polygons})"
        # Add more types as needed
        return None

    def bbox_to_wkt(self, bbox):
        """Convert a bounding box [minLon, minLat, maxLon, maxLat] to a WKT Polygon string.

        Raises ``ValueError`` when ``bbox`` does not hold exactly four values.
        """
        minx, miny, maxx, maxy = bbox
        # Polygon: lower left, lower right, upper right, upper left, close
        return (
            f"POLYGON (({minx} {miny}, {maxx} {miny}, {maxx} {maxy}, {minx} {maxy}, {minx} {miny}))"
        )

    def _language_for(self, lang_code):
        """Return the ``process_langs`` entry for ``lang_code``, or ``None``.

        Three-letter codes are first translated through
        ``self.lang_three_to_two``. Empty, ``None`` or non-string codes
        yield ``None`` instead of raising.
        """
        if not lang_code or not isinstance(lang_code, str):
            return None
        if len(lang_code) == 3:
            lang_code = self.lang_three_to_two.get(lang_code, lang_code)
        if lang_code in self.process_langs:
            return self.process_langs[lang_code]
        return None

    def _place_wkt(self, rec):
        """Return WKT for a place from geometry, bbox, boundingBox or reprPoint.

        Sources are tried in that order and the first one that yields a
        usable string wins, so an unsupported geometry type or a malformed
        bounding box falls through to the next source.
        """
        geometry = rec.get("geometry")
        if geometry:
            wkt = self.geojson_to_wkt(geometry)
            if wkt:
                return wkt
        for key in ("bbox", "boundingBox"):
            bbox = rec.get(key)
            if bbox and len(bbox) == 4:
                return self.bbox_to_wkt(bbox)
        point = rec.get("reprPoint")
        if point and len(point) >= 2:
            return f"POINT ({point[0]} {point[1]})"
        return None

    def parse_types(self, ttl_section):
        """Parse a TTL section for a place type concept and create a Linked Art record.

        Returns ``{"identifier", "data", "source"}``, or ``None`` when the
        section has no Pleiades vocabulary URI or no ``skos:prefLabel``.
        Labels and scope notes without a language tag default to English.
        """
        uri_match = self._VOCAB_URI_RE.search(ttl_section)
        if not uri_match:
            return None
        concept_id = uri_match.group(1)
        uri = f"https://pleiades.stoa.org/vocabularies/{concept_id}"

        label_match = self._PREF_LABEL_RE.search(ttl_section)
        if not label_match:
            return None
        label = label_match.group(1)
        label_lang = label_match.group(2) or 'en'

        scope_match = self._SCOPE_NOTE_RE.search(ttl_section)
        description = scope_match.group(1) if scope_match else None
        description_lang = (scope_match.group(2) if scope_match else None) or 'en'

        top = model.Type(ident=uri)

        primary_name = vocab.PrimaryName(content=label)
        language = self._language_for(label_lang)
        if language is not None:
            primary_name.language = language
        top.identified_by = primary_name

        if description:
            desc = vocab.Description(content=description)
            language = self._language_for(description_lang)
            if language is not None:
                desc.language = language
            top.referred_to_by = desc

        # Every owl:sameAs statement contributes equivalents; self-references
        # and repeated URIs are skipped.
        seen = {uri}
        for statement in self._SAME_AS_RE.finditer(ttl_section):
            for same_as_uri in self._ANGLE_URI_RE.findall(statement.group(1)):
                if same_as_uri in seen:
                    continue
                seen.add(same_as_uri)
                top.equivalent = model.Type(ident=same_as_uri)

        data = model.factory.toJSON(top)
        return {"identifier": concept_id, "data": data, "source": "pleiades"}

    def parse_place(self, record):
        """Build a Linked Art Place from a Pleiades place record.

        ``record`` must provide ``"data"`` (the place dict, which must
        contain ``"uri"``) and ``"identifier"``. Returns
        ``{"identifier", "data", "source"}``, or ``None`` when the place
        has neither a title nor an attested name.
        """
        rec = record["data"]
        recid = record["identifier"]

        top = model.Place(ident=rec["uri"])

        # (content, language code) pairs; the title, when present, comes first.
        all_names = []
        title = rec.get("title")
        if title:
            all_names.append((title, "en"))
        for name in rec.get("names") or []:
            attested = name.get("attested")
            if attested:
                all_names.append((attested, name.get("language", "en")))

        if not all_names:
            return None

        # The first collected name is the primary name, the rest alternates.
        for position, (content, lang_code) in enumerate(all_names):
            name_class = vocab.PrimaryName if position == 0 else vocab.AlternateName
            name_obj = name_class(content=content)
            language = self._language_for(lang_code)
            if language is not None:
                name_obj.language = language
            top.identified_by = name_obj

        # Descriptions are tagged as English; empty/null ones are skipped.
        description = rec.get("description")
        if description:
            desc = vocab.Description(content=description)
            if 'en' in self.process_langs:
                desc.language = self.process_langs['en']
            top.referred_to_by = desc

        # Place types use placeTypeURIs directly.
        for pt_uri in rec.get("placeTypeURIs") or []:
            top.classified_as = model.Type(ident=pt_uri)

        wkt = self._place_wkt(rec)
        if wkt:
            top.defined_by = wkt

        for conn in rec.get("connections") or []:
            if conn.get("connectionType") not in self.PART_OF_CONNECTION_TYPES:
                continue
            target = conn.get("connectsTo")
            if not target:
                # Nothing to point at; skip rather than raise KeyError.
                continue
            related = model.Place(ident=target)
            if conn.get("title"):
                related._label = conn["title"]
            top.part_of = related

        # NOTE(review): the Wikidata page URL is used verbatim as the
        # equivalent's identifier -- confirm whether downstream expects the
        # entity-URI form instead.
        for ref in rec.get("references") or []:
            access_uri = ref.get("accessURI") or ""
            if "https://www.wikidata.org/wiki" in access_uri:
                top.equivalent = model.Place(ident=access_uri)

        data = model.factory.toJSON(top)
        return {"identifier": recid, "data": data, "source": "pleiades"}

    def transform(self, record, rectype, reference=False):
        """Dispatch ``record`` to the parser matching ``rectype``.

        ``rectype`` may be the string ``"Place"``/``"Type"`` or the
        corresponding model class. When it is falsy the type is guessed
        from the record's ``"data"`` payload (or from the record itself if
        there is no dict payload). ``reference`` is accepted for interface
        compatibility and is not used here. Returns the parser's result, or
        ``None`` for any other type.
        """
        if not rectype:
            payload = record.get("data") if isinstance(record, dict) else None
            rectype = self.guess_type(payload if isinstance(payload, dict) else record)
        # guess_type returns model classes while callers pass names; compare
        # on the class name so both forms dispatch.
        if isinstance(rectype, type):
            rectype = rectype.__name__

        if rectype == "Place":
            return self.parse_place(record)
        if rectype == "Type":
            # parse_types works on raw Turtle text, passed either bare or as
            # the "data" of a record wrapper.
            ttl_section = record.get("data") if isinstance(record, dict) else record
            if isinstance(ttl_section, str):
                return self.parse_types(ttl_section)
            # NOTE(review): there is no parser for concept records that are
            # already dicts; confirm what the loader stores for types.
            return None
        return None
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Are you sure about this? Could be [1] [0]?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Double checked a week ago and it's right, I believe.