Skip to content

Commit d737467

Browse files
authored
Import NFDI collections BARTOC (#1888)
Closes #1817
1 parent 0881d15 commit d737467

12 files changed

Lines changed: 160 additions & 26 deletions

File tree

exports/alignment/bartoc.tsv

Lines changed: 5 additions & 3 deletions
Large diffs are not rendered by default.

exports/raw/bartoc.jsonl

Lines changed: 5 additions & 0 deletions
Large diffs are not rendered by default.

src/bioregistry/app/templates/nfdi.html

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,11 +10,11 @@
1010
{%- if external_prefix == "tib.collection" %}
1111
Import of TIB OLS collections was implemented in
1212
<a href="https://github.com/biopragmatics/bioregistry/pull/1762">#1762</a>.
13-
The data can be synced by running <code>python -m bioregistry.curation.nfdi_collections</code>.
1413
{%- else %}
15-
Import of BARTOC collections is planned in
16-
<a href="https://github.com/biopragmatics/bioregistry/issues/1817">#1817</a>.
14+
Import of BARTOC collections was implemented in
15+
<a href="https://github.com/biopragmatics/bioregistry/pull/1888">#1888</a>.
1716
{%- endif %}
17+
The data can be synced by running <code>python -m bioregistry.curation.nfdi.sync</code>.
1818
</div>
1919
<table class="table table-striped">
2020
<thead>
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
"""Scripts for working with NFDI content."""

src/bioregistry/curation/add_nfdi_section_collections.py renamed to src/bioregistry/curation/nfdi/add_nfdi_section_collections.py

File renamed without changes.

src/bioregistry/curation/nfdi_collections.py renamed to src/bioregistry/curation/nfdi/sync.py

Lines changed: 32 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from tqdm import tqdm
88

99
import bioregistry
10+
from bioregistry.external.bartoc import get_bartoc, get_bartoc_registries
1011
from bioregistry.external.ols.tib import get_tib_ts
1112
from bioregistry.schema_utils import get_collection_mappings
1213

@@ -15,9 +16,8 @@
1516
KEYWORD_TO_COLLECTION = {v: k for k, v in get_collection_mappings("tib.collection").items()}
1617

1718

18-
@click.command()
19-
def main() -> None:
20-
"""Populate collections based on keywords from the TIB terminology service."""
19+
def _import_tib() -> None:
20+
tqdm.write("\n\nImporting Collections from TIB OLS\n\n")
2121
counter: Counter[str] = Counter()
2222

2323
tib_to_internal = bioregistry.get_registry_invmap("tib")
@@ -28,14 +28,39 @@ def main() -> None:
2828
tqdm.write(f"no mapping from {tib_prefix}")
2929
continue
3030
for keyword in tib_data.get("keywords", []):
31-
collection = KEYWORD_TO_COLLECTION.get(keyword.lower())
32-
if not collection:
31+
collection_id = KEYWORD_TO_COLLECTION.get(keyword.lower())
32+
if not collection_id:
3333
counter[keyword.lower()] += 1
3434
continue
35-
bioregistry.add_to_collection(collection, internal_prefix)
35+
bioregistry.add_to_collection(collection_id, internal_prefix)
3636

3737
bioregistry.manager.write_collections()
38-
tqdm.write(tabulate(counter.most_common(), headers=["unmapped keyword", "count"]))
38+
tqdm.write(tabulate(counter.most_common(), headers=["unmapped TIB keyword", "count"]))
39+
40+
41+
def _import_bartoc() -> None:
42+
rows = []
43+
bartoc_registries = get_bartoc_registries()
44+
bartoc_to_internal = bioregistry.get_registry_invmap("bartoc")
45+
bartoc_data = get_bartoc()
46+
47+
for collection_id, registry_bartoc_id in get_collection_mappings("bartoc").items():
48+
for resource_bartoc_id in bartoc_registries[registry_bartoc_id]:
49+
prefix = bartoc_to_internal.get(resource_bartoc_id)
50+
if prefix:
51+
bioregistry.add_to_collection(collection_id, prefix)
52+
else:
53+
rows.append((resource_bartoc_id, bartoc_data[resource_bartoc_id].get("name")))
54+
continue
55+
bioregistry.manager.write_collections()
56+
tqdm.write(tabulate(rows, headers=["unmapped BARTOC ID", "name"]))
57+
58+
59+
@click.command()
60+
def main() -> None:
61+
"""Populate collections from BARTOC registries and from TIB terminology service keywords."""
62+
_import_bartoc()
63+
_import_tib()
3964

4065

4166
if __name__ == "__main__":

src/bioregistry/curation/seed_collection.py

Lines changed: 1 addition & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -53,13 +53,4 @@ def main(keywords: list[str]) -> None:
5353

5454

5555
if __name__ == "__main__":
56-
main(
57-
[
58-
"education",
59-
"education level",
60-
"education science",
61-
"educational resource",
62-
"open educational resources",
63-
"discipline",
64-
]
65-
)
56+
main()

src/bioregistry/data/bioregistry.json

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21695,6 +21695,20 @@
2169521695
"prefix": "CIDOC-CRM",
2169621696
"version": "v6.2"
2169721697
},
21698+
"bartoc": {
21699+
"description": "The CIDOC Conceptual Reference Model (CRM) provides definitions and a formal structure for describing the implicit and explicit concepts and relationships used in cultural heritage documentation. The CIDOC CRM is intended to promote a shared understanding of cultural heritage information by providing a common and extensible semantic framework that any cultural heritage information can be mapped to. It is intended to be a common language for domain experts and implementers to formulate requirements for information systems and to serve as a guide for good practice of conceptual modelling. In this way, it can provide the 'semantic glue' needed to mediate between different sources of cultural heritage information, such as that published by museums, libraries and archives.",
21700+
"homepage": "http://www.cidoc-crm.org/cidoc-crm/",
21701+
"name": "CIDOC Conceptual Reference Model",
21702+
"pattern": "^[A-Z]{1,2}[0-9]+i?(_[A-Za-z_]+)?$",
21703+
"prefix": "1644",
21704+
"short_names": [
21705+
"CRM"
21706+
],
21707+
"uri_format": "http://www.cidoc-crm.org/cidoc-crm/$1",
21708+
"xrefs": {
21709+
"wikidata": "Q624005"
21710+
}
21711+
},
2169821712
"bioportal": {
2169921713
"contact": {
2170021714
"email": "patrick.le-boeuf@bnf.fr",
@@ -21734,6 +21748,7 @@
2173421748
"logo": "https://cidoc-crm.org/sites/default/files/logo3.png",
2173521749
"mappings": {
2173621750
"aberowl": "CIDOC-CRM",
21751+
"bartoc": "1644",
2173721752
"bioportal": "CIDOC-CRM",
2173821753
"fairsharing": "FAIRsharing.9xcr4z",
2173921754
"tib": "cidoc",
@@ -58363,6 +58378,20 @@
5836358378
"uri_format": "http://purl.obolibrary.org/obo/GEO_$1"
5836458379
},
5836558380
"geonames": {
58381+
"bartoc": {
58382+
"description": "The GeoNames geographical database is available for download free of charge under a creative commons attribution license. It contains over 10 million geographical names and consists of over 9 million unique features whereof 2.8 million populated places and 5.5 million alternate names. All features are categorized into one out of nine feature classes and further subcategorized into one out of 645 feature codes. (...) The data is accessible free of charge through a number of webservices and a daily database export. GeoNames is already serving up to over 150 million web service requests per day.\n\nGeoNames is integrating geographical data such as names of places in various languages, elevation, population and others from various sources. All lat/long coordinates are in WGS84 (World Geodetic System 1984). Users may manually edit, correct and add new names using a user friendly wiki interface. GeoNames has Ambassadors in many countries who assist with their help and expertise.",
58383+
"homepage": "http://www.geonames.org/",
58384+
"license": {
58385+
"spdx": "CC-BY-4.0",
58386+
"url": "http://creativecommons.org/licenses/by/4.0/"
58387+
},
58388+
"name": "GeoNames",
58389+
"prefix": "1674",
58390+
"uri_format": "http://www.geonames.org/$1",
58391+
"xrefs": {
58392+
"wikidata": "Q830106"
58393+
}
58394+
},
5836658395
"contact": {
5836758396
"email": "marc@geonames.org",
5836858397
"name": "Marc Wick"
@@ -58401,6 +58430,7 @@
5840158430
"license": "CC-BY-4.0",
5840258431
"logo": "https://www.geonames.org/img/globe.gif",
5840358432
"mappings": {
58433+
"bartoc": "1674",
5840458434
"fairsharing": "FAIRsharing.6dba71",
5840558435
"re3data": "r3d100010245"
5840658436
},
@@ -155667,6 +155697,19 @@
155667155697
]
155668155698
},
155669155699
"unesco.thesaurus": {
155700+
"bartoc": {
155701+
"description": "The UNESCO Thesaurus is a controlled and structured list of concepts used in subject analysis and retrieval of documents and publications in the fields of education, culture, natural sciences, social and human sciences, communication and information. Continuously enriched and updated, its multidisciplinary terminology reflects the evolution of UNESCO's programmes and activities. The first edition of the Thesaurus was released in English in 1977, with French and Spanish translations in 1983 and 1984. The second revised and restructured version was released in 1995. Today the Thesaurus is available in English, French, Russian and Spanish. Concepts are grouped into 7 broad subject areas which are broken down into microthesauri. The UNESCO Thesaurus is compliant with the ISO 25964 standard.",
155702+
"homepage": "http://vocabularies.unesco.org/browser/thesaurus/en/",
155703+
"license": {
155704+
"spdx": "CC BY-SA 3.0",
155705+
"url": "http://creativecommons.org/licenses/by-sa/3.0/"
155706+
},
155707+
"name": "UNESCO Thesaurus",
155708+
"prefix": "40",
155709+
"xrefs": {
155710+
"wikidata": "Q2467479"
155711+
}
155712+
},
155670155713
"contributor": {
155671155714
"email": "cthoyt@gmail.com",
155672155715
"github": "cthoyt",
@@ -155698,6 +155741,7 @@
155698155741
"homepage": "http://vocabularies.unesco.org/thesaurus",
155699155742
"license": "CC-BY-3.0-IGO",
155700155743
"mappings": {
155744+
"bartoc": "40",
155701155745
"fairsharing": "FAIRsharing.81dc5f"
155702155746
},
155703155747
"name": "UNESCO Thesaurus",

src/bioregistry/data/collections.json

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1748,7 +1748,20 @@
17481748
}
17491749
],
17501750
"resources": [
1751-
"bioregistry"
1751+
"bibo",
1752+
"bioregistry",
1753+
"cidoc.crm",
1754+
"geonames",
1755+
"getty.tgn",
1756+
"gnd",
1757+
"iconclass",
1758+
"orcid",
1759+
"ror",
1760+
"skos",
1761+
"time",
1762+
"unesco.thesaurus",
1763+
"viaf",
1764+
"wikidata"
17521765
]
17531766
},
17541767
{

src/bioregistry/data/curated_mappings.sssom.tsv

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,7 @@ bioregistry:cdd skos:exactMatch Not re3data:r3d100012041 semapv:MappingReview or
9999
bioregistry:cgnc skos:exactMatch Not re3data:r3d100012429 semapv:MappingReview orcid:0009-0009-5240-7463
100100
bioregistry:cgsc skos:exactMatch go:CGSC semapv:ManualMappingCuration orcid:0009-0008-8406-631X bioregistry.issue:1496
101101
bioregistry:chr skos:exactMatch aberowl:CHR semapv:ManualMappingCuration orcid:0009-0008-8406-631X bioregistry.issue:1496
102+
bioregistry:cidoc.crm skos:exactMatch bartoc:1644 semapv:ManualMappingCuration orcid:0000-0003-4423-4370 bioregistry.issue:1888
102103
bioregistry:citexplore skos:exactMatch Not aberowl:CTX semapv:ManualMappingCuration orcid:0000-0003-4423-4370
103104
bioregistry:citexplore skos:exactMatch Not bioportal:CTX semapv:ManualMappingCuration orcid:0000-0003-4423-4370
104105
bioregistry:citexplore skos:exactMatch Not fairsharing:FAIRsharing.619eqr semapv:MappingReview orcid:0000-0001-9439-5346 bioregistry.issue:1457
@@ -193,6 +194,7 @@ bioregistry:geo skos:exactMatch Not obofoundry:geo semapv:ManualMappingCuration
193194
bioregistry:geo skos:exactMatch Not ols:geo semapv:ManualMappingCuration orcid:0000-0003-4423-4370
194195
bioregistry:geo skos:exactMatch Not ontobee:GEO semapv:ManualMappingCuration orcid:0000-0003-4423-4370
195196
bioregistry:geo skos:exactMatch Not zazuko:geo semapv:ManualMappingCuration orcid:0000-0003-4423-4370
197+
bioregistry:geonames skos:exactMatch bartoc:1674 semapv:ManualMappingCuration orcid:0000-0003-4423-4370 bioregistry.issue:1888
196198
bioregistry:geonames skos:exactMatch Not fairsharing:FAIRsharing.56a0Uj semapv:ManualMappingCuration orcid:0000-0003-4423-4370
197199
bioregistry:geonames skos:exactMatch fairsharing:FAIRsharing.6dba71 semapv:ManualMappingCuration orcid:0009-0008-8406-631X bioregistry.issue:1496
198200
bioregistry:giardiadb skos:exactMatch integbio:nbdc01782 semapv:ManualMappingCuration orcid:0009-0008-8406-631X bioregistry.issue:1496
@@ -416,6 +418,7 @@ bioregistry:trans skos:exactMatch Not fairsharing:FAIRsharing.nygmp7 semapv:Mapp
416418
bioregistry:tsc skos:exactMatch rrid:TSC semapv:MappingReview orcid:0009-0009-5240-7463
417419
bioregistry:uberon skos:exactMatch togoid:Uberon semapv:ManualMappingCuration orcid:0009-0008-8406-631X bioregistry.issue:1496
418420
bioregistry:ucum skos:exactMatch bartoc:1895 semapv:ManualMappingCuration orcid:0009-0008-8406-631X bioregistry.issue:1496
421+
bioregistry:unesco.thesaurus skos:exactMatch bartoc:40 semapv:ManualMappingCuration orcid:0000-0003-4423-4370 bioregistry.issue:1888
419422
bioregistry:unigene skos:exactMatch re3data:r3d100010774 semapv:ManualMappingCuration orcid:0009-0008-8406-631X bioregistry.issue:1496
420423
bioregistry:unii skos:exactMatch cheminf:000563 semapv:ManualMappingCuration orcid:0009-0008-8406-631X bioregistry.issue:1496
421424
bioregistry:uniprot bioregistry.schema:0000030 uniprot:DB-0004 semapv:MappingReview orcid:0000-0003-4423-4370

0 commit comments

Comments
 (0)