Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 26 additions & 25 deletions src/bioregistry/external/uniprot/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,43 +26,55 @@
RAW_PATH = RAW_DIRECTORY / "uniprot.json"
PROCESSED_PATH = DIRECTORY / "processed.json"

# the field to use as the prefix in UniProt dblist records
PREFIX_FIELD = "abbrev"

#: resources with these UniProt prefixes don't exist anymore
skip_prefixes = {
UNIPROT_SKIP_PREFIXES = {
"UniPathway", # doesn't exist anymore
"BRENDA", # has bad format string contains EC, UniProt, and taxon
"eggNOG", # not sure what this does
"PlantReactome", # incomprehensible URLs
"Reactome", # incomprehensible URLs
"DB-0180", # genewiki
"GeneWiki", # genewiki abbrev
}

HAS_BAD_URI = {
"DB-0148", # ensembl.fungi
"DB-0147", # ensembl.bacteria
"EnsemblFungi", # ensembl.fungi
"EnsemblBacteria", # ensembl.bacteria
}


def process_uniprot_raw(path: Path) -> dict[str, Record]:
"""Process UniProt raw JSON."""
rv = {}
for record in json.loads(path.read_text())["results"]:
prefix = record.pop("id")
if prefix in skip_prefixes:
continue
processed_record = _process_record(prefix, record)
if processed_record is None:
prefix = record.pop(PREFIX_FIELD)
if prefix in UNIPROT_SKIP_PREFIXES:
continue
rv[prefix] = processed_record
if processed_record := _process_record(prefix, record):
rv[prefix] = processed_record
return rv


def _process_record(prefix: str, record: dict[str, Any]) -> Record | None:
rv = {
"name": record.pop("name"),
"abbreviation": record.pop("abbrev"),
"homepage": record.pop("servers")[0],
"keywords": [record.pop("category")],
}

value = record.pop("dbUrl")
if "%s" in value and "%u" in value:
logger.debug("has both formats: %s", value)
return None

value = value.replace("%s", "$1").replace("%u", "$1")
if "$1" in value and prefix not in HAS_BAD_URI:
rv[URI_FORMAT_KEY] = value
else:
logger.debug("no annotation in %s", prefix)

publication = {}
if doi := record.pop("doiId", None):
doi = doi.lower().rstrip(".")
Expand All @@ -75,19 +87,9 @@ def _process_record(prefix: str, record: dict[str, Any]) -> Record | None:
if publication:
rv["publications"] = [publication]

del record["linkType"]
del record["statistics"]
for key in ["id", "linkType", "statistics"]:
del record[key]

value = record.pop("dbUrl")
if "%s" in value and "%u" in value:
logger.debug(f"has both formats: {value}")
return None
else:
value = value.replace("%s", "$1").replace("%u", "$1")
if "$1" in value and prefix not in HAS_BAD_URI:
rv[URI_FORMAT_KEY] = value
else:
logger.debug("no annotation in %s", prefix)
if record:
logger.debug("forgot something: %s", record)
return Record.model_validate(rv)
Expand All @@ -105,9 +107,8 @@ class UniProtAligner(Aligner):
"""Aligner for UniProt."""

key = "uniprot"
alt_key_match = "abbreviation"
getter = get_uniprot
curation_header: ClassVar[Sequence[str]] = ("abbreviation", "name", URI_FORMAT_KEY, "keywords")
curation_header: ClassVar[Sequence[str]] = ("name", URI_FORMAT_KEY, "keywords")


if __name__ == "__main__":
Expand Down
Loading
Loading