diff --git a/config.json b/config.json
index 58952e44..86675405 100644
--- a/config.json
+++ b/config.json
@@ -12,7 +12,7 @@
   "UMLS_UniProtKB_download_raw_url": "https://raw.githubusercontent.com/cbizon/UMLS_UniProtKB/refs/heads/main/outputs/UMLS_UniProtKB.tsv",
 
   "ncbi_files": ["gene2ensembl.gz", "gene_info.gz", "gene_orthologs.gz", "gene_refseq_uniprotkb_collab.gz", "mim2gene_medgen"],
-  "ubergraph_ontologies": ["UBERON", "CL", "GO", "NCIT", "ECO", "ECTO", "ENVO", "HP", "UPHENO","BFO","BSPO","CARO","CHEBI","CP","GOREL","IAO","MAXO","MONDO","PATO","PR","RO","UBPROP"],
+  "ubergraph_ontologies": ["UBERON", "CL", "GO", "NCIT", "ECO", "ECTO", "ENVO", "HP", "MP", "UPHENO","BFO","BSPO","CARO","CHEBI","CP","GOREL","IAO","MAXO","MONDO","PATO","PR","RO","UBPROP"],
 
   "mods": ["WormBase","FB","MGI","ZFIN","RGD","SGD"],
   "anatomy_prefixes": ["UBERON","GO","CL","UMLS","MESH","NCIT","SNOMEDCT"],
@@ -31,9 +31,9 @@
   "protein_concords": ["UniProtKB","PR","NCIT_UniProtKB","NCIT_UMLS", "UMLS_UniProtKB"],
   "protein_outputs": ["Protein.txt"],
 
-  "disease_labelsandsynonyms": ["MONDO","DOID","Orphanet","HP","MESH","NCIT","UMLS","SNOMEDCT","EFO"],
-  "disease_ids": ["MONDO","DOID","Orphanet","HP","MESH","NCIT","UMLS","OMIM","EFO"],
-  "disease_concords": ["HP","MONDO","UMLS","DOID","EFO", "Manual"],
+  "disease_labelsandsynonyms": ["MONDO","DOID","Orphanet","HP","MP","MESH","NCIT","UMLS","SNOMEDCT","EFO"],
+  "disease_ids": ["MONDO","DOID","Orphanet","HP","MP","MESH","NCIT","UMLS","OMIM","EFO"],
+  "disease_concords": ["HP","MONDO","UMLS","DOID","EFO", "HP_MP","Manual"],
   "disease_outputs": ["Disease.txt", "PhenotypicFeature.txt"],
 
   "process_labels": ["GO","REACT","RHEA","EC","SMPDB","PANTHER.PATHWAY"],
diff --git a/requirements.lock b/requirements.lock
index 987e81f5..c87fc760 100644
--- a/requirements.lock
+++ b/requirements.lock
@@ -1,168 +1,190 @@
-aiohttp==3.8.4
+aiohttp==3.9.1
+aioredis==1.3.1
 aiosignal==1.3.1
-airium==0.2.5
+airium==0.2.6
+annotated-types==0.6.0
 antlr4-python3-runtime==4.9.3
-anyio==3.7.0
+anyio==3.7.1
 appdirs==1.4.4
 apybiomart==0.5.3
-async-timeout==4.0.2
+asgiref==3.7.2
+async-timeout==4.0.3
 asyncio==3.4.3
 attrs==23.1.0
-Babel==2.12.1
+Babel==2.13.1
 bcp47==0.0.4
 beautifulsoup4==4.12.2
 biopython==1.81
-bmt==1.1.1
-cattrs==23.1.2
-certifi==2023.5.7
-cffconvert==2.0.0
+bmt==1.1.3
+cachetools==5.3.2
+cattrs==23.2.2
+certifi==2023.11.17
 chardet==5.2.0
-charset-normalizer==3.1.0
+charset-normalizer==3.3.2
 class-resolver==0.4.2
 click==8.1.7
 colorama==0.4.6
 ConfigArgParse==1.7
 connection-pool==0.0.3
-coverage==7.3.0
-curies==0.6.0
+coverage==7.3.2
+curies==0.6.7
 datrie==0.8.2
+deepdiff==5.8.1
 Deprecated==1.2.14
 deprecation==2.1.0
-docopt==0.6.2
+distlib==0.3.7
+docker==6.1.3
 docutils==0.20.1
 dpath==2.1.6
 EditorConfig==0.12.3
-elasticsearch==7.16.3
 eutils==0.6.0
-fastapi==0.95.0
-fastjsonschema==2.18.0
+fastapi==0.83.0
+fastjsonschema==2.19.0
 fastobo==0.12.2
-frozenlist==1.3.3
+filelock==3.13.1
+frozenlist==1.4.0
 funowl==0.2.3
 ghp-import==2.1.0
-gitdb==4.0.10
-GitPython==3.1.34
-greenlet==2.0.1
+gitdb==4.0.11
+GitPython==3.1.40
+greenlet==3.0.1
 gunicorn==20.1.0
-h11==0.14.0
+h11==0.12.0
 hbreader==0.9.1
+hiredis==2.2.3
+httpcore==0.15.0
+httptools==0.5.0
+httpx==0.23.0
 humanfriendly==10.0
-idna==3.4
+idna==3.6
 ijson==3.2.3
 importlib-metadata==6.8.0
 iniconfig==2.0.0
 isodate==0.6.1
-itsdangerous==2.1.2
 Jinja2==3.1.2
-jsbeautifier==1.14.9
+jsbeautifier==1.14.11
 json-flattener==0.1.9
 jsonasobj==1.3.1
 jsonasobj2==1.0.4
 jsonlines==4.0.0
-jsonschema==3.2.0
-jupyter_core==5.3.1
+jsonschema==4.6.2
+jsonschema-specifications==2023.11.1
+jupyter_core==5.5.0
 kgcl-rdflib==0.5.0
 kgcl-schema==0.6.0
-lark==1.1.7
+lark==1.1.8
 linkml-renderer==0.3.0
-linkml-runtime==1.5.6
+linkml-runtime==1.6.2
 lxml==4.9.3
-Markdown==3.4.4
+Markdown==3.5.1
 MarkupSafe==2.1.3
 mergedeep==1.3.4
-mistune==2.0.3
-mkdocs==1.5.2
-mkdocs-material==9.2.7
-mkdocs-material-extensions==1.1.1
+mkdocs==1.5.3
+mkdocs-material==9.4.14
+mkdocs-material-extensions==1.3.1
 mkdocs-mermaid2-plugin==0.6.0
 more-click==0.1.2
 more-itertools==10.1.0
 multidict==6.0.4
 nbformat==5.9.2
-ndex2==3.5.1
-networkx==3.1
-numpy==1.25.2
-oaklib==0.5.18
+ndex2==3.6.0
+networkx==3.2.1
+numpy==1.26.2
+oaklib==0.5.22
 ols-client==0.1.4
 ontoportal-client==0.0.4
-packaging==23.1
+ordered-set==4.1.0
+orjson==3.8.10
+packaging==23.2
 paginate==0.5.6
-pandas==2.1.0
+pandas==2.1.3
 pansql==0.0.1
 pathspec==0.11.2
-plac==1.3.5
-platformdirs==3.10.0
-pluggy==1.0.0
+plac==1.4.1
+platformdirs==4.0.0
+pluggy==1.3.0
 prefixcommons==0.1.12
-prefixmaps==0.1.5
+prefixmaps==0.1.7
 pronto==2.5.5
-psutil==5.9.5
-psycopg2-binary==2.9.7
+psutil==5.9.6
+psycopg2-binary==2.9.9
 PuLP==2.7.0
-pydantic==1.10.9
-Pygments==2.16.1
+py==1.11.0
+pydantic==1.10.13
+pydantic_core==2.14.5
+Pygments==2.17.2
 PyJSG==0.11.10
-pykwalify==1.8.0
-pymdown-extensions==10.3
+pymdown-extensions==10.5
 pyoxigraph==0.2.5
 pyparsing==3.1.1
-pyrsistent==0.17.3
+pyproject-api==1.6.1
+pyrsistent==0.20.0
 pysolr==3.9.0
-pystow==0.5.0
-pytest==7.3.2
-pytest-cov==4.1.0
+pystow==0.5.2
+pytest==6.2.5
+pytest-asyncio==0.18.3
+pytest-cov==3.0.0
 pytest-logging==2015.11.4
 python-dateutil==2.8.2
 python-Levenshtein-wheels==0.13.2
 PyTrie==0.4.0
-pytz==2021.1
+pytz==2023.3.post1
 PyYAML==6.0.1
 pyyaml_env_tag==0.1
 ratelimit==2.2.1
 rdflib==7.0.0
 rdflib-jsonld==0.6.1
 rdflib-shim==1.0.3
-redis==4.4.2
-regex==2022.10.31
-requests==2.28.2
-requests-cache==1.1.0
+reasoner-pydantic==4.1.5
+redis==3.5.3
+redis-py-cluster==2.1.3
+referencing==0.31.0
+regex==2023.10.3
+requests==2.31.0
+requests-cache==1.1.1
 requests-toolbelt==1.0.0
 reretry==0.11.8
+rfc3986==1.5.0
 rfc3987==1.3.8
-ruamel.yaml==0.17.26
-ruamel.yaml.clib==0.2.7
-scipy==1.11.2
+rpds-py==0.13.1
+scipy==1.11.4
 semsimian==0.2.1
 semsql==0.3.2
 six==1.16.0
-smart-open==6.3.0
-smmap==5.0.0
-snakemake==7.32.3
+smart-open==6.4.0
+smmap==5.0.1
+snakemake==7.32.4
 sniffio==1.3.0
 sortedcontainers==2.4.0
 soupsieve==2.5
 SPARQLWrapper==2.0.0
-SQLAlchemy==2.0.20
+SQLAlchemy==2.0.23
 SQLAlchemy-Utils==0.38.3
-sssom==0.3.40
+sssom==0.3.41
 sssom-schema==0.15.0
-starlette==0.26.1
+starlette==0.19.1
 stopit==1.1.2
 stringcase==1.2.0
 tabulate==0.9.0
+testcontainers==3.6.1
 throttler==1.2.2
+toml==0.10.2
 toposort==1.10
+tox==4.11.3
 tqdm==4.66.1
-traitlets==5.9.0
-typing_extensions==4.6.3
+traitlets==5.13.0
+typing_extensions==4.8.0
 tzdata==2023.3
 url-normalize==1.4.3
-urllib3==1.26.16
-uvicorn==0.22.0
+urllib3==2.1.0
+uvicorn==0.17.6
+uvloop==0.17.0
 validators==0.22.0
+virtualenv==20.24.7
 watchdog==3.0.0
-wrapt==1.15.0
+websocket-client==1.6.4
+wrapt==1.16.0
 xmltodict==0.13.0
-yarl==1.9.2
+yarl==1.9.3
 yte==1.5.1
-zipp==3.16.2
+zipp==3.17.0
diff --git a/requirements.txt b/requirements.txt
index e330149d..31d87eb7 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -25,3 +25,5 @@ beautifulsoup4
 curies
 # Added by Gaurav, May 2024
 duckdb
+# Added by Gaurav, Jul 2024
+sssom
diff --git a/src/createcompendia/diseasephenotype.py b/src/createcompendia/diseasephenotype.py
index 4f6ace16..d31814b2 100644
--- a/src/createcompendia/diseasephenotype.py
+++ b/src/createcompendia/diseasephenotype.py
@@ -1,9 +1,14 @@
+import logging
 from os import path
 from collections import defaultdict
+import requests
+
+from sssom import parsers
+
 import src.datahandlers.obo as obo
-from src.prefixes import MESH, NCIT, MONDO, OMIM, HP, SNOMEDCT, MEDDRA, EFO, ORPHANET, ICD0, ICD9, ICD10, UMLS, KEGGDISEASE
+from src.prefixes import (MESH, NCIT, MONDO, OMIM, HP, SNOMEDCT, MEDDRA, EFO, ORPHANET, ICD0, ICD9, ICD10, UMLS,
+                          KEGGDISEASE, MP)
 from src.categories import DISEASE, PHENOTYPIC_FEATURE
 from src.ubergraph import build_sets
 import src.datahandlers.umls as umls
@@ -39,6 +44,14 @@ def write_hp_ids(outfile):
     phenotype_id = 'HP:0000118'
     write_obo_ids([(phenotype_id,PHENOTYPIC_FEATURE)],outfile)
 
+
+def write_mp_ids(outfile):
+    # Write terms from the Mammalian Phenotype Ontology
+    # https://github.com/TranslatorSRI/Babel/issues/240
+    phenotype_id = 'MP:0000001'
+    write_obo_ids([(phenotype_id,PHENOTYPIC_FEATURE)],outfile)
+
+
 def write_omim_ids(infile,outfile):
     with open(infile,'r') as inf, open(outfile,'w') as outf:
         for line in inf:
@@ -105,6 +118,42 @@ def build_disease_efo_relationships(idfile,outfile):
     efo.make_concords(idfile, outfile)
 
 
+def build_hp_mp_concords(hp_mp_sssom_urls, outfile, threshold=0.8, acceptable_predicates=['skos:exactMatch']):
+    # We rely on the files from the
+    # Mouse-Human Ontology Mapping Initiative (https://github.com/mapping-commons/mh_mapping_initiative)
+
+    if not hp_mp_sssom_urls:
+        raise RuntimeError("build_hp_mp_concords() called without any hp_mp_sssom_urls")
+
+    with open(outfile, "w") as fout:
+        for hp_mp_sssom_url in hp_mp_sssom_urls:
+            count_mappings = 0
+            result = parsers.parse_sssom_table(hp_mp_sssom_url)
+
+            df = result.df
+            if 'confidence' in df.columns:
+                df_filtered = df[(df['confidence'] > threshold)]
+                logging.info(f"Filtered {df.size} to {df_filtered.size} by filtering by confidence > {threshold}")
+            else:
+                df_filtered = df
+
+            for index in df_filtered.index:
+                subject_id = df_filtered['subject_id'][index]
+                object_id = df_filtered['object_id'][index]
+                predicate_id = df_filtered['predicate_id'][index]
+
+                if subject_id == 'sssom:NoTermFound' or object_id == 'sssom:NoTermFound':
+                    continue
+
+                if predicate_id not in acceptable_predicates:
+                    continue
+
+                print(f"{subject_id}\t{predicate_id}\t{object_id}", file=fout)
+                count_mappings += 1
+
+            logging.info(f"Extracted {count_mappings} mappings from {hp_mp_sssom_url}")
+
+
 def build_disease_umls_relationships(mrconso, idfile, outfile, omimfile, ncitfile):
     #UMLS contains xrefs between a disease UMLS and a gene OMIM. So here we are saying: if you are going to link to
     # an omim identifier, make sure it's a disease omim, not some other thing.
@@ -115,7 +164,9 @@ def build_disease_umls_relationships(mrconso, idfile, outfile, omimfile, ncitfil
             for line in inf:
                 x = line.split()[0]
                 good_ids[prefix].add(x)
-    umls.build_sets(mrconso, idfile, outfile, {'SNOMEDCT_US':SNOMEDCT,'MSH': MESH, 'NCI': NCIT, 'HPO': HP, 'MDR':MEDDRA, 'OMIM': OMIM},acceptable_identifiers=good_ids)
+    umls.build_sets(mrconso, idfile, outfile, {
+        'SNOMEDCT_US':SNOMEDCT,'MSH': MESH, 'NCI': NCIT, 'HPO': HP, 'MP': MP, 'MDR':MEDDRA, 'OMIM': OMIM
+    },acceptable_identifiers=good_ids)
 
 def build_disease_doid_relationships(idfile,outfile):
     doid.build_xrefs(idfile, outfile, other_prefixes={'ICD10CM':ICD10, 'ICD9CM':ICD9, 'ICDO': ICD0, 'NCI': NCIT,
@@ -131,7 +182,7 @@ def build_compendium(concordances, identifiers, mondoclose, badxrefs, icrdf_file
     for ifile in identifiers:
         print(ifile)
         new_identifiers,new_types = read_identifier_file(ifile)
-        glom(dicts, new_identifiers, unique_prefixes=[MONDO, HP])
+        glom(dicts, new_identifiers, unique_prefixes=[MONDO, HP, MP])
         types.update(new_types)
     #Load close Mondos
     with open(mondoclose, 'r') as inf:
@@ -162,7 +213,7 @@ def build_compendium(concordances, identifiers, mondoclose, badxrefs, icrdf_file
             newpairs = remove_overused_xrefs(pairs)
         else:
             newpairs = pairs
-        glom(dicts, newpairs, unique_prefixes=[MONDO, HP], close={MONDO:close_mondos})
+        glom(dicts, newpairs, unique_prefixes=[MONDO, HP, MP], close={MONDO:close_mondos})
         try:
             print(dicts['OMIM:607644'])
         except:
@@ -187,7 +238,7 @@ def create_typed_sets(eqsets,types):
         #prefixes = set([ Text.get_curie(x) for x in equivalent_ids])
         prefixes = get_prefixes(equivalent_ids)
         found = False
-        for prefix in [MONDO, HP]:
+        for prefix in [MONDO, HP, MP]:
             if prefix in prefixes and not found:
                 try:
                     mytype = types[prefixes[prefix][0]]
diff --git a/src/prefixes.py b/src/prefixes.py
index 88bbbe75..d7a42da4 100644
--- a/src/prefixes.py
+++ b/src/prefixes.py
@@ -78,3 +78,7 @@
 PMID = 'PMID'
 DOI = 'doi'
 PMC = 'PMC'
+
+# Mammalian Phenotype Ontology (https://www.informatics.jax.org/vocab/mp_ontology,
+# e.g. http://purl.obolibrary.org/obo/MP_0001672)
+MP = 'MP'
\ No newline at end of file
diff --git a/src/snakefiles/diseasephenotype.snakefile b/src/snakefiles/diseasephenotype.snakefile
index b91e07e5..0b2da952 100644
--- a/src/snakefiles/diseasephenotype.snakefile
+++ b/src/snakefiles/diseasephenotype.snakefile
@@ -70,6 +70,12 @@ rule disease_hp_ids:
     run:
         diseasephenotype.write_hp_ids(output.outfile)
 
+rule mammalian_phenotype_ids:
+    output:
+        outfile=config['intermediate_directory']+"/disease/ids/MP"
+    run:
+        diseasephenotype.write_mp_ids(output.outfile)
+
 rule disease_omim_ids:
     input:
         infile=config['download_directory']+"/OMIM/mim2gene.txt"
@@ -115,6 +121,32 @@ rule get_disease_doid_relationships:
     run:
         diseasephenotype.build_disease_doid_relationships(input.infile,output.outfile)
 
+rule get_hp_mp_concord:
+    output:
+        outfile = config['intermediate_directory']+'/disease/concords/HP_MP'
+    run:
+        hp_mp_sssom_urls = [
+            # https://github.com/mapping-commons/mh_mapping_initiative/blob/master/mappings/mp_hp_eye_impc.sssom.tsv
+            'https://raw.githubusercontent.com/mapping-commons/mh_mapping_initiative/master/mappings/mp_hp_eye_impc.sssom.tsv',
+            # https://github.com/mapping-commons/mh_mapping_initiative/blob/master/mappings/mp_hp_hwt_impc.sssom.tsv
+            'https://raw.githubusercontent.com/mapping-commons/mh_mapping_initiative/master/mappings/mp_hp_hwt_impc.sssom.tsv',
+            # https://github.com/mapping-commons/mh_mapping_initiative/blob/master/mappings/mp_hp_mgi_all.sssom.tsv
+            'https://raw.githubusercontent.com/mapping-commons/mh_mapping_initiative/master/mappings/mp_hp_mgi_all.sssom.tsv',
+            # https://github.com/mapping-commons/mh_mapping_initiative/blob/master/mappings/mp_hp_owt_impc.sssom.tsv
+            'https://raw.githubusercontent.com/mapping-commons/mh_mapping_initiative/master/mappings/mp_hp_owt_impc.sssom.tsv',
+            # https://github.com/mapping-commons/mh_mapping_initiative/blob/master/mappings/mp_hp_pat_impc.sssom.tsv
+            'https://raw.githubusercontent.com/mapping-commons/mh_mapping_initiative/master/mappings/mp_hp_pat_impc.sssom.tsv',
+            # https://github.com/mapping-commons/mh_mapping_initiative/blob/master/mappings/mp_hp_pistoia.sssom.tsv
+            'https://raw.githubusercontent.com/mapping-commons/mh_mapping_initiative/master/mappings/mp_hp_pistoia.sssom.tsv',
+            # https://github.com/mapping-commons/mh_mapping_initiative/blob/master/mappings/mp_hp_xry_impc.sssom.tsv
+            'https://raw.githubusercontent.com/mapping-commons/mh_mapping_initiative/master/mappings/mp_hp_xry_impc.sssom.tsv',
+        ]
+        diseasephenotype.build_hp_mp_concords(hp_mp_sssom_urls, output.outfile, threshold=0.8, acceptable_predicates=[
+            'skos:exactMatch',
+            'skos:closeMatch',
+            'skos:relatedMatch'
+        ])
+
 rule disease_manual_concord:
     input:
         infile = 'input_data/manual_concords/disease.txt'
diff --git a/src/ubergraph.py b/src/ubergraph.py
index be158ac2..dd6ab73b 100644
--- a/src/ubergraph.py
+++ b/src/ubergraph.py
@@ -212,6 +212,7 @@ def get_subclasses_of(self,iri):
         prefix CHEBI:
         prefix MONDO:
         prefix HP:
+        prefix MP:
         prefix NCIT:
         prefix PR:
         prefix EFO:
@@ -253,6 +254,7 @@ def get_subclasses_and_smiles(self,iri):
         prefix CHEBIP:
         prefix MONDO:
         prefix HP:
+        prefix MP:
         prefix NCIT:
         prefix PR:
         prefix EFO:
@@ -297,6 +299,7 @@ def get_subclasses_and_xrefs(self,iri):
         prefix CHEBI:
         prefix MONDO:
         prefix HP:
+        prefix MP:
         prefix NCIT:
         prefix PR:
         select distinct ?descendent ?xref
@@ -335,6 +338,7 @@ def get_subclasses_and_exacts(self,iri):
         prefix CHEBI:
         prefix MONDO:
         prefix HP:
+        prefix MP:
         prefix EFO:
         prefix NCIT:
         PREFIX EXACT_MATCH:
@@ -397,6 +401,7 @@ def get_subclasses_and_close(self,iri):
         prefix CHEBI:
         prefix MONDO:
         prefix HP:
+        prefix MP:
         prefix EFO:
         prefix NCIT:
         PREFIX CLOSE_MATCH:
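
For reference, a minimal sketch (not part of the patch) of how the build_hp_mp_concords() helper added above might be exercised on a single SSSOM mapping file outside of Snakemake. The output path, logging setup, and import style here are illustrative assumptions; in the pipeline this is driven by the get_hp_mp_concord rule.

# Illustrative sketch only: runs the new HP-MP concord builder on one
# Mouse-Human Ontology Mapping Initiative file listed in the Snakemake rule above.
import logging

from src.createcompendia import diseasephenotype  # assumes the repo root is on sys.path

logging.basicConfig(level=logging.INFO)

mgi_url = "https://raw.githubusercontent.com/mapping-commons/mh_mapping_initiative/master/mappings/mp_hp_mgi_all.sssom.tsv"

diseasephenotype.build_hp_mp_concords(
    [mgi_url],
    "HP_MP_sample.concord",                     # hypothetical output path; Snakemake normally supplies this
    threshold=0.8,                              # keep rows with confidence > 0.8 when a confidence column is present
    acceptable_predicates=["skos:exactMatch"],  # the function's default; the Snakemake rule also allows close/related matches
)

# Each output line is "<subject CURIE>\t<predicate>\t<object CURIE>", which
# build_compendium() then gloms together with unique_prefixes=[MONDO, HP, MP].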