diff --git a/config.json b/config.json
index ad38c949..489232be 100644
--- a/config.json
+++ b/config.json
@@ -5,8 +5,8 @@
   "output_directory": "babel_outputs",
   "biolink_version": "3.5.4",
-  "umls_version": "2023AA",
-  "rxnorm_version": "08072023",
+  "umls_version": "2023AB",
+  "rxnorm_version": "11062023",
   "ncbi_files": ["gene2ensembl.gz", "gene_info.gz", "gene_orthologs.gz", "gene_refseq_uniprotkb_collab.gz", "mim2gene_medgen"],
   "ubergraph_ontologies": ["UBERON", "CL", "GO", "NCIT", "ECO", "ECTO", "ENVO", "HP", "UPHENO","BFO","BSPO","CARO","CHEBI","CP","GOREL","IAO","MAXO","MONDO","PATO","PR","RO","UBPROP"],
diff --git a/kubernetes/babel-private.k8s.yaml b/kubernetes/babel-private.k8s.yaml
deleted file mode 100644
index 41926028..00000000
--- a/kubernetes/babel-private.k8s.yaml
+++ /dev/null
@@ -1,19 +0,0 @@
-# Kubernetes file for setting up a private volume to use for Babel.
-#
-# This private volume is only needed to store some "private" data, such
-# as UMLS files, that should not be included in the Docker image.
-# The private volume only needs to be ~5Gi in size.
-
-apiVersion: v1
-kind: PersistentVolumeClaim
-metadata:
-  name: babel-private
-  labels:
-    app: babel
-spec:
-  accessModes:
-    - ReadWriteOnce
-  resources:
-    requests:
-      storage: 10Gi
-  storageClassName: basic
diff --git a/kubernetes/babel.k8s.yaml b/kubernetes/babel.k8s.yaml
index efce9715..eb4050fd 100644
--- a/kubernetes/babel.k8s.yaml
+++ b/kubernetes/babel.k8s.yaml
@@ -24,8 +24,6 @@ spec:
       command: [ "/bin/bash", "-c", "--" ]
       args: [ "while true; echo Running; do sleep 30; done;" ]
       volumeMounts:
-        - mountPath: "/code/babel/input_data/private"
-          name: babel-private
        - mountPath: "/code/babel/babel_downloads"
          name: babel-downloads
        - mountPath: "/code/babel/babel_outputs"
@@ -38,9 +36,6 @@ spec:
          memory: "500G"
          cpu: "4"
      volumes:
-        - name: babel-private
-          persistentVolumeClaim:
-            claimName: babel-private
        - name: babel-downloads
          persistentVolumeClaim:
            claimName: babel-downloads
diff --git a/src/createcompendia/drugchemical.py b/src/createcompendia/drugchemical.py
index 1068a042..0cf68175 100644
--- a/src/createcompendia/drugchemical.py
+++ b/src/createcompendia/drugchemical.py
@@ -273,11 +273,27 @@ def build_conflation(rxn_concord,umls_concord,pubchem_rxn_concord,drug_compendiu
             x = line.strip().split('\t')
             subject = x[0]
             object = x[2]
-            #object is a PUBCHEM. It's by definition a clique_leader.
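+            # Both subject and object are RxCUIs at this point; remap each to the
+            # clique it belongs to before recording the pair.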
+
             if subject in drug_rxcui_to_clique:
                 subject = drug_rxcui_to_clique[subject]
             elif subject in chemical_rxcui_to_clique:
                 subject = chemical_rxcui_to_clique[subject]
+            else:
+                raise RuntimeError(f"Unknown identifier in drugchemical conflation as subject: {subject}")
+
+            if object in drug_rxcui_to_clique:
+                object = drug_rxcui_to_clique[object]
+            elif object in chemical_rxcui_to_clique:
+                object = chemical_rxcui_to_clique[object]
+            else:
+                logging.warning(
+                    f"Skipping subject-object pair ({subject}, {object}) because the object RxCUI isn't mapped to a clique"
+                )
+                continue
+                # raise RuntimeError(f"Unknown identifier in drugchemical conflation as object: {object}")
+
             pairs.append((subject, object))
     print("glom")
     gloms = {}
diff --git a/src/createcompendia/protein.py b/src/createcompendia/protein.py
index 05fc705d..06bc6fbd 100644
--- a/src/createcompendia/protein.py
+++ b/src/createcompendia/protein.py
@@ -65,6 +65,7 @@ def write_ensembl_ids(ensembl_dir, outfile):
         dlpath = os.path.join(ensembl_dir, dl)
         if os.path.isdir(dlpath):
             infname = os.path.join(dlpath, 'BioMart.tsv')
+            print(f'write_ensembl_ids for input filename {infname}')
             if os.path.exists(infname):
                 # open each ensembl file, find the id column, and put it in the output
                 with open(infname, 'r') as inf:
diff --git a/src/datahandlers/ensembl.py b/src/datahandlers/ensembl.py
index 04b67e78..804284c3 100644
--- a/src/datahandlers/ensembl.py
+++ b/src/datahandlers/ensembl.py
@@ -12,11 +12,17 @@
 # just what we need.
 def pull_ensembl(complete_file):
     f = find_datasets()
+
+    skip_dataset_ids = {'hgfemale_gene_ensembl'}
+
     cols = {"ensembl_gene_id", "ensembl_peptide_id", "description", "external_gene_name", "external_gene_source",
             "external_synonym", "chromosome_name", "source", "gene_biotype", "entrezgene_id", "zfin_id_id",
             'mgi_id', 'rgd_id', 'flybase_gene_id', 'sgd_gene', 'wormbase_gene'}
     for ds in f['Dataset_ID']:
         print(ds)
+        if ds in skip_dataset_ids:
+            print(f'Skipping {ds} as it is included in skip_dataset_ids: {skip_dataset_ids}')
+            continue
         outfile = make_local_name('BioMart.tsv', subpath=f'ENSEMBL/{ds}')
         # Really, we should let snakemake handle this, but then we would need to put a list of all the 200+ sets in our
         # config, and keep it up to date. Maybe you could have a job that gets the datasets and writes a dataset file,
diff --git a/src/datahandlers/hgnc.py b/src/datahandlers/hgnc.py
index 23fdd265..1776ee6d 100644
--- a/src/datahandlers/hgnc.py
+++ b/src/datahandlers/hgnc.py
@@ -1,9 +1,16 @@
-from src.babel_utils import make_local_name, pull_via_ftp
+from src.babel_utils import make_local_name, pull_via_urllib
 import json
 
 def pull_hgnc():
-    outfile='HGNC/hgnc_complete_set.json'
-    pull_via_ftp('ftp.ebi.ac.uk', '/pub/databases/genenames/new/json', 'hgnc_complete_set.json',outfilename=outfile)
+    # On 2023-11-26, I would get an error trying to download this file using FTP from Python
+    # (although, weirdly enough, I could download the file without any problem using macOS
+    # Finder), so I changed it to use HTTPS instead.
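+    # hgnc_complete_set.json is served uncompressed, hence decompress=False below.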
+    pull_via_urllib(
+        'https://ftp.ebi.ac.uk/pub/databases/genenames/new/json/',
+        'hgnc_complete_set.json',
+        decompress=False,
+        subpath="HGNC")
 
 def pull_hgnc_labels_and_synonyms(infile):
     with open(infile,'r') as data:
diff --git a/src/datahandlers/uniprotkb.py b/src/datahandlers/uniprotkb.py
index 461356e1..d4c95eea 100644
--- a/src/datahandlers/uniprotkb.py
+++ b/src/datahandlers/uniprotkb.py
@@ -1,7 +1,5 @@
 from src.babel_utils import pull_via_urllib, make_local_name
 
-def pull_one_uniprotkb(which):
-    pull_via_urllib('ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/',f'uniprot_{which}.fasta.gz',subpath='UniProtKB')
 
 def readlabels(which):
     swissname = make_local_name(f'UniProtKB/uniprot_{which}.fasta')
@@ -17,11 +15,6 @@ def readlabels(which):
         swissprot_labels[uniprotid] = f'{name} ({which})'
     return swissprot_labels
 
-def pull_uniprotkb():
-    pull_via_urllib('https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/',f'idmapping.dat.gz',subpath='UniProtKB')
-    for which in ['sprot','trembl']:
-        pull_via_urllib('https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/',f'uniprot_{which}.fasta.gz',subpath='UniProtKB')
-
 def pull_uniprot_labels(sprotfile,tremblfile,fname):
     slabels = readlabels('sprot')
     tlabels = readlabels('trembl')
diff --git a/src/snakefiles/datacollect.snakefile b/src/snakefiles/datacollect.snakefile
index f9b32534..563d934a 100644
--- a/src/snakefiles/datacollect.snakefile
+++ b/src/snakefiles/datacollect.snakefile
@@ -91,13 +91,20 @@ rule get_mods_labels:
 
 ### UniProtKB
 
-rule get_uniprotkb:
+rule get_uniprotkb_idmapping:
     output:
-        config['download_directory']+'/UniProtKB/uniprot_sprot.fasta',
-        config['download_directory']+'/UniProtKB/uniprot_trembl.fasta',
-        config['download_directory']+'/UniProtKB/idmapping.dat'
-    run:
-        uniprotkb.pull_uniprotkb()
+        idmapping = config['download_directory']+'/UniProtKB/idmapping.dat'
+    shell: """wget --continue --tries=10 "https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/idmapping.dat.gz" -O {output.idmapping}.gz && gunzip -k {output.idmapping}.gz"""
+
+rule get_uniprotkb_sprot:
+    output:
+        uniprot_sprot = config['download_directory']+'/UniProtKB/uniprot_sprot.fasta'
+    shell: """wget --continue --tries=10 "https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz" -O {output.uniprot_sprot}.gz && gunzip -k {output.uniprot_sprot}.gz"""
+
+rule get_uniprotkb_trembl:
+    output:
+        uniprot_trembl = config['download_directory']+'/UniProtKB/uniprot_trembl.fasta'
+    shell: """wget --continue --tries=10 "https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_trembl.fasta.gz" -O {output.uniprot_trembl}.gz && gunzip -k {output.uniprot_trembl}.gz"""
 
 rule get_uniprotkb_labels:
     input:
@@ -361,7 +368,7 @@ rule get_panther_pathways:
     output:
         outfile = config['download_directory'] + '/PANTHER.PATHWAY/SequenceAssociationPathway3.6.7.txt'
     run:
-        pantherpathways.pull_panther_pathways(output.outfile)
+        pantherpathways.pull_panther_pathways()
 
 rule get_panther_pathway_labels:
     input:
diff --git a/src/snakefiles/geneprotein.snakefile b/src/snakefiles/geneprotein.snakefile
index 199cc6fc..a89ecc64 100644
--- a/src/snakefiles/geneprotein.snakefile
+++ b/src/snakefiles/geneprotein.snakefile
@@ -21,9 +21,20 @@ rule geneprotein_conflation:
     run:
         geneprotein.build_conflation(input.geneprotein_concord,input.gene_compendium,input.protein_compendium,output.outfile)
 
+rule geneprotein_conflated_synonyms:
+    input:
+        geneprotein_conflations=[config['output_directory']+'/conflation/GeneProtein.txt'],
+        gene_outputs=expand("{od}/synonyms/{ap}", od = config['output_directory'], ap = config['gene_outputs']),
+        protein_outputs=expand("{od}/synonyms/{ap}", od = config['output_directory'], ap = config['protein_outputs']),
+    output:
+        geneprotein_conflated_synonyms=config['output_directory']+'/synonyms/GeneProteinConflated.txt'
+    run:
+        synonymconflation.conflate_synonyms(input.gene_outputs + input.protein_outputs, input.geneprotein_conflations, output.geneprotein_conflated_synonyms)
+
 rule geneprotein:
     input:
-        config['output_directory']+'/conflation/GeneProtein.txt'
+        config['output_directory']+'/conflation/GeneProtein.txt',
+        config['output_directory']+'/synonyms/GeneProteinConflated.txt'
     output:
         x=config['output_directory']+'/reports/geneprotein_done'
     shell:
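
For readers tracing the drugchemical.py change above, here is a minimal self-contained sketch of the remapping step it introduces. Only the dictionary names and the control flow (raise on an unmapped subject, warn and skip on an unmapped object) come from the diff; the toy identifiers and input lines are invented for illustration.

import logging

# Toy RxCUI-to-clique maps standing in for the ones built from the compendia.
drug_rxcui_to_clique = {"RXCUI:100": "CHEBI:1"}
chemical_rxcui_to_clique = {"RXCUI:200": "PUBCHEM.COMPOUND:2"}

lines = [
    "RXCUI:100\trelated_to\tRXCUI:200",  # both sides map: pair is kept
    "RXCUI:100\trelated_to\tRXCUI:999",  # object unmapped: warned and skipped
]

pairs = []
for line in lines:
    x = line.strip().split("\t")
    subject, object = x[0], x[2]

    if subject in drug_rxcui_to_clique:
        subject = drug_rxcui_to_clique[subject]
    elif subject in chemical_rxcui_to_clique:
        subject = chemical_rxcui_to_clique[subject]
    else:
        # An unmapped subject is treated as a hard error in the diff.
        raise RuntimeError(f"Unknown identifier as subject: {subject}")

    if object in drug_rxcui_to_clique:
        object = drug_rxcui_to_clique[object]
    elif object in chemical_rxcui_to_clique:
        object = chemical_rxcui_to_clique[object]
    else:
        # An unmapped object is only warned about, and the pair is dropped.
        logging.warning(f"Skipping ({subject}, {object}): object not mapped to a clique")
        continue

    pairs.append((subject, object))

print(pairs)  # [('CHEBI:1', 'PUBCHEM.COMPOUND:2')]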