NCATSTranslator · gaurav · Oct 13, 2023 · Oct 14, 2023 · Oct 14, 2023 · Nov 3, 2023
diff --git a/config.json b/config.json
@@ -5,8 +5,8 @@
   "output_directory": "babel_outputs",
 
   "biolink_version": "3.5.4",
-  "umls_version": "2023AA",
-  "rxnorm_version": "08072023",
+  "umls_version": "2023AB",
+  "rxnorm_version": "11062023",
 
   "ncbi_files": ["gene2ensembl.gz", "gene_info.gz", "gene_orthologs.gz", "gene_refseq_uniprotkb_collab.gz", "mim2gene_medgen"],
   "ubergraph_ontologies": ["UBERON", "CL", "GO", "NCIT", "ECO", "ECTO", "ENVO", "HP", "UPHENO","BFO","BSPO","CARO","CHEBI","CP","GOREL","IAO","MAXO","MONDO","PATO","PR","RO","UBPROP"],

diff --git a/kubernetes/babel-private.k8s.yaml b/kubernetes/babel-private.k8s.yaml
diff --git a/kubernetes/babel.k8s.yaml b/kubernetes/babel.k8s.yaml
@@ -24,8 +24,6 @@ spec:
     command: [ "/bin/bash", "-c", "--" ]
     args: [ "while true; echo Running; do sleep 30; done;" ]
     volumeMounts:
-    - mountPath: "/code/babel/input_data/private"
-      name: babel-private
     - mountPath: "/code/babel/babel_downloads"
       name: babel-downloads
     - mountPath: "/code/babel/babel_outputs"
@@ -38,9 +36,6 @@ spec:
         memory: "500G"
         cpu: "4"
   volumes:
-    - name: babel-private
-      persistentVolumeClaim:
-        claimName: babel-private
     - name: babel-downloads
       persistentVolumeClaim:
         claimName: babel-downloads

diff --git a/src/createcompendia/drugchemical.py b/src/createcompendia/drugchemical.py
@@ -273,11 +273,25 @@ def build_conflation(rxn_concord,umls_concord,pubchem_rxn_concord,drug_compendiu
             x = line.strip().split('\t')
             subject = x[0]
             object = x[2]
-            #object is a PUBCHEM.  It's by definition a clique_leader.
+
             if subject in drug_rxcui_to_clique:
                 subject = drug_rxcui_to_clique[subject]
             elif subject in chemical_rxcui_to_clique:
                 subject = chemical_rxcui_to_clique[subject]
+            else:
+                raise RuntimeError(f"Unknown identifier in drugchemical conflation as subject: {subject}")
+
+            if object in drug_rxcui_to_clique:
+                object = drug_rxcui_to_clique[object]
+            elif object in chemical_rxcui_to_clique:
+                object = chemical_rxcui_to_clique[object]
+            else:
+                logging.warning(
+                    f"Skipping subject-object pair ({subject}, {object}) because the object isn't mapped to a RxCUI"
+                )
+                continue
+                # Disabled alternative to skipping the pair: raise RuntimeError(f"Unknown identifier in drugchemical conflation as object: {object}")
+
             pairs.append((subject, object))
     print("glom")
     gloms = {}

diff --git a/src/createcompendia/protein.py b/src/createcompendia/protein.py
@@ -65,6 +65,7 @@ def write_ensembl_ids(ensembl_dir, outfile):
             dlpath = os.path.join(ensembl_dir, dl)
             if os.path.isdir(dlpath):
                 infname = os.path.join(dlpath, 'BioMart.tsv')
+                print(f'write_ensembl_ids for input filename {infname}')
                 if os.path.exists(infname):
                     # open each ensembl file, find the id column, and put it in the output
                     with open(infname, 'r') as inf:

diff --git a/src/datahandlers/ensembl.py b/src/datahandlers/ensembl.py
@@ -12,11 +12,17 @@
 # just what we need.
 def pull_ensembl(complete_file):
     f = find_datasets()
+
+    skip_dataset_ids = {'hgfemale_gene_ensembl'}
+
     cols = {"ensembl_gene_id", "ensembl_peptide_id", "description", "external_gene_name", "external_gene_source",
             "external_synonym", "chromosome_name", "source", "gene_biotype", "entrezgene_id", "zfin_id_id", 'mgi_id',
             'rgd_id', 'flybase_gene_id', 'sgd_gene', 'wormbase_gene'}
     for ds in f['Dataset_ID']:
         print(ds)
+        if ds in skip_dataset_ids:
+            print(f'Skipping {ds} as it is included in skip_dataset_ids: {skip_dataset_ids}')
+            continue
         outfile = make_local_name('BioMart.tsv', subpath=f'ENSEMBL/{ds}')
         # Really, we should let snakemake handle this, but then we would need to put a list of all the 200+ sets in our
         # config, and keep it up to date.  Maybe you could have a job that gets the datasets and writes a dataset file,

diff --git a/src/datahandlers/hgnc.py b/src/datahandlers/hgnc.py
@@ -1,9 +1,15 @@
-from src.babel_utils import make_local_name, pull_via_ftp
+from src.babel_utils import make_local_name, pull_via_urllib
 import json
 
 def pull_hgnc():
-    outfile='HGNC/hgnc_complete_set.json'
-    pull_via_ftp('ftp.ebi.ac.uk', '/pub/databases/genenames/new/json', 'hgnc_complete_set.json',outfilename=outfile)
+    # On 2023nov26, downloading this file over FTP from Python failed with an error (although,
+    # weirdly enough, macOS Finder could download it without any problem), so this now fetches
+    # the file over HTTPS from the same host (ftp.ebi.ac.uk) instead.
+    pull_via_urllib(
+        'https://ftp.ebi.ac.uk/pub/databases/genenames/new/json/',
+        'hgnc_complete_set.json',
+        decompress=False,
+        subpath="HGNC")
 
 def pull_hgnc_labels_and_synonyms(infile):
     with open(infile,'r') as data:

diff --git a/src/datahandlers/uniprotkb.py b/src/datahandlers/uniprotkb.py
@@ -1,7 +1,5 @@
 from src.babel_utils import pull_via_urllib, make_local_name
 
-def pull_one_uniprotkb(which):
-    pull_via_urllib('ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/',f'uniprot_{which}.fasta.gz',subpath='UniProtKB')
 
 def readlabels(which):
     swissname = make_local_name(f'UniProtKB/uniprot_{which}.fasta')
@@ -17,11 +15,6 @@ def readlabels(which):
                 swissprot_labels[uniprotid] = f'{name} ({which})'
     return swissprot_labels
 
-def pull_uniprotkb():
-    pull_via_urllib('https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/',f'idmapping.dat.gz',subpath='UniProtKB')
-    for which in ['sprot','trembl']:
-        pull_via_urllib('https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/',f'uniprot_{which}.fasta.gz',subpath='UniProtKB')
-
 def pull_uniprot_labels(sprotfile,tremblfile,fname):
     slabels = readlabels('sprot')
     tlabels = readlabels('trembl')

diff --git a/src/snakefiles/datacollect.snakefile b/src/snakefiles/datacollect.snakefile
@@ -91,13 +91,20 @@ rule get_mods_labels:
 
 ### UniProtKB
 
-rule get_uniprotkb:
+rule get_uniprotkb_idmapping:
     output:
-        config['download_directory']+'/UniProtKB/uniprot_sprot.fasta',
-        config['download_directory']+'/UniProtKB/uniprot_trembl.fasta',
-        config['download_directory']+'/UniProtKB/idmapping.dat'
-    run:
-        uniprotkb.pull_uniprotkb()
+        idmapping = config['download_directory']+'/UniProtKB/idmapping.dat'
+    shell: """wget --continue --tries=10 "https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/idmapping.dat.gz" -O {output.idmapping}.gz && gunzip -k {output.idmapping}.gz"""
+
+rule get_uniprotkb_sprot:
+    output:
+        uniprot_sprot = config['download_directory']+'/UniProtKB/uniprot_sprot.fasta'
+    shell: """wget --continue --tries=10 "https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz" -O {output.uniprot_sprot}.gz && gunzip -k {output.uniprot_sprot}.gz"""
+
+rule get_uniprotkb_trembl:
+    output:
+        uniprot_trembl = config['download_directory']+'/UniProtKB/uniprot_trembl.fasta'
+    shell: """wget --continue --tries=10 "https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_trembl.fasta.gz" -O {output.uniprot_trembl}.gz && gunzip -k {output.uniprot_trembl}.gz"""
 
 rule get_uniprotkb_labels:
     input:
@@ -361,7 +368,7 @@ rule get_panther_pathways:
     output:
         outfile = config['download_directory'] + '/PANTHER.PATHWAY/SequenceAssociationPathway3.6.7.txt'
     run:
-        pantherpathways.pull_panther_pathways(output.outfile)
+        pantherpathways.pull_panther_pathways()
 
 rule get_panther_pathway_labels:
     input:

diff --git a/src/snakefiles/geneprotein.snakefile b/src/snakefiles/geneprotein.snakefile
@@ -21,9 +21,20 @@ rule geneprotein_conflation:
     run:
         geneprotein.build_conflation(input.geneprotein_concord,input.gene_compendium,input.protein_compendium,output.outfile)
 
+rule geneprotein_conflated_synonyms:
+    input:
+        geneprotein_conflations=[config['output_directory']+'/conflation/GeneProtein.txt'],
+        gene_outputs=expand("{od}/synonyms/{ap}", od = config['output_directory'], ap = config['gene_outputs']),
+        protein_outputs=expand("{od}/synonyms/{ap}", od = config['output_directory'], ap = config['protein_outputs']),
+    output:
+        geneprotein_conflated_synonyms=config['output_directory']+'/synonyms/GeneProteinConflated.txt'
+    run:
+        synonymconflation.conflate_synonyms(input.gene_outputs + input.protein_outputs, input.geneprotein_conflations, output.geneprotein_conflated_synonyms)
+
 rule geneprotein:
     input:
-        config['output_directory']+'/conflation/GeneProtein.txt'
+        config['output_directory']+'/conflation/GeneProtein.txt',
+        config['output_directory']+'/synonyms/GeneProteinConflated.txt'
     output:
         x=config['output_directory']+'/reports/geneprotein_done'
     shell: