Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions config.json
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@
"output_directory": "babel_outputs",

"biolink_version": "3.5.4",
"umls_version": "2023AA",
"rxnorm_version": "08072023",
"umls_version": "2023AB",
"rxnorm_version": "11062023",

"ncbi_files": ["gene2ensembl.gz", "gene_info.gz", "gene_orthologs.gz", "gene_refseq_uniprotkb_collab.gz", "mim2gene_medgen"],
"ubergraph_ontologies": ["UBERON", "CL", "GO", "NCIT", "ECO", "ECTO", "ENVO", "HP", "UPHENO","BFO","BSPO","CARO","CHEBI","CP","GOREL","IAO","MAXO","MONDO","PATO","PR","RO","UBPROP"],
Expand Down
19 changes: 0 additions & 19 deletions kubernetes/babel-private.k8s.yaml

This file was deleted.

5 changes: 0 additions & 5 deletions kubernetes/babel.k8s.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,6 @@ spec:
command: [ "/bin/bash", "-c", "--" ]
args: [ "while true; echo Running; do sleep 30; done;" ]
volumeMounts:
- mountPath: "/code/babel/input_data/private"
name: babel-private
- mountPath: "/code/babel/babel_downloads"
name: babel-downloads
- mountPath: "/code/babel/babel_outputs"
Expand All @@ -38,9 +36,6 @@ spec:
memory: "500G"
cpu: "4"
volumes:
- name: babel-private
persistentVolumeClaim:
claimName: babel-private
- name: babel-downloads
persistentVolumeClaim:
claimName: babel-downloads
Expand Down
16 changes: 15 additions & 1 deletion src/createcompendia/drugchemical.py
Original file line number Diff line number Diff line change
Expand Up @@ -273,11 +273,25 @@ def build_conflation(rxn_concord,umls_concord,pubchem_rxn_concord,drug_compendiu
x = line.strip().split('\t')
subject = x[0]
object = x[2]
#object is a PUBCHEM. It's by definition a clique_leader.

if subject in drug_rxcui_to_clique:
subject = drug_rxcui_to_clique[subject]
elif subject in chemical_rxcui_to_clique:
subject = chemical_rxcui_to_clique[subject]
else:
raise RuntimeError(f"Unknown identifier in drugchemical conflation as subject: {subject}")

if object in drug_rxcui_to_clique:
object = drug_rxcui_to_clique[object]
elif object in chemical_rxcui_to_clique:
object = chemical_rxcui_to_clique[object]
else:
logging.warning(
f"Skipping subject-object pair ({subject}, {object}) because the object isn't mapped to a RxCUI"
)
continue
# raise RuntimeError(f"Unknown identifier in drugchemical conflation as object: {object}")

pairs.append((subject, object))
print("glom")
gloms = {}
Expand Down
1 change: 1 addition & 0 deletions src/createcompendia/protein.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ def write_ensembl_ids(ensembl_dir, outfile):
dlpath = os.path.join(ensembl_dir, dl)
if os.path.isdir(dlpath):
infname = os.path.join(dlpath, 'BioMart.tsv')
print(f'write_ensembl_ids for input filename {infname}')
if os.path.exists(infname):
# open each ensembl file, find the id column, and put it in the output
with open(infname, 'r') as inf:
Expand Down
6 changes: 6 additions & 0 deletions src/datahandlers/ensembl.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,17 @@
# just what we need.
def pull_ensembl(complete_file):
f = find_datasets()

skip_dataset_ids = {'hgfemale_gene_ensembl'}

cols = {"ensembl_gene_id", "ensembl_peptide_id", "description", "external_gene_name", "external_gene_source",
"external_synonym", "chromosome_name", "source", "gene_biotype", "entrezgene_id", "zfin_id_id", 'mgi_id',
'rgd_id', 'flybase_gene_id', 'sgd_gene', 'wormbase_gene'}
for ds in f['Dataset_ID']:
print(ds)
if ds in skip_dataset_ids:
print(f'Skipping {ds} as it is included in skip_dataset_ids: {skip_dataset_ids}')
continue
outfile = make_local_name('BioMart.tsv', subpath=f'ENSEMBL/{ds}')
# Really, we should let snakemake handle this, but then we would need to put a list of all the 200+ sets in our
# config, and keep it up to date. Maybe you could have a job that gets the datasets and writes a dataset file,
Expand Down
12 changes: 9 additions & 3 deletions src/datahandlers/hgnc.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,15 @@
from src.babel_utils import make_local_name, pull_via_ftp
from src.babel_utils import make_local_name, pull_via_urllib
import json

def pull_hgnc():
    """Download the complete HGNC gene set (``hgnc_complete_set.json``).

    The file is fetched over HTTPS from the EBI genenames directory with
    ``pull_via_urllib``, passing ``decompress=False`` and ``subpath="HGNC"``.
    """
    # On 2023nov26, downloading this file over FTP from Python raised an error
    # (although, weirdly enough, macOS Finder could fetch it without any
    # problem), so HTTPS is the only download path: the earlier pull_via_ftp
    # call is removed rather than left to run (and fail) before this one.
    # NOTE(review): presumably pull_via_urllib stores the file where the FTP
    # call's outfilename 'HGNC/hgnc_complete_set.json' pointed -- confirm.
    pull_via_urllib(
        'https://ftp.ebi.ac.uk/pub/databases/genenames/new/json/',
        'hgnc_complete_set.json',
        decompress=False,
        subpath="HGNC")

def pull_hgnc_labels_and_synonyms(infile):
with open(infile,'r') as data:
Expand Down
7 changes: 0 additions & 7 deletions src/datahandlers/uniprotkb.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
from src.babel_utils import pull_via_urllib, make_local_name

def pull_one_uniprotkb(which):
    """Download a single UniProtKB FASTA archive, ``uniprot_{which}.fasta.gz``.

    ``which`` is interpolated into the file name; elsewhere in this module the
    values used are 'sprot' and 'trembl'.
    """
    # Use the HTTPS endpoint rather than ftp://, so this agrees with
    # pull_uniprotkb(), which fetches the same directory over HTTPS.
    pull_via_urllib(
        'https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/',
        f'uniprot_{which}.fasta.gz',
        subpath='UniProtKB')

def readlabels(which):
swissname = make_local_name(f'UniProtKB/uniprot_{which}.fasta')
Expand All @@ -17,11 +15,6 @@ def readlabels(which):
swissprot_labels[uniprotid] = f'{name} ({which})'
return swissprot_labels

def pull_uniprotkb():
    """Download the UniProtKB id-mapping file and both FASTA archives.

    Three ``pull_via_urllib`` calls are made, all with ``subpath='UniProtKB'``:
    ``idmapping.dat.gz`` from the ``idmapping/`` directory, then
    ``uniprot_sprot.fasta.gz`` and ``uniprot_trembl.fasta.gz`` from
    ``complete/``.
    """
    # Shared prefix of the current UniProt release; hoisted so the two
    # directories below cannot drift apart.
    release_url = 'https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/'

    # Plain string: the file name has no placeholders, so no f-string needed.
    pull_via_urllib(release_url + 'idmapping/', 'idmapping.dat.gz', subpath='UniProtKB')
    for which in ('sprot', 'trembl'):
        pull_via_urllib(release_url + 'complete/', f'uniprot_{which}.fasta.gz', subpath='UniProtKB')

def pull_uniprot_labels(sprotfile,tremblfile,fname):
slabels = readlabels('sprot')
tlabels = readlabels('trembl')
Expand Down
21 changes: 14 additions & 7 deletions src/snakefiles/datacollect.snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -91,13 +91,20 @@ rule get_mods_labels:

### UniProtKB

# Download the UniProtKB idmapping.dat.gz file with wget (resuming a partial
# download, up to 10 tries) and unpack it next to the archive; gunzip -k keeps
# the .gz file. This rule produces only idmapping.dat: the sprot and trembl
# FASTA files are outputs of their own rules, so they must not be declared
# here as well, and a rule may have either `run:` or `shell:`, not both.
rule get_uniprotkb_idmapping:
    output:
        idmapping = config['download_directory']+'/UniProtKB/idmapping.dat'
    shell: """wget --continue --tries=10 "https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/idmapping.dat.gz" -O {output.idmapping}.gz && gunzip -k {output.idmapping}.gz"""

# Download uniprot_sprot.fasta.gz with wget (resuming a partial download, up
# to 10 tries) and unpack it to the declared output; gunzip -k keeps the .gz.
rule get_uniprotkb_sprot:
    output:
        uniprot_sprot = config['download_directory'] + '/UniProtKB/uniprot_sprot.fasta'
    shell:
        # Adjacent literals are concatenated into a single command line.
        'wget --continue --tries=10 '
        '"https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz" '
        '-O {output.uniprot_sprot}.gz && gunzip -k {output.uniprot_sprot}.gz'

# Download uniprot_trembl.fasta.gz with wget (resuming a partial download, up
# to 10 tries) and unpack it to the declared output; gunzip -k keeps the .gz.
rule get_uniprotkb_trembl:
    output:
        uniprot_trembl = config['download_directory'] + '/UniProtKB/uniprot_trembl.fasta'
    shell:
        # Adjacent literals are concatenated into a single command line.
        'wget --continue --tries=10 '
        '"https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_trembl.fasta.gz" '
        '-O {output.uniprot_trembl}.gz && gunzip -k {output.uniprot_trembl}.gz'

rule get_uniprotkb_labels:
input:
Expand Down Expand Up @@ -361,7 +368,7 @@ rule get_panther_pathways:
output:
outfile = config['download_directory'] + '/PANTHER.PATHWAY/SequenceAssociationPathway3.6.7.txt'
run:
pantherpathways.pull_panther_pathways(output.outfile)
pantherpathways.pull_panther_pathways()

rule get_panther_pathway_labels:
input:
Expand Down
13 changes: 12 additions & 1 deletion src/snakefiles/geneprotein.snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,20 @@ rule geneprotein_conflation:
run:
geneprotein.build_conflation(input.geneprotein_concord,input.gene_compendium,input.protein_compendium,output.outfile)

# Write synonyms/GeneProteinConflated.txt by passing the gene and protein
# synonym files, together with the GeneProtein conflation file, to
# synonymconflation.conflate_synonyms.
rule geneprotein_conflated_synonyms:
    input:
        # Kept as a one-element list, which is what gets handed to
        # conflate_synonyms as its second argument.
        geneprotein_conflations=[config['output_directory']+'/conflation/GeneProtein.txt'],
        gene_outputs=expand(
            "{od}/synonyms/{ap}",
            od = config['output_directory'],
            ap = config['gene_outputs']),
        protein_outputs=expand(
            "{od}/synonyms/{ap}",
            od = config['output_directory'],
            ap = config['protein_outputs']),
    output:
        geneprotein_conflated_synonyms=config['output_directory']+'/synonyms/GeneProteinConflated.txt'
    run:
        # Gene synonym files first, then protein synonym files.
        synonym_files = input.gene_outputs + input.protein_outputs
        synonymconflation.conflate_synonyms(
            synonym_files,
            input.geneprotein_conflations,
            output.geneprotein_conflated_synonyms)

rule geneprotein:
input:
config['output_directory']+'/conflation/GeneProtein.txt'
config['output_directory']+'/conflation/GeneProtein.txt',
config['output_directory']+'/synonyms/GeneProteinConflated.txt'
output:
x=config['output_directory']+'/reports/geneprotein_done'
shell:
Expand Down