Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -289,6 +289,9 @@ chemical_outputs:
drugchemicalconflated_synonym_outputs:
- DrugChemicalConflated.txt

geneproteinconflated_synonym_outputs:
- GeneProteinConflated.txt

taxon_labels:
- NCBITaxon
- MESH
Expand Down
22 changes: 20 additions & 2 deletions src/snakefiles/geneprotein.snakefile
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import src.createcompendia.geneprotein as geneprotein
import src.assess_compendia as assessments
from src.synonyms import synonymconflation
from util import gzip_files

### Gene / Protein

Expand All @@ -22,9 +23,26 @@ rule geneprotein_conflation:
run:
geneprotein.build_conflation(input.geneprotein_concord,input.gene_compendium,input.protein_compendium,output.outfile)

# Build the gzipped GeneProtein-conflated synonym file by passing the gene and
# protein synonym files, the gene and protein compendia, and the GeneProtein
# conflation file to synonymconflation.conflate_synonyms().
rule geneprotein_conflated_synonyms:
    input:
        # Conflation file, wrapped in a one-element list.
        geneprotein_conflations=[config['output_directory']+'/conflation/GeneProtein.txt'],
        # One compendium path per entry in config['gene_outputs'] / config['protein_outputs'].
        gene_compendia=expand(
            "{od}/compendia/{ap}",
            od=config['output_directory'],
            ap=config['gene_outputs'],
        ),
        protein_compendia=expand(
            "{od}/compendia/{ap}",
            od=config['output_directory'],
            ap=config['protein_outputs'],
        ),
        # Matching gzipped synonym files, named after the same config entries.
        gene_synonyms_gz=expand(
            "{od}/synonyms/{ap}.gz",
            od=config['output_directory'],
            ap=config['gene_outputs'],
        ),
        protein_synonyms_gz=expand(
            "{od}/synonyms/{ap}.gz",
            od=config['output_directory'],
            ap=config['protein_outputs'],
        )
    output:
        geneprotein_conflated_synonyms_gz=config['output_directory']+'/synonyms/GeneProteinConflated.txt.gz'
    run:
        # Gene files come before protein files in both concatenated lists.
        synonym_files_gz = input.gene_synonyms_gz + input.protein_synonyms_gz
        compendia_files = input.gene_compendia + input.protein_compendia
        synonymconflation.conflate_synonyms(
            synonym_files_gz,
            compendia_files,
            input.geneprotein_conflations,
            output.geneprotein_conflated_synonyms_gz,
        )

rule geneprotein:
input:
config['output_directory']+'/conflation/GeneProtein.txt'
config['output_directory']+'/conflation/GeneProtein.txt',
config['output_directory']+'/synonyms/GeneProteinConflated.txt.gz'
output:
x=config['output_directory']+'/reports/geneprotein_done'
shell:
Expand Down
3 changes: 3 additions & 0 deletions src/snakefiles/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ def get_all_synonyms(config):
config['cell_line_outputs'] +
config['genefamily_outputs'] +
config['drugchemicalconflated_synonym_outputs'] +
config['geneproteinconflated_synonym_outputs'] +
config['umls_outputs'] +
config['macromolecularcomplex_outputs'] +
# Publication.txt is empty, but it's still created, so it needs to be here.
Expand All @@ -87,6 +88,7 @@ def get_all_synonyms_except_drugchemicalconflated(config):
config['cell_line_outputs'] +
config['genefamily_outputs'] +
# config['drugchemicalconflated_synonym_outputs'] +
config['geneproteinconflated_synonym_outputs'] +
config['umls_outputs'] +
config['macromolecularcomplex_outputs']
)
Expand All @@ -110,6 +112,7 @@ def get_all_synonyms_with_drugchemicalconflated(config):
config['cell_line_outputs'] +
config['genefamily_outputs'] +
config['drugchemicalconflated_synonym_outputs'] +
config['geneproteinconflated_synonym_outputs'] +
config['umls_outputs'] +
config['macromolecularcomplex_outputs']
)
Expand Down
2 changes: 1 addition & 1 deletion src/synonyms/synonymconflation.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,7 +213,7 @@ def conflate_synonyms(synonym_files_gz, compendia_files, conflation_file, output
if 'taxa' in synonym:
if 'taxa' not in final_conflation:
final_conflation['taxa'] = set()
final_conflation.update(synonym['taxa'])
final_conflation['taxa'].update(synonym['taxa'])

# Convert the taxa into a list.
final_conflation['taxa'] = sorted(final_conflation['taxa'])
Expand Down