Skip to content

Commit b807cd9

Browse files
authored
Merge pull request #486 from TranslatorSRI/add-geneprotein-conflated-synonyms
This PR adds a new output, `synonyms/GeneProteinConflated.txt`, built from the GeneProtein conflation together with the Gene and Protein outputs. It should be pretty similar to the first attempt (PR #185). It also fixes a bug in how taxa are merged during synonym conflation (`update()` was being called on `final_conflation` itself rather than on `final_conflation['taxa']`), which we had never run into because we had only conflated DrugChemicals before. Closes NCATSTranslator/NameResolution#191.
2 parents 5c5560e + 531692d commit b807cd9

File tree

4 files changed

+27
-3
lines changed

4 files changed

+27
-3
lines changed

config.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -289,6 +289,9 @@ chemical_outputs:
289289
drugchemicalconflated_synonym_outputs:
290290
- DrugChemicalConflated.txt
291291

292+
geneproteinconflated_synonym_outputs:
293+
- GeneProteinConflated.txt
294+
292295
taxon_labels:
293296
- NCBITaxon
294297
- MESH

src/snakefiles/geneprotein.snakefile

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import src.createcompendia.geneprotein as geneprotein
2-
import src.assess_compendia as assessments
2+
from src.synonyms import synonymconflation
3+
from util import gzip_files
34

45
### Gene / Protein
56

@@ -22,9 +23,26 @@ rule geneprotein_conflation:
2223
run:
2324
geneprotein.build_conflation(input.geneprotein_concord,input.gene_compendium,input.protein_compendium,output.outfile)
2425

26+
rule geneprotein_conflated_synonyms:
27+
input:
28+
geneprotein_conflations=[config['output_directory']+'/conflation/GeneProtein.txt'],
29+
gene_compendia=expand("{od}/compendia/{ap}", od = config['output_directory'], ap = config['gene_outputs']),
30+
protein_compendia=expand("{od}/compendia/{ap}", od = config['output_directory'], ap = config['protein_outputs']),
31+
gene_synonyms_gz=expand("{od}/synonyms/{ap}.gz", od = config['output_directory'], ap = config['gene_outputs']),
32+
protein_synonyms_gz=expand("{od}/synonyms/{ap}.gz", od = config['output_directory'], ap = config['protein_outputs'])
33+
output:
34+
geneprotein_conflated_synonyms_gz=config['output_directory']+'/synonyms/GeneProteinConflated.txt.gz'
35+
run:
36+
synonymconflation.conflate_synonyms(
37+
input.gene_synonyms_gz + input.protein_synonyms_gz,
38+
input.gene_compendia + input.protein_compendia,
39+
input.geneprotein_conflations,
40+
output.geneprotein_conflated_synonyms_gz)
41+
2542
rule geneprotein:
2643
input:
27-
config['output_directory']+'/conflation/GeneProtein.txt'
44+
config['output_directory']+'/conflation/GeneProtein.txt',
45+
config['output_directory']+'/synonyms/GeneProteinConflated.txt.gz'
2846
output:
2947
x=config['output_directory']+'/reports/geneprotein_done'
3048
shell:

src/snakefiles/util.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@ def get_all_synonyms(config):
6262
config['cell_line_outputs'] +
6363
config['genefamily_outputs'] +
6464
config['drugchemicalconflated_synonym_outputs'] +
65+
config['geneproteinconflated_synonym_outputs'] +
6566
config['umls_outputs'] +
6667
config['macromolecularcomplex_outputs'] +
6768
# Publication.txt is empty, but it's still created, so it needs to be here.
@@ -87,6 +88,7 @@ def get_all_synonyms_except_drugchemicalconflated(config):
8788
config['cell_line_outputs'] +
8889
config['genefamily_outputs'] +
8990
# config['drugchemicalconflated_synonym_outputs'] +
91+
config['geneproteinconflated_synonym_outputs'] +
9092
config['umls_outputs'] +
9193
config['macromolecularcomplex_outputs']
9294
)
@@ -110,6 +112,7 @@ def get_all_synonyms_with_drugchemicalconflated(config):
110112
config['cell_line_outputs'] +
111113
config['genefamily_outputs'] +
112114
config['drugchemicalconflated_synonym_outputs'] +
115+
config['geneproteinconflated_synonym_outputs'] +
113116
config['umls_outputs'] +
114117
config['macromolecularcomplex_outputs']
115118
)

src/synonyms/synonymconflation.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -213,7 +213,7 @@ def conflate_synonyms(synonym_files_gz, compendia_files, conflation_file, output
213213
if 'taxa' in synonym:
214214
if 'taxa' not in final_conflation:
215215
final_conflation['taxa'] = set()
216-
final_conflation.update(synonym['taxa'])
216+
final_conflation['taxa'].update(synonym['taxa'])
217217

218218
# Convert the taxa into a list.
219219
final_conflation['taxa'] = sorted(final_conflation['taxa'])

0 commit comments

Comments
 (0)