Skip to content

Commit c77703b

Browse files
committed
Remove disease_category and clingen_curation columns
1 parent 3a737f3 commit c77703b

5 files changed

Lines changed: 4 additions & 235 deletions

File tree

Lines changed: 4 additions & 117 deletions
Original file line number | Diff line number | Diff line change
@@ -1,9 +1,6 @@
11
import argparse
2-
import glob
32
import pandas as pd
43
from dotenv import load_dotenv
5-
import time
6-
import os
74
import sys
85
load_dotenv()
96

@@ -33,47 +30,10 @@
3330
3431
"""
3532

36-
prompt_prefix2 = """
37-
You are a clinical geneticist. You have assembled known gene-disease associations from authoritative sources that include OMIM, GenCC, ClinGen, ClinVar, PanelApp, Orphanet, and dbNSFP. Now, you need to select a single
38-
disease category that is the best match for the provided phenotypes. The possible disease categories are:
39-
40-
'BIOCHEMICAL/METABOLIC',
41-
'CANCER',
42-
'CARDIOVASCULAR',
43-
'DEAFNESS',
44-
'HEMATOLOGICAL',
45-
'IMMUNOLOGICAL',
46-
'OPHTHALMOLOGIC',
47-
'NEUROLOGICAL',
48-
'NEPHROLOGIC',
49-
'PULMONARY',
50-
'SKELETAL',
51-
'RHEUMATOLOGIC & AUTOIMMUNE',
52-
'ENDOCRINE',
53-
'PRENATAL/REPRODUCTIVE',
54-
'GASTROINTESTINAL',
55-
'DYSMORPHOLOGY',
56-
'SYNDROMIC',
57-
'CONNECTIVE TISSUE',
58-
'DERMATOLOGIC',
59-
'PSYCHIATRIC'
60-
If the phenotypes do not fit in one of these categories, put 'OTHER', or define a new disease area and label it.
61-
In all scenarios, please output only the disease category and nothing else. There should be no intro or explanation - just the category.
62-
63-
For example, if the phenotypes are: 'Congenital disorder of glycosylation, type Ie, OMIM:608799, GDP-Man:Dol-P mannosyltransferase deficiency (Disorders of multiple glycosylation and other glycosylation pathways); Congenital disorder of glycosylation, type Ie, OMIM:608799, GDP-Man:Dol-P mannosyltransferase deficiency (Disorders of multiple glycosylation and other glycosylation pathways); CONGENITAL DISORDERS OF GLYCOSYLATION; CONGENITAL DISORDERS OF GLYCOSYLATION 612379; Congenital disorder of glycosylation, type Ie, OMIM:608799; Conge
64-
nital disorder of glycosylation, type Ie, OMIM:608799; Congenital disorder of glycosylation, type Ie, 608799', you would output: 'BIOCHEMICAL/METABOLIC'.
65-
66-
or, if the phenotypes are: 'Microcephaly, short stature, and intellectual disability', you would output: 'DYSMORPHOLOGY'.
67-
68-
Now, please tell me the best disease category for the following phenotypes:
69-
"""
70-
71-
72-
7333
# read the combined table
7434
df = pd.read_table(args.combined_table)
7535

76-
def summarize_phenotypes(row, prompt_prefix=prompt_prefix1, blank_if_no_phenotypes=False):
36+
def summarize_phenotypes(row):
7737

7838
# concatenate the phenotypes into a single string, by source
7939
phenotypes = []
@@ -92,9 +52,6 @@ def summarize_phenotypes(row, prompt_prefix=prompt_prefix1, blank_if_no_phenotyp
9252
phenotypes.append(f"{label}: {row[phenotype_column]}")
9353

9454
if not phenotypes:
95-
if blank_if_no_phenotypes:
96-
return ""
97-
9855
if "GWAS_mondo_name" in row and not pd.isna(row["GWAS_mondo_name"]):
9956
return f"GWAS: " + str(row["GWAS_mondo_name"])
10057
else:
@@ -113,74 +70,18 @@ def summarize_phenotypes(row, prompt_prefix=prompt_prefix1, blank_if_no_phenotyp
11370
else:
11471
return ""
11572

116-
prompt = prompt_prefix + ", ".join(phenotypes)
73+
prompt = prompt_prefix1 + ", ".join(phenotypes)
11774
response = ask_gemini(prompt, model="2.5-flash", temperature=0, max_tokens=1000, system_prompt="", verbose=True)
11875

11976
return response
12077

12178
# add a column for the phenotype summary
122-
df["LLM_phenotype_summary"] = df.apply(summarize_phenotypes, axis=1, prompt_prefix=prompt_prefix1, blank_if_no_phenotypes=False)
123-
df["disease_category"] = df.apply(summarize_phenotypes, axis=1, prompt_prefix=prompt_prefix2, blank_if_no_phenotypes=True)
124-
125-
126-
clingen_curations_dir = os.path.dirname(os.path.dirname(__file__))
127-
clingen_curations_files = sorted(glob.glob(os.path.join(clingen_curations_dir, "CLINGEN_curations_export_at_*.csv")))
128-
if not clingen_curations_files:
129-
print(f"WARNING: No CLINGEN_curations_export_at_*.csv files found in {clingen_curations_dir}. Skipping ClinGen curation processing.")
130-
df["clingen_curation"] = ""
131-
else:
132-
clingen_curations_path = clingen_curations_files[-1] # use the latest file
133-
print(f"Using ClinGen curations file: {clingen_curations_path}")
134-
clingen_df = pd.read_csv(clingen_curations_path)
135-
"""Columns:
136-
Gene Symbol Expert Panel Curator Disease Entity Curation Type Rationales Uploaded Date Precuration Date Disease entity assigned Date Precuration Complete Date Curation Provisional Date Curation Approved Date Recuration assigned Date Retired Assignment Date Published Date Classification Created GCI UUID
137-
"""
138-
139-
gene_name_to_clingen_curation_value = {}
140-
gene_alias_to_gene_name = {}
141-
duplicate_aliases = []
142-
for _, row in df.iterrows():
143-
if not pd.isna(row["gene_symbol"]) and row["gene_symbol"]:
144-
gene_name_to_clingen_curation_value[row["gene_symbol"].upper()] = None
145-
146-
if not pd.isna(row["gene_aliases"]) and row["gene_aliases"] and not pd.isna(row["gene_symbol"]) and row["gene_symbol"]:
147-
gene_symbol = row["gene_symbol"].upper()
148-
for alias in row["gene_aliases"].split(","):
149-
alias = alias.strip().upper()
150-
if alias in gene_alias_to_gene_name and gene_alias_to_gene_name[alias] != gene_symbol:
151-
#print(f"WARNING: Duplicate alias: {alias} for {row['gene_symbol']} and {gene_alias_to_gene_name[alias]}")
152-
duplicate_aliases.append(alias)
153-
else:
154-
gene_alias_to_gene_name[alias] = gene_symbol
155-
156-
for alias in duplicate_aliases:
157-
if alias in gene_alias_to_gene_name:
158-
del gene_alias_to_gene_name[alias]
159-
160-
clingen_gene_names_not_in_df = []
161-
for _, row in clingen_df.iterrows():
162-
if not pd.isna(row["Curation Type"]) and row["Curation Type"]:
163-
value = f"Curated: {row['Curation Type']}"
164-
else:
165-
value = "In Scope"
166-
167-
gene_name = row["Gene Symbol"].strip().upper()
168-
gene_name = gene_alias_to_gene_name.get(gene_name, gene_name)
169-
if gene_name not in gene_name_to_clingen_curation_value:
170-
clingen_gene_names_not_in_df.append(gene_name)
171-
elif gene_name_to_clingen_curation_value[gene_name] is None or not gene_name_to_clingen_curation_value[gene_name].startswith("Curated"):
172-
gene_name_to_clingen_curation_value[gene_name] = value
173-
elif gene_name_to_clingen_curation_value[gene_name] != value and value is not None and value.startswith("Curated"):
174-
gene_name_to_clingen_curation_value[gene_name] += f", {value.replace('Curated: ', '')}"
175-
176-
print(f"{len(clingen_gene_names_not_in_df):,} CLINGEN gene names not in df: {', '.join(clingen_gene_names_not_in_df)}")
177-
178-
df["clingen_curation"] = df["gene_symbol"].str.upper().map(gene_name_to_clingen_curation_value)
79+
df["LLM_phenotype_summary"] = df.apply(summarize_phenotypes, axis=1)
17980

18081
# move the LLM_phenotype_summary column to be after the 'inheritance' column
18182
initial_columns = [
18283
"ensembl_gene_id", "hgnc_gene_id", "gene_symbol", "gene_aliases", "pLI_v2", "pLI_v4", "lof_oe_ci_upper_v4", "mis_oe_ci_upper_v4", "s_het",
183-
"inheritance", "disease_category", "clingen_curation", "LLM_phenotype_summary", "sources",
84+
"inheritance", "LLM_phenotype_summary", "sources",
18485
"gene_chrom", "gene_start", "gene_end",
18586
]
18687

@@ -189,17 +90,3 @@ def summarize_phenotypes(row, prompt_prefix=prompt_prefix1, blank_if_no_phenotyp
18990
df.to_csv(args.output_path, sep="\t", index=False)
19091

19192
print(f"Wrote {len(df):,d} rows to {args.output_path}")
192-
193-
# print ClinGen stats
194-
for label, df_subset in (
195-
("Curated", df[df.clingen_curation.str.startswith("Curated") & df.clingen_curation.notna()]),
196-
("In Scope", df[df.clingen_curation == "In Scope"]),
197-
("Not in scope", df[df.clingen_curation.isna()]),
198-
):
199-
df_subset = df_subset[df_subset.disease_category.notna() & (df_subset.disease_category.str.strip() != "")]
200-
201-
print("-" * 100)
202-
print(f"{label}: {len(df_subset):,d} rows")
203-
for category, count in df_subset.disease_category.value_counts().items():
204-
print(f"{count:,d}\t{category}")
205-
print()

website/gene_page_template.html

Lines changed: 0 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -285,7 +285,6 @@
285285
box1 += `<tr><td class="locus-info-label">Gene ID:</td><td>${geneId ? `<a href="https://ensembl.org/Homo_sapiens/Gene/Summary?g=${geneId}" target="_blank" class="action-link">${geneId}</a>` : ''}</td></tr>`
286286
box1 += `<tr><td class="locus-info-label">Aliases:</td><td>${row['gene_aliases'] || ''}</td></tr>`
287287
box1 += `<tr><td class="locus-info-label">Coordinates (hg38):</td><td>${coordinates}</td></tr>`
288-
box1 += `<tr><td class="locus-info-label">Disease Category:</td><td>${getDiseaseCategory(row['disease_category'])}</td></tr>`
289288
box1 += '</table>'
290289
$('#locus-page-row1-box1').html(box1)
291290

@@ -305,8 +304,6 @@
305304
box2 += `<tr><td class="locus-info-label">LOEUF (v4):</td><td>${loeuf}</td></tr>`
306305
box2 += `<tr><td class="locus-info-label">Missense O/E (v4):</td><td>${moeuf}</td></tr>`
307306

308-
// ClinGen curation summary
309-
box2 += `<tr><td class="locus-info-label">ClinGen Curation:</td><td>${getTruncatedText(row['clingen_curation'], 60)}</td></tr>`
310307
box2 += '</table>'
311308
$('#locus-page-row1-box2').html(box2)
312309

website/global_constants.py

Lines changed: 0 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -111,15 +111,6 @@
111111
},
112112

113113
# AI Summary
114-
{
115-
"type": "STRING",
116-
"name": "disease_category",
117-
"description": "LLM-assigned disease category (e.g., NEUROLOGICAL, CARDIOVASCULAR).",
118-
"displayName": "Disease Category",
119-
"allowCustomFilter": True,
120-
"allowExport": True,
121-
"group": GROUP_AI_SUMMARY,
122-
},
123114
{
124115
"type": "STRING",
125116
"name": "LLM_phenotype_summary",
@@ -129,15 +120,6 @@
129120
"allowExport": True,
130121
"group": GROUP_AI_SUMMARY,
131122
},
132-
{
133-
"type": "STRING",
134-
"name": "clingen_curation",
135-
"description": "LLM-generated summary of ClinGen gene curation information.",
136-
"displayName": "ClinGen Curation",
137-
"allowCustomFilter": True,
138-
"allowExport": True,
139-
"group": GROUP_AI_SUMMARY,
140-
},
141123

142124
# Sources
143125
{

website/header_template.html

Lines changed: 0 additions & 48 deletions
Original file line number | Diff line number | Diff line change
@@ -44,30 +44,6 @@
4444
...BIGQUERY_COLUMN_DESCRIPTIONS,
4545
}
4646

47-
// Disease category color mapping
48-
const DISEASE_CATEGORY_COLORS = {
49-
'NEUROLOGICAL': '#6B4C9A',
50-
'CARDIOVASCULAR': '#DC3545',
51-
'METABOLIC': '#FD7E14',
52-
'IMMUNOLOGICAL': '#20C997',
53-
'OPHTHALMOLOGICAL': '#0DCAF0',
54-
'DERMATOLOGICAL': '#D63384',
55-
'MUSCULOSKELETAL': '#6F42C1',
56-
'HEMATOLOGICAL': '#E83E8C',
57-
'RENAL': '#198754',
58-
'ENDOCRINE': '#FFC107',
59-
'HEPATIC': '#795548',
60-
'PULMONARY': '#17A2B8',
61-
'GASTROINTESTINAL': '#FF8C00',
62-
'ONCOLOGICAL': '#343A40',
63-
'DEVELOPMENTAL': '#0D6EFD',
64-
'MULTISYSTEMIC': '#6610F2',
65-
'AUDIOLOGICAL': '#2196F3',
66-
'DENTAL': '#8D6E63',
67-
'PSYCHIATRIC': '#9C27B0',
68-
'OTHER': '#6C757D',
69-
}
70-
7147
const DEFAULT_SORT_COLUMN = 'gene_symbol'
7248
const DEFAULT_SORT_DIRECTION = 'ASC'
7349

@@ -188,15 +164,6 @@
188164

189165
const escapeSqlString = (s) => String(s).replace(/'/g, "''")
190166

191-
const getDiseaseCategory = (category) => {
192-
if (!category) return ''
193-
const categories = category.split(';').map(c => c.trim()).filter(c => c)
194-
return categories.map(cat => {
195-
const color = DISEASE_CATEGORY_COLORS[cat.toUpperCase()] || DISEASE_CATEGORY_COLORS['OTHER']
196-
return `<span class="ui mini label" style="background-color: ${color}; color: white; margin: 1px;">${cat}</span>`
197-
}).join(' ')
198-
}
199-
200167
const getSourcesBadges = (sourcesStr) => {
201168
if (!sourcesStr) return ''
202169
// Format is like "3: OMIM, ClinGen, GenCC"
@@ -242,13 +209,6 @@
242209
getSortColumns: () => ['gene_symbol'],
243210
getRequiredColumns: () => ['gene_symbol', 'hgnc_gene_id', 'ensembl_gene_id'],
244211
},
245-
disease_category: {
246-
displayName: 'Category',
247-
helpText: RESULTS_TABLE_COLUMN_HELP_TEXTS['disease_category'],
248-
getValue: (row) => getDiseaseCategory(row['disease_category']),
249-
getSortColumns: () => ['disease_category'],
250-
getRequiredColumns: () => ['disease_category'],
251-
},
252212
inheritance: {
253213
displayName: 'Inheritance',
254214
helpText: RESULTS_TABLE_COLUMN_HELP_TEXTS['inheritance'],
@@ -355,13 +315,6 @@
355315
getSortColumns: () => ['gene_end'],
356316
getRequiredColumns: () => ['gene_end'],
357317
},
358-
clingen_curation: {
359-
displayName: 'ClinGen Curation',
360-
helpText: RESULTS_TABLE_COLUMN_HELP_TEXTS['clingen_curation'],
361-
getValue: (row) => getTruncatedText(row['clingen_curation'], 80),
362-
getSortColumns: () => ['clingen_curation'],
363-
getRequiredColumns: () => ['clingen_curation'],
364-
},
365318
pLI_v2: {
366319
displayName: 'pLI (v2)',
367320
helpText: RESULTS_TABLE_COLUMN_HELP_TEXTS['pLI_v2'],
@@ -730,7 +683,6 @@
730683

731684
const DEFAULT_VISIBLE_COLUMNS = [
732685
'gene_symbol',
733-
'disease_category',
734686
'inheritance',
735687
'sources',
736688
'LLM_phenotype_summary',

website/index_page_template.html

Lines changed: 0 additions & 49 deletions
Original file line number | Diff line number | Diff line change
@@ -222,39 +222,6 @@
222222
</div>
223223
</div>
224224

225-
<div class="field" style="max-width: 350px;">
226-
<label>Disease Category
227-
<i class="question circle icon link" style="margin-left: 5px;" data-html="Filter by disease category (e.g., Neurological, Cardiovascular, etc.)."></i>
228-
</label>
229-
<div class="ui selection dropdown" id="disease-category-dropdown" style="min-width: 220px;">
230-
<input type="hidden" name="diseaseCategory">
231-
<i class="dropdown icon"></i>
232-
<div class="default text">Any category</div>
233-
<div class="menu">
234-
<div class="item" data-value="">Any</div>
235-
<div class="item" data-value="NEUROLOGICAL">Neurological</div>
236-
<div class="item" data-value="CARDIOVASCULAR">Cardiovascular</div>
237-
<div class="item" data-value="METABOLIC">Metabolic</div>
238-
<div class="item" data-value="IMMUNOLOGICAL">Immunological</div>
239-
<div class="item" data-value="OPHTHALMOLOGICAL">Ophthalmological</div>
240-
<div class="item" data-value="DERMATOLOGICAL">Dermatological</div>
241-
<div class="item" data-value="MUSCULOSKELETAL">Musculoskeletal</div>
242-
<div class="item" data-value="HEMATOLOGICAL">Hematological</div>
243-
<div class="item" data-value="RENAL">Renal</div>
244-
<div class="item" data-value="ENDOCRINE">Endocrine</div>
245-
<div class="item" data-value="HEPATIC">Hepatic</div>
246-
<div class="item" data-value="PULMONARY">Pulmonary</div>
247-
<div class="item" data-value="GASTROINTESTINAL">Gastrointestinal</div>
248-
<div class="item" data-value="ONCOLOGICAL">Oncological</div>
249-
<div class="item" data-value="DEVELOPMENTAL">Developmental</div>
250-
<div class="item" data-value="MULTISYSTEMIC">Multisystemic</div>
251-
<div class="item" data-value="AUDIOLOGICAL">Audiological</div>
252-
<div class="item" data-value="DENTAL">Dental</div>
253-
<div class="item" data-value="PSYCHIATRIC">Psychiatric</div>
254-
<div class="item" data-value="OTHER">Other</div>
255-
</div>
256-
</div>
257-
</div>
258225
</div>
259226

260227
<div id="custom-filter-rows"></div>
@@ -316,7 +283,6 @@
316283
DEFAULT_GLOBAL_PAGE_STATE['loeufThreshold'] = ''
317284
DEFAULT_GLOBAL_PAGE_STATE['moeufThreshold'] = ''
318285
DEFAULT_GLOBAL_PAGE_STATE['clingenClassification'] = ''
319-
DEFAULT_GLOBAL_PAGE_STATE['diseaseCategory'] = ''
320286

321287
// Copy the default persistent page state
322288
let GLOBAL_PAGE_STATE = JSON.parse(JSON.stringify(DEFAULT_GLOBAL_PAGE_STATE))
@@ -421,10 +387,6 @@
421387
if (GLOBAL_PAGE_STATE['clingenClassification']) {
422388
$('#clingen-classification-dropdown').dropdown('set selected', GLOBAL_PAGE_STATE['clingenClassification'])
423389
}
424-
if (GLOBAL_PAGE_STATE['diseaseCategory']) {
425-
$('#disease-category-dropdown').dropdown('set selected', GLOBAL_PAGE_STATE['diseaseCategory'])
426-
}
427-
428390
// Restore custom filters from URL
429391
$('#custom-filter-rows').empty()
430392
deserializeAndCreateCustomFilters(GLOBAL_PAGE_STATE['customFilters'])
@@ -1511,11 +1473,6 @@
15111473
}
15121474
}
15131475

1514-
// Disease category filter
1515-
const diseaseCategory = $('#disease-category-dropdown').dropdown('get value')
1516-
if (diseaseCategory) {
1517-
conditions.push(`UPPER(disease_category) LIKE UPPER('%${diseaseCategory}%')`)
1518-
}
15191476
}
15201477

15211478
return conditions
@@ -1603,7 +1560,6 @@
16031560
GLOBAL_PAGE_STATE['loeufThreshold'] = $('input[name="loeufThreshold"]').val()
16041561
GLOBAL_PAGE_STATE['moeufThreshold'] = $('input[name="moeufThreshold"]').val()
16051562
GLOBAL_PAGE_STATE['clingenClassification'] = $('#clingen-classification-dropdown').dropdown('get value')
1606-
GLOBAL_PAGE_STATE['diseaseCategory'] = $('#disease-category-dropdown').dropdown('get value')
16071563
writePersistentPageStateToUrl()
16081564

16091565
displaySearchResults()
@@ -1762,11 +1718,6 @@
17621718
onHide: function() { $(this).css('z-index', '') },
17631719
})
17641720

1765-
$('#disease-category-dropdown').dropdown({
1766-
onShow: function() { $(this).css('z-index', 10001) },
1767-
onHide: function() { $(this).css('z-index', '') },
1768-
})
1769-
17701721
// handle browser forward & back buttons
17711722
window.addEventListener('popstate', readAndApplyPersistentPageStateFromUrl)
17721723

0 commit comments

Comments
 (0)