Skip to content

Commit ad08766

Browse files
authored
feat: filter long alleles (#1080)
* feat: filter long alleles
* trailing comma
* Update validate_callset.py
* bug: filter clinvar missing conditions (#1083)
1 parent 579db2b commit ad08766

File tree

5 files changed

+95
-9
lines changed

5 files changed

+95
-9
lines changed

Diff for: v03_pipeline/lib/model/dataset_type.py

+4
Original file line numberDiff line numberDiff line change
@@ -373,6 +373,10 @@ def should_send_to_allele_registry(self):
373373
def requires_dataproc(self):
374374
return self == DatasetType.SNV_INDEL
375375

376+
@property
377+
def filter_invalid_sites(self):
378+
return self == DatasetType.SNV_INDEL
379+
376380
@property
377381
def should_export_to_vcf(self):
378382
return self == DatasetType.SV

Diff for: v03_pipeline/lib/reference_datasets/clinvar.py

+7-4
Original file line numberDiff line numberDiff line change
@@ -154,10 +154,13 @@ def select_fields(ht):
154154
conflictingPathogenicities=parsed_and_mapped_clnsigconf(ht),
155155
goldStars=CLINVAR_GOLD_STARS_LOOKUP.get(hl.delimit(ht.info.CLNREVSTAT)),
156156
submitters=ht.submitters,
157-
# assumes the format 'MedGen#:condition', e.g.'C0023264:Leigh syndrome'
158-
conditions=hl.map(
159-
lambda p: p.split(r':')[1],
160-
ht.conditions,
157+
# assumes the format 'MedGen#:condition;MedGen#:condition', e.g. 'C0023264:Leigh syndrome'
158+
conditions=hl.filter(
159+
hl.is_defined,
160+
hl.flatmap(
161+
lambda p: p.split(';'),
162+
ht.conditions,
163+
).map(lambda p: p.split(':')[1]),
161164
),
162165
)
163166

Diff for: v03_pipeline/lib/reference_datasets/clinvar_test.py

+70
Original file line numberDiff line numberDiff line change
@@ -169,3 +169,73 @@ def test_get_ht(self):
169169
),
170170
],
171171
)
172+
173+
# VariationID 9 tests Conditions parsing
174+
self.assertListEqual(
175+
ht.collect()[8].submitters,
176+
[
177+
'Hemochromatosis type 1',
178+
'Hereditary cancer-predisposing syndrome',
179+
'HFE-related disorder',
180+
'Hemochromatosis type 1',
181+
'Hemochromatosis type 1',
182+
'Bronze diabetes',
183+
'Hemochromatosis type 1',
184+
'HFE-related disorder',
185+
'Hemochromatosis type 1',
186+
'Abdominal pain',
187+
'Atypical behavior',
188+
'Pain',
189+
'Peripheral neuropathy',
190+
'Abnormality of the nervous system',
191+
'Abnormality of the male genitalia',
192+
'Abnormal peripheral nervous system morphology',
193+
'Hereditary hemochromatosis',
194+
'Hemochromatosis type 1',
195+
'not provided',
196+
'Hereditary hemochromatosis',
197+
'not provided',
198+
'Hemochromatosis type 1',
199+
'Hemochromatosis type 1',
200+
'Hemochromatosis type 1',
201+
'Hemochromatosis type 1',
202+
'Hemochromatosis type 1',
203+
'Hemochromatosis type 1',
204+
'Hemochromatosis type 1',
205+
'not provided',
206+
'not provided',
207+
'Hemochromatosis type 1',
208+
'Hemochromatosis type 1',
209+
'Hereditary hemochromatosis',
210+
'Cardiomyopathy',
211+
'not provided',
212+
'Juvenile hemochromatosis',
213+
'Hemochromatosis type 1',
214+
'not provided',
215+
'not provided',
216+
'Inborn genetic diseases',
217+
'Hemochromatosis type 1',
218+
'not provided',
219+
'Hemochromatosis type 1',
220+
'Hemochromatosis type 1',
221+
'Hemochromatosis type 1',
222+
'Hemochromatosis type 1',
223+
'not provided',
224+
'Porphyrinuria',
225+
'Cutaneous photosensitivity',
226+
'Hemochromatosis type 1',
227+
'Hereditary hemochromatosis',
228+
'Hemochromatosis type 1',
229+
'Hemochromatosis type 1',
230+
'Hemochromatosis type 1',
231+
'not provided',
232+
'Hemochromatosis type 1',
233+
'Variegate porphyria',
234+
'Familial porphyria cutanea tarda',
235+
'Alzheimer disease type 1',
236+
'Microvascular complications of diabetes, susceptibility to, 7',
237+
'Transferrin serum level quantitative trait locus 2',
238+
'Hemochromatosis type 1',
239+
'Hemochromatosis type 1',
240+
],
241+
)

Diff for: v03_pipeline/lib/tasks/validate_callset.py

+13-5
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@
2626
WriteValidationErrorsForRunTask,
2727
)
2828

29+
MAX_SNV_INDEL_ALLELE_LENGTH = 500
30+
2931

3032
@luigi.util.inherits(BaseLoadingRunParams)
3133
class ValidateCallsetTask(BaseUpdateTask):
@@ -71,14 +73,20 @@ def update_table(self, mt: hl.MatrixTable) -> hl.MatrixTable:
7173
self.callset_path,
7274
),
7375
)
74-
if self.dataset_type.can_run_validation:
75-
# Rather than throwing an error, we silently remove invalid contigs.
76-
# This happens fairly often for AnVIL requests.
76+
if self.dataset_type.filter_invalid_sites:
7777
mt = mt.filter_rows(
78-
hl.set(self.reference_genome.standard_contigs).contains(
79-
mt.locus.contig,
78+
(
79+
# Rather than throwing an error, we silently remove invalid contigs.
80+
# This happens fairly often for AnVIL requests.
81+
hl.set(self.reference_genome.standard_contigs).contains(
82+
mt.locus.contig,
83+
)
84+
# DRAGEN callsets produce long alternate alleles
85+
# that aren't particularly analyzable as INDELs.
86+
& (hl.len(mt.alleles[1]) < MAX_SNV_INDEL_ALLELE_LENGTH)
8087
),
8188
)
89+
8290
validation_exceptions = []
8391
if self.skip_validation or not self.dataset_type.can_run_validation:
8492
return mt.select_globals(

0 commit comments

Comments
 (0)