Skip to content

feat: filter long alleles #1080

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Apr 17, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions v03_pipeline/lib/model/dataset_type.py
Original file line number Diff line number Diff line change
Expand Up @@ -373,6 +373,10 @@ def should_send_to_allele_registry(self):
def requires_dataproc(self):
    """Return True only when this dataset type is ``DatasetType.SNV_INDEL``."""
    return self == DatasetType.SNV_INDEL

@property
def filter_invalid_sites(self):
    """Return True only when this dataset type is ``DatasetType.SNV_INDEL``.

    NOTE(review): the callset validation task appears to use this flag to
    decide whether rows are filtered before validation -- confirm against
    the caller.
    """
    return self == DatasetType.SNV_INDEL

@property
def should_export_to_vcf(self):
    """Return True only when this dataset type is ``DatasetType.SV``."""
    return self == DatasetType.SV
Expand Down
11 changes: 7 additions & 4 deletions v03_pipeline/lib/reference_datasets/clinvar.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,10 +154,13 @@ def select_fields(ht):
conflictingPathogenicities=parsed_and_mapped_clnsigconf(ht),
goldStars=CLINVAR_GOLD_STARS_LOOKUP.get(hl.delimit(ht.info.CLNREVSTAT)),
submitters=ht.submitters,
# assumes the format 'MedGen#:condition', e.g.'C0023264:Leigh syndrome'
conditions=hl.map(
lambda p: p.split(r':')[1],
ht.conditions,
# assumes the format 'MedGen#:condition;MedGen#:condition', e.g. 'C0023264:Leigh syndrome'
conditions=hl.filter(
hl.is_defined,
hl.flatmap(
lambda p: p.split(';'),
ht.conditions,
).map(lambda p: p.split(':')[1]),
),
)

Expand Down
70 changes: 70 additions & 0 deletions v03_pipeline/lib/reference_datasets/clinvar_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,3 +169,73 @@ def test_get_ht(self):
),
],
)

# VariationID 9 tests Conditions parsing.
# The expected values are condition names (the text after 'MedGen#:' in each
# ';'-delimited entry), so the assertion must read the `conditions` field
# rather than `submitters`.
self.assertListEqual(
    ht.collect()[8].conditions,
    [
        'Hemochromatosis type 1',
        'Hereditary cancer-predisposing syndrome',
        'HFE-related disorder',
        'Hemochromatosis type 1',
        'Hemochromatosis type 1',
        'Bronze diabetes',
        'Hemochromatosis type 1',
        'HFE-related disorder',
        'Hemochromatosis type 1',
        'Abdominal pain',
        'Atypical behavior',
        'Pain',
        'Peripheral neuropathy',
        'Abnormality of the nervous system',
        'Abnormality of the male genitalia',
        'Abnormal peripheral nervous system morphology',
        'Hereditary hemochromatosis',
        'Hemochromatosis type 1',
        'not provided',
        'Hereditary hemochromatosis',
        'not provided',
        'Hemochromatosis type 1',
        'Hemochromatosis type 1',
        'Hemochromatosis type 1',
        'Hemochromatosis type 1',
        'Hemochromatosis type 1',
        'Hemochromatosis type 1',
        'Hemochromatosis type 1',
        'not provided',
        'not provided',
        'Hemochromatosis type 1',
        'Hemochromatosis type 1',
        'Hereditary hemochromatosis',
        'Cardiomyopathy',
        'not provided',
        'Juvenile hemochromatosis',
        'Hemochromatosis type 1',
        'not provided',
        'not provided',
        'Inborn genetic diseases',
        'Hemochromatosis type 1',
        'not provided',
        'Hemochromatosis type 1',
        'Hemochromatosis type 1',
        'Hemochromatosis type 1',
        'Hemochromatosis type 1',
        'not provided',
        'Porphyrinuria',
        'Cutaneous photosensitivity',
        'Hemochromatosis type 1',
        'Hereditary hemochromatosis',
        'Hemochromatosis type 1',
        'Hemochromatosis type 1',
        'Hemochromatosis type 1',
        'not provided',
        'Hemochromatosis type 1',
        'Variegate porphyria',
        'Familial porphyria cutanea tarda',
        'Alzheimer disease type 1',
        'Microvascular complications of diabetes, susceptibility to, 7',
        'Transferrin serum level quantitative trait locus 2',
        'Hemochromatosis type 1',
        'Hemochromatosis type 1',
    ],
)
18 changes: 13 additions & 5 deletions v03_pipeline/lib/tasks/validate_callset.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@
WriteValidationErrorsForRunTask,
)

# Exclusive upper bound on the length of the first alternate allele
# (`alleles[1]`): rows at or above this length are filtered out of the callset
# before validation when the dataset type has `filter_invalid_sites` set.
MAX_SNV_INDEL_ALLELE_LENGTH = 500


@luigi.util.inherits(BaseLoadingRunParams)
class ValidateCallsetTask(BaseUpdateTask):
Expand Down Expand Up @@ -71,14 +73,20 @@ def update_table(self, mt: hl.MatrixTable) -> hl.MatrixTable:
self.callset_path,
),
)
if self.dataset_type.can_run_validation:
# Rather than throwing an error, we silently remove invalid contigs.
# This happens fairly often for AnVIL requests.
if self.dataset_type.filter_invalid_sites:
mt = mt.filter_rows(
hl.set(self.reference_genome.standard_contigs).contains(
mt.locus.contig,
(
# Rather than throwing an error, we silently remove invalid contigs.
# This happens fairly often for AnVIL requests.
hl.set(self.reference_genome.standard_contigs).contains(
mt.locus.contig,
)
# DRAGEN callsets produce long alternate alleles
# that aren't particularly analyzable as INDELs.
& (hl.len(mt.alleles[1]) < MAX_SNV_INDEL_ALLELE_LENGTH)
),
)

validation_exceptions = []
if self.skip_validation or not self.dataset_type.can_run_validation:
return mt.select_globals(
Expand Down
Loading
Loading