Skip to content

feat: implement GCNV #1112

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
Jun 10, 2025
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions v03_pipeline/lib/model/dataset_type.py
Original file line number Diff line number Diff line change
@@ -393,8 +393,8 @@ def should_export_to_vcf(self):
return self == DatasetType.SV

@property
def should_export_to_parquet(self):
return self == DatasetType.SNV_INDEL
def export_all_callset_variants(self):
    """True when exports should include every variant present in the callset.

    GCNV has no incremental "new variants" table lookup downstream
    (see the reviewer note about the missing ClickHouse lookup table),
    so its export paths re-emit all callset variants instead of only
    newly added ones.
    """
    return self == DatasetType.GCNV

@property
def export_vcf_annotation_fns(self) -> list[Callable[..., hl.Expression]]:
35 changes: 34 additions & 1 deletion v03_pipeline/lib/tasks/exports/fields.py
Original file line number Diff line number Diff line change
@@ -60,6 +60,10 @@ def get_dataset_type_specific_annotations(
'svType': ht.sv_type,
'svTypeDetail': ht.sv_type_detail,
},
DatasetType.GCNV: lambda ht: {
'numExon': ht.num_exon,
'svType': ht.sv_type,
},
}[dataset_type](ht)


@@ -92,6 +96,20 @@ def get_calls_export_fields(
prevCall=fe.concordance.prev_call,
prevNumAlt=fe.concordance.prev_num_alt,
),
DatasetType.GCNV: lambda fe: hl.Struct(
sampleId=fe.s,
gt=fe.GT.n_alt_alleles(),
cn=fe.CN,
qs=fe.QS,
defragged=fe.defragged,
start=fe.sample_start,
end=fe.sample_end,
numExon=fe.sample_num_exon,
geneIds=fe.sample_gene_ids,
newCall=fe.concordance.new_call,
prevCall=fe.concordance.prev_call,
prevOverlap=fe.concordance.prev_overlap,
),
}[dataset_type](fe)


@@ -162,6 +180,9 @@ def get_predictions_export_fields(
DatasetType.SV: lambda ht: {
'strvctvre': ht.strvctvre.score,
},
DatasetType.GCNV: lambda ht: {
'strvctvre': ht.strvctvre.score,
},
}[dataset_type](ht)


@@ -233,6 +254,15 @@ def get_populations_export_fields(ht: hl.Table, dataset_type: DatasetType):
id=ht.gnomad_svs.ID,
),
},
DatasetType.GCNV: lambda ht: {
'seqrPop': hl.Struct(
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We'll need to handle the lack of a ClickHouse lookup table on the application side, but this was the cleanest way to get this to work...

af=ht.gt_stats.AF,
ac=ht.gt_stats.AC,
an=ht.gt_stats.AN,
Hom=ht.gt_stats.Hom,
Het=ht.gt_stats.Het,
),
},
}[dataset_type](ht)


@@ -241,7 +271,7 @@ def get_position_fields(ht: hl.Table, dataset_type: DatasetType):
return {
'chrom': reference_independent_contig(ht.start_locus),
'pos': ht.start_locus.position,
'end_locus': ht.end_locus.position,
'end': ht.end_locus.position,
'rg37LocusEnd': hl.Struct(
contig=reference_independent_contig(ht.rg37_locus_end),
position=ht.rg37_locus_end.position,
@@ -272,6 +302,9 @@ def get_variant_id_fields(
DatasetType.SV: lambda ht: {
'variantId': ht.variant_id,
},
DatasetType.GCNV: lambda ht: {
'variantId': ht.variant_id,
},
}[dataset_type](ht)


31 changes: 0 additions & 31 deletions v03_pipeline/lib/tasks/exports/selects.py

This file was deleted.

167 changes: 167 additions & 0 deletions v03_pipeline/lib/tasks/exports/write_new_entries_parquet_test.py
Original file line number Diff line number Diff line change
@@ -35,13 +35,16 @@
TEST_SNV_INDEL_VCF = 'v03_pipeline/var/test/callsets/1kg_30variants.vcf'
TEST_MITO_CALLSET = 'v03_pipeline/var/test/callsets/mito_1.mt'
TEST_SV_VCF_2 = 'v03_pipeline/var/test/callsets/sv_2.vcf'
TEST_GCNV_BED_FILE = 'v03_pipeline/var/test/callsets/gcnv_1.tsv'


TEST_RUN_ID = 'manual__2024-04-03'


class WriteNewEntriesParquetTest(MockedReferenceDatasetsTestCase):
def setUp(self) -> None:
# NOTE: The annotations tables are mocked for SNV_INDEL & MITO
# to avoid reference dataset updates that SV/GCNV do not have.
super().setUp()
mt = import_callset(
TEST_SNV_INDEL_VCF,
@@ -417,3 +420,167 @@ def test_sv_write_new_entries_parquet(self):
},
],
)

def test_gcnv_write_new_entries_parquet(self):
    """End-to-end check of WriteNewEntriesParquetTask for the GCNV dataset type.

    Runs the task against the gcnv_1.tsv test callset for a single project,
    then asserts the exact contents of the resulting parquet file: two
    variant rows (keys 0 and 1), each with per-sample GCNV call structs
    (copy number, QS, defrag flag, sample-level start/end, etc.).
    """
    worker = luigi.worker.Worker()
    task = WriteNewEntriesParquetTask(
        reference_genome=ReferenceGenome.GRCh38,
        dataset_type=DatasetType.GCNV,
        sample_type=SampleType.WES,
        callset_path=TEST_GCNV_BED_FILE,
        project_guids=['R0115_test_project2'],
        project_pedigree_paths=[TEST_PEDIGREE_5],
        skip_validation=True,
        run_id=TEST_RUN_ID,
    )
    worker.add(task)
    worker.run()
    self.assertTrue(task.output().exists())
    self.assertTrue(task.complete())
    df = pd.read_parquet(
        new_entries_parquet_path(
            ReferenceGenome.GRCh38,
            DatasetType.GCNV,
            TEST_RUN_ID,
        ),
    )
    # Convert numpy arrays nested in the records to plain lists so the
    # structure is directly comparable with the literal below.
    export_json = convert_ndarray_to_list(df.to_dict('records'))
    # NOTE(review): some numeric fields come back as floats (e.g. cn=0.0,
    # start=100017586.0) — presumably because nullable parquet columns are
    # read into float64 by pandas; confirm this is intentional downstream.
    self.assertEqual(
        export_json,
        [
            {
                'key': 0,
                'project_guid': 'R0115_test_project2',
                'family_guid': 'family_2_1',
                'sample_type': 'WES',
                'xpos': 1100006937,
                'filters': [],
                'calls': [
                    {
                        'sampleId': 'RGP_164_1',
                        'gt': 1,
                        'cn': 1,
                        'qs': 4,
                        'defragged': False,
                        'start': 100006937,
                        'end': 100007881,
                        'numExon': 2.0,
                        'geneIds': ['ENSG00000117620', 'ENSG00000283761'],
                        'newCall': False,
                        'prevCall': True,
                        'prevOverlap': False,
                    },
                    {
                        'sampleId': 'RGP_164_2',
                        'gt': 1,
                        'cn': 1,
                        'qs': 5,
                        'defragged': False,
                        'start': 100017585,
                        'end': 100023213,
                        'numExon': None,
                        'geneIds': ['ENSG00000117620', 'ENSG00000283761'],
                        'newCall': False,
                        'prevCall': False,
                        'prevOverlap': False,
                    },
                    {
                        'sampleId': 'RGP_164_3',
                        'gt': 2,
                        'cn': 0,
                        'qs': 30,
                        'defragged': False,
                        'start': 100017585,
                        'end': 100023213,
                        'numExon': None,
                        'geneIds': ['ENSG00000117620', 'ENSG00000283761'],
                        'newCall': False,
                        'prevCall': True,
                        'prevOverlap': False,
                    },
                    {
                        'sampleId': 'RGP_164_4',
                        'gt': 2,
                        'cn': 0.0,
                        'qs': 30.0,
                        'defragged': False,
                        'start': 100017586.0,
                        'end': 100023212.0,
                        'numExon': 2.0,
                        'geneIds': ['ENSG00000283761', 'ENSG22222222222'],
                        'newCall': False,
                        'prevCall': True,
                        'prevOverlap': False,
                    },
                ],
                'sign': 1,
            },
            {
                'key': 1,
                'project_guid': 'R0115_test_project2',
                'family_guid': 'family_2_1',
                'sample_type': 'WES',
                'xpos': 1100017586,
                'filters': [],
                'calls': [
                    # Samples with no call at this variant still get a row,
                    # with every call-level field nulled out.
                    {
                        'sampleId': 'RGP_164_1',
                        'gt': None,
                        'cn': None,
                        'qs': None,
                        'defragged': None,
                        'start': None,
                        'end': None,
                        'numExon': None,
                        'geneIds': None,
                        'newCall': None,
                        'prevCall': None,
                        'prevOverlap': None,
                    },
                    {
                        'sampleId': 'RGP_164_2',
                        'gt': None,
                        'cn': None,
                        'qs': None,
                        'defragged': None,
                        'start': None,
                        'end': None,
                        'numExon': None,
                        'geneIds': None,
                        'newCall': None,
                        'prevCall': None,
                        'prevOverlap': None,
                    },
                    {
                        'sampleId': 'RGP_164_3',
                        'gt': 2,
                        'cn': 0.0,
                        'qs': 30.0,
                        'defragged': False,
                        'start': None,
                        'end': None,
                        'numExon': None,
                        'geneIds': None,
                        'newCall': False,
                        'prevCall': True,
                        'prevOverlap': False,
                    },
                    {
                        'sampleId': 'RGP_164_4',
                        'gt': None,
                        'cn': None,
                        'qs': None,
                        'defragged': None,
                        'start': None,
                        'end': None,
                        'numExon': None,
                        'geneIds': None,
                        'newCall': None,
                        'prevCall': None,
                        'prevOverlap': None,
                    },
                ],
                'sign': 1,
            },
        ],
    )
42 changes: 31 additions & 11 deletions v03_pipeline/lib/tasks/exports/write_new_transcripts_parquet.py
Original file line number Diff line number Diff line change
@@ -2,9 +2,11 @@
import luigi
import luigi.util

from v03_pipeline.lib.misc.callsets import get_callset_ht
from v03_pipeline.lib.paths import (
new_transcripts_parquet_path,
new_variants_table_path,
variant_annotations_table_path,
)
from v03_pipeline.lib.tasks.base.base_loading_run_params import (
BaseLoadingRunParams,
@@ -20,6 +22,9 @@
from v03_pipeline.lib.tasks.update_new_variants_with_caids import (
UpdateNewVariantsWithCAIDsTask,
)
from v03_pipeline.lib.tasks.update_variant_annotations_table_with_new_samples import (
UpdateVariantAnnotationsTableWithNewSamplesTask,
)
from v03_pipeline.lib.tasks.write_new_variants_table import WriteNewVariantsTableTask


@@ -37,21 +42,36 @@ def output(self) -> luigi.Target:
def complete(self) -> luigi.Target:
return GCSorLocalFolderTarget(self.output().path).exists()

def requires(self) -> list[luigi.Task]:
return [
self.clone(UpdateNewVariantsWithCAIDsTask)
if self.dataset_type.should_send_to_allele_registry
else self.clone(WriteNewVariantsTableTask),
]
def requires(self) -> luigi.Task:
    """Pick the upstream task that materializes the variants to export.

    GCNV (export_all_callset_variants) reads from the full annotations
    table; SNV/INDEL (should_send_to_allele_registry) waits for CAIDs to
    be attached; everything else depends on the plain new-variants table.
    """
    if self.dataset_type.export_all_callset_variants:
        upstream = UpdateVariantAnnotationsTableWithNewSamplesTask
    elif self.dataset_type.should_send_to_allele_registry:
        upstream = UpdateNewVariantsWithCAIDsTask
    else:
        upstream = WriteNewVariantsTableTask
    return self.clone(upstream)

def create_table(self) -> None:
ht = hl.read_table(
new_variants_table_path(
if self.dataset_type.export_all_callset_variants:
ht = hl.read_table(
variant_annotations_table_path(
self.reference_genome,
self.dataset_type,
),
)
callset_ht = get_callset_ht(
self.reference_genome,
self.dataset_type,
self.run_id,
),
)
self.callset_path,
self.project_guids,
)
ht = ht.semi_join(callset_ht)
else:
ht = hl.read_table(
new_variants_table_path(
self.reference_genome,
self.dataset_type,
self.run_id,
),
)
ht = unmap_formatting_annotation_enums(
ht,
self.reference_genome,
Loading