Skip to content

Commit 0cd5f8f

Browse files
authored
feat: implement SV export (#1107)
* bugfix: ensure family samples are written during migration * ruff * correct assertion * another attempt :/ * finish it off * ruff * next/iter * ruff * first pass refactor * Fix typing issues * ruff * cleanup * continue with mito * proper ht access * ruff * last few fields * First mito test * test empty transcripts * still working * still hacking * Get tests passing * sv transcripts * First pass at annotations * update test * update test * svs * Finish entries test
1 parent b01bd86 commit 0cd5f8f

24 files changed

+403
-20
lines changed

v03_pipeline/lib/tasks/exports/fields.py

Lines changed: 112 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,9 @@
11
import hail as hl
22

3-
from v03_pipeline.lib.annotations.expression_helpers import get_expr_for_xpos
43
from v03_pipeline.lib.model import DatasetType, ReferenceGenome, SampleType
5-
from v03_pipeline.lib.tasks.exports.misc import array_structexpression_fields
4+
from v03_pipeline.lib.tasks.exports.misc import (
5+
transcripts_field_name,
6+
)
67

78

89
def reference_independent_contig(locus: hl.LocusExpression):
@@ -31,6 +32,34 @@ def get_dataset_type_specific_annotations(
3132
'commonLowHeteroplasmy': ht.common_low_heteroplasmy,
3233
'mitomapPathogenic': ht.mitomap.pathogenic,
3334
},
35+
DatasetType.SV: lambda ht: {
36+
'algorithms': ht.algorithms,
37+
'bothsidesSupport': ht.bothsides_support,
38+
'cpxIntervals': ht.cpxIntervals.map(
39+
lambda cpx_i: hl.Struct(
40+
chrom=reference_independent_contig(cpx_i.start),
41+
start=cpx_i.start.position,
42+
end=cpx_i.end.position,
43+
type=cpx_i.type,
44+
),
45+
),
46+
'endChrom': hl.or_missing(
47+
(
48+
(ht.sv_type != 'INS')
49+
& (ht.start_locus.contig != ht.end_locus.contig)
50+
),
51+
reference_independent_contig(ht.end_locus),
52+
),
53+
'svSourceDetail': hl.or_missing(
54+
(
55+
(ht.sv_type == 'INS')
56+
& (ht.start_locus.contig != ht.end_locus.contig)
57+
),
58+
hl.Struct(chrom=reference_independent_contig(ht.end_locus)),
59+
),
60+
'svType': ht.sv_type,
61+
'svTypeDetail': ht.sv_type_detail,
62+
},
3463
}[dataset_type](ht)
3564

3665

@@ -54,6 +83,15 @@ def get_calls_export_fields(
5483
mitoCn=fe.mito_cn,
5584
contamination=fe.contamination,
5685
),
86+
DatasetType.SV: lambda fe: hl.Struct(
87+
sampleId=fe.s,
88+
gt=fe.GT.n_alt_alleles(),
89+
cn=fe.CN,
90+
gq=fe.GQ,
91+
newCall=fe.concordance.new_call,
92+
prevCall=fe.concordance.prev_call,
93+
prevNumAlt=fe.concordance.prev_num_alt,
94+
),
5795
}[dataset_type](fe)
5896

5997

@@ -68,7 +106,7 @@ def get_entries_export_fields(
68106
'project_guid': project_guid,
69107
'family_guid': ht.family_entries.family_guid[0],
70108
'sample_type': sample_type.value,
71-
'xpos': get_expr_for_xpos(ht.locus),
109+
'xpos': ht.xpos,
72110
**(
73111
{
74112
'is_gnomad_gt_5_percent': hl.is_defined(ht.is_gt_5_percent),
@@ -121,6 +159,9 @@ def get_predictions_export_fields(
121159
'sift': ht.dbnsfp.SIFT_score,
122160
'mlc': ht.local_constraint_mito.score,
123161
},
162+
DatasetType.SV: lambda ht: {
163+
'strvctvre': ht.strvctvre.score,
164+
},
124165
}[dataset_type](ht)
125166

126167

@@ -184,9 +225,74 @@ def get_populations_export_fields(ht: hl.Table, dataset_type: DatasetType):
184225
max_hl=ht.helix_mito.max_hl,
185226
),
186227
},
228+
DatasetType.SV: lambda ht: {
229+
'gnomad_svs': hl.Struct(
230+
af=ht.gnomad_svs.AF,
231+
het=ht.gnomad_svs.N_HET,
232+
hom=ht.gnomad_svs.N_HOM,
233+
id=ht.gnomad_svs.ID,
234+
),
235+
},
236+
}[dataset_type](ht)
237+
238+
239+
def get_position_fields(ht: hl.Table, dataset_type: DatasetType):
    """Return the genomic-position export fields for a variant row.

    Allele-based dataset types expose a single locus plus the ref/alt
    allele pair.  Structural-variant dataset types (SV, GCNV) are
    interval-based instead: they expose a start/end locus pair and the
    GRCh37 lifted-over end position.
    """
    if dataset_type not in {DatasetType.SV, DatasetType.GCNV}:
        # Allele-based variants: chrom/pos come from the single locus.
        return {
            'chrom': reference_independent_contig(ht.locus),
            'pos': ht.locus.position,
            'ref': ht.alleles[0],
            'alt': ht.alleles[1],
        }
    # Interval-based (structural) variants.
    return {
        'chrom': reference_independent_contig(ht.start_locus),
        'pos': ht.start_locus.position,
        # NOTE(review): key is 'end_locus' but the value is only the
        # position component — confirm downstream consumers expect this key.
        'end_locus': ht.end_locus.position,
        'rg37LocusEnd': hl.Struct(
            contig=reference_independent_contig(ht.rg37_locus_end),
            position=ht.rg37_locus_end.position,
        ),
    }
256+
257+
258+
def get_variant_id_fields(
    ht: hl.Table,
    dataset_type: DatasetType,
):
    """Return the identifier export fields for a variant row.

    Every supported dataset type exposes ``variantId``; SNV_INDEL and
    MITO additionally carry ``rsid``, and SNV_INDEL alone carries the
    ClinGen allele id (``CAID``).  Raises ``KeyError`` for dataset types
    with no export mapping.
    """
    # Per-type extras layered on top of the shared variantId field.
    extra_fields = {
        DatasetType.SNV_INDEL: lambda: {
            'rsid': ht.rsid,
            'CAID': ht.CAID,
        },
        DatasetType.MITO: lambda: {
            'rsid': ht.rsid,
        },
        DatasetType.SV: lambda: {},
    }[dataset_type]()
    return {
        'variantId': ht.variant_id,
        **extra_fields,
    }
188276

189277

278+
def get_consequences_fields(
    ht: hl.Table,
    reference_genome: ReferenceGenome,
    dataset_type: DatasetType,
):
    """Return the consequence-annotation export fields for a variant row.

    The transcript-consequences field name varies with reference genome
    and dataset type (resolved via ``transcripts_field_name``).  GRCh38
    SNV_INDEL rows additionally carry motif and regulatory feature
    consequences.
    """
    consequences_field = transcripts_field_name(reference_genome, dataset_type)
    is_grch38_snv_indel = (
        reference_genome == ReferenceGenome.GRCh38
        and dataset_type == DatasetType.SNV_INDEL
    )
    if not is_grch38_snv_indel:
        return {consequences_field: ht[consequences_field]}
    return {
        'sortedMotifFeatureConsequences': ht.sortedMotifFeatureConsequences,
        'sortedRegulatoryFeatureConsequences': ht.sortedRegulatoryFeatureConsequences,
        consequences_field: ht[consequences_field],
    }
294+
295+
190296
def get_variants_export_fields(
191297
ht: hl.Table,
192298
reference_genome: ReferenceGenome,
@@ -195,19 +301,8 @@ def get_variants_export_fields(
195301
return {
196302
'key_': ht.key_,
197303
'xpos': ht.xpos,
198-
'chrom': reference_independent_contig(ht.locus),
199-
'pos': ht.locus.position,
200-
'ref': ht.alleles[0],
201-
'alt': ht.alleles[1],
202-
'variantId': ht.variant_id,
203-
'rsid': ht.rsid,
204-
**(
205-
{
206-
'CAID': ht.CAID,
207-
}
208-
if hasattr(ht, 'CAID')
209-
else {}
210-
),
304+
**get_position_fields(ht, dataset_type),
305+
**get_variant_id_fields(ht, dataset_type),
211306
'liftedOverChrom': (
212307
reference_independent_contig(ht.rg37_locus)
213308
if hasattr(ht, 'rg37_locus')
@@ -225,5 +320,5 @@ def get_variants_export_fields(
225320
'populations': hl.Struct(
226321
**get_populations_export_fields(ht, dataset_type),
227322
),
228-
**{f: ht[f] for f in sorted(array_structexpression_fields(ht))},
323+
**get_consequences_fields(ht, reference_genome, dataset_type),
229324
}

v03_pipeline/lib/tasks/exports/misc.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -284,6 +284,14 @@ def unmap_formatting_annotation_enums(
284284
),
285285
)
286286
ht = ht.annotate_globals(enums=ht.enums.drop('mitotip'))
287+
if 'cpx_intervals' in formatting_annotation_names:
288+
ht = ht.annotate(
289+
cpx_intervals=ht.cpx_intervals.map(
290+
lambda cpx_i: cpx_i.annotate(
291+
type=hl.array(SV_TYPES)[cpx_i.type_id],
292+
).drop('type_id'),
293+
),
294+
)
287295
if 'sv_type_id' in formatting_annotation_names:
288296
ht = ht.annotate(sv_type=hl.array(SV_TYPES)[ht.sv_type_id]).drop('sv_type_id')
289297
ht = ht.annotate_globals(enums=ht.enums.drop('sv_type'))

v03_pipeline/lib/tasks/exports/write_new_entries_parquet_test.py

Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import luigi.worker
66
import pandas as pd
77

8+
from v03_pipeline.lib.annotations import shared
89
from v03_pipeline.lib.misc.io import import_callset, remap_pedigree_hash
910
from v03_pipeline.lib.model import (
1011
DatasetType,
@@ -26,11 +27,15 @@
2627

2728
TEST_PEDIGREE_3_REMAP = 'v03_pipeline/var/test/pedigrees/test_pedigree_3_remap.tsv'
2829
TEST_PEDIGREE_4_REMAP = 'v03_pipeline/var/test/pedigrees/test_pedigree_4_remap.tsv'
30+
TEST_PEDIGREE_5 = 'v03_pipeline/var/test/pedigrees/test_pedigree_5.tsv'
2931
TEST_MITO_EXPORT_PEDIGREE = (
3032
'v03_pipeline/var/test/pedigrees/test_mito_export_pedigree.tsv'
3133
)
34+
TEST_PEDIGREE_5 = 'v03_pipeline/var/test/pedigrees/test_pedigree_5.tsv'
3235
TEST_SNV_INDEL_VCF = 'v03_pipeline/var/test/callsets/1kg_30variants.vcf'
3336
TEST_MITO_CALLSET = 'v03_pipeline/var/test/callsets/mito_1.mt'
37+
TEST_SV_VCF_2 = 'v03_pipeline/var/test/callsets/sv_2.vcf'
38+
3439

3540
TEST_RUN_ID = 'manual__2024-04-03'
3641

@@ -45,6 +50,7 @@ def setUp(self) -> None:
4550
)
4651
ht = mt.rows()
4752
ht = ht.add_index(name='key_')
53+
ht = ht.annotate(xpos=shared.xpos(ht))
4854
ht = ht.annotate_globals(
4955
updates={
5056
hl.Struct(
@@ -68,6 +74,7 @@ def setUp(self) -> None:
6874
mt = import_callset(TEST_MITO_CALLSET, ReferenceGenome.GRCh38, DatasetType.MITO)
6975
ht = mt.rows()
7076
ht = ht.add_index(name='key_')
77+
ht = ht.annotate(xpos=shared.xpos(ht))
7178
ht = ht.annotate_globals(
7279
updates={
7380
hl.Struct(
@@ -286,3 +293,127 @@ def test_mito_write_new_entries_parquet(self, mock_uvatwnst: Mock):
286293
},
287294
],
288295
)
296+
297+
def test_sv_write_new_entries_parquet(self):
    """End-to-end check of the SV new-entries parquet export.

    Runs ``WriteNewEntriesParquetTask`` against the two-variant SV test
    callset / pedigree-5 project, then asserts the exported parquet rows
    match the expected family-entry records exactly (xpos, filters, and
    per-sample call structs including concordance fields).
    """
    worker = luigi.worker.Worker()
    task = WriteNewEntriesParquetTask(
        reference_genome=ReferenceGenome.GRCh38,
        dataset_type=DatasetType.SV,
        sample_type=SampleType.WGS,
        callset_path=TEST_SV_VCF_2,
        project_guids=['R0115_test_project2'],
        project_pedigree_paths=[TEST_PEDIGREE_5],
        skip_validation=True,
        run_id=TEST_RUN_ID,
    )
    worker.add(task)
    worker.run()
    self.assertTrue(task.output().exists())
    self.assertTrue(task.complete())
    # Read the parquet output back and normalize ndarray columns to
    # plain lists so it can be compared against the literal below.
    df = pd.read_parquet(
        new_entries_parquet_path(
            ReferenceGenome.GRCh38,
            DatasetType.SV,
            TEST_RUN_ID,
        ),
    )
    export_json = convert_ndarray_to_list(df.to_dict('records'))
    self.assertEqual(
        export_json,
        [
            {
                # First variant: one carrier (RGP_164_2); no copy-number
                # values on these calls ('cn' is None).
                'key': 0,
                'project_guid': 'R0115_test_project2',
                'family_guid': 'family_2_1',
                'sample_type': 'WGS',
                'xpos': 1000180929,
                'filters': ['HIGH_SR_BACKGROUND', 'UNRESOLVED'],
                'calls': [
                    {
                        'sampleId': 'RGP_164_1',
                        'gt': 0,
                        'cn': None,
                        'gq': 99,
                        'newCall': True,
                        'prevCall': False,
                        'prevNumAlt': None,
                    },
                    {
                        'sampleId': 'RGP_164_2',
                        'gt': 1,
                        'cn': None,
                        'gq': 31,
                        'newCall': True,
                        'prevCall': False,
                        'prevNumAlt': None,
                    },
                    {
                        'sampleId': 'RGP_164_3',
                        'gt': 0,
                        'cn': None,
                        'gq': 99,
                        'newCall': True,
                        'prevCall': False,
                        'prevNumAlt': None,
                    },
                    {
                        'sampleId': 'RGP_164_4',
                        'gt': 0,
                        'cn': None,
                        'gq': 99,
                        'newCall': True,
                        'prevCall': False,
                        'prevNumAlt': None,
                    },
                ],
                'sign': 1,
            },
            {
                # Second variant: copy-number bearing calls, with
                # RGP_164_3 as the single carrier.
                'key': 1,
                'project_guid': 'R0115_test_project2',
                'family_guid': 'family_2_1',
                'sample_type': 'WGS',
                'xpos': 1000257667,
                'filters': [],
                'calls': [
                    {
                        'sampleId': 'RGP_164_1',
                        'gt': 0,
                        'cn': 2.0,
                        'gq': 99,
                        'newCall': True,
                        'prevCall': False,
                        'prevNumAlt': None,
                    },
                    {
                        'sampleId': 'RGP_164_2',
                        'gt': 0,
                        'cn': 2.0,
                        'gq': 99,
                        'newCall': True,
                        'prevCall': False,
                        'prevNumAlt': None,
                    },
                    {
                        'sampleId': 'RGP_164_3',
                        'gt': 1,
                        'cn': 3.0,
                        'gq': 8,
                        'newCall': True,
                        'prevCall': False,
                        'prevNumAlt': None,
                    },
                    {
                        'sampleId': 'RGP_164_4',
                        'gt': 0,
                        'cn': 1.0,
                        'gq': 13,
                        'newCall': True,
                        'prevCall': False,
                        'prevNumAlt': None,
                    },
                ],
                'sign': 1,
            },
        ],
    )

v03_pipeline/lib/tasks/exports/write_new_transcripts_parquet.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -69,9 +69,13 @@ def create_table(self) -> None:
6969
ht[transcripts_field_name(self.reference_genome, self.dataset_type)],
7070
)
7171
.starmap(
72-
lambda i, s: s.annotate(
73-
majorConsequence=s.consequenceTerms.first(),
74-
transcriptRank=i,
72+
lambda i, s: (
73+
s
74+
if hasattr(s, 'majorConsequence')
75+
else s.annotate(
76+
majorConsequence=s.consequenceTerms.first(),
77+
transcriptRank=i,
78+
)
7579
),
7680
)
7781
.map(sorted_hl_struct),

0 commit comments

Comments
 (0)