Skip to content

Commit 0cd5f8f

Browse files
authored
feat: implement SV export (#1107)
* bugfix: ensure family samples are written during migration * ruff * correct assertion * another attempt :/ * finish it off * ruff * next/iter * ruff * first pass refactor * Fix typing issues * ruff * cleanup * continue with mito * proper ht access * ruff * last few fields * First mito test * test empty transcripts * still working * still hacking * Get tests passing * sv transcripts * First pass at annotations * update test * update test * svs * Finish entries test
1 parent b01bd86 commit 0cd5f8f

24 files changed

+403
-20
lines changed

v03_pipeline/lib/tasks/exports/fields.py

Lines changed: 112 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,9 @@
11
import hail as hl
22

3-
from v03_pipeline.lib.annotations.expression_helpers import get_expr_for_xpos
43
from v03_pipeline.lib.model import DatasetType, ReferenceGenome, SampleType
5-
from v03_pipeline.lib.tasks.exports.misc import array_structexpression_fields
4+
from v03_pipeline.lib.tasks.exports.misc import (
5+
transcripts_field_name,
6+
)
67

78

89
def reference_independent_contig(locus: hl.LocusExpression):
@@ -31,6 +32,34 @@ def get_dataset_type_specific_annotations(
3132
'commonLowHeteroplasmy': ht.common_low_heteroplasmy,
3233
'mitomapPathogenic': ht.mitomap.pathogenic,
3334
},
35+
DatasetType.SV: lambda ht: {
36+
'algorithms': ht.algorithms,
37+
'bothsidesSupport': ht.bothsides_support,
38+
'cpxIntervals': ht.cpxIntervals.map(
39+
lambda cpx_i: hl.Struct(
40+
chrom=reference_independent_contig(cpx_i.start),
41+
start=cpx_i.start.position,
42+
end=cpx_i.end.position,
43+
type=cpx_i.type,
44+
),
45+
),
46+
'endChrom': hl.or_missing(
47+
(
48+
(ht.sv_type != 'INS')
49+
& (ht.start_locus.contig != ht.end_locus.contig)
50+
),
51+
reference_independent_contig(ht.end_locus),
52+
),
53+
'svSourceDetail': hl.or_missing(
54+
(
55+
(ht.sv_type == 'INS')
56+
& (ht.start_locus.contig != ht.end_locus.contig)
57+
),
58+
hl.Struct(chrom=reference_independent_contig(ht.end_locus)),
59+
),
60+
'svType': ht.sv_type,
61+
'svTypeDetail': ht.sv_type_detail,
62+
},
3463
}[dataset_type](ht)
3564

3665

@@ -54,6 +83,15 @@ def get_calls_export_fields(
5483
mitoCn=fe.mito_cn,
5584
contamination=fe.contamination,
5685
),
86+
DatasetType.SV: lambda fe: hl.Struct(
87+
sampleId=fe.s,
88+
gt=fe.GT.n_alt_alleles(),
89+
cn=fe.CN,
90+
gq=fe.GQ,
91+
newCall=fe.concordance.new_call,
92+
prevCall=fe.concordance.prev_call,
93+
prevNumAlt=fe.concordance.prev_num_alt,
94+
),
5795
}[dataset_type](fe)
5896

5997

@@ -68,7 +106,7 @@ def get_entries_export_fields(
68106
'project_guid': project_guid,
69107
'family_guid': ht.family_entries.family_guid[0],
70108
'sample_type': sample_type.value,
71-
'xpos': get_expr_for_xpos(ht.locus),
109+
'xpos': ht.xpos,
72110
**(
73111
{
74112
'is_gnomad_gt_5_percent': hl.is_defined(ht.is_gt_5_percent),
@@ -121,6 +159,9 @@ def get_predictions_export_fields(
121159
'sift': ht.dbnsfp.SIFT_score,
122160
'mlc': ht.local_constraint_mito.score,
123161
},
162+
DatasetType.SV: lambda ht: {
163+
'strvctvre': ht.strvctvre.score,
164+
},
124165
}[dataset_type](ht)
125166

126167

@@ -184,9 +225,74 @@ def get_populations_export_fields(ht: hl.Table, dataset_type: DatasetType):
184225
max_hl=ht.helix_mito.max_hl,
185226
),
186227
},
228+
DatasetType.SV: lambda ht: {
229+
'gnomad_svs': hl.Struct(
230+
af=ht.gnomad_svs.AF,
231+
het=ht.gnomad_svs.N_HET,
232+
hom=ht.gnomad_svs.N_HOM,
233+
id=ht.gnomad_svs.ID,
234+
),
235+
},
236+
}[dataset_type](ht)
237+
238+
239+
def get_position_fields(ht: hl.Table, dataset_type: DatasetType):
    """Return the genomic-position export fields for a variant row.

    Allele-based dataset types expose a single locus plus the ref/alt
    allele pair.  Structural-variant dataset types (SV, GCNV) are
    interval-based instead: they expose a start/end locus pair and the
    GRCh37 lifted-over end position.
    """
    if dataset_type not in {DatasetType.SV, DatasetType.GCNV}:
        # Allele-based variants: chrom/pos come from the single locus.
        return {
            'chrom': reference_independent_contig(ht.locus),
            'pos': ht.locus.position,
            'ref': ht.alleles[0],
            'alt': ht.alleles[1],
        }
    # Interval-based (structural) variants.
    return {
        'chrom': reference_independent_contig(ht.start_locus),
        'pos': ht.start_locus.position,
        # NOTE(review): key is 'end_locus' but the value is only the
        # position component — confirm downstream consumers expect this key.
        'end_locus': ht.end_locus.position,
        'rg37LocusEnd': hl.Struct(
            contig=reference_independent_contig(ht.rg37_locus_end),
            position=ht.rg37_locus_end.position,
        ),
    }
256+
257+
258+
def get_variant_id_fields(
    ht: hl.Table,
    dataset_type: DatasetType,
):
    """Return the identifier export fields for a variant row.

    Every supported dataset type exposes ``variantId``; SNV_INDEL and
    MITO additionally carry ``rsid``, and SNV_INDEL alone carries the
    ClinGen allele id (``CAID``).  Raises ``KeyError`` for dataset types
    with no export mapping.
    """
    # Per-type extras layered on top of the shared variantId field.
    extra_fields = {
        DatasetType.SNV_INDEL: lambda: {
            'rsid': ht.rsid,
            'CAID': ht.CAID,
        },
        DatasetType.MITO: lambda: {
            'rsid': ht.rsid,
        },
        DatasetType.SV: lambda: {},
    }[dataset_type]()
    return {
        'variantId': ht.variant_id,
        **extra_fields,
    }
188276

189277

278+
def get_consequences_fields(
    ht: hl.Table,
    reference_genome: ReferenceGenome,
    dataset_type: DatasetType,
):
    """Return the consequence-annotation export fields for a variant row.

    The transcript-consequences field name varies with reference genome
    and dataset type (resolved via ``transcripts_field_name``).  GRCh38
    SNV_INDEL rows additionally carry motif and regulatory feature
    consequences.
    """
    consequences_field = transcripts_field_name(reference_genome, dataset_type)
    is_grch38_snv_indel = (
        reference_genome == ReferenceGenome.GRCh38
        and dataset_type == DatasetType.SNV_INDEL
    )
    if not is_grch38_snv_indel:
        return {consequences_field: ht[consequences_field]}
    return {
        'sortedMotifFeatureConsequences': ht.sortedMotifFeatureConsequences,
        'sortedRegulatoryFeatureConsequences': ht.sortedRegulatoryFeatureConsequences,
        consequences_field: ht[consequences_field],
    }
294+
295+
190296
def get_variants_export_fields(
191297
ht: hl.Table,
192298
reference_genome: ReferenceGenome,
@@ -195,19 +301,8 @@ def get_variants_export_fields(
195301
return {
196302
'key_': ht.key_,
197303
'xpos': ht.xpos,
198-
'chrom': reference_independent_contig(ht.locus),
199-
'pos': ht.locus.position,
200-
'ref': ht.alleles[0],
201-
'alt': ht.alleles[1],
202-
'variantId': ht.variant_id,
203-
'rsid': ht.rsid,
204-
**(
205-
{
206-
'CAID': ht.CAID,
207-
}
208-
if hasattr(ht, 'CAID')
209-
else {}
210-
),
304+
**get_position_fields(ht, dataset_type),
305+
**get_variant_id_fields(ht, dataset_type),
211306
'liftedOverChrom': (
212307
reference_independent_contig(ht.rg37_locus)
213308
if hasattr(ht, 'rg37_locus')
@@ -225,5 +320,5 @@ def get_variants_export_fields(
225320
'populations': hl.Struct(
226321
**get_populations_export_fields(ht, dataset_type),
227322
),
228-
**{f: ht[f] for f in sorted(array_structexpression_fields(ht))},
323+
**get_consequences_fields(ht, reference_genome, dataset_type),
229324
}

v03_pipeline/lib/tasks/exports/misc.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -284,6 +284,14 @@ def unmap_formatting_annotation_enums(
284284
),
285285
)
286286
ht = ht.annotate_globals(enums=ht.enums.drop('mitotip'))
287+
if 'cpx_intervals' in formatting_annotation_names:
288+
ht = ht.annotate(
289+
cpx_intervals=ht.cpx_intervals.map(
290+
lambda cpx_i: cpx_i.annotate(
291+
type=hl.array(SV_TYPES)[cpx_i.type_id],
292+
).drop('type_id'),
293+
),
294+
)
287295
if 'sv_type_id' in formatting_annotation_names:
288296
ht = ht.annotate(sv_type=hl.array(SV_TYPES)[ht.sv_type_id]).drop('sv_type_id')
289297
ht = ht.annotate_globals(enums=ht.enums.drop('sv_type'))

v03_pipeline/lib/tasks/exports/write_new_entries_parquet_test.py

Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import luigi.worker
66
import pandas as pd
77

8+
from v03_pipeline.lib.annotations import shared
89
from v03_pipeline.lib.misc.io import import_callset, remap_pedigree_hash
910
from v03_pipeline.lib.model import (
1011
DatasetType,
@@ -26,11 +27,15 @@
2627

2728
TEST_PEDIGREE_3_REMAP = 'v03_pipeline/var/test/pedigrees/test_pedigree_3_remap.tsv'
2829
TEST_PEDIGREE_4_REMAP = 'v03_pipeline/var/test/pedigrees/test_pedigree_4_remap.tsv'
30+
TEST_PEDIGREE_5 = 'v03_pipeline/var/test/pedigrees/test_pedigree_5.tsv'
2931
TEST_MITO_EXPORT_PEDIGREE = (
3032
'v03_pipeline/var/test/pedigrees/test_mito_export_pedigree.tsv'
3133
)
34+
TEST_PEDIGREE_5 = 'v03_pipeline/var/test/pedigrees/test_pedigree_5.tsv'
3235
TEST_SNV_INDEL_VCF = 'v03_pipeline/var/test/callsets/1kg_30variants.vcf'
3336
TEST_MITO_CALLSET = 'v03_pipeline/var/test/callsets/mito_1.mt'
37+
TEST_SV_VCF_2 = 'v03_pipeline/var/test/callsets/sv_2.vcf'
38+
3439

3540
TEST_RUN_ID = 'manual__2024-04-03'
3641

@@ -45,6 +50,7 @@ def setUp(self) -> None:
4550
)
4651
ht = mt.rows()
4752
ht = ht.add_index(name='key_')
53+
ht = ht.annotate(xpos=shared.xpos(ht))
4854
ht = ht.annotate_globals(
4955
updates={
5056
hl.Struct(
@@ -68,6 +74,7 @@ def setUp(self) -> None:
6874
mt = import_callset(TEST_MITO_CALLSET, ReferenceGenome.GRCh38, DatasetType.MITO)
6975
ht = mt.rows()
7076
ht = ht.add_index(name='key_')
77+
ht = ht.annotate(xpos=shared.xpos(ht))
7178
ht = ht.annotate_globals(
7279
updates={
7380
hl.Struct(
@@ -286,3 +293,127 @@ def test_mito_write_new_entries_parquet(self, mock_uvatwnst: Mock):
286293
},
287294
],
288295
)
296+
297+
def test_sv_write_new_entries_parquet(self):
    """End-to-end check of the SV new-entries parquet export.

    Runs ``WriteNewEntriesParquetTask`` against the two-variant SV test
    callset / pedigree-5 project, then asserts the exported parquet rows
    match the expected family-entry records exactly (xpos, filters, and
    per-sample call structs including concordance fields).
    """
    worker = luigi.worker.Worker()
    task = WriteNewEntriesParquetTask(
        reference_genome=ReferenceGenome.GRCh38,
        dataset_type=DatasetType.SV,
        sample_type=SampleType.WGS,
        callset_path=TEST_SV_VCF_2,
        project_guids=['R0115_test_project2'],
        project_pedigree_paths=[TEST_PEDIGREE_5],
        skip_validation=True,
        run_id=TEST_RUN_ID,
    )
    worker.add(task)
    worker.run()
    self.assertTrue(task.output().exists())
    self.assertTrue(task.complete())
    # Read the parquet output back and normalize ndarray columns to
    # plain lists so it can be compared against the literal below.
    df = pd.read_parquet(
        new_entries_parquet_path(
            ReferenceGenome.GRCh38,
            DatasetType.SV,
            TEST_RUN_ID,
        ),
    )
    export_json = convert_ndarray_to_list(df.to_dict('records'))
    self.assertEqual(
        export_json,
        [
            {
                # First variant: one carrier (RGP_164_2); no copy-number
                # values on these calls ('cn' is None).
                'key': 0,
                'project_guid': 'R0115_test_project2',
                'family_guid': 'family_2_1',
                'sample_type': 'WGS',
                'xpos': 1000180929,
                'filters': ['HIGH_SR_BACKGROUND', 'UNRESOLVED'],
                'calls': [
                    {
                        'sampleId': 'RGP_164_1',
                        'gt': 0,
                        'cn': None,
                        'gq': 99,
                        'newCall': True,
                        'prevCall': False,
                        'prevNumAlt': None,
                    },
                    {
                        'sampleId': 'RGP_164_2',
                        'gt': 1,
                        'cn': None,
                        'gq': 31,
                        'newCall': True,
                        'prevCall': False,
                        'prevNumAlt': None,
                    },
                    {
                        'sampleId': 'RGP_164_3',
                        'gt': 0,
                        'cn': None,
                        'gq': 99,
                        'newCall': True,
                        'prevCall': False,
                        'prevNumAlt': None,
                    },
                    {
                        'sampleId': 'RGP_164_4',
                        'gt': 0,
                        'cn': None,
                        'gq': 99,
                        'newCall': True,
                        'prevCall': False,
                        'prevNumAlt': None,
                    },
                ],
                'sign': 1,
            },
            {
                # Second variant: copy-number bearing calls, with
                # RGP_164_3 as the single carrier.
                'key': 1,
                'project_guid': 'R0115_test_project2',
                'family_guid': 'family_2_1',
                'sample_type': 'WGS',
                'xpos': 1000257667,
                'filters': [],
                'calls': [
                    {
                        'sampleId': 'RGP_164_1',
                        'gt': 0,
                        'cn': 2.0,
                        'gq': 99,
                        'newCall': True,
                        'prevCall': False,
                        'prevNumAlt': None,
                    },
                    {
                        'sampleId': 'RGP_164_2',
                        'gt': 0,
                        'cn': 2.0,
                        'gq': 99,
                        'newCall': True,
                        'prevCall': False,
                        'prevNumAlt': None,
                    },
                    {
                        'sampleId': 'RGP_164_3',
                        'gt': 1,
                        'cn': 3.0,
                        'gq': 8,
                        'newCall': True,
                        'prevCall': False,
                        'prevNumAlt': None,
                    },
                    {
                        'sampleId': 'RGP_164_4',
                        'gt': 0,
                        'cn': 1.0,
                        'gq': 13,
                        'newCall': True,
                        'prevCall': False,
                        'prevNumAlt': None,
                    },
                ],
                'sign': 1,
            },
        ],
    )

v03_pipeline/lib/tasks/exports/write_new_transcripts_parquet.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -69,9 +69,13 @@ def create_table(self) -> None:
6969
ht[transcripts_field_name(self.reference_genome, self.dataset_type)],
7070
)
7171
.starmap(
72-
lambda i, s: s.annotate(
73-
majorConsequence=s.consequenceTerms.first(),
74-
transcriptRank=i,
72+
lambda i, s: (
73+
s
74+
if hasattr(s, 'majorConsequence')
75+
else s.annotate(
76+
majorConsequence=s.consequenceTerms.first(),
77+
transcriptRank=i,
78+
)
7579
),
7680
)
7781
.map(sorted_hl_struct),

0 commit comments

Comments
 (0)