Skip to content

Commit a23573f

Browse files
committed
use get_ht for spark dataframe
1 parent 68b7395 commit a23573f

File tree

1 file changed

+6
-1
lines changed

1 file changed

+6
-1
lines changed

loading_pipeline/lib/tasks/reference_data/updated_reference_dataset_parquet.py

Lines changed: 6 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,6 @@
11
import luigi
22

3+
from loading_pipeline.lib.annotations.expression_helpers import get_expr_for_variant_id
34
from loading_pipeline.lib.core.dataset_type import DatasetType
45
from loading_pipeline.lib.core.definitions import ReferenceGenome
56
from loading_pipeline.lib.paths import reference_dataset_parquet
@@ -31,7 +32,11 @@ def output(self):
3132
)
3233

3334
def run(self):
34-
df = self.reference_dataset.get_spark_dataframe(self.reference_genome)
35+
ht = self.reference_dataset.get_ht(self.reference_genome)
36+
ht = ht.annotate(
37+
variant_id=get_expr_for_variant_id(ht),
38+
)
39+
df = ht.to_spark(flatten=False)
3540
df.write.parquet(
3641
self.output().path,
3742
mode='overwrite',

0 commit comments

Comments
 (0)