Draft
Changes from all commits (40 commits)
3ad4c98
VS-1570 Add MANES check to VAT validation (#9266)
gbggrant Sep 23, 2025
4f0af4f
VS-1644. Increase disk for ExcludeSitesFromSitesOnlyVcf task. (#9271)
gbggrant Sep 23, 2025
67baa5a
PGEN Exome improvements [VS-1634] (#9272)
mcovarr Sep 25, 2025
67f06c8
VS-1743. Modify pgen .pvar file to use a new ID field format. (#9273)
gbggrant Sep 25, 2025
eab6024
WDLs and scripts from Foxtrot AN investigation [DST-2716] (#9278)
mcovarr Oct 8, 2025
ea30f0e
Adjust PGEN extract memory based on Foxtrot experience [VS-1634] (#9280)
mcovarr Oct 8, 2025
506f4c5
WDL to create the participant mapping table [VS-1632] (#9283)
mcovarr Oct 14, 2025
197136a
Remove a (now-not existant) clinvar significance from expected. (#9284)
gbggrant Oct 16, 2025
717e0d6
Find and fix another bug (#9285)
gbggrant Oct 17, 2025
a85aa64
CMRG-inspired cleanup [VS-1739] (#9287)
mcovarr Oct 20, 2025
e585309
Unmapped VID participant ID fixup [VS-1686] (#9286)
mcovarr Oct 21, 2025
9729e56
VS-1748 uniqueify prep tables for extract cohort (#9291)
gbggrant Oct 29, 2025
fde4fb4
Map dropped duplicate VIDs [VS-1757] (#9292)
mcovarr Nov 4, 2025
66d34af
VS-1747 Respect the withdrawn flag in GvsExtractCohortFromSampleNames…
gbggrant Dec 2, 2025
94bd318
Merge master, fix unrelated VDS-making breakage [VS-1777] (#9302)
mcovarr Dec 10, 2025
308ef08
VS-1772 Allow additional padding of intervals for GvsPrepareRangesCal…
gbggrant Jan 13, 2026
286fc9a
Use ID file for Variants Docker building [VS-1788] (#9308)
mcovarr Jan 13, 2026
080ca0b
Upate the variants docker.
gbggrant Jan 26, 2026
95db0a7
Merge branch 'ah_var_store' into gg_VS-1780
gbggrant Jan 26, 2026
b41883c
Merge remote-tracking branch 'origin/VS-1736' into gg_VS-1780
gbggrant Jan 28, 2026
6c9ae7b
Minor updates for testing.
gbggrant Jan 28, 2026
43d3ff3
Hacky method to set is_loaded in sample_info table.
gbggrant Jan 29, 2026
74d5678
Update hacky method to set is_loaded in sample_info table.
gbggrant Jan 29, 2026
be56266
To run integration test.
gbggrant Jan 29, 2026
a327dc5
Testing setting sample_load_status for parquet files.
gbggrant Jan 30, 2026
2b90504
Limit test to BulkIngest shall we?
gbggrant Jan 30, 2026
c4d1450
Add branch to .dockstore.yml
gbggrant Jan 30, 2026
30d693e
Test another way to set sample_info.is_loaded based on parquet
gbggrant Feb 1, 2026
e55fb95
For want of a ) a workflow has failed.
gbggrant Feb 2, 2026
7b9e0f2
Dang
gbggrant Feb 2, 2026
66b588c
Better query
gbggrant Feb 2, 2026
d9d5e1c
Cleanup
gbggrant Feb 2, 2026
c93bf4f
embed sample id in parquet filename
mcovarr Feb 5, 2026
179fba5
update Docker
mcovarr Feb 5, 2026
3c8fd75
oops
mcovarr Feb 5, 2026
d6e56cd
update Docker
mcovarr Feb 5, 2026
5bead97
reorder components for sanity
mcovarr Feb 5, 2026
f3bc8f4
cleanup
mcovarr Feb 5, 2026
8a87182
cleanup
mcovarr Feb 5, 2026
842f84d
update Docker
mcovarr Feb 6, 2026
4 changes: 3 additions & 1 deletion .dockstore.yml
@@ -222,6 +222,7 @@ workflows:
- master
- ah_var_store
- VS-1736
- gg_VS-1780
tags:
- /.*/
- name: GvsPrepareRangesCallset
@@ -290,7 +291,7 @@ workflows:
- master
- ah_var_store
- VS-1737
- gg_VS-1785
- gg_VS-1780
tags:
- /.*/
- name: GvsBeta
@@ -347,6 +348,7 @@ workflows:
- master
- ah_var_store
- vs_1777_build_failure
- gg_VS-1780
tags:
- /.*/
- name: GvsIngestTieout
94 changes: 85 additions & 9 deletions scripts/variantstore/wdl/GvsImportGenomes.wdl
@@ -46,6 +46,7 @@ workflow GvsImportGenomes {
String? billing_project_id

# Dump these parquet files to a bucket
Boolean use_parquet_ingest = true # currently only in limited use.
String output_gcs_dir
Boolean configure_parquet_bucket_lifecycle = false

@@ -187,12 +188,14 @@
}

if (load_vet_and_ref_ranges) {
call SetIsLoadedColumn {
input:
load_done = LoadData.done,
project_id = project_id,
dataset_name = dataset_name,
cloud_sdk_docker = effective_cloud_sdk_docker,
if (!use_parquet_ingest) {
call SetIsLoadedColumn {
input:
load_done = LoadData.done,
project_id = project_id,
dataset_name = dataset_name,
cloud_sdk_docker = effective_cloud_sdk_docker,
}
}

if (configure_parquet_bucket_lifecycle) {
@@ -211,7 +214,7 @@
input:
project_id = project_id,
dataset_name = dataset_name,
set_is_loaded_done = SetIsLoadedColumn.done,
load_done = LoadData.done,
lifecycle_configured = select_first([ConfigureParquetLifecycle.done, "done"]),
variants_docker = effective_variants_docker,
}
@@ -245,6 +248,17 @@
load_outputs = LoadParquetFilesToBQ.completion_status,
variants_docker = effective_variants_docker,
}

if (use_parquet_ingest) {
# Update sample_info.is_loaded once parquet loading has been verified
call SetIsLoadedColumnForParquetIngest {
input:
go = VerifyParquetLoading.done,
project_id = project_id,
dataset_name = dataset_name,
cloud_sdk_docker = effective_cloud_sdk_docker,
}
}
}

output {
@@ -459,7 +473,7 @@ task LoadData {
vet_parquet_file=`ls vet_*.parquet`
ref_parquet_file=`ls ref_*.parquet`

# parse the table partition out of the file name
# parse the table superpartition out of the file name
table_number=$(echo "$vet_parquet_file" | cut -d'_' -f2)

# copy the vet and ref parquet files to the gcs bucket in the right place
@@ -569,6 +583,67 @@ task SetIsLoadedColumn {
}
}

task SetIsLoadedColumnForParquetIngest {
input {
String dataset_name
String project_id

Boolean go
String cloud_sdk_docker
}
meta {
# This is doing some tricky stuff with `INFORMATION_SCHEMA` so just punt and let it be `volatile`.
volatile: true
}

# add labels for DSP Cloud Cost Control Labeling and Reporting
String bq_labels = "--label service:gvs --label team:variants --label managedby:import_genomes"

command <<<
# Prepend date, time and pwd to xtrace log entries.
PS4='\D{+%F %T} \w $ '
set -o errexit -o nounset -o pipefail -o xtrace

echo "project_id = ~{project_id}" > ~/.bigqueryrc

# set is_loaded to true if there is a corresponding vet table partition with rows for that sample_id

# Note that we tried modifying CreateVariantIngestFiles to UPDATE sample_info.is_loaded on a per-sample basis.
# The major issue that was found is that BigQuery allows only 20 such concurrent DML statements. Considered using
# an exponential backoff, but at the number of samples that are being loaded this would introduce significant delays
# in workflow processing. So this method is used to set *all* of the saple_info.is_loaded flags at one time.
Copilot AI Feb 2, 2026
Corrected spelling of 'saple_info' to 'sample_info'.
Suggested change:
- # in workflow processing. So this method is used to set *all* of the saple_info.is_loaded flags at one time.
+ # in workflow processing. So this method is used to set *all* of the sample_info.is_loaded flags at one time.
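# For context, a minimal sketch of the rejected per-sample alternative
# (hypothetical, for illustration only): CreateVariantIngestFiles would have run
#   UPDATE `~{dataset_name}.sample_info` SET is_loaded = true WHERE sample_id = <id>
# once per sample. As noted above, only 20 such DML statements can run
# concurrently against a table, so at cohort scale these updates would stall,
# hence the single bulk UPDATE below.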

# bq query --max_rows check: ok update
bq --apilog=false --project_id=~{project_id} query --format=csv --use_legacy_sql=false ~{bq_labels} \
'UPDATE `~{dataset_name}.sample_info` SET is_loaded = true
WHERE sample_id IN (SELECT CAST(partition_id AS INT64)
@mcovarr (Collaborator) Feb 2, 2026
I don't understand this... a sample id is being compared to a partition id?

@mcovarr (Collaborator) Feb 2, 2026
Ah ok... I tinkered with this query in the console and I think I see how this works. But I'm wondering if this is still going to return correct results for vet tables > 001? Wouldn't the partitions in vet_002 start at 1 again?

(Collaborator Author)
I actually copied the bit about the partition from the 'normal' SetIsLoadedColumn method - I had thought it had been put in there to avoid some of the weirdly named vet and ref_ranges tables that were created during foxtrot? Very possible I misunderstood that.

from `~{dataset_name}.INFORMATION_SCHEMA.PARTITIONS`
WHERE partition_id NOT LIKE "__%" AND total_logical_bytes > 0 AND REGEXP_CONTAINS(table_name, "^vet_[0-9]+$")) OR
(Collaborator)
not sure I understand why this big OR block is here

(Collaborator)
shouldn't there be a ref_ranges version of the AND logic here?

(Collaborator Author)
The big OR logic I added (starting here at line 622) is trying to confirm that there is a sample_name (as extracted from the file_path) in parquet_load_status both for a vet parquet file creation and for a ref_ranges parquet file creation.

sample_name in (
select sample_name from
(
select REGEXP_EXTRACT(file_path, r".*input_vcf_\d+_(.*).vcf.gz.parquet$") as sample_name,
FROM `~{dataset_name}.parquet_load_status`
where REGEXP_CONTAINS(file_path, ".*vet_[0-9]+_input_vcf_[0-9]+_.*$")
intersect distinct
select REGEXP_EXTRACT(file_path, r".*input_vcf_\d+_(.*).vcf.gz.parquet$") as sample_name,
FROM `~{dataset_name}.parquet_load_status`
where REGEXP_CONTAINS(file_path, ".*ref_ranges_[0-9]+_input_vcf_[0-9]+_.*$")
)
)'
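# Worked example for the OR block above (hypothetical sample name; the path
# shape is an assumption): a parquet_load_status row with
#   file_path = gs://bucket/parquet/vet_001_input_vcf_7_SM-ABC12.vcf.gz.parquet
# matches the vet REGEXP_CONTAINS pattern, and REGEXP_EXTRACT yields
# sample_name "SM-ABC12". The INTERSECT DISTINCT keeps that name only if a
# matching ref_ranges_* row exists as well, i.e. both parquet files were made.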
>>>
runtime {
docker: cloud_sdk_docker
memory: "1 GB"
disks: "local-disk 10 HDD"
cpu: 1
}

output {
Boolean done = true
}
}

task GetUningestedSampleIds {
input {
String dataset_name
@@ -796,7 +871,7 @@ task CreateParquetTrackingTable {
input {
String project_id
String dataset_name
String set_is_loaded_done
Array[String] load_done
Copilot AI Feb 2, 2026
The parameter name 'load_done' is ambiguous when typed as Array[String]. Consider renaming to 'load_done_statuses' or 'load_completion_markers' to clarify that it's a collection rather than a single completion signal.
String lifecycle_configured
String variants_docker
}
@@ -955,5 +1030,6 @@ task VerifyParquetLoading {
Int loaded_files = read_json(results_json)["loaded_files"]
Int missing_files = read_json(results_json)["missing_files"]
File? missing_files_list = "verification_output/missing_files.txt"
Boolean done = true
}
}
2 changes: 1 addition & 1 deletion scripts/variantstore/wdl/GvsJointVariantCalling.wdl
@@ -6,7 +6,7 @@ import "GvsCreateFilterSet.wdl" as CreateFilterSet
import "GvsPrepareRangesCallset.wdl" as PrepareRangesCallset
import "GvsExtractCallset.wdl" as ExtractCallset
import "GvsUtils.wdl" as Utils
# 3

workflow GvsJointVariantCalling {
input {
Boolean go = true
4 changes: 2 additions & 2 deletions scripts/variantstore/wdl/GvsUtils.wdl
@@ -131,9 +131,9 @@ task GetToolVersions {
# GVS generally uses the smallest `alpine` version of the Google Cloud SDK as it suffices for most tasks, but
# there are a handful of tasks that require the larger GNU libc-based `slim`.
String cloud_sdk_slim_docker = "gcr.io/google.com/cloudsdktool/cloud-sdk:524.0.0-slim"
String variants_docker = "us-central1-docker.pkg.dev/broad-dsde-methods/gvs/variants:2026-01-27-alpine-31607c946ac7"
String variants_docker = "us-central1-docker.pkg.dev/broad-dsde-methods/gvs/variants:2026-01-26-alpine-31607c946ac7"
String variants_nirvana_docker = "us.gcr.io/broad-dsde-methods/variantstore:nirvana_2022_10_19"
String gatk_docker = "us-central1-docker.pkg.dev/broad-dsde-methods/gvs/gatk:2025-12-09-gatkbase-cda718c731d5"
String gatk_docker = "us-central1-docker.pkg.dev/broad-dsde-methods/gvs/gatk:2026-02-05-gatkbase-3c5808440553"
String real_time_genomics_docker = "docker.io/realtimegenomics/rtg-tools:latest"
String gotc_imputation_docker = "us.gcr.io/broad-gotc-prod/imputation-bcf-vcf:1.0.5-1.10.2-0.1.16-1649948623"
String plink_docker = "us-central1-docker.pkg.dev/broad-dsde-methods/gvs/plink2:2024-04-23-slim-a0a65f52cc0e"
@@ -25,8 +25,8 @@ public static String getSampleName(final VCFHeader inputVCFHeader) {

public static Long getSampleId(final String sampleName, final File sampleMap) {
Long sampleId = null;
// Because BigQuery only supports partitioning based on timestamp or integer,
// sample names will be remapped into sample_id integers
// Because BigQuery only supports partitioning based on timestamp or integer,
// sample names will be remapped into sample_id integers.
try {
BufferedReader br = new BufferedReader(new FileReader(sampleMap));

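A minimal sketch of the remap described in the comment above (the mapping values here are hypothetical; getSampleId actually reads them from the sampleMap file):

import java.util.Map;

public class SampleIdRemapExample {
    public static void main(String[] args) {
        // Hypothetical name -> id mapping. BigQuery can partition only on integer
        // or timestamp columns, so string sample names cannot be partition keys;
        // each name is remapped to an integer sample_id instead.
        Map<String, Long> sampleMap = Map.of("SM-ABC12", 100032L, "SM-DEF34", 100033L);
        Long sampleId = sampleMap.get("SM-ABC12");
        System.out.println("SM-ABC12 -> sample_id " + sampleId);  // 100032
    }
}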
RefCreator.java
@@ -95,7 +95,11 @@ public RefCreator(String sampleIdentifierForOutputFileName, Long sampleId, Strin

try {
if (writeReferenceRanges) {
final File refOutputFile = new File(outputDirectory, REF_RANGES_FILETYPE_PREFIX + tableNumber + PREFIX_SEPARATOR + sampleIdentifierForOutputFileName + "." + outputType.toString().toLowerCase());
String[] sampleComponents = {tableNumber, sampleId.toString(), sampleIdentifierForOutputFileName};
String filename = REF_RANGES_FILETYPE_PREFIX + String.join(PREFIX_SEPARATOR, sampleComponents) +
"." + outputType.toString().toLowerCase();

final File refOutputFile = new File(outputDirectory, filename);
switch (outputType) {
case BQ:
if (projectId == null || datasetName == null) {
VetCreator.java
@@ -65,7 +65,10 @@ public VetCreator(String sampleIdentifierForOutputFileName, Long sampleId, Strin
vetWriter.setHeaderLine(getHeaders());
break;
case PARQUET:
final File parquetOutputFile = new File(outputDirectory, VET_FILETYPE_PREFIX + tableNumber + PREFIX_SEPARATOR + sampleIdentifierForOutputFileName + ".parquet");
String[] sampleComponents = {tableNumber, sampleId.toString(), sampleIdentifierForOutputFileName};
String filename = VET_FILETYPE_PREFIX + String.join(PREFIX_SEPARATOR, sampleComponents) +
"." + outputType.toString().toLowerCase();
final File parquetOutputFile = new File(outputDirectory, filename);
vetParquetFileWriter = new GvsVariantParquetFileWriter(new Path(parquetOutputFile.toURI()), parquetSchema, CompressionCodecName.SNAPPY);
break;
}
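
For illustration, a minimal standalone sketch of the naming scheme introduced above (the constants and the sample identifier below are assumptions, not taken from the GATK source):

public class ParquetNameExample {
    // Assumed constants for illustration; the real ones live in VetCreator/RefCreator.
    static final String VET_FILETYPE_PREFIX = "vet_";
    static final String PREFIX_SEPARATOR = "_";

    public static void main(String[] args) {
        String tableNumber = "001";                               // hypothetical
        Long sampleId = 100032L;                                  // hypothetical
        String sampleIdentifier = "input_vcf_7_SM-ABC12.vcf.gz";  // hypothetical

        String[] sampleComponents = {tableNumber, sampleId.toString(), sampleIdentifier};
        String filename = VET_FILETYPE_PREFIX
                + String.join(PREFIX_SEPARATOR, sampleComponents) + ".parquet";
        System.out.println(filename);
        // Prints: vet_001_100032_input_vcf_7_SM-ABC12.vcf.gz.parquet
    }
}

Embedding the numeric sample_id this way makes it recoverable from the filename alone, which appears to be what the parquet ingest tasks in GvsImportGenomes.wdl rely on downstream.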