Lk aou admixture unsupervised (#1759)

ekiernan · web-flow · commit c9c4fa34ed1c · 2026-01-30T11:41:23.000-05:00
Added admixture wdls and docs and cmrg readme
diff --git a/.dockstore.yml b/.dockstore.yml
@@ -30,6 +30,10 @@ workflows:
     subclass: WDL
     primaryDescriptorPath: /all_of_us/rna_seq/CalculatePhenotypeGroups.wdl
   
+  - name: convert_vcf_to_plink_bed
+    subclass: WDL
+    primaryDescriptorPath: /all_of_us/admixture/convert_vcf_to_plink_bed.wdl
+  
   - name: CramToUnmappedBams
     subclass: WDL
     primaryDescriptorPath: /pipelines/wdl/reprocessing/cram_to_unmapped_bams/CramToUnmappedBams.wdl
@@ -143,7 +147,10 @@ workflows:
   - name: run_preprocess_admixture_est_rye
     subclass: WDL
     primaryDescriptorPath: /all_of_us/admixture/run_preprocess_admixture_est_rye.wdl
-
+  
+  - name: run_admixture
+    subclass: WDL
+    primaryDescriptorPath: /all_of_us/admixture/run_admixture.wdl
   - name: run_admixture_est_rye
     subclass: WDL
     primaryDescriptorPath: /all_of_us/admixture/run_admixture_est_rye.wdl
diff --git a/all_of_us/admixture/README.md b/all_of_us/admixture/README.md
@@ -0,0 +1,148 @@
+# Admixture & Ancestry Estimation Workflows (WDL)
+
+This directory contains three WDL workflows used for preparing genotype data and estimating genetic ancestry proportions. These workflows are designed to operate on genotype data derived from a **combined reference panel of 1000 Genomes Project (1KG) and Human Genome Diversity Project (HGDP)** samples.
+
+## Background & Interpretation Notes
+
+The 1KG + HGDP reference panel provides broad global coverage but has **known limitations**, including uneven population representation and limited resolution for certain ancestries. As a result:
+
+* Some ancestry components may be **overestimated or underestimated**, depending on the method and reference composition.
+* Outputs from these workflows should be **interpreted cautiously** and in the context of these reference limitations.
+* Results are best suited for **population-level summaries** rather than precise individual-level ancestry inference.
+
+To address specific biases observed with Rye-based admixture estimation, an alternative ADMIXTURE-based workflow is included and was used to generate a **pruned reference for All of Us (AoU) admixture analyses**.
+
+---
+
+## Workflow 1: `convert_vcf_to_plink_bed`
+
+### Description
+
+Converts a merged VCF file into PLINK binary format (`.bed/.bim/.fam`) for downstream ancestry and admixture analyses.
+
+### Required Inputs
+
+* `prefix` (String): Prefix for output files
+* `merged_vcf_shards` (File): Merged VCF file
+* `merged_vcf_shards_idx` (File): Index file for the merged VCF
+
+### Tasks & Software
+
+* **Task:** `convert_vcf_to_plink_bed`
+* **Software:**
+
+  * PLINK (`/app/bin/plink`)
+  * Docker image: `mussmann/admixpipe:3.0`
+
+PLINK is run with:
+
+* `--vcf` input
+* `--make-bed` to generate binary PLINK files
+* `--double-id` and `--allow-extra-chr` for compatibility with reference data
+
+### Outputs
+
+* `*.bed`: PLINK binary genotype file
+* `*.bim`: Variant information file
+* `*.fam`: Sample metadata file
+
+---
+
+## Workflow 2: `run_admixture_est_rye`
+
+### Description
+
+Generates ancestry (admixture) estimates using the **Rye** tool, based on PCA-derived eigenvalues and eigenvectors. Outputs are analogous to ADMIXTURE `Q` and `fam` files.
+
+This workflow is useful for fast, PCA-informed ancestry estimation but was observed to **overestimate certain populations** when using the AoU reference.
+
+### Required Inputs
+
+* `eigenvalues_file` (File): PCA eigenvalues
+* `eigenvec_file` (File): PCA eigenvectors
+* `pop2group_file` (File): Population-to-group mapping
+* `prefix` (String): Output file prefix
+* `pcs` (Int, default = 20): Number of principal components
+* `rounds` (Int, default = 200): Optimization rounds
+* `iter` (Int, default = 100): Optimization iterations
+* `cpus` (Int, default = 16)
+* `docker_image` (String): Rye Docker image
+
+### Tasks & Software
+
+* **Task:** `run_rye`
+* **Software:**
+
+  * Rye (`rye.R`)
+  * Docker image:
+    `us-central1-docker.pkg.dev/broad-dsde-methods/aou-auxiliary/rye-admixture-estimation-tool:v1.0`
+
+### Outputs
+
+* `*.Q`: Admixture proportion file
+* `*.fam`: Sample metadata file
+
+Output files are renamed for consistency:
+
+```
+<prefix>-<pcs>.Q
+<prefix>-<pcs>.fam
+```
+
+---
+
+## Workflow 3: `run_admixture`
+
+### Description
+
+Runs the **ADMIXTURE** software directly on PLINK binary files to estimate ancestry proportions.
+
+This workflow was used to generate a **pruned AoU admixture reference**, specifically because **Rye-based estimates were found to overestimate certain populations**, while ADMIXTURE tends to **underestimate those same groups**, providing a complementary and more conservative estimate.
+
+### Required Inputs
+
+* `bed` (File): PLINK `.bed` file
+* `bim` (File): PLINK `.bim` file
+* `fam` (File): PLINK `.fam` file
+* `K_in` (Int, optional): Number of ancestry components (default = 6)
+* `num_cpus_in` (Int, optional): Number of CPU threads (default = 4)
+* `mem_gb` (Int, default = 120): Memory allocation in GB
+
+### Tasks & Software
+
+* **Task:** `run_admixture`
+* **Software:**
+
+  * ADMIXTURE (`/app/bin/admixture`)
+  * Docker image: `mussmann/admixpipe:3.0`
+
+ADMIXTURE is executed with multithreading (`-j`) for performance.
+
+### Outputs
+
+* `*.Q`: Ancestry proportion matrix
+* `*.P`: Population allele frequency matrix
+
+Output naming follows ADMIXTURE conventions:
+
+```
+<basename>.<K>.Q
+<basename>.<K>.P
+```
+
+---
+
+## Summary
+
+Together, these workflows support:
+
+1. Conversion of VCF data to PLINK format
+2. PCA-based admixture estimation using Rye
+3. Direct admixture estimation using ADMIXTURE for reference refinement
+
+When interpreting results, users should consider:
+
+* Reference panel composition (1KG + HGDP)
+* Method-specific biases (Rye vs. ADMIXTURE)
+* The intended use case (population-level inference vs. individual ancestry)
+
diff --git a/all_of_us/admixture/convert_vcf_to_plink_bed.changelog.md b/all_of_us/admixture/convert_vcf_to_plink_bed.changelog.md
@@ -0,0 +1,5 @@
+# aou_9.0.0
+2025-11-24 (Date of Last Commit)
+
+* Added convert_vcf_to_plink_bed wdl
+* Added pipeline_version string
diff --git a/all_of_us/admixture/convert_vcf_to_plink_bed.wdl b/all_of_us/admixture/convert_vcf_to_plink_bed.wdl
@@ -0,0 +1,50 @@
+version 1.0
+
+
+workflow convert_vcf_to_plink_bed {  # Converts a merged VCF into PLINK binary files (.bed/.bim/.fam)
+    input {
+        String prefix                 # prefix used for the PLINK output file names
+        File merged_vcf_shards        # merged VCF, forwarded to the task as `vcf`
+        File merged_vcf_shards_idx    # index of the merged VCF, forwarded to the task as `vcf_idx`
+    }
+    String pipeline_version = "aou_9.0.0"  # keep in sync with the newest entry in convert_vcf_to_plink_bed.changelog.md
+
+    call convert_vcf_to_plink_bed {  # single task call; all workflow inputs are passed straight through
+        input:
+            prefix=prefix,
+            vcf=merged_vcf_shards,
+            vcf_idx=merged_vcf_shards_idx
+    }
+
+    output {  # the three PLINK files produced by the task
+        File bed = convert_vcf_to_plink_bed.bed
+        File bim = convert_vcf_to_plink_bed.bim
+        File fam = convert_vcf_to_plink_bed.fam
+    }
+}
+task convert_vcf_to_plink_bed {  # Runs PLINK on a VCF to produce a .bed/.bim/.fam file set
+    input {
+        String prefix    # given to PLINK via --out; also used to build the output file names below
+        File vcf         # VCF given to PLINK via --vcf
+        File vcf_idx     # not referenced by the command below
+    }
+    parameter_meta {  # currently empty: no per-input metadata is declared
+    }
+    command <<<
+        set -e
+        /app/bin/plink --double-id --vcf ~{vcf} --make-bed --allow-extra-chr --out ~{prefix}
+    >>>
+
+    output {  # files named <prefix>.bed/.bim/.fam, expected in the task working directory
+        File bed = "~{prefix}.bed"
+        File bim = "~{prefix}.bim"
+        File fam = "~{prefix}.fam"
+    }
+
+    runtime {  # fixed resources; none of these are exposed as inputs
+        docker: "mussmann/admixpipe:3.0"  # the command expects this image to provide /app/bin/plink
+        memory: "31 GB"
+        cpu: "4"
+        disks: "local-disk 500 HDD"
+    }
+}
diff --git a/all_of_us/admixture/run_admixture.changelog.md b/all_of_us/admixture/run_admixture.changelog.md
@@ -0,0 +1,10 @@
+# aou_9.0.1
+2026-01-29 (Date of Last Commit)
+
+* Parameterized memory for local ancestry reference
+
+# aou_9.0.0
+2025-11-24 (Date of Last Commit)
+
+* Added run admixture wdl for aou 9.0.0 processing to allow for unsupervised clustering
+* Added pipeline_version string as a workflow-level declaration
diff --git a/all_of_us/admixture/run_admixture.wdl b/all_of_us/admixture/run_admixture.wdl
@@ -0,0 +1,57 @@
+version 1.0
+
+# For more information on the files and the contents, see: http://dalexander.github.io/admixture/admixture-manual.pdf
+workflow run_admixture {  # Runs ADMIXTURE on a PLINK .bed/.bim/.fam file set
+    input {
+        File bed    # PLINK .bed file
+        File bim    # PLINK .bim file
+        File fam    # PLINK .fam file
+    }
+
+    String pipeline_version = "aou_9.0.1"  # keep in sync with the newest entry in run_admixture.changelog.md
+
+    call run_admixture {  # only bed/bim/fam are wired; K_in, num_cpus_in and mem_gb are not set by this call
+        input:
+            bed=bed,
+            bim=bim,
+            fam=fam
+    }
+
+    output {  # the Q and P files produced by the task
+        File admixture_Q = run_admixture.Q
+        File admixture_P = run_admixture.P
+    }
+}
+task run_admixture {  # Runs /app/bin/admixture on a .bed file with K components and num_cpus threads
+    input {
+        File bed             # PLINK .bed file; the only file passed on the command line
+        File bim             # not referenced in the command; NOTE(review): presumably must sit next to the .bed - confirm localization
+        File fam             # not referenced in the command; NOTE(review): presumably must sit next to the .bed - confirm localization
+        Int? K_in            # optional number of components; falls back to 6
+        Int? num_cpus_in     # optional thread/CPU count; falls back to 4
+        Int mem_gb = 120     # memory request in GB (used in runtime below)
+    }
+    Int K = select_first([K_in, 6])                  # resolved K value
+    Int num_cpus = select_first([num_cpus_in, 4])    # resolved CPU count, used for -j and the cpu runtime attribute
+    String basename = basename(bed, ".bed")          # .bed file name without directory or ".bed" suffix
+
+    command <<<
+        set -e
+        /app/bin/admixture ~{bed} ~{K} -j~{num_cpus}
+
+        ls -la
+
+    >>>
+
+    output {  # files named <basename>.<K>.Q / .P, expected in the task working directory
+        File Q = "~{basename}.~{K}.Q"
+        File P = "~{basename}.~{K}.P"
+    }
+
+    runtime {
+        docker: "mussmann/admixpipe:3.0"  # the command expects this image to provide /app/bin/admixture
+        memory: mem_gb + " GB" # Was 31 GB originally, increased for local ancestry
+        cpu: "~{num_cpus}"
+        disks: "local-disk 500 HDD"
+    }
+}
diff --git a/all_of_us/cmrg/FixItFelixAndVariantCall.changelog.md b/all_of_us/cmrg/FixItFelixAndVariantCall.changelog.md
@@ -1,3 +1,7 @@
+# aou_9.0.1
+2026-01-29 (Date of Last Commit)
+* Added `set -euo pipefail` to the subset_cram, FixItFelix, and call_variants tasks
+
 # aou_9.0.0
 2025-08-11 (Date of Last Commit)
 
diff --git a/all_of_us/cmrg/FixItFelixAndVariantCall.wdl b/all_of_us/cmrg/FixItFelixAndVariantCall.wdl
@@ -26,7 +26,7 @@ workflow FixItFelixAndVariantCall {
     }
 
     Int original_cram_size = ceil(size(cram_file, "GB"))
-    String pipeline_version = "aou_9.0.0"
+    String pipeline_version = "aou_9.0.1"
 
     call subset_cram {
         input:
@@ -106,6 +106,7 @@ task subset_cram {
     String output_index = sample_name + ".bai"
 
     command {
+        set -euo pipefail
         ~{gatk_path} --java-options "-Xmx2G" \
         PrintReads \
         -R ~{ref_fasta} \
@@ -148,6 +149,7 @@ task FixItFelix {
     }
 
     command <<<
+        set -euo pipefail
         bam=~{reads}
         bed=~{intervals}
         ref=~{masked_ref_fasta}
@@ -250,6 +252,7 @@ task call_variants {
     String output_filename = output_name + (if generate_gvcf then ".g.vcf.gz" else ".vcf.gz")
 
     command <<<
+        set -euo pipefail
         ~{gatk_path} --java-options "-Xmx3G" \
         HaplotypeCaller \
         -R ~{ref_fasta} \
diff --git a/all_of_us/cmrg/README.md b/all_of_us/cmrg/README.md