Skip to content

Commit 50782b1

Browse files
committed
Add CRF training and haplotype inference modules for pangenome graphs
1 parent fa388e0 commit 50782b1

9 files changed

Lines changed: 2139 additions & 10 deletions

File tree

.cargo/config.toml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
[env]
2+
# Location of the torch package inside the repository's Python venv
# (presumably read by the tch/torch-sys build script; confirm against Cargo.lock).
# `relative = true` makes Cargo resolve the value against the directory that
# contains `.cargo/` (the repository root), so the setting works from any
# checkout instead of only from one developer's home directory.
LIBTORCH = { value = "venv/lib/python3.12/site-packages/torch", relative = true }
3+
# Directory holding the torch shared libraries, exported to processes Cargo spawns.
# Resolved relative to the repository root (`relative = true`) rather than a
# machine-specific absolute path.
# NOTE(review): macOS dyld reads DYLD_LIBRARY_PATH, not LD_LIBRARY_PATH; on the
# apple-darwin targets below the rpath in `rustflags` is what takes effect.
LD_LIBRARY_PATH = { value = "venv/lib/python3.12/site-packages/torch/lib", relative = true }
4+
5+
[target.x86_64-apple-darwin]
6+
# Pass an rpath entry for the venv's torch/lib directory to the linker for
# Intel macOS builds.
# NOTE(review): the path is absolute and under one developer's home directory,
# so it will not exist on other machines. Confirm how other checkouts are
# expected to build.
rustflags = ["-C", "link-args=-Wl,-rpath,/Users/kiran/repositories/hidive/venv/lib/python3.12/site-packages/torch/lib"]
7+
8+
[target.aarch64-apple-darwin]
9+
# Pass an rpath entry for the venv's torch/lib directory to the linker for
# Apple Silicon macOS builds (same value as the x86_64 target).
# NOTE(review): the path is absolute and under one developer's home directory,
# so it will not exist on other machines. Confirm how other checkouts are
# expected to build.
rustflags = ["-C", "link-args=-Wl,-rpath,/Users/kiran/repositories/hidive/venv/lib/python3.12/site-packages/torch/lib"]

Cargo.lock

Lines changed: 35 additions & 9 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

scripts/run_dummy_pipeline.sh

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
#!/usr/bin/env bash
2+
set -euo pipefail
3+
4+
# Absolute directory containing this script, independent of the caller's cwd.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
5+
# The repository root is the parent of the script's directory.
REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
6+
# Directory of the `.fa` inputs consumed by every stage below.
DATA_DIR="${REPO_ROOT}/examples/dummy_data"
7+
# Output directory: first positional argument, defaulting to a folder next to
# the dummy inputs.
OUT_DIR="${1:-${DATA_DIR}/output}"
# Create it now and make the path absolute, because the working directory
# changes below and a relative argument would otherwise point somewhere else.
mkdir -p "${OUT_DIR}"
OUT_DIR="$(cd "${OUT_DIR}" && pwd)"
# Cargo locates both the workspace manifest and .cargo/config.toml by walking
# up from the working directory, so run the cargo commands from the repository
# root to make the script usable from any location.
cd "${REPO_ROOT}"
8+
9+
GFA_PATH="${OUT_DIR}/pangenome.gfa"
10+
MODEL_PATH="${OUT_DIR}/crf_model.json"
11+
HAP_PATH="${OUT_DIR}/haplotypes.fa"
12+
13+
mkdir -p "${OUT_DIR}"
14+
15+
# Abort early with a clear message when the external `seqwish` tool is not on
# PATH (presumably invoked by `build-pangenome`; confirm against that subcommand).
if ! command -v seqwish >/dev/null 2>&1; then
16+
echo "Error: seqwish is not available in PATH. Install it before running this script." >&2
17+
exit 1
18+
fi
19+
20+
echo "=== Building dummy pangenome graph ==="
21+
# Stage 1: build the pangenome graph from the three tiers of FASTA input and
# write it to GFA_PATH.
# NOTE(review): --kmer-size is 11 in all three stages; presumably the values
# must agree. Confirm before changing one of them.
cargo run --bin hidive -- build-pangenome \
22+
--tier1-fasta-paths "${DATA_DIR}/tier1.fa" \
23+
--tier2-fasta-paths "${DATA_DIR}/tier2.fa" \
24+
--tier3-fasta-paths "${DATA_DIR}/tier3.fa" \
25+
--output "${GFA_PATH}" \
26+
--kmer-size 11 \
27+
--min-aln-len 30
28+
29+
echo "=== Training CRF on dummy data ==="
30+
# Stage 2: train the CRF from the graph built above, the dummy reads and the
# two truth haplotypes, and write the model to MODEL_PATH.
cargo run --bin hidive -- train-crf \
31+
--graph "${GFA_PATH}" \
32+
--reads "${DATA_DIR}/reads.fa" \
33+
--truth-haplotypes "${DATA_DIR}/truth_hap1.fa" \
34+
--truth-haplotypes "${DATA_DIR}/truth_hap2.fa" \
35+
--output "${MODEL_PATH}" \
36+
--kmer-size 11 \
37+
--iterations 5
38+
39+
echo "=== Inferring haplotypes on dummy data ==="
40+
# Stage 3: infer haplotypes from the graph, the trained model and the same
# reads, and write them to HAP_PATH.
cargo run --bin hidive -- infer-haplotypes \
41+
--graph "${GFA_PATH}" \
42+
--model "${MODEL_PATH}" \
43+
--reads "${DATA_DIR}/reads.fa" \
44+
--output "${HAP_PATH}" \
45+
--kmer-size 11
46+
47+
echo "Dummy pipeline complete."
48+
echo "GFA: ${GFA_PATH}"
49+
echo "Model: ${MODEL_PATH}"
50+
echo "Haplotypes: ${HAP_PATH}"
51+

src/hidive/Cargo.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ resolver = "2"
99
[dependencies]
1010
bio = "=2.0.1"
1111
chrono = "=0.4.38"
12-
clap = { version = "=4.5.1", features = ["derive"] }
12+
clap = { version = "=4.5.41", features = ["derive"] }
1313
flate2 = "=1.0.30"
1414
gbdt = "=0.1.3"
1515
gaoya = "=0.2.0"
@@ -23,6 +23,7 @@ needletail = "=0.5.1"
2323
ndarray = { version = "=0.16.1", features = ["rayon"] }
2424
ndarray-stats = "=0.6.0"
2525
num-format = "=0.4.4"
26+
parfait-gfa = "=0.1.2"
2627
path-absolutize = "=3.1.1"
2728
petgraph = "=0.6.5"
2829
rand = "=0.8.5"

0 commit comments

Comments
 (0)