Skip to content

Commit 0c36dc8

Browse files
committed
fixes for vcf and haploid X issues
- more packaging changes
1 parent 3bad11e commit 0c36dc8

19 files changed

Lines changed: 1065 additions & 263 deletions

File tree

rust/Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

rust/bioscript-cli/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ bioscript-schema = { path = "../bioscript-schema" }
1515
monty = { path = "../../monty/crates/monty" }
1616
serde_json = { version = "1.0.133", features = ["preserve_order"] }
1717
serde_yaml = "0.9.34"
18+
sha2 = "0.10"
1819
zip = { version = "2.2.0", default-features = false, features = ["deflate"] }
1920

2021
[lints.clippy]

rust/bioscript-cli/src/cli_bootstrap.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ fn run_cli() -> Result<(), String> {
5757
Ok(())
5858
}
5959

60-
const USAGE: &str = "usage: bioscript <script.py|manifest.yaml|package.zip|https://.../package.zip> [--root <dir>] [--input-file <path>] [--output-file <path>] [--participant-id <id>] [--trace-report <path>] [--timing-report <path>] [--filter key=value] [--input-format auto|text|zip|vcf|cram] [--input-index <path>] [--reference-file <path>] [--reference-index <path>] [--auto-index] [--cache-dir <path>] [--max-duration-ms N] [--max-memory-bytes N] [--max-allocations N] [--max-recursion-depth N]\n bioscript report <manifest.yaml|package.zip|https://.../package.zip> --input-file <path> [--input-file <path>...] --output-dir <dir> [--html] [--open] [--root <dir>] [--input-format auto|text|zip|vcf|cram] [--detect-sex] [--sample-sex male|female|unknown] [--analysis-max-duration-ms N]\n bioscript review <manifest.yaml|package.zip> --cases <cases.yaml> --output-dir <dir> [--html] [--root <dir>] [--filter key=value]\n bioscript import-package <package.zip|https://.../package.zip> [--root <dir>] [--output-dir <dir>]\n bioscript validate-variants <path> [--report <file>]\n bioscript validate-panels <path> [--report <file>]\n bioscript validate-assays <path> [--report <file>]\n bioscript prepare [--root <dir>] [--input-file <path>] [--reference-file <path>] [--input-format auto|text|zip|vcf|cram] [--cache-dir <path>]\n bioscript inspect <path> [--input-index <path>] [--reference-file <path>] [--reference-index <path>] [--detect-sex]";
60+
const USAGE: &str = "usage: bioscript <script.py|manifest.yaml|package.yaml|package.zip|https://.../package.yaml|https://.../package.zip> [--root <dir>] [--input-file <path>] [--output-file <path>] [--participant-id <id>] [--trace-report <path>] [--timing-report <path>] [--filter key=value] [--input-format auto|text|zip|vcf|cram] [--input-index <path>] [--reference-file <path>] [--reference-index <path>] [--auto-index] [--cache-dir <path>] [--max-duration-ms N] [--max-memory-bytes N] [--max-allocations N] [--max-recursion-depth N]\n bioscript report <manifest.yaml|package.yaml|package.zip|https://.../package.yaml|https://.../package.zip> --input-file <path> [--input-file <path>...] --output-dir <dir> [--html] [--open] [--root <dir>] [--input-format auto|text|zip|vcf|cram] [--detect-sex] [--sample-sex male|female|unknown] [--analysis-max-duration-ms N]\n bioscript review <manifest.yaml|package.yaml|package.zip> --cases <cases.yaml> --output-dir <dir> [--html] [--root <dir>] [--filter key=value]\n bioscript import-package <package.yaml|package.zip|https://.../package.yaml|https://.../package.zip> [--root <dir>] [--output-dir <dir>]\n bioscript validate-variants <path> [--report <file>]\n bioscript validate-panels <path> [--report <file>]\n bioscript validate-assays <path> [--report <file>]\n bioscript prepare [--root <dir>] [--input-file <path>] [--reference-file <path>] [--input-format auto|text|zip|vcf|cram] [--cache-dir <path>]\n bioscript inspect <path> [--input-index <path>] [--reference-file <path>] [--reference-index <path>] [--detect-sex]";
6161

6262
struct CliOptions {
6363
script_path: Option<PathBuf>,

rust/bioscript-cli/src/manifest_runner.rs

Lines changed: 50 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -122,9 +122,10 @@ fn run_panel_manifest_with_store(
122122
participant_id: Option<&str>,
123123
filters: &[String],
124124
) -> Result<Vec<BTreeMap<String, String>>, String> {
125-
let mut rows = Vec::new();
125+
let mut rows_by_member: Vec<Vec<BTreeMap<String, String>>> = vec![Vec::new(); panel.members.len()];
126+
let mut variant_entries = Vec::new();
126127

127-
for member in &panel.members {
128+
for (member_index, member) in panel.members.iter().enumerate() {
128129
let Some(path) = &member.path else {
129130
return Err("remote panel members are not executable yet".to_owned());
130131
};
@@ -134,21 +135,16 @@ fn run_panel_manifest_with_store(
134135
if !matches_filters(&manifest, &resolved, filters) {
135136
continue;
136137
}
137-
rows.push(run_variant_manifest_with_store(
138-
runtime_root,
139-
&manifest,
140-
store,
141-
participant_id,
142-
)?);
138+
variant_entries.push((member_index, resolved, manifest));
143139
} else if member.kind == "assay" {
144140
let assay = load_assay_manifest(&resolved)?;
145-
rows.extend(run_assay_manifest_with_store(
141+
rows_by_member[member_index] = run_assay_manifest_with_store(
146142
runtime_root,
147143
&assay,
148144
store,
149145
participant_id,
150146
filters,
151-
)?);
147+
)?;
152148
} else {
153149
return Err(format!(
154150
"panel member kind '{}' is not executable",
@@ -157,6 +153,29 @@ fn run_panel_manifest_with_store(
157153
}
158154
}
159155

156+
let observations = store
157+
.lookup_variants(
158+
&variant_entries
159+
.iter()
160+
.map(|(_, _, manifest)| manifest.spec.clone())
161+
.collect::<Vec<_>>(),
162+
)
163+
.map_err(|err| err.to_string())?;
164+
165+
for ((member_index, resolved, manifest), observation) in
166+
variant_entries.into_iter().zip(observations)
167+
{
168+
rows_by_member[member_index].push(variant_row(
169+
runtime_root,
170+
&resolved,
171+
&manifest.name,
172+
&manifest.tags,
173+
&observation,
174+
participant_id,
175+
));
176+
}
177+
178+
let rows = rows_by_member.into_iter().flatten().collect();
160179
Ok(rows)
161180
}
162181

@@ -181,7 +200,7 @@ fn run_assay_manifest_with_store(
181200
participant_id: Option<&str>,
182201
filters: &[String],
183202
) -> Result<Vec<BTreeMap<String, String>>, String> {
184-
let mut rows = Vec::new();
203+
let mut entries = Vec::new();
185204

186205
for member in &assay.members {
187206
if member.kind != "variant" {
@@ -198,18 +217,32 @@ fn run_assay_manifest_with_store(
198217
if !matches_filters(&manifest, &resolved, filters) {
199218
continue;
200219
}
201-
let observation = store
202-
.lookup_variant(&manifest.spec)
203-
.map_err(|err| err.to_string())?;
204-
rows.push(variant_row(
220+
entries.push((resolved, manifest));
221+
}
222+
223+
let observations = store
224+
.lookup_variants(
225+
&entries
226+
.iter()
227+
.map(|(_, manifest)| manifest.spec.clone())
228+
.collect::<Vec<_>>(),
229+
)
230+
.map_err(|err| err.to_string())?;
231+
232+
let rows = entries
233+
.into_iter()
234+
.zip(observations)
235+
.map(|((resolved, manifest), observation)| {
236+
variant_row(
205237
runtime_root,
206238
&resolved,
207239
&manifest.name,
208240
&manifest.tags,
209241
&observation,
210242
participant_id,
211-
));
212-
}
243+
)
244+
})
245+
.collect();
213246

214247
Ok(rows)
215248
}

rust/bioscript-cli/src/package.rs

Lines changed: 57 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -6,19 +6,35 @@ const MAX_PACKAGE_FILES: usize = 1000;
66
const MAX_PACKAGE_FILE_BYTES: u64 = 16 * 1024 * 1024;
77
const MAX_PACKAGE_TOTAL_BYTES: u64 = 64 * 1024 * 1024;
88

9+
include!("package_release.rs");
10+
911
fn prepare_package_entrypoint_from_arg(
1012
runtime_root: &Path,
1113
source: &Path,
1214
) -> Result<PathBuf, String> {
1315
let source_text = source.to_string_lossy();
14-
let package_path = if is_package_url(&source_text) {
15-
download_package_url(runtime_root, &source_text)?
16+
let source_url = if is_package_url(&source_text) {
17+
Some(source_text.to_string())
18+
} else {
19+
None
20+
};
21+
let package_path = if let Some(url) = &source_url {
22+
download_package_url(runtime_root, url)?
1623
} else {
1724
source.to_path_buf()
1825
};
1926
if is_package_zip_path(&package_path) {
2027
let imported = import_package_zip(runtime_root, &package_path, None)?;
2128
Ok(imported.entrypoint)
29+
} else if is_package_release_path(&package_path) {
30+
match package_zip_from_release_manifest(runtime_root, &package_path, source_url.as_deref())?
31+
{
32+
Some(zip_path) => {
33+
let imported = import_package_zip(runtime_root, &zip_path, None)?;
34+
Ok(imported.entrypoint)
35+
}
36+
None => Ok(package_path),
37+
}
2238
} else {
2339
Ok(package_path)
2440
}
@@ -47,11 +63,19 @@ fn run_import_package(args: Vec<String>) -> Result<(), String> {
4763
.map_or_else(env::current_dir, Ok)
4864
.map_err(|err| format!("failed to get current directory: {err}"))?;
4965
let source_text = source.to_string_lossy();
50-
let package_path = if is_package_url(&source_text) {
51-
download_package_url(&runtime_root, &source_text)?
66+
let source_url = if is_package_url(&source_text) {
67+
Some(source_text.to_string())
68+
} else {
69+
None
70+
};
71+
let package_path = if let Some(url) = &source_url {
72+
download_package_url(&runtime_root, url)?
5273
} else {
5374
absolutize(&runtime_root, &source)
5475
};
76+
let package_path =
77+
package_zip_from_release_manifest(&runtime_root, &package_path, source_url.as_deref())?
78+
.unwrap_or(package_path);
5579
let imported = import_package_zip(&runtime_root, &package_path, output_dir.as_deref())?;
5680
println!("root\t{}", imported.root.display());
5781
println!("entrypoint\t{}", imported.entrypoint.display());
@@ -170,6 +194,23 @@ fn load_package_descriptor(root: &Path) -> Result<PackageDescriptor, String> {
170194
.ok_or_else(|| {
171195
format!("package descriptor {} is missing schema", path.display())
172196
})?;
197+
if matches!(
198+
schema,
199+
"bioscript:panel:1.0"
200+
| "bioscript:assay:1.0"
201+
| "bioscript:variant:1.0"
202+
| "bioscript:variant"
203+
) {
204+
let package_name = value
205+
.as_mapping()
206+
.and_then(|mapping| mapping.get(serde_yaml::Value::String("name".to_owned())))
207+
.and_then(serde_yaml::Value::as_str)
208+
.map(ToOwned::to_owned);
209+
return Ok(PackageDescriptor {
210+
entrypoint: PathBuf::from(PACKAGE_DESCRIPTOR),
211+
name: package_name,
212+
});
213+
}
173214
if schema != "bioscript:package:1.0" {
174215
return Err(format!(
175216
"package descriptor {} has unsupported schema '{schema}'",
@@ -352,11 +393,13 @@ fn download_package_url(runtime_root: &Path, url: &str) -> Result<PathBuf, Strin
352393
return Err("package URLs must use https://".to_owned());
353394
}
354395
let url_path = url.split('?').next().unwrap_or(url);
355-
if !Path::new(url_path)
396+
let extension = Path::new(url_path)
356397
.extension()
357-
.is_some_and(|ext| ext.eq_ignore_ascii_case("zip"))
358-
{
359-
return Err("package URL must point to a .zip file".to_owned());
398+
.and_then(|ext| ext.to_str())
399+
.unwrap_or_default()
400+
.to_ascii_lowercase();
401+
if !matches!(extension.as_str(), "zip" | "yaml" | "yml") {
402+
return Err("package URL must point to a .zip, .yaml, or .yml file".to_owned());
360403
}
361404
let downloads = runtime_root.join(PACKAGE_DOWNLOAD_DIR);
362405
fs::create_dir_all(&downloads).map_err(|err| {
@@ -408,3 +451,9 @@ fn is_package_zip_path(path: &Path) -> bool {
408451
.and_then(|ext| ext.to_str())
409452
.is_some_and(|ext| ext.eq_ignore_ascii_case("zip"))
410453
}
454+
455+
/// Reports whether `path` has a `.yaml` or `.yml` extension (ASCII case-insensitive).
///
/// Only the extension is inspected; the file is neither opened nor required to exist.
/// Paths without an extension, or whose extension is not valid UTF-8, yield `false`.
fn is_package_release_path(path: &Path) -> bool {
    path.extension()
        .and_then(|ext| ext.to_str())
        // Compare in place, like `is_package_zip_path`, instead of allocating a lowercased copy.
        .is_some_and(|ext| ext.eq_ignore_ascii_case("yaml") || ext.eq_ignore_ascii_case("yml"))
}
Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
fn package_zip_from_release_manifest(
2+
runtime_root: &Path,
3+
path: &Path,
4+
source_url: Option<&str>,
5+
) -> Result<Option<PathBuf>, String> {
6+
if !is_package_release_path(path) || !path.exists() {
7+
return Ok(None);
8+
}
9+
let text = fs::read_to_string(path)
10+
.map_err(|err| format!("failed to read package release {}: {err}", path.display()))?;
11+
let value: serde_yaml::Value = serde_yaml::from_str(&text)
12+
.map_err(|err| format!("failed to parse package release {}: {err}", path.display()))?;
13+
let schema = yaml_string(&value, "schema");
14+
if schema.as_deref() != Some("bioscript:package-release:1.0") {
15+
return Ok(None);
16+
}
17+
let artifact = value
18+
.as_mapping()
19+
.and_then(|mapping| mapping.get(serde_yaml::Value::String("artifact".to_owned())))
20+
.and_then(serde_yaml::Value::as_mapping)
21+
.ok_or_else(|| format!("package release {} is missing artifact", path.display()))?;
22+
let artifact_path = artifact
23+
.get(serde_yaml::Value::String("path".to_owned()))
24+
.and_then(serde_yaml::Value::as_str);
25+
let artifact_url = artifact
26+
.get(serde_yaml::Value::String("url".to_owned()))
27+
.and_then(serde_yaml::Value::as_str);
28+
let zip_path = if let Some(url) = artifact_url {
29+
download_package_url(runtime_root, url)?
30+
} else if let Some(relative) = artifact_path {
31+
if let Some(base_url) = source_url {
32+
download_package_url(runtime_root, &join_url(base_url, relative))?
33+
} else {
34+
path.parent()
35+
.ok_or_else(|| format!("package release has no parent: {}", path.display()))?
36+
.join(checked_relative_package_path(relative)?)
37+
}
38+
} else {
39+
return Err(format!(
40+
"package release {} artifact needs path or url",
41+
path.display()
42+
));
43+
};
44+
if let Some(expected) = artifact
45+
.get(serde_yaml::Value::String("sha256".to_owned()))
46+
.and_then(serde_yaml::Value::as_str)
47+
{
48+
let actual = sha256_file(&zip_path)?;
49+
if actual != expected {
50+
return Err(format!(
51+
"package artifact sha256 mismatch for {}: expected {expected}, got {actual}",
52+
zip_path.display()
53+
));
54+
}
55+
}
56+
Ok(Some(zip_path))
57+
}
58+
59+
/// Looks up `key` in a YAML mapping and returns its value as an owned string.
///
/// Yields `None` when `value` is not a mapping, the key is absent, or the entry is not a
/// YAML string.
fn yaml_string(value: &serde_yaml::Value, key: &str) -> Option<String> {
    let mapping = value.as_mapping()?;
    let entry = mapping.get(serde_yaml::Value::String(key.to_owned()))?;
    entry.as_str().map(str::to_owned)
}
66+
67+
fn sha256_file(path: &Path) -> Result<String, String> {
68+
use sha2::{Digest, Sha256};
69+
70+
let mut file = fs::File::open(path)
71+
.map_err(|err| format!("failed to open artifact {}: {err}", path.display()))?;
72+
let mut digest = Sha256::new();
73+
let mut buffer = vec![0_u8; 1024 * 64];
74+
loop {
75+
let n = std::io::Read::read(&mut file, &mut buffer)
76+
.map_err(|err| format!("failed to read artifact {}: {err}", path.display()))?;
77+
if n == 0 {
78+
break;
79+
}
80+
digest.update(&buffer[..n]);
81+
}
82+
Ok(format!("{:x}", digest.finalize()))
83+
}
84+
85+
/// Resolves `relative` against the directory of `base_url`.
///
/// * If `relative` already starts with `https://` or `http://` it is returned unchanged.
/// * Otherwise the query string and fragment of `base_url` are discarded, its final path
///   segment is replaced by `relative`, and the result is returned.
/// * A base with a scheme but no path (for example `https://host`) gets `relative` appended
///   as its first path segment.
/// * A base with neither a scheme nor a `/` yields `relative` alone.
///
/// No normalisation of `.` / `..` segments is performed.
fn join_url(base_url: &str, relative: &str) -> String {
    if relative.starts_with("https://") || relative.starts_with("http://") {
        return relative.to_owned();
    }
    // Slashes inside a query or fragment are not path separators, so drop both first.
    let base = base_url.split(['?', '#']).next().unwrap_or(base_url);
    // Keep "scheme://" aside so its slashes are never treated as the end of a directory.
    let authority_start = base.find("://").map_or(0, |index| index + "://".len());
    let (scheme, rest) = base.split_at(authority_start);
    match rest.rsplit_once('/') {
        Some((directory, _)) => format!("{scheme}{directory}/{relative}"),
        None if scheme.is_empty() => relative.to_owned(),
        // Host only, no path: the artifact lives at the root of that host.
        None => format!("{base}/{relative}"),
    }
}

rust/bioscript-cli/src/report_options.rs

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -255,12 +255,13 @@ fn generate_app_report(options: &AppReportOptions) -> Result<(), String> {
255255
if let Some(sample_sex) = options.sample_sex {
256256
input_inspection.inferred_sex = Some(explicit_sample_sex_inference(sample_sex));
257257
}
258+
let input_loader = loader_with_inspection(&options.loader, &input_inspection);
258259
let rows = run_manifest_rows_for_report(
259260
&options.root,
260261
&options.manifest_path,
261262
input_file,
262263
&participant_id,
263-
&options.loader,
264+
&input_loader,
264265
&options.filters,
265266
)?;
266267
let input_observations = rows
@@ -280,7 +281,7 @@ fn generate_app_report(options: &AppReportOptions) -> Result<(), String> {
280281
runtime_root: &options.root,
281282
input_file,
282283
participant_id: &participant_id,
283-
loader: &options.loader,
284+
loader: &input_loader,
284285
output_dir: &options.output_dir,
285286
filters: &options.filters,
286287
max_duration_ms: options.analysis_max_duration_ms,
@@ -317,6 +318,20 @@ fn generate_app_report(options: &AppReportOptions) -> Result<(), String> {
317318
Ok(())
318319
}
319320

321+
fn loader_with_inspection(
322+
base: &GenotypeLoadOptions,
323+
inspection: &bioscript_formats::FileInspection,
324+
) -> GenotypeLoadOptions {
325+
let mut loader = base.clone();
326+
loader.assembly = inspection.assembly.or(loader.assembly);
327+
loader.inferred_sex = inspection
328+
.inferred_sex
329+
.as_ref()
330+
.map(|inference| inference.sex)
331+
.or(loader.inferred_sex);
332+
loader
333+
}
334+
320335
fn open_app_html_report_if_requested(options: &AppReportOptions) {
321336
if options.open_report
322337
&& let Err(err) = open_html_report(&options.output_dir.join("index.html"))

0 commit comments

Comments
 (0)