Skip to content

Commit 2a25e8c

Browse files
authored
Merge pull request #21 from tcezard/split_source_same_as_target
Remove source assembly that is the same as the target
2 parents 4b86b64 + a0e18fe commit 2a25e8c

5 files changed

Lines changed: 75 additions & 14 deletions

File tree

.github/workflows/python-package.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,3 +62,4 @@ jobs:
6262
# Run nextflow tests
6363
export NXF_DEFAULT_DSL=2
6464
tests/nextflow-tests/run_tests.sh
65+
tests/nextflow-tests/run_tests_no_remapping.sh

eva_assembly_ingestion/nextflow/remap_cluster.nf

Lines changed: 15 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -45,13 +45,14 @@ workflow {
4545
species_name = params.species_name.toLowerCase().replace(" ", "_")
4646
remapping_required = params.source_assemblies_and_taxonomies.any { it[0] != params.target_assembly_accession }
4747

48-
if (remapping_required) {
49-
// Get only source assemblies and tax IDs that require remapping
50-
source_asm_and_tax_ids = Channel.fromList(params.source_assemblies_and_taxonomies)
51-
.filter { (source_asm, tax_ids) -> source_asm != params.target_assembly_accession }
48+
// Source assemblies that differ from target and require remapping
49+
// (source assemblies equal to target are removed because variants are already in target assembly)
50+
assemblies_to_remap = Channel.fromList(params.source_assemblies_and_taxonomies)
51+
.filter { it[0] != params.target_assembly_accession }
5252

53+
if (remapping_required) {
5354
// Process source genomes
54-
retrieve_source_genome(source_asm_and_tax_ids, species_name)
55+
retrieve_source_genome(assemblies_to_remap, species_name)
5556
update_source_genome(
5657
retrieve_source_genome.out.fasta_and_report,
5758
params.remapping_config)
@@ -64,14 +65,14 @@ workflow {
6465
params.remapping_config)
6566

6667
// Remap required source assemblies
67-
asm_tax_fasta_report = source_asm_and_tax_ids.combine(update_source_genome.out.updated_fasta_and_report, by: 0)
68+
asm_tax_fasta_report = assemblies_to_remap.combine(update_source_genome.out.updated_fasta_and_report, by: 0)
6869
.transpose()
6970
extract_vcf_from_mongo(asm_tax_fasta_report)
7071
remap_variants(
7172
extract_vcf_from_mongo.out.source_vcfs.transpose(),
7273
update_target_genome.out.updated_target_fasta)
7374
ingest_vcf_into_mongo(
74-
remap_variants.out.remapped_vcfs,
75+
remap_variants.out.remapped_vcfs,
7576
update_target_genome.out.updated_target_report)
7677

7778
gather_counts(ingest_vcf_into_mongo.out.ingestion_log_filename)
@@ -82,16 +83,16 @@ workflow {
8283
cluster_unclustered_variants(qc_process_remapped.out.qc_log_filename)
8384
qc_clustering(cluster_unclustered_variants.out.rs_report_filename)
8485
qc_clustering_duplicate_rs_acc(cluster_unclustered_variants.out.rs_report_filename)
85-
86-
// Backpropagate to source assemblies
86+
87+
// Backpropagate to source assemblies that required remapping
88+
// (source assemblies equal to target are excluded — no backpropagation needed)
8789
backpropagate_clusters(
88-
source_asm_and_tax_ids,
90+
assemblies_to_remap,
8991
qc_clustering.out.qc_log_filename,
9092
qc_clustering_duplicate_rs_acc.out.qc_log_filename)
9193
} else {
92-
// Only perform clustering on target assembly
93-
// We're using params.genome_assembly_dir because cluster_unclustered_variants needs to receive a file object
94-
cluster_unclustered_variants(params.genome_assembly_dir)
94+
// All source assemblies are already on the target assembly; no remapping needed.
95+
cluster_unclustered_variants(Channel.value(1))
9596
qc_clustering(cluster_unclustered_variants.out.rs_report_filename)
9697
qc_clustering_duplicate_rs_acc(cluster_unclustered_variants.out.rs_report_filename)
9798
}
@@ -344,7 +345,7 @@ process cluster_unclustered_variants {
344345
-e $params.output_dir/logs/${log_filename}.err"
345346

346347
input:
347-
path qc_log_filename
348+
val start_flag
348349

349350
output:
350351
path "${params.target_assembly_accession}_new_rs_report.txt", emit: rs_report_filename
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
#!/bin/bash
2+
# Test the case where all source assemblies are the same as the target assembly.
3+
4+
set -Eeuo pipefail
5+
6+
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
7+
SOURCE_DIR="$(dirname $(dirname $SCRIPT_DIR))"
8+
9+
cwd=${PWD}
10+
cd ${SCRIPT_DIR}
11+
12+
mkdir -p ${SCRIPT_DIR}/genomes
13+
PATH=${SCRIPT_DIR}/bin:$PATH
14+
15+
printf "\e[32m===== CLUSTERING ONLY PIPELINE (source == target assembly) =====\e[0m\n"
16+
nextflow run ${SOURCE_DIR}/eva_assembly_ingestion/nextflow/remap_cluster.nf -params-file test_config_no_remapping.yaml \
17+
--target_assembly_accession GCA_0000002 \
18+
--species_name "Thingy thungus" \
19+
--genome_assembly_dir ${SCRIPT_DIR}/genomes \
20+
--extraction_properties ${SCRIPT_DIR}/template.properties \
21+
--ingestion_properties ${SCRIPT_DIR}/template.properties \
22+
--clustering_properties ${SCRIPT_DIR}/template.properties \
23+
--output_dir ${SCRIPT_DIR}/output \
24+
--remapping_config ${SCRIPT_DIR}/test_config_no_remapping.yaml \
25+
--release_version 7 \
26+
-resume
27+
28+
29+
# Verify clustering RS report was produced
30+
ls ${SCRIPT_DIR}/output/logs/GCA_0000002_new_rs_report.txt
31+
32+
# clean up
33+
rm -rf work .nextflow* output genomes
34+
cd ${cwd}

tests/nextflow-tests/test_config.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@ source_assemblies_and_taxonomies:
44
- 1234
55
- - GCA_0000001.2
66
- - 1234
7+
- - GCA_0000002
8+
- - 1234
79

810
executable:
911
genome_downloader: ../../../bin/fake_genome_downloader.py
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
source_assemblies_and_taxonomies:
2+
- - GCA_0000002
3+
- - 1234
4+
5+
executable:
6+
genome_downloader: ../../../bin/fake_genome_downloader.py
7+
custom_assembly: ../../../bin/fake_custom_assembly.py
8+
count_variants_from_logs: ../../../bin/fake_count_variants.py
9+
nextflow: nextflow
10+
bcftools: bcftools
11+
samtools: samtools
12+
bedtools: bedtools
13+
minimap2: minimap2
14+
bgzip: bgzip
15+
tabix: tabix
16+
python_activate: ../../../bin/venv_activate
17+
18+
nextflow:
19+
remapping: ../../../bin/fake_remapping_pipeline.nf
20+
jar:
21+
vcf_extractor: ../../../java/extraction.jar
22+
vcf_ingestion: ../../../java/loading.jar
23+
clustering: ../../../java/clustering.jar

0 commit comments

Comments
 (0)