Skip to content

Commit 2868444

Browse files
authored
Merge pull request #113 from eldariont/FEATURE/improve_junction_detection
[FEATURE] Improve junction detection (insertion calling 1/2)
2 parents 60e3d9f + 4328592 commit 2868444

17 files changed

Lines changed: 476 additions & 198 deletions

File tree

include/iGenVar.hpp

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,10 @@ struct cmd_arguments
1212
std::vector<detection_methods> methods{cigar_string, split_read, read_pairs, read_depth}; // default: all methods
1313
clustering_methods clustering_method{hierarchical_clustering}; // default: hierarchical clustering method
1414
refinement_methods refinement_method{no_refinement}; // default: no refinement
15-
uint64_t min_var_length = 30;
16-
uint64_t max_var_length = 1000000;
17-
uint64_t max_tol_inserted_length = 5;
15+
int32_t min_var_length = 30;
16+
int32_t max_var_length = 1000000;
17+
int32_t max_tol_inserted_length = 5;
18+
int32_t max_overlap = 10;
1819
};
1920

2021
void initialize_argument_parser(seqan3::argument_parser & parser, cmd_arguments & args);
@@ -37,9 +38,10 @@ void initialize_argument_parser(seqan3::argument_parser & parser, cmd_arguments
3738
* (0: no_refinement,
3839
* 1: sViper_refinement_method,
3940
* 2: sVirl_refinement_method) - *default: no refinement*\n
40-
* **args.min_var_length** - minimum length of variants to detect - *default: 30 bp*\n
41-
* **args.max_var_length** - maximum length of variants to detect - *default: 1,000,000 bp*\n
42-
* **args.max_tol_inserted_length** - longest tolerated inserted sequence at non-INS SV types - *default: 5 bp*
41+
* **args.min_var_length** - minimum length of variants to detect (expected to be non-negative) - *default: 30 bp*\n
42+
* **args.max_var_length** - maximum length of variants to detect (expected to be non-negative) - *default: 1,000,000 bp*\n
43+
* **args.max_tol_inserted_length** - longest tolerated inserted sequence at non-INS SV types (expected to be non-negative) - *default: 5 bp*\n
44+
* **args.max_overlap** - maximum overlap between alignment segments (expected to be non-negative) - *default: 10 bp*
4345
*
4446
*
4547
* \details Detects novel junctions from read alignment records using different detection methods.

include/modules/sv_detection_methods/analyze_cigar_method.hpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
* \param[in] cigar_string - CIGAR field of the SAM/BAM file
1111
* \param[in] query_sequence - SEQ field of the SAM/BAM file
1212
* \param[in, out] junctions - vector for storing junctions
13-
* \param[in] min_length - minimum length of variants to detect (default 30 bp)
13+
* \param[in] min_length - minimum length of variants to detect (default 30 bp, expected to be non-negative)
1414
*
1515
* \details This function steps through the CIGAR string and stores junctions with their position in reference and read.
1616
* We distinguish 4 cases of CIGAR operation characters:
@@ -36,4 +36,4 @@ void analyze_cigar(std::string const & read_name,
3636
std::vector<seqan3::cigar> & cigar_string,
3737
seqan3::dna5_vector const & query_sequence,
3838
std::vector<Junction> & junctions,
39-
uint64_t const min_length);
39+
int32_t const min_length);

include/modules/sv_detection_methods/analyze_sa_tag_method.hpp

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
11
#pragma once
22

3-
#include "structures/aligned_segment.hpp" // for struct AlignedSegment
3+
#include "iGenVar.hpp" // for struct cmd_arguments
4+
#include "structures/aligned_segment.hpp" // for struct AlignedSegment
5+
#include "structures/junction.hpp" // for class Junction
6+
#include "variant_detection/bam_functions.hpp" // for seqan3::sam_flag and hasFlag* functions
47

58
/*! \brief Splits a string by a given delimiter and stores substrings in a given container.
69
*
@@ -39,11 +42,15 @@ void retrieve_aligned_segments(std::string const & sa_string, std::vector<Aligne
3942
* \param[in, out] junctions - vector for storing junctions
4043
* \param[in, out] query_sequence - SEQ field of the SAM/BAM file
4144
* \param[in] read_name - QNAME field of the SAM/BAM file
45+
* \param[in] min_length - minimum length of variants to detect (expected to be non-negative)
46+
* \param[in] max_overlap - maximum overlap between alignment segments (expected to be non-negative)
4247
*/
4348
void analyze_aligned_segments(std::vector<AlignedSegment> const & aligned_segments,
4449
std::vector<Junction> & junctions,
4550
seqan3::dna5_vector const & query_sequence,
46-
std::string const & read_name);
51+
std::string const & read_name,
52+
int32_t const min_length,
53+
int32_t const max_overlap);
4754

4855
/*! \brief Parse the SA tag from the SAM/BAM alignment of a chimeric/split-aligned read. Build
4956
* [aligned_segments](\ref AlignedSegment), one for each alignment segment of the read.
@@ -57,6 +64,9 @@ void analyze_aligned_segments(std::vector<AlignedSegment> const & aligned_segmen
5764
* \param[in] cigar - CIGAR field of the SAM/BAM file
5865
* \param[in] seq - SEQ field of the SAM/BAM file
5966
* \param[in] sa_tag - SA tag, one tag from the read of the SAM/BAM file
67+
* \param[in] args - command line arguments:\n
68+
* **args.min_var_length** - minimum length of variants to detect (expected to be non-negative)\n
69+
* **args.max_overlap** - maximum overlap between alignment segments (expected to be non-negative)
6070
* \param[in, out] junctions - vector for storing junctions
6171
*/
6272
void analyze_sa_tag(std::string const & query_name,
@@ -67,4 +77,5 @@ void analyze_sa_tag(std::string const & query_name,
6777
std::vector<seqan3::cigar> const & cigar,
6878
seqan3::dna5_vector const & seq,
6979
std::string const & sa_tag,
80+
cmd_arguments const & args,
7081
std::vector<Junction> & junctions);

include/structures/aligned_segment.hpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,10 @@ struct AlignedSegment
2121
int32_t mapq;
2222
std::vector<seqan3::cigar> cig;
2323

24+
//! \brief Returns the reference position of the first aligned based.
2425
int32_t get_reference_start() const;
2526

27+
//! \brief Returns the reference position of the last aligned based.
2628
int32_t get_reference_end() const;
2729

2830
int32_t get_left_soft_clip() const;

include/variant_detection/variant_detection.hpp

Lines changed: 15 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -3,47 +3,43 @@
33
#include <seqan3/std/filesystem> // for filesystem
44
#include <vector>
55

6-
#include "method_enums.hpp" // for enum detection_methods, clustering_methods and refinement_methods
6+
#include "iGenVar.hpp" // for struct cmd_arguments
77
#include "structures/junction.hpp" // for class Junction
88

99
/*! \brief Detects junctions between distant genomic positions by analyzing a short read alignment file (sam/bam). The
1010
* detected junctions are stored in a vector.
1111
*
1212
* \param[in, out] junctions - a vector of junctions
13-
* \param[in] alignment_short_reads_file_path - short reads input file, path to the sam/bam file
14-
* \param[in] methods - list of methods for detecting junctions (0: cigar_string,
15-
* 1: split_read,
16-
* 2: read_pairs,
17-
* 3: read_depth)
18-
* \param[in] min_var_length - minimum length of variants to detect (default 30 bp)
13+
* \param[in] args - command line arguments:\n
14+
* **args.alignment_short_reads_file_path** - short reads input file, path to the sam/bam file\n
15+
* **args.methods** - list of methods for detecting junctions
16+
* (0: cigar_string, 1: split_read, 2: read_pairs, 3: read_depth) - *default: all methods*\n
17+
*
1918
*
2019
* \details Detects junctions from the CIGAR strings and supplementary alignment tags of read alignment records.
2120
* We filter unmapped alignments, secondary alignments, duplicates and alignments with low mapping quality.
2221
* Then, the CIGAR string of all remaining alignments is analyzed.
2322
* For primary alignments, also the split read information is analyzed.
2423
*/
2524
void detect_junctions_in_short_reads_sam_file(std::vector<Junction> & junctions,
26-
std::filesystem::path const & alignment_short_reads_file_path,
27-
std::vector<detection_methods> const & methods,
28-
uint64_t const min_var_length);
25+
cmd_arguments const & args);
2926

3027
/*! \brief Detects junctions between distant genomic positions by analyzing a long read alignment file (sam/bam). The
3128
* detected junctions are stored in a vector.
3229
*
3330
* \param[in, out] junctions - a vector of junctions
34-
* \param[in] alignment_long_reads_file_path - long reads input file, path to the sam/bam file
35-
* \param[in] methods - list of methods for detecting junctions (0: cigar_string,
36-
* 1: split_read,
37-
* 2: read_pairs,
38-
* 3: read_depth)
39-
* \param[in] min_var_length - minimum length of variants to detect (default 30 bp)
31+
* \param[in] args - command line arguments:\n
32+
* **args.alignment_long_reads_file_path** - long reads input file, path to the sam/bam file\n
33+
* **args.methods** - list of methods for detecting junctions
34+
* (0: cigar_string, 1: split_read, 2: read_pairs, 3: read_depth) - *default: all methods*\n
35+
* **args.min_var_length** - minimum length of variants to detect (expected to be non-negative) - *default: 30 bp*\n
36+
* **args.max_overlap** - maximum overlap between alignment segments (expected to be non-negative) - *default: 10 bp*
37+
*
4038
*
4139
* \details Detects junctions from the CIGAR strings and supplementary alignment tags of read alignment records.
4240
* We filter unmapped alignments, secondary alignments, duplicates and alignments with low mapping quality.
4341
* Then, the CIGAR string of all remaining alignments is analyzed.
4442
* For primary alignments, also the split read information is analyzed.
4543
*/
4644
void detect_junctions_in_long_reads_sam_file(std::vector<Junction> & junctions,
47-
std::filesystem::path const & alignment_long_reads_file_path,
48-
std::vector<detection_methods> const & methods,
49-
uint64_t const min_var_length);
45+
cmd_arguments const & args);

include/variant_detection/variant_output.hpp

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,9 @@
1212
*
1313
* \param[in] clusters - input junction clusters
1414
* \param[in] args - command line arguments:\n
15-
* **args.min_var_length** - minimum length of variants to detect - *default: 30 bp*\n
16-
* **args.max_var_length** - maximum length of variants to detect - *default: 1,000,000 bp*\n
17-
* **args.max_tol_inserted_length** - longest tolerated inserted sequence at non-INS SV types - *default: 5 bp*
15+
* **args.min_var_length** - minimum length of variants to detect (expected to be non-negative) - *default: 30 bp* \n
16+
* **args.max_var_length** - maximum length of variants to detect (expected to be non-negative) - *default: 1,000,000 bp*\n
17+
* **args.max_tol_inserted_length** - longest tolerated inserted sequence at non-INS SV types (expected to be non-negative) - *default: 5 bp*
1818
* \param[in, out] out_stream - output stream
1919
*
2020
* \details Extracts genomic variants from given junction clusters.
@@ -29,9 +29,9 @@ void find_and_output_variants(std::vector<Cluster> const & clusters,
2929
*
3030
* \param[in] clusters - input junction clusters
3131
* \param[in] args - command line arguments:\n
32-
* **args.min_var_length** - minimum length of variants to detect - *default: 30 bp*\n
33-
* **args.max_var_length** - maximum length of variants to detect - *default: 1,000,000 bp*\n
34-
* **args.max_tol_inserted_length** - longest tolerated inserted sequence at non-INS SV types - *default: 5 bp*
32+
* **args.min_var_length** - minimum length of variants to detect (expected to be non-negative) - *default: 30 bp*\n
33+
* **args.max_var_length** - maximum length of variants to detect (expected to be non-negative) - *default: 1,000,000 bp*\n
34+
* **args.max_tol_inserted_length** - longest tolerated inserted sequence at non-INS SV types (expected to be non-negative) - *default: 5 bp*
3535
* \param[in] output_file_path - output file path
3636
*
3737
* \details Extracts genomic variants from given junction clusters.

src/iGenVar.cpp

Lines changed: 34 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -56,14 +56,21 @@ void initialize_argument_parser(seqan3::argument_parser & parser, cmd_arguments
5656

5757
// Options - SV specifications:
5858
parser.add_option(args.min_var_length, 'l', "min_var_length",
59-
"Specify what should be the minimum length of your SVs to be detected.",
59+
"Specify what should be the minimum length of your SVs to be detected. "
60+
"This value needs to be non-negative.",
6061
seqan3::option_spec::advanced);
6162
parser.add_option(args.max_var_length, 'x', "max_var_length",
6263
"Specify what should be the maximum length of your SVs to be detected. "
63-
"SVs larger than this threshold can still be output as translocations.",
64+
"SVs larger than this threshold can still be output as translocations. "
65+
"This value needs to be non-negative.",
6466
seqan3::option_spec::advanced);
6567
parser.add_option(args.max_tol_inserted_length, 't', "max_tol_inserted_length",
66-
"Specify what should be the longest tolerated inserted sequence at sites of non-INS SVs.",
68+
"Specify what should be the longest tolerated inserted sequence at sites of non-INS SVs. "
69+
"This value needs to be non-negative.",
70+
seqan3::option_spec::advanced);
71+
parser.add_option(args.max_overlap, 'p', "max_overlap",
72+
"Specify the maximum allowed overlap between two alignment segments. "
73+
"This value needs to be non-negative.",
6774
seqan3::option_spec::advanced);
6875
}
6976

@@ -77,19 +84,15 @@ void detect_variants_in_alignment_file(cmd_arguments const & args)
7784
{
7885
seqan3::debug_stream << "Detect junctions in short reads...\n";
7986
detect_junctions_in_short_reads_sam_file(junctions,
80-
args.alignment_short_reads_file_path,
81-
args.methods,
82-
args.min_var_length);
87+
args);
8388
}
8489

8590
// long reads
8691
if (args.alignment_long_reads_file_path != "")
8792
{
8893
seqan3::debug_stream << "Detect junctions in long reads...\n";
8994
detect_junctions_in_long_reads_sam_file(junctions,
90-
args.alignment_long_reads_file_path,
91-
args.methods,
92-
args.min_var_length);
95+
args);
9396
}
9497

9598
std::sort(junctions.begin(), junctions.end());
@@ -168,6 +171,28 @@ int main(int argc, char ** argv)
168171
return -1;
169172
}
170173

174+
// Check that the given parameters are non-negative.
175+
if (args.min_var_length < 0)
176+
{
177+
seqan3::debug_stream << "[Error] You gave a negative min_var_length parameter.\n";
178+
return -1;
179+
}
180+
if (args.max_var_length < 0)
181+
{
182+
seqan3::debug_stream << "[Error] You gave a negative max_var_length parameter.\n";
183+
return -1;
184+
}
185+
if (args.max_tol_inserted_length < 0)
186+
{
187+
seqan3::debug_stream << "[Error] You gave a negative max_tol_inserted_length parameter.\n";
188+
return -1;
189+
}
190+
if (args.max_overlap < 0)
191+
{
192+
seqan3::debug_stream << "[Error] You gave a negative max_overlap parameter.\n";
193+
return -1;
194+
}
195+
171196
detect_variants_in_alignment_file(args);
172197

173198
return 0;

src/modules/sv_detection_methods/analyze_cigar_method.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ void analyze_cigar(std::string const & read_name,
1515
std::vector<seqan3::cigar> & cigar_string,
1616
seqan3::dna5_vector const & query_sequence,
1717
std::vector<Junction> & junctions,
18-
uint64_t const min_length)
18+
int32_t const min_length)
1919
{
2020
// Step through CIGAR string and store current position in reference and read
2121
int32_t pos_ref = query_start_pos;

0 commit comments

Comments
 (0)