Skip to content

Commit 7acd56a

Browse files
authored
Merge pull request #4536 from vgteam/insignificant-rescue-hits
Drop insignificant rescue hits
2 parents 856e065 + ebb31f1 commit 7acd56a

File tree

2 files changed

+103
-43
lines changed

2 files changed

+103
-43
lines changed

src/minimizer_mapper.cpp

Lines changed: 99 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -3130,6 +3130,12 @@ void MinimizerMapper::attempt_rescue(const Alignment& aligned_read, Alignment& r
31303130

31313131
if (rescue_nodes.size() == 0) {
31323132
//If the rescue subgraph is empty
3133+
if (show_work) {
3134+
#pragma omp critical (cerr)
3135+
{
3136+
cerr << log_name() << "Rescue subgraph is empty" << endl;
3137+
}
3138+
}
31333139
return;
31343140
}
31353141

@@ -3147,13 +3153,25 @@ void MinimizerMapper::attempt_rescue(const Alignment& aligned_read, Alignment& r
31473153
// Find all seeds in the subgraph and try to get a full-length extension.
31483154
GaplessExtender::cluster_type seeds = this->seeds_in_subgraph(minimizers, rescue_nodes);
31493155
if (seeds.size() > this->rescue_seed_limit) {
3156+
if (show_work) {
3157+
#pragma omp critical (cerr)
3158+
{
3159+
cerr << log_name() << "Rescue seed limit exceeded: " << seeds.size() << " seeds" << endl;
3160+
}
3161+
}
31503162
return;
31513163
}
31523164
std::vector<GaplessExtension> extensions = this->extender->extend(seeds, rescued_alignment.sequence(), &cached_graph);
31533165

31543166
// If we have a full-length extension, use it as the rescued alignment.
31553167
if (GaplessExtender::full_length_extensions(extensions)) {
31563168
this->extension_to_alignment(extensions.front(), rescued_alignment);
3169+
if (show_work) {
3170+
#pragma omp critical (cerr)
3171+
{
3172+
cerr << log_name() << "Rescue found full-length gapless extension." << endl;
3173+
}
3174+
}
31573175
return;
31583176
}
31593177

@@ -3181,6 +3199,9 @@ void MinimizerMapper::attempt_rescue(const Alignment& aligned_read, Alignment& r
31813199
gcsa::node_type node = gcsa::Node::encode(id, extension.offset, is_reverse);
31823200
dozeu_seed.back().nodes.push_back(node);
31833201
}
3202+
3203+
// Track the un-doubled rescue subgraph size.
3204+
size_t subgraph_size = 0;
31843205

31853206
// GSSW and dozeu assume that the graph is a DAG.
31863207
std::vector<handle_t> topological_order = gbwtgraph::topological_order(cached_graph, rescue_nodes);
@@ -3190,6 +3211,7 @@ void MinimizerMapper::attempt_rescue(const Alignment& aligned_read, Alignment& r
31903211
for (auto& h : topological_order) {
31913212
rescue_subgraph_bases += cached_graph.get_length(h);
31923213
}
3214+
subgraph_size = rescue_subgraph_bases;
31933215
if (rescue_subgraph_bases * rescued_alignment.sequence().size() > max_dozeu_cells) {
31943216
if (!warned_about_rescue_size.test_and_set()) {
31953217
cerr << "warning[vg::giraffe]: Refusing to perform too-large rescue alignment of "
@@ -3210,54 +3232,56 @@ void MinimizerMapper::attempt_rescue(const Alignment& aligned_read, Alignment& r
32103232
} else {
32113233
get_regular_aligner()->align(rescued_alignment, cached_graph, topological_order);
32123234
}
3213-
return;
3214-
}
3235+
} else {
32153236

3216-
// Build a subgraph overlay.
3217-
SubHandleGraph sub_graph(&cached_graph);
3218-
for (id_t id : rescue_nodes) {
3219-
sub_graph.add_handle(cached_graph.get_handle(id));
3220-
}
3237+
// Build a subgraph overlay, while tracking how many bases it is.
3238+
SubHandleGraph sub_graph(&cached_graph);
3239+
for (id_t id : rescue_nodes) {
3240+
handle_t h = cached_graph.get_handle(id);
3241+
sub_graph.add_handle(h);
3242+
subgraph_size += cached_graph.get_length(h);
3243+
}
32213244

3222-
// Create an overlay where each strand is a separate node.
3223-
StrandSplitGraph split_graph(&sub_graph);
3245+
// Create an overlay where each strand is a separate node.
3246+
StrandSplitGraph split_graph(&sub_graph);
32243247

3225-
// Dagify the subgraph.
3226-
bdsg::HashGraph dagified;
3227-
std::unordered_map<id_t, id_t> dagify_trans =
3228-
handlealgs::dagify(&split_graph, &dagified, rescued_alignment.sequence().size());
3248+
// Dagify the subgraph.
3249+
bdsg::HashGraph dagified;
3250+
std::unordered_map<id_t, id_t> dagify_trans =
3251+
handlealgs::dagify(&split_graph, &dagified, rescued_alignment.sequence().size());
3252+
3253+
size_t rescue_subgraph_bases = dagified.get_total_length();
3254+
if (rescue_subgraph_bases * rescued_alignment.sequence().size() > max_dozeu_cells) {
3255+
if (!warned_about_rescue_size.test_and_set()) {
3256+
cerr << "warning[vg::giraffe]: Refusing to perform too-large rescue alignment of "
3257+
<< rescued_alignment.sequence().size() << " bp against "
3258+
<< rescue_subgraph_bases << " bp dagified subgraph for read " << rescued_alignment.name()
3259+
<< " which would use more than " << max_dozeu_cells
3260+
<< " cells and might exhaust Dozeu's allocator; suppressing further warnings." << endl;
3261+
}
3262+
return;
3263+
}
3264+
3265+
// Align to the subgraph.
3266+
// TODO: Map the seed to the dagified subgraph.
3267+
if (this->rescue_algorithm == rescue_dozeu) {
3268+
size_t gap_limit = this->get_regular_aligner()->longest_detectable_gap(rescued_alignment);
3269+
get_regular_aligner()->align_xdrop(rescued_alignment, dagified, std::vector<MaximalExactMatch>(), false, gap_limit);
3270+
this->fix_dozeu_score(rescued_alignment, dagified, std::vector<handle_t>());
3271+
this->fix_dozeu_end_deletions(rescued_alignment);
3272+
} else if (this->rescue_algorithm == rescue_gssw) {
3273+
get_regular_aligner()->align(rescued_alignment, dagified, true);
3274+
}
32293275

3230-
size_t rescue_subgraph_bases = dagified.get_total_length();
3231-
if (rescue_subgraph_bases * rescued_alignment.sequence().size() > max_dozeu_cells) {
3232-
if (!warned_about_rescue_size.test_and_set()) {
3233-
cerr << "warning[vg::giraffe]: Refusing to perform too-large rescue alignment of "
3234-
<< rescued_alignment.sequence().size() << " bp against "
3235-
<< rescue_subgraph_bases << " bp dagified subgraph for read " << rescued_alignment.name()
3236-
<< " which would use more than " << max_dozeu_cells
3237-
<< " cells and might exhaust Dozeu's allocator; suppressing further warnings." << endl;
3276+
// Map the alignment back to the original graph.
3277+
Path& path = *(rescued_alignment.mutable_path());
3278+
for (size_t i = 0; i < path.mapping_size(); i++) {
3279+
Position& pos = *(path.mutable_mapping(i)->mutable_position());
3280+
id_t id = dagify_trans[pos.node_id()];
3281+
handle_t handle = split_graph.get_underlying_handle(split_graph.get_handle(id));
3282+
pos.set_node_id(sub_graph.get_id(handle));
3283+
pos.set_is_reverse(sub_graph.get_is_reverse(handle));
32383284
}
3239-
return;
3240-
}
3241-
3242-
// Align to the subgraph.
3243-
// TODO: Map the seed to the dagified subgraph.
3244-
if (this->rescue_algorithm == rescue_dozeu) {
3245-
size_t gap_limit = this->get_regular_aligner()->longest_detectable_gap(rescued_alignment);
3246-
get_regular_aligner()->align_xdrop(rescued_alignment, dagified, std::vector<MaximalExactMatch>(), false, gap_limit);
3247-
this->fix_dozeu_score(rescued_alignment, dagified, std::vector<handle_t>());
3248-
this->fix_dozeu_end_deletions(rescued_alignment);
3249-
} else if (this->rescue_algorithm == rescue_gssw) {
3250-
get_regular_aligner()->align(rescued_alignment, dagified, true);
3251-
}
3252-
3253-
// Map the alignment back to the original graph.
3254-
Path& path = *(rescued_alignment.mutable_path());
3255-
for (size_t i = 0; i < path.mapping_size(); i++) {
3256-
Position& pos = *(path.mutable_mapping(i)->mutable_position());
3257-
id_t id = dagify_trans[pos.node_id()];
3258-
handle_t handle = split_graph.get_underlying_handle(split_graph.get_handle(id));
3259-
pos.set_node_id(sub_graph.get_id(handle));
3260-
pos.set_is_reverse(sub_graph.get_is_reverse(handle));
32613285
}
32623286

32633287
if (show_work) {
@@ -3266,6 +3290,38 @@ void MinimizerMapper::attempt_rescue(const Alignment& aligned_read, Alignment& r
32663290
cerr << log_name() << "Rescue result: " << log_alignment(rescued_alignment) << endl;
32673291
}
32683292
}
3293+
3294+
// Work out how many matches of score the alignment is
3295+
int effective_matches = rescued_alignment.score() / this->get_regular_aligner()->match;
3296+
if (effective_matches <= rescued_alignment.sequence().size() && effective_matches <= subgraph_size) {
3297+
// We haven't hit any weird full-length bonuses
3298+
3299+
// Compute the probability of this many matches in a row by chance
3300+
// between two linear sequences of these lengths, under an
3301+
// each-base-equally-likely null model.
3302+
double by_chance_likelihood = 1.0 - pow(1.0 - pow(0.25, effective_matches), (rescued_alignment.sequence().size() - effective_matches + 1)*(subgraph_size - effective_matches + 1));
3303+
if (show_work) {
3304+
#pragma omp critical (cerr)
3305+
{
3306+
cerr << log_name() << "Likelihood of a " << effective_matches << " bp match between random sequences of " << rescued_alignment.sequence().size() << " bp and " << subgraph_size << " bp: " << by_chance_likelihood << endl;
3307+
}
3308+
}
3309+
3310+
if (by_chance_likelihood > rescue_likelihood_limit) {
3311+
// This is too plausible by chance.
3312+
// TODO: Should this go into MAPQ instead?
3313+
3314+
if (show_work) {
3315+
#pragma omp critical (cerr)
3316+
{
3317+
cerr << log_name() << "\tToo likely by chance!" << endl;
3318+
}
3319+
}
3320+
3321+
rescued_alignment.clear_path();
3322+
rescued_alignment.set_score(0);
3323+
}
3324+
}
32693325
}
32703326

32713327
GaplessExtender::cluster_type MinimizerMapper::seeds_in_subgraph(const VectorView<Minimizer>& minimizers,

src/minimizer_mapper.hpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -514,6 +514,10 @@ class MinimizerMapper : public AlignerClient {
514514
/// still break this!
515515
static constexpr size_t default_max_dozeu_cells = (size_t)(1.5 * 1024 * 1024);
516516
size_t max_dozeu_cells = default_max_dozeu_cells;
517+
518+
/// For rescure, how likely can an alignment be by chance to still accept it?
519+
static constexpr double default_rescue_likelihood_limit = 0.05;
520+
double rescue_likelihood_limit = default_rescue_likelihood_limit;
517521

518522
///What is the maximum fragment length that we accept as valid for paired-end reads?
519523
static constexpr size_t default_max_fragment_length = 2000;

0 commit comments

Comments
 (0)