@@ -3130,6 +3130,12 @@ void MinimizerMapper::attempt_rescue(const Alignment& aligned_read, Alignment& r
31303130
31313131 if (rescue_nodes.size () == 0 ) {
31323132 // If the rescue subgraph is empty
3133+ if (show_work) {
3134+ #pragma omp critical (cerr)
3135+ {
3136+ cerr << log_name () << " Rescue subgraph is empty" << endl;
3137+ }
3138+ }
31333139 return ;
31343140 }
31353141
@@ -3147,13 +3153,25 @@ void MinimizerMapper::attempt_rescue(const Alignment& aligned_read, Alignment& r
31473153 // Find all seeds in the subgraph and try to get a full-length extension.
31483154 GaplessExtender::cluster_type seeds = this ->seeds_in_subgraph (minimizers, rescue_nodes);
31493155 if (seeds.size () > this ->rescue_seed_limit ) {
3156+ if (show_work) {
3157+ #pragma omp critical (cerr)
3158+ {
3159+ cerr << log_name () << " Rescue seed limit exceeded: " << seeds.size () << " seeds" << endl;
3160+ }
3161+ }
31503162 return ;
31513163 }
31523164 std::vector<GaplessExtension> extensions = this ->extender ->extend (seeds, rescued_alignment.sequence (), &cached_graph);
31533165
31543166 // If we have a full-length extension, use it as the rescued alignment.
31553167 if (GaplessExtender::full_length_extensions (extensions)) {
31563168 this ->extension_to_alignment (extensions.front (), rescued_alignment);
3169+ if (show_work) {
3170+ #pragma omp critical (cerr)
3171+ {
3172+ cerr << log_name () << " Rescue found full-length gapless extension." << endl;
3173+ }
3174+ }
31573175 return ;
31583176 }
31593177
@@ -3181,6 +3199,9 @@ void MinimizerMapper::attempt_rescue(const Alignment& aligned_read, Alignment& r
31813199 gcsa::node_type node = gcsa::Node::encode (id, extension.offset , is_reverse);
31823200 dozeu_seed.back ().nodes .push_back (node);
31833201 }
3202+
3203+ // Track the un-doubled rescue subgraph size.
3204+ size_t subgraph_size = 0 ;
31843205
31853206 // GSSW and dozeu assume that the graph is a DAG.
31863207 std::vector<handle_t > topological_order = gbwtgraph::topological_order (cached_graph, rescue_nodes);
@@ -3190,6 +3211,7 @@ void MinimizerMapper::attempt_rescue(const Alignment& aligned_read, Alignment& r
31903211 for (auto & h : topological_order) {
31913212 rescue_subgraph_bases += cached_graph.get_length (h);
31923213 }
3214+ subgraph_size = rescue_subgraph_bases;
31933215 if (rescue_subgraph_bases * rescued_alignment.sequence ().size () > max_dozeu_cells) {
31943216 if (!warned_about_rescue_size.test_and_set ()) {
31953217 cerr << " warning[vg::giraffe]: Refusing to perform too-large rescue alignment of "
@@ -3210,54 +3232,56 @@ void MinimizerMapper::attempt_rescue(const Alignment& aligned_read, Alignment& r
32103232 } else {
32113233 get_regular_aligner ()->align (rescued_alignment, cached_graph, topological_order);
32123234 }
3213- return ;
3214- }
3235+ } else {
32153236
3216- // Build a subgraph overlay.
3217- SubHandleGraph sub_graph (&cached_graph);
3218- for (id_t id : rescue_nodes) {
3219- sub_graph.add_handle (cached_graph.get_handle (id));
3220- }
3237+ // Build a subgraph overlay, while tracking how many bases it is.
3238+ SubHandleGraph sub_graph (&cached_graph);
3239+ for (id_t id : rescue_nodes) {
3240+ handle_t h = cached_graph.get_handle (id);
3241+ sub_graph.add_handle (h);
3242+ subgraph_size += cached_graph.get_length (h);
3243+ }
32213244
3222- // Create an overlay where each strand is a separate node.
3223- StrandSplitGraph split_graph (&sub_graph);
3245+ // Create an overlay where each strand is a separate node.
3246+ StrandSplitGraph split_graph (&sub_graph);
32243247
3225- // Dagify the subgraph.
3226- bdsg::HashGraph dagified;
3227- std::unordered_map<id_t , id_t > dagify_trans =
3228- handlealgs::dagify (&split_graph, &dagified, rescued_alignment.sequence ().size ());
3248+ // Dagify the subgraph.
3249+ bdsg::HashGraph dagified;
3250+ std::unordered_map<id_t , id_t > dagify_trans =
3251+ handlealgs::dagify (&split_graph, &dagified, rescued_alignment.sequence ().size ());
3252+
3253+ size_t rescue_subgraph_bases = dagified.get_total_length ();
3254+ if (rescue_subgraph_bases * rescued_alignment.sequence ().size () > max_dozeu_cells) {
3255+ if (!warned_about_rescue_size.test_and_set ()) {
3256+ cerr << " warning[vg::giraffe]: Refusing to perform too-large rescue alignment of "
3257+ << rescued_alignment.sequence ().size () << " bp against "
3258+ << rescue_subgraph_bases << " bp dagified subgraph for read " << rescued_alignment.name ()
3259+ << " which would use more than " << max_dozeu_cells
3260+ << " cells and might exhaust Dozeu's allocator; suppressing further warnings." << endl;
3261+ }
3262+ return ;
3263+ }
3264+
3265+ // Align to the subgraph.
3266+ // TODO: Map the seed to the dagified subgraph.
3267+ if (this ->rescue_algorithm == rescue_dozeu) {
3268+ size_t gap_limit = this ->get_regular_aligner ()->longest_detectable_gap (rescued_alignment);
3269+ get_regular_aligner ()->align_xdrop (rescued_alignment, dagified, std::vector<MaximalExactMatch>(), false , gap_limit);
3270+ this ->fix_dozeu_score (rescued_alignment, dagified, std::vector<handle_t >());
3271+ this ->fix_dozeu_end_deletions (rescued_alignment);
3272+ } else if (this ->rescue_algorithm == rescue_gssw) {
3273+ get_regular_aligner ()->align (rescued_alignment, dagified, true );
3274+ }
32293275
3230- size_t rescue_subgraph_bases = dagified. get_total_length ();
3231- if (rescue_subgraph_bases * rescued_alignment.sequence (). size () > max_dozeu_cells) {
3232- if (!warned_about_rescue_size. test_and_set () ) {
3233- cerr << " warning[vg::giraffe]: Refusing to perform too-large rescue alignment of "
3234- << rescued_alignment. sequence (). size () << " bp against "
3235- << rescue_subgraph_bases << " bp dagified subgraph for read " << rescued_alignment. name ()
3236- << " which would use more than " << max_dozeu_cells
3237- << " cells and might exhaust Dozeu's allocator; suppressing further warnings. " << endl ;
3276+ // Map the alignment back to the original graph.
3277+ Path& path = *( rescued_alignment.mutable_path ());
3278+ for ( size_t i = 0 ; i < path. mapping_size (); i++ ) {
3279+ Position& pos = *(path. mutable_mapping (i)-> mutable_position ());
3280+ id_t id = dagify_trans[pos. node_id ()];
3281+ handle_t handle = split_graph. get_underlying_handle (split_graph. get_handle (id));
3282+ pos. set_node_id (sub_graph. get_id (handle));
3283+ pos. set_is_reverse (sub_graph. get_is_reverse (handle)) ;
32383284 }
3239- return ;
3240- }
3241-
3242- // Align to the subgraph.
3243- // TODO: Map the seed to the dagified subgraph.
3244- if (this ->rescue_algorithm == rescue_dozeu) {
3245- size_t gap_limit = this ->get_regular_aligner ()->longest_detectable_gap (rescued_alignment);
3246- get_regular_aligner ()->align_xdrop (rescued_alignment, dagified, std::vector<MaximalExactMatch>(), false , gap_limit);
3247- this ->fix_dozeu_score (rescued_alignment, dagified, std::vector<handle_t >());
3248- this ->fix_dozeu_end_deletions (rescued_alignment);
3249- } else if (this ->rescue_algorithm == rescue_gssw) {
3250- get_regular_aligner ()->align (rescued_alignment, dagified, true );
3251- }
3252-
3253- // Map the alignment back to the original graph.
3254- Path& path = *(rescued_alignment.mutable_path ());
3255- for (size_t i = 0 ; i < path.mapping_size (); i++) {
3256- Position& pos = *(path.mutable_mapping (i)->mutable_position ());
3257- id_t id = dagify_trans[pos.node_id ()];
3258- handle_t handle = split_graph.get_underlying_handle (split_graph.get_handle (id));
3259- pos.set_node_id (sub_graph.get_id (handle));
3260- pos.set_is_reverse (sub_graph.get_is_reverse (handle));
32613285 }
32623286
32633287 if (show_work) {
@@ -3266,6 +3290,38 @@ void MinimizerMapper::attempt_rescue(const Alignment& aligned_read, Alignment& r
32663290 cerr << log_name () << " Rescue result: " << log_alignment (rescued_alignment) << endl;
32673291 }
32683292 }
3293+
3294+ // Work out how many matches of score the alignment is
3295+ int effective_matches = rescued_alignment.score () / this ->get_regular_aligner ()->match ;
3296+ if (effective_matches <= rescued_alignment.sequence ().size () && effective_matches <= subgraph_size) {
3297+ // We haven't hit any weird full-length bonuses
3298+
3299+ // Compute the probability of this many matches in a row by chance
3300+ // between two linear sequences of these lengths, under an
3301+ // each-base-equally-likely null model.
3302+ double by_chance_likelihood = 1.0 - pow (1.0 - pow (0.25 , effective_matches), (rescued_alignment.sequence ().size () - effective_matches + 1 )*(subgraph_size - effective_matches + 1 ));
3303+ if (show_work) {
3304+ #pragma omp critical (cerr)
3305+ {
3306+ cerr << log_name () << " Likelihood of a " << effective_matches << " bp match between random sequences of " << rescued_alignment.sequence ().size () << " bp and " << subgraph_size << " bp: " << by_chance_likelihood << endl;
3307+ }
3308+ }
3309+
3310+ if (by_chance_likelihood > rescue_likelihood_limit) {
3311+ // This is too plausible by chance.
3312+ // TODO: Should this go into MAPQ instead?
3313+
3314+ if (show_work) {
3315+ #pragma omp critical (cerr)
3316+ {
3317+ cerr << log_name () << " \t Too likely by chance!" << endl;
3318+ }
3319+ }
3320+
3321+ rescued_alignment.clear_path ();
3322+ rescued_alignment.set_score (0 );
3323+ }
3324+ }
32693325}
32703326
32713327GaplessExtender::cluster_type MinimizerMapper::seeds_in_subgraph (const VectorView<Minimizer>& minimizers,
0 commit comments