Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 24 additions & 4 deletions src/graph_caller.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -448,7 +448,8 @@ bool VCFOutputCaller::emit_variant(const PathPositionHandleGraph& graph, SnarlCa
const Snarl& snarl, const vector<SnarlTraversal>& called_traversals,
const vector<int>& genotype, int ref_trav_idx, const unique_ptr<SnarlCaller::CallInfo>& call_info,
const string& ref_path_name, int ref_offset, bool genotype_snarls, int ploidy,
function<string(const vector<SnarlTraversal>&, const vector<int>&, int, int, int)> trav_to_string) {
function<string(const vector<SnarlTraversal>&, const vector<int>&, int, int, int)> trav_to_string,
const vector<pair<string, string>>& info_tags) {

#ifdef debug
cerr << "emitting variant for " << pb2json(snarl) << endl;
Expand Down Expand Up @@ -581,6 +582,11 @@ bool VCFOutputCaller::emit_variant(const PathPositionHandleGraph& graph, SnarlCa
// add some support info
snarl_caller.update_vcf_info(snarl, site_traversals, site_genotype, call_info, sample_name, out_variant);

// add the info
for (const pair<string, string>& info_tag : info_tags) {
out_variant.info[info_tag.first].push_back(info_tag.second);
}

// if genotype_snarls, then we only flatten up to the snarl endpoints
// (this is when we are in genotyping mode and want consistent calls regardless of the sample)
int64_t flatten_len_s = 0;
Expand Down Expand Up @@ -1827,13 +1833,24 @@ bool FlowCaller::call_snarl(const Snarl& managed_snarl) {

vector<SnarlTraversal> travs;
FlowTraversalFinder* flow_trav_finder = dynamic_cast<FlowTraversalFinder*>(&traversal_finder);
vector<pair<string, string>> extra_info_tags;
int64_t site_allele_count = -1;
if (flow_trav_finder != nullptr) {
// find the max flow traversals using specialized interface that accepts avg heurstic toggle
pair<vector<SnarlTraversal>, vector<double>> weighted_travs = flow_trav_finder->find_weighted_traversals(snarl, greedy_avg_flow);
travs = std::move(weighted_travs.first);
} else {
// find the traversals using the generic interface
travs = traversal_finder.find_traversals(snarl);
// find the traversals with the gbwt. also count the haplotypes in the site and remember it
// for a VCF tag (GAN)
GBWTTraversalFinder* gbwt_trav_finder = dynamic_cast<GBWTTraversalFinder*>(&traversal_finder);
if (gbwt_trav_finder != nullptr) {
pair<vector<SnarlTraversal>, vector<gbwt::size_type>> gbwt_path_travs = gbwt_trav_finder->find_path_traversals(snarl);
travs = std::move(gbwt_path_travs.first);
extra_info_tags.push_back(make_pair("GAN", std::to_string(gbwt_trav_finder->count_haplotypes(gbwt_path_travs.second))));
} else {
// find the traversals using the generic interface
travs = traversal_finder.find_traversals(snarl);
}
}

if (travs.empty()) {
Expand Down Expand Up @@ -1893,7 +1910,7 @@ bool FlowCaller::call_snarl(const Snarl& managed_snarl) {
bool added = true;
if (!gaf_output) {
added = emit_variant(graph, snarl_caller, snarl, travs, trav_genotype, ref_trav_idx, trav_call_info, ref_path_name,
ref_offsets[ref_path_name], genotype_snarls, ploidy);
ref_offsets[ref_path_name], genotype_snarls, ploidy, nullptr, extra_info_tags);
} else {
pair<string, int64_t> pos_info = get_ref_position(graph, snarl, ref_path_name, ref_offsets[ref_path_name]);
emit_gaf_variant(graph, print_snarl(snarl), travs, trav_genotype, ref_trav_idx, pos_info.first, pos_info.second, &support_finder);
Expand All @@ -1908,6 +1925,9 @@ bool FlowCaller::call_snarl(const Snarl& managed_snarl) {
string FlowCaller::vcf_header(const PathHandleGraph& graph, const vector<string>& contigs,
const vector<size_t>& contig_length_overrides) const {
string header = VCFOutputCaller::vcf_header(graph, contigs, contig_length_overrides);
if (dynamic_cast<GBWTTraversalFinder*>(&this->traversal_finder) != nullptr) {
header+= "##INFO=<ID=GAN,Number=1,Type=Integer,Description=\"Number of haplotypes going through site in the graph\">\n";
}
header += "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n";
snarl_caller.update_vcf_header(header);
header += "##FILTER=<ID=PASS,Description=\"All filters passed\">\n";
Expand Down
3 changes: 2 additions & 1 deletion src/graph_caller.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,8 @@ class VCFOutputCaller {
const Snarl& snarl, const vector<SnarlTraversal>& called_traversals,
const vector<int>& genotype, int ref_trav_idx, const unique_ptr<SnarlCaller::CallInfo>& call_info,
const string& ref_path_name, int ref_offset, bool genotype_snarls, int ploidy,
function<string(const vector<SnarlTraversal>&, const vector<int>&, int, int, int)> trav_to_string = nullptr);
function<string(const vector<SnarlTraversal>&, const vector<int>&, int, int, int)> trav_to_string = nullptr,
const vector<pair<string, string>>& info_tags = {});

/// get the interval of a snarl from our reference path using the PathPositionHandleGraph interface
/// the bool is true if the snarl's backward on the path
Expand Down
22 changes: 21 additions & 1 deletion src/traversal_finder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
#include "cactus.hpp"
#include "gbwt_helper.hpp"
#include "haplotype_extracter.hpp"
#include <gbwtgraph/gbwtgraph.h>
//#define debug

namespace vg {
Expand Down Expand Up @@ -3435,7 +3436,7 @@ pair<vector<SnarlTraversal>, vector<double>> FlowTraversalFinder::find_weighted_
GBWTTraversalFinder::GBWTTraversalFinder(const HandleGraph& graph, const gbwt::GBWT& gbwt) :
graph(graph),
gbwt(gbwt) {

this->gbwt_reference_samples = gbwtgraph::parse_reference_samples_tag(gbwt);
}

GBWTTraversalFinder::~GBWTTraversalFinder() {
Expand Down Expand Up @@ -3599,7 +3600,26 @@ pair<vector<Traversal>, vector<gbwt::size_type>> GBWTTraversalFinder::find_path_
return path_traversals;
}

int64_t GBWTTraversalFinder::count_haplotypes(const vector<gbwt::size_type>& gbwt_paths) const {
unordered_set<pair<string, int64_t>> haplotypes;
for (const gbwt::size_type& gbwt_path : gbwt_paths) {
gbwt::size_type path_id = gbwt::Path::id(gbwt_path);
// metadata check copied from vg deconsturct
if (!gbwt.hasMetadata() || !gbwt.metadata.hasPathNames() || path_id >= gbwt.metadata.paths()) {
continue;
}
PathSense sense = gbwtgraph::get_path_sense(gbwt, path_id, gbwt_reference_samples);
string sample_name = gbwtgraph::get_path_sample_name(gbwt, path_id, sense);
if (sample_name != PathMetadata::NO_SAMPLE_NAME) {
haplotypes.insert(make_pair(sample_name,
gbwtgraph::get_path_haplotype(gbwt, path_id, sense)));
} else {
haplotypes.insert(make_pair(gbwtgraph::compose_path_name(gbwt, path_id, sense), 0));
}
}
return haplotypes.size();

}

}

Expand Down
9 changes: 8 additions & 1 deletion src/traversal_finder.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -638,7 +638,9 @@ class GBWTTraversalFinder : public TraversalFinder {

const HandleGraph& graph;
const gbwt::GBWT& gbwt;

// When using the gbwt we need some precomputed information to ask about stored paths.
unordered_set<string> gbwt_reference_samples;

public:

GBWTTraversalFinder(const HandleGraph& graph, const gbwt::GBWT& gbwt);
Expand Down Expand Up @@ -666,6 +668,11 @@ class GBWTTraversalFinder : public TraversalFinder {
virtual pair<vector<Traversal>, vector<gbwt::size_type>> find_path_traversals(const handle_t& snarl_start, const handle_t& snarl_end);

const gbwt::GBWT& get_gbwt() { return gbwt; }

/**
* Count the unique sample/haplotype pairs that are found in the nsarl (from second output of find_path_traversals)
*/
int64_t count_haplotypes(const vector<gbwt::size_type>& gbwt_paths) const;

protected:

Expand Down
Loading