Skip to content

Commit bc61c69

Browse files
committed
Merge remote-tracking branch 'origin/master' into lr-giraffe
2 parents 7455828 + e20fd03 commit bc61c69

15 files changed

+220
-67
lines changed

Makefile

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -717,15 +717,17 @@ $(LIB_DIR)/libtabixpp.a: $(LIB_DIR)/libhts.a $(TABIXPP_DIR)/*.cpp $(TABIXPP_DIR)
717717
+echo "Cflags: -I$(CWD)/$(INC_DIR)" >> $(LIB_DIR)/pkgconfig/tabixpp.pc
718718
+echo "Libs: -L$(CWD)/$(LIB_DIR) -ltabixpp" >> $(LIB_DIR)/pkgconfig/tabixpp.pc
719719

720-
# Build vcflib. Install the library and headers but not binaries or man pages.
721-
# We need to build as RelWithDebInfo to avoid vcflib using its own
720+
# Build vcflib. Build the library and any needed binarise. Install the library
721+
# and headers but not binaries or man pages. We need to build as RelWithDebInfo
722+
# to avoid vcflib using its own
722723
# -march=native, which would conflict with the -march that comes in through
723-
# CXXFLAGS from the vg Dockerfile.
724-
# We also need to use the magic path hint to let CMake find Mac OpenMP.
725-
# We need to use /usr first for CMake search or Ubuntu 22.04 will decide pybind11 is installed in / when actually it is only fully installed in /usr.
724+
# CXXFLAGS from the vg Dockerfile. We also need to use the magic path hint to
725+
# let CMake find Mac OpenMP. We need to use /usr first for CMake search or
726+
# Ubuntu 22.04 will decide pybind11 is installed in / when actually it is only
727+
# fully installed in /usr.
726728
$(LIB_DIR)/libvcflib.a: $(LIB_DIR)/libhts.a $(LIB_DIR)/libtabixpp.a $(VCFLIB_DIR)/src/*.cpp $(VCFLIB_DIR)/src/*.hpp $(VCFLIB_DIR)/contrib/*/*.cpp $(VCFLIB_DIR)/contrib/*/*.h
727729
+rm -f $(VCFLIB_DIR)/contrib/WFA2-lib/VERSION
728-
+cd $(VCFLIB_DIR) && rm -Rf build && mkdir build && cd build && PKG_CONFIG_PATH="$(CWD)/$(LIB_DIR)/pkgconfig:$(PKG_CONFIG_PATH)" cmake -DCMAKE_C_COMPILER="$(CC)" -DCMAKE_CXX_COMPILER="$(CXX)" -DCMAKE_VERBOSE_MAKEFILE:BOOL=ON -DZIG=OFF -DCMAKE_C_FLAGS="$(CFLAGS)" -DCMAKE_CXX_FLAGS="$(CXXFLAGS) ${CPPFLAGS}" -DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_INSTALL_PREFIX=$(CWD) -DCMAKE_PREFIX_PATH="/usr;$(OMP_PREFIXES)" .. && cmake --build .
730+
+cd $(VCFLIB_DIR) && rm -Rf build && mkdir build && cd build && PKG_CONFIG_PATH="$(CWD)/$(LIB_DIR)/pkgconfig:$(PKG_CONFIG_PATH)" cmake -DCMAKE_C_COMPILER="$(CC)" -DCMAKE_CXX_COMPILER="$(CXX)" -DCMAKE_VERBOSE_MAKEFILE:BOOL=ON -DZIG=OFF -DCMAKE_C_FLAGS="$(CFLAGS)" -DCMAKE_CXX_FLAGS="$(CXXFLAGS) ${CPPFLAGS}" -DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_INSTALL_PREFIX=$(CWD) -DCMAKE_PREFIX_PATH="/usr;$(OMP_PREFIXES)" .. && cmake --build . --target vcflib vcf2tsv
729731
+cp $(VCFLIB_DIR)/contrib/filevercmp/*.h* $(INC_DIR)
730732
+cp $(VCFLIB_DIR)/contrib/fastahack/*.h* $(INC_DIR)
731733
+cp $(VCFLIB_DIR)/contrib/smithwaterman/*.h* $(INC_DIR)

deps/libvgio

doc/wiki

Submodule wiki updated from 815d586 to b4bb95d

src/hts_alignment_emitter.cpp

Lines changed: 39 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -119,7 +119,7 @@ pair<vector<pair<string, int64_t>>, unordered_map<string, int64_t>> extract_path
119119
return make_pair(path_names_and_lengths, subpath_to_length);
120120
}
121121

122-
vector<tuple<path_handle_t, size_t, size_t>> get_sequence_dictionary(const string& filename, const vector<string>& path_names, const PathPositionHandleGraph& graph) {
122+
vector<tuple<path_handle_t, size_t, size_t>> get_sequence_dictionary(const string& filename, const vector<string>& path_names, const std::unordered_set<std::string>& reference_samples, const PathPositionHandleGraph& graph) {
123123

124124
// TODO: We assume we're using the one true default path metadata <-> name mapping
125125

@@ -282,9 +282,21 @@ vector<tuple<path_handle_t, size_t, size_t>> get_sequence_dictionary(const strin
282282
// We got no paths in so we need to guess them.
283283
// We will deduplicate their base names, without subpath info.
284284
unordered_set<string> base_names;
285+
286+
// We will also track the distinct reference samples and complain if
287+
// there are several, because people don't usually want that.
288+
std::unordered_set<std::string> sample_names_in_use;
285289

286290
// When we find a path or subpath we want, we will keep it.
287291
auto keep_path_or_subpath = [&](const path_handle_t& path) {
292+
if (!reference_samples.empty()) {
293+
if (!reference_samples.count(graph.get_sample_name(path))) {
294+
// This is not on any acceptable reference, so skip it.
295+
return;
296+
}
297+
}
298+
sample_names_in_use.insert(graph.get_sample_name(path));
299+
288300
string base_name = get_path_base_name(graph, path);
289301
int64_t base_length = -1;
290302
if (graph.get_subrange(path) == PathMetadata::NO_SUBRANGE) {
@@ -305,9 +317,35 @@ vector<tuple<path_handle_t, size_t, size_t>> get_sequence_dictionary(const strin
305317

306318
// First look for reference sense paths and their subpaths
307319
graph.for_each_path_of_sense(PathSense::REFERENCE, keep_path_or_subpath);
320+
321+
if (sample_names_in_use.size() > 1 && reference_samples.empty()) {
322+
// We auto-detected multiple references. Warn the user that they probably don't want this.
323+
#pragma omp critical (cerr)
324+
{
325+
std::cerr << "warning:[vg::get_sequence_dictionary] Using multiple target reference assemblies (";
326+
for (auto it = sample_names_in_use.begin(); it != sample_names_in_use.end(); ++it) {
327+
if (it != sample_names_in_use.begin()) {
328+
std::cerr << ", ";
329+
}
330+
std::cerr << *it;
331+
}
332+
std::cerr << "). Most tools that read SAM/BAM/CRAM will not support this. Consider providing a reference path list or dictionary, or reference sample name." << std::endl;
333+
}
334+
}
335+
336+
if (sample_names_in_use.size() < reference_samples.size()) {
337+
for (auto& name : reference_samples) {
338+
if (!sample_names_in_use.count(name)) {
339+
// TODO: We should learn to promote whole haplotype-sense samples on the fly. For now error out.
340+
cerr << "error:[vg::get_sequence_dictionary] Requested reference assembly " << name << " is not a reference assembly in the graph." << endl;
341+
exit(1);
342+
}
343+
}
344+
}
308345

309346
if (input_names_lengths.empty()) {
310347
// If none of those exist, try generic sense paths and their subpaths
348+
#pragma omp critical (cerr)
311349
cerr << "warning:[vg::get_sequence_dictionary] No reference-sense paths available in the graph; falling back to generic paths." << endl;
312350
graph.for_each_path_of_sense(PathSense::GENERIC, [&](const path_handle_t& path) {
313351
if (Paths::is_alt(graph.get_path_name(path))) {

src/hts_alignment_emitter.hpp

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -73,9 +73,11 @@ unique_ptr<AlignmentEmitter> get_alignment_emitter(const string& filename, const
7373
* those in the file, if any.
7474
*
7575
* If the filename is itself an empty string, and no path names are passed,
76-
* then all reference-sense paths from the graph will be collected in arbitrary
76+
* then reference-sense paths from the graph will be collected in arbitrary
7777
* order. If there are none, all non-alt-allele generic sense paths from the
78-
* graph will be collected in arbitrary order.
78+
* graph will be collected in arbitrary order. If reference_samples is
79+
* nonempty, only reference-sense paths in those particular samples will be
80+
* used.
7981
*
8082
* TODO: Be able to generate the autosomes human-sort, X, Y, MT order typical
8183
* of references.
@@ -85,7 +87,7 @@ unique_ptr<AlignmentEmitter> get_alignment_emitter(const string& filename, const
8587
* This information needs to come from the user in order to be correct, but
8688
* if it's not specified, it'll be guessed from the graph
8789
*/
88-
vector<tuple<path_handle_t, size_t, size_t>> get_sequence_dictionary(const string& filename, const vector<string>& path_names, const PathPositionHandleGraph& graph);
90+
vector<tuple<path_handle_t, size_t, size_t>> get_sequence_dictionary(const string& filename, const vector<string>& path_names, const std::unordered_set<std::string>& reference_samples, const PathPositionHandleGraph& graph);
8991

9092
/**
9193
* Given a list of path handles and size info (from get_sequence_dictionary), return two things:

src/minimizer_mapper.hpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1082,13 +1082,15 @@ class MinimizerMapper : public AlignerClient {
10821082
*/
10831083
static std::pair<size_t, size_t> align_sequence_between(const pos_t& left_anchor, const pos_t& right_anchor, size_t max_path_length, size_t max_gap_length, const HandleGraph* graph, const GSSWAligner* aligner, Alignment& alignment, const std::string* alignment_name = nullptr, size_t max_dp_cells = std::numeric_limits<size_t>::max(), const std::function<size_t(const Alignment&, const HandleGraph&)>& choose_band_padding = algorithms::pad_band_random_walk());
10841084

1085+
public:
10851086
/**
10861087
* Version of align_sequence_between() that guarantees that you get the
10871088
* same answer (modulo reverse-complementation) no matter whether the
10881089
* sequence and anchors are reverse-complemented or not.
10891090
*/
10901091
static std::pair<size_t, size_t> align_sequence_between_consistently(const pos_t& left_anchor, const pos_t& right_anchor, size_t max_path_length, size_t max_gap_length, const HandleGraph* graph, const GSSWAligner* aligner, Alignment& alignment, const std::string* alignment_name = nullptr, size_t max_dp_cells = std::numeric_limits<size_t>::max(), const std::function<size_t(const Alignment&, const HandleGraph&)>& choose_band_padding = algorithms::pad_band_random_walk());
1091-
1092+
1093+
protected:
10921094
/**
10931095
* Produce a WFAAlignment of the given sequence between the given points
10941096
* that will be the same (modulo reverse-complementation) no matter whether

src/minimizer_mapper_from_chains.cpp

Lines changed: 34 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1011,7 +1011,6 @@ vector<Alignment> MinimizerMapper::map_from_chains(Alignment& aln) {
10111011
return mappings;
10121012
}
10131013

1014-
#define debug
10151014
void MinimizerMapper::do_fragmenting_on_trees(Alignment& aln, const ZipCodeForest& zip_code_forest,
10161015
const std::vector<Seed>& seeds, const VectorView<MinimizerMapper::Minimizer>& minimizers,
10171016
const vector<algorithms::Anchor>& seed_anchors,
@@ -1618,7 +1617,6 @@ void MinimizerMapper::do_fragmenting_on_trees(Alignment& aln, const ZipCodeFores
16181617
}
16191618

16201619
}
1621-
#undef debug
16221620

16231621
void MinimizerMapper::do_chaining_on_fragments(Alignment& aln, const ZipCodeForest& zip_code_forest,
16241622
const std::vector<Seed>& seeds, const VectorView<MinimizerMapper::Minimizer>& minimizers,
@@ -3587,14 +3585,19 @@ void MinimizerMapper::with_dagified_local_graph(const pos_t& left_anchor, const
35873585
nid_t local_left_anchor_id = 0;
35883586
nid_t local_right_anchor_id = 0;
35893587
for (auto& kv : local_to_base) {
3590-
if (kv.second == id(left_anchor) && kv.second == id(right_anchor)) {
3588+
auto& local_id = kv.first;
3589+
auto& base_id = kv.second;
3590+
if (base_id == id(left_anchor) && base_id == id(right_anchor)) {
35913591
// The left and right anchors are on the same node, and this is a copy of it.
35923592
// It could be that the anchors face each other, and we extracted one intervening piece of node.
35933593
// In which case we go through this section once.
35943594
if (local_left_anchor_id == 0 && local_right_anchor_id == 0) {
35953595
// First time through, say we probably cut out the middle piece of a node
3596-
local_left_anchor_id = kv.first;
3597-
local_right_anchor_id = kv.first;
3596+
local_left_anchor_id = local_id;
3597+
local_right_anchor_id = local_id;
3598+
#ifdef debug
3599+
std::cerr << "Assume left and right anchors are both node " << local_id << " representing " << base_id << std::endl;
3600+
#endif
35983601
} else {
35993602
// Or it could be that we have two pieces of the original
36003603
// shared node represented as separate nodes, because the
@@ -3619,10 +3622,13 @@ void MinimizerMapper::with_dagified_local_graph(const pos_t& left_anchor, const
36193622
}
36203623
// Whichever copy has the lower ID is the left one and
36213624
// whichever copy has the higher ID is the right one.
3622-
local_left_anchor_id = std::min(local_left_anchor_id, kv.first);
3623-
local_right_anchor_id = std::max(local_right_anchor_id, kv.second);
3625+
local_left_anchor_id = std::min(local_left_anchor_id, local_id);
3626+
local_right_anchor_id = std::max(local_right_anchor_id, local_id);
3627+
#ifdef debug
3628+
std::cerr << "Second shared anchor copy as " << local_id << " representing " << base_id << "; left is now " << local_left_anchor_id << " and right is " << local_right_anchor_id << std::endl;
3629+
#endif
36243630
}
3625-
} else if (kv.second == id(left_anchor)) {
3631+
} else if (base_id == id(left_anchor)) {
36263632
if (local_left_anchor_id != 0) {
36273633
// We thought we already figured out the start node; there are
36283634
// too many copies of our start node to find it.
@@ -3635,8 +3641,11 @@ void MinimizerMapper::with_dagified_local_graph(const pos_t& left_anchor, const
36353641
local_graph.serialize("crashdump.vg");
36363642
throw std::runtime_error(ss.str());
36373643
}
3638-
local_left_anchor_id = kv.first;
3639-
} else if (kv.second == id(right_anchor)) {
3644+
local_left_anchor_id = local_id;
3645+
#ifdef debug
3646+
std::cerr << "Left anchor is " << local_left_anchor_id << std::endl;
3647+
#endif
3648+
} else if (base_id == id(right_anchor)) {
36403649
if (local_right_anchor_id != 0) {
36413650
// We thought we already figured out the end node; there are
36423651
// too many copies of our end node to find it.
@@ -3649,18 +3658,15 @@ void MinimizerMapper::with_dagified_local_graph(const pos_t& left_anchor, const
36493658
local_graph.serialize("crashdump.vg");
36503659
throw std::runtime_error(ss.str());
36513660
}
3652-
local_right_anchor_id = kv.first;
3661+
local_right_anchor_id = local_id;
3662+
#ifdef debug
3663+
std::cerr << "Right anchor is " << local_right_anchor_id << std::endl;
3664+
#endif
36533665
}
36543666
// TODO: Stop early when we found them all.
36553667
}
36563668

36573669
if (!is_empty(left_anchor) && local_left_anchor_id == 0) {
3658-
#pragma omp critical (cerr)
3659-
{
3660-
for (auto& kv : local_to_base) {
3661-
std::cerr << "Local ID " << kv.first << " = base graph ID " << kv.second << std::endl;
3662-
}
3663-
}
36643670
// Somehow the left anchor didn't come through. Complain.
36653671
std::stringstream ss;
36663672
ss << "Extracted graph of " << local_graph.get_node_count() << " nodes";
@@ -3716,6 +3722,10 @@ void MinimizerMapper::with_dagified_local_graph(const pos_t& left_anchor, const
37163722

37173723
// And get the node that that orientation of it is in the strand-split graph
37183724
handle_t overlay_handle = split_graph.get_overlay_handle(local_handle);
3725+
3726+
#ifdef debug
3727+
std::cerr << "Left anchor " << local_graph.get_id(local_handle) << (local_graph.get_is_reverse(local_handle) ? "-" : "+") << " in local graph is " << split_graph.get_id(overlay_handle) << (split_graph.get_is_reverse(overlay_handle) ? "-" : "+") << " in strand-split graph." << std::endl;
3728+
#endif
37193729

37203730
// And use that
37213731
bounding_handles.push_back(overlay_handle);
@@ -3742,6 +3752,10 @@ void MinimizerMapper::with_dagified_local_graph(const pos_t& left_anchor, const
37423752
// And get the node that that orientation of it is in the strand-split graph
37433753
// But flip it because we want to dagify going inwards from the right
37443754
handle_t overlay_handle = split_graph.flip(split_graph.get_overlay_handle(local_handle));
3755+
3756+
#ifdef debug
3757+
std::cerr << "Right anchor " << local_graph.get_id(local_handle) << (local_graph.get_is_reverse(local_handle) ? "-" : "+") << " in local graph is " << split_graph.get_id(overlay_handle) << (split_graph.get_is_reverse(overlay_handle) ? "-" : "+") << " in strand-split graph." << std::endl;
3758+
#endif
37453759

37463760
// And use that
37473761
bounding_handles.push_back(overlay_handle);
@@ -3856,20 +3870,20 @@ std::pair<size_t, size_t> MinimizerMapper::align_sequence_between(const pos_t& l
38563870
// This is a head in the subgraph, and either matches the
38573871
// left anchor or we don't have any, so keep it.
38583872
#ifdef debug
3859-
std::cerr << "Dagified graph node " << dagified_graph.get_id(h) << " " << dagified_graph.get_is_reverse(h) << " is an acceptable source (" << base_coords.first << " " << base_coords.second << ")" << std::endl;
3873+
std::cerr << "Dagified graph node " << dagified_graph.get_id(h) << " " << dagified_graph.get_is_reverse(h) << " is an acceptable source (" << base_coords.first << " " << base_coords.second << ") length " << dagified_graph.get_length(h) << std::endl;
38603874
#endif
38613875
} else if (dagified_graph.get_is_reverse(h) && (is_empty(right_anchor) || h == dagified_graph.flip(right_anchor_handle))) {
38623876
// Tip is inward reverse, so it's a sink.
38633877
// This is a tail in the subgraph, and either matches the right
38643878
// anchor (which faces outwards) or we don't have any, so keep it.
38653879
#ifdef debug
3866-
std::cerr << "Dagified graph node " << dagified_graph.get_id(h) << " " << dagified_graph.get_is_reverse(h) << " is an acceptable sink (" << base_coords.first << " " << base_coords.second << ")" << std::endl;
3880+
std::cerr << "Dagified graph node " << dagified_graph.get_id(h) << " " << dagified_graph.get_is_reverse(h) << " is an acceptable sink (" << base_coords.first << " " << base_coords.second << ") length " << dagified_graph.get_length(h) << std::endl;
38673881
#endif
38683882
} else {
38693883
// This is a wrong orientation or other copy of an anchoring node, or some other tip.
38703884
// We don't want to keep this handle
38713885
#ifdef debug
3872-
std::cerr << "Dagified graph node " << dagified_graph.get_id(h) << " " << dagified_graph.get_is_reverse(h) << " is an unacceptable tip (" << base_coords.first << " " << base_coords.second << ")" << std::endl;
3886+
std::cerr << "Dagified graph node " << dagified_graph.get_id(h) << " " << dagified_graph.get_is_reverse(h) << " is an unacceptable tip (" << base_coords.first << " " << base_coords.second << ") length " << dagified_graph.get_length(h) << std::endl;
38733887
#endif
38743888
nid_t dagified_id = dagified_graph.get_id(h);
38753889
if (!to_remove_ids.count(dagified_id)) {

src/primer_filter.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -288,7 +288,8 @@ std::pair<string, size_t> PrimerFinder::get_graph_coordinates_from_sequence(cons
288288
//TODO: These are empty but they could be command line arguments
289289
string path_file;
290290
vector<string> path_names;
291-
vector<tuple<path_handle_t, size_t, size_t>> sequence_dictionary = get_sequence_dictionary(path_file, path_names, *graph);
291+
std::unordered_set<std::string> reference_assembly_names;
292+
vector<tuple<path_handle_t, size_t, size_t>> sequence_dictionary = get_sequence_dictionary(path_file, path_names, reference_assembly_names, *graph);
292293
unordered_set<path_handle_t> reference_paths;
293294
reference_paths.reserve(sequence_dictionary.size());
294295
for (auto& entry : sequence_dictionary) {

0 commit comments

Comments
 (0)