From 18d9bfcbb20e7b5efea7e7c70fa3d22fcc9d8335 Mon Sep 17 00:00:00 2001 From: Itolstoganov Date: Mon, 19 Jan 2026 23:53:26 +0100 Subject: [PATCH 01/17] Postrebase fix --- .../scaffold_graph/scaffold_graph.cpp | 307 +++++++++++++ .../scaffold_graph/scaffold_graph.hpp | 262 ++++++++++++ .../scaffold_graph/scaffold_vertex_index.hpp | 2 +- .../scaffold_vertex_index_builder.hpp | 2 +- .../modules/path_extend/pipeline/launcher.cpp | 8 +- .../modules/path_extend/pipeline/launcher.hpp | 4 +- .../contracted_gfa_writer.cpp | 0 .../read_cloud_connection_conditions.cpp | 140 ++++++ .../read_cloud_connection_conditions.hpp | 122 ++++++ .../scaffold_graph_extractor.cpp | 77 ++++ .../scaffold_graph_extractor.hpp | 29 ++ .../connection_condition2015.cpp | 6 + .../connection_condition2015.hpp | 2 + .../scaffolder2015/scaffold_graph.hpp | 2 +- .../scaffold_graph_constructor.cpp | 286 ++++++++++++- .../scaffold_graph_constructor.hpp | 165 ++++++- .../scaffold_graph_dijkstra.hpp | 246 +++++++++++ .../scaffold_graph_visualizer.cpp | 18 +- .../scaffold_graph_visualizer.hpp | 36 +- .../scaffold_vertex_predicates.cpp | 26 ++ .../scaffold_vertex_predicates.hpp | 51 +++ src/projects/splitter/CMakeLists.txt | 20 + .../splitter/barcode_index_construction.cpp | 83 ++++ .../splitter/barcode_index_construction.hpp | 40 ++ src/projects/splitter/graph_resolver.cpp | 128 ++++++ src/projects/splitter/graph_resolver.hpp | 46 ++ src/projects/splitter/graph_resolver_io.cpp | 43 ++ src/projects/splitter/graph_resolver_io.hpp | 25 ++ src/projects/splitter/main.cpp | 403 ++++++++++++++++++ src/projects/splitter/path_extractor.cpp | 166 ++++++++ src/projects/splitter/path_extractor.hpp | 57 +++ .../splitter/scaffold_graph_helper.cpp | 254 +++++++++++ .../splitter/scaffold_graph_helper.hpp | 117 +++++ src/projects/splitter/vertex_resolver.hpp | 292 +++++++++++++ 34 files changed, 3418 insertions(+), 47 deletions(-) create mode 100644 src/common/auxiliary_graphs/scaffold_graph/scaffold_graph.cpp create mode 100644 src/common/auxiliary_graphs/scaffold_graph/scaffold_graph.hpp create mode 100644 src/common/modules/path_extend/read_cloud_path_extend/contracted_graph_scaffolding/contracted_gfa_writer.cpp create mode 100644 src/common/modules/path_extend/read_cloud_path_extend/scaffold_graph_construction/read_cloud_connection_conditions.cpp create mode 100644 src/common/modules/path_extend/read_cloud_path_extend/scaffold_graph_construction/read_cloud_connection_conditions.hpp create mode 100644 src/common/modules/path_extend/read_cloud_path_extend/scaffold_graph_extractor.cpp create mode 100644 src/common/modules/path_extend/read_cloud_path_extend/scaffold_graph_extractor.hpp create mode 100644 src/common/modules/path_extend/scaffolder2015/scaffold_graph_dijkstra.hpp create mode 100644 src/common/modules/path_extend/scaffolder2015/scaffold_vertex_predicates.cpp create mode 100644 src/common/modules/path_extend/scaffolder2015/scaffold_vertex_predicates.hpp create mode 100644 src/projects/splitter/CMakeLists.txt create mode 100644 src/projects/splitter/barcode_index_construction.cpp create mode 100644 src/projects/splitter/barcode_index_construction.hpp create mode 100644 src/projects/splitter/graph_resolver.cpp create mode 100644 src/projects/splitter/graph_resolver.hpp create mode 100644 src/projects/splitter/graph_resolver_io.cpp create mode 100644 src/projects/splitter/graph_resolver_io.hpp create mode 100644 src/projects/splitter/main.cpp create mode 100644 src/projects/splitter/path_extractor.cpp create mode 100644 src/projects/splitter/path_extractor.hpp create mode 100644 src/projects/splitter/scaffold_graph_helper.cpp create mode 100644 src/projects/splitter/scaffold_graph_helper.hpp create mode 100644 src/projects/splitter/vertex_resolver.hpp diff --git a/src/common/auxiliary_graphs/scaffold_graph/scaffold_graph.cpp b/src/common/auxiliary_graphs/scaffold_graph/scaffold_graph.cpp new file mode 100644 index 0000000000..f071dcb9c2 --- /dev/null +++ b/src/common/auxiliary_graphs/scaffold_graph/scaffold_graph.cpp @@ -0,0 +1,307 @@ + +//*************************************************************************** +//* Copyright (c) 2023-2024 SPAdes team +//* All Rights Reserved +//* See file LICENSE for details. +//*************************************************************************** + +#include "scaffold_graph.hpp" + +namespace scaffold_graph { + +std::atomic ScaffoldGraph::ScaffoldEdge::scaffold_edge_id_{0}; + +bool ScaffoldGraph::ScaffoldEdge::operator<(const ScaffoldGraph::ScaffoldEdge& rhs) const { + return id_ < rhs.id_; +} +bool ScaffoldGraph::ScaffoldEdge::operator>(const ScaffoldGraph::ScaffoldEdge& rhs) const { + return rhs < *this; +} +bool ScaffoldGraph::ScaffoldEdge::operator<=(const ScaffoldGraph::ScaffoldEdge& rhs) const { + return !(rhs < *this); +} +bool ScaffoldGraph::ScaffoldEdge::operator>=(const ScaffoldGraph::ScaffoldEdge& rhs) const { + return !(*this < rhs); +} +bool ScaffoldGraph::ScaffoldEdge::operator==(const ScaffoldGraph::ScaffoldEdge &e) const { + return color_ == e.color_ && weight_ == e.weight_ && start_ == e.start_ && end_ == e.end_; +} + +void ScaffoldGraph::AddEdgeSimple(const ScaffoldGraph::ScaffoldEdge &e) { + edges_.emplace(e.getId(), e); + outgoing_edges_.emplace(e.getStart(), e.getId()); + incoming_edges_.emplace(e.getEnd(), e.getId()); +} + +void ScaffoldGraph::DeleteOutgoing(const ScaffoldGraph::ScaffoldEdge &e) { + auto e_range = outgoing_edges_.equal_range(e.getStart()); + for (auto edge_id = e_range.first; edge_id != e_range.second;) { + if (edges_.at(edge_id->second) == e) { + edge_id = outgoing_edges_.erase(edge_id); + } else { + ++edge_id; + } + } +} + +void ScaffoldGraph::DeleteIncoming(const ScaffoldGraph::ScaffoldEdge &e) { + auto e_range = incoming_edges_.equal_range(e.getEnd()); + for (auto edge_id = e_range.first; edge_id != e_range.second;) { + if (edges_.at(edge_id->second) == e) { + edge_id = incoming_edges_.erase(edge_id); + } else { + ++edge_id; + } + } +} + +void ScaffoldGraph::DeleteAllOutgoingEdgesSimple(ScaffoldVertex v) { + auto e_range = outgoing_edges_.equal_range(v); + for (auto edge_id = e_range.first; edge_id != e_range.second; ++edge_id) { + DeleteIncoming(edges_.at(edge_id->second)); + } + outgoing_edges_.erase(v); +} + +void ScaffoldGraph::DeleteEdgeFromStorage(const ScaffoldGraph::ScaffoldEdge &e) { + VERIFY(!Exists(e)); + edges_.erase(e.getId()); +} + +void ScaffoldGraph::DeleteAllIncomingEdgesSimple(ScaffoldVertex v) { + auto e_range = incoming_edges_.equal_range(v); + for (auto edge_id = e_range.first; edge_id != e_range.second; ++edge_id) { + DeleteOutgoing(edges_.at(edge_id->second)); + } + incoming_edges_.erase(v); +} + +bool ScaffoldGraph::Exists(ScaffoldVertex assembly_graph_edge) const { + return vertices_.count(assembly_graph_edge) != 0; +} + +bool ScaffoldGraph::Exists(const ScaffoldGraph::ScaffoldEdge &e) const { + auto e_range = outgoing_edges_.equal_range(e.getStart()); + for (auto edge_id = e_range.first; edge_id != e_range.second; ++edge_id) { + if (edges_.at(edge_id->second) == e) { + return true; + } + } + return false; +} + +ScaffoldVertex ScaffoldGraph::conjugate(ScaffoldVertex scaffold_vertex) const { + return scaffold_vertex.GetConjugateFromGraph(assembly_graph_); +} + +ScaffoldGraph::ScaffoldEdge ScaffoldGraph::conjugate(const ScaffoldGraph::ScaffoldEdge &e) const { + return ScaffoldEdge(conjugate(e.getEnd()), conjugate(e.getStart()), e.getColor(), e.getWeight()); +} + +bool ScaffoldGraph::AddVertex(ScaffoldVertex scaffold_vertex) { + if (!Exists(scaffold_vertex)) { + VERIFY(!Exists(conjugate(scaffold_vertex))); + vertices_.insert(scaffold_vertex); + vertices_.insert(conjugate(scaffold_vertex)); + return true; + } + return false; +} + +bool ScaffoldGraph::AddEdge(ScaffoldVertex v1, ScaffoldVertex v2, size_t lib_id, double weight, size_t length) { + VERIFY(Exists(v1)); + VERIFY(Exists(v2)); + + ScaffoldEdge e(v1, v2, lib_id, weight, length); + if (Exists(e)) { + return false; + } + + AddEdgeSimple(e); + return true; +} + +void ScaffoldGraph::Print(std::ostream &os) const { + for (auto v: vertices_) { + os << "Vertex " << int_id(v) << " ~ " << int_id(conjugate(v)) + << ": len = " << v.GetLengthFromGraph(assembly_graph_) << ", cov = " + << v.GetCoverageFromGraph(assembly_graph_) << std::endl; + } + for (auto e_iter = edges_.begin(); e_iter != edges_.end(); ++e_iter) { + os << "Edge " << e_iter->second.getId() << + ": " << int_id(e_iter->second.getStart()) << " -> " << int_id(e_iter->second.getEnd()) << + ", lib index = " << e_iter->second.getColor() << ", weight " << e_iter->second.getWeight() + << ", length = " << e_iter->second.getLength() << std::endl; + } +} + +ScaffoldGraph::ScaffoldEdge ScaffoldGraph::UniqueIncoming(ScaffoldVertex assembly_graph_edge) const { + VERIFY(HasUniqueIncoming(assembly_graph_edge)); + return edges_.at(incoming_edges_.find(assembly_graph_edge)->second); +} + +ScaffoldGraph::ScaffoldEdge ScaffoldGraph::UniqueOutgoing(ScaffoldVertex assembly_graph_edge) const { + VERIFY(HasUniqueOutgoing(assembly_graph_edge)); + return edges_.at(outgoing_edges_.find(assembly_graph_edge)->second); +} + +bool ScaffoldGraph::HasUniqueIncoming(ScaffoldVertex assembly_graph_edge) const { + return IncomingEdgeCount(assembly_graph_edge) == 1; +} + +bool ScaffoldGraph::HasUniqueOutgoing(ScaffoldVertex assembly_graph_edge) const { + return OutgoingEdgeCount(assembly_graph_edge) == 1; +} + +size_t ScaffoldGraph::IncomingEdgeCount(ScaffoldVertex assembly_graph_edge) const { + return incoming_edges_.count(assembly_graph_edge); +} + +size_t ScaffoldGraph::OutgoingEdgeCount(ScaffoldVertex assembly_graph_edge) const { + return outgoing_edges_.count(assembly_graph_edge); +} + +std::vector ScaffoldGraph::IncomingEdges(scaffold_graph::ScaffoldVertex assembly_graph_edge) const { + std::vector result; + auto e_range = incoming_edges_.equal_range(assembly_graph_edge); + for (auto edge_id = e_range.first; edge_id != e_range.second; ++edge_id) { + result.push_back(edges_.at(edge_id->second)); + } + return result; +} + +std::vector ScaffoldGraph::OutgoingEdges(scaffold_graph::ScaffoldVertex assembly_graph_edge) const { + std::vector result; + auto e_range = outgoing_edges_.equal_range(assembly_graph_edge); + for (auto edge_id = e_range.first; edge_id != e_range.second; ++edge_id) { + result.push_back(edges_.at(edge_id->second)); + } + return result; +} + +const debruijn_graph::Graph &ScaffoldGraph::AssemblyGraph() const { + return assembly_graph_; +} + +size_t ScaffoldGraph::EdgeCount() const { + return edges_.size(); +} + +size_t ScaffoldGraph::VertexCount() const { + return vertices_.size(); +} + +ScaffoldVertex ScaffoldGraph::EdgeEnd(ScaffoldEdge e) const { + return e.getEnd(); +} + +ScaffoldVertex ScaffoldGraph::EdgeStart(ScaffoldEdge e) const { + return e.getStart(); +} + +size_t ScaffoldGraph::int_id(ScaffoldGraph::ScaffoldEdge e) const { + return e.getId(); +} + +size_t ScaffoldGraph::int_id(ScaffoldVertex v) const { + return v.int_id(); +} + +ScaffoldGraph::ConstScaffoldEdgeIterator ScaffoldGraph::eend() const { + return ConstScaffoldEdgeIterator(edges_.cend()); +} + +ScaffoldGraph::ConstScaffoldEdgeIterator ScaffoldGraph::ebegin() const { + return ConstScaffoldEdgeIterator(edges_.cbegin()); +} + +ScaffoldGraph::VertexStorage::const_iterator ScaffoldGraph::vend() const { + return vertices_.cend(); +} + +ScaffoldGraph::VertexStorage::const_iterator ScaffoldGraph::vbegin() const { + return vertices_.cbegin(); +} + +adt::iterator_range ScaffoldGraph::vertices() const { + return adt::make_range(vbegin(), vend()); +} + +adt::iterator_range ScaffoldGraph::edges() const { + return adt::make_range(ebegin(), eend()); +} + +bool ScaffoldGraph::IsVertexIsolated(ScaffoldVertex assembly_graph_edge) const { + bool result = incoming_edges_.count(assembly_graph_edge) == 0 && outgoing_edges_.count(assembly_graph_edge) == 0; + return result; +} + +bool ScaffoldGraph::RemoveVertex(ScaffoldVertex scaffold_vertex) { + if (Exists(scaffold_vertex)) { + VERIFY(Exists(conjugate(scaffold_vertex))); + + DeleteAllOutgoingEdgesSimple(scaffold_vertex); + DeleteAllIncomingEdgesSimple(scaffold_vertex); + DeleteAllOutgoingEdgesSimple(conjugate(scaffold_vertex)); + DeleteAllIncomingEdgesSimple(conjugate(scaffold_vertex)); + + VERIFY(incoming_edges_.count(scaffold_vertex) == 0); + VERIFY(outgoing_edges_.count(scaffold_vertex) == 0); + VERIFY(incoming_edges_.count(conjugate(scaffold_vertex)) == 0); + VERIFY(outgoing_edges_.count(conjugate(scaffold_vertex)) == 0); + + vertices_.erase(scaffold_vertex); + vertices_.erase(conjugate(scaffold_vertex)); + + return true; + } + return false; +} + +bool ScaffoldGraph::RemoveEdge(const ScaffoldGraph::ScaffoldEdge &e) { + if (Exists(e)) { + DeleteOutgoing(e); + DeleteIncoming(e); + DeleteEdgeFromStorage(e); + + return true; + } + return false; +} + +bool ScaffoldGraph::AddEdge(const ScaffoldGraph::ScaffoldEdge &e) { + return AddEdge(e.getStart(), e.getEnd(), e.getColor(), e.getWeight(), e.getLength()); +} +std::string ScaffoldGraph::str(const ScaffoldVertex& vertex) const { + return vertex.str(assembly_graph_); +} +std::string ScaffoldGraph::str(const ScaffoldGraph::ScaffoldEdge& edge) const { + return "(" + std::to_string(edge.getStart().int_id()) + ", " + std::to_string(edge.getEnd().int_id()) + ")"; +} +size_t ScaffoldGraph::length(const ScaffoldGraph::ScaffoldEdge &edge) const { + return edge.getLength(); +} +size_t ScaffoldGraph::length(const ScaffoldVertex &vertex) const { + return vertex.GetLengthFromGraph(assembly_graph_); +} +double ScaffoldGraph::coverage(const ScaffoldVertex &vertex) const { + return vertex.GetCoverageFromGraph(assembly_graph_); +} +void ScaffoldGraph::AddVertices(const std::set &vertices) { + for (const auto& v: vertices) { + AddVertex(v); + } +} +ScaffoldGraph &ScaffoldGraph::operator=(ScaffoldGraph other) { + swap(other); + return *this; +} +void ScaffoldGraph::swap(ScaffoldGraph &other) { + VERIFY(&assembly_graph_ == &other.assembly_graph_); + std::swap(vertices_, other.vertices_); + std::swap(edges_, other.edges_); + std::swap(outgoing_edges_, other.outgoing_edges_); + std::swap(incoming_edges_, other.incoming_edges_); +} + +} //scaffold_graph diff --git a/src/common/auxiliary_graphs/scaffold_graph/scaffold_graph.hpp b/src/common/auxiliary_graphs/scaffold_graph/scaffold_graph.hpp new file mode 100644 index 0000000000..1ee3ffd9d9 --- /dev/null +++ b/src/common/auxiliary_graphs/scaffold_graph/scaffold_graph.hpp @@ -0,0 +1,262 @@ + +//*************************************************************************** +//* Copyright (c) 2023-2024 SPAdes team +//* All Rights Reserved +//* See file LICENSE for details. +//*************************************************************************** + +// +// Created by andrey on 17.09.15. +// +#pragma once + +#include "scaffold_vertex.hpp" +#include "assembly_graph/core/graph.hpp" +#include "utils/logger/logger.hpp" + +#include "adt/iterator_range.hpp" + +#include + +namespace scaffold_graph { + +//do NOT add "using namespace debruijn_graph" in order not to confuse between EdgeId typdefs + +class ScaffoldGraph { + +public: + //EdgeId in de Bruijn graph is vertex in scaffolding graph + typedef ScaffoldVertex ScaffoldGraphVertex; + + //Unique edge id + typedef size_t ScaffoldEdgeIdT; + + //Scaffold edge indormation class + struct ScaffoldEdge { + private: + //unique id + ScaffoldEdgeIdT id_; + //id counter + static std::atomic scaffold_edge_id_; + + ScaffoldGraphVertex start_; + ScaffoldGraphVertex end_; + //color = lib# + size_t color_; + //read pair weight or anything else + double weight_; + //todo discuss (distance between vertices by default) + size_t length_; + + public: + + ScaffoldEdge(ScaffoldVertex start, ScaffoldVertex end, size_t lib_id = (size_t) -1, double weight = 0, size_t length = 0) : + id_(scaffold_edge_id_++), + start_(start), end_(end), + color_(lib_id), + weight_(weight), + length_(length){ + } + //for consistency with dijkstra + explicit ScaffoldEdge(size_t ): id_(scaffold_edge_id_++), start_(nullptr), end_(nullptr), + color_((size_t) -1), weight_(0), length_(0) {} + + ScaffoldEdge(): id_(scaffold_edge_id_++), start_(nullptr), end_(nullptr), + color_((size_t) -1), weight_(0), length_(0) {} + + ScaffoldEdgeIdT getId() const { + return id_; + } + + + size_t getColor() const { + return color_; + } + + double getWeight() const { + return weight_; + } + + size_t getLength() const { + return length_; + } + + const ScaffoldGraphVertex getStart() const { + return start_; + } + + const ScaffoldGraphVertex getEnd() const { + return end_; + } + + bool operator==(const ScaffoldEdge &e) const; + + bool operator<(const ScaffoldEdge& rhs) const; + bool operator>(const ScaffoldEdge& rhs) const; + bool operator<=(const ScaffoldEdge& rhs) const; + bool operator>=(const ScaffoldEdge& rhs) const; + }; + + //typedef to use in templated graph algorithms + typedef ScaffoldVertex VertexId; + typedef ScaffoldEdge EdgeId; + + //All vertices are stored in set + typedef std::set VertexStorage; + //Edges are stored in map: Id -> Edge Information + typedef std::unordered_map EdgeStorage; + //Adjacency list contains vertrx and edge id (instead of whole edge information) + typedef std::multimap AdjacencyStorage; + + struct ConstScaffoldEdgeIterator: public boost::iterator_facade { + private: + EdgeStorage::const_iterator iter_; + + public: + ConstScaffoldEdgeIterator(EdgeStorage::const_iterator iter) : iter_(iter) { + } + + private: + friend class boost::iterator_core_access; + + void increment() { + ++iter_; + } + + bool equal(const ConstScaffoldEdgeIterator &other) const { + return iter_ == other.iter_; + } + + const ScaffoldEdge& dereference() const { + return iter_->second; + } + }; + +//TODO:: fix this. Seems that only ebegin and eend are broken. +private: + EdgeStorage edges_; + + VertexStorage vertices_; + + const debruijn_graph::Graph &assembly_graph_; + + AdjacencyStorage outgoing_edges_; + + AdjacencyStorage incoming_edges_; + + //Delete outgoing edge from adjancecy list without checks + void DeleteOutgoing(const ScaffoldEdge &e); + + //Delete incoming edge from adjancecy list without checks + void DeleteIncoming(const ScaffoldEdge &e); + + //Delete all edge info from storage + void DeleteEdgeFromStorage(const ScaffoldEdge &e); + + //Detelte all outgoing from v edges from adjacency lists + void DeleteAllOutgoingEdgesSimple(ScaffoldGraphVertex v); + + //Detelte all incoming from v edges from adjacency lists + void DeleteAllIncomingEdgesSimple(ScaffoldGraphVertex v); + +public: + ScaffoldGraph(const debruijn_graph::Graph &g) : assembly_graph_(g) { + } + + ScaffoldGraph(const ScaffoldGraph& other) = default; + + ScaffoldGraph(ScaffoldGraph&& other) = default; + + bool Exists(ScaffoldGraphVertex assembly_graph_edge) const; + + bool Exists(const ScaffoldEdge &e) const; + + ScaffoldGraphVertex conjugate(ScaffoldGraphVertex scaffold_vertex) const; + + //fixme move back to private + void AddEdgeSimple(const ScaffoldEdge &e); + + //Return structure thay is equal to conjugate of e (not exactrly the same structure as in graph) + ScaffoldEdge conjugate(const ScaffoldEdge &e) const; + + //Add isolated vertex to the graph if not exitsts + bool AddVertex(ScaffoldGraphVertex scaffold_vertex); + + void AddVertices(const std::set &vertices); + + //Add edge (and conjugate) if not exists + //v1 and v2 must exist + bool AddEdge(ScaffoldGraphVertex v1, ScaffoldGraphVertex v2, size_t lib_id, double weight, size_t length); + + bool AddEdge(const ScaffoldEdge &e); + + //Rempve edge from edge container and all adjacency lists + bool RemoveEdge(const ScaffoldEdge &e); + + //Remove vertex and all adjacent edges + bool RemoveVertex(ScaffoldGraphVertex scaffold_vertex); + + bool IsVertexIsolated(ScaffoldGraphVertex assembly_graph_edge) const; + + VertexStorage::const_iterator vbegin() const; + + VertexStorage::const_iterator vend() const; + + adt::iterator_range vertices() const; + + ConstScaffoldEdgeIterator ebegin() const; + + ConstScaffoldEdgeIterator eend() const; + + adt::iterator_range edges() const; + + size_t int_id(ScaffoldGraphVertex v) const; + + size_t int_id(ScaffoldEdge e) const; + + ScaffoldGraphVertex EdgeStart(ScaffoldEdge e) const; + + ScaffoldGraphVertex EdgeEnd(ScaffoldEdge e) const; + + size_t VertexCount() const; + + size_t EdgeCount() const; + + const debruijn_graph::Graph & AssemblyGraph() const; + + std::vector OutgoingEdges(ScaffoldVertex assembly_graph_edge) const; + + std::vector IncomingEdges(ScaffoldVertex assembly_graph_edge) const; + + size_t OutgoingEdgeCount(ScaffoldGraphVertex assembly_graph_edge) const; + + size_t IncomingEdgeCount(ScaffoldGraphVertex assembly_graph_edge) const; + + bool HasUniqueOutgoing(ScaffoldGraphVertex assembly_graph_edge) const; + + bool HasUniqueIncoming(ScaffoldGraphVertex assembly_graph_edge) const; + + ScaffoldEdge UniqueOutgoing(ScaffoldGraphVertex assembly_graph_edge) const; + + ScaffoldEdge UniqueIncoming(ScaffoldGraphVertex assembly_graph_edge) const; + + void Print(std::ostream &os) const; + + std::string str(const ScaffoldGraphVertex &vertex) const; + + std::string str(const ScaffoldGraph::ScaffoldEdge &edge) const; + + size_t length(const ScaffoldGraphVertex &vertex) const; + size_t length(const ScaffoldGraph::ScaffoldEdge &edge) const; + + double coverage(const ScaffoldGraphVertex &vertex) const; + + ScaffoldGraph& operator =(ScaffoldGraph other); + + void swap(ScaffoldGraph& other); +}; + +} //scaffold_graph + diff --git a/src/common/auxiliary_graphs/scaffold_graph/scaffold_vertex_index.hpp b/src/common/auxiliary_graphs/scaffold_graph/scaffold_vertex_index.hpp index cd5265b6fe..399d5ec97b 100644 --- a/src/common/auxiliary_graphs/scaffold_graph/scaffold_vertex_index.hpp +++ b/src/common/auxiliary_graphs/scaffold_graph/scaffold_vertex_index.hpp @@ -7,10 +7,10 @@ #pragma once -#include "barcode_info_extractor.hpp" #include "auxiliary_graphs/scaffold_graph/scaffold_vertex.hpp" #include "assembly_graph/core/graph.hpp" #include "adt/iterator_range.hpp" +#include "barcode_index/barcode_info_extractor.hpp" namespace barcode_index { diff --git a/src/common/auxiliary_graphs/scaffold_graph/scaffold_vertex_index_builder.hpp b/src/common/auxiliary_graphs/scaffold_graph/scaffold_vertex_index_builder.hpp index 07356373a1..1c74ceeb0e 100644 --- a/src/common/auxiliary_graphs/scaffold_graph/scaffold_vertex_index_builder.hpp +++ b/src/common/auxiliary_graphs/scaffold_graph/scaffold_vertex_index_builder.hpp @@ -7,8 +7,8 @@ #pragma once +#include "barcode_index/barcode_info_extractor.hpp" #include "scaffold_vertex_index.hpp" -#include "barcode_info_extractor.hpp" namespace barcode_index { diff --git a/src/common/modules/path_extend/pipeline/launcher.cpp b/src/common/modules/path_extend/pipeline/launcher.cpp index 0f2554c9fe..3ac0954f25 100644 --- a/src/common/modules/path_extend/pipeline/launcher.cpp +++ b/src/common/modules/path_extend/pipeline/launcher.cpp @@ -56,8 +56,8 @@ PathExtendLauncher::ConstructPairedConnectionConditions(const ScaffoldingUniqueE return conditions; } -std::shared_ptr PathExtendLauncher::ConstructScaffoldGraph(const ScaffoldingUniqueEdgeStorage &edge_storage) const { - using namespace scaffold_graph; +std::shared_ptr PathExtendLauncher::ConstructScaffoldGraph(const ScaffoldingUniqueEdgeStorage &edge_storage) const { + using namespace scaffolder; const pe_config::ParamSetT::ScaffoldGraphParamsT ¶ms = params_.pset.scaffold_graph_params; @@ -84,11 +84,11 @@ std::shared_ptr PathExtendLauncher::ConstructScaf return scaffold_graph; } -void PathExtendLauncher::PrintScaffoldGraph(const scaffold_graph::ScaffoldGraph &scaffold_graph, +void PathExtendLauncher::PrintScaffoldGraph(const scaffolder::ScaffoldGraph &scaffold_graph, const std::set &main_edge_set, const debruijn_graph::GenomeConsistenceChecker &genome_checker, const std::filesystem::path &filename) const { - using namespace scaffold_graph; + using namespace scaffolder; auto vertex_colorer = std::make_shared(main_edge_set); auto edge_colorer = std::make_shared(); diff --git a/src/common/modules/path_extend/pipeline/launcher.hpp b/src/common/modules/path_extend/pipeline/launcher.hpp index eff18a64c8..14aadf2dcf 100644 --- a/src/common/modules/path_extend/pipeline/launcher.hpp +++ b/src/common/modules/path_extend/pipeline/launcher.hpp @@ -37,10 +37,10 @@ class PathExtendLauncher { std::vector> ConstructPairedConnectionConditions(const ScaffoldingUniqueEdgeStorage &edge_storage) const; - std::shared_ptr + std::shared_ptr ConstructScaffoldGraph(const ScaffoldingUniqueEdgeStorage &edge_storage) const; - void PrintScaffoldGraph(const scaffold_graph::ScaffoldGraph &scaffold_graph, + void PrintScaffoldGraph(const scaffolder::ScaffoldGraph &scaffold_graph, const std::set &main_edge_set, const debruijn_graph::GenomeConsistenceChecker &genome_checker, const std::filesystem::path &filename) const; diff --git a/src/common/modules/path_extend/read_cloud_path_extend/contracted_graph_scaffolding/contracted_gfa_writer.cpp b/src/common/modules/path_extend/read_cloud_path_extend/contracted_graph_scaffolding/contracted_gfa_writer.cpp new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/common/modules/path_extend/read_cloud_path_extend/scaffold_graph_construction/read_cloud_connection_conditions.cpp b/src/common/modules/path_extend/read_cloud_path_extend/scaffold_graph_construction/read_cloud_connection_conditions.cpp new file mode 100644 index 0000000000..452f66782f --- /dev/null +++ b/src/common/modules/path_extend/read_cloud_path_extend/scaffold_graph_construction/read_cloud_connection_conditions.cpp @@ -0,0 +1,140 @@ +//*************************************************************************** +//* Copyright (c) 2019 Saint Petersburg State University +//* All Rights Reserved +//* See file LICENSE for details. +//*************************************************************************** + +#include "read_cloud_connection_conditions.hpp" + +#include "modules/path_extend/pipeline/launcher.hpp" + +namespace path_extend { +namespace read_cloud { + +double NormalizedBarcodeScoreFunction::GetScore(const scaffolder::ScaffoldGraph::ScaffoldEdge &edge) const { + auto first = edge.getStart(); + auto second = edge.getEnd(); + DEBUG("Checking edge " << edge.getStart().int_id() << " -> " << edge.getEnd().int_id()); + size_t first_length = first.GetLengthFromGraph(graph_); + size_t second_length = second.GetLengthFromGraph(graph_); + size_t first_size = barcode_extractor_->GetTailSize(first); + size_t second_size = barcode_extractor_->GetHeadSize(second); + if (first_size == 0 or second_size == 0) { + DEBUG("No barcodes on one of the long edges"); + return 0.0; + } + size_t shared_count = barcode_extractor_->GetIntersectionSize(first, second); + size_t min_size = std::min(first_size, second_size); + double containment_index = static_cast(shared_count) / static_cast(min_size); + if (math::ge(containment_index, 0.05)) { + DEBUG("First length: " << first_length); + DEBUG("Second length: " << second_length); + DEBUG("First size: " << first_size); + DEBUG("Second size: " << second_size); + DEBUG("Intersection: " << shared_count); + DEBUG("Score: " << containment_index); + } + VERIFY(math::ge(1.0, containment_index)); +// double first_coverage = first.GetCoverageFromGraph(graph_); +// double second_coverage = second.GetCoverageFromGraph(graph_); + return containment_index; +} +NormalizedBarcodeScoreFunction::NormalizedBarcodeScoreFunction( + const Graph &graph_, + std::shared_ptr barcode_extractor_) : + AbstractBarcodeScoreFunction(graph_, barcode_extractor_) {} + +TransitiveEdgesPredicate::TransitiveEdgesPredicate(const scaffolder::ScaffoldGraph &graph, + const Graph &g, + size_t distance_threshold) : + scaffold_graph_(graph), g_(g), distance_threshold_(distance_threshold) {} +bool TransitiveEdgesPredicate::Check(const ScaffoldEdgePredicate::ScaffoldEdge &scaffold_edge) const { + ScaffoldVertex current = scaffold_edge.getStart(); + ScaffoldVertex candidate = scaffold_edge.getEnd(); + //fixme replace with dijkstra and length threshold + DEBUG("Checking edge (" << current.int_id() << ", " << candidate.int_id() << ")"); + SimpleSearcher simple_searcher(scaffold_graph_, g_, distance_threshold_); + auto reachable_vertices = simple_searcher.GetReachableVertices(current, scaffold_edge); + for (const auto &vertex: reachable_vertices) { + if (candidate == vertex) { + DEBUG("Found another path, false"); + return false; + } + } + DEBUG("True"); + return true; +} +SimpleSearcher::SimpleSearcher(const scaffolder::ScaffoldGraph &graph, const Graph &g, size_t distance) + : scaff_graph_(graph), g_(g), distance_threshold_(distance) {} +std::vector SimpleSearcher::GetReachableVertices( + const SimpleSearcher::ScaffoldVertex &vertex, + const ScaffoldGraph::ScaffoldEdge &restricted_edge) { + std::vector result; + VertexWithDistance new_vertex(vertex, 0); + std::queue vertex_queue; + vertex_queue.push(new_vertex); + std::unordered_set visited; + visited.insert(vertex); + visited.insert(vertex.GetConjugateFromGraph(g_)); + visited.insert(restricted_edge.getEnd().GetConjugateFromGraph(g_)); + while (not vertex_queue.empty()) { + auto current_vertex = vertex_queue.front(); + vertex_queue.pop(); + DEBUG("Id: " << current_vertex.vertex.int_id()); + DEBUG("Distance: " << current_vertex.distance); + if (current_vertex.distance <= distance_threshold_) { + DEBUG("Passed threshold. Processing") + ProcessVertex(vertex_queue, current_vertex, visited, restricted_edge); + DEBUG("Processing finished"); + result.push_back(current_vertex.vertex); + } + } + return result; +} + +void SimpleSearcher::ProcessVertex(std::queue &vertex_queue, + const VertexWithDistance &vertex, + std::unordered_set &visited, + const ScaffoldGraph::ScaffoldEdge &restricted_edge) { + size_t current_distance = vertex.distance; + size_t new_distance = current_distance + 1; + for (const ScaffoldGraph::ScaffoldEdge &edge: scaff_graph_.OutgoingEdges(vertex.vertex)) { + DEBUG("Checking vertex: " << edge.getEnd().int_id()); + DEBUG("Visited: " << (visited.find(edge.getEnd()) != visited.end())); + DEBUG("Edge restricted: " << AreEqual(edge, restricted_edge)); + if (visited.find(edge.getEnd()) == visited.end() and not AreEqual(edge, restricted_edge)) { + DEBUG("Passed"); + vertex_queue.emplace(edge.getEnd(), new_distance); + visited.insert(edge.getEnd()); + } + } +} +bool SimpleSearcher::AreEqual(const scaffolder::ScaffoldGraph::ScaffoldEdge &first, + const scaffolder::ScaffoldGraph::ScaffoldEdge &second) { + return first.getStart() == second.getStart() and first.getEnd() == second.getEnd(); +} + +SimpleSearcher::VertexWithDistance::VertexWithDistance(const SimpleSearcher::ScaffoldVertex &vertex, size_t distance) + : vertex(vertex), distance(distance) {} + +AbstractBarcodeScoreFunction::AbstractBarcodeScoreFunction( + const Graph &graph_, + const std::shared_ptr barcode_extractor) + : + graph_(graph_), + barcode_extractor_(barcode_extractor) {} +TrivialBarcodeScoreFunction::TrivialBarcodeScoreFunction( + const Graph &graph_, + std::shared_ptr barcode_extractor_, + const size_t read_count_threshold_, + const size_t tail_threshold_) : AbstractBarcodeScoreFunction(graph_, + barcode_extractor_), + read_count_threshold_(read_count_threshold_), + tail_threshold_(tail_threshold_) {} +double TrivialBarcodeScoreFunction::GetScore(const scaffolder::ScaffoldGraph::ScaffoldEdge &edge) const { + size_t shared_count = barcode_extractor_->GetIntersectionSize(edge.getStart(), edge.getEnd()); + + return static_cast(shared_count); +} +} +} \ No newline at end of file diff --git a/src/common/modules/path_extend/read_cloud_path_extend/scaffold_graph_construction/read_cloud_connection_conditions.hpp b/src/common/modules/path_extend/read_cloud_path_extend/scaffold_graph_construction/read_cloud_connection_conditions.hpp new file mode 100644 index 0000000000..8ffc8af8a1 --- /dev/null +++ b/src/common/modules/path_extend/read_cloud_path_extend/scaffold_graph_construction/read_cloud_connection_conditions.hpp @@ -0,0 +1,122 @@ +//*************************************************************************** +//* Copyright (c) 2019 Saint Petersburg State University +//* All Rights Reserved +//* See file LICENSE for details. +//*************************************************************************** + +#pragma once + +#include "auxiliary_graphs/scaffold_graph/scaffold_graph.hpp" +#include "auxiliary_graphs/scaffold_graph/scaffold_vertex_index.hpp" +#include "modules/path_extend/path_extender.hpp" +#include "modules/path_extend/pipeline/launch_support.hpp" +#include "modules/path_extend/extension_chooser.hpp" +#include "modules/path_extend/scaffolder2015/connection_condition2015.hpp" + +namespace path_extend { +namespace read_cloud { + +class ScaffoldEdgePredicate : public func::AbstractPredicate { + public: + typedef scaffold_graph::ScaffoldGraph ScaffoldGraph; + typedef scaffold_graph::ScaffoldGraph::ScaffoldEdge ScaffoldEdge; + + virtual ~ScaffoldEdgePredicate() = default; +}; + +class SimpleSearcher { + public: + typedef scaffold_graph::ScaffoldGraph ScaffoldGraph; + typedef ScaffoldGraph::ScaffoldGraphVertex ScaffoldVertex; + + struct VertexWithDistance { + ScaffoldVertex vertex; + size_t distance; + VertexWithDistance(const ScaffoldVertex &vertex, size_t distance); + }; + + SimpleSearcher(const scaffold_graph::ScaffoldGraph &graph_, const Graph &g, size_t distance_); + + std::vector GetReachableVertices(const ScaffoldVertex &vertex, + const ScaffoldGraph::ScaffoldEdge &restricted_edge); + void ProcessVertex(std::queue &vertex_queue, const VertexWithDistance &vertex, + std::unordered_set &visited, const ScaffoldGraph::ScaffoldEdge &restricted_edge); + bool AreEqual(const ScaffoldGraph::ScaffoldEdge &first, const ScaffoldGraph::ScaffoldEdge &second); + + private: + const ScaffoldGraph &scaff_graph_; + const Graph &g_; + size_t distance_threshold_; + + DECL_LOGGER("SimpleSearcher"); +}; + +class TransitiveEdgesPredicate : public ScaffoldEdgePredicate { + public: + using ScaffoldEdgePredicate::ScaffoldEdge; + typedef scaffold_graph::ScaffoldVertex ScaffoldVertex; + + TransitiveEdgesPredicate(const scaffold_graph::ScaffoldGraph &graph, const Graph &g, size_t distance_threshold); + + bool Check(const ScaffoldEdge &scaffold_edge) const override; + + private: + const scaffold_graph::ScaffoldGraph scaffold_graph_; + const Graph &g_; + size_t distance_threshold_; + + DECL_LOGGER("TransitiveEdgesPredicate"); +}; + +class ScaffoldEdgeScoreFunction { + public: + typedef scaffold_graph::ScaffoldGraph ScaffoldGraph; + typedef scaffold_graph::ScaffoldGraph::ScaffoldEdge ScaffoldEdge; + virtual double GetScore(const scaffold_graph::ScaffoldGraph::ScaffoldEdge &edge) const = 0; + virtual ~ScaffoldEdgeScoreFunction() = default; +}; + +class AbstractBarcodeScoreFunction : public ScaffoldEdgeScoreFunction { + public: + AbstractBarcodeScoreFunction( + const Graph &graph_, + std::shared_ptr barcode_extractor_); + + protected: + const Graph &graph_; + std::shared_ptr barcode_extractor_; +}; + +class NormalizedBarcodeScoreFunction : public AbstractBarcodeScoreFunction { + public: + NormalizedBarcodeScoreFunction(const Graph &graph_, + std::shared_ptr barcode_extractor_); + + double GetScore(const scaffold_graph::ScaffoldGraph::ScaffoldEdge &edge) const override; + + protected: + using AbstractBarcodeScoreFunction::barcode_extractor_; + using AbstractBarcodeScoreFunction::graph_; + + DECL_LOGGER("NormalizedBarcodeScoreFunction"); +}; + +class TrivialBarcodeScoreFunction : public AbstractBarcodeScoreFunction { + public: + TrivialBarcodeScoreFunction( + const Graph &graph_, + std::shared_ptr barcode_extractor_, + size_t read_count_threshold_, + size_t tail_threshold_); + + double GetScore(const scaffold_graph::ScaffoldGraph::ScaffoldEdge &edge) const override; + + protected: + using AbstractBarcodeScoreFunction::barcode_extractor_; + using AbstractBarcodeScoreFunction::graph_; + const size_t read_count_threshold_; + const size_t tail_threshold_; +}; + +} +} \ No newline at end of file diff --git a/src/common/modules/path_extend/read_cloud_path_extend/scaffold_graph_extractor.cpp b/src/common/modules/path_extend/read_cloud_path_extend/scaffold_graph_extractor.cpp new file mode 100644 index 0000000000..ece4aea37c --- /dev/null +++ b/src/common/modules/path_extend/read_cloud_path_extend/scaffold_graph_extractor.cpp @@ -0,0 +1,77 @@ +//*************************************************************************** +//* Copyright (c) 2019 Saint Petersburg State University +//* All Rights Reserved +//* See file LICENSE for details. +//*************************************************************************** + +#include "scaffold_graph_extractor.hpp" + +namespace path_extend { +namespace read_cloud { + +std::vector ScaffoldGraphExtractor::ExtractReliableEdges( + const ScaffoldGraph &scaffold_graph) const { + std::vector result; + for (const ScaffoldGraph::ScaffoldEdge &edge: scaffold_graph.edges()) { + if (scaffold_graph.HasUniqueOutgoing(edge.getStart()) and scaffold_graph.HasUniqueIncoming(edge.getEnd())) { + result.push_back(edge); + } + } + return result; +} +std::vector ScaffoldGraphExtractor::ExtractMaxScoreEdges( + const ScaffoldGraphExtractor::ScaffoldGraph &scaffold_graph) const { + std::vector result; + std::unordered_map start_to_edge; + std::unordered_map end_to_edge; + size_t edge_counter = 0; + size_t edge_block = 10000; + for (const ScaffoldGraph::ScaffoldEdge &edge: scaffold_graph.edges()) { + double score = edge.getWeight(); + auto start = edge.getStart(); + auto end = edge.getEnd(); + bool is_max_score_edge = true; + for (const auto &in_edge: scaffold_graph.IncomingEdges(end)) { + double in_score = in_edge.getWeight(); + if (math::gr(in_score, score)) { + is_max_score_edge = false; + break; + } + } + if (not is_max_score_edge) { + continue; + } + for (const auto &out_edge: scaffold_graph.OutgoingEdges(start)) { + double out_score = out_edge.getWeight(); + if (math::gr(out_score, score)) { + is_max_score_edge = false; + break; + } + } + if (is_max_score_edge) { + if (start_to_edge.find(start) == start_to_edge.end() and end_to_edge.find(end) == end_to_edge.end()) { + start_to_edge.insert({start, edge}); + end_to_edge.insert({end, edge}); + result.push_back(edge); + } + } + ++edge_counter; + if (edge_counter % edge_block == 0) { + INFO("Processed " << edge_counter << " edges out of " << scaffold_graph.EdgeCount()); + } + } + return result; +} +std::unordered_map ScaffoldGraphExtractor::GetFirstEdgeMap( + const ScaffoldGraphExtractor::ScaffoldGraph &scaffold_graph, const func::TypedPredicate &pred) const { + std::unordered_map result; + for (const ScaffoldVertex &vertex: scaffold_graph.vertices()) { + auto first_edge = vertex.GetFirstEdgeWithPredicate(pred); + if (first_edge.is_initialized()) { + result[first_edge.get()].insert(vertex); + } + } + return result; +} +} +} \ No newline at end of file diff --git a/src/common/modules/path_extend/read_cloud_path_extend/scaffold_graph_extractor.hpp b/src/common/modules/path_extend/read_cloud_path_extend/scaffold_graph_extractor.hpp new file mode 100644 index 0000000000..7d4cc2caf9 --- /dev/null +++ b/src/common/modules/path_extend/read_cloud_path_extend/scaffold_graph_extractor.hpp @@ -0,0 +1,29 @@ +//*************************************************************************** +//* Copyright (c) 2019 Saint Petersburg State University +//* All Rights Reserved +//* See file LICENSE for details. +//*************************************************************************** + +#pragma once + +#include "auxiliary_graphs/scaffold_graph/scaffold_graph.hpp" + +namespace path_extend { +namespace read_cloud { + +class ScaffoldGraphExtractor { + public: + typedef debruijn_graph::EdgeId EdgeId; + typedef scaffold_graph::ScaffoldGraph ScaffoldGraph; + typedef ScaffoldGraph::ScaffoldEdge ScaffoldEdge; + typedef scaffold_graph::ScaffoldVertex ScaffoldVertex; + typedef std::unordered_set VertexSet; + + std::vector ExtractMaxScoreEdges(const ScaffoldGraph &scaffold_graph) const; + std::vector ExtractReliableEdges(const ScaffoldGraph &scaffold_graph) const; + std::unordered_map GetFirstEdgeMap(const ScaffoldGraph &scaffold_graph, + const func::TypedPredicate &pred) const; +}; + +} +} \ No newline at end of file diff --git a/src/common/modules/path_extend/scaffolder2015/connection_condition2015.cpp b/src/common/modules/path_extend/scaffolder2015/connection_condition2015.cpp index 40f58d49bc..e41b33109c 100644 --- a/src/common/modules/path_extend/scaffolder2015/connection_condition2015.cpp +++ b/src/common/modules/path_extend/scaffolder2015/connection_condition2015.cpp @@ -21,6 +21,9 @@ Connections ConnectionCondition::ConnectedWith(debruijn_graph::EdgeId e, } return res; } +bool ConnectionCondition::IsLast() const { + return false; +} PairedLibConnectionCondition::PairedLibConnectionCondition(const debruijn_graph::Graph &graph, std::shared_ptr lib, @@ -276,5 +279,8 @@ size_t AssemblyGraphConnectionCondition::GetLibIndex() const { int AssemblyGraphConnectionCondition::GetMedianGap (debruijn_graph::EdgeId, debruijn_graph::EdgeId) const { return 0; } +bool AssemblyGraphConnectionCondition::IsLast() const { + return true; +} } diff --git a/src/common/modules/path_extend/scaffolder2015/connection_condition2015.hpp b/src/common/modules/path_extend/scaffolder2015/connection_condition2015.hpp index b7027e30e4..836127c3f6 100644 --- a/src/common/modules/path_extend/scaffolder2015/connection_condition2015.hpp +++ b/src/common/modules/path_extend/scaffolder2015/connection_condition2015.hpp @@ -58,6 +58,7 @@ class ConnectionCondition { virtual Connections ConnectedWith(EdgeId e, const ScaffoldingUniqueEdgeStorage &storage) const; virtual int GetMedianGap(EdgeId e1, EdgeId e2) const = 0; virtual size_t GetLibIndex() const = 0; + virtual bool IsLast() const; virtual ~ConnectionCondition() { } }; @@ -150,6 +151,7 @@ class AssemblyGraphConnectionCondition : public ConnectionCondition { void AddInterestingEdges(func::TypedPredicate edge_condition); Connections ConnectedWith(EdgeId e) const override; size_t GetLibIndex() const override; + virtual bool IsLast() const override; int GetMedianGap(EdgeId, EdgeId ) const override; }; } diff --git a/src/common/modules/path_extend/scaffolder2015/scaffold_graph.hpp b/src/common/modules/path_extend/scaffolder2015/scaffold_graph.hpp index a45a251eb8..dded2d9ce1 100644 --- a/src/common/modules/path_extend/scaffolder2015/scaffold_graph.hpp +++ b/src/common/modules/path_extend/scaffolder2015/scaffold_graph.hpp @@ -17,7 +17,7 @@ #include "adt/iterator_range.hpp" namespace path_extend { -namespace scaffold_graph { +namespace scaffolder { //do NOT add "using namespace debruijn_graph" in order not to confuse between EdgeId typdefs diff --git a/src/common/modules/path_extend/scaffolder2015/scaffold_graph_constructor.cpp b/src/common/modules/path_extend/scaffolder2015/scaffold_graph_constructor.cpp index 5fe211c861..17171f84b0 100644 --- a/src/common/modules/path_extend/scaffolder2015/scaffold_graph_constructor.cpp +++ b/src/common/modules/path_extend/scaffolder2015/scaffold_graph_constructor.cpp @@ -9,11 +9,13 @@ // Created by andrey on 04.12.15. // +#include "common/modules/path_extend/scaffolder2015/scaffold_graph_dijkstra.hpp" #include "scaffold_graph_constructor.hpp" +#include "scaffold_graph_dijkstra.hpp" namespace path_extend { -namespace scaffold_graph { +namespace scaffolder { void BaseScaffoldGraphConstructor::ConstructFromEdgeConditions(func::TypedPredicate edge_condition, ConnectionConditions &connection_conditions, @@ -28,7 +30,10 @@ void BaseScaffoldGraphConstructor::ConstructFromEdgeConditions(func::TypedPredic void BaseScaffoldGraphConstructor::ConstructFromSet(const EdgeSet &edge_set, ConnectionConditions &connection_conditions, bool use_terminal_vertices_only) { - graph_->AddVertices(edge_set); + for (const auto &v: edge_set) { + graph_->AddVertex(v); + } + INFO("Added vertices") ConstructFromConditions(connection_conditions, use_terminal_vertices_only); } @@ -36,7 +41,7 @@ void BaseScaffoldGraphConstructor::ConstructFromConditions(ConnectionConditions bool use_terminal_vertices_only) { //TODO :: awful. It depends on ordering of connected conditions. for (auto condition : connection_conditions) { - if (condition->GetLibIndex() == (size_t) -1) + if (condition->IsLast()) ConstructFromSingleCondition(condition, true); else ConstructFromSingleCondition(condition, use_terminal_vertices_only); @@ -51,7 +56,8 @@ void BaseScaffoldGraphConstructor::ConstructFromSingleCondition(const std::share if (use_terminal_vertices_only && graph_->OutgoingEdgeCount(v) > 0) continue; - auto connected_with = condition->ConnectedWith(v); + EdgeId e = v.GetFirstEdge(); + auto connected_with = condition->ConnectedWith(e); for (const auto& pair : connected_with) { EdgeId connected = pair.first; double w = pair.second; @@ -59,23 +65,287 @@ void BaseScaffoldGraphConstructor::ConstructFromSingleCondition(const std::share if (graph_->Exists(connected)) { if (use_terminal_vertices_only && graph_->IncomingEdgeCount(connected) > 0) continue; - graph_->AddEdge(v, connected, condition->GetLibIndex(), w); + graph_->AddEdge(e, connected, condition->GetLibIndex(), w, 0); } } } } - -std::shared_ptr SimpleScaffoldGraphConstructor::Construct() { +std::shared_ptr SimpleScaffoldGraphConstructor::Construct() { ConstructFromSet(edge_set_, connection_conditions_); return graph_; } -std::shared_ptr DefaultScaffoldGraphConstructor::Construct() { +std::shared_ptr DefaultScaffoldGraphConstructor::Construct() { ConstructFromSet(edge_set_, connection_conditions_); ConstructFromEdgeConditions(edge_condition_, connection_conditions_); return graph_; } +PredicateScaffoldGraphFilter::PredicateScaffoldGraphFilter(const Graph &assembly_graph, + const ScaffoldGraph &old_graph, + std::shared_ptr predicate, + size_t max_threads) + : BaseScaffoldGraphConstructor(assembly_graph), old_graph_(old_graph), + predicate_(predicate), max_threads_(max_threads) {} + +void PredicateScaffoldGraphFilter::ConstructFromGraphAndPredicate(const ScaffoldGraph &old_graph, + std::shared_ptr predicate) { + for (const auto& vertex: old_graph.vertices()) { + graph_->AddVertex(vertex); + } + std::vector scaffold_edges; + for (const auto& edge: old_graph.edges()) { + scaffold_edges.push_back(edge); + } + size_t counter = 0; + const size_t block_size = scaffold_edges.size() / 10; + size_t threads = max_threads_; + DEBUG("Number of threads: " << threads); +#pragma omp parallel for num_threads(threads) + for (size_t i = 0; i < scaffold_edges.size(); ++i) { + auto edge = scaffold_edges[i]; + TRACE("Checking"); + bool check_predicate = (*predicate)(edge); + TRACE("Check result: " << check_predicate); +#pragma omp critical + { + if (check_predicate) { + graph_->AddEdge(edge); + } + ++counter; + if (block_size != 0 and counter % block_size == 0) { + DEBUG("Processed " << counter << " edges out of " << scaffold_edges.size()); + } + } + } +} + +std::shared_ptr PredicateScaffoldGraphFilter::Construct() { + ConstructFromGraphAndPredicate(old_graph_, predicate_); + return graph_; +} +ScoreFunctionScaffoldGraphFilter::ScoreFunctionScaffoldGraphFilter(const Graph &assembly_graph, + const ScaffoldGraph &old_graph, + std::shared_ptr score_function, + double score_threshold, size_t num_threads) + : BaseScaffoldGraphConstructor(assembly_graph), old_graph_(old_graph), + score_function_(score_function), score_threshold_(score_threshold), num_threads_(num_threads) {} + +void ScoreFunctionScaffoldGraphFilter::ConstructFromGraphAndScore(const ScaffoldGraph &graph, + std::shared_ptr score_function, + double score_threshold, size_t threads) { + for (const auto& vertex: graph.vertices()) { + graph_->AddVertex(vertex); + } + std::vector scaffold_edges; + for (const auto& edge: graph.edges()) { + scaffold_edges.push_back(edge); + } + size_t counter = 0; + const size_t block_size = scaffold_edges.size() / 25; + #pragma omp parallel for num_threads(threads) + for (size_t i = 0; i < scaffold_edges.size(); ++i) { + ScaffoldGraph::ScaffoldEdge edge = scaffold_edges[i]; + double score = score_function->GetScore(edge); + #pragma omp critical + { + TRACE("Checking edge " << edge.getStart().int_id() << " -> " << edge.getEnd().int_id()); + TRACE("Score: " << score); + TRACE("Score threshold: " << score_threshold); + if (math::ge(score, score_threshold)) { + TRACE("Success"); + graph_->AddEdge(edge.getStart(), edge.getEnd(), edge.getColor(), score, edge.getLength()); + } + TRACE("Edge added"); + ++counter; + if (counter % block_size == 0) { + INFO("Processed " << counter << " edges out of " << scaffold_edges.size()); + } + } + } +} +std::shared_ptr ScoreFunctionScaffoldGraphFilter::Construct() { + ConstructFromGraphAndScore(old_graph_, score_function_, score_threshold_, num_threads_); + return graph_; +} +std::shared_ptr ScoreFunctionGraphConstructor::Construct() { + ConstructFromScore(score_function_, score_threshold_); + return graph_; +} +ScoreFunctionGraphConstructor::ScoreFunctionGraphConstructor(const Graph &assembly_graph, + std::vector chunks, + std::shared_ptr score_function, + double score_threshold, + size_t num_threads): + BaseScaffoldGraphConstructor(assembly_graph), + chunks_(chunks), + score_function_(score_function), + score_threshold_(score_threshold), + num_threads_(num_threads) {} +void ScoreFunctionGraphConstructor::ConstructFromScore(std::shared_ptr score_function, + double score_threshold) { + for (const auto &chunk: chunks_) { + graph_->AddVertex(chunk.vertex_); + } + size_t approx_block_counter = 0; + const size_t NUM_BLOCKS = 100; + const size_t block_size = chunks_.size() / NUM_BLOCKS; +#pragma omp parallel for schedule(guided) + for (size_t i = 0; i < chunks_.size(); ++i) { + for (auto it = chunks_[i].begin_; it != chunks_[i].end_; ++it) { + const ScaffoldVertex &first = chunks_[i].vertex_; + //todo move this check elsewhere + if (first != *it and first.GetConjugateFromGraph(graph_->AssemblyGraph()) != *it) { + ScaffoldGraph::ScaffoldEdge edge(first, *it, 0, .0, 0); + double score = score_function->GetScore(edge); + if (math::ge(score, score_threshold)) { + #pragma omp critical + { + TRACE("Success"); + ScaffoldGraph::ScaffoldEdge new_edge(first, *it, 0, score, 0); + graph_->AddEdgeSimple(new_edge); + } + } + } + } + if (i % block_size == 0 and i != 0) { +#pragma omp critical + { + ++approx_block_counter; + INFO("Processed " << approx_block_counter << " out of " << NUM_BLOCKS << " blocks"); + } + } + } +} + +std::shared_ptr ScaffoldSubgraphConstructor::Construct() { + for (const ScaffoldVertex& vertex: large_graph_.vertices()) { + if (vertex_condition_(vertex)) { + graph_->AddVertex(vertex); + } + } + INFO(graph_->VertexCount() << " vertices"); + + //todo add distance calculation + ScaffoldDijkstraHelper helper; + for (const ScaffoldVertex& vertex: graph_->vertices()) { + auto scaffold_dijkstra = helper.CreatePredicateBasedScaffoldDijkstra(large_graph_, vertex, vertex_condition_); + scaffold_dijkstra.Run(vertex); + for (auto reached: scaffold_dijkstra.ReachedVertices()) { + size_t distance = scaffold_dijkstra.GetDistance(reached); + if (distance < distance_threshold_ and vertex_condition_(reached) and vertex != reached) { + graph_->AddEdge(vertex, reached, (size_t) - 1, 0, distance); + } + } + } + return graph_; +} +ScaffoldSubgraphConstructor::ScaffoldSubgraphConstructor(const Graph &assembly_graph, + const func::TypedPredicate &vertex_condition, + const ScaffoldGraph &large_graph, + const size_t distance_threshold) + : BaseScaffoldGraphConstructor(assembly_graph), + vertex_condition_(vertex_condition), + large_graph_(large_graph), + distance_threshold_(distance_threshold) {} +ScoreFunctionScaffoldGraphConstructor::ScoreFunctionScaffoldGraphConstructor( + const Graph &assembly_graph, + const std::set &scaffold_vertices, + const std::shared_ptr &score_function, + double score_threshold, + size_t num_threads) + : BaseScaffoldGraphConstructor(assembly_graph), + scaffold_vertices_(scaffold_vertices), + score_function_(score_function), + score_threshold_(score_threshold), + num_threads_(num_threads) {} + +std::shared_ptr ScoreFunctionScaffoldGraphConstructor::Construct() { + for (const auto& vertex: scaffold_vertices_) { + graph_->AddVertex(vertex); + } + //fixme switch to tbb or use chunk splitter + std::vector scaffold_vertex_vec; + for (const auto& vertex: scaffold_vertices_) { + scaffold_vertex_vec.push_back(vertex); + } + size_t counter = 0; + size_t edges_size = scaffold_vertices_.size() * scaffold_vertices_.size(); + const size_t block_size = edges_size / 10; +#pragma omp parallel for num_threads(num_threads_) + for (size_t i = 0; i < scaffold_vertex_vec.size(); ++i) { + for (size_t j = 0; j < scaffold_vertex_vec.size(); ++j) { + const ScaffoldVertex& from = scaffold_vertex_vec[i]; + const ScaffoldVertex& to = scaffold_vertex_vec[j]; + ScaffoldGraph::ScaffoldEdge edge(from, to); + double score = score_function_->GetScore(edge); +#pragma omp critical + { + TRACE("Checking edge " << edge.getStart().int_id() << " -> " << edge.getEnd().int_id()); + TRACE("Score: " << score); + TRACE("Score threshold: " << score_threshold_); + bool are_conjugate = from == to.GetConjugateFromGraph(graph_->AssemblyGraph()); + if (math::ge(score, score_threshold_) and from != to and not are_conjugate) { + TRACE("Success"); + graph_->AddEdge(edge.getStart(), edge.getEnd(), edge.getColor(), score, edge.getLength()); + } + TRACE("Edge added"); + ++counter; + if (block_size != 0 and counter % block_size == 0) { + DEBUG("Processed " << counter << " edges out of " << edges_size); + } + } + } + } + return graph_; +} +std::shared_ptr InternalScoreScaffoldGraphFilter::Construct() { + for (const auto& vertex: old_graph_.vertices()) { + graph_->AddVertex(vertex); + } + for (const ScaffoldVertex &vertex: old_graph_.vertices()) { + auto outgoing = old_graph_.OutgoingEdges(vertex); + auto incoming = old_graph_.IncomingEdges(vertex); + ProcessEdges(incoming); + ProcessEdges(outgoing); + } + return graph_; +} +boost::optional InternalScoreScaffoldGraphFilter::GetWinnerVertex( + std::vector &edges) const { + boost::optional result; + if (edges.size() < 2) { + return result; + } + std::sort(edges.begin(), edges.end(), [this](const ScaffoldEdge &first, const ScaffoldEdge &second) { + return math::gr(score_function_->GetScore(first), score_function_->GetScore(second)); + }); + const double top_score = score_function_->GetScore(edges[0]); + const double second_score = score_function_->GetScore(edges[1]); + if (math::gr(top_score, relative_threshold_ * second_score)) { + return edges[0]; + } + return result; +} +void InternalScoreScaffoldGraphFilter::ProcessEdges(std::vector &edges) { + boost::optional incoming_winner = GetWinnerVertex(edges); + if (incoming_winner.is_initialized()) { + graph_->AddEdge(incoming_winner.get()); + } else { + for (const auto &edge: edges) { + graph_->AddEdge(edge); + } + } +} +InternalScoreScaffoldGraphFilter::InternalScoreScaffoldGraphFilter( + const Graph &assembly_graph, + const ScaffoldGraph &old_graph, + std::shared_ptr score_function, + double relative_threshold) + : BaseScaffoldGraphConstructor(assembly_graph), + old_graph_(old_graph), + score_function_(score_function), + relative_threshold_(relative_threshold) {} } //scaffold_graph } //path_extend diff --git a/src/common/modules/path_extend/scaffolder2015/scaffold_graph_constructor.hpp b/src/common/modules/path_extend/scaffolder2015/scaffold_graph_constructor.hpp index 833f66d819..232fd04b61 100644 --- a/src/common/modules/path_extend/scaffolder2015/scaffold_graph_constructor.hpp +++ b/src/common/modules/path_extend/scaffolder2015/scaffold_graph_constructor.hpp @@ -11,24 +11,33 @@ #pragma once -#include "scaffold_graph.hpp" +#include "auxiliary_graphs/scaffold_graph/scaffold_graph.hpp" +#include "connection_condition2015.hpp" +#include "modules/path_extend/read_cloud_path_extend/scaffold_graph_construction/read_cloud_connection_conditions.hpp" + +#include +#include namespace path_extend { -namespace scaffold_graph { +namespace scaffolder { typedef std::vector> ConnectionConditions; //Iterface class ScaffoldGraphConstructor { - public: + typedef scaffold_graph::ScaffoldGraph ScaffoldGraph; + virtual std::shared_ptr Construct() = 0; }; //Basic scaffold graph constructor functions class BaseScaffoldGraphConstructor: public ScaffoldGraphConstructor { protected: + using ScaffoldGraphConstructor::ScaffoldGraph; + typedef scaffold_graph::ScaffoldVertex ScaffoldVertex; + std::shared_ptr graph_; BaseScaffoldGraphConstructor(const debruijn_graph::Graph& assembly_graph) { @@ -48,11 +57,15 @@ class BaseScaffoldGraphConstructor: public ScaffoldGraphConstructor { void ConstructFromEdgeConditions(func::TypedPredicate edge_condition, ConnectionConditions &connection_conditions, bool use_terminal_vertices_only = false); + + DECL_LOGGER("BaseScaffoldGraphConstructor"); }; class SimpleScaffoldGraphConstructor: public BaseScaffoldGraphConstructor { protected: + using BaseScaffoldGraphConstructor::ScaffoldGraph; + const EdgeSet &edge_set_; ConnectionConditions &connection_conditions_; @@ -68,6 +81,8 @@ class SimpleScaffoldGraphConstructor: public BaseScaffoldGraphConstructor { class DefaultScaffoldGraphConstructor: public SimpleScaffoldGraphConstructor { protected: + using SimpleScaffoldGraphConstructor::ScaffoldGraph; + func::TypedPredicate edge_condition_; public: @@ -82,6 +97,150 @@ class DefaultScaffoldGraphConstructor: public SimpleScaffoldGraphConstructor { std::shared_ptr Construct() override; }; +class ScaffoldSubgraphConstructor: public BaseScaffoldGraphConstructor { + using BaseScaffoldGraphConstructor::ScaffoldGraph; + + func::TypedPredicate vertex_condition_; + const ScaffoldGraph& large_graph_; + const size_t distance_threshold_; + + public: + ScaffoldSubgraphConstructor(const Graph &assembly_graph, + const func::TypedPredicate &vertex_condition, + const ScaffoldGraph &large_graph, + const size_t distance_threshold); + + std::shared_ptr Construct() override; +}; + +class PredicateScaffoldGraphFilter: public BaseScaffoldGraphConstructor { + public: + typedef read_cloud::ScaffoldEdgePredicate EdgePairPredicate; + using BaseScaffoldGraphConstructor::ScaffoldGraph; + using BaseScaffoldGraphConstructor::ScaffoldVertex; + protected: + const ScaffoldGraph& old_graph_; + const std::shared_ptr predicate_; + const size_t max_threads_; + + public: + PredicateScaffoldGraphFilter(const Graph& assembly_graph, + const ScaffoldGraph& old_graph, + std::shared_ptr predicate, + size_t max_threads); + + std::shared_ptr Construct() override; + protected: + void ConstructFromGraphAndPredicate(const ScaffoldGraph& old_graph, std::shared_ptr predicate); + + DECL_LOGGER("PredicateScaffoldGraphFilter"); + +}; + +struct ScaffoldVertexPairChunk { + public: + using ScaffoldVertex = scaffold_graph::ScaffoldVertex; + //todo other containers? + using scaffold_vertex_iterator = std::unordered_set::const_iterator; + + ScaffoldVertexPairChunk(const ScaffoldVertex &vertex, + scaffold_vertex_iterator begin, + scaffold_vertex_iterator end) : vertex_(vertex), begin_(begin), end_(end) {} + + ScaffoldVertex vertex_; + scaffold_vertex_iterator begin_; + scaffold_vertex_iterator end_; +}; + +class ScoreFunctionGraphConstructor: public BaseScaffoldGraphConstructor { + public: + typedef read_cloud::ScaffoldEdgeScoreFunction EdgePairScoreFunction; + using BaseScaffoldGraphConstructor::ScaffoldGraph; + using BaseScaffoldGraphConstructor::ScaffoldVertex; + + ScoreFunctionGraphConstructor(const Graph &assembly_graph, + std::vector chunks, + std::shared_ptr score_function, + double score_threshold, size_t num_threads); + + std::shared_ptr Construct() override; + private: + void ConstructFromScore(std::shared_ptr score_function, + double score_threshold); + + std::vector chunks_; + const std::shared_ptr score_function_; + const double score_threshold_; + const size_t num_threads_; + DECL_LOGGER("ScoreFunctionScaffoldGraphConstructor") +}; + +class ScoreFunctionScaffoldGraphFilter: public BaseScaffoldGraphConstructor { + typedef read_cloud::ScaffoldEdgeScoreFunction EdgePairScoreFunction; + using BaseScaffoldGraphConstructor::ScaffoldGraph; + using BaseScaffoldGraphConstructor::ScaffoldVertex; + protected: + const ScaffoldGraph &old_graph_; + const std::shared_ptr score_function_; + const double score_threshold_; + const size_t num_threads_; + public: + ScoreFunctionScaffoldGraphFilter(const Graph& assembly_graph, + const ScaffoldGraph& old_graph, + std::shared_ptr score_function, + double score_threshold, size_t num_threads); + + std::shared_ptr Construct() override; + protected: + void ConstructFromGraphAndScore(const ScaffoldGraph& graph, std::shared_ptr score_function, + double score_threshold, size_t threads); + DECL_LOGGER("ScoreFunctionScaffoldGraphConstructor") +}; + +class InternalScoreScaffoldGraphFilter: public BaseScaffoldGraphConstructor { + typedef read_cloud::ScaffoldEdgeScoreFunction EdgePairScoreFunction; + using BaseScaffoldGraphConstructor::ScaffoldGraph; + using BaseScaffoldGraphConstructor::ScaffoldVertex; + typedef ScaffoldGraph::ScaffoldEdge ScaffoldEdge; + protected: + const ScaffoldGraph &old_graph_; + std::shared_ptr score_function_; + const double relative_threshold_; + public: + InternalScoreScaffoldGraphFilter(const Graph &assembly_graph, + const ScaffoldGraph &old_graph, + std::shared_ptr score_function, + double relative_threshold); + + std::shared_ptr Construct() override; + private: + void ProcessEdges(std::vector &edges); + + boost::optional GetWinnerVertex(std::vector &edges) const; +}; + +class ScoreFunctionScaffoldGraphConstructor: public BaseScaffoldGraphConstructor { + typedef read_cloud::ScaffoldEdgeScoreFunction EdgePairScoreFunction; + using BaseScaffoldGraphConstructor::ScaffoldGraph; + using BaseScaffoldGraphConstructor::ScaffoldVertex; + + protected: + const std::set scaffold_vertices_; + const std::shared_ptr score_function_; + const double score_threshold_; + const size_t num_threads_; + + public: + ScoreFunctionScaffoldGraphConstructor(const Graph &assembly_graph, + const std::set &scaffold_vertices, + const std::shared_ptr &score_function, + double score_threshold, + size_t num_threads); + + std::shared_ptr Construct() override; + + DECL_LOGGER("ScoreFunctionScaffoldGraphConstructor"); +}; } //scaffold_graph } //path_extend diff --git a/src/common/modules/path_extend/scaffolder2015/scaffold_graph_dijkstra.hpp b/src/common/modules/path_extend/scaffolder2015/scaffold_graph_dijkstra.hpp new file mode 100644 index 0000000000..5948e84f28 --- /dev/null +++ b/src/common/modules/path_extend/scaffolder2015/scaffold_graph_dijkstra.hpp @@ -0,0 +1,246 @@ +//*************************************************************************** +//* Copyright (c) 2019 Saint Petersburg State University +//* All Rights Reserved +//* See file LICENSE for details. +//*************************************************************************** + +#pragma once + +#include "assembly_graph/dijkstra/dijkstra_helper.hpp" +#include "auxiliary_graphs/scaffold_graph/scaffold_graph.hpp" +#include "modules/path_extend/scaffolder2015/scaffold_vertex_predicates.hpp" + +namespace omnigraph { +template<> +class ForwardNeighbourIterator { + typedef scaffold_graph::ScaffoldGraph ScaffoldGraph; + typedef typename ScaffoldGraph::VertexId VertexId; + typedef typename ScaffoldGraph::EdgeId EdgeId; + typedef typename std::vector::const_iterator edge_const_iterator; + std::vector out_edges_; + edge_const_iterator current_; + public: + ForwardNeighbourIterator(const ScaffoldGraph &graph, VertexId vertex) : + out_edges_(graph.OutgoingEdges(vertex)), current_(out_edges_.begin()) {} + + bool HasNext() { + return current_ != out_edges_.end(); + } + + vertex_neighbour Next() { + TRACE("Before increment"); + TRACE(current_->getStart().int_id() << ", " << current_->getEnd().int_id()); + vertex_neighbour res(current_->getEnd(), *current_); + current_++; + return res; + } + + DECL_LOGGER("ScaffoldForwardNeighbourItetator"); +}; + +template<> +class BackwardNeighbourIterator { + typedef scaffold_graph::ScaffoldGraph ScaffoldGraph; + typedef typename ScaffoldGraph::VertexId VertexId; + typedef typename ScaffoldGraph::EdgeId EdgeId; + typedef typename std::vector::const_iterator edge_const_iterator; + + std::vector in_edges_; + edge_const_iterator current_; + public: + BackwardNeighbourIterator(const ScaffoldGraph &graph, VertexId vertex) : + in_edges_(graph.IncomingEdges(vertex)), current_(in_edges_.begin()) {} + + bool HasNext() { + return current_ != in_edges_.end(); + } + + vertex_neighbour Next() { + vertex_neighbour res(current_->getStart(), *current_); + current_++; + return res; + } +}; +} + +namespace path_extend { + +namespace scaffolder { + +template +class SimpleScaffoldGraphLengthCalculator { + protected: + typedef typename Graph::EdgeId EdgeId; + typedef typename Graph::VertexId VertexId; + public: + distance_t GetLength(EdgeId) const { + return 1; + } +}; + +template +class DistanceBasedScaffoldGraphLengthCalculator { + protected: + typedef typename Graph::EdgeId EdgeId; + typedef typename Graph::VertexId VertexId; + + const Graph &graph_; + public: + explicit DistanceBasedScaffoldGraphLengthCalculator(const Graph &graph) : graph_(graph) {} + distance_t GetLength(EdgeId edge) const { + return graph_.length(edge) + graph_.length(edge.getEnd()); + } +}; + +template +class ScaffoldBarcodedPathPutChecker { + typedef typename Graph::VertexId VertexId; + typedef typename Graph::EdgeId EdgeId; + + const Graph &g_; + const VertexId first_; + const VertexId second_; + std::shared_ptr predicate_; + + public: + ScaffoldBarcodedPathPutChecker(const Graph &g, const VertexId &first, const VertexId &second, + std::shared_ptr predicate) : + g_(g), + first_(first), + second_(second), + predicate_(predicate) { + TRACE("Construction"); + TRACE("First id: " << first_.int_id()); + TRACE("Second id: " << second_.int_id()); + } + + bool Check(VertexId vertex, EdgeId /*unused*/, distance_t distance) const { + TRACE("Checking vertex " << g_.str(vertex)); + TRACE("Id: " << vertex.int_id()); + TRACE("First id: " << first_.int_id()); + TRACE("Second id: " << second_.int_id()); + bool target_reached = distance > 0 and (vertex == first_ or vertex == second_); + if (target_reached) { + TRACE("Target reached"); + return false; + } + TRACE("Checking"); + return predicate_->Check(vertex); + } + DECL_LOGGER("ScaffoldBarcodePutChecker"); +}; + +template +class StartPredicateProcessChecker { + typedef typename Graph::VertexId VertexId; + typedef typename Graph::EdgeId EdgeId; + + const Graph &g_; + const VertexId start_; + const func::TypedPredicate &predicate_; + public: + StartPredicateProcessChecker(const Graph &g, + const VertexId &start, + const func::TypedPredicate &predicate) + : g_(g), start_(start), predicate_(predicate) {} + + bool Check(VertexId vertex, distance_t /*distance*/) const { + return vertex == start_ or not predicate_(vertex); + } +}; + +template +class TrivialScaffoldPutChecker { + typedef typename Graph::VertexId VertexId; + typedef typename Graph::EdgeId EdgeId; + + public: + TrivialScaffoldPutChecker() {} + + bool Check(VertexId /*unused*/, EdgeId /*unused*/, distance_t /*unused*/) const { + return true; + } + DECL_LOGGER("TrivialScaffoldPutChecker"); +}; + +typedef omnigraph::ComposedDijkstraSettings, + StartPredicateProcessChecker, + TrivialScaffoldPutChecker, + omnigraph::ForwardNeighbourIteratorFactory > + PredicateBasedScaffoldDijkstraSettings; + +typedef omnigraph::Dijkstra + PredicateBasedScaffoldDijkstra; + +//forward scaffold dijkstra + +typedef omnigraph::ComposedDijkstraSettings, + omnigraph::BoundedVertexTargetedProcessChecker, + ScaffoldBarcodedPathPutChecker, + omnigraph::ForwardNeighbourIteratorFactory > + ForwardBoundedScaffoldDijkstraSettings; + +typedef omnigraph::Dijkstra + ForwardBoundedScaffoldDijkstra; + +//backward scaffold dijkstra + +typedef omnigraph::ComposedDijkstraSettings, + omnigraph::BoundedVertexTargetedProcessChecker, + ScaffoldBarcodedPathPutChecker, + omnigraph::BackwardNeighbourIteratorFactory > + BackwardBoundedScaffoldDijkstraSettings; + +typedef omnigraph::Dijkstra + BackwardBoundedScaffoldDijkstra; + +class ScaffoldDijkstraHelper { + public: + static BackwardBoundedScaffoldDijkstra CreateBackwardBoundedScaffoldDijkstra( + const scaffold_graph::ScaffoldGraph &graph, + const scaffold_graph::ScaffoldVertex first, + const scaffold_graph::ScaffoldVertex second, + size_t length_bound, + std::shared_ptr predicate, + size_t max_vertex_number = -1ul) { + return BackwardBoundedScaffoldDijkstra(graph, BackwardBoundedScaffoldDijkstraSettings( + SimpleScaffoldGraphLengthCalculator(), + omnigraph::BoundedVertexTargetedProcessChecker(first, length_bound), + ScaffoldBarcodedPathPutChecker(graph, first, second, predicate), + omnigraph::BackwardNeighbourIteratorFactory(graph)), + max_vertex_number); + } + + static ForwardBoundedScaffoldDijkstra CreateForwardBoundedScaffoldDijkstra( + const scaffold_graph::ScaffoldGraph &graph, + const scaffold_graph::ScaffoldVertex &first, + const scaffold_graph::ScaffoldVertex &second, + size_t length_bound, + std::shared_ptr predicate, + size_t max_vertex_number = -1ul) { + return ForwardBoundedScaffoldDijkstra(graph, ForwardBoundedScaffoldDijkstraSettings( + SimpleScaffoldGraphLengthCalculator(), + omnigraph::BoundedVertexTargetedProcessChecker(second, length_bound), + ScaffoldBarcodedPathPutChecker(graph, first, second, predicate), + omnigraph::ForwardNeighbourIteratorFactory(graph)), + max_vertex_number); + } + + static PredicateBasedScaffoldDijkstra CreatePredicateBasedScaffoldDijkstra( + const scaffold_graph::ScaffoldGraph &graph, + const scaffold_graph::ScaffoldVertex &vertex, + const func::TypedPredicate &predicate, + size_t max_vertex_number = -1ul) { + return PredicateBasedScaffoldDijkstra(graph, PredicateBasedScaffoldDijkstraSettings( + DistanceBasedScaffoldGraphLengthCalculator(graph), + StartPredicateProcessChecker(graph, vertex, predicate), + TrivialScaffoldPutChecker(), + omnigraph::ForwardNeighbourIteratorFactory(graph)), + max_vertex_number); + } +}; +} +} \ No newline at end of file diff --git a/src/common/modules/path_extend/scaffolder2015/scaffold_graph_visualizer.cpp b/src/common/modules/path_extend/scaffolder2015/scaffold_graph_visualizer.cpp index f86a23d29b..1f20eb2369 100644 --- a/src/common/modules/path_extend/scaffolder2015/scaffold_graph_visualizer.cpp +++ b/src/common/modules/path_extend/scaffolder2015/scaffold_graph_visualizer.cpp @@ -13,7 +13,7 @@ namespace path_extend { -namespace scaffold_graph { +namespace scaffolder { const std::map ScaffoldEdgeColorer::color_map = {{(size_t) -1, "black"}, @@ -36,12 +36,12 @@ std::string ScaffoldGraphLabeler::label(VertexId v) const { auto it = additional_vertex_labels_.find(v); std::string additional_label = it == additional_vertex_labels_.end() ? "" : it->second + "\n"; return "ID: " + std::to_string(graph_.int_id(v)) + - "\\n Len: " + std::to_string(graph_.AssemblyGraph().length(v)) + - "\\n Cov: " + std::to_string(graph_.AssemblyGraph().coverage(v)) + "\n" + + "\\n Len: " + std::to_string(graph_.length(v)) + + "\\n Cov: " + std::to_string(graph_.coverage(v)) + "\n" + additional_label; } -void ScaffoldGraphVisualizer::Visualize(graph_printer::GraphPrinter &printer) { +void ScaffoldGraphVisualizer::Visualize(graph_printer::GraphPrinter &printer) { printer.open(); printer.AddVertices(graph_.vbegin(), graph_.vend()); for (const auto& e : graph_.edges()) { @@ -51,15 +51,15 @@ void ScaffoldGraphVisualizer::Visualize(graph_printer::GraphPrinter &colorer) { + graph_colorer::CompositeGraphColorer &colorer) { ScaffoldGraphLabeler labeler(graph_, additional_vertex_labels_); - vertex_linker::EmptyGraphLinker linker; + vertex_linker::EmptyGraphLinker linker; - graph_printer::SingleGraphPrinter printer(graph_, os, labeler, colorer, linker); + graph_printer::SingleGraphPrinter printer(graph_, os, labeler, colorer, linker); Visualize(printer); } -std::string ScaffoldEdgeColorer::GetValue(ScaffoldGraph::EdgeId e) const { +std::string ScaffoldEdgeColorer::GetValue(scaffold_graph::ScaffoldGraph::EdgeId e) const { auto it = color_map.find(e.getColor()); if (it != color_map.end()) { return it->second; @@ -67,7 +67,7 @@ std::string ScaffoldEdgeColorer::GetValue(ScaffoldGraph::EdgeId e) const { return default_color; } -std::string ScaffoldVertexSetColorer::GetValue(ScaffoldGraph::VertexId v) const { +std::string ScaffoldVertexSetColorer::GetValue(scaffold_graph::ScaffoldGraph::VertexId v) const { if (vertex_set_.count(v) > 0) return "white"; return "yellow"; diff --git a/src/common/modules/path_extend/scaffolder2015/scaffold_graph_visualizer.hpp b/src/common/modules/path_extend/scaffolder2015/scaffold_graph_visualizer.hpp index 8005c65db0..b1bec1248f 100644 --- a/src/common/modules/path_extend/scaffolder2015/scaffold_graph_visualizer.hpp +++ b/src/common/modules/path_extend/scaffolder2015/scaffold_graph_visualizer.hpp @@ -12,27 +12,27 @@ #ifndef PROJECT_SCAFFOLD_GRAPH_VISUALIZER_HPP #define PROJECT_SCAFFOLD_GRAPH_VISUALIZER_HPP -#include "scaffold_graph.hpp" +#include "auxiliary_graphs/scaffold_graph/scaffold_graph.hpp" #include "visualization/graph_colorer.hpp" -#include "visualization/graph_labeler.hpp" #include "visualization/graph_printer.hpp" namespace path_extend { -namespace scaffold_graph { +namespace scaffolder { using namespace visualization; -class ScaffoldGraphLabeler : public graph_labeler::GraphLabeler { + class ScaffoldGraphLabeler : public graph_labeler::GraphLabeler { private: - const ScaffoldGraph &graph_; + const scaffold_graph::ScaffoldGraph &graph_; const std::map &additional_vertex_labels_; public: - ScaffoldGraphLabeler(const ScaffoldGraph &graph, const std::map &additional_vertex_labels): + ScaffoldGraphLabeler(const scaffold_graph::ScaffoldGraph &graph, + const std::map &additional_vertex_labels): graph_(graph), additional_vertex_labels_(additional_vertex_labels) { } @@ -42,44 +42,44 @@ class ScaffoldGraphLabeler : public graph_labeler::GraphLabeler { }; -class ScaffoldEdgeColorer : public graph_colorer::ElementColorer { +class ScaffoldEdgeColorer : public graph_colorer::ElementColorer { private: static const std::map color_map; static const std::string default_color; public: - std::string GetValue(ScaffoldGraph::EdgeId e) const; + std::string GetValue(scaffold_graph::ScaffoldGraph::EdgeId e) const; }; -class ScaffoldVertexSetColorer : public graph_colorer::ElementColorer { +class ScaffoldVertexSetColorer : public graph_colorer::ElementColorer { private: - std::set vertex_set_; + std::set vertex_set_; public: - ScaffoldVertexSetColorer(const std::set &vertex_set): vertex_set_(vertex_set) { + ScaffoldVertexSetColorer(const std::set &vertex_set): vertex_set_(vertex_set) { } - std::string GetValue(ScaffoldGraph::VertexId v) const; + std::string GetValue(scaffold_graph::ScaffoldGraph::VertexId v) const; }; class ScaffoldGraphVisualizer { private: - const ScaffoldGraph &graph_; + const scaffold_graph::ScaffoldGraph &graph_; - const std::map &additional_vertex_labels_; + const std::map &additional_vertex_labels_; private: - void Visualize(graph_printer::GraphPrinter &printer); + void Visualize(graph_printer::GraphPrinter &printer); public: - ScaffoldGraphVisualizer(const ScaffoldGraph &graph, - const std::map &additional_vertex_labels) : + ScaffoldGraphVisualizer(const scaffold_graph::ScaffoldGraph &graph, + const std::map &additional_vertex_labels) : graph_(graph), additional_vertex_labels_(additional_vertex_labels){ } - void Visualize(std::ostream &os, graph_colorer::CompositeGraphColorer &colorer); + void Visualize(std::ostream &os, graph_colorer::CompositeGraphColorer &colorer); }; } //scaffold_graph diff --git a/src/common/modules/path_extend/scaffolder2015/scaffold_vertex_predicates.cpp b/src/common/modules/path_extend/scaffolder2015/scaffold_vertex_predicates.cpp new file mode 100644 index 0000000000..94feaf12db --- /dev/null +++ b/src/common/modules/path_extend/scaffolder2015/scaffold_vertex_predicates.cpp @@ -0,0 +1,26 @@ +//*************************************************************************** +//* Copyright (c) 2019 Saint Petersburg State University +//* All Rights Reserved +//* See file LICENSE for details. +//*************************************************************************** + +#include "scaffold_vertex_predicates.hpp" + +namespace path_extend { +namespace scaffolder { + +LengthChecker::LengthChecker(size_t length_threshold, const Graph &g) + : length_threshold_(length_threshold), g_(g) {} +bool LengthChecker::Check(const ScaffoldVertex &vertex) const { + return vertex.GetLengthFromGraph(g_) < length_threshold_; +} + +AndPredicate::AndPredicate(std::shared_ptr first, + std::shared_ptr second) : + first_(first), second_(second) {} +bool AndPredicate::Check(const ScaffoldVertex &scaffold_vertex) const { + return first_->Check(scaffold_vertex) and second_->Check(scaffold_vertex); +} + +} +} \ No newline at end of file diff --git a/src/common/modules/path_extend/scaffolder2015/scaffold_vertex_predicates.hpp b/src/common/modules/path_extend/scaffolder2015/scaffold_vertex_predicates.hpp new file mode 100644 index 0000000000..dd9134068a --- /dev/null +++ b/src/common/modules/path_extend/scaffolder2015/scaffold_vertex_predicates.hpp @@ -0,0 +1,51 @@ +//*************************************************************************** +//* Copyright (c) 2019 Saint Petersburg State University +//* All Rights Reserved +//* See file LICENSE for details. +//*************************************************************************** + +#pragma once + +#include "auxiliary_graphs/scaffold_graph/scaffold_vertex.hpp" +#include "assembly_graph/core/graph.hpp" + +#include + +namespace path_extend { +namespace scaffolder { + +class ScaffoldVertexPredicate + : public func::AbstractPredicate { + public: + virtual ~ScaffoldVertexPredicate() = default; +}; + +class LengthChecker : public ScaffoldVertexPredicate { + public: + typedef scaffold_graph::ScaffoldVertex ScaffoldVertex; + typedef debruijn_graph::Graph Graph; + + LengthChecker(size_t length_threshold, const Graph &g); + + bool Check(const ScaffoldVertex &vertex) const override; + + private: + const size_t length_threshold_; + const Graph &g_; +}; + +class AndPredicate : public ScaffoldVertexPredicate { + public: + typedef scaffold_graph::ScaffoldVertex ScaffoldVertex; + + AndPredicate(std::shared_ptr first, std::shared_ptr second); + + bool Check(const ScaffoldVertex &scaffold_vertex) const override; + + private: + std::shared_ptr first_; + std::shared_ptr second_; +}; + +} +} \ No newline at end of file diff --git a/src/projects/splitter/CMakeLists.txt b/src/projects/splitter/CMakeLists.txt new file mode 100644 index 0000000000..b7ec4af085 --- /dev/null +++ b/src/projects/splitter/CMakeLists.txt @@ -0,0 +1,20 @@ +############################################################################ +# Copyright (c) 2023-2026 SPAdes team +# Copyright (c) 2021-2022 Saint Petersburg State University +# All Rights Reserved +# See file LICENSE for details. +############################################################################ + +project(splitter CXX) + +include_directories(${CMAKE_CURRENT_SOURCE_DIR}) + +add_executable(splitter + main.cpp barcode_index_construction.cpp graph_resolver.cpp graph_resolver_io.cpp + path_extractor.cpp scaffold_graph_helper.cpp) + +target_link_libraries(splitter graphio toolchain common_modules ${COMMON_LIBRARIES} auxiliary_graphs) + +install(TARGETS splitter + DESTINATION bin + COMPONENT splitter) diff --git a/src/projects/splitter/barcode_index_construction.cpp b/src/projects/splitter/barcode_index_construction.cpp new file mode 100644 index 0000000000..6eda228326 --- /dev/null +++ b/src/projects/splitter/barcode_index_construction.cpp @@ -0,0 +1,83 @@ +//*************************************************************************** +//* Copyright (c) 2023-2026 SPAdes team +//* Copyright (c) 2021-2022 Saint Petersburg State University +//* All Rights Reserved +//* See file LICENSE for details. +//*************************************************************************** + +#include "alignment/bwa_sequence_mapper.hpp" +#include "alignment/kmer_sequence_mapper.hpp" + +#include "barcode_index_construction.hpp" +#include "barcode_index/barcode_index_builder.hpp" + +#include "utils/verify.hpp" + +namespace cont_index { + +using namespace barcode_index; + +void ConstructBarcodeIndex(barcode_index::FrameBarcodeIndex &barcode_index, + paired_info::SequencingLib &lib, + const debruijn_graph::Graph &graph, + const std::filesystem::path &workdir, + unsigned nthreads, + size_t frame_size, + unsigned mapping_k, + bool bin_load, + bool bin_save) { + if (!bin_load) { + const std::vector barcode_prefices = {"BC:Z:", "BX:Z:"}; +// alignment::BWAReadMapper mapper(graph); + const unsigned min_occ = 2; + alignment::ShortKMerReadMapper mapper(graph, workdir, mapping_k, min_occ); + FrameConcurrentBarcodeIndexBuffer buffer(graph, frame_size); + FrameBarcodeIndexBuilder barcode_index_builder(graph, mapper, barcode_prefices, frame_size, nthreads); + bool is_tellseq = lib.type() == io::LibraryType::TellSeqReads; + if (not is_tellseq) { + barcode_index_builder.ConstructBarcodeIndex(io::paired_easy_readers(lib, false, 0), barcode_index, lib, is_tellseq); + } + if (is_tellseq) { + INFO("Constructing from tellseq lib"); + barcode_index_builder.ConstructBarcodeIndex(io::tellseq_easy_readers(lib, false, 0), barcode_index, lib, is_tellseq); + } + INFO("Barcode index construction finished."); + + if (bin_save) { + INFO("Saving barcode index"); + io::binary::Save((workdir / "barcode_index").string(), barcode_index); + } + } else { + INFO("Loading barcode index"); + io::binary::Load((workdir / "barcode_index").string(), barcode_index); + } + INFO("Barcode index size: " << barcode_index.size()); + using BarcodeExtractor = barcode_index::FrameBarcodeIndexInfoExtractor; + auto barcode_extractor_ptr = std::make_shared(barcode_index, graph); + size_t total_reads = 0; + for (const auto &edge: graph.edges()) { + auto begin = barcode_extractor_ptr->barcode_iterator_begin(edge); + auto end = barcode_extractor_ptr->barcode_iterator_end(edge); + for (auto it = begin; it != end; ++it) { + total_reads += it->second.GetCount(); + } + } + INFO(total_reads << " total reads in the barcode index"); +} + +void DownsampleBarcodeIndex(const debruijn_graph::Graph &graph, + unsigned nthreads, + barcode_index::FrameBarcodeIndex &barcode_index, + barcode_index::FrameBarcodeIndex &downsampled_index, + double sampling_factor) { + VERIFY_DEV(math::ls(sampling_factor, 1.0)); + const size_t mapping_k = 31; + const std::vector barcode_prefices = {"BC:Z:", "BX:Z:"}; + debruijn_graph::Graph empty_graph(mapping_k); + alignment::BWAReadMapper mapper(empty_graph); + FrameBarcodeIndexBuilder barcode_index_builder(graph, mapper, barcode_prefices, barcode_index.GetFrameSize(), nthreads); + barcode_index_builder.DownsampleBarcodeIndex(downsampled_index, barcode_index, sampling_factor); +} + +} + diff --git a/src/projects/splitter/barcode_index_construction.hpp b/src/projects/splitter/barcode_index_construction.hpp new file mode 100644 index 0000000000..b511614332 --- /dev/null +++ b/src/projects/splitter/barcode_index_construction.hpp @@ -0,0 +1,40 @@ +//*************************************************************************** +//* Copyright (c) 2023-2026 SPAdes team +//* Copyright (c) 2021-2022 Saint Petersburg State University +//* All Rights Reserved +//* See file LICENSE for details. +//*************************************************************************** + +#pragma once + +#include "assembly_graph/core/graph.hpp" +#include "barcode_index/barcode_info_extractor.hpp" +#include "paired_info/paired_info_utils.hpp" + +#include "io/binary/read_cloud.hpp" +#include "io/dataset_support/read_converter.hpp" + +#include +#include + +namespace cont_index { + +typedef io::DataSet DataSet; +typedef io::SequencingLibrary SequencingLib; + +void ConstructBarcodeIndex(barcode_index::FrameBarcodeIndex &barcode_index, + paired_info::SequencingLib &lib, + const debruijn_graph::Graph &graph, + const std::filesystem::path &workdir, + unsigned nthreads, + size_t frame_size, + unsigned mapping_k, + bool bin_load, + bool bin_save); + +void DownsampleBarcodeIndex(const debruijn_graph::Graph &graph, + unsigned nthreads, + barcode_index::FrameBarcodeIndex &barcode_index, + barcode_index::FrameBarcodeIndex &downsampled_index, + double sampling_factor); +} diff --git a/src/projects/splitter/graph_resolver.cpp b/src/projects/splitter/graph_resolver.cpp new file mode 100644 index 0000000000..06da28975b --- /dev/null +++ b/src/projects/splitter/graph_resolver.cpp @@ -0,0 +1,128 @@ +//*************************************************************************** +//* Copyright (c) 2023-2026 SPAdes team +//* Copyright (c) 2021-2022 Saint Petersburg State University +//* All Rights Reserved +//* See file LICENSE for details. +//*************************************************************************** + +#include "graph_resolver.hpp" + +#include "assembly_graph/core/construction_helper.hpp" +#include "assembly_graph/core/debruijn_data.hpp" + +namespace cont_index { + +GraphResolver::GraphResolverInfo::VertexMap GraphResolver::SplitVertices(debruijn_graph::Graph &graph, + const VertexResults &vertex_results) const { + GraphResolver::GraphResolverInfo::VertexMap transformed_vertex_to_original; + auto helper = graph.GetConstructionHelper(); + for (const auto &vertex_entry: vertex_results.vertex_to_result) { + const VertexId &vertex = vertex_entry.first; + DEBUG("Conjugate: " << graph.conjugate(vertex).int_id()); + const auto &vertex_result = vertex_entry.second; + + if (vertex_result.state == VertexState::Completely or vertex_result.state == VertexState::Partially) { + auto in_to_correct_link = GetLinkMap(graph, vertex, vertex_result); + VERIFY_DEV(in_to_correct_link.size() == vertex_entry.second.supported_pairs.size()); + std::unordered_set resolved_in_edges; + std::unordered_set resolved_out_edges; + for (const auto &entry: vertex_result.supported_pairs) { + EdgeId in_edge = entry.first; + EdgeId out_edge = entry.second; + LinkId link = in_to_correct_link.at(in_edge); + DEBUG("In edge: " << in_edge.int_id() << ", out edge: " << out_edge.int_id() << ", vertex: " << vertex.int_id()); + helper.DeleteLink(vertex, out_edge); + helper.DeleteLink(graph.conjugate(vertex), graph.conjugate(in_edge)); + std::vector links {link}; + VertexId new_vertex = helper.CreateVertex(debruijn_graph::DeBruijnVertexData(links)); + transformed_vertex_to_original[new_vertex] = vertex; + helper.LinkIncomingEdge(new_vertex, in_edge); + helper.LinkOutgoingEdge(new_vertex, out_edge); + resolved_in_edges.insert(in_edge); + resolved_out_edges.insert(out_edge); + } + if (vertex_result.state == VertexState::Completely) { + graph.DeleteVertex(vertex); + } else { + std::vector links = graph.move_links(vertex); + std::vector new_links; + for (const auto &link_id: links) { + auto link = graph.link(link_id); + if (resolved_in_edges.find(link.link.first) == resolved_in_edges.end() and resolved_out_edges.find(link.link.second) == resolved_out_edges.end()) { + new_links.push_back(link_id); + } + } + VertexId new_vertex = helper.CreateVertex(debruijn_graph::DeBruijnVertexData(new_links)); + for (const auto &in_edge: graph.IncomingEdges(vertex)) { + helper.DeleteLink(graph.conjugate(vertex), graph.conjugate(in_edge)); + helper.LinkIncomingEdge(new_vertex, in_edge); + } + for (const auto &out_edge: graph.OutgoingEdges(vertex)) { + helper.DeleteLink(vertex, out_edge); + helper.LinkOutgoingEdge(new_vertex, out_edge); + } + graph.DeleteVertex(vertex); + } + } + } + return transformed_vertex_to_original; +} +GraphResolver::GraphResolverInfo::EdgeMap GraphResolver::MergePaths(debruijn_graph::Graph &graph, + const path_extend::PathContainer &paths) const { + GraphResolver::GraphResolverInfo::EdgeMap original_edge_to_transformed; + for (const auto &path: paths) { + if (path.first->Size() == 1) { + original_edge_to_transformed[path.first->Back()] = path.first->Back(); + continue; + } + std::vector simple_path; + std::vector overlaps; + const auto &first_path = *(path.first); + for (size_t i = 0; i < first_path.Size(); ++i) { + if (i > 0 and graph.is_complex(graph.EdgeStart(first_path[i]))) { + size_t overlap = graph.link_length(graph.EdgeStart(first_path[i]), first_path[i - 1], first_path[i]); + overlaps.push_back(static_cast(overlap)); + } + simple_path.push_back(first_path[i]); + } + + EdgeId resulting_edge = graph.MergePath(simple_path, true, overlaps); + for (const auto &edge: simple_path) { + original_edge_to_transformed[edge] = resulting_edge; + } + } + return original_edge_to_transformed; +} +GraphResolver::GraphResolverInfo GraphResolver::TransformGraph(debruijn_graph::Graph &graph, + const path_extend::PathContainer &paths, + const VertexResults &vertex_results) const { + INFO("Transforming assembly graph"); + auto vertex_map = SplitVertices(graph, vertex_results); + auto edge_map = MergePaths(graph, paths); + GraphResolverInfo result(vertex_map, edge_map); + return result; +} +GraphResolver::LinkMap GraphResolver::GetLinkMap(const debruijn_graph::Graph &graph, + GraphResolver::VertexId vertex, + const VertexResult &vertex_result) const { + std::unordered_map in_to_out; + std::unordered_map in_to_correct_link; + for (const auto &entry: vertex_result.supported_pairs) { + EdgeId in_edge = entry.first; + EdgeId out_edge = entry.second; + VERIFY_DEV(in_to_out.find(in_edge) == in_to_out.end()); + in_to_out[in_edge] = out_edge; + } + for (const LinkId &link_id: graph.links(vertex)) { + const auto &link = graph.link(link_id); + auto in_result = in_to_out.find(link.link.first); + if (in_result == in_to_out.end()) { + continue; + } + if (in_result->second == link.link.second) { + in_to_correct_link.insert({link.link.first, link_id}); + } + } + return in_to_correct_link; +} +} diff --git a/src/projects/splitter/graph_resolver.hpp b/src/projects/splitter/graph_resolver.hpp new file mode 100644 index 0000000000..8258d1fb68 --- /dev/null +++ b/src/projects/splitter/graph_resolver.hpp @@ -0,0 +1,46 @@ +//*************************************************************************** +//* Copyright (c) 2023-2026 SPAdes team +//* Copyright (c) 2021-2022 Saint Petersburg State University +//* All Rights Reserved +//* See file LICENSE for details. +//*************************************************************************** + +#include "vertex_resolver.hpp" + +#include "assembly_graph/core/observable_graph.hpp" +#include "assembly_graph/paths/bidirectional_path_container.hpp" + +#pragma once + +namespace cont_index { +class GraphResolver { + public: + using EdgeId = debruijn_graph::EdgeId; + using VertexId = debruijn_graph::VertexId; + using LinkId = debruijn_graph::LinkId; + using LinkMap = std::unordered_map; + + struct GraphResolverInfo { + public: + using VertexMap = std::unordered_map; + using EdgeMap = std::unordered_map; + + GraphResolverInfo(const VertexMap &transformed_vertex_to_original, const EdgeMap &original_edge_to_transformed) + : transformed_vertex_to_original(transformed_vertex_to_original), + original_edge_to_transformed(original_edge_to_transformed) {} + VertexMap transformed_vertex_to_original; + EdgeMap original_edge_to_transformed; + }; + + GraphResolverInfo TransformGraph(debruijn_graph::Graph &graph, + const path_extend::PathContainer &paths, + const VertexResults &vertex_results) const; + private: + GraphResolverInfo::VertexMap SplitVertices(debruijn_graph::Graph &graph, + const VertexResults &vertex_results) const; + GraphResolverInfo::EdgeMap MergePaths(debruijn_graph::Graph &graph, const path_extend::PathContainer &paths) const; + LinkMap GetLinkMap(const debruijn_graph::Graph &graph, + VertexId vertex, + const VertexResult &vertex_result) const; +}; +} diff --git a/src/projects/splitter/graph_resolver_io.cpp b/src/projects/splitter/graph_resolver_io.cpp new file mode 100644 index 0000000000..92aee594da --- /dev/null +++ b/src/projects/splitter/graph_resolver_io.cpp @@ -0,0 +1,43 @@ +//*************************************************************************** +//* Copyright (c) 2023-2026 SPAdes team +//* Copyright (c) 2021-2022 Saint Petersburg State University +//* All Rights Reserved +//* See file LICENSE for details. +//*************************************************************************** + +#include "graph_resolver_io.hpp" + +#include "assembly_graph/paths/bidirectional_path_io/bidirectional_path_output.hpp" +#include "io/graph/gfa_writer.hpp" + +namespace cont_index { + +void TransformedGraphIO::PrintGraph(const debruijn_graph::Graph &graph, + const GraphResolver::GraphResolverInfo &resolver_info, + const std::filesystem::path &output_base) const { + auto resolved_graph_out = std::ofstream(output_base / ("resolved_graph.gfa")); + path_extend::GFAPathWriter resolved_graph_writer(graph, resolved_graph_out); + resolved_graph_writer.WriteSegmentsAndLinks(); + + auto new_name_generator = std::make_shared(); + path_extend::ContigWriter resolved_writer(graph, new_name_generator); + path_extend::PathContainer resolved_edges; + for (const auto &edge: graph.canonical_edges()) { + resolved_edges.Create(graph, edge); + } + std::vector edge_writers; + edge_writers.push_back([&](const path_extend::ScaffoldStorage &scaffold_storage) { + auto fn = output_base / ("resolved_edges.fasta"); + INFO("Outputting edges to " << fn); + path_extend::ContigWriter::WriteScaffolds(scaffold_storage, fn); + }); + resolved_writer.OutputPaths(resolved_edges, edge_writers); + + auto edge_out = output_base / "edge_transform.tsv"; + auto edge_out_stream = std::ofstream(edge_out); + edge_out_stream << "Original edge id\tResolved graph edge id\n"; + for (const auto &entry: resolver_info.original_edge_to_transformed) { + edge_out_stream << (*id_mapper_)[entry.first.int_id()] << "\t" << entry.second.int_id() << std::endl; + } +} +} diff --git a/src/projects/splitter/graph_resolver_io.hpp b/src/projects/splitter/graph_resolver_io.hpp new file mode 100644 index 0000000000..fea905074b --- /dev/null +++ b/src/projects/splitter/graph_resolver_io.hpp @@ -0,0 +1,25 @@ +//*************************************************************************** +//* Copyright (c) 2023-2026 SPAdes team +//* Copyright (c) 2021-2022 Saint Petersburg State University +//* All Rights Reserved +//* See file LICENSE for details. +//*************************************************************************** + +#pragma once + +#include "graph_resolver.hpp" + +#include + +namespace cont_index { +class TransformedGraphIO { + public: + explicit TransformedGraphIO(io::IdMapper *id_mapper) : id_mapper_(id_mapper) {} + void PrintGraph(const debruijn_graph::Graph &graph, + const GraphResolver::GraphResolverInfo &resolver_info, + const std::filesystem::path &output_base) const; + + private: + io::IdMapper *id_mapper_; +}; +} diff --git a/src/projects/splitter/main.cpp b/src/projects/splitter/main.cpp new file mode 100644 index 0000000000..fbe182b947 --- /dev/null +++ b/src/projects/splitter/main.cpp @@ -0,0 +1,403 @@ +//*************************************************************************** +//* Copyright (c) 2023-2026 SPAdes team +//* Copyright (c) 2021-2022 Saint Petersburg State University +//* All Rights Reserved +//* See file LICENSE for details. +//*************************************************************************** + +#include "barcode_index_construction.hpp" +#include "graph_resolver.hpp" +#include "graph_resolver_io.hpp" +#include "path_extractor.hpp" +#include "scaffold_graph_helper.hpp" +#include "vertex_resolver.hpp" + +#include "auxiliary_graphs/contracted_graph/contracted_graph_builder.hpp" +#include "io/binary/read_cloud.hpp" +#include "io/graph/gfa_writer.hpp" +#include "toolchain/utils.hpp" +#include "utils/filesystem/path_helper.hpp" +#include "utils/parallel/openmp_wrapper.h" +#include "utils/segfault_handler.hpp" +#include "utils/verify.hpp" + +#include + +using namespace debruijn_graph; +using namespace cont_index; +using namespace path_extend::read_cloud; + +enum class GraphType { + Blunted, + Multiplexed +}; + +enum class ResolutionMode { + Diploid, + Meta +}; + +struct gcfg { + unsigned k = 55; + unsigned mapping_k = 31; + std::filesystem::path graph; + std::filesystem::path output_dir; + std::filesystem::path refpath; + std::filesystem::path assembly_info; + unsigned nthreads = (omp_get_max_threads() / 2 + 1); + std::filesystem::path file = ""; + std::filesystem::path tmpdir = "saves"; + unsigned libindex = 0; + GraphType graph_type = GraphType::Multiplexed; + ResolutionMode mode = ResolutionMode::Diploid; + bool bin_load = false; + bool debug = false; + + //barcode_index_construction + size_t frame_size = 40000; + size_t read_linkage_distance = 40000; + double sampling_factor = 1.0; + + //graph construction + double graph_score_threshold = 2.0; + size_t tail_threshold = 200000; + size_t count_threshold = 1; + + //vertex resolution + double rel_threshold = 2.0; + bool scaffold_links = false; + + //meta mode + size_t length_threshold = 2000; +}; + +static void process_cmdline(int argc, char** argv, gcfg& cfg) { + using namespace clipp; + + std::string graph; + std::string output_dir; + std::string refpath; + std::string file; + std::string tmpdir; + std::string assembly_info; + + auto cli = ( + graph << value("graph (in binary or GFA)"), + file << value("SLR library description (in YAML)"), + output_dir << value("path to output directory"), + (option("--dataset") & value("yaml", file)) % "dataset description (in YAML)", + (option("-l") & integer("value", cfg.libindex)) % "library index (0-based, default: 0)", + (option("--assembly-info") & value("assembly-info", assembly_info)) + % "Path to metaflye assembly_info.txt file (meta mode, metaFlye graphs only)", + (option("-t") & integer("value", cfg.nthreads)) % "# of threads to use", + (option("--mapping-k") & integer("value", cfg.mapping_k)) % "k for read mapping", + (option("--tmp-dir") & value("tmp", tmpdir)) % "scratch directory to use", + (option("--ref") & value("reference", refpath)) % "Reference path for repeat resolution evaluation (developer option)", + (option("--bin-load").set(cfg.bin_load)) % "load binary-converted reads from tmpdir (developer option)", + (option("--debug").set(cfg.debug)) % "produce lots of debug data (developer option)", + (option("--sampling-factor") & value("sampling-factor", cfg.sampling_factor)) % "Sampling factor for read downsampling", + (with_prefix("-G", + option("mdbg").set(cfg.graph_type, GraphType::Multiplexed) | + option("blunt").set(cfg.graph_type, GraphType::Blunted)) % "assembly graph type (mDBG or blunted)"), + (with_prefix("-M", + option("diploid").set(cfg.mode, ResolutionMode::Diploid) | + option("meta").set(cfg.mode, ResolutionMode::Meta)) % "repeat resolution mode (diploid or meta)"), + (option("--frame-size") & value("frame-size", cfg.frame_size)) % "Resolution of barcode index", + (option("--linkage-distance") & value("read-linkage-distance", cfg.read_linkage_distance)) % + "Reads are assigned to the same fragment based on linkage distance", + (option("--score") & value("score", cfg.graph_score_threshold)) % "Score threshold for link index", + (option("--rel-threshold") & value("rel-threshold", cfg.rel_threshold)) % "Relative score threshold for vertex resolution", + (option("--tail-threshold") & value("tail-threshold", cfg.tail_threshold)) % + "Barcodes are assigned to the first and last nucleotides of the edge", + (option("--count-threshold") & value("count-threshold", cfg.count_threshold)) + % "Minimum number of reads for barcode index", + (option("--scaffold-links").set(cfg.scaffold_links)) % "Use scaffold links in addition to graph links for repeat resolution", + (option("--length-threshold") & value("length-threshold", cfg.length_threshold)) + % "Minimum scaffold graph edge length (meta mode option)" + ); + + auto result = parse(argc, argv, cli); + if (!result) { + std::cout << make_man_page(cli, argv[0]); + exit(1); + } + cfg.graph = graph; + cfg.output_dir = output_dir; + cfg.refpath = refpath; + cfg.file = file; + cfg.tmpdir = tmpdir; + cfg.assembly_info = assembly_info; +} + +struct TimeTracerRAII { + TimeTracerRAII(llvm::StringRef program_name, + unsigned granularity = 500, + const std::string &prefix = "", const std::string &suffix = "") { + time_trace_file_ = prefix + "time_trace_" + suffix + ".json"; + llvm::timeTraceProfilerInitialize(granularity, program_name); + } + ~TimeTracerRAII() { + if (auto E = llvm::timeTraceProfilerWrite(time_trace_file_, "cont-index")) { + handleAllErrors(std::move(E), + [&](const llvm::StringError &SE) { + ERROR("" << SE.getMessage() << "\n"); + }); + return; + } else { + INFO("Time trace is written to: " << time_trace_file_); + } + llvm::timeTraceProfilerCleanup(); + } + + std::string time_trace_file_; +}; + +gfa::GFAReader ReadGraph(const gcfg &cfg, + debruijn_graph::Graph &graph, + io::IdMapper *id_mapper) { + gfa::GFAReader gfa(cfg.graph); + gfa.to_graph(graph, id_mapper); + INFO("GFA segments: " << gfa.num_edges() << ", links: " << gfa.num_links() << ", paths: " + << gfa.num_paths()); + return gfa; +} + +std::unordered_set ParseRepetitiveEdges(const debruijn_graph::Graph &graph, + const std::string &path_to_info, + io::IdMapper *id_mapper) { + std::unordered_set result; + size_t total_repetitive_length = 0; + std::ifstream info_stream(path_to_info); + std::string ctg_id; + std::string is_repeat; + std::string blank; + std::string graph_path; + for (size_t i = 0; i < 8; ++i) { + info_stream >> blank; + } + while (!info_stream.eof()) { + info_stream >> ctg_id; + info_stream >> blank; + info_stream >> blank; + info_stream >> blank; + info_stream >> is_repeat; + info_stream >> blank; + info_stream >> blank; + info_stream >> graph_path; + if (is_repeat == "Y") { + auto current_pos = graph_path.find(','); + while (current_pos != std::string::npos) { + std::string edge_num = graph_path.substr(0, current_pos); + if (edge_num != "*") { + EdgeId edge = (*id_mapper)["edge_" + edge_num]; + result.insert(edge); + result.insert(graph.conjugate(edge)); + total_repetitive_length += graph.length(edge); + } + graph_path.erase(0, current_pos + 1); + current_pos = graph_path.find(','); + } + } + } + INFO(result.size() << " repetitive edges, total length: " << total_repetitive_length); + return result; +} + +size_t GetLengthThreshold(const gcfg &cfg) { + switch (cfg.mode) { + default: + FATAL_ERROR("Unknown repeat resolution mode"); + case ResolutionMode::Diploid: { + return 0; + } + case ResolutionMode::Meta: { + return cfg.length_threshold; + } + } +} + +cont_index::VertexResolver::LinkMap GetTrustedContigLinks(const std::unordered_set &repetitive_edges, + debruijn_graph::Graph &graph, + const gfa::GFAReader &gfa) { + cont_index::VertexResolver::LinkMap trusted_link_map; + std::unordered_set non_unique_starts; + size_t total_path_edges = 0; + std::vector> non_repetitive_paths; + for (const auto &path: gfa.paths()) { + std::vector non_repetitive_path; + for (const auto &edge: path.edges) { + if (repetitive_edges.find(edge) == repetitive_edges.end()) { + non_repetitive_path.push_back(edge); + ++total_path_edges; + } + } + non_repetitive_paths.push_back(non_repetitive_path); + } + for (const auto &path: non_repetitive_paths) { + if (path.size() < 2) { + continue; + } + for (auto it1 = path.begin(), it2 = std::next(it1); it2 != path.end(); ++it1, ++it2) { + EdgeId current = *it1; + EdgeId next = *it2; + trusted_link_map[current].insert(next); + trusted_link_map[graph.conjugate(next)].insert(graph.conjugate(current)); + } + } + size_t total_links = 0; + for (const auto &entry: trusted_link_map) { + total_links += entry.second.size(); + } + return trusted_link_map; +} + +cont_index::VertexResults GetRepeatResolutionResults(const gcfg &cfg, + debruijn_graph::Graph &graph, + const gfa::GFAReader &gfa, + std::shared_ptr barcode_extractor_ptr, + io::IdMapper *id_mapper) { + size_t length_threshold = GetLengthThreshold(cfg); + std::filesystem::path vertex_output_path = cfg.output_dir / "vertex_stats.tsv"; + + switch (cfg.mode) { + default: + FATAL_ERROR("Unknown repeat resolution mode"); + case ResolutionMode::Diploid: { + cont_index::VertexResolver::LinkMap empty_map; + cont_index::VertexResolver vertex_resolver + (graph, graph, empty_map, barcode_extractor_ptr, cfg.count_threshold, cfg.tail_threshold, + length_threshold, cfg.nthreads, cfg.graph_score_threshold, cfg.rel_threshold); + auto results = vertex_resolver.ResolveVertices(); + vertex_resolver.PrintVertexResults(results, vertex_output_path, id_mapper); + return results; + } + case ResolutionMode::Meta: { + auto repetitive_edges = ParseRepetitiveEdges(graph, cfg.assembly_info, id_mapper); + auto repeat_predicate = [&repetitive_edges](const debruijn_graph::EdgeId &edge) { + return repetitive_edges.find(edge) == repetitive_edges.end(); + }; + contracted_graph::DBGContractedGraphFactory factory(graph, repeat_predicate); + factory.Construct(); + INFO("Constructed graph"); + auto contracted_graph = factory.GetGraph(); + INFO("Total contracted graph vertices: " << contracted_graph->size()); + auto trusted_link_map = GetTrustedContigLinks(repetitive_edges, graph, gfa); + + cont_index::VertexResolver vertex_resolver + (*contracted_graph, contracted_graph->GetAssemblyGraph(), trusted_link_map, barcode_extractor_ptr, + cfg.count_threshold, cfg.tail_threshold, length_threshold, cfg.nthreads, + cfg.graph_score_threshold, cfg.rel_threshold); + auto results = vertex_resolver.ResolveVertices(); + vertex_resolver.PrintVertexResults(results, vertex_output_path, id_mapper); + return results; + } + } +} + +void ResolveComplexVertices(const gcfg &cfg, + debruijn_graph::Graph &graph, + std::shared_ptr barcode_extractor_ptr, + io::IdMapper *id_mapper, + const gfa::GFAReader &gfa, + path_extend::GFAPathWriter gfa_writer) { + auto vertex_results = GetRepeatResolutionResults(cfg, graph, gfa, barcode_extractor_ptr, id_mapper); + + cont_index::PathExtractor path_extractor(graph); + path_extend::PathContainer paths; + path_extractor.ExtractPaths(paths, vertex_results); + INFO("Extracted paths") + + auto name_generator = std::make_shared(); + path_extend::ContigWriter writer(graph, name_generator); + std::vector path_writers; + INFO("Creating writers") + path_writers.push_back([&](const path_extend::ScaffoldStorage &scaffold_storage) { + auto fn = cfg.output_dir / ("contigs.fasta"); + INFO("Outputting contigs to " << fn); + path_extend::ContigWriter::WriteScaffolds(scaffold_storage, fn); + }); + path_writers.push_back([&](const path_extend::ScaffoldStorage &storage) { + INFO("Populating GFA with scaffold paths"); + gfa_writer.WritePaths(storage); + }); + writer.OutputPaths(paths, path_writers); + + if (cfg.mode == ResolutionMode::Diploid) { + cont_index::GraphResolver graph_resolver; + auto graph_resolver_info = graph_resolver.TransformGraph(graph, paths, vertex_results); + TransformedGraphIO graph_output(id_mapper); + graph_output.PrintGraph(graph, graph_resolver_info, cfg.output_dir); + } +} + +int main(int argc, char** argv) { + utils::segfault_handler sh; + gcfg cfg; + + srand(42); + srandom(42); + + process_cmdline(argc, argv, cfg); + + toolchain::create_console_logger(); + START_BANNER("SpLitteR"); + + cfg.nthreads = spades_set_omp_threads(cfg.nthreads); + INFO("Maximum # of threads to use (adjusted due to OMP capabilities): " << cfg.nthreads); + + std::filesystem::create_directory(cfg.output_dir); + std::filesystem::create_directory(cfg.tmpdir); + + INFO("Loading graph"); + std::unique_ptr> id_mapper(new io::IdMapper()); + + debruijn_graph::Graph graph(cfg.k); + auto gfa = ReadGraph(cfg, graph, id_mapper.get()); + INFO("Graph loaded. Total vertices: " << graph.size() << ", total edges: " << graph.e_size()); + + std::ofstream graph_out(cfg.output_dir / "assembly_graph.gfa"); + path_extend::GFAPathWriter gfa_writer(graph, graph_out, + io::MapNamingF(*id_mapper)); + gfa_writer.WriteSegmentsAndLinks(); + + INFO("Building barcode index"); + if (cfg.libindex != -1u) { + INFO("Processing paired-end reads"); + DataSet dataset; + if (cfg.file != "") { + dataset.load(cfg.file); + if (cfg.libindex == -1u) + cfg.libindex = 0; + CHECK_FATAL_ERROR(cfg.libindex < dataset.lib_count(), "invalid library index"); + } + + debruijn_graph::config::init_libs(dataset, cfg.nthreads, cfg.tmpdir); + barcode_index::FrameBarcodeIndex barcode_index(graph, cfg.frame_size); + using BarcodeExtractor = barcode_index::FrameBarcodeIndexInfoExtractor; + auto barcode_extractor_ptr = std::make_shared(barcode_index, graph); + + std::unique_ptr traceraii; + traceraii.reset(new TimeTracerRAII(argv[0], 500)); + INFO("Time tracing is enabled"); + + TIME_TRACE_SCOPE("Containment index"); + + auto &lib = dataset[cfg.libindex]; + if (lib.type() == io::LibraryType::Clouds10x or lib.type() == io::LibraryType::TellSeqReads) { + cont_index::ConstructBarcodeIndex(barcode_index, lib, graph, cfg.tmpdir, cfg.nthreads, cfg.frame_size, + cfg.mapping_k, cfg.bin_load, cfg.debug); + } else { + ERROR("Only read cloud libraries with barcode tags are supported for links"); + } + + barcode_index::FrameBarcodeIndex downsampled_index(graph, cfg.frame_size); + if (not math::eq(cfg.sampling_factor, 1.0)) { + INFO("Downsampling the barcode index with factor " << cfg.sampling_factor); + cont_index::DownsampleBarcodeIndex(graph, cfg.nthreads, barcode_index, downsampled_index, + cfg.sampling_factor); + barcode_extractor_ptr = std::make_shared(downsampled_index, graph); + } + + ResolveComplexVertices(cfg, graph, barcode_extractor_ptr, id_mapper.get(), gfa, gfa_writer); + } +} diff --git a/src/projects/splitter/path_extractor.cpp b/src/projects/splitter/path_extractor.cpp new file mode 100644 index 0000000000..ac41089645 --- /dev/null +++ b/src/projects/splitter/path_extractor.cpp @@ -0,0 +1,166 @@ +//*************************************************************************** +//* Copyright (c) 2023-2026 SPAdes team +//* Copyright (c) 2021-2022 Saint Petersburg State University +//* All Rights Reserved +//* See file LICENSE for details. +//*************************************************************************** + +#include "path_extractor.hpp" + +namespace cont_index { + +void PathExtractor::ExtractPaths(path_extend::PathContainer &paths, + const VertexResults &vertex_results) const { + //fixme replace with graph distance + int DEFAULT_GAP = 500; + + auto scaffold_links = GetScaffoldLinks(vertex_results); + const auto &in_degrees = scaffold_links.in_degrees; + const auto &out_degrees = scaffold_links.out_degrees; + const auto &in_to_out = scaffold_links.in_to_out; + const auto &vertex_link_storage = scaffold_links.vertex_link_storage; + for (const auto &entry: in_degrees) { + VERIFY_MSG(entry.second < 2, "In degree " << entry.second << ", " << entry.first); + } + for (const auto &entry: out_degrees) { + VERIFY_MSG(entry.second < 2, "Out degree " << entry.second << ", " << entry.first); + } + size_t visited_edges = 0; + size_t total_path_overlap = 0; + std::unordered_set visited; + std::unordered_map end_to_path_idx; + for (const auto &entry: out_degrees) { + if (in_degrees.find(entry.first) == in_degrees.end()) { + if (visited.find(entry.first) == visited.end()) { + debruijn_graph::EdgeId current_edge = entry.first; + auto &path = paths.Create(graph_, current_edge); + visited.insert(current_edge); + visited.insert(graph_.conjugate(current_edge)); + while (out_degrees.find(current_edge) != out_degrees.end()) { + const auto &next_edge = in_to_out.at(current_edge); + if (visited.find(next_edge) != visited.end()) { + TRACE("Edge is visited!"); + visited_edges++; + break; + } + if (IsGraphLink(current_edge, next_edge, vertex_link_storage)) { + total_path_overlap += graph_.data(graph_.EdgeStart(next_edge)).overlap(); + path.PushBack(next_edge); + } else { + path.PushBack(next_edge, path_extend::Gap(DEFAULT_GAP)); + } + visited.insert(next_edge); + visited.insert(graph_.conjugate(next_edge)); + current_edge = next_edge; + } + } + } + } + + for (const debruijn_graph::EdgeId &edge: graph_.canonical_edges()) { + if (visited.find(edge) == visited.end()) { + paths.Create(graph_, edge); + visited.insert(edge); + visited.insert(graph_.conjugate(edge)); + } + } + size_t total_path_size = 0; + size_t total_path_length = 0; + for (const auto &path: paths) { + total_path_size += path.first->Size(); + total_path_length += path.first->Length(); + } + INFO("Total path size: " << total_path_size); + INFO("Total path length: " << total_path_length); + INFO("Total path overlap: " << total_path_overlap); + INFO("Edges visited by several paths: " << visited_edges); +} +bool PathExtractor::IsConjugatePair(const PathExtractor::SimplePath &first, + const PathExtractor::SimplePath &second) const { + if (first.size() != second.size()) { + return false; + } + for (auto it1 = first.begin(), it2 = second.end(); it1 != first.end(); ++it1) { + --it2; + if (*it1 != graph_.conjugate(*it2)) { + return false; + } + } + return true; +} +bool PathExtractor::IsGraphLink(debruijn_graph::EdgeId first, + debruijn_graph::EdgeId second, + const PathExtractor::VertexLinkStorage &vertex_storage) const { + auto out_graph_links = vertex_storage.find(first); + if (out_graph_links == vertex_storage.end()) { + return false; + } + auto out_link_result = out_graph_links->second.find(second); + if (out_link_result == out_graph_links->second.end()) { + return false; + } + return true; +} +PathExtractor::ScaffoldLinks PathExtractor::GetScaffoldLinks(const VertexResults &vertex_results) const { + INFO("Extracting paths"); + std::unordered_map in_degrees; + std::unordered_map out_degrees; + std::unordered_map in_to_out; + std::unordered_map> vertex_link_storage; + size_t total_length = 0; + size_t total_edges = 0; + size_t total_resolved_overlap = 0; + size_t not_graph_supported_links = 0; + size_t graph_supported_links = 0; + + for (const debruijn_graph::EdgeId &edge: graph_.canonical_edges()) { + total_length += graph_.length(edge); + ++total_edges; + } + + for (const auto &vertex_entry: vertex_results.vertex_to_result) { + const auto &vertex_result = vertex_entry.second; + auto vertex = vertex_entry.first; + DEBUG("Updating link storage"); + if (graph_.is_complex(vertex)) { + for (const debruijn_graph::LinkId &link_id: graph_.links(vertex)) { + auto &link = graph_.link(link_id); + TRACE(link.link.first.int_id() << "," << link.link.second.int_id() << "," << link_id); + vertex_link_storage[link.link.first].insert(link.link.second); + } + } else { + for (const auto &in_edge: graph_.IncomingEdges(vertex)) { + for (const auto &out_edge: graph_.OutgoingEdges(vertex)) { + vertex_link_storage[in_edge].insert(out_edge); + } + } + } + DEBUG("Constructing path map"); + for (const auto &entry: vertex_result.supported_pairs) { + if (vertex_result.state == VertexState::Completely or vertex_result.state == VertexState::Partially) { + if (in_to_out.find(entry.first) == in_to_out.end()) { + TRACE(entry.first.int_id() << "," << entry.second.int_id()); + if (IsGraphLink(entry.first, entry.second, vertex_link_storage)) { + total_resolved_overlap += graph_.data(vertex_entry.first).overlap(); + ++graph_supported_links; + } else { + ++not_graph_supported_links; + } + in_to_out[entry.first] = entry.second; + in_degrees[entry.second]++; + out_degrees[entry.first]++; + } + } + } + } + + INFO("Total graph size: " << total_edges); + INFO("Total graph length: " << total_length); + INFO("Links not supported by graph: " << not_graph_supported_links); + INFO("Links supported by graph: " << graph_supported_links); + INFO("Total resolved overlap: " << total_resolved_overlap); + ScaffoldLinks result(in_degrees, out_degrees, in_to_out, vertex_link_storage); + return result; +} + +} diff --git a/src/projects/splitter/path_extractor.hpp b/src/projects/splitter/path_extractor.hpp new file mode 100644 index 0000000000..b7e9389fa0 --- /dev/null +++ b/src/projects/splitter/path_extractor.hpp @@ -0,0 +1,57 @@ +//*************************************************************************** +//* Copyright (c) 2023-2026 SPAdes team +//* Copyright (c) 2021-2022 Saint Petersburg State University +//* All Rights Reserved +//* See file LICENSE for details. +//*************************************************************************** + +#pragma once + +#include "assembly_graph/paths/bidirectional_path_container.hpp" +#include "assembly_graph/paths/bidirectional_path_io/bidirectional_path_output.hpp" +#include "barcode_index/barcode_info_extractor.hpp" +#include "io/graph/gfa_reader.hpp" + +#include "vertex_resolver.hpp" + +namespace cont_index { + +class PathExtractor { + typedef std::vector SimplePath; + typedef std::unordered_map> VertexLinkStorage; + public: + explicit PathExtractor(const debruijn_graph::Graph &graph) : graph_(graph) {} + + void ExtractPaths(path_extend::PathContainer &paths, const VertexResults &vertex_results) const; + private: + struct ScaffoldLinks { + typedef std::unordered_map DegreeMap; + DegreeMap in_degrees; + DegreeMap out_degrees; + std::unordered_map in_to_out; + std::unordered_map> vertex_link_storage; + + ScaffoldLinks(const DegreeMap &in_degrees, + const DegreeMap &out_degrees, + const std::unordered_map &in_to_out, + const std::unordered_map> &vertex_link_storage) + : in_degrees(in_degrees), + out_degrees(out_degrees), + in_to_out(in_to_out), + vertex_link_storage(vertex_link_storage) {} + }; + + bool IsConjugatePair(const SimplePath &first, const SimplePath &second) const; + bool IsGraphLink(debruijn_graph::EdgeId first, + debruijn_graph::EdgeId second, + const VertexLinkStorage &vertex_storage) const; + + ScaffoldLinks GetScaffoldLinks(const VertexResults &vertex_results) const; + + const debruijn_graph::Graph &graph_; + + DECL_LOGGER("PathExtractor"); +}; + +} \ No newline at end of file diff --git a/src/projects/splitter/scaffold_graph_helper.cpp b/src/projects/splitter/scaffold_graph_helper.cpp new file mode 100644 index 0000000000..f2c0db818b --- /dev/null +++ b/src/projects/splitter/scaffold_graph_helper.cpp @@ -0,0 +1,254 @@ +//*************************************************************************** +//* Copyright (c) 2023-2026 SPAdes team +//* Copyright (c) 2021-2022 Saint Petersburg State University +//* All Rights Reserved +//* See file LICENSE for details. +//*************************************************************************** + +#include "auxiliary_graphs/scaffold_graph/scaffold_vertex_index_builder.hpp" +//#include "modules/path_extend/read_cloud_path_extend/scaffold_graph_construction/read_cloud_connection_conditions.hpp" +#include "modules/path_extend/scaffolder2015/scaffold_graph_constructor.hpp" +#include "scaffold_graph_helper.hpp" + +namespace cont_index { + +scaffold_graph::ScaffoldGraph LinkIndexGraphConstructor::ConstructGraph() const { + scaffold_graph::ScaffoldGraph result(g_); + std::unordered_set scaffold_vertices; + for (const debruijn_graph::EdgeId &edge: g_.canonical_edges()) { + scaffold_vertices.insert(edge); + scaffold_vertices.insert(g_.conjugate(edge)); + } + auto score_function = ConstructScoreFunction(); + barcode_index::SimpleScaffoldVertexIndexBuilderHelper helper; + auto tail_threshold_getter = std::make_shared(tail_threshold_); + auto scaffold_vertex_index = helper.ConstructScaffoldVertexIndex(g_, + *barcode_extractor_, + tail_threshold_getter, + count_threshold_, + length_threshold_, + max_threads_, + scaffold_vertices); + + auto scaffold_index_extractor = + std::make_shared(scaffold_vertex_index); + INFO("Setting score index threshold to " << graph_score_threshold_); + + for (const auto &vertex: scaffold_vertices) { + result.AddVertex(vertex); + } + +// ReverseBarcodeIndexConstructor reverse_index_constructor(g_, barcode_extractor_, length_threshold_, tail_threshold_, +// count_threshold_, max_threads_); +// auto reverse_index = reverse_index_constructor.ConstructReverseIndex(scaffold_vertices); +// +// size_t total_head_size = 0; +// size_t total_tail_size = 0; +// for (const auto &vertex: scaffold_vertices) { +// total_head_size += scaffold_index_extractor->GetHeadSize(vertex); +// total_tail_size += scaffold_index_extractor->GetTailSize(vertex); +// } +// INFO("Total head size: " << total_head_size); +// INFO("Total tail size: " << total_tail_size); +// size_t total_pairs = 0; +// for (const auto &entry: reverse_index) { +// total_pairs += entry.second.size() * entry.second.size(); +// } + + std::vector chunks; + for (const auto &first: scaffold_vertices) { + chunks.emplace_back(first, scaffold_vertices.begin(), scaffold_vertices.end()); + } +// for (const auto &entry: reverse_index) { +// for (const auto &first: entry.second) { +// chunks.emplace_back(first, entry.second.begin(), entry.second.end()); +// } +// } + INFO(chunks.size() << " chunks"); + auto score_filter = std::make_shared(g_, chunks, + score_function, + graph_score_threshold_, + max_threads_); + return *(score_filter->Construct()); +} +LinkIndexGraphConstructor::LinkIndexGraphConstructor(const debruijn_graph::Graph &g, + LinkIndexGraphConstructor::BarcodeExtractorPtr barcode_extractor, + const double graph_score_threshold, + const size_t tail_threshold, + const size_t length_threshold, + const size_t count_threshold, + size_t max_threads) : g_(g), + barcode_extractor_(barcode_extractor), + graph_score_threshold_( + graph_score_threshold), + tail_threshold_(tail_threshold), + length_threshold_(length_threshold), + count_threshold_(count_threshold), + max_threads_(max_threads) {} +LinkIndexGraphConstructor::BarcodeScoreFunctionPtr LinkIndexGraphConstructor::ConstructScoreFunction() const { + std::set scaffold_vertices; + for (const debruijn_graph::EdgeId &edge: g_.canonical_edges()) { + scaffold_vertices.insert(edge); + scaffold_vertices.insert(g_.conjugate(edge)); + } + + barcode_index::SimpleScaffoldVertexIndexBuilderHelper helper; + auto tail_threshold_getter = std::make_shared(tail_threshold_); + auto scaffold_vertex_index = helper.ConstructScaffoldVertexIndex(g_, + *barcode_extractor_, + tail_threshold_getter, + count_threshold_, + length_threshold_, + max_threads_, + scaffold_vertices); + + auto scaffold_index_extractor = + std::make_shared(scaffold_vertex_index); + auto score_function = + std::make_shared(g_, scaffold_index_extractor, + count_threshold_, tail_threshold_); + return score_function; +} +GFAGraphConstructor::GFAGraphConstructor(const debruijn_graph::Graph &g, + const gfa::GFAReader &gfa, + io::IdMapper *id_mapper) : + g_(g), gfa_(gfa), id_mapper_(id_mapper) {} +scaffold_graph::ScaffoldGraph GFAGraphConstructor::ConstructGraphFromDBG() const { + scaffold_graph::ScaffoldGraph scaffold_graph(g_); + for (const EdgeId &edge: g_.canonical_edges()) { + scaffold_graph.AddVertex(edge); + } + for (const auto &vertex: g_.vertices()) { + for (const auto &outgoing: g_.OutgoingEdges(vertex)) { + for (const auto &incoming: g_.IncomingEdges(vertex)) { + scaffold_graph.AddEdge(incoming, outgoing, 0, 1.0, 0); + } + } + } + return scaffold_graph; +} + +ReverseBarcodeIndex ReverseBarcodeIndexConstructor::ConstructReverseIndex(const std::set &scaffold_vertices) const { + barcode_index::SimpleScaffoldVertexIndexBuilderHelper helper; + auto tail_threshold_getter = std::make_shared(tail_threshold_); + auto scaffold_vertex_index = helper.ConstructScaffoldVertexIndex(g_, *barcode_extractor_, tail_threshold_getter, + count_threshold_, length_threshold_, + max_threads_, scaffold_vertices); + auto scaffold_index_extractor = + std::make_shared(scaffold_vertex_index); + ReverseBarcodeIndex result; + for (const auto &vertex: scaffold_vertices) { + for (const auto &barcode: scaffold_index_extractor->GetHeadEntry(vertex)) { + result[barcode].insert(vertex); + } + for (const auto &barcode: scaffold_index_extractor->GetTailEntry(vertex)) { + result[barcode].insert(vertex); + } + } + double mean_barcode_size = .0; + double barcode_size_m2 = .0; + for (const auto &entry: result) { + auto entry_size = static_cast(entry.second.size()); + mean_barcode_size += entry_size; + barcode_size_m2 += entry_size * entry_size; + } + mean_barcode_size /= static_cast(result.size()); + barcode_size_m2 /= static_cast(result.size()); + INFO("Number of barcodes: " << result.size()); + INFO("Mean edges in barcode: " << mean_barcode_size); + INFO("Raw second moment of barcode edges: " << barcode_size_m2); + return result; +} +ReverseBarcodeIndexConstructor::ReverseBarcodeIndexConstructor(const debruijn_graph::Graph &g, + BarcodeExtractorPtr barcode_extractor, + const size_t length_threshold, + const size_t tail_threshold, + const size_t count_threshold, + size_t max_threads) : + g_(g), + barcode_extractor_(barcode_extractor), + length_threshold_(length_threshold), + tail_threshold_(tail_threshold), + count_threshold_(count_threshold), + max_threads_(max_threads) {} +scaffold_graph::ScaffoldGraph ScaffoldGraphSerializer::ReadGraph(const string &path_to_graph) { + scaffold_graph::ScaffoldGraph result(g_); + std::unordered_map id_to_vertex; + for (const debruijn_graph::EdgeId &edge: g_.canonical_edges()) { + auto str_id = (*id_mapper_)[edge.int_id()]; + scaffold_graph::ScaffoldVertex vertex(edge); + scaffold_graph::ScaffoldVertex conj_vertex(g_.conjugate(edge)); + id_to_vertex.emplace(str_id, vertex); + id_to_vertex.emplace(str_id + "\'", conj_vertex); + result.AddVertex(vertex); + result.AddVertex(conj_vertex); + } + + size_t number_of_edges; + std::ifstream graph_reader(path_to_graph); + graph_reader >> number_of_edges; + size_t i = 0; + std::string first_id, second_id; + double weight; + //fixme optimize link deduplication in scaffold graph itself + std::set> unique_links; + while (i < number_of_edges) { + graph_reader >> first_id >> second_id >> weight; + auto first_vertex = id_to_vertex.at(first_id); + auto second_vertex = id_to_vertex.at(second_id); + auto emplace_result = unique_links.emplace(first_vertex, second_vertex); + if (emplace_result.second) { + scaffold_graph::ScaffoldGraph::ScaffoldEdge sc_edge(first_vertex, second_vertex, 0, weight, 0); + result.AddEdgeSimple(sc_edge); + } + ++i; + } + return result; +} +void ScaffoldGraphSerializer::WriteGraph(const scaffold_graph::ScaffoldGraph &scaffold_graph, const std::string &path_to_graph) const { + std::ofstream os(path_to_graph); + os << scaffold_graph.EdgeCount() << "\n"; +// os << "FirstId\tSecondId\tWeight\n"; + for (const scaffold_graph::ScaffoldGraph::ScaffoldEdge &edge: scaffold_graph.edges()) { + os << (*id_mapper_)[edge.getStart().int_id()] << "\t" << (*id_mapper_)[edge.getEnd().int_id()] << "\t" << edge.getWeight() << "\n"; + } +} +ScaffoldGraphSerializer::ScaffoldGraphSerializer(const debruijn_graph::Graph &g, io::IdMapper *id_mapper) : + g_(g), id_mapper_(id_mapper) {} +scaffold_graph::ScaffoldGraph GetTellSeqScaffoldGraph(const debruijn_graph::Graph &g, + BarcodeExtractorPtr barcode_extractor, + double score_threshold, + size_t length_threshold, + size_t tail_threshold, + size_t count_threshold, + size_t max_threads, + bool bin_load, + bool debug, + const std::filesystem::path &output_dir, + io::IdMapper *id_mapper) { + auto path_to_scaffold_graph = output_dir / "tellseq_links.scg"; + scaffold_graph::ScaffoldGraph scaffold_graph(g); + if (!bin_load or !std::filesystem::exists(path_to_scaffold_graph)) { + LinkIndexGraphConstructor link_index_constructor(g, + barcode_extractor, + score_threshold, + tail_threshold, + length_threshold, + count_threshold, + max_threads); + INFO("Constructing scaffold graph"); + scaffold_graph = link_index_constructor.ConstructGraph(); + } else { + INFO("Reading scaffold graph from " << path_to_scaffold_graph); + ScaffoldGraphSerializer graph_serializer(g, id_mapper); + scaffold_graph = graph_serializer.ReadGraph(path_to_scaffold_graph); + } + INFO(scaffold_graph.VertexCount() << " vertices and " << scaffold_graph.EdgeCount() + << " edges in scaffold graph"); + if (debug) { + ScaffoldGraphSerializer graph_serializer(g, id_mapper); + graph_serializer.WriteGraph(scaffold_graph, path_to_scaffold_graph); + } + return scaffold_graph; +} +} diff --git a/src/projects/splitter/scaffold_graph_helper.hpp b/src/projects/splitter/scaffold_graph_helper.hpp new file mode 100644 index 0000000000..7648f37ca5 --- /dev/null +++ b/src/projects/splitter/scaffold_graph_helper.hpp @@ -0,0 +1,117 @@ +//*************************************************************************** +//* Copyright (c) 2023-2026 SPAdes team +//* Copyright (c) 2021-2022 Saint Petersburg State University +//* All Rights Reserved +//* See file LICENSE for details. +//*************************************************************************** + +#pragma once + +#include "assembly_graph/core/graph.hpp" +#include "auxiliary_graphs/scaffold_graph/scaffold_graph.hpp" +#include "barcode_index/barcode_index_builder.hpp" +#include "barcode_index/barcode_info_extractor.hpp" +#include "io/graph/gfa_reader.hpp" +#include "library/library.hpp" +#include "library/library_data.hpp" +#include "modules/path_extend/read_cloud_path_extend/scaffold_graph_construction/read_cloud_connection_conditions.hpp" +#include "modules/path_extend/scaffolder2015/scaffold_graph_constructor.hpp" + +#include + +namespace cont_index { + +typedef std::unordered_map> ReverseBarcodeIndex; + +class LinkIndexGraphConstructor { + public: + using Graph = debruijn_graph::Graph; + using BarcodeExtractorPtr = std::shared_ptr; + using BarcodeScoreFunctionPtr = std::shared_ptr; + + LinkIndexGraphConstructor(const Graph &g, + BarcodeExtractorPtr barcode_extractor, + const double graph_score_threshold, + const size_t tail_threshold, + const size_t length_threshold, + const size_t count_threshold, + size_t max_threads); + + scaffold_graph::ScaffoldGraph ConstructGraph() const; + + BarcodeScoreFunctionPtr ConstructScoreFunction() const; + + private: + const debruijn_graph::Graph &g_; + BarcodeExtractorPtr barcode_extractor_; + const double graph_score_threshold_; + const size_t tail_threshold_; + const size_t length_threshold_; + const size_t count_threshold_; + size_t max_threads_; +}; + +class ReverseBarcodeIndexConstructor { + public: + using ScaffoldVertex = scaffold_graph::ScaffoldVertex; + using BarcodeExtractorPtr = std::shared_ptr; + ReverseBarcodeIndexConstructor(const debruijn_graph::Graph &g, + BarcodeExtractorPtr barcode_extractor, + const size_t length_threshold, + const size_t tail_threshold, + const size_t count_threshold, + size_t max_threads); + + ReverseBarcodeIndex ConstructReverseIndex(const std::set &scaffold_vertices) const; + private: + const debruijn_graph::Graph &g_; + BarcodeExtractorPtr barcode_extractor_; + const size_t length_threshold_; + const size_t tail_threshold_; + const size_t count_threshold_; + size_t max_threads_; +}; + +class ScaffoldGraphSerializer { + public: + ScaffoldGraphSerializer(const debruijn_graph::Graph &g, io::IdMapper *id_mapper); + + scaffold_graph::ScaffoldGraph ReadGraph(const std::string &path_to_graph); + void WriteGraph(const scaffold_graph::ScaffoldGraph &scaffold_graph, const std::string &path_to_graph) const; + + private: + const debruijn_graph::Graph &g_; + io::IdMapper *id_mapper_; +}; + +class GFAGraphConstructor { + public: + using Graph = debruijn_graph::Graph; + using EdgeId = debruijn_graph::EdgeId; + using BarcodeExtractorPtr = std::shared_ptr; + + GFAGraphConstructor(const Graph &g, + const gfa::GFAReader &gfa, + io::IdMapper *id_mapper); + + scaffold_graph::ScaffoldGraph ConstructGraphFromDBG() const; + + private: + const debruijn_graph::Graph &g_; + const gfa::GFAReader &gfa_; + io::IdMapper *id_mapper_; +}; + +using BarcodeExtractorPtr = std::shared_ptr; +scaffold_graph::ScaffoldGraph GetTellSeqScaffoldGraph(const debruijn_graph::Graph &g, + BarcodeExtractorPtr barcode_extractor, + double score_threshold, + size_t length_threshold, + size_t tail_threshold, + size_t count_threshold, + size_t max_threads, + bool bin_load, + bool debug, + const std::filesystem::path &output_dir, + io::IdMapper *id_mapper); +} diff --git a/src/projects/splitter/vertex_resolver.hpp b/src/projects/splitter/vertex_resolver.hpp new file mode 100644 index 0000000000..25de68844a --- /dev/null +++ b/src/projects/splitter/vertex_resolver.hpp @@ -0,0 +1,292 @@ +//*************************************************************************** +//* Copyright (c) 2023-2026 SPAdes team +//* Copyright (c) 2021-2022 Saint Petersburg State University +//* All Rights Reserved +//* See file LICENSE for details. +//*************************************************************************** + +#pragma once + +#include "barcode_index/barcode_info_extractor.hpp" +#include "common/sequence/rtseq.hpp" +#include "io/graph/gfa_reader.hpp" + +#include "scaffold_graph_helper.hpp" + +namespace cont_index { + +enum class VertexState { + Completely, + Partially, + Ambiguous, + Uncovered +}; + +struct VertexResult { + VertexResult(VertexState state, + const size_t &total_links, + const size_t &supporting_links, + const std::unordered_map &supported_pairs) : + state(state), + total_links(total_links), + supporting_links(supporting_links), + supported_pairs(supported_pairs) {} + + VertexState state; + size_t total_links; + size_t supporting_links; + std::unordered_map supported_pairs; +}; + +struct VertexResults { + VertexResults(const std::unordered_map &vertex_to_result) : + vertex_to_result(vertex_to_result) {} + + std::unordered_map vertex_to_result; +}; + +template +class VertexResolver { + public: + typedef std::unordered_map ResolutionResults; + typedef typename Graph::EdgeId EdgeId; + typedef std::unordered_map> LinkMap; + + VertexResolver(Graph &graph, + const debruijn_graph::Graph &assembly_graph, + const LinkMap &links, + const std::shared_ptr &barcode_extractor_ptr, + size_t count_threshold, + size_t tail_threshold, + size_t length_threshold, + size_t threads, + double score_threshold, + double rel_threshold) : + graph_(graph), + assembly_graph_(assembly_graph), + links_(links), + barcode_extractor_ptr_(barcode_extractor_ptr), + count_threshold_(count_threshold), + tail_threshold_(tail_threshold), + length_threshold_(length_threshold), + threads_(threads), + score_threshold_(score_threshold), + rel_threshold_(rel_threshold) {} + + VertexResults ResolveVertices() { + std::unordered_set interesting_vertices; + size_t total_in_edges = 0; + size_t total_out_edges = 0; + for (const auto &vertex: graph_.vertices()) { + if (vertex.int_id() > graph_.conjugate(vertex).int_id()) { + continue; + } + //todo use predicate iterator + if (graph_.OutgoingEdgeCount(vertex) >= 2 and graph_.IncomingEdgeCount(vertex) >= 2) { + interesting_vertices.insert(vertex); + total_in_edges += graph_.IncomingEdgeCount(vertex); + total_out_edges += graph_.OutgoingEdgeCount(vertex); + } + } + INFO(interesting_vertices.size() << " complex vertices"); + INFO("Total indegree: " << total_in_edges << ", total outdegree: " << total_out_edges); + LinkIndexGraphConstructor link_index_constructor(assembly_graph_, barcode_extractor_ptr_, score_threshold_, + tail_threshold_, length_threshold_, count_threshold_, threads_); + auto score_function = link_index_constructor.ConstructScoreFunction(); + INFO("Constructed score function"); + + std::unordered_map vertex_to_result; + for (const auto &vertex: interesting_vertices) { + auto vertex_result = ResolveVertex(vertex, score_function); + vertex_to_result.insert({vertex, vertex_result}); + } + return vertex_to_result; + } + + VertexResult ResolveVertex(debruijn_graph::VertexId vertex, + LinkIndexGraphConstructor::BarcodeScoreFunctionPtr score_function) const { + size_t total_links = 0; + size_t answer_links = 0; + std::unordered_map in_to_out; + bool is_ambiguous = false; + std::unordered_set covered_vertices; + double LINK_BONUS = 1000000; + + for (const EdgeId &sc_in_edge: graph_.IncomingEdges(vertex)) { + //convert to dbg EdgeId + scaffold_graph::ScaffoldVertex sc_in_vertex(sc_in_edge); + scaffold_graph::EdgeGetter edge_getter; + debruijn_graph::EdgeId in_edge = edge_getter.GetEdgeFromScaffoldVertex(sc_in_vertex); + + std::pair max_pair(0, 0); + std::pair second_pair(0, 0); + size_t max_links = 0; + size_t second_links = 0; + for (const EdgeId &sc_out_edge: graph_.OutgoingEdges(vertex)) { + scaffold_graph::ScaffoldVertex sc_out_vertex(sc_out_edge); + debruijn_graph::EdgeId out_edge = edge_getter.GetEdgeFromScaffoldVertex(sc_out_vertex); + if (in_edge == out_edge or in_edge == assembly_graph_.conjugate(out_edge)) { + continue; + } + + scaffold_graph::ScaffoldGraph::ScaffoldEdge sc_edge(in_edge, out_edge); + auto score = score_function->GetScore(sc_edge); + auto link_result = links_.find(in_edge); + if (link_result != links_.end() and link_result->second.find(out_edge) != link_result->second.end()) { + score += LINK_BONUS; + } + total_links += static_cast(score); + if (math::ge(score, score_threshold_)) { + covered_vertices.insert(vertex); + if (score > static_cast(max_links)) { + second_pair = max_pair; + second_links = max_links; + max_links = static_cast(score); + max_pair = std::make_pair(in_edge, out_edge); + } + } + } + if (static_cast(max_links) < static_cast(second_links) * rel_threshold_) { + is_ambiguous = true; + } else if (static_cast(max_links) >= score_threshold_) { + in_to_out[max_pair.first] = max_pair.second; + answer_links += max_links; + } + } + bool is_covered = covered_vertices.find(vertex) != covered_vertices.end(); + VertexState state = GetState(in_to_out, vertex, is_ambiguous, is_covered); + VertexResult result(state, total_links, answer_links, in_to_out); + return result; + } + + void PrintVertexResults(const VertexResults &results, + const std::filesystem::path &output_path, + io::IdMapper *id_mapper) const { + std::ofstream ver_stream(output_path); + ver_stream << + "Vertex Id\tInDegree\tInEdges\tOutDegree\tOutEdges\tCovered edges\tVertex result\tSupported paths\tTotal links\tAnswer links\tAnswer\n"; + size_t uncovered = 0; + size_t ambiguous = 0; + size_t partially = 0; + size_t completely = 0; + for (const auto &entry: results.vertex_to_result) { + const auto &vertex_results = entry.second; + switch (vertex_results.state) { + case VertexState::Uncovered: + ++uncovered; + break; + case VertexState::Ambiguous: + ++ambiguous; + break; + case VertexState::Partially: + ++partially; + break; + case VertexState::Completely: + ++completely; + break; + } + ver_stream << VertexResultString(entry.first, vertex_results, id_mapper) << std::endl; + } + INFO(uncovered << " uncovered vertices"); + INFO(ambiguous << " ambiguous vertices"); + INFO(partially << " partially resolved vertices"); + INFO(completely << " completely resolved vertices"); + } + + std::string VertexResultString(debruijn_graph::VertexId vertex, + const VertexResult &vertex_result, + io::IdMapper *id_mapper) const { + std::string result_string; + switch (vertex_result.state) { + case VertexState::Uncovered: + result_string = "Uncovered"; + break; + case VertexState::Ambiguous: + result_string = "Ambiguous"; + break; + case VertexState::Partially: + result_string = "Partially"; + break; + case VertexState::Completely: + result_string = "Completely"; + break; + } + std::string answer_string; + for (const auto &entry: vertex_result.supported_pairs) { + answer_string += (*id_mapper)[entry.first.int_id()] + "#" + (*id_mapper)[entry.second.int_id()] + ","; + } + std::string in_edge_string, out_edge_string; + for (const EdgeId &edge: graph_.IncomingEdges(vertex)) { + in_edge_string += (*id_mapper)[edge.int_id()] + ","; + } + for (const EdgeId &edge: graph_.OutgoingEdges(vertex)) { + out_edge_string += (*id_mapper)[edge.int_id()] + ","; + } + in_edge_string = in_edge_string.substr(0, in_edge_string.size() - 1); + out_edge_string = out_edge_string.substr(0, out_edge_string.size() - 1); + answer_string = answer_string.substr(0, answer_string.size() - 1); + std::string vertex_string; + vertex_string += + std::to_string(vertex.int_id()) + "\t" + std::to_string(graph_.IncomingEdgeCount(vertex)) + "\t" + + in_edge_string + "\t"; + vertex_string += + std::to_string(graph_.OutgoingEdgeCount(vertex)) + "\t" + out_edge_string + "\t" + result_string + "\t"; + vertex_string += + std::to_string(vertex_result.supported_pairs.size()) + "\t" + std::to_string(vertex_result.total_links); + vertex_string += "\t" + std::to_string(vertex_result.supporting_links) + "\t" + answer_string; + return vertex_string; + } + private: + VertexState GetState(std::unordered_map &in_to_out, + debruijn_graph::VertexId vertex, + bool is_ambiguous, + bool is_covered) const { + std::unordered_set in_edges; + std::unordered_set out_edges; + for (const auto &entry: in_to_out) { + in_edges.insert(entry.first); + out_edges.insert(entry.second); + } + if (is_ambiguous or in_edges.size() > out_edges.size()) { + std::unordered_map outedge_to_indegree; + for (const auto &entry: in_to_out) { + outedge_to_indegree[entry.second]++; + } + std::unordered_map new_in_to_out; + for (const auto &entry: in_to_out) { + auto outedge = entry.second; + if (outedge_to_indegree.at(outedge) == 1) { + new_in_to_out[entry.first] = entry.second; + } + } + if (not new_in_to_out.empty()) { + in_to_out = std::move(new_in_to_out); + return VertexState::Partially; + } else { + return VertexState::Ambiguous; + } + } + if (not is_covered) { + return VertexState::Uncovered; + } else { + if (in_edges.size() == graph_.IncomingEdgeCount(vertex)) { + return VertexState::Completely; + } else { + return VertexState::Partially; + } + } + } + + Graph &graph_; + const debruijn_graph::Graph &assembly_graph_; + const LinkMap links_; + std::shared_ptr barcode_extractor_ptr_; + size_t count_threshold_; + size_t tail_threshold_; + size_t length_threshold_; + size_t threads_; + double score_threshold_; + double rel_threshold_; +}; + +} \ No newline at end of file From d5ec3ff06d38f5cf8821e50c8999e289d9872132 Mon Sep 17 00:00:00 2001 From: Itolstoganov Date: Fri, 30 Jan 2026 15:28:19 +0100 Subject: [PATCH 02/17] Scaffold graph fixes --- src/common/auxiliary_graphs/CMakeLists.txt | 1 + .../barcode_index/barcode_info_extractor.hpp | 2 +- src/common/modules/CMakeLists.txt | 2 +- src/common/modules/path_extend/CMakeLists.txt | 8 +- .../modules/path_extend/pipeline/launcher.cpp | 19 +- .../modules/path_extend/pipeline/launcher.hpp | 7 +- .../read_cloud_connection_conditions.cpp | 12 +- .../scaffolder2015/scaffold_graph.cpp | 264 ------------------ .../scaffolder2015/scaffold_graph.hpp | 233 ---------------- .../scaffold_graph_constructor.cpp | 10 +- .../scaffold_graph_constructor.hpp | 3 +- .../splitter/scaffold_graph_helper.cpp | 4 +- 12 files changed, 39 insertions(+), 526 deletions(-) delete mode 100644 src/common/modules/path_extend/scaffolder2015/scaffold_graph.cpp delete mode 100644 src/common/modules/path_extend/scaffolder2015/scaffold_graph.hpp diff --git a/src/common/auxiliary_graphs/CMakeLists.txt b/src/common/auxiliary_graphs/CMakeLists.txt index f5c0545501..11f5a93d77 100644 --- a/src/common/auxiliary_graphs/CMakeLists.txt +++ b/src/common/auxiliary_graphs/CMakeLists.txt @@ -14,6 +14,7 @@ add_library(auxiliary_graphs STATIC contracted_graph/graph_condensation.cpp contracted_graph/contracted_statistics.cpp scaffold_graph/scaffold_vertex.cpp + scaffold_graph/scaffold_graph.cpp ) target_link_libraries(auxiliary_graphs assembly_graph) diff --git a/src/common/barcode_index/barcode_info_extractor.hpp b/src/common/barcode_index/barcode_info_extractor.hpp index 1f4969c355..99b53205aa 100644 --- a/src/common/barcode_index/barcode_info_extractor.hpp +++ b/src/common/barcode_index/barcode_info_extractor.hpp @@ -235,7 +235,7 @@ namespace barcode_index { } bool operator!= (const const_intersection_iterator& other) { - return *this != other; + return !(*this == other); } private: diff --git a/src/common/modules/CMakeLists.txt b/src/common/modules/CMakeLists.txt index aed8e83d14..64586cba57 100644 --- a/src/common/modules/CMakeLists.txt +++ b/src/common/modules/CMakeLists.txt @@ -13,4 +13,4 @@ add_library(modules STATIC genome_consistance_checker.cpp path_extend/scaff_supplementary.cpp) -target_link_libraries(modules sequence bwa assembly_graph alignment) +target_link_libraries(modules sequence bwa assembly_graph alignment auxiliary_graphs) diff --git a/src/common/modules/path_extend/CMakeLists.txt b/src/common/modules/path_extend/CMakeLists.txt index 93eb49a5dd..f8e8a89930 100644 --- a/src/common/modules/path_extend/CMakeLists.txt +++ b/src/common/modules/path_extend/CMakeLists.txt @@ -19,12 +19,12 @@ add_library(path_extend STATIC pipeline/launcher.cpp pipeline/extenders_logic.cpp scaffolder2015/extension_chooser2015.cpp - scaffolder2015/scaffold_graph.cpp scaffolder2015/scaffold_graph_constructor.cpp scaffolder2015/scaffold_graph_visualizer.cpp - scaffolder2015/connection_condition2015.cpp - scaffolder2015/path_polisher.cpp) + scaffolder2015/connection_condition2015.cpp + scaffolder2015/path_polisher.cpp + read_cloud_path_extend/scaffold_graph_construction/read_cloud_connection_conditions.cpp) -target_link_libraries(path_extend modules assembly_graph ssw configs) +target_link_libraries(path_extend modules assembly_graph auxiliary_graphs ssw configs) diff --git a/src/common/modules/path_extend/pipeline/launcher.cpp b/src/common/modules/path_extend/pipeline/launcher.cpp index 3ac0954f25..14edbf98ee 100644 --- a/src/common/modules/path_extend/pipeline/launcher.cpp +++ b/src/common/modules/path_extend/pipeline/launcher.cpp @@ -56,7 +56,7 @@ PathExtendLauncher::ConstructPairedConnectionConditions(const ScaffoldingUniqueE return conditions; } -std::shared_ptr PathExtendLauncher::ConstructScaffoldGraph(const ScaffoldingUniqueEdgeStorage &edge_storage) const { +std::shared_ptr PathExtendLauncher::ConstructScaffoldGraph(const ScaffoldingUniqueEdgeStorage &edge_storage) const { using namespace scaffolder; const pe_config::ParamSetT::ScaffoldGraphParamsT ¶ms = params_.pset.scaffold_graph_params; @@ -84,18 +84,25 @@ std::shared_ptr PathExtendLauncher::ConstructScaffold return scaffold_graph; } -void PathExtendLauncher::PrintScaffoldGraph(const scaffolder::ScaffoldGraph &scaffold_graph, +void PathExtendLauncher::PrintScaffoldGraph(const scaffold_graph::ScaffoldGraph &scaffold_graph, const std::set &main_edge_set, const debruijn_graph::GenomeConsistenceChecker &genome_checker, const std::filesystem::path &filename) const { using namespace scaffolder; - - auto vertex_colorer = std::make_shared(main_edge_set); + std::set scaff_vertex_set; + for (const auto& edge: main_edge_set) { + scaff_vertex_set.insert(edge); + } + auto vertex_colorer = std::make_shared(scaff_vertex_set); auto edge_colorer = std::make_shared(); - graph_colorer::CompositeGraphColorer colorer(vertex_colorer, edge_colorer); + graph_colorer::CompositeGraphColorer colorer(vertex_colorer, edge_colorer); INFO("Visualizing scaffold graph"); - ScaffoldGraphVisualizer singleVisualizer(scaffold_graph, genome_checker.EdgeLabels()); + std::map scaff_vertex_labels; + for (const auto& entry: genome_checker.EdgeLabels()) { + scaff_vertex_labels.insert({entry.first, entry.second}); + } + ScaffoldGraphVisualizer singleVisualizer(scaffold_graph, scaff_vertex_labels); std::ofstream single_dot; single_dot.open(filename.native() + "_single.dot"); singleVisualizer.Visualize(single_dot, colorer); diff --git a/src/common/modules/path_extend/pipeline/launcher.hpp b/src/common/modules/path_extend/pipeline/launcher.hpp index 14aadf2dcf..f29bafedab 100644 --- a/src/common/modules/path_extend/pipeline/launcher.hpp +++ b/src/common/modules/path_extend/pipeline/launcher.hpp @@ -12,11 +12,12 @@ #include "launch_support.hpp" #include "modules/path_extend/pe_resolver.hpp" -#include "modules/path_extend/scaffolder2015/scaffold_graph.hpp" +#include "modules/path_extend/scaffolder2015/connection_condition2015.hpp" #include "modules/genome_consistance_checker.hpp" #include "alignment/rna/ss_coverage.hpp" #include "assembly_graph/paths/bidirectional_path_io/bidirectional_path_output.hpp" +#include "auxiliary_graphs/scaffold_graph/scaffold_graph.hpp" namespace path_extend { @@ -37,10 +38,10 @@ class PathExtendLauncher { std::vector> ConstructPairedConnectionConditions(const ScaffoldingUniqueEdgeStorage &edge_storage) const; - std::shared_ptr + std::shared_ptr ConstructScaffoldGraph(const ScaffoldingUniqueEdgeStorage &edge_storage) const; - void PrintScaffoldGraph(const scaffolder::ScaffoldGraph &scaffold_graph, + void PrintScaffoldGraph(const scaffold_graph::ScaffoldGraph &scaffold_graph, const std::set &main_edge_set, const debruijn_graph::GenomeConsistenceChecker &genome_checker, const std::filesystem::path &filename) const; diff --git a/src/common/modules/path_extend/read_cloud_path_extend/scaffold_graph_construction/read_cloud_connection_conditions.cpp b/src/common/modules/path_extend/read_cloud_path_extend/scaffold_graph_construction/read_cloud_connection_conditions.cpp index 452f66782f..d5a44bd273 100644 --- a/src/common/modules/path_extend/read_cloud_path_extend/scaffold_graph_construction/read_cloud_connection_conditions.cpp +++ b/src/common/modules/path_extend/read_cloud_path_extend/scaffold_graph_construction/read_cloud_connection_conditions.cpp @@ -11,7 +11,7 @@ namespace path_extend { namespace read_cloud { -double NormalizedBarcodeScoreFunction::GetScore(const scaffolder::ScaffoldGraph::ScaffoldEdge &edge) const { +double NormalizedBarcodeScoreFunction::GetScore(const scaffold_graph::ScaffoldGraph::ScaffoldEdge &edge) const { auto first = edge.getStart(); auto second = edge.getEnd(); DEBUG("Checking edge " << edge.getStart().int_id() << " -> " << edge.getEnd().int_id()); @@ -44,7 +44,7 @@ NormalizedBarcodeScoreFunction::NormalizedBarcodeScoreFunction( std::shared_ptr barcode_extractor_) : AbstractBarcodeScoreFunction(graph_, barcode_extractor_) {} -TransitiveEdgesPredicate::TransitiveEdgesPredicate(const scaffolder::ScaffoldGraph &graph, +TransitiveEdgesPredicate::TransitiveEdgesPredicate(const scaffold_graph::ScaffoldGraph &graph, const Graph &g, size_t distance_threshold) : scaffold_graph_(graph), g_(g), distance_threshold_(distance_threshold) {} @@ -64,7 +64,7 @@ bool TransitiveEdgesPredicate::Check(const ScaffoldEdgePredicate::ScaffoldEdge & DEBUG("True"); return true; } -SimpleSearcher::SimpleSearcher(const scaffolder::ScaffoldGraph &graph, const Graph &g, size_t distance) +SimpleSearcher::SimpleSearcher(const scaffold_graph::ScaffoldGraph &graph, const Graph &g, size_t distance) : scaff_graph_(graph), g_(g), distance_threshold_(distance) {} std::vector SimpleSearcher::GetReachableVertices( const SimpleSearcher::ScaffoldVertex &vertex, @@ -109,8 +109,8 @@ void SimpleSearcher::ProcessVertex(std::queue &vertex_queue, } } } -bool SimpleSearcher::AreEqual(const scaffolder::ScaffoldGraph::ScaffoldEdge &first, - const scaffolder::ScaffoldGraph::ScaffoldEdge &second) { +bool SimpleSearcher::AreEqual(const scaffold_graph::ScaffoldGraph::ScaffoldEdge &first, + const scaffold_graph::ScaffoldGraph::ScaffoldEdge &second) { return first.getStart() == second.getStart() and first.getEnd() == second.getEnd(); } @@ -131,7 +131,7 @@ TrivialBarcodeScoreFunction::TrivialBarcodeScoreFunction( barcode_extractor_), read_count_threshold_(read_count_threshold_), tail_threshold_(tail_threshold_) {} -double TrivialBarcodeScoreFunction::GetScore(const scaffolder::ScaffoldGraph::ScaffoldEdge &edge) const { +double TrivialBarcodeScoreFunction::GetScore(const scaffold_graph::ScaffoldGraph::ScaffoldEdge &edge) const { size_t shared_count = barcode_extractor_->GetIntersectionSize(edge.getStart(), edge.getEnd()); return static_cast(shared_count); diff --git a/src/common/modules/path_extend/scaffolder2015/scaffold_graph.cpp b/src/common/modules/path_extend/scaffolder2015/scaffold_graph.cpp deleted file mode 100644 index e1bcb713e5..0000000000 --- a/src/common/modules/path_extend/scaffolder2015/scaffold_graph.cpp +++ /dev/null @@ -1,264 +0,0 @@ - -//*************************************************************************** -//* Copyright (c) 2023-2024 SPAdes team -//* All Rights Reserved -//* See file LICENSE for details. -//*************************************************************************** - -#include "scaffold_graph.hpp" - - -namespace path_extend { -namespace scaffold_graph { - -std::atomic ScaffoldGraph::ScaffoldEdge::scaffold_edge_id_{0}; - -void ScaffoldGraph::AddEdgeSimple(const ScaffoldGraph::ScaffoldEdge &e) { - edges_.emplace(e.getId(), e); - outgoing_edges_.emplace(e.getStart(), e.getId()); - incoming_edges_.emplace(e.getEnd(), e.getId()); -} - -void ScaffoldGraph::DeleteOutgoing(const ScaffoldGraph::ScaffoldEdge &e) { - auto e_range = outgoing_edges_.equal_range(e.getStart()); - for (auto edge_id = e_range.first; edge_id != e_range.second; ++edge_id) { - if (edges_.at(edge_id->second) == e) { - outgoing_edges_.erase(edge_id); - } - } -} - -void ScaffoldGraph::DeleteIncoming(const ScaffoldGraph::ScaffoldEdge &e) { - auto e_range = incoming_edges_.equal_range(e.getEnd()); - for (auto edge_id = e_range.first; edge_id != e_range.second; ++edge_id) { - if (edges_.at(edge_id->second) == e) { - incoming_edges_.erase(edge_id); - } - } -} - -void ScaffoldGraph::DeleteAllOutgoingEdgesSimple(ScaffoldGraph::ScaffoldVertex v) { - auto e_range = outgoing_edges_.equal_range(v); - for (auto edge_id = e_range.first; edge_id != e_range.second; ++edge_id) { - DeleteIncoming(edges_.at(edge_id->second)); - } - outgoing_edges_.erase(v); -} - -void ScaffoldGraph::DeleteEdgeFromStorage(const ScaffoldGraph::ScaffoldEdge &e) { - VERIFY(!Exists(e)); - edges_.erase(e.getId()); -} - -void ScaffoldGraph::DeleteAllIncomingEdgesSimple(ScaffoldGraph::ScaffoldVertex v) { - auto e_range = incoming_edges_.equal_range(v); - for (auto edge_id = e_range.first; edge_id != e_range.second; ++edge_id) { - DeleteOutgoing(edges_.at(edge_id->second)); - } - incoming_edges_.erase(v); -} - -bool ScaffoldGraph::Exists(ScaffoldGraph::ScaffoldVertex assembly_graph_edge) const { - return vertices_.count(assembly_graph_edge) != 0; -} - -bool ScaffoldGraph::Exists(const ScaffoldGraph::ScaffoldEdge &e) const { - auto e_range = outgoing_edges_.equal_range(e.getStart()); - for (auto edge_id = e_range.first; edge_id != e_range.second; ++edge_id) { - if (edges_.at(edge_id->second) == e) { - return true; - } - } - return false; -} - -ScaffoldGraph::ScaffoldVertex ScaffoldGraph::conjugate(ScaffoldGraph::ScaffoldVertex assembly_graph_edge) const { - return assembly_graph_.conjugate(assembly_graph_edge); -} - -ScaffoldGraph::ScaffoldEdge ScaffoldGraph::conjugate(const ScaffoldGraph::ScaffoldEdge &e) const { - return ScaffoldEdge(conjugate(e.getEnd()), conjugate(e.getStart()), e.getColor(), e.getWeight()); -} - -bool ScaffoldGraph::AddVertex(ScaffoldGraph::ScaffoldVertex assembly_graph_edge) { - if (!Exists(assembly_graph_edge)) { - VERIFY(!Exists(conjugate(assembly_graph_edge))); - vertices_.insert(assembly_graph_edge); - vertices_.insert(conjugate(assembly_graph_edge)); - return true; - } - return false; -} - -void ScaffoldGraph::AddVertices(const std::set &vertices) { - for (auto v : vertices) { - AddVertex(v); - } -} - -bool ScaffoldGraph::AddEdge(ScaffoldGraph::ScaffoldVertex v1, ScaffoldGraph::ScaffoldVertex v2, size_t lib_id, double weight) { - VERIFY(Exists(v1)); - VERIFY(Exists(v2)); - - ScaffoldEdge e(v1, v2, lib_id, weight); - if (Exists(e)) { - return false; - } - - - AddEdgeSimple(e); - return true; -} - -void ScaffoldGraph::Print(std::ostream &os) const { - for (auto v: vertices_) { - os << "Vertex " << int_id(v) << " ~ " << int_id(conjugate(v)) - << ": len = " << assembly_graph_.length(v) << ", cov = " << assembly_graph_.coverage(v) << std::endl; - } - for (auto e_iter = edges_.begin(); e_iter != edges_.end(); ++e_iter) { - os << "Edge " << e_iter->second.getId() << - ": " << int_id(e_iter->second.getStart()) << " -> " << int_id(e_iter->second.getEnd()) << - ", lib index = " << e_iter->second.getColor() << ", weight " << e_iter->second.getWeight() << std::endl; - } -} - -ScaffoldGraph::ScaffoldEdge ScaffoldGraph::UniqueIncoming(ScaffoldGraph::ScaffoldVertex assembly_graph_edge) const { - VERIFY(HasUniqueIncoming(assembly_graph_edge)); - return edges_.at(incoming_edges_.find(assembly_graph_edge)->second); -} - -ScaffoldGraph::ScaffoldEdge ScaffoldGraph::UniqueOutgoing(ScaffoldGraph::ScaffoldVertex assembly_graph_edge) const { - VERIFY(HasUniqueOutgoing(assembly_graph_edge)); - return edges_.at(outgoing_edges_.find(assembly_graph_edge)->second); -} - -bool ScaffoldGraph::HasUniqueIncoming(ScaffoldGraph::ScaffoldVertex assembly_graph_edge) const { - return IncomingEdgeCount(assembly_graph_edge) == 1; -} - -bool ScaffoldGraph::HasUniqueOutgoing(ScaffoldGraph::ScaffoldVertex assembly_graph_edge) const { - return OutgoingEdgeCount(assembly_graph_edge) == 1; -} - -size_t ScaffoldGraph::IncomingEdgeCount(ScaffoldGraph::ScaffoldVertex assembly_graph_edge) const { - return incoming_edges_.count(assembly_graph_edge); -} - -size_t ScaffoldGraph::OutgoingEdgeCount(ScaffoldGraph::ScaffoldVertex assembly_graph_edge) const { - return outgoing_edges_.count(assembly_graph_edge); -} - -std::vector ScaffoldGraph::IncomingEdges(ScaffoldGraph::ScaffoldVertex assembly_graph_edge) const { - std::vector result; - auto e_range = incoming_edges_.equal_range(assembly_graph_edge); - for (auto edge_id = e_range.first; edge_id != e_range.second; ++edge_id) { - result.push_back(edges_.at(edge_id->second)); - } - return result; -} - -std::vector ScaffoldGraph::OutgoingEdges(ScaffoldGraph::ScaffoldVertex assembly_graph_edge) const { - std::vector result; - auto e_range = outgoing_edges_.equal_range(assembly_graph_edge); - for (auto edge_id = e_range.first; edge_id != e_range.second; ++edge_id) { - result.push_back(edges_.at(edge_id->second)); - } - return result; -} - -const debruijn_graph::Graph &ScaffoldGraph::AssemblyGraph() const { - return assembly_graph_; -} - -size_t ScaffoldGraph::EdgeCount() const { - return edges_.size(); -} - -size_t ScaffoldGraph::VertexCount() const { - return vertices_.size(); -} - -ScaffoldGraph::ScaffoldVertex ScaffoldGraph::EdgeEnd(ScaffoldEdge e) const { - return e.getEnd(); -} - -ScaffoldGraph::ScaffoldVertex ScaffoldGraph::EdgeStart(ScaffoldEdge e) const { - return e.getStart(); -} - -size_t ScaffoldGraph::int_id(ScaffoldGraph::ScaffoldEdge e) const { - return e.getId(); -} - -size_t ScaffoldGraph::int_id(ScaffoldGraph::ScaffoldVertex v) const { - return assembly_graph_.int_id(v); -} - -ScaffoldGraph::ConstScaffoldEdgeIterator ScaffoldGraph::eend() const { - return ConstScaffoldEdgeIterator(edges_.cend()); -} - -ScaffoldGraph::ConstScaffoldEdgeIterator ScaffoldGraph::ebegin() const { - return ConstScaffoldEdgeIterator(edges_.cbegin()); -} - -ScaffoldGraph::VertexStorage::const_iterator ScaffoldGraph::vend() const { - return vertices_.cend(); -} - -ScaffoldGraph::VertexStorage::const_iterator ScaffoldGraph::vbegin() const { - return vertices_.cbegin(); -} - -adt::iterator_range ScaffoldGraph::vertices() const { - return adt::make_range(vbegin(), vend()); -} - -adt::iterator_range ScaffoldGraph::edges() const { - return adt::make_range(ebegin(), eend()); -} - -bool ScaffoldGraph::IsVertexIsolated(ScaffoldGraph::ScaffoldVertex assembly_graph_edge) const { - bool result = incoming_edges_.count(assembly_graph_edge) == 0 && outgoing_edges_.count(assembly_graph_edge) == 0; - return result; -} - -bool ScaffoldGraph::RemoveVertex(ScaffoldGraph::ScaffoldVertex assembly_graph_edge) { - if (Exists(assembly_graph_edge)) { - VERIFY(Exists(conjugate(assembly_graph_edge))); - - DeleteAllOutgoingEdgesSimple(assembly_graph_edge); - DeleteAllIncomingEdgesSimple(assembly_graph_edge); - DeleteAllOutgoingEdgesSimple(conjugate(assembly_graph_edge)); - DeleteAllIncomingEdgesSimple(conjugate(assembly_graph_edge)); - - VERIFY(incoming_edges_.count(assembly_graph_edge) == 0); - VERIFY(outgoing_edges_.count(assembly_graph_edge) == 0); - VERIFY(incoming_edges_.count(conjugate(assembly_graph_edge)) == 0); - VERIFY(outgoing_edges_.count(conjugate(assembly_graph_edge)) == 0); - - vertices_.erase(assembly_graph_edge); - vertices_.erase(conjugate(assembly_graph_edge)); - - return true; - } - return false; -} - -bool ScaffoldGraph::RemoveEdge(const ScaffoldGraph::ScaffoldEdge &e) { - if (Exists(e)) { - DeleteOutgoing(e); - DeleteIncoming(e); - DeleteEdgeFromStorage(e); - - return true; - } - return false; -} - -bool ScaffoldGraph::AddEdge(const ScaffoldGraph::ScaffoldEdge &e) { - return AddEdge(e.getStart(), e.getEnd(), e.getColor(), e.getWeight()); -} - -} //scaffold_graph -} //path_extend diff --git a/src/common/modules/path_extend/scaffolder2015/scaffold_graph.hpp b/src/common/modules/path_extend/scaffolder2015/scaffold_graph.hpp deleted file mode 100644 index dded2d9ce1..0000000000 --- a/src/common/modules/path_extend/scaffolder2015/scaffold_graph.hpp +++ /dev/null @@ -1,233 +0,0 @@ - -//*************************************************************************** -//* Copyright (c) 2023-2024 SPAdes team -//* All Rights Reserved -//* See file LICENSE for details. -//*************************************************************************** - -// -// Created by andrey on 17.09.15. -// -#pragma once - -#include "utils/logger/logger.hpp" -#include "assembly_graph/core/graph.hpp" -#include "modules/path_extend/paired_library.hpp" -#include "connection_condition2015.hpp" -#include "adt/iterator_range.hpp" - -namespace path_extend { -namespace scaffolder { - -//do NOT add "using namespace debruijn_graph" in order not to confuse between EdgeId typdefs - -class ScaffoldGraph { - -public: - //EdgeId in de Bruijn graph is vertex in scaffolding graph - typedef debruijn_graph::EdgeId ScaffoldVertex; - - //Unique edge id - typedef size_t ScaffoldEdgeIdT; - - //Scaffold edge indormation class - struct ScaffoldEdge { - private: - //unique id - ScaffoldEdgeIdT id_; - //id counter - static std::atomic scaffold_edge_id_; - - ScaffoldVertex start_; - ScaffoldVertex end_; - //color = lib# - size_t color_; - //read pair weight or anything else - double weight_; - - public: - - ScaffoldEdge(ScaffoldVertex start, ScaffoldVertex end, size_t lib_id = (size_t) -1, double weight = 0) : - id_(scaffold_edge_id_++), - start_(start), end_(end), - color_(lib_id), - weight_(weight) { - } - - ScaffoldEdgeIdT getId() const { - return id_; - } - - - size_t getColor() const { - return color_; - } - - double getWeight() const { - return weight_; - } - - const ScaffoldVertex getStart() const { - return start_; - } - - const ScaffoldVertex getEnd() const { - return end_; - } - - bool operator==(const ScaffoldEdge &e) const { - return color_ == e.color_ && weight_ == e.weight_ && start_ == e.start_ && end_ == e.end_; - } - - bool operator==(const ScaffoldEdge &e) { - return color_ == e.color_ && weight_ == e.weight_ && start_ == e.start_ && end_ == e.end_; - } - }; - - //typedef for possibility to use in templated graph visualizers - typedef ScaffoldVertex VertexId; - typedef ScaffoldEdge EdgeId; - - //All vertices are stored in set - typedef std::set VertexStorage; - //Edges are stored in map: Id -> Edge Information - typedef std::unordered_map EdgeStorage; - //Adjacency list contains vertrx and edge id (instead of whole edge information) - typedef std::unordered_multimap AdjacencyStorage; - - struct ConstScaffoldEdgeIterator: public boost::iterator_facade { - private: - EdgeStorage::const_iterator iter_; - - public: - ConstScaffoldEdgeIterator(EdgeStorage::const_iterator iter) : iter_(iter) { - } - - private: - friend class boost::iterator_core_access; - - void increment() { - ++iter_; - } - - bool equal(const ConstScaffoldEdgeIterator &other) const { - return iter_ == other.iter_; - } - - const ScaffoldEdge& dereference() const { - return iter_->second; - } - }; - -//TODO:: fix this. Seems that only ebegin and eend are broken. -private: - EdgeStorage edges_; - - VertexStorage vertices_; - - const debruijn_graph::Graph &assembly_graph_; - - AdjacencyStorage outgoing_edges_; - - AdjacencyStorage incoming_edges_; - - void AddEdgeSimple(const ScaffoldEdge &e); - - //Delete outgoing edge from adjancecy list without checks - void DeleteOutgoing(const ScaffoldEdge &e); - - //Delete incoming edge from adjancecy list without checks - void DeleteIncoming(const ScaffoldEdge &e); - - //Delete all edge info from storage - void DeleteEdgeFromStorage(const ScaffoldEdge &e); - - //Detelte all outgoing from v edges from adjacency lists - void DeleteAllOutgoingEdgesSimple(ScaffoldVertex v); - - //Detelte all incoming from v edges from adjacency lists - void DeleteAllIncomingEdgesSimple(ScaffoldVertex v); - -public: - ScaffoldGraph(const debruijn_graph::Graph &g) : assembly_graph_(g) { - } - - bool Exists(ScaffoldVertex assembly_graph_edge) const; - - bool Exists(const ScaffoldEdge &e) const; - - ScaffoldVertex conjugate(ScaffoldVertex assembly_graph_edge) const; - - //Return structure thay is equal to conjugate of e (not exactrly the same structure as in graph) - ScaffoldEdge conjugate(const ScaffoldEdge &e) const; - - //Add isolated vertex to the graph if not exitsts - bool AddVertex(ScaffoldVertex assembly_graph_edge); - - void AddVertices(const std::set &vertices); - - //Add edge (and conjugate) if not exists - //v1 and v2 must exist - bool AddEdge(ScaffoldVertex v1, ScaffoldVertex v2, size_t lib_id, double weight); - - bool AddEdge(const ScaffoldEdge &e); - - //Rempve edge from edge container and all adjacency lists - bool RemoveEdge(const ScaffoldEdge &e); - - //Remove vertex and all adjacent edges - bool RemoveVertex(ScaffoldVertex assembly_graph_edge); - - bool IsVertexIsolated(ScaffoldVertex assembly_graph_edge) const; - - VertexStorage::const_iterator vbegin() const; - - VertexStorage::const_iterator vend() const; - - adt::iterator_range vertices() const; - - ConstScaffoldEdgeIterator ebegin() const; - - ConstScaffoldEdgeIterator eend() const; - - adt::iterator_range edges() const; - - size_t int_id(ScaffoldVertex v) const; - - size_t int_id(ScaffoldEdge e) const; - - ScaffoldVertex EdgeStart(ScaffoldEdge e) const; - - ScaffoldVertex EdgeEnd(ScaffoldEdge e) const; - - size_t VertexCount() const; - - size_t EdgeCount() const; - - const debruijn_graph::Graph & AssemblyGraph() const; - - std::vector OutgoingEdges(ScaffoldVertex assembly_graph_edge) const; - - std::vector IncomingEdges(ScaffoldVertex assembly_graph_edge) const; - - size_t OutgoingEdgeCount(ScaffoldVertex assembly_graph_edge) const; - - size_t IncomingEdgeCount(ScaffoldVertex assembly_graph_edge) const; - - bool HasUniqueOutgoing(ScaffoldVertex assembly_graph_edge) const; - - bool HasUniqueIncoming(ScaffoldVertex assembly_graph_edge) const; - - ScaffoldEdge UniqueOutgoing(ScaffoldVertex assembly_graph_edge) const; - - ScaffoldEdge UniqueIncoming(ScaffoldVertex assembly_graph_edge) const; - - void Print(std::ostream &os) const; - -}; - -} //scaffold_graph -} //path_extend - diff --git a/src/common/modules/path_extend/scaffolder2015/scaffold_graph_constructor.cpp b/src/common/modules/path_extend/scaffolder2015/scaffold_graph_constructor.cpp index 17171f84b0..67913d407e 100644 --- a/src/common/modules/path_extend/scaffolder2015/scaffold_graph_constructor.cpp +++ b/src/common/modules/path_extend/scaffolder2015/scaffold_graph_constructor.cpp @@ -312,9 +312,9 @@ std::shared_ptr InternalScoreScaffoldGraphFilter: } return graph_; } -boost::optional InternalScoreScaffoldGraphFilter::GetWinnerVertex( +std::optional InternalScoreScaffoldGraphFilter::GetWinnerVertex( std::vector &edges) const { - boost::optional result; + std::optional result; if (edges.size() < 2) { return result; } @@ -329,9 +329,9 @@ boost::optional InternalScoreScaffo return result; } void InternalScoreScaffoldGraphFilter::ProcessEdges(std::vector &edges) { - boost::optional incoming_winner = GetWinnerVertex(edges); - if (incoming_winner.is_initialized()) { - graph_->AddEdge(incoming_winner.get()); + std::optional incoming_winner = GetWinnerVertex(edges); + if (incoming_winner.has_value()) { + graph_->AddEdge(*incoming_winner); } else { for (const auto &edge: edges) { graph_->AddEdge(edge); diff --git a/src/common/modules/path_extend/scaffolder2015/scaffold_graph_constructor.hpp b/src/common/modules/path_extend/scaffolder2015/scaffold_graph_constructor.hpp index 232fd04b61..b6ee912821 100644 --- a/src/common/modules/path_extend/scaffolder2015/scaffold_graph_constructor.hpp +++ b/src/common/modules/path_extend/scaffolder2015/scaffold_graph_constructor.hpp @@ -16,6 +16,7 @@ #include "modules/path_extend/read_cloud_path_extend/scaffold_graph_construction/read_cloud_connection_conditions.hpp" #include +#include #include namespace path_extend { @@ -216,7 +217,7 @@ class InternalScoreScaffoldGraphFilter: public BaseScaffoldGraphConstructor { private: void ProcessEdges(std::vector &edges); - boost::optional GetWinnerVertex(std::vector &edges) const; + std::optional GetWinnerVertex(std::vector &edges) const; }; class ScoreFunctionScaffoldGraphConstructor: public BaseScaffoldGraphConstructor { diff --git a/src/projects/splitter/scaffold_graph_helper.cpp b/src/projects/splitter/scaffold_graph_helper.cpp index f2c0db818b..19ec39c9e1 100644 --- a/src/projects/splitter/scaffold_graph_helper.cpp +++ b/src/projects/splitter/scaffold_graph_helper.cpp @@ -171,7 +171,7 @@ ReverseBarcodeIndexConstructor::ReverseBarcodeIndexConstructor(const debruijn_gr tail_threshold_(tail_threshold), count_threshold_(count_threshold), max_threads_(max_threads) {} -scaffold_graph::ScaffoldGraph ScaffoldGraphSerializer::ReadGraph(const string &path_to_graph) { +scaffold_graph::ScaffoldGraph ScaffoldGraphSerializer::ReadGraph(const std::string &path_to_graph) { scaffold_graph::ScaffoldGraph result(g_); std::unordered_map id_to_vertex; for (const debruijn_graph::EdgeId &edge: g_.canonical_edges()) { @@ -213,7 +213,7 @@ void ScaffoldGraphSerializer::WriteGraph(const scaffold_graph::ScaffoldGraph &sc os << (*id_mapper_)[edge.getStart().int_id()] << "\t" << (*id_mapper_)[edge.getEnd().int_id()] << "\t" << edge.getWeight() << "\n"; } } -ScaffoldGraphSerializer::ScaffoldGraphSerializer(const debruijn_graph::Graph &g, io::IdMapper *id_mapper) : +ScaffoldGraphSerializer::ScaffoldGraphSerializer(const debruijn_graph::Graph &g, io::IdMapper *id_mapper) : g_(g), id_mapper_(id_mapper) {} scaffold_graph::ScaffoldGraph GetTellSeqScaffoldGraph(const debruijn_graph::Graph &g, BarcodeExtractorPtr barcode_extractor, From 9891f1c3429a185050343509424a1ce125200126 Mon Sep 17 00:00:00 2001 From: Itolstoganov Date: Fri, 30 Jan 2026 16:21:40 +0100 Subject: [PATCH 03/17] Remove InternalScore filter --- .../scaffold_graph_constructor.cpp | 47 ------------------- .../scaffold_graph_constructor.hpp | 22 --------- 2 files changed, 69 deletions(-) diff --git a/src/common/modules/path_extend/scaffolder2015/scaffold_graph_constructor.cpp b/src/common/modules/path_extend/scaffolder2015/scaffold_graph_constructor.cpp index 67913d407e..438e8fa8c3 100644 --- a/src/common/modules/path_extend/scaffolder2015/scaffold_graph_constructor.cpp +++ b/src/common/modules/path_extend/scaffolder2015/scaffold_graph_constructor.cpp @@ -300,52 +300,5 @@ std::shared_ptr ScoreFunctionScaffoldGraphConstru } return graph_; } -std::shared_ptr InternalScoreScaffoldGraphFilter::Construct() { - for (const auto& vertex: old_graph_.vertices()) { - graph_->AddVertex(vertex); - } - for (const ScaffoldVertex &vertex: old_graph_.vertices()) { - auto outgoing = old_graph_.OutgoingEdges(vertex); - auto incoming = old_graph_.IncomingEdges(vertex); - ProcessEdges(incoming); - ProcessEdges(outgoing); - } - return graph_; -} -std::optional InternalScoreScaffoldGraphFilter::GetWinnerVertex( - std::vector &edges) const { - std::optional result; - if (edges.size() < 2) { - return result; - } - std::sort(edges.begin(), edges.end(), [this](const ScaffoldEdge &first, const ScaffoldEdge &second) { - return math::gr(score_function_->GetScore(first), score_function_->GetScore(second)); - }); - const double top_score = score_function_->GetScore(edges[0]); - const double second_score = score_function_->GetScore(edges[1]); - if (math::gr(top_score, relative_threshold_ * second_score)) { - return edges[0]; - } - return result; -} -void InternalScoreScaffoldGraphFilter::ProcessEdges(std::vector &edges) { - std::optional incoming_winner = GetWinnerVertex(edges); - if (incoming_winner.has_value()) { - graph_->AddEdge(*incoming_winner); - } else { - for (const auto &edge: edges) { - graph_->AddEdge(edge); - } - } -} -InternalScoreScaffoldGraphFilter::InternalScoreScaffoldGraphFilter( - const Graph &assembly_graph, - const ScaffoldGraph &old_graph, - std::shared_ptr score_function, - double relative_threshold) - : BaseScaffoldGraphConstructor(assembly_graph), - old_graph_(old_graph), - score_function_(score_function), - relative_threshold_(relative_threshold) {} } //scaffold_graph } //path_extend diff --git a/src/common/modules/path_extend/scaffolder2015/scaffold_graph_constructor.hpp b/src/common/modules/path_extend/scaffolder2015/scaffold_graph_constructor.hpp index b6ee912821..9326795871 100644 --- a/src/common/modules/path_extend/scaffolder2015/scaffold_graph_constructor.hpp +++ b/src/common/modules/path_extend/scaffolder2015/scaffold_graph_constructor.hpp @@ -198,28 +198,6 @@ class ScoreFunctionScaffoldGraphFilter: public BaseScaffoldGraphConstructor { DECL_LOGGER("ScoreFunctionScaffoldGraphConstructor") }; -class InternalScoreScaffoldGraphFilter: public BaseScaffoldGraphConstructor { - typedef read_cloud::ScaffoldEdgeScoreFunction EdgePairScoreFunction; - using BaseScaffoldGraphConstructor::ScaffoldGraph; - using BaseScaffoldGraphConstructor::ScaffoldVertex; - typedef ScaffoldGraph::ScaffoldEdge ScaffoldEdge; - protected: - const ScaffoldGraph &old_graph_; - std::shared_ptr score_function_; - const double relative_threshold_; - public: - InternalScoreScaffoldGraphFilter(const Graph &assembly_graph, - const ScaffoldGraph &old_graph, - std::shared_ptr score_function, - double relative_threshold); - - std::shared_ptr Construct() override; - private: - void ProcessEdges(std::vector &edges); - - std::optional GetWinnerVertex(std::vector &edges) const; -}; - class ScoreFunctionScaffoldGraphConstructor: public BaseScaffoldGraphConstructor { typedef read_cloud::ScaffoldEdgeScoreFunction EdgePairScoreFunction; using BaseScaffoldGraphConstructor::ScaffoldGraph; From daad2cb53fec5549dd83b0caceaa013017408100 Mon Sep 17 00:00:00 2001 From: Itolstoganov Date: Fri, 30 Jan 2026 16:36:57 +0100 Subject: [PATCH 04/17] Remove full scaffold graph constructor --- .../scaffold_graph/scaffold_vertex_index.hpp | 115 ++++++++---------- .../scaffold_graph_constructor.cpp | 51 -------- .../scaffold_graph_constructor.hpp | 23 ---- 3 files changed, 49 insertions(+), 140 deletions(-) diff --git a/src/common/auxiliary_graphs/scaffold_graph/scaffold_vertex_index.hpp b/src/common/auxiliary_graphs/scaffold_graph/scaffold_vertex_index.hpp index 399d5ec97b..e01cff23e7 100644 --- a/src/common/auxiliary_graphs/scaffold_graph/scaffold_vertex_index.hpp +++ b/src/common/auxiliary_graphs/scaffold_graph/scaffold_vertex_index.hpp @@ -7,9 +7,9 @@ #pragma once -#include "auxiliary_graphs/scaffold_graph/scaffold_vertex.hpp" -#include "assembly_graph/core/graph.hpp" #include "adt/iterator_range.hpp" +#include "assembly_graph/core/graph.hpp" +#include "auxiliary_graphs/scaffold_graph/scaffold_vertex.hpp" #include "barcode_index/barcode_info_extractor.hpp" namespace barcode_index { @@ -20,36 +20,31 @@ class ScaffoldVertexIndexBuilder; template class ScaffoldVertexIndex { friend class ScaffoldVertexIndexBuilder; - public: + +public: typedef scaffold_graph::ScaffoldVertex ScaffoldVertex; typedef typename VertexEntryT::const_iterator const_iterator; typedef debruijn_graph::Graph Graph; - ScaffoldVertexIndex(const Graph &g): g_(g), vertex_to_entry_() {} + ScaffoldVertexIndex(const Graph &g) : g_(g), vertex_to_entry_() {} - const VertexEntryT& GetHeadEntry(const ScaffoldVertex& vertex) const { - return vertex_to_entry_.at(vertex); - } - const VertexEntryT& GetTailEntry(const ScaffoldVertex& vertex) const { + const VertexEntryT &GetHeadEntry(const ScaffoldVertex &vertex) const { return vertex_to_entry_.at(vertex); } + const VertexEntryT &GetTailEntry(const ScaffoldVertex &vertex) const { return vertex_to_entry_.at(vertex.GetConjugateFromGraph(g_)); } - const_iterator GetHeadBegin(const ScaffoldVertex& vertex) const { - return vertex_to_entry_.at(vertex).begin(); - } - const_iterator GetHeadEnd(const ScaffoldVertex& vertex) const { - return vertex_to_entry_.at(vertex).end(); - } - adt::iterator_range GetHeadRange(const ScaffoldVertex& vertex) const { + const_iterator GetHeadBegin(const ScaffoldVertex &vertex) const { return vertex_to_entry_.at(vertex).begin(); } + const_iterator GetHeadEnd(const ScaffoldVertex &vertex) const { return vertex_to_entry_.at(vertex).end(); } + adt::iterator_range GetHeadRange(const ScaffoldVertex &vertex) const { return adt::make_range(GetHeadBegin(vertex), GetHeadEnd(vertex)); } - const_iterator GetTailBegin(const ScaffoldVertex& vertex) const { + const_iterator GetTailBegin(const ScaffoldVertex &vertex) const { return vertex_to_entry_.at(vertex.GetConjugateFromGraph(g_)).begin(); } - const_iterator GetTailEnd(const ScaffoldVertex& vertex) const { + const_iterator GetTailEnd(const ScaffoldVertex &vertex) const { return vertex_to_entry_.at(vertex.GetConjugateFromGraph(g_)).end(); } - adt::iterator_range GetTailRange(const ScaffoldVertex& vertex) const { + adt::iterator_range GetTailRange(const ScaffoldVertex &vertex) const { return adt::make_range(GetTailBegin(vertex.GetConjugateFromGraph(g_)), GetTailEnd(vertex.GetConjugateFromGraph(g_))); } @@ -57,12 +52,11 @@ class ScaffoldVertexIndex { bool Contains(const ScaffoldVertex &vertex) const { return vertex_to_entry_.find(vertex) != vertex_to_entry_.end(); } - private: - void InsertEntry(const ScaffoldVertex& vertex, VertexEntryT&& entry) { - vertex_to_entry_.insert({vertex, entry}); - } - const Graph& g_; +private: + void InsertEntry(const ScaffoldVertex &vertex, VertexEntryT &&entry) { vertex_to_entry_.insert({vertex, entry}); } + + const Graph &g_; std::unordered_map vertex_to_entry_; }; @@ -71,9 +65,10 @@ typedef std::set SimpleVertexEntry; typedef ScaffoldVertexIndex SimpleScaffoldVertexIndex; class ScaffoldVertexIndexInfoExtractor { - public: +public: typedef typename scaffold_graph::ScaffoldVertex ScaffoldVertex; - public: + +public: virtual size_t GetHeadSize(const ScaffoldVertex &vertex) const = 0; virtual size_t GetTailSize(const ScaffoldVertex &vertex) const = 0; @@ -82,16 +77,16 @@ class ScaffoldVertexIndexInfoExtractor { /** * @note second is supposed to be between first and third */ - virtual size_t GetIntersectionSize(const ScaffoldVertex &first, const ScaffoldVertex &second, - const ScaffoldVertex &third) const = 0; + virtual size_t GetTripleIntersectionSize(const ScaffoldVertex &first, const ScaffoldVertex &second, + const ScaffoldVertex &third) const = 0; }; template -class IntersectingScaffoldVertexIndexInfoExtractor: public ScaffoldVertexIndexInfoExtractor { - public: +class IntersectingScaffoldVertexIndexInfoExtractor : public ScaffoldVertexIndexInfoExtractor { +public: using ScaffoldVertexIndexInfoExtractor::ScaffoldVertex; - public: +public: virtual SimpleVertexEntry GetIntersection(const VertexEntryT &first, const VertexEntryT &second) const = 0; virtual SimpleVertexEntry GetIntersection(const ScaffoldVertex &first, const ScaffoldVertex &second) const = 0; /** @@ -106,8 +101,8 @@ class IntersectingScaffoldVertexIndexInfoExtractor: public ScaffoldVertexIndexIn virtual SimpleVertexEntry GetTailEntry(const ScaffoldVertex &vertex) = 0; }; -class BarcodeIndexInfoExtractorWrapper: public IntersectingScaffoldVertexIndexInfoExtractor { - public: +class BarcodeIndexInfoExtractorWrapper : public IntersectingScaffoldVertexIndexInfoExtractor { +public: using Graph = debruijn_graph::Graph; BarcodeIndexInfoExtractorWrapper(const Graph &g, std::shared_ptr barcode_index_) @@ -122,14 +117,12 @@ class BarcodeIndexInfoExtractorWrapper: public IntersectingScaffoldVertexIndexIn size_t GetIntersectionSize(const ScaffoldVertex &first, const ScaffoldVertex &second) const override { return barcode_extractor_->GetNumberOfSharedBarcodes(first.GetLastEdge(), second.GetFirstEdge()); } - size_t GetIntersectionSize(const ScaffoldVertex &first, - const ScaffoldVertex &second, - const ScaffoldVertex &third) const override { + size_t GetTripleIntersectionSize(const ScaffoldVertex &first, const ScaffoldVertex &second, + const ScaffoldVertex &third) const override { return GetIntersectionSize(third, GetIntersection(first, second)); } - SimpleVertexEntry GetIntersection(const SimpleVertexEntry &first, - const SimpleVertexEntry &second) const override { + SimpleVertexEntry GetIntersection(const SimpleVertexEntry &first, const SimpleVertexEntry &second) const override { SimpleVertexEntry result; std::set_intersection(first.begin(), first.end(), second.begin(), second.end(), std::inserter(result, result.end())); @@ -148,22 +141,20 @@ class BarcodeIndexInfoExtractorWrapper: public IntersectingScaffoldVertexIndexIn std::inserter(intersection, intersection.begin())); return intersection.size(); } - SimpleVertexEntry GetHeadEntry(const ScaffoldVertex &/*vertex*/) override { + SimpleVertexEntry GetHeadEntry(const ScaffoldVertex & /*vertex*/) override { VERIFY_MSG(false, "Head entry extractor from BarcodeIndexInfoExtractorWrapper is currently not supported"); SimpleVertexEntry result; return result; } - SimpleVertexEntry GetTailEntry(const ScaffoldVertex &vertex) override { - return GetHeadEntry(vertex); - } + SimpleVertexEntry GetTailEntry(const ScaffoldVertex &vertex) override { return GetHeadEntry(vertex); } - private: +private: const Graph &g_; std::shared_ptr barcode_extractor_; }; -class SimpleScaffoldVertexIndexInfoExtractor: public IntersectingScaffoldVertexIndexInfoExtractor { - public: +class SimpleScaffoldVertexIndexInfoExtractor : public IntersectingScaffoldVertexIndexInfoExtractor { +public: typedef scaffold_graph::ScaffoldVertex ScaffoldVertex; explicit SimpleScaffoldVertexIndexInfoExtractor(std::shared_ptr> index_) @@ -203,38 +194,30 @@ class SimpleScaffoldVertexIndexInfoExtractor: public IntersectingScaffoldVertexI auto middle_begin = index_->GetHeadBegin(middle); auto middle_end = index_->GetHeadEnd(middle); SimpleVertexEntry intersection; - std::set_intersection(entry.begin(), entry.end(), middle_begin, middle_end, std::inserter(intersection, intersection.end())); + std::set_intersection(entry.begin(), entry.end(), middle_begin, middle_end, + std::inserter(intersection, intersection.end())); return intersection.size(); } - size_t GetIntersectionSize(const ScaffoldVertex &first, - const ScaffoldVertex &second, - const ScaffoldVertex &third) const override { - const auto& entry = GetIntersection(first, third); + size_t GetTripleIntersectionSize(const ScaffoldVertex &first, const ScaffoldVertex &second, + const ScaffoldVertex &third) const override { + const auto &entry = GetIntersection(first, third); return GetIntersectionSize(second, entry); } - size_t GetHeadSize(const ScaffoldVertex &vertex) const override { - return (index_->GetHeadEntry(vertex)).size(); - } - size_t GetTailSize(const ScaffoldVertex &vertex) const override { - return (index_->GetTailEntry(vertex)).size(); - } - SimpleVertexEntry GetIntersection(const SimpleVertexEntry &first, - const SimpleVertexEntry &second) const override { + size_t GetHeadSize(const ScaffoldVertex &vertex) const override { return (index_->GetHeadEntry(vertex)).size(); } + size_t GetTailSize(const ScaffoldVertex &vertex) const override { return (index_->GetTailEntry(vertex)).size(); } + SimpleVertexEntry GetIntersection(const SimpleVertexEntry &first, const SimpleVertexEntry &second) const override { SimpleVertexEntry result; - std::set_intersection(first.begin(), first.end(), second.begin(), second.end(), std::inserter(result, result.end())); + std::set_intersection(first.begin(), first.end(), second.begin(), second.end(), + std::inserter(result, result.end())); return result; } - SimpleVertexEntry GetHeadEntry(const ScaffoldVertex &vertex) override { - return index_->GetHeadEntry(vertex); - } - SimpleVertexEntry GetTailEntry(const ScaffoldVertex &vertex) override { - return index_->GetTailEntry(vertex); - } + SimpleVertexEntry GetHeadEntry(const ScaffoldVertex &vertex) override { return index_->GetHeadEntry(vertex); } + SimpleVertexEntry GetTailEntry(const ScaffoldVertex &vertex) override { return index_->GetTailEntry(vertex); } - private: +private: std::shared_ptr> index_; }; typedef IntersectingScaffoldVertexIndexInfoExtractor SimpleIntersectingScaffoldVertexExtractor; -} \ No newline at end of file +} // namespace barcode_index \ No newline at end of file diff --git a/src/common/modules/path_extend/scaffolder2015/scaffold_graph_constructor.cpp b/src/common/modules/path_extend/scaffolder2015/scaffold_graph_constructor.cpp index 438e8fa8c3..f259d8b1ad 100644 --- a/src/common/modules/path_extend/scaffolder2015/scaffold_graph_constructor.cpp +++ b/src/common/modules/path_extend/scaffolder2015/scaffold_graph_constructor.cpp @@ -249,56 +249,5 @@ ScaffoldSubgraphConstructor::ScaffoldSubgraphConstructor(const Graph &assembly_g vertex_condition_(vertex_condition), large_graph_(large_graph), distance_threshold_(distance_threshold) {} -ScoreFunctionScaffoldGraphConstructor::ScoreFunctionScaffoldGraphConstructor( - const Graph &assembly_graph, - const std::set &scaffold_vertices, - const std::shared_ptr &score_function, - double score_threshold, - size_t num_threads) - : BaseScaffoldGraphConstructor(assembly_graph), - scaffold_vertices_(scaffold_vertices), - score_function_(score_function), - score_threshold_(score_threshold), - num_threads_(num_threads) {} - -std::shared_ptr ScoreFunctionScaffoldGraphConstructor::Construct() { - for (const auto& vertex: scaffold_vertices_) { - graph_->AddVertex(vertex); - } - //fixme switch to tbb or use chunk splitter - std::vector scaffold_vertex_vec; - for (const auto& vertex: scaffold_vertices_) { - scaffold_vertex_vec.push_back(vertex); - } - size_t counter = 0; - size_t edges_size = scaffold_vertices_.size() * scaffold_vertices_.size(); - const size_t block_size = edges_size / 10; -#pragma omp parallel for num_threads(num_threads_) - for (size_t i = 0; i < scaffold_vertex_vec.size(); ++i) { - for (size_t j = 0; j < scaffold_vertex_vec.size(); ++j) { - const ScaffoldVertex& from = scaffold_vertex_vec[i]; - const ScaffoldVertex& to = scaffold_vertex_vec[j]; - ScaffoldGraph::ScaffoldEdge edge(from, to); - double score = score_function_->GetScore(edge); -#pragma omp critical - { - TRACE("Checking edge " << edge.getStart().int_id() << " -> " << edge.getEnd().int_id()); - TRACE("Score: " << score); - TRACE("Score threshold: " << score_threshold_); - bool are_conjugate = from == to.GetConjugateFromGraph(graph_->AssemblyGraph()); - if (math::ge(score, score_threshold_) and from != to and not are_conjugate) { - TRACE("Success"); - graph_->AddEdge(edge.getStart(), edge.getEnd(), edge.getColor(), score, edge.getLength()); - } - TRACE("Edge added"); - ++counter; - if (block_size != 0 and counter % block_size == 0) { - DEBUG("Processed " << counter << " edges out of " << edges_size); - } - } - } - } - return graph_; -} } //scaffold_graph } //path_extend diff --git a/src/common/modules/path_extend/scaffolder2015/scaffold_graph_constructor.hpp b/src/common/modules/path_extend/scaffolder2015/scaffold_graph_constructor.hpp index 9326795871..1c61a36e70 100644 --- a/src/common/modules/path_extend/scaffolder2015/scaffold_graph_constructor.hpp +++ b/src/common/modules/path_extend/scaffolder2015/scaffold_graph_constructor.hpp @@ -198,29 +198,6 @@ class ScoreFunctionScaffoldGraphFilter: public BaseScaffoldGraphConstructor { DECL_LOGGER("ScoreFunctionScaffoldGraphConstructor") }; -class ScoreFunctionScaffoldGraphConstructor: public BaseScaffoldGraphConstructor { - typedef read_cloud::ScaffoldEdgeScoreFunction EdgePairScoreFunction; - using BaseScaffoldGraphConstructor::ScaffoldGraph; - using BaseScaffoldGraphConstructor::ScaffoldVertex; - - protected: - const std::set scaffold_vertices_; - const std::shared_ptr score_function_; - const double score_threshold_; - const size_t num_threads_; - - public: - ScoreFunctionScaffoldGraphConstructor(const Graph &assembly_graph, - const std::set &scaffold_vertices, - const std::shared_ptr &score_function, - double score_threshold, - size_t num_threads); - - std::shared_ptr Construct() override; - - DECL_LOGGER("ScoreFunctionScaffoldGraphConstructor"); -}; - } //scaffold_graph } //path_extend From 2339c7fec9eac4da0cca786843f312b044474e8f Mon Sep 17 00:00:00 2001 From: Itolstoganov Date: Fri, 30 Jan 2026 16:37:11 +0100 Subject: [PATCH 05/17] Fix warnings --- .../auxiliary_graphs/scaffold_graph/scaffold_vertex_index.hpp | 3 ++- src/common/barcode_index/barcode_index_builder.hpp | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/common/auxiliary_graphs/scaffold_graph/scaffold_vertex_index.hpp b/src/common/auxiliary_graphs/scaffold_graph/scaffold_vertex_index.hpp index e01cff23e7..6fbdd8fbfd 100644 --- a/src/common/auxiliary_graphs/scaffold_graph/scaffold_vertex_index.hpp +++ b/src/common/auxiliary_graphs/scaffold_graph/scaffold_vertex_index.hpp @@ -85,6 +85,7 @@ template class IntersectingScaffoldVertexIndexInfoExtractor : public ScaffoldVertexIndexInfoExtractor { public: using ScaffoldVertexIndexInfoExtractor::ScaffoldVertex; + using ScaffoldVertexIndexInfoExtractor::GetIntersectionSize; public: virtual SimpleVertexEntry GetIntersection(const VertexEntryT &first, const VertexEntryT &second) const = 0; @@ -93,7 +94,7 @@ class IntersectingScaffoldVertexIndexInfoExtractor : public ScaffoldVertexIndexI * @note second is supposed to be between first and third */ virtual size_t GetIntersectionSize(const ScaffoldVertex &middle, const VertexEntryT &entry) const = 0; - size_t GetIntersectionSize(const VertexEntryT &first, const VertexEntryT &second) { + virtual size_t GetIntersectionSize(const VertexEntryT &first, const VertexEntryT &second) { return GetIntersection(first, second).size(); } diff --git a/src/common/barcode_index/barcode_index_builder.hpp b/src/common/barcode_index/barcode_index_builder.hpp index c8c7e09714..7c7c78d898 100644 --- a/src/common/barcode_index/barcode_index_builder.hpp +++ b/src/common/barcode_index/barcode_index_builder.hpp @@ -220,7 +220,7 @@ class FrameBarcodeIndexBuilder { INFO("Barcode id range: " << min_barcode << ", " << max_barcode); double barcode_thr = static_cast(max_barcode - min_barcode) * sampling_factor; for (const auto &barcode: barcodes) { - if (math::le(barcode - min_barcode, barcode_thr)) { + if (math::le(static_cast(barcode - min_barcode), barcode_thr)) { passed_barcodes.insert(barcode); } } From fde2f7986014314ad1a8420c85a6b8313db0932f Mon Sep 17 00:00:00 2001 From: Itolstoganov Date: Mon, 2 Feb 2026 16:28:07 +0100 Subject: [PATCH 06/17] Remove unused vertex predicates and graph constructors --- .../scaffold_graph_extractor.cpp | 77 ----------------- .../scaffold_graph_extractor.hpp | 29 ------- .../scaffold_vertex_predicates.cpp | 13 +-- .../scaffold_vertex_predicates.hpp | 29 +------ src/projects/splitter/path_extractor.cpp | 14 +--- src/projects/splitter/path_extractor.hpp | 1 - .../splitter/scaffold_graph_helper.cpp | 83 ------------------- .../splitter/scaffold_graph_helper.hpp | 39 --------- 8 files changed, 3 insertions(+), 282 deletions(-) delete mode 100644 src/common/modules/path_extend/read_cloud_path_extend/scaffold_graph_extractor.cpp delete mode 100644 src/common/modules/path_extend/read_cloud_path_extend/scaffold_graph_extractor.hpp diff --git a/src/common/modules/path_extend/read_cloud_path_extend/scaffold_graph_extractor.cpp b/src/common/modules/path_extend/read_cloud_path_extend/scaffold_graph_extractor.cpp deleted file mode 100644 index ece4aea37c..0000000000 --- a/src/common/modules/path_extend/read_cloud_path_extend/scaffold_graph_extractor.cpp +++ /dev/null @@ -1,77 +0,0 @@ -//*************************************************************************** -//* Copyright (c) 2019 Saint Petersburg State University -//* All Rights Reserved -//* See file LICENSE for details. -//*************************************************************************** - -#include "scaffold_graph_extractor.hpp" - -namespace path_extend { -namespace read_cloud { - -std::vector ScaffoldGraphExtractor::ExtractReliableEdges( - const ScaffoldGraph &scaffold_graph) const { - std::vector result; - for (const ScaffoldGraph::ScaffoldEdge &edge: scaffold_graph.edges()) { - if (scaffold_graph.HasUniqueOutgoing(edge.getStart()) and scaffold_graph.HasUniqueIncoming(edge.getEnd())) { - result.push_back(edge); - } - } - return result; -} -std::vector ScaffoldGraphExtractor::ExtractMaxScoreEdges( - const ScaffoldGraphExtractor::ScaffoldGraph &scaffold_graph) const { - std::vector result; - std::unordered_map start_to_edge; - std::unordered_map end_to_edge; - size_t edge_counter = 0; - size_t edge_block = 10000; - for (const ScaffoldGraph::ScaffoldEdge &edge: scaffold_graph.edges()) { - double score = edge.getWeight(); - auto start = edge.getStart(); - auto end = edge.getEnd(); - bool is_max_score_edge = true; - for (const auto &in_edge: scaffold_graph.IncomingEdges(end)) { - double in_score = in_edge.getWeight(); - if (math::gr(in_score, score)) { - is_max_score_edge = false; - break; - } - } - if (not is_max_score_edge) { - continue; - } - for (const auto &out_edge: scaffold_graph.OutgoingEdges(start)) { - double out_score = out_edge.getWeight(); - if (math::gr(out_score, score)) { - is_max_score_edge = false; - break; - } - } - if (is_max_score_edge) { - if (start_to_edge.find(start) == start_to_edge.end() and end_to_edge.find(end) == end_to_edge.end()) { - start_to_edge.insert({start, edge}); - end_to_edge.insert({end, edge}); - result.push_back(edge); - } - } - ++edge_counter; - if (edge_counter % edge_block == 0) { - INFO("Processed " << edge_counter << " edges out of " << scaffold_graph.EdgeCount()); - } - } - return result; -} -std::unordered_map ScaffoldGraphExtractor::GetFirstEdgeMap( - const ScaffoldGraphExtractor::ScaffoldGraph &scaffold_graph, const func::TypedPredicate &pred) const { - std::unordered_map result; - for (const ScaffoldVertex &vertex: scaffold_graph.vertices()) { - auto first_edge = vertex.GetFirstEdgeWithPredicate(pred); - if (first_edge.is_initialized()) { - result[first_edge.get()].insert(vertex); - } - } - return result; -} -} -} \ No newline at end of file diff --git a/src/common/modules/path_extend/read_cloud_path_extend/scaffold_graph_extractor.hpp b/src/common/modules/path_extend/read_cloud_path_extend/scaffold_graph_extractor.hpp deleted file mode 100644 index 7d4cc2caf9..0000000000 --- a/src/common/modules/path_extend/read_cloud_path_extend/scaffold_graph_extractor.hpp +++ /dev/null @@ -1,29 +0,0 @@ -//*************************************************************************** -//* Copyright (c) 2019 Saint Petersburg State University -//* All Rights Reserved -//* See file LICENSE for details. -//*************************************************************************** - -#pragma once - -#include "auxiliary_graphs/scaffold_graph/scaffold_graph.hpp" - -namespace path_extend { -namespace read_cloud { - -class ScaffoldGraphExtractor { - public: - typedef debruijn_graph::EdgeId EdgeId; - typedef scaffold_graph::ScaffoldGraph ScaffoldGraph; - typedef ScaffoldGraph::ScaffoldEdge ScaffoldEdge; - typedef scaffold_graph::ScaffoldVertex ScaffoldVertex; - typedef std::unordered_set VertexSet; - - std::vector ExtractMaxScoreEdges(const ScaffoldGraph &scaffold_graph) const; - std::vector ExtractReliableEdges(const ScaffoldGraph &scaffold_graph) const; - std::unordered_map GetFirstEdgeMap(const ScaffoldGraph &scaffold_graph, - const func::TypedPredicate &pred) const; -}; - -} -} \ No newline at end of file diff --git a/src/common/modules/path_extend/scaffolder2015/scaffold_vertex_predicates.cpp b/src/common/modules/path_extend/scaffolder2015/scaffold_vertex_predicates.cpp index 94feaf12db..d55b470f59 100644 --- a/src/common/modules/path_extend/scaffolder2015/scaffold_vertex_predicates.cpp +++ b/src/common/modules/path_extend/scaffolder2015/scaffold_vertex_predicates.cpp @@ -9,18 +9,7 @@ namespace path_extend { namespace scaffolder { -LengthChecker::LengthChecker(size_t length_threshold, const Graph &g) - : length_threshold_(length_threshold), g_(g) {} -bool LengthChecker::Check(const ScaffoldVertex &vertex) const { - return vertex.GetLengthFromGraph(g_) < length_threshold_; -} +// ScaffoldVertexPredicate is an abstract interface class with no implementation -AndPredicate::AndPredicate(std::shared_ptr first, - std::shared_ptr second) : - first_(first), second_(second) {} -bool AndPredicate::Check(const ScaffoldVertex &scaffold_vertex) const { - return first_->Check(scaffold_vertex) and second_->Check(scaffold_vertex); } - } -} \ No newline at end of file diff --git a/src/common/modules/path_extend/scaffolder2015/scaffold_vertex_predicates.hpp b/src/common/modules/path_extend/scaffolder2015/scaffold_vertex_predicates.hpp index dd9134068a..b3aeae6ce9 100644 --- a/src/common/modules/path_extend/scaffolder2015/scaffold_vertex_predicates.hpp +++ b/src/common/modules/path_extend/scaffolder2015/scaffold_vertex_predicates.hpp @@ -20,32 +20,5 @@ class ScaffoldVertexPredicate virtual ~ScaffoldVertexPredicate() = default; }; -class LengthChecker : public ScaffoldVertexPredicate { - public: - typedef scaffold_graph::ScaffoldVertex ScaffoldVertex; - typedef debruijn_graph::Graph Graph; - - LengthChecker(size_t length_threshold, const Graph &g); - - bool Check(const ScaffoldVertex &vertex) const override; - - private: - const size_t length_threshold_; - const Graph &g_; -}; - -class AndPredicate : public ScaffoldVertexPredicate { - public: - typedef scaffold_graph::ScaffoldVertex ScaffoldVertex; - - AndPredicate(std::shared_ptr first, std::shared_ptr second); - - bool Check(const ScaffoldVertex &scaffold_vertex) const override; - - private: - std::shared_ptr first_; - std::shared_ptr second_; -}; - } -} \ No newline at end of file +} diff --git a/src/projects/splitter/path_extractor.cpp b/src/projects/splitter/path_extractor.cpp index ac41089645..06f4c7f2d6 100644 --- a/src/projects/splitter/path_extractor.cpp +++ b/src/projects/splitter/path_extractor.cpp @@ -75,19 +75,7 @@ void PathExtractor::ExtractPaths(path_extend::PathContainer &paths, INFO("Total path overlap: " << total_path_overlap); INFO("Edges visited by several paths: " << visited_edges); } -bool PathExtractor::IsConjugatePair(const PathExtractor::SimplePath &first, - const PathExtractor::SimplePath &second) const { - if (first.size() != second.size()) { - return false; - } - for (auto it1 = first.begin(), it2 = second.end(); it1 != first.end(); ++it1) { - --it2; - if (*it1 != graph_.conjugate(*it2)) { - return false; - } - } - return true; -} + bool PathExtractor::IsGraphLink(debruijn_graph::EdgeId first, debruijn_graph::EdgeId second, const PathExtractor::VertexLinkStorage &vertex_storage) const { diff --git a/src/projects/splitter/path_extractor.hpp b/src/projects/splitter/path_extractor.hpp index b7e9389fa0..eba8637273 100644 --- a/src/projects/splitter/path_extractor.hpp +++ b/src/projects/splitter/path_extractor.hpp @@ -42,7 +42,6 @@ class PathExtractor { vertex_link_storage(vertex_link_storage) {} }; - bool IsConjugatePair(const SimplePath &first, const SimplePath &second) const; bool IsGraphLink(debruijn_graph::EdgeId first, debruijn_graph::EdgeId second, const VertexLinkStorage &vertex_storage) const; diff --git a/src/projects/splitter/scaffold_graph_helper.cpp b/src/projects/splitter/scaffold_graph_helper.cpp index 19ec39c9e1..2f97ce8ddf 100644 --- a/src/projects/splitter/scaffold_graph_helper.cpp +++ b/src/projects/splitter/scaffold_graph_helper.cpp @@ -38,32 +38,10 @@ scaffold_graph::ScaffoldGraph LinkIndexGraphConstructor::ConstructGraph() const result.AddVertex(vertex); } -// ReverseBarcodeIndexConstructor reverse_index_constructor(g_, barcode_extractor_, length_threshold_, tail_threshold_, -// count_threshold_, max_threads_); -// auto reverse_index = reverse_index_constructor.ConstructReverseIndex(scaffold_vertices); -// -// size_t total_head_size = 0; -// size_t total_tail_size = 0; -// for (const auto &vertex: scaffold_vertices) { -// total_head_size += scaffold_index_extractor->GetHeadSize(vertex); -// total_tail_size += scaffold_index_extractor->GetTailSize(vertex); -// } -// INFO("Total head size: " << total_head_size); -// INFO("Total tail size: " << total_tail_size); -// size_t total_pairs = 0; -// for (const auto &entry: reverse_index) { -// total_pairs += entry.second.size() * entry.second.size(); -// } - std::vector chunks; for (const auto &first: scaffold_vertices) { chunks.emplace_back(first, scaffold_vertices.begin(), scaffold_vertices.end()); } -// for (const auto &entry: reverse_index) { -// for (const auto &first: entry.second) { -// chunks.emplace_back(first, entry.second.begin(), entry.second.end()); -// } -// } INFO(chunks.size() << " chunks"); auto score_filter = std::make_shared(g_, chunks, score_function, @@ -109,68 +87,7 @@ LinkIndexGraphConstructor::BarcodeScoreFunctionPtr LinkIndexGraphConstructor::Co count_threshold_, tail_threshold_); return score_function; } -GFAGraphConstructor::GFAGraphConstructor(const debruijn_graph::Graph &g, - const gfa::GFAReader &gfa, - io::IdMapper *id_mapper) : - g_(g), gfa_(gfa), id_mapper_(id_mapper) {} -scaffold_graph::ScaffoldGraph GFAGraphConstructor::ConstructGraphFromDBG() const { - scaffold_graph::ScaffoldGraph scaffold_graph(g_); - for (const EdgeId &edge: g_.canonical_edges()) { - scaffold_graph.AddVertex(edge); - } - for (const auto &vertex: g_.vertices()) { - for (const auto &outgoing: g_.OutgoingEdges(vertex)) { - for (const auto &incoming: g_.IncomingEdges(vertex)) { - scaffold_graph.AddEdge(incoming, outgoing, 0, 1.0, 0); - } - } - } - return scaffold_graph; -} -ReverseBarcodeIndex ReverseBarcodeIndexConstructor::ConstructReverseIndex(const std::set &scaffold_vertices) const { - barcode_index::SimpleScaffoldVertexIndexBuilderHelper helper; - auto tail_threshold_getter = std::make_shared(tail_threshold_); - auto scaffold_vertex_index = helper.ConstructScaffoldVertexIndex(g_, *barcode_extractor_, tail_threshold_getter, - count_threshold_, length_threshold_, - max_threads_, scaffold_vertices); - auto scaffold_index_extractor = - std::make_shared(scaffold_vertex_index); - ReverseBarcodeIndex result; - for (const auto &vertex: scaffold_vertices) { - for (const auto &barcode: scaffold_index_extractor->GetHeadEntry(vertex)) { - result[barcode].insert(vertex); - } - for (const auto &barcode: scaffold_index_extractor->GetTailEntry(vertex)) { - result[barcode].insert(vertex); - } - } - double mean_barcode_size = .0; - double barcode_size_m2 = .0; - for (const auto &entry: result) { - auto entry_size = static_cast(entry.second.size()); - mean_barcode_size += entry_size; - barcode_size_m2 += entry_size * entry_size; - } - mean_barcode_size /= static_cast(result.size()); - barcode_size_m2 /= static_cast(result.size()); - INFO("Number of barcodes: " << result.size()); - INFO("Mean edges in barcode: " << mean_barcode_size); - INFO("Raw second moment of barcode edges: " << barcode_size_m2); - return result; -} -ReverseBarcodeIndexConstructor::ReverseBarcodeIndexConstructor(const debruijn_graph::Graph &g, - BarcodeExtractorPtr barcode_extractor, - const size_t length_threshold, - const size_t tail_threshold, - const size_t count_threshold, - size_t max_threads) : - g_(g), - barcode_extractor_(barcode_extractor), - length_threshold_(length_threshold), - tail_threshold_(tail_threshold), - count_threshold_(count_threshold), - max_threads_(max_threads) {} scaffold_graph::ScaffoldGraph ScaffoldGraphSerializer::ReadGraph(const std::string &path_to_graph) { scaffold_graph::ScaffoldGraph result(g_); std::unordered_map id_to_vertex; diff --git a/src/projects/splitter/scaffold_graph_helper.hpp b/src/projects/splitter/scaffold_graph_helper.hpp index 7648f37ca5..beb530e1dc 100644 --- a/src/projects/splitter/scaffold_graph_helper.hpp +++ b/src/projects/splitter/scaffold_graph_helper.hpp @@ -51,27 +51,6 @@ class LinkIndexGraphConstructor { size_t max_threads_; }; -class ReverseBarcodeIndexConstructor { - public: - using ScaffoldVertex = scaffold_graph::ScaffoldVertex; - using BarcodeExtractorPtr = std::shared_ptr; - ReverseBarcodeIndexConstructor(const debruijn_graph::Graph &g, - BarcodeExtractorPtr barcode_extractor, - const size_t length_threshold, - const size_t tail_threshold, - const size_t count_threshold, - size_t max_threads); - - ReverseBarcodeIndex ConstructReverseIndex(const std::set &scaffold_vertices) const; - private: - const debruijn_graph::Graph &g_; - BarcodeExtractorPtr barcode_extractor_; - const size_t length_threshold_; - const size_t tail_threshold_; - const size_t count_threshold_; - size_t max_threads_; -}; - class ScaffoldGraphSerializer { public: ScaffoldGraphSerializer(const debruijn_graph::Graph &g, io::IdMapper *id_mapper); @@ -84,24 +63,6 @@ class ScaffoldGraphSerializer { io::IdMapper *id_mapper_; }; -class GFAGraphConstructor { - public: - using Graph = debruijn_graph::Graph; - using EdgeId = debruijn_graph::EdgeId; - using BarcodeExtractorPtr = std::shared_ptr; - - GFAGraphConstructor(const Graph &g, - const gfa::GFAReader &gfa, - io::IdMapper *id_mapper); - - scaffold_graph::ScaffoldGraph ConstructGraphFromDBG() const; - - private: - const debruijn_graph::Graph &g_; - const gfa::GFAReader &gfa_; - io::IdMapper *id_mapper_; -}; - using BarcodeExtractorPtr = std::shared_ptr; scaffold_graph::ScaffoldGraph GetTellSeqScaffoldGraph(const debruijn_graph::Graph &g, BarcodeExtractorPtr barcode_extractor, From 7c8f0edea6d35564afa5bd8c5d2ef610e2d6e350 Mon Sep 17 00:00:00 2001 From: Itolstoganov Date: Mon, 2 Feb 2026 17:12:59 +0100 Subject: [PATCH 07/17] Move path extend unrelated code from scaffolder2015 to auxiliary_graphs --- src/common/auxiliary_graphs/CMakeLists.txt | 1 + .../scaffold_graph_dijkstra.hpp | 4 +- .../scaffold_vertex_predicates.cpp | 26 ++++++++++ .../scaffold_vertex_predicates.hpp | 52 +++++++++++++++++++ .../scaffold_graph_constructor.cpp | 3 +- .../scaffold_vertex_predicates.cpp | 15 ------ .../scaffold_vertex_predicates.hpp | 24 --------- 7 files changed, 82 insertions(+), 43 deletions(-) rename src/common/{modules/path_extend/scaffolder2015 => auxiliary_graphs/scaffold_graph}/scaffold_graph_dijkstra.hpp (98%) create mode 100644 src/common/auxiliary_graphs/scaffold_graph/scaffold_vertex_predicates.cpp create mode 100644 src/common/auxiliary_graphs/scaffold_graph/scaffold_vertex_predicates.hpp delete mode 100644 src/common/modules/path_extend/scaffolder2015/scaffold_vertex_predicates.cpp delete mode 100644 src/common/modules/path_extend/scaffolder2015/scaffold_vertex_predicates.hpp diff --git a/src/common/auxiliary_graphs/CMakeLists.txt b/src/common/auxiliary_graphs/CMakeLists.txt index 11f5a93d77..4f876bb816 100644 --- a/src/common/auxiliary_graphs/CMakeLists.txt +++ b/src/common/auxiliary_graphs/CMakeLists.txt @@ -15,6 +15,7 @@ add_library(auxiliary_graphs STATIC contracted_graph/contracted_statistics.cpp scaffold_graph/scaffold_vertex.cpp scaffold_graph/scaffold_graph.cpp + scaffold_graph/scaffold_vertex_predicates.cpp ) target_link_libraries(auxiliary_graphs assembly_graph) diff --git a/src/common/modules/path_extend/scaffolder2015/scaffold_graph_dijkstra.hpp b/src/common/auxiliary_graphs/scaffold_graph/scaffold_graph_dijkstra.hpp similarity index 98% rename from src/common/modules/path_extend/scaffolder2015/scaffold_graph_dijkstra.hpp rename to src/common/auxiliary_graphs/scaffold_graph/scaffold_graph_dijkstra.hpp index 5948e84f28..9b0990555c 100644 --- a/src/common/modules/path_extend/scaffolder2015/scaffold_graph_dijkstra.hpp +++ b/src/common/auxiliary_graphs/scaffold_graph/scaffold_graph_dijkstra.hpp @@ -7,8 +7,8 @@ #pragma once #include "assembly_graph/dijkstra/dijkstra_helper.hpp" -#include "auxiliary_graphs/scaffold_graph/scaffold_graph.hpp" -#include "modules/path_extend/scaffolder2015/scaffold_vertex_predicates.hpp" +#include "scaffold_graph.hpp" +#include "scaffold_vertex_predicates.hpp" namespace omnigraph { template<> diff --git a/src/common/auxiliary_graphs/scaffold_graph/scaffold_vertex_predicates.cpp b/src/common/auxiliary_graphs/scaffold_graph/scaffold_vertex_predicates.cpp new file mode 100644 index 0000000000..94feaf12db --- /dev/null +++ b/src/common/auxiliary_graphs/scaffold_graph/scaffold_vertex_predicates.cpp @@ -0,0 +1,26 @@ +//*************************************************************************** +//* Copyright (c) 2019 Saint Petersburg State University +//* All Rights Reserved +//* See file LICENSE for details. +//*************************************************************************** + +#include "scaffold_vertex_predicates.hpp" + +namespace path_extend { +namespace scaffolder { + +LengthChecker::LengthChecker(size_t length_threshold, const Graph &g) + : length_threshold_(length_threshold), g_(g) {} +bool LengthChecker::Check(const ScaffoldVertex &vertex) const { + return vertex.GetLengthFromGraph(g_) < length_threshold_; +} + +AndPredicate::AndPredicate(std::shared_ptr first, + std::shared_ptr second) : + first_(first), second_(second) {} +bool AndPredicate::Check(const ScaffoldVertex &scaffold_vertex) const { + return first_->Check(scaffold_vertex) and second_->Check(scaffold_vertex); +} + +} +} \ No newline at end of file diff --git a/src/common/auxiliary_graphs/scaffold_graph/scaffold_vertex_predicates.hpp b/src/common/auxiliary_graphs/scaffold_graph/scaffold_vertex_predicates.hpp new file mode 100644 index 0000000000..cf6a63f9c6 --- /dev/null +++ b/src/common/auxiliary_graphs/scaffold_graph/scaffold_vertex_predicates.hpp @@ -0,0 +1,52 @@ + +//*************************************************************************** +//* Copyright (c) 2019 Saint Petersburg State University +//* All Rights Reserved +//* See file LICENSE for details. +//*************************************************************************** + +#pragma once + +#include "scaffold_vertex.hpp" +#include "assembly_graph/core/graph.hpp" + +#include + +namespace path_extend { +namespace scaffolder { + +class ScaffoldVertexPredicate + : public func::AbstractPredicate { + public: + virtual ~ScaffoldVertexPredicate() = default; +}; + +class LengthChecker : public ScaffoldVertexPredicate { + public: + typedef scaffold_graph::ScaffoldVertex ScaffoldVertex; + typedef debruijn_graph::Graph Graph; + + LengthChecker(size_t length_threshold, const Graph &g); + + bool Check(const ScaffoldVertex &vertex) const override; + + private: + const size_t length_threshold_; + const Graph &g_; +}; + +class AndPredicate : public ScaffoldVertexPredicate { + public: + typedef scaffold_graph::ScaffoldVertex ScaffoldVertex; + + AndPredicate(std::shared_ptr first, std::shared_ptr second); + + bool Check(const ScaffoldVertex &scaffold_vertex) const override; + + private: + std::shared_ptr first_; + std::shared_ptr second_; +}; + +} +} diff --git a/src/common/modules/path_extend/scaffolder2015/scaffold_graph_constructor.cpp b/src/common/modules/path_extend/scaffolder2015/scaffold_graph_constructor.cpp index f259d8b1ad..d47b65df88 100644 --- a/src/common/modules/path_extend/scaffolder2015/scaffold_graph_constructor.cpp +++ b/src/common/modules/path_extend/scaffolder2015/scaffold_graph_constructor.cpp @@ -9,9 +9,8 @@ // Created by andrey on 04.12.15. // -#include "common/modules/path_extend/scaffolder2015/scaffold_graph_dijkstra.hpp" +#include "auxiliary_graphs/scaffold_graph/scaffold_graph_dijkstra.hpp" #include "scaffold_graph_constructor.hpp" -#include "scaffold_graph_dijkstra.hpp" namespace path_extend { diff --git a/src/common/modules/path_extend/scaffolder2015/scaffold_vertex_predicates.cpp b/src/common/modules/path_extend/scaffolder2015/scaffold_vertex_predicates.cpp deleted file mode 100644 index d55b470f59..0000000000 --- a/src/common/modules/path_extend/scaffolder2015/scaffold_vertex_predicates.cpp +++ /dev/null @@ -1,15 +0,0 @@ -//*************************************************************************** -//* Copyright (c) 2019 Saint Petersburg State University -//* All Rights Reserved -//* See file LICENSE for details. -//*************************************************************************** - -#include "scaffold_vertex_predicates.hpp" - -namespace path_extend { -namespace scaffolder { - -// ScaffoldVertexPredicate is an abstract interface class with no implementation - -} -} diff --git a/src/common/modules/path_extend/scaffolder2015/scaffold_vertex_predicates.hpp b/src/common/modules/path_extend/scaffolder2015/scaffold_vertex_predicates.hpp deleted file mode 100644 index b3aeae6ce9..0000000000 --- a/src/common/modules/path_extend/scaffolder2015/scaffold_vertex_predicates.hpp +++ /dev/null @@ -1,24 +0,0 @@ -//*************************************************************************** -//* Copyright (c) 2019 Saint Petersburg State University -//* All Rights Reserved -//* See file LICENSE for details. -//*************************************************************************** - -#pragma once - -#include "auxiliary_graphs/scaffold_graph/scaffold_vertex.hpp" -#include "assembly_graph/core/graph.hpp" - -#include - -namespace path_extend { -namespace scaffolder { - -class ScaffoldVertexPredicate - : public func::AbstractPredicate { - public: - virtual ~ScaffoldVertexPredicate() = default; -}; - -} -} From 3d25839c5a7428b6652ba4859d6cef3180d46f0d Mon Sep 17 00:00:00 2001 From: Itolstoganov Date: Mon, 2 Feb 2026 17:19:35 +0100 Subject: [PATCH 08/17] Pass small types by value --- .../scaffold_graph/scaffold_graph_dijkstra.hpp | 4 ++-- src/projects/splitter/graph_resolver.cpp | 2 +- src/projects/splitter/vertex_resolver.hpp | 8 ++++---- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/common/auxiliary_graphs/scaffold_graph/scaffold_graph_dijkstra.hpp b/src/common/auxiliary_graphs/scaffold_graph/scaffold_graph_dijkstra.hpp index 9b0990555c..dea1c8f4bd 100644 --- a/src/common/auxiliary_graphs/scaffold_graph/scaffold_graph_dijkstra.hpp +++ b/src/common/auxiliary_graphs/scaffold_graph/scaffold_graph_dijkstra.hpp @@ -103,7 +103,7 @@ class ScaffoldBarcodedPathPutChecker { std::shared_ptr predicate_; public: - ScaffoldBarcodedPathPutChecker(const Graph &g, const VertexId &first, const VertexId &second, + ScaffoldBarcodedPathPutChecker(const Graph &g, VertexId first, VertexId second, std::shared_ptr predicate) : g_(g), first_(first), @@ -140,7 +140,7 @@ class StartPredicateProcessChecker { const func::TypedPredicate &predicate_; public: StartPredicateProcessChecker(const Graph &g, - const VertexId &start, + VertexId start, const func::TypedPredicate &predicate) : g_(g), start_(start), predicate_(predicate) {} diff --git a/src/projects/splitter/graph_resolver.cpp b/src/projects/splitter/graph_resolver.cpp index 06da28975b..d1a38f87fa 100644 --- a/src/projects/splitter/graph_resolver.cpp +++ b/src/projects/splitter/graph_resolver.cpp @@ -17,7 +17,7 @@ GraphResolver::GraphResolverInfo::VertexMap GraphResolver::SplitVertices(debruij GraphResolver::GraphResolverInfo::VertexMap transformed_vertex_to_original; auto helper = graph.GetConstructionHelper(); for (const auto &vertex_entry: vertex_results.vertex_to_result) { - const VertexId &vertex = vertex_entry.first; + VertexId vertex = vertex_entry.first; DEBUG("Conjugate: " << graph.conjugate(vertex).int_id()); const auto &vertex_result = vertex_entry.second; diff --git a/src/projects/splitter/vertex_resolver.hpp b/src/projects/splitter/vertex_resolver.hpp index 25de68844a..676e319879 100644 --- a/src/projects/splitter/vertex_resolver.hpp +++ b/src/projects/splitter/vertex_resolver.hpp @@ -112,7 +112,7 @@ class VertexResolver { std::unordered_set covered_vertices; double LINK_BONUS = 1000000; - for (const EdgeId &sc_in_edge: graph_.IncomingEdges(vertex)) { + for (EdgeId sc_in_edge: graph_.IncomingEdges(vertex)) { //convert to dbg EdgeId scaffold_graph::ScaffoldVertex sc_in_vertex(sc_in_edge); scaffold_graph::EdgeGetter edge_getter; @@ -122,7 +122,7 @@ class VertexResolver { std::pair second_pair(0, 0); size_t max_links = 0; size_t second_links = 0; - for (const EdgeId &sc_out_edge: graph_.OutgoingEdges(vertex)) { + for (EdgeId sc_out_edge: graph_.OutgoingEdges(vertex)) { scaffold_graph::ScaffoldVertex sc_out_vertex(sc_out_edge); debruijn_graph::EdgeId out_edge = edge_getter.GetEdgeFromScaffoldVertex(sc_out_vertex); if (in_edge == out_edge or in_edge == assembly_graph_.conjugate(out_edge)) { @@ -216,10 +216,10 @@ class VertexResolver { answer_string += (*id_mapper)[entry.first.int_id()] + "#" + (*id_mapper)[entry.second.int_id()] + ","; } std::string in_edge_string, out_edge_string; - for (const EdgeId &edge: graph_.IncomingEdges(vertex)) { + for (EdgeId edge: graph_.IncomingEdges(vertex)) { in_edge_string += (*id_mapper)[edge.int_id()] + ","; } - for (const EdgeId &edge: graph_.OutgoingEdges(vertex)) { + for (EdgeId edge: graph_.OutgoingEdges(vertex)) { out_edge_string += (*id_mapper)[edge.int_id()] + ","; } in_edge_string = in_edge_string.substr(0, in_edge_string.size() - 1); From 09d6f1999b36c3cbcfd1915a48333165a12108f5 Mon Sep 17 00:00:00 2001 From: Itolstoganov Date: Fri, 13 Feb 2026 14:55:42 +0100 Subject: [PATCH 09/17] Add splitter docs --- docs/splitter.md | 151 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 151 insertions(+) create mode 100644 docs/splitter.md diff --git a/docs/splitter.md b/docs/splitter.md new file mode 100644 index 0000000000..8592b6531e --- /dev/null +++ b/docs/splitter.md @@ -0,0 +1,151 @@ +# SPlitteR - Repeat resolution in assembly graph using synthetic long reads + +SPlitteR is a tool that uses synthetic long reads (SLRs) to improve the contiguity of HiFi assemblies. Given a SLR library and a HiFi assembly graph in the GFA format, SpLitteR resolves repeats in the assembly graph using linked-reads and generates a simplified (more contiguous) assembly graph with corresponding scaffolds. + +## Compilation + +To compile SPlitteR, run + +``` +./spades_compile -SPADES_ENABLE_PROJECTS=splitter +``` + +## Input + +The tool requires + +- Assembly graph file in [GFA 1.0 format](https://github.com/GFA-spec/GFA-spec/blob/master/GFA1.md), with scaffolds included as path lines. +- SLR library in [YAML format](running.md#specifying-multiple-libraries-with-yaml-data-set-file). The tool supports SLR libraries produced using 10X Genomics Chromium and UST TELL-Seq technologies. Other SLR technologies, such as stLFR or LoopSeq can potentially be used as an input if converted to 10X or TELL-Seq format. + +SPlitteR supports [LJA](https://github.com/AntonBankevich/LJA) and [Flye](https://github.com/fenderglass/Flye) assembly graphs out of the box. Other assembly graphs should prefferably be converted into blunt format by e.g. [GetBlunted](https://github.com/vgteam/GetBlunted) utility. + +### UST TELLSeq format + +TELL-Seq library should include barcodes, left reads, and right reads as three separate FASTQ files. + +For example, if you have a TELL-Seq library + +``` bash + tellseq_reads_I1.fastq.gz + tellseq_reads_R1.fastq.gz + tellseq_reads_R2.fastq.gz +``` + +YAML file should look like this: + +``` bash + + [ + { + orientation: "fr", + type: "tell-seq", + right reads: [ + "/FULL_PATH_TO_DATASET/tellseq_reads_R2.fastq.gz" + ], + left reads: [ + "/FULL_PATH_TO_DATASET/tellseq_reads_R1.fastq.gz" + ], + aux: [ + "/FULL_PATH_TO_DATASET/tellseq_reads_I1.fastq.gz" + ] + } + ] +``` + +### 10X Genomics Chromium format + +10X library should be in FASTQ format with barcodes attached as BC:Z or BX:Z tags: + +``` +@COOPER:77:HCYNTBBXX:1:1216:22343:0 BX:Z:AAAAAAAAAACATAGT +CCAGGTAGGATTATGGAATTGGTATAAGCGATCAAACTCAATATTTTTGGTGCGGTGACAGACGCCTTCTGGCAGATGATGGGCTTGTCGTAAGTGTGGT ++ +GGAGGGAAGGGGIGIIAGAGAGGGGGIAGGGGGGGAGGGGGGGGGGGGAAAGGAGGGGGIGIGGGGGGGAGGAGGIGAIAGGIGGGGIGGGGGGGGGGGG +``` + +For example, if you have an SLR library + +``` bash + + lib_slr_1.fastq.gz + lib_slr_2.fastq.gz +``` + +YAML file should look like this: + +``` bash + + [ + { + orientation: "fr", + type: "clouds10x", + right reads: [ + "/FULL_PATH_TO_DATASET/lib_slr_2.fastq.gz" + ], + left reads: [ + "/FULL_PATH_TO_DATASET/lib_slr_1.fastq.gz" + ] + } + ] +``` + +### Command line + +Synopsis: `splitter [OPTION...]` + +Main options: + +- `-t` Number of threads to use (default: 1/2 of available threads) +- `--mapping-k` k-mer length for read mapping (default: 31) +- `-Gmdbg|-Gblunt` Assembly graph type: mDBG (LJA) or blunted (Flye) +- `-Mdiploid|-Mmeta` Repeat resolution mode (diploid or meta) +- `--assembly-info` Path to metaFlye assembly_info.txt file (meta mode, metaFlye graphs only) + +Barcode index construction: +- `--count-threshold` Minimum number of reads for barcode index +- `--frame-size` Resolution of the barcode index +- `--length-threshold` Minimum scaffold graph edge length (meta mode option) +- `--linkage-distance` Reads are assigned to the same fragment on long edges based on the linkage distance +- `--min-read-threshold` Minimum number of reads for path cluster extraction +- `--relative-score-threshold` Relative score threshold for path cluster extraction +- `--sampling-factor` Downsample input SLR reads by this factor + +Repeat resolution: +- `--score` Score threshold for link index. +- `--tail-threshold` Barcodes are assigned to the first and last nucleotides of the edge. +- `--scaffold-links` Use scaffold links in addition to graph links for repeat resolution + +Developer options: +- `--ref` Reference path for repeat resolution evaluation +- `--bin-load` Load read-to-graph alignment +- `--debug` Produce lots of debug data, save read-to-graph alignment +- `--tmp-dir` Scratch directory to use +- `-h, --help ` Print help message + +Example command lines: + +- Assembly produced LJA from HiFi diploid human dataset, with 10X SLR library (HPC compressed)\ + `splitter lja_output/mdbg/mdbg.hpc.gfa 10x_dataset.yaml output -Mdiploid -Gmdbg` +- Assembly produced by metaFlye from metagenomic dataset, with TELL-Seq SLR library\ + `splitter metaflye_output/assembly_graph.gfa tellseq_dataset.yaml output --assembly-info metaflye_output/assembly_info.txt -Mmeta -Gblunt` +- + +## Output + +SPlitteR stores all output files in output directory ` `, which is set by the user. + +- `/assembly_graph.gfa` input assembly graph in mDBG encoding +- `/resolved_graph.gfa` output assembly graph after repeat resolution +- `/contigs.fasta` output scaffolds + +In addition + +- `/edge_transform.tsv` map from input graph edges to resolved graph edges +- `/vertex_stats.tsv` Statistics for complex vertices +- `/resolved_graph.fasta` Sequences of the resolved graph edges + +## References + +If you are using **BinSPreader** in your research, please cite: + +[Tolstoganov et al., 2024](https://doi.org/10.7717/peerj.18050) \ No newline at end of file From 9e9285f4ee87f2188a5fbfd598dd1b86552d25d9 Mon Sep 17 00:00:00 2001 From: Itolstoganov Date: Fri, 13 Feb 2026 16:05:27 +0100 Subject: [PATCH 10/17] Structurize cli, fix tmpdir bug --- src/projects/splitter/main.cpp | 132 +++++++++++++++++---------------- 1 file changed, 69 insertions(+), 63 deletions(-) diff --git a/src/projects/splitter/main.cpp b/src/projects/splitter/main.cpp index fbe182b947..44ab1c6999 100644 --- a/src/projects/splitter/main.cpp +++ b/src/projects/splitter/main.cpp @@ -47,7 +47,7 @@ struct gcfg { unsigned nthreads = (omp_get_max_threads() / 2 + 1); std::filesystem::path file = ""; std::filesystem::path tmpdir = "saves"; - unsigned libindex = 0; + unsigned libindex = -1u; GraphType graph_type = GraphType::Multiplexed; ResolutionMode mode = ResolutionMode::Diploid; bool bin_load = false; @@ -78,42 +78,49 @@ static void process_cmdline(int argc, char** argv, gcfg& cfg) { std::string output_dir; std::string refpath; std::string file; - std::string tmpdir; + std::string tmpdir = "saves"; std::string assembly_info; auto cli = ( graph << value("graph (in binary or GFA)"), - file << value("SLR library description (in YAML)"), - output_dir << value("path to output directory"), - (option("--dataset") & value("yaml", file)) % "dataset description (in YAML)", - (option("-l") & integer("value", cfg.libindex)) % "library index (0-based, default: 0)", - (option("--assembly-info") & value("assembly-info", assembly_info)) - % "Path to metaflye assembly_info.txt file (meta mode, metaFlye graphs only)", - (option("-t") & integer("value", cfg.nthreads)) % "# of threads to use", - (option("--mapping-k") & integer("value", cfg.mapping_k)) % "k for read mapping", - (option("--tmp-dir") & value("tmp", tmpdir)) % "scratch directory to use", - (option("--ref") & value("reference", refpath)) % "Reference path for repeat resolution evaluation (developer option)", - (option("--bin-load").set(cfg.bin_load)) % "load binary-converted reads from tmpdir (developer option)", - (option("--debug").set(cfg.debug)) % "produce lots of debug data (developer option)", - (option("--sampling-factor") & value("sampling-factor", cfg.sampling_factor)) % "Sampling factor for read downsampling", - (with_prefix("-G", - option("mdbg").set(cfg.graph_type, GraphType::Multiplexed) | - option("blunt").set(cfg.graph_type, GraphType::Blunted)) % "assembly graph type (mDBG or blunted)"), + file << value("SLR library description (in YAML)"), + output_dir << value("path to output directory"), + // (option("--dataset") & value("yaml", file)) % "dataset description (in YAML)", + (option("-l") & integer("value", cfg.libindex)) % "library index (0-based, default: 0)", + (option("-t") & integer("value", cfg.nthreads)) % "# of threads to use", + (option("--sampling-factor") & value("sampling-factor", cfg.sampling_factor)) % "Sampling factor for read downsampling", + (option("--assembly-info") & value("assembly-info", assembly_info)) + % "Path to metaflye assembly_info.txt file (meta mode, metaFlye graphs only)", + (with_prefix("-G", + option("mdbg").set(cfg.graph_type, GraphType::Multiplexed) | + option("blunt").set(cfg.graph_type, GraphType::Blunted)) % "assembly graph type (mDBG or blunted)"), + "Repeat resolution options" % ( (with_prefix("-M", - option("diploid").set(cfg.mode, ResolutionMode::Diploid) | - option("meta").set(cfg.mode, ResolutionMode::Meta)) % "repeat resolution mode (diploid or meta)"), - (option("--frame-size") & value("frame-size", cfg.frame_size)) % "Resolution of barcode index", - (option("--linkage-distance") & value("read-linkage-distance", cfg.read_linkage_distance)) % - "Reads are assigned to the same fragment based on linkage distance", + option("diploid").set(cfg.mode, ResolutionMode::Diploid) | + option("meta").set(cfg.mode, ResolutionMode::Meta)) % "repeat resolution mode (diploid or meta)"), (option("--score") & value("score", cfg.graph_score_threshold)) % "Score threshold for link index", (option("--rel-threshold") & value("rel-threshold", cfg.rel_threshold)) % "Relative score threshold for vertex resolution", - (option("--tail-threshold") & value("tail-threshold", cfg.tail_threshold)) % - "Barcodes are assigned to the first and last nucleotides of the edge", - (option("--count-threshold") & value("count-threshold", cfg.count_threshold)) - % "Minimum number of reads for barcode index", (option("--scaffold-links").set(cfg.scaffold_links)) % "Use scaffold links in addition to graph links for repeat resolution", (option("--length-threshold") & value("length-threshold", cfg.length_threshold)) % "Minimum scaffold graph edge length (meta mode option)" + ), + "Index options" % ( + (option("--mapping-k") & integer("value", cfg.mapping_k)) % "k for read mapping", + (option("--frame-size") & value("frame-size", cfg.frame_size)) % + "Resolution of barcode index", + (option("--linkage-distance") & value("read-linkage-distance", cfg.read_linkage_distance)) % + "Reads are assigned to the same fragment based on linkage distance", + (option("--tail-threshold") & value("tail-threshold", cfg.tail_threshold)) % + "Barcodes are assigned to the first and last nucleotides of the edge", + (option("--count-threshold") & value("count-threshold", cfg.count_threshold)) % + "Minimum number of reads for barcode index" + ), + "Developer options:" % ( + (option("--bin-load").set(cfg.bin_load)) % "load binary-converted reads from tmpdir", + (option("--debug").set(cfg.debug)) % "produce lots of debug data", + (option("--tmp-dir") & value("dir", tmpdir)) % "scratch directory to use", + (option("--ref") & value("reference", refpath)) % "Reference path for repeat resolution evaluation" + ) ); auto result = parse(argc, argv, cli); @@ -343,10 +350,12 @@ int main(int argc, char** argv) { START_BANNER("SpLitteR"); cfg.nthreads = spades_set_omp_threads(cfg.nthreads); - INFO("Maximum # of threads to use (adjusted due to OMP capabilities): " << cfg.nthreads); + INFO("Maximum # of threads to use (adjusted due cfgto OMP capabilities): " << cfg.nthreads); - std::filesystem::create_directory(cfg.output_dir); - std::filesystem::create_directory(cfg.tmpdir); + if (!std::filesystem::create_directories(cfg.output_dir)) + FATAL_IO_ERROR("Failed to create output directory: " << cfg.output_dir); + if (!std::filesystem::create_directories(cfg.tmpdir)) + FATAL_IO_ERROR("Failed to create temporary directory: " << cfg.tmpdir); INFO("Loading graph"); std::unique_ptr> id_mapper(new io::IdMapper()); @@ -361,43 +370,40 @@ int main(int argc, char** argv) { gfa_writer.WriteSegmentsAndLinks(); INFO("Building barcode index"); - if (cfg.libindex != -1u) { - INFO("Processing paired-end reads"); - DataSet dataset; - if (cfg.file != "") { - dataset.load(cfg.file); - if (cfg.libindex == -1u) - cfg.libindex = 0; - CHECK_FATAL_ERROR(cfg.libindex < dataset.lib_count(), "invalid library index"); - } - - debruijn_graph::config::init_libs(dataset, cfg.nthreads, cfg.tmpdir); - barcode_index::FrameBarcodeIndex barcode_index(graph, cfg.frame_size); - using BarcodeExtractor = barcode_index::FrameBarcodeIndexInfoExtractor; - auto barcode_extractor_ptr = std::make_shared(barcode_index, graph); + DataSet dataset; + if (cfg.file != "") { + dataset.load(cfg.file); + if (cfg.libindex == -1u) + cfg.libindex = 0; + CHECK_FATAL_ERROR(cfg.libindex < dataset.lib_count(), "invalid library index"); + } - std::unique_ptr traceraii; - traceraii.reset(new TimeTracerRAII(argv[0], 500)); - INFO("Time tracing is enabled"); + debruijn_graph::config::init_libs(dataset, cfg.nthreads, cfg.tmpdir); + barcode_index::FrameBarcodeIndex barcode_index(graph, cfg.frame_size); + using BarcodeExtractor = barcode_index::FrameBarcodeIndexInfoExtractor; + auto barcode_extractor_ptr = std::make_shared(barcode_index, graph); - TIME_TRACE_SCOPE("Containment index"); + std::unique_ptr traceraii; + traceraii.reset(new TimeTracerRAII(argv[0], 500)); + INFO("Time tracing is enabled"); - auto &lib = dataset[cfg.libindex]; - if (lib.type() == io::LibraryType::Clouds10x or lib.type() == io::LibraryType::TellSeqReads) { - cont_index::ConstructBarcodeIndex(barcode_index, lib, graph, cfg.tmpdir, cfg.nthreads, cfg.frame_size, - cfg.mapping_k, cfg.bin_load, cfg.debug); - } else { - ERROR("Only read cloud libraries with barcode tags are supported for links"); - } + TIME_TRACE_SCOPE("Containment index"); - barcode_index::FrameBarcodeIndex downsampled_index(graph, cfg.frame_size); - if (not math::eq(cfg.sampling_factor, 1.0)) { - INFO("Downsampling the barcode index with factor " << cfg.sampling_factor); - cont_index::DownsampleBarcodeIndex(graph, cfg.nthreads, barcode_index, downsampled_index, - cfg.sampling_factor); - barcode_extractor_ptr = std::make_shared(downsampled_index, graph); - } + auto &lib = dataset[cfg.libindex]; + if (lib.type() == io::LibraryType::Clouds10x or lib.type() == io::LibraryType::TellSeqReads) { + cont_index::ConstructBarcodeIndex(barcode_index, lib, graph, cfg.tmpdir, cfg.nthreads, cfg.frame_size, + cfg.mapping_k, cfg.bin_load, cfg.debug); + } else { + ERROR("Only read cloud libraries with barcode tags are supported for links"); + } - ResolveComplexVertices(cfg, graph, barcode_extractor_ptr, id_mapper.get(), gfa, gfa_writer); + barcode_index::FrameBarcodeIndex downsampled_index(graph, cfg.frame_size); + if (not math::eq(cfg.sampling_factor, 1.0)) { + INFO("Downsampling the barcode index with factor " << cfg.sampling_factor); + cont_index::DownsampleBarcodeIndex(graph, cfg.nthreads, barcode_index, downsampled_index, + cfg.sampling_factor); + barcode_extractor_ptr = std::make_shared(downsampled_index, graph); } + + ResolveComplexVertices(cfg, graph, barcode_extractor_ptr, id_mapper.get(), gfa, gfa_writer); } From e2ae250da8e852a7d07218b72edcb44ca609e683 Mon Sep 17 00:00:00 2001 From: Itolstoganov Date: Sun, 12 Apr 2026 15:19:00 +0200 Subject: [PATCH 11/17] Move IdStorage from GraphCore. Support conjugate LinkId --- .../assembly_graph/core/debruijn_data.hpp | 2 +- src/common/assembly_graph/core/graph.hpp | 46 ++++--- src/common/assembly_graph/core/graph_core.hpp | 130 ++---------------- src/common/assembly_graph/core/id_storage.hpp | 128 +++++++++++++++++ src/common/io/binary/graph.hpp | 3 +- src/common/io/graph/gfa_reader.cpp | 9 +- src/test/debruijn/v_overlaps.cpp | 30 ++-- 7 files changed, 187 insertions(+), 161 deletions(-) create mode 100644 src/common/assembly_graph/core/id_storage.hpp diff --git a/src/common/assembly_graph/core/debruijn_data.hpp b/src/common/assembly_graph/core/debruijn_data.hpp index 324a0b2673..994d5102ad 100644 --- a/src/common/assembly_graph/core/debruijn_data.hpp +++ b/src/common/assembly_graph/core/debruijn_data.hpp @@ -26,7 +26,7 @@ class DeBruijnDataMaster; class DeBruijnVertexData { friend class DeBruijnDataMaster; - typedef size_t LinkId; + typedef omnigraph::impl::LinkId LinkId; enum OverlapKind { ComplexOverlap, diff --git a/src/common/assembly_graph/core/graph.hpp b/src/common/assembly_graph/core/graph.hpp index 4142f3b85f..c35d2f86c3 100644 --- a/src/common/assembly_graph/core/graph.hpp +++ b/src/common/assembly_graph/core/graph.hpp @@ -9,6 +9,7 @@ #pragma once #include "adt/iterator_range.hpp" +#include "id_storage.hpp" #include "observable_graph.hpp" #include "coverage.hpp" #include "debruijn_data.hpp" @@ -25,7 +26,7 @@ class DeBruijnGraph: public omnigraph::ObservableGraph { typedef base::EdgeId EdgeId; typedef base::VertexId VertexId; typedef base::VertexIt VertexIt; - typedef DataMasterT::LinkId LinkId; + typedef omnigraph::impl::LinkId LinkId; typedef VertexIt VertexIterator; typedef VertexIterator iterator; // for for_each typedef const VertexIterator const_iterator; // for for_each @@ -44,14 +45,15 @@ class DeBruijnGraph: public omnigraph::ObservableGraph { std::pair link; unsigned overlap; + LinkId conjugate_; }; - typedef std::vector LinkStorage; - LinkStorage link_storage_; + static constexpr unsigned LINK_ID_BIAS = 3; + omnigraph::IdStorage lstorage_; public: DeBruijnGraph(unsigned k) - : base(k), coverage_index_(*this), link_storage_{} {} + : base(k), coverage_index_(*this), lstorage_(LINK_ID_BIAS) {} CoverageIndex& coverage_index() { return coverage_index_; @@ -76,9 +78,22 @@ class DeBruijnGraph: public omnigraph::ObservableGraph { data(v).set_overlap(ovl); } + using base::conjugate; + LinkId add_link(EdgeId e1, EdgeId e2, unsigned ovl) { - link_storage_.emplace_back(e1, e2, ovl); - return link_storage_.size() - 1; + LinkId fwd = lstorage_.create(e1, e2, ovl); + if (e1 == this->conjugate(e2)) { + lstorage_.at(fwd.int_id()).conjugate_ = fwd; + return fwd; + } + LinkId conj = lstorage_.create(this->conjugate(e2), this->conjugate(e1), ovl); + lstorage_.at(fwd.int_id()).conjugate_ = conj; + lstorage_.at(conj.int_id()).conjugate_ = fwd; + return fwd; + } + + LinkId conjugate(LinkId id) const { + return lstorage_.at(id.int_id()).conjugate_; } void add_link(VertexId v, LinkId idx) { @@ -102,7 +117,7 @@ class DeBruijnGraph: public omnigraph::ObservableGraph { links.erase(std::remove_if(links.begin(), links.end(), [this, &e](const LinkId &link_id) { - return link_storage_[link_id].link.first == e; + return lstorage_.at(link_id.int_id()).link.first == e; }), links.end()); } @@ -111,7 +126,7 @@ class DeBruijnGraph: public omnigraph::ObservableGraph { links.erase(std::remove_if(links.begin(), links.end(), [this, &e](const LinkId &link_id) { - return link_storage_[link_id].link.second == e; + return lstorage_.at(link_id.int_id()).link.second == e; }), links.end()); } @@ -119,8 +134,8 @@ class DeBruijnGraph: public omnigraph::ObservableGraph { return data(v).links(); } - const auto& link(size_t idx) const { - return link_storage_[idx]; + const auto& link(LinkId idx) const { + return lstorage_.at(idx.int_id()); } bool is_complex(VertexId v) const { @@ -141,16 +156,9 @@ class DeBruijnGraph: public omnigraph::ObservableGraph { VERIFY_MSG(false, "Link " << in.int_id() << " -> " << out.int_id() << " was not found for vertex " << v.int_id()); } - void lreserve(size_t size) { link_storage_.reserve(size); } - - auto link_begin() { return link_storage_.begin(); } - auto link_end() { return link_storage_.end(); } - auto link_begin() const { return link_storage_.begin(); } - auto link_end() const { return link_storage_.end(); } - auto links() { return adt::make_range(link_begin(), link_end()); } - auto links() const { return adt::make_range(link_begin(), link_end()); } + void lreserve(size_t size) { lstorage_.reserve(size); } - size_t link_size() const {return link_storage_.size(); } + size_t link_size() const { return lstorage_.size(); } using base::AddVertex; using base::AddEdge; diff --git a/src/common/assembly_graph/core/graph_core.hpp b/src/common/assembly_graph/core/graph_core.hpp index 55b808f94e..2c36cf690f 100644 --- a/src/common/assembly_graph/core/graph_core.hpp +++ b/src/common/assembly_graph/core/graph_core.hpp @@ -8,6 +8,7 @@ #pragma once #include "id_distributor.hpp" +#include "id_storage.hpp" #include "utils/verify.hpp" #include "utils/logger/logger.hpp" #include "utils/stl_utils.hpp" @@ -76,6 +77,10 @@ struct EdgeId : public Id { using Id::Id; }; +struct LinkId : public Id { + using Id::Id; +}; + } template @@ -121,6 +126,7 @@ class PairedEdge { friend class GraphCore; friend class ConstructionHelper; friend class PairedElementManipulationHelper; + friend class IdStorage>; //todo unfriend friend class PairedVertex; VertexId end_; @@ -159,6 +165,7 @@ class PairedVertex { friend class ConstructionHelper; friend class PairedEdge; friend class PairedElementManipulationHelper; + friend class IdStorage>; template friend class conjugate_iterator; @@ -227,121 +234,7 @@ class GraphCore: private boost::noncopyable { static constexpr unsigned ID_BIAS = 3; template - class IdStorage { - private: - void resize(size_t N) { - // INFO("resize to " << N); - VERIFY(N > storage_size_); - T *new_storage = (T*)malloc(N * sizeof(T)); - // Note: we cannot iterate over id's here as resize() could - // be called during create() and therefore we might end with - // uninitialized newly created id as well. - for (uint64_t id = bias_; id < storage_size_; ++id) { - if (!id_distributor_.occupied(id)) - continue; - - T *val = &storage_[id]; - ::new(new_storage + id) T(std::move(*val)); - } - - if (storage_) - free(storage_); - storage_ = new_storage; - storage_size_ = N; - } - - public: - typedef omnigraph::ReclaimingIdDistributor::id_iterator id_iterator; - typedef T value_type; - - IdStorage(uint64_t bias = ID_BIAS) - : size_(0), bias_(bias), storage_(nullptr), storage_size_(0), id_distributor_(bias) { - resize(id_distributor_.size() + bias_); - } - - ~IdStorage() { - if (storage_) { - for (uint64_t id : id_distributor_.ids()) { - // INFO("~Remove " << id << ":" << typeid(T).name()); - - T *val = &storage_[id]; - val->~T(); - } - - free(storage_); - } - } - - id_iterator id_begin() const { return id_distributor_.begin(); } - id_iterator id_end() const { return id_distributor_.end(); } - uint64_t max_id() const { return id_distributor_.max_id(); } - - void reserve(size_t sz) { - if (storage_size_ >= sz + bias_) - return; - - id_distributor_.resize(sz); - resize(sz + bias_); - } - - // FIXME: Count! - size_t size() const noexcept { return size_; } - - bool contains(uint64_t id) const { - return id < storage_size_ && id_distributor_.occupied(id); - } - - template - uint64_t create(ArgTypes &&... args) { - uint64_t id = id_distributor_.allocate(); - - while (storage_size_ < id + 1) - resize(storage_size_ * 2 + 1); - - new(storage_ + id) T(std::forward(args)...);; - size_ += 1; - - // INFO("Create " << id << ":" << typeid(T).name()); - return id; - } - - template - uint64_t emplace(uint64_t at, ArgTypes &&... args) { - // One MUST call reserve before using emplace() - VERIFY(!id_distributor_.occupied(at)); - - id_distributor_.acquire(at); - new(storage_ + at) T(std::forward(args)...);; - size_.fetch_add(1); - - // INFO("Emplace " << at << ":" << typeid(T).name()); - return at; - } - - void erase(uint64_t id) { - T *v = &storage_[id]; - - // INFO("Remove " << id << ":" << typeid(T).name()); - v->~T(); - - id_distributor_.release(id); - size_ -= 1; - } - - T& at(uint64_t id) const noexcept { - return storage_[id]; - } - - uint64_t reserved() const { return id_distributor_.size(); } - void clear_state() { id_distributor_.clear_state(); } - - private: - std::atomic size_; - uint64_t bias_; - T *storage_; - size_t storage_size_; - omnigraph::ReclaimingIdDistributor id_distributor_; - }; + using IdStorage = omnigraph::IdStorage; using VertexStorage = IdStorage>; VertexStorage vstorage_; @@ -872,4 +765,11 @@ struct hash { } }; +template<> +struct hash { + size_t operator()(omnigraph::impl::LinkId l) const noexcept { + return l.hash(); + } +}; + } diff --git a/src/common/assembly_graph/core/id_storage.hpp b/src/common/assembly_graph/core/id_storage.hpp new file mode 100644 index 0000000000..57a2ce7941 --- /dev/null +++ b/src/common/assembly_graph/core/id_storage.hpp @@ -0,0 +1,128 @@ +//*************************************************************************** +//* Copyright (c) 2023-2024 SPAdes team +//* Copyright (c) 2015-2022 Saint Petersburg State University +//* All Rights Reserved +//* See file LICENSE for details. +//*************************************************************************** + +#pragma once + +#include "id_distributor.hpp" +#include "utils/verify.hpp" + +#include +#include + +namespace omnigraph { + +template +class IdStorage { + private: + void resize(size_t N) { + VERIFY(N > storage_size_); + T *new_storage = (T*)malloc(N * sizeof(T)); + // Note: we cannot iterate over id's here as resize() could + // be called during create() and therefore we might end with + // uninitialized newly created id as well. + for (uint64_t id = bias_; id < storage_size_; ++id) { + if (!id_distributor_.occupied(id)) + continue; + + T *val = &storage_[id]; + ::new(new_storage + id) T(std::move(*val)); + } + + if (storage_) + free(storage_); + storage_ = new_storage; + storage_size_ = N; + } + + public: + typedef omnigraph::ReclaimingIdDistributor::id_iterator id_iterator; + typedef T value_type; + + IdStorage(uint64_t bias) + : size_(0), bias_(bias), storage_(nullptr), storage_size_(0), id_distributor_(bias) { + resize(id_distributor_.size() + bias_); + } + + ~IdStorage() { + if (storage_) { + for (uint64_t id : id_distributor_.ids()) { + T *val = &storage_[id]; + val->~T(); + } + + free(storage_); + } + } + + id_iterator id_begin() const { return id_distributor_.begin(); } + id_iterator id_end() const { return id_distributor_.end(); } + uint64_t max_id() const { return id_distributor_.max_id(); } + + void reserve(size_t sz) { + if (storage_size_ >= sz + bias_) + return; + + id_distributor_.resize(sz); + resize(sz + bias_); + } + + size_t size() const noexcept { return size_; } + + bool contains(uint64_t id) const { + return id < storage_size_ && id_distributor_.occupied(id); + } + + template + uint64_t create(ArgTypes &&... args) { + uint64_t id = id_distributor_.allocate(); + + while (storage_size_ < id + 1) + resize(storage_size_ * 2 + 1); + + new(storage_ + id) T(std::forward(args)...);; + size_ += 1; + + return id; + } + + template + uint64_t emplace(uint64_t at, ArgTypes &&... args) { + // One MUST call reserve before using emplace() + VERIFY(!id_distributor_.occupied(at)); + + id_distributor_.acquire(at); + new(storage_ + at) T(std::forward(args)...);; + size_.fetch_add(1); + + return at; + } + + void erase(uint64_t id) { + T *v = &storage_[id]; + + v->~T(); + + id_distributor_.release(id); + size_ -= 1; + } + + T& at(uint64_t id) const noexcept { + return storage_[id]; + } + + uint64_t reserved() const { return id_distributor_.size(); } + void clear_state() { id_distributor_.clear_state(); } + + private: + std::atomic size_; + uint64_t bias_; + T *storage_; + size_t storage_size_; + omnigraph::ReclaimingIdDistributor id_distributor_; +}; + +} diff --git a/src/common/io/binary/graph.hpp b/src/common/io/binary/graph.hpp index 37b9898fb0..32019c6ad8 100644 --- a/src/common/io/binary/graph.hpp +++ b/src/common/io/binary/graph.hpp @@ -108,9 +108,8 @@ class GraphIO : public IOSingle { unsigned ovl; str >> e1 >> e2 >> e1_conj >> e2_conj >> ovl; auto link_id = graph.add_link(e1, e2, ovl); - auto conj_link_id = graph.add_link(e2_conj, e1_conj, ovl); link_ids.push_back(link_id); - conj_link_ids.push_back(conj_link_id); + conj_link_ids.push_back(graph.conjugate(link_id)); } std::vector empty_links; new_id = graph.AddVertex(debruijn_graph::DeBruijnVertexData(empty_links), ids[0], ids[1]); diff --git a/src/common/io/graph/gfa_reader.cpp b/src/common/io/graph/gfa_reader.cpp index e10f26666f..79be048b64 100644 --- a/src/common/io/graph/gfa_reader.cpp +++ b/src/common/io/graph/gfa_reader.cpp @@ -112,8 +112,6 @@ static void HandleLink(Links &links, e2 = g.conjugate(e2); links.emplace_back(e1, e2, record.overlap); - if (e1 != g.conjugate(e2)) - links.emplace_back(g.conjugate(e2), g.conjugate(e1), record.overlap); } static void HandleGapLink(GFAReader::GapLinks &links, @@ -129,8 +127,6 @@ static void HandleGapLink(GFAReader::GapLinks &links, e2 = g.conjugate(e2); links.emplace_back(e1, e2); - if (e1 != g.conjugate(e2)) - links.emplace_back(g.conjugate(e2), g.conjugate(e1)); } @@ -197,12 +193,17 @@ static std::pair ProcessLinks(DeBruijnGraph &g, const Links &lin if (simple) { g.set_overlap(v1, ovl); g.set_overlap(v2, ovl); + g.set_overlap(g.conjugate(v1), ovl); + g.set_overlap(g.conjugate(v2), ovl); } else { LinkId link_idx = g.add_link(e1, e2, ovl); g.add_link(v1, link_idx); + g.add_link(g.conjugate(v1), g.conjugate(link_idx)); if (v1 != v2) { g.add_links(v1, g.links(v2)); g.clear_links(v2); + g.add_links(g.conjugate(v1), g.links(g.conjugate(v2))); + g.clear_links(g.conjugate(v2)); } } diff --git a/src/test/debruijn/v_overlaps.cpp b/src/test/debruijn/v_overlaps.cpp index a7c51abe2a..885de3d217 100644 --- a/src/test/debruijn/v_overlaps.cpp +++ b/src/test/debruijn/v_overlaps.cpp @@ -98,17 +98,15 @@ void CheckLinks(std::ifstream &graph_stream, const Graph &graph, const IdMapper if (graph.is_complex(vertex)) { bool link_found = false; bool conj_link_found = false; - for (const auto &link: graph.links(vertex)) { - auto link_in_edge = graph.link(link).link.first; - auto link_out_edge = graph.link(link).link.second; + for (const auto &link_id: graph.links(vertex)) { + auto link_in_edge = graph.link(link_id).link.first; + auto link_out_edge = graph.link(link_id).link.second; if (link_in_edge == in_edge and link_out_edge == out_edge) { link_found = true; - } - } - for (const auto &link: graph.links(graph.conjugate(vertex))) { - auto link_in_edge = graph.link(link).link.first; - auto link_out_edge = graph.link(link).link.second; - if (link_in_edge == out_conjugate and link_out_edge == in_conjugate) { + // Verify conjugate links + LinkId conj_link_id = graph.conjugate(link_id); + EXPECT_EQ(graph.link(conj_link_id).link.first, out_conjugate); + EXPECT_EQ(graph.link(conj_link_id).link.second, in_conjugate); conj_link_found = true; } } @@ -135,24 +133,16 @@ void PerformSplits(debruijn_graph::Graph &graph, std::ifstream &ops_stream, cons " are not incident, operations are not consistent with the graph"); VertexId new_vertex; if (graph.is_complex(vertex)) { - LinkId split_link, conj_split_link; + LinkId split_link; bool link_found = false; - bool conj_link_found = false; for (auto &link_id: graph.links(vertex)) { if (graph.link(link_id).link.first == in_edge && graph.link(link_id).link.second == out_edge) { split_link = link_id; link_found = true; } } - for (auto &link_id: graph.links(graph.conjugate(vertex))) { - if (graph.link(link_id).link.first == graph.conjugate(out_edge) && - graph.link(link_id).link.second == graph.conjugate(in_edge)) { - conj_split_link = link_id; - conj_link_found = true; - } - } - - EXPECT_TRUE(link_found && conj_link_found); + EXPECT_TRUE(link_found); + LinkId conj_split_link = graph.conjugate(split_link); std::vector empty; new_vertex = helper.CreateVertex(debruijn_graph::DeBruijnVertexData(empty)); graph.add_link(new_vertex, split_link); From 339d29d99ebd4ef56b4aeaddd735dcc9b74bc40e Mon Sep 17 00:00:00 2001 From: Itolstoganov Date: Sun, 12 Apr 2026 19:56:20 +0200 Subject: [PATCH 12/17] Support conjugate data for complex vertices --- .../assembly_graph/core/construction_helper.hpp | 5 +++++ src/common/assembly_graph/core/debruijn_data.hpp | 11 +++++++++++ src/common/assembly_graph/core/graph.hpp | 4 ++++ src/projects/splitter/graph_resolver.cpp | 8 ++++++-- 4 files changed, 26 insertions(+), 2 deletions(-) diff --git a/src/common/assembly_graph/core/construction_helper.hpp b/src/common/assembly_graph/core/construction_helper.hpp index 875db15eb2..b19bb14ee7 100644 --- a/src/common/assembly_graph/core/construction_helper.hpp +++ b/src/common/assembly_graph/core/construction_helper.hpp @@ -137,6 +137,11 @@ class ConstructionHelper { return graph_.CreateVertex(std::move(data), id1, id2); } + VertexId CreateVertex(VertexData data, VertexData cdata, VertexId id1 = 0, VertexId id2 = 0) { + return graph_.CreateVertex(std::move(data), std::move(cdata), id1, id2); + } + + }; } diff --git a/src/common/assembly_graph/core/debruijn_data.hpp b/src/common/assembly_graph/core/debruijn_data.hpp index 994d5102ad..8047058ada 100644 --- a/src/common/assembly_graph/core/debruijn_data.hpp +++ b/src/common/assembly_graph/core/debruijn_data.hpp @@ -262,6 +262,17 @@ class DeBruijnDataMaster { VERIFY_MSG(false, "Conjugation of complex overlap data is not implemented") } + template + VertexData conjugate(const VertexData &data, LinkConjugate &&link_conjugate) const { + if (!data.has_complex_overlap() || data.links().empty()) + return data.clone(); + std::vector conjugated_links; + conjugated_links.reserve(data.links().size()); + for (const auto &lid : data.links()) + conjugated_links.push_back(link_conjugate(lid)); + return VertexData(conjugated_links); + } + size_t length(const EdgeData& data) const { return data.nucls().size() - k_; } diff --git a/src/common/assembly_graph/core/graph.hpp b/src/common/assembly_graph/core/graph.hpp index c35d2f86c3..b40cfd80ca 100644 --- a/src/common/assembly_graph/core/graph.hpp +++ b/src/common/assembly_graph/core/graph.hpp @@ -160,6 +160,10 @@ class DeBruijnGraph: public omnigraph::ObservableGraph { size_t link_size() const { return lstorage_.size(); } + VertexData ConjugateData(const VertexData &data) const { + return master().conjugate(data, [this](LinkId lid) { return conjugate(lid); }); + } + using base::AddVertex; using base::AddEdge; diff --git a/src/projects/splitter/graph_resolver.cpp b/src/projects/splitter/graph_resolver.cpp index d1a38f87fa..8ee8f380c8 100644 --- a/src/projects/splitter/graph_resolver.cpp +++ b/src/projects/splitter/graph_resolver.cpp @@ -34,7 +34,9 @@ GraphResolver::GraphResolverInfo::VertexMap GraphResolver::SplitVertices(debruij helper.DeleteLink(vertex, out_edge); helper.DeleteLink(graph.conjugate(vertex), graph.conjugate(in_edge)); std::vector links {link}; - VertexId new_vertex = helper.CreateVertex(debruijn_graph::DeBruijnVertexData(links)); + auto data = debruijn_graph::DeBruijnVertexData(links); + auto cdata = graph.ConjugateData(data); + VertexId new_vertex = helper.CreateVertex(std::move(data), std::move(cdata)); transformed_vertex_to_original[new_vertex] = vertex; helper.LinkIncomingEdge(new_vertex, in_edge); helper.LinkOutgoingEdge(new_vertex, out_edge); @@ -52,7 +54,9 @@ GraphResolver::GraphResolverInfo::VertexMap GraphResolver::SplitVertices(debruij new_links.push_back(link_id); } } - VertexId new_vertex = helper.CreateVertex(debruijn_graph::DeBruijnVertexData(new_links)); + auto data = debruijn_graph::DeBruijnVertexData(new_links); + auto cdata = graph.ConjugateData(data); + VertexId new_vertex = helper.CreateVertex(std::move(data), std::move(cdata)); for (const auto &in_edge: graph.IncomingEdges(vertex)) { helper.DeleteLink(graph.conjugate(vertex), graph.conjugate(in_edge)); helper.LinkIncomingEdge(new_vertex, in_edge); From b10f5c9b8d495a4fbaf4eee752c19b20ba9c6adc Mon Sep 17 00:00:00 2001 From: Itolstoganov Date: Mon, 13 Apr 2026 12:27:50 +0200 Subject: [PATCH 13/17] Add splitter to projects, clarify conjugate data --- src/cmake/proj.cmake | 2 +- src/common/assembly_graph/core/debruijn_data.hpp | 9 ++++----- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/src/cmake/proj.cmake b/src/cmake/proj.cmake index 9cc3b7834b..3899c2f1a5 100644 --- a/src/cmake/proj.cmake +++ b/src/cmake/proj.cmake @@ -8,7 +8,7 @@ # Side-by-side subprojects layout: automatically set the # SPADES_EXTERNAL_${project}_SOURCE_DIR using SPADES_ALL_PROJECTS -set(SPADES_ALL_PROJECTS "spades;hammer;ionhammer;corrector;spaligner;spades_tools;binspreader;pathracer;hpcspades") +set(SPADES_ALL_PROJECTS "spades;hammer;ionhammer;corrector;spaligner;spades_tools;binspreader;pathracer;splitter;hpcspades") set(SPADES_EXTRA_PROJECTS "mts;online_vis;cds_subgraphs") set(SPADES_KNOWN_PROJECTS "${SPADES_ALL_PROJECTS};${SPADES_EXTRA_PROJECTS}") set(SPADES_ENABLE_PROJECTS "" CACHE STRING diff --git a/src/common/assembly_graph/core/debruijn_data.hpp b/src/common/assembly_graph/core/debruijn_data.hpp index 8047058ada..d31cdf0f39 100644 --- a/src/common/assembly_graph/core/debruijn_data.hpp +++ b/src/common/assembly_graph/core/debruijn_data.hpp @@ -255,21 +255,20 @@ class DeBruijnDataMaster { return EdgeData(!(data.nucls())); } - // FIXME: support complex overlaps VertexData conjugate(const VertexData &data) const { if (!data.has_complex_overlap() or data.links().empty()) return data.clone(); - VERIFY_MSG(false, "Conjugation of complex overlap data is not implemented") + VERIFY_MSG(false, "Conjugate overlap data has to be provided separately for complex vertices") } - template - VertexData conjugate(const VertexData &data, LinkConjugate &&link_conjugate) const { + template + VertexData conjugate(const VertexData &data, LinkConjugator &&link_conjugator) const { if (!data.has_complex_overlap() || data.links().empty()) return data.clone(); std::vector conjugated_links; conjugated_links.reserve(data.links().size()); for (const auto &lid : data.links()) - conjugated_links.push_back(link_conjugate(lid)); + conjugated_links.push_back(link_conjugator(lid)); return VertexData(conjugated_links); } From 63915c1dcb7988111f5732a4bff0f438f3d82c92 Mon Sep 17 00:00:00 2001 From: Itolstoganov Date: Mon, 13 Apr 2026 13:00:16 +0200 Subject: [PATCH 14/17] Minor doc fixes --- docs/installation.md | 5 +++-- docs/splitter.md | 6 +++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/docs/installation.md b/docs/installation.md index 9f98f07146..34ff5541c0 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -102,14 +102,15 @@ SPAdes algorithms. This includes: - MPI-aware version of SPAdes pipeline hpcSPAdes These tools are not built by default and therefore must be built separately. One -can pass `-SPADES_ENABLE_PROJECTS="semicolon-separated list of projects"` to enable building only +can pass `-DSPADES_ENABLE_PROJECTS="semicolon-separated list of projects"` to enable building only subset of SPAdes components. The components are: - `spades` - [`spades_tools`](standalone.md) - - [`binspareader`](binspreader.md) + - [`binspreader`](binspreader.md) - [`pathracer`](pathracer.md) - [`spaligner`](spaligner.md) + - [`splitter`](splitter.md) - [`hpcspades`](hpc.md) By default, only SPAdes and SPAdes tools are enabled (so diff --git a/docs/splitter.md b/docs/splitter.md index 8592b6531e..6d72c9be56 100644 --- a/docs/splitter.md +++ b/docs/splitter.md @@ -7,7 +7,7 @@ SPlitteR is a tool that uses synthetic long reads (SLRs) to improve the contigui To compile SPlitteR, run ``` -./spades_compile -SPADES_ENABLE_PROJECTS=splitter +./spades_compile.sh -DSPADES_ENABLE_PROJECTS=splitter ``` ## Input @@ -124,7 +124,7 @@ Developer options: Example command lines: -- Assembly produced LJA from HiFi diploid human dataset, with 10X SLR library (HPC compressed)\ +- Assembly produced by LJA from HiFi diploid human dataset, with 10X SLR library (HPC compressed)\ `splitter lja_output/mdbg/mdbg.hpc.gfa 10x_dataset.yaml output -Mdiploid -Gmdbg` - Assembly produced by metaFlye from metagenomic dataset, with TELL-Seq SLR library\ `splitter metaflye_output/assembly_graph.gfa tellseq_dataset.yaml output --assembly-info metaflye_output/assembly_info.txt -Mmeta -Gblunt` @@ -146,6 +146,6 @@ In addition ## References -If you are using **BinSPreader** in your research, please cite: +If you are using **SPlitteR** in your research, please cite: [Tolstoganov et al., 2024](https://doi.org/10.7717/peerj.18050) \ No newline at end of file From 4fbe4713814db97f3099a769c63cae186a9c8b70 Mon Sep 17 00:00:00 2001 From: Itolstoganov Date: Mon, 13 Apr 2026 15:46:36 +0200 Subject: [PATCH 15/17] Add some comments, vertex resolver refactoring --- .../splitter/barcode_index_construction.cpp | 8 +-- src/projects/splitter/graph_resolver.cpp | 23 +++++---- src/projects/splitter/main.cpp | 4 +- src/projects/splitter/path_extractor.cpp | 2 +- .../splitter/scaffold_graph_helper.cpp | 2 +- src/projects/splitter/vertex_resolver.hpp | 51 ++++++++++--------- 6 files changed, 48 insertions(+), 42 deletions(-) diff --git a/src/projects/splitter/barcode_index_construction.cpp b/src/projects/splitter/barcode_index_construction.cpp index 6eda228326..d6eacc7bf9 100644 --- a/src/projects/splitter/barcode_index_construction.cpp +++ b/src/projects/splitter/barcode_index_construction.cpp @@ -34,7 +34,7 @@ void ConstructBarcodeIndex(barcode_index::FrameBarcodeIndex buffer(graph, frame_size); FrameBarcodeIndexBuilder barcode_index_builder(graph, mapper, barcode_prefices, frame_size, nthreads); bool is_tellseq = lib.type() == io::LibraryType::TellSeqReads; - if (not is_tellseq) { + if (!is_tellseq) { barcode_index_builder.ConstructBarcodeIndex(io::paired_easy_readers(lib, false, 0), barcode_index, lib, is_tellseq); } if (is_tellseq) { @@ -69,14 +69,14 @@ void DownsampleBarcodeIndex(const debruijn_graph::Graph &graph, unsigned nthreads, barcode_index::FrameBarcodeIndex &barcode_index, barcode_index::FrameBarcodeIndex &downsampled_index, - double sampling_factor) { - VERIFY_DEV(math::ls(sampling_factor, 1.0)); + double sampling_fraction) { + VERIFY_MSG(math::ls(sampling_fraction, 1.0), "Sampling fraction must be less than 1"); const size_t mapping_k = 31; const std::vector barcode_prefices = {"BC:Z:", "BX:Z:"}; debruijn_graph::Graph empty_graph(mapping_k); alignment::BWAReadMapper mapper(empty_graph); FrameBarcodeIndexBuilder barcode_index_builder(graph, mapper, barcode_prefices, barcode_index.GetFrameSize(), nthreads); - barcode_index_builder.DownsampleBarcodeIndex(downsampled_index, barcode_index, sampling_factor); + barcode_index_builder.DownsampleBarcodeIndex(downsampled_index, barcode_index, sampling_fraction); } } diff --git a/src/projects/splitter/graph_resolver.cpp b/src/projects/splitter/graph_resolver.cpp index 8ee8f380c8..1a44c0ebde 100644 --- a/src/projects/splitter/graph_resolver.cpp +++ b/src/projects/splitter/graph_resolver.cpp @@ -22,14 +22,14 @@ GraphResolver::GraphResolverInfo::VertexMap GraphResolver::SplitVertices(debruij const auto &vertex_result = vertex_entry.second; if (vertex_result.state == VertexState::Completely or vertex_result.state == VertexState::Partially) { - auto in_to_correct_link = GetLinkMap(graph, vertex, vertex_result); - VERIFY_DEV(in_to_correct_link.size() == vertex_entry.second.supported_pairs.size()); + auto in_to_link = GetLinkMap(graph, vertex, vertex_result); + VERIFY_MSG(in_to_link.size() == vertex_entry.second.supported_pairs.size(), "Some barcode-supported pairs do not have a GFA link"); std::unordered_set resolved_in_edges; std::unordered_set resolved_out_edges; for (const auto &entry: vertex_result.supported_pairs) { EdgeId in_edge = entry.first; EdgeId out_edge = entry.second; - LinkId link = in_to_correct_link.at(in_edge); + LinkId link = in_to_link.at(in_edge); DEBUG("In edge: " << in_edge.int_id() << ", out edge: " << out_edge.int_id() << ", vertex: " << vertex.int_id()); helper.DeleteLink(vertex, out_edge); helper.DeleteLink(graph.conjugate(vertex), graph.conjugate(in_edge)); @@ -50,7 +50,8 @@ GraphResolver::GraphResolverInfo::VertexMap GraphResolver::SplitVertices(debruij std::vector new_links; for (const auto &link_id: links) { auto link = graph.link(link_id); - if (resolved_in_edges.find(link.link.first) == resolved_in_edges.end() and resolved_out_edges.find(link.link.second) == resolved_out_edges.end()) { + if (resolved_in_edges.find(link.link.first) == resolved_in_edges.end() && + resolved_out_edges.find(link.link.second) == resolved_out_edges.end()) { new_links.push_back(link_id); } } @@ -83,7 +84,7 @@ GraphResolver::GraphResolverInfo::EdgeMap GraphResolver::MergePaths(debruijn_gra std::vector overlaps; const auto &first_path = *(path.first); for (size_t i = 0; i < first_path.Size(); ++i) { - if (i > 0 and graph.is_complex(graph.EdgeStart(first_path[i]))) { + if (i > 0 && graph.is_complex(graph.EdgeStart(first_path[i]))) { size_t overlap = graph.link_length(graph.EdgeStart(first_path[i]), first_path[i - 1], first_path[i]); overlaps.push_back(static_cast(overlap)); } @@ -109,18 +110,20 @@ GraphResolver::GraphResolverInfo GraphResolver::TransformGraph(debruijn_graph::G GraphResolver::LinkMap GraphResolver::GetLinkMap(const debruijn_graph::Graph &graph, GraphResolver::VertexId vertex, const VertexResult &vertex_result) const { - std::unordered_map in_to_out; + std::unordered_map in_to_link; std::unordered_map in_to_correct_link; for (const auto &entry: vertex_result.supported_pairs) { EdgeId in_edge = entry.first; EdgeId out_edge = entry.second; - VERIFY_DEV(in_to_out.find(in_edge) == in_to_out.end()); - in_to_out[in_edge] = out_edge; + VERIFY_MSG(in_to_link.find(in_edge) == in_to_link.end(), + "Barcode supported pair " << std::to_string(in_edge.int_id()) << ", " << + std::to_string(out_edge.int_id()) << " does not have a corresponding GFA link"); + in_to_link[in_edge] = out_edge; } for (const LinkId &link_id: graph.links(vertex)) { const auto &link = graph.link(link_id); - auto in_result = in_to_out.find(link.link.first); - if (in_result == in_to_out.end()) { + auto in_result = in_to_link.find(link.link.first); + if (in_result == in_to_link.end()) { continue; } if (in_result->second == link.link.second) { diff --git a/src/projects/splitter/main.cpp b/src/projects/splitter/main.cpp index 44ab1c6999..bf77153073 100644 --- a/src/projects/splitter/main.cpp +++ b/src/projects/splitter/main.cpp @@ -251,9 +251,9 @@ cont_index::VertexResolver::LinkMap GetTrustedContigLinks trusted_link_map[graph.conjugate(next)].insert(graph.conjugate(current)); } } - size_t total_links = 0; + size_t total_score = 0; for (const auto &entry: trusted_link_map) { - total_links += entry.second.size(); + total_score += entry.second.size(); } return trusted_link_map; } diff --git a/src/projects/splitter/path_extractor.cpp b/src/projects/splitter/path_extractor.cpp index 06f4c7f2d6..26d083d68d 100644 --- a/src/projects/splitter/path_extractor.cpp +++ b/src/projects/splitter/path_extractor.cpp @@ -125,7 +125,7 @@ PathExtractor::ScaffoldLinks PathExtractor::GetScaffoldLinks(const VertexResults } DEBUG("Constructing path map"); for (const auto &entry: vertex_result.supported_pairs) { - if (vertex_result.state == VertexState::Completely or vertex_result.state == VertexState::Partially) { + if (vertex_result.state == VertexState::Completely || vertex_result.state == VertexState::Partially) { if (in_to_out.find(entry.first) == in_to_out.end()) { TRACE(entry.first.int_id() << "," << entry.second.int_id()); if (IsGraphLink(entry.first, entry.second, vertex_link_storage)) { diff --git a/src/projects/splitter/scaffold_graph_helper.cpp b/src/projects/splitter/scaffold_graph_helper.cpp index 2f97ce8ddf..9cc0085559 100644 --- a/src/projects/splitter/scaffold_graph_helper.cpp +++ b/src/projects/splitter/scaffold_graph_helper.cpp @@ -145,7 +145,7 @@ scaffold_graph::ScaffoldGraph GetTellSeqScaffoldGraph(const debruijn_graph::Grap io::IdMapper *id_mapper) { auto path_to_scaffold_graph = output_dir / "tellseq_links.scg"; scaffold_graph::ScaffoldGraph scaffold_graph(g); - if (!bin_load or !std::filesystem::exists(path_to_scaffold_graph)) { + if (!bin_load || !std::filesystem::exists(path_to_scaffold_graph)) { LinkIndexGraphConstructor link_index_constructor(g, barcode_extractor, score_threshold, diff --git a/src/projects/splitter/vertex_resolver.hpp b/src/projects/splitter/vertex_resolver.hpp index 676e319879..3951a568e8 100644 --- a/src/projects/splitter/vertex_resolver.hpp +++ b/src/projects/splitter/vertex_resolver.hpp @@ -24,17 +24,19 @@ enum class VertexState { struct VertexResult { VertexResult(VertexState state, - const size_t &total_links, - const size_t &supporting_links, + const size_t &total_score, + const size_t &supporting_score, const std::unordered_map &supported_pairs) : state(state), - total_links(total_links), - supporting_links(supporting_links), + total_score(total_score), + supporting_score(supporting_score), supported_pairs(supported_pairs) {} VertexState state; - size_t total_links; - size_t supporting_links; + // Total score between all pairs of in- and out- edges + size_t total_score; + // Total score between resolved pairs of in- and out- edges + size_t supporting_score; std::unordered_map supported_pairs; }; @@ -105,12 +107,12 @@ class VertexResolver { VertexResult ResolveVertex(debruijn_graph::VertexId vertex, LinkIndexGraphConstructor::BarcodeScoreFunctionPtr score_function) const { - size_t total_links = 0; - size_t answer_links = 0; + size_t total_score = 0; + size_t total_supporting_score = 0; std::unordered_map in_to_out; bool is_ambiguous = false; std::unordered_set covered_vertices; - double LINK_BONUS = 1000000; + const double TRUSTED_LINK_BONUS = 1000000; for (EdgeId sc_in_edge: graph_.IncomingEdges(vertex)) { //convert to dbg EdgeId @@ -119,9 +121,9 @@ class VertexResolver { debruijn_graph::EdgeId in_edge = edge_getter.GetEdgeFromScaffoldVertex(sc_in_vertex); std::pair max_pair(0, 0); - std::pair second_pair(0, 0); - size_t max_links = 0; - size_t second_links = 0; + std::pair contender_pair(0, 0); + size_t max_score = 0; + size_t contender_score = 0; for (EdgeId sc_out_edge: graph_.OutgoingEdges(vertex)) { scaffold_graph::ScaffoldVertex sc_out_vertex(sc_out_edge); debruijn_graph::EdgeId out_edge = edge_getter.GetEdgeFromScaffoldVertex(sc_out_vertex); @@ -133,29 +135,29 @@ class VertexResolver { auto score = score_function->GetScore(sc_edge); auto link_result = links_.find(in_edge); if (link_result != links_.end() and link_result->second.find(out_edge) != link_result->second.end()) { - score += LINK_BONUS; + score += TRUSTED_LINK_BONUS; } - total_links += static_cast(score); + total_score += static_cast(score); if (math::ge(score, score_threshold_)) { covered_vertices.insert(vertex); - if (score > static_cast(max_links)) { - second_pair = max_pair; - second_links = max_links; - max_links = static_cast(score); + if (score > static_cast(max_score)) { + contender_pair = max_pair; + contender_score = max_score; + max_score = static_cast(score); max_pair = std::make_pair(in_edge, out_edge); } } } - if (static_cast(max_links) < static_cast(second_links) * rel_threshold_) { + if (static_cast(max_score) < static_cast(contender_score) * rel_threshold_) { is_ambiguous = true; - } else if (static_cast(max_links) >= score_threshold_) { + } else if (static_cast(max_score) >= score_threshold_) { in_to_out[max_pair.first] = max_pair.second; - answer_links += max_links; + total_supporting_score += max_score; } } bool is_covered = covered_vertices.find(vertex) != covered_vertices.end(); VertexState state = GetState(in_to_out, vertex, is_ambiguous, is_covered); - VertexResult result(state, total_links, answer_links, in_to_out); + VertexResult result(state, total_score, total_supporting_score, in_to_out); return result; } @@ -232,8 +234,8 @@ class VertexResolver { vertex_string += std::to_string(graph_.OutgoingEdgeCount(vertex)) + "\t" + out_edge_string + "\t" + result_string + "\t"; vertex_string += - std::to_string(vertex_result.supported_pairs.size()) + "\t" + std::to_string(vertex_result.total_links); - vertex_string += "\t" + std::to_string(vertex_result.supporting_links) + "\t" + answer_string; + std::to_string(vertex_result.supported_pairs.size()) + "\t" + std::to_string(vertex_result.total_score); + vertex_string += "\t" + std::to_string(vertex_result.supporting_score) + "\t" + answer_string; return vertex_string; } private: @@ -279,6 +281,7 @@ class VertexResolver { Graph &graph_; const debruijn_graph::Graph &assembly_graph_; + // Trusted link map (for meta mode) const LinkMap links_; std::shared_ptr barcode_extractor_ptr_; size_t count_threshold_; From 6258dbfd45a8f368dfe51af145c8281974245c95 Mon Sep 17 00:00:00 2001 From: Itolstoganov Date: Sun, 10 May 2026 17:34:10 +0200 Subject: [PATCH 16/17] Refactor parallel scaffold graph construction --- .../modules/path_extend/pipeline/launcher.cpp | 5 +- .../scaffold_graph_constructor.cpp | 131 ++++++++---------- .../scaffold_graph_constructor.hpp | 28 ++++ 3 files changed, 83 insertions(+), 81 deletions(-) diff --git a/src/common/modules/path_extend/pipeline/launcher.cpp b/src/common/modules/path_extend/pipeline/launcher.cpp index 14edbf98ee..7cd49622db 100644 --- a/src/common/modules/path_extend/pipeline/launcher.cpp +++ b/src/common/modules/path_extend/pipeline/launcher.cpp @@ -89,10 +89,7 @@ void PathExtendLauncher::PrintScaffoldGraph(const scaffold_graph::ScaffoldGraph const debruijn_graph::GenomeConsistenceChecker &genome_checker, const std::filesystem::path &filename) const { using namespace scaffolder; - std::set scaff_vertex_set; - for (const auto& edge: main_edge_set) { - scaff_vertex_set.insert(edge); - } + std::set scaff_vertex_set(main_edge_set.begin(), main_edge_set.end()); auto vertex_colorer = std::make_shared(scaff_vertex_set); auto edge_colorer = std::make_shared(); graph_colorer::CompositeGraphColorer colorer(vertex_colorer, edge_colorer); diff --git a/src/common/modules/path_extend/scaffolder2015/scaffold_graph_constructor.cpp b/src/common/modules/path_extend/scaffolder2015/scaffold_graph_constructor.cpp index d47b65df88..7373795447 100644 --- a/src/common/modules/path_extend/scaffolder2015/scaffold_graph_constructor.cpp +++ b/src/common/modules/path_extend/scaffolder2015/scaffold_graph_constructor.cpp @@ -32,7 +32,7 @@ void BaseScaffoldGraphConstructor::ConstructFromSet(const EdgeSet &edge_set, for (const auto &v: edge_set) { graph_->AddVertex(v); } - INFO("Added vertices") + INFO("Added vertices"); ConstructFromConditions(connection_conditions, use_terminal_vertices_only); } @@ -90,34 +90,23 @@ PredicateScaffoldGraphFilter::PredicateScaffoldGraphFilter(const Graph &assembly void PredicateScaffoldGraphFilter::ConstructFromGraphAndPredicate(const ScaffoldGraph &old_graph, std::shared_ptr predicate) { + using ScEdge = ScaffoldGraph::ScaffoldEdge; for (const auto& vertex: old_graph.vertices()) { graph_->AddVertex(vertex); } - std::vector scaffold_edges; - for (const auto& edge: old_graph.edges()) { - scaffold_edges.push_back(edge); - } - size_t counter = 0; - const size_t block_size = scaffold_edges.size() / 10; - size_t threads = max_threads_; - DEBUG("Number of threads: " << threads); -#pragma omp parallel for num_threads(threads) - for (size_t i = 0; i < scaffold_edges.size(); ++i) { - auto edge = scaffold_edges[i]; - TRACE("Checking"); - bool check_predicate = (*predicate)(edge); - TRACE("Check result: " << check_predicate); -#pragma omp critical - { - if (check_predicate) { - graph_->AddEdge(edge); - } - ++counter; - if (block_size != 0 and counter % block_size == 0) { - DEBUG("Processed " << counter << " edges out of " << scaffold_edges.size()); - } - } - } + std::vector scaffold_edges(old_graph.edges().begin(), + old_graph.edges().end()); + DEBUG("Number of threads: " << max_threads_); + size_t num_blocks = 10; + auto kept_edges = FilterEdgesParallel(scaffold_edges, max_threads_, num_blocks, + [&](const ScEdge &edge) -> std::optional { + if ((*predicate)(edge)) return edge; + return std::nullopt; + }, + [&](size_t done, size_t total) { + DEBUG("Processed " << done << " edges out of " << total); + }); + for (const auto &edge : kept_edges) graph_->AddEdge(edge); } std::shared_ptr PredicateScaffoldGraphFilter::Construct() { @@ -134,35 +123,28 @@ ScoreFunctionScaffoldGraphFilter::ScoreFunctionScaffoldGraphFilter(const Graph & void ScoreFunctionScaffoldGraphFilter::ConstructFromGraphAndScore(const ScaffoldGraph &graph, std::shared_ptr score_function, double score_threshold, size_t threads) { + using ScEdge = ScaffoldGraph::ScaffoldEdge; for (const auto& vertex: graph.vertices()) { graph_->AddVertex(vertex); } - std::vector scaffold_edges; - for (const auto& edge: graph.edges()) { - scaffold_edges.push_back(edge); - } - size_t counter = 0; - const size_t block_size = scaffold_edges.size() / 25; - #pragma omp parallel for num_threads(threads) - for (size_t i = 0; i < scaffold_edges.size(); ++i) { - ScaffoldGraph::ScaffoldEdge edge = scaffold_edges[i]; - double score = score_function->GetScore(edge); - #pragma omp critical - { - TRACE("Checking edge " << edge.getStart().int_id() << " -> " << edge.getEnd().int_id()); - TRACE("Score: " << score); - TRACE("Score threshold: " << score_threshold); - if (math::ge(score, score_threshold)) { - TRACE("Success"); - graph_->AddEdge(edge.getStart(), edge.getEnd(), edge.getColor(), score, edge.getLength()); - } - TRACE("Edge added"); - ++counter; - if (counter % block_size == 0) { - INFO("Processed " << counter << " edges out of " << scaffold_edges.size()); + size_t num_blocks = 25; + std::vector scaffold_edges(graph.edges().begin(), graph.edges().end()); + auto kept_edges = FilterEdgesParallel(scaffold_edges, threads, num_blocks, + [&](const ScEdge &edge) -> std::optional { + double score = score_function->GetScore(edge); + if (!math::ge(score, score_threshold)) { + return std::nullopt; } - } - } + return ScEdge(edge.getStart(), + edge.getEnd(), + edge.getColor(), + score, + edge.getLength()); + }, + [&](size_t done, size_t total) { + INFO("Processed " << done << " edges out of " << total); + }); + for (const auto &edge : kept_edges) graph_->AddEdge(edge); } std::shared_ptr ScoreFunctionScaffoldGraphFilter::Construct() { ConstructFromGraphAndScore(old_graph_, score_function_, score_threshold_, num_threads_); @@ -187,35 +169,30 @@ void ScoreFunctionGraphConstructor::ConstructFromScore(std::shared_ptrAddVertex(chunk.vertex_); } - size_t approx_block_counter = 0; - const size_t NUM_BLOCKS = 100; - const size_t block_size = chunks_.size() / NUM_BLOCKS; -#pragma omp parallel for schedule(guided) - for (size_t i = 0; i < chunks_.size(); ++i) { - for (auto it = chunks_[i].begin_; it != chunks_[i].end_; ++it) { - const ScaffoldVertex &first = chunks_[i].vertex_; + std::vector> pairs; + for (const auto &chunk : chunks_) { + const ScaffoldVertex &first = chunk.vertex_; + const auto &conjugate = first.GetConjugateFromGraph(graph_->AssemblyGraph()); + for (auto it = chunk.begin_; it != chunk.end_; ++it) { //todo move this check elsewhere - if (first != *it and first.GetConjugateFromGraph(graph_->AssemblyGraph()) != *it) { - ScaffoldGraph::ScaffoldEdge edge(first, *it, 0, .0, 0); - double score = score_function->GetScore(edge); - if (math::ge(score, score_threshold)) { - #pragma omp critical - { - TRACE("Success"); - ScaffoldGraph::ScaffoldEdge new_edge(first, *it, 0, score, 0); - graph_->AddEdgeSimple(new_edge); - } - } - } - } - if (i % block_size == 0 and i != 0) { -#pragma omp critical - { - ++approx_block_counter; - INFO("Processed " << approx_block_counter << " out of " << NUM_BLOCKS << " blocks"); - } + if (first != *it and conjugate != *it) + pairs.emplace_back(first, *it); } } + size_t num_blocks = 100; + auto kept_edges = FilterEdgesParallel(pairs, num_threads_, num_blocks, + [&](const std::pair &p) -> std::optional { + ScaffoldGraph::ScaffoldEdge probe(p.first, p.second, 0, .0, 0); + double score = score_function->GetScore(probe); + if (!math::ge(score, score_threshold)) { + return std::nullopt; + } + return ScaffoldGraph::ScaffoldEdge(p.first, p.second, 0, score, 0); + }, + [&](size_t done, size_t total) { + INFO("Processed " << done << " pairs out of " << total); + }); + for (const auto &edge : kept_edges) graph_->AddEdgeSimple(edge); } std::shared_ptr ScaffoldSubgraphConstructor::Construct() { diff --git a/src/common/modules/path_extend/scaffolder2015/scaffold_graph_constructor.hpp b/src/common/modules/path_extend/scaffolder2015/scaffold_graph_constructor.hpp index 1c61a36e70..6e54951f35 100644 --- a/src/common/modules/path_extend/scaffolder2015/scaffold_graph_constructor.hpp +++ b/src/common/modules/path_extend/scaffolder2015/scaffold_graph_constructor.hpp @@ -15,6 +15,7 @@ #include "connection_condition2015.hpp" #include "modules/path_extend/read_cloud_path_extend/scaffold_graph_construction/read_cloud_connection_conditions.hpp" +#include #include #include #include @@ -59,6 +60,33 @@ class BaseScaffoldGraphConstructor: public ScaffoldGraphConstructor { ConnectionConditions &connection_conditions, bool use_terminal_vertices_only = false); + template + static std::vector + FilterEdgesParallel(const std::vector &in_edges, size_t threads, + size_t num_log_blocks, EdgeConstructor edge_constructor, + LogProgress log_progress) { + using ScEdge = scaffold_graph::ScaffoldGraph::ScaffoldEdge; + std::vector> kept_edges(in_edges.size()); + std::atomic counter = 0; + const size_t block_size = num_log_blocks ? in_edges.size() / num_log_blocks : 0; + +#pragma omp parallel for num_threads(threads) schedule(guided) + for (size_t i = 0; i < in_edges.size(); ++i) { + kept_edges[i] = edge_constructor(in_edges[i]); + size_t done = ++counter; + if (block_size != 0 && done % block_size == 0) { + log_progress(done, in_edges.size()); + } + } + + std::vector result; + for (auto &k : kept_edges) + if (k) { + result.push_back(std::move(*k)); + } + return result; + } + DECL_LOGGER("BaseScaffoldGraphConstructor"); }; From 546fad2b7e0e6b2b6daba693e44fb0e7d7dd34fc Mon Sep 17 00:00:00 2001 From: Itolstoganov Date: Sun, 10 May 2026 18:12:29 +0200 Subject: [PATCH 17/17] Randomize barcode downsampling --- .../barcode_index/barcode_index_builder.hpp | 33 +++++++++---------- .../splitter/barcode_index_construction.cpp | 11 ++++--- .../splitter/barcode_index_construction.hpp | 3 +- src/projects/splitter/main.cpp | 14 ++++---- 4 files changed, 32 insertions(+), 29 deletions(-) diff --git a/src/common/barcode_index/barcode_index_builder.hpp b/src/common/barcode_index/barcode_index_builder.hpp index 7c7c78d898..3db0b6d528 100644 --- a/src/common/barcode_index/barcode_index_builder.hpp +++ b/src/common/barcode_index/barcode_index_builder.hpp @@ -14,6 +14,8 @@ #include "alignment/sequence_mapper_notifier.hpp" #include "alignment/sequence_mapper.hpp" +#include +#include #include #include #include @@ -199,31 +201,29 @@ class FrameBarcodeIndexBuilder { template void ConstructBarcodeIndex(io::ReadStreamList read_streams, FrameBarcodeIndex &barcode_index, - const io::SequencingLibraryBase &lib, bool is_tellseq); - void DownsampleBarcodeIndex(FrameBarcodeIndex &downsampled_index, FrameBarcodeIndex &original_index, double sampling_factor) { + void DownsampleBarcodeIndex(FrameBarcodeIndex &downsampled_index, + FrameBarcodeIndex &original_index, + double sampling_factor, + int seed) { std::unordered_set barcodes; - std::unordered_set passed_barcodes; - BarcodeId min_barcode = std::numeric_limits::max(); - BarcodeId max_barcode = std::numeric_limits::min(); for (auto it = original_index.begin(); it != original_index.end(); ++it) { const auto &barcode_distribution = it->second.GetDistribution(); for (const auto &entry: barcode_distribution) { - BarcodeId current_barcode = entry.first; - barcodes.insert(current_barcode); - min_barcode = std::min(min_barcode, current_barcode); - max_barcode = std::max(max_barcode, current_barcode); + barcodes.insert(entry.first); } } INFO("Number of encountered barcodes: " << barcodes.size()); - INFO("Barcode id range: " << min_barcode << ", " << max_barcode); - double barcode_thr = static_cast(max_barcode - min_barcode) * sampling_factor; - for (const auto &barcode: barcodes) { - if (math::le(static_cast(barcode - min_barcode), barcode_thr)) { - passed_barcodes.insert(barcode); - } - } + size_t target = static_cast(std::round(static_cast(barcodes.size()) * sampling_factor)); + std::unordered_set passed_barcodes; + passed_barcodes.reserve(target); + std::mt19937 rng(seed); + std::sample(barcodes.begin(), + barcodes.end(), + std::inserter(passed_barcodes, passed_barcodes.end()), + target, + rng); INFO("Passed barcodes: " << passed_barcodes.size()); downsampled_index.InitialFillMap(); @@ -248,7 +248,6 @@ class FrameBarcodeIndexBuilder { template void FrameBarcodeIndexBuilder::ConstructBarcodeIndex(io::ReadStreamList read_streams, FrameBarcodeIndex &barcode_index, - const io::SequencingLibraryBase &lib, bool is_tellseq) { { size_t starting_barcode = 0; diff --git a/src/projects/splitter/barcode_index_construction.cpp b/src/projects/splitter/barcode_index_construction.cpp index d6eacc7bf9..d0497bbb3a 100644 --- a/src/projects/splitter/barcode_index_construction.cpp +++ b/src/projects/splitter/barcode_index_construction.cpp @@ -35,11 +35,11 @@ void ConstructBarcodeIndex(barcode_index::FrameBarcodeIndex &barcode_index, barcode_index::FrameBarcodeIndex &downsampled_index, - double sampling_fraction) { + double sampling_fraction, + int seed) { VERIFY_MSG(math::ls(sampling_fraction, 1.0), "Sampling fraction must be less than 1"); const size_t mapping_k = 31; const std::vector barcode_prefices = {"BC:Z:", "BX:Z:"}; debruijn_graph::Graph empty_graph(mapping_k); alignment::BWAReadMapper mapper(empty_graph); FrameBarcodeIndexBuilder barcode_index_builder(graph, mapper, barcode_prefices, barcode_index.GetFrameSize(), nthreads); - barcode_index_builder.DownsampleBarcodeIndex(downsampled_index, barcode_index, sampling_fraction); + barcode_index_builder.DownsampleBarcodeIndex(downsampled_index, barcode_index, sampling_fraction, seed); } } diff --git a/src/projects/splitter/barcode_index_construction.hpp b/src/projects/splitter/barcode_index_construction.hpp index b511614332..6ad931071e 100644 --- a/src/projects/splitter/barcode_index_construction.hpp +++ b/src/projects/splitter/barcode_index_construction.hpp @@ -36,5 +36,6 @@ void DownsampleBarcodeIndex(const debruijn_graph::Graph &graph, unsigned nthreads, barcode_index::FrameBarcodeIndex &barcode_index, barcode_index::FrameBarcodeIndex &downsampled_index, - double sampling_factor); + double sampling_factor, + int seed); } diff --git a/src/projects/splitter/main.cpp b/src/projects/splitter/main.cpp index bf77153073..fa42d24320 100644 --- a/src/projects/splitter/main.cpp +++ b/src/projects/splitter/main.cpp @@ -57,6 +57,7 @@ struct gcfg { size_t frame_size = 40000; size_t read_linkage_distance = 40000; double sampling_factor = 1.0; + int seed = 999; //graph construction double graph_score_threshold = 2.0; @@ -89,6 +90,7 @@ static void process_cmdline(int argc, char** argv, gcfg& cfg) { (option("-l") & integer("value", cfg.libindex)) % "library index (0-based, default: 0)", (option("-t") & integer("value", cfg.nthreads)) % "# of threads to use", (option("--sampling-factor") & value("sampling-factor", cfg.sampling_factor)) % "Sampling factor for read downsampling", + (option("--seed") & value("seed", cfg.seed)) % "Seed for barcode downsampling", (option("--assembly-info") & value("assembly-info", assembly_info)) % "Path to metaflye assembly_info.txt file (meta mode, metaFlye graphs only)", (with_prefix("-G", @@ -162,11 +164,11 @@ struct TimeTracerRAII { gfa::GFAReader ReadGraph(const gcfg &cfg, debruijn_graph::Graph &graph, io::IdMapper *id_mapper) { - gfa::GFAReader gfa(cfg.graph); - gfa.to_graph(graph, id_mapper); - INFO("GFA segments: " << gfa.num_edges() << ", links: " << gfa.num_links() << ", paths: " - << gfa.num_paths()); - return gfa; + gfa::GFAReader gfa(cfg.graph); + gfa.to_graph(graph, id_mapper); + INFO("GFA segments: " << gfa.num_edges() << ", links: " << gfa.num_links() << ", paths: " + << gfa.num_paths()); + return gfa; } std::unordered_set ParseRepetitiveEdges(const debruijn_graph::Graph &graph, @@ -401,7 +403,7 @@ int main(int argc, char** argv) { if (not math::eq(cfg.sampling_factor, 1.0)) { INFO("Downsampling the barcode index with factor " << cfg.sampling_factor); cont_index::DownsampleBarcodeIndex(graph, cfg.nthreads, barcode_index, downsampled_index, - cfg.sampling_factor); + cfg.sampling_factor, cfg.seed); barcode_extractor_ptr = std::make_shared(downsampled_index, graph); }