Skip to content

Parallel linear system builder #58

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
#include <gtsam_points/optimizers/linear_solver.hpp>
#include <gtsam_points/optimizers/linear_system_builder.hpp>

#include <gtsam_points/util/easy_profiler.hpp>

namespace gtsam_points {

Expand Down
52 changes: 45 additions & 7 deletions src/gtsam_points/optimizers/levenberg_marquardt_ext.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,18 @@

#include <gtsam_points/optimizers/linearization_hook.hpp>

#include <numeric>
#include <gtsam/base/Vector.h>
#include <gtsam/linear/GaussianFactorGraph.h>
#include <gtsam/nonlinear/NonlinearFactorGraph.h>
#include <gtsam/nonlinear/internal/LevenbergMarquardtState.h>
#include <gtsam/inference/Ordering.h>
#include <gtsam_points/config.hpp>
#include <gtsam_points/util/parallelism.hpp>

#ifdef GTSAM_POINTS_USE_TBB
#include <tbb/parallel_for.h>
#endif

#include <boost/format.hpp>
#include <boost/range/adaptor/map.hpp>
Expand All @@ -35,6 +42,38 @@ namespace gtsam_points {

using State = gtsam::internal::LevenbergMarquardtState;

// Compute the total error of a linearized (Gaussian) factor graph at `x`.
// When OpenMP is the selected parallelism backend, defer to GTSAM's own
// parallel evaluation; otherwise evaluate each factor with TBB and sum.
double calc_error(const gtsam::GaussianFactorGraph& gfg, const gtsam::VectorValues& x) {
  if (!is_omp_default()) {
#ifdef GTSAM_POINTS_USE_TBB
    // TODO: Should use parallel reduction
    std::vector<double> per_factor_errors(gfg.size(), 0.0);
    tbb::parallel_for(static_cast<size_t>(0), gfg.size(), [&](size_t factor_index) {
      per_factor_errors[factor_index] = gfg[factor_index]->error(x);
    });
    return std::accumulate(per_factor_errors.begin(), per_factor_errors.end(), 0.0);
#else
    std::cerr << "error: gtsam_points is not built with TBB!!" << std::endl;
    return gfg.error(x);
#endif
  }
  return gfg.error(x);
}

// Compute the total error of a nonlinear factor graph at the values `x`.
// When OpenMP is the selected parallelism backend, defer to GTSAM's own
// parallel evaluation; otherwise evaluate each factor with TBB and sum.
double calc_error(const gtsam::NonlinearFactorGraph& graph, const gtsam::Values& x) {
  if (is_omp_default()) {
    return graph.error(x);
  } else {
#ifdef GTSAM_POINTS_USE_TBB
    // TODO: Should use parallel reduction
    std::vector<double> errors(graph.size(), 0.0);
    tbb::parallel_for(static_cast<size_t>(0), graph.size(), [&](size_t i) { errors[i] = graph[i]->error(x); });
    return std::accumulate(errors.begin(), errors.end(), 0.0);
#else
    std::cerr << "error: gtsam_points is not built with TBB!!" << std::endl;
    // Fix: fall back on this overload's own graph (`graph`); the original
    // returned `gfg.error(x)`, which does not exist here and fails to compile
    // when GTSAM_POINTS_USE_TBB is not defined.
    return graph.error(x);
#endif
  }
}

LevenbergMarquardtExtParams LevenbergMarquardtExtParams::ensureHasOrdering(const gtsam::NonlinearFactorGraph& graph) const {
if (ordering) {
return *this;
Expand Down Expand Up @@ -107,8 +146,8 @@ bool LevenbergMarquardtOptimizerExt::tryLambda(
if (systemSolvedSuccessfully) {
// Compute the old linearized error as it is not the same
// as the nonlinear error when robust noise models are used.
double oldLinearizedError = linear.error(gtsam::VectorValues::Zero(delta));
double newlinearizedError = linear.error(delta);
double oldLinearizedError = calc_error(linear, gtsam::VectorValues::Zero(delta));
double newlinearizedError = calc_error(linear, delta);
// cost change in the linearized system (old - new)
double linearizedCostChange = oldLinearizedError - newlinearizedError;

Expand All @@ -119,7 +158,7 @@ bool LevenbergMarquardtOptimizerExt::tryLambda(

// cost change in the original, nonlinear system (old - new)
linearization_hook->error(newValues);
newError = graph_.error(newValues);
newError = calc_error(graph_, newValues);
costChange = oldError - newError;

if (linearizedCostChange > std::numeric_limits<double>::epsilon() * oldLinearizedError) {
Expand All @@ -138,7 +177,6 @@ bool LevenbergMarquardtOptimizerExt::tryLambda(
}
}
} // if (systemSolvedSuccessfully)

if (params_.callback || params_.status_msg_callback) {
LevenbergMarquardtOptimizationStatus status;
status.iterations = currentState->iterations;
Expand Down Expand Up @@ -179,7 +217,7 @@ bool LevenbergMarquardtOptimizerExt::tryLambda(
// TODO(frank): make Values actually support move. Does not seem to happen now.
state_ = currentState->decreaseLambda(params_, modelFidelity, std::move(newValues), newError);
return true;
} else if (!stopSearchingLambda) { // we failed to solve the system or had no decrease in cost
} else if (!stopSearchingLambda) {  // we failed to solve the system or had no decrease in cost
State* modifiedState = static_cast<State*>(state_.get());
modifiedState->increaseLambda(params_); // TODO(frank): make this functional with Values move

Expand All @@ -190,7 +228,7 @@ bool LevenbergMarquardtOptimizerExt::tryLambda(
} else {
return false; // only case where we will keep trying
}
} else { // the change in the cost is very small and it is not worth trying bigger lambdas
} else { // the change in the cost is very small and it is not worth trying bigger lambdas
return true;
}
}
Expand All @@ -206,7 +244,7 @@ gtsam::GaussianFactorGraph::shared_ptr LevenbergMarquardtOptimizerExt::iterate()
linearization_time = std::chrono::duration_cast<std::chrono::nanoseconds>(t2 - t1);

linearization_hook->error(currentState->values);
double oldError = graph_.error(currentState->values);
double oldError = calc_error(graph_, currentState->values);
// double oldError = graph_.error(currentState->values, linear);

gtsam::VectorValues sqrtHessianDiagonal;
Expand Down
219 changes: 166 additions & 53 deletions src/gtsam_points/optimizers/linear_system_builder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,18 @@

#include <gtsam_points/optimizers/linear_system_builder.hpp>

#include <numeric>
#include <unordered_map>
#include <gtsam/linear/HessianFactor.h>

#include <gtsam_points/config.hpp>
#include <gtsam_points/util/parallelism.hpp>

#ifdef GTSAM_POINTS_USE_TBB
#include <tbb/parallel_for.h>
#include <tbb/concurrent_hash_map.h>
#endif

namespace {

template <typename T>
Expand Down Expand Up @@ -71,87 +80,191 @@ SparseLinearSystemBuilder<BLOCK_SIZE>::SparseLinearSystemBuilder(const gtsam::Ga
}
}

std::unordered_map<std::pair<int, int>, BlockMatrix, PairHasher<int>> A_blocks;
std::unordered_map<int, BlockVector> b_blocks;
c = 0.0;

for (const auto& gf : gfg) {
std::vector<BlockSlot> slots(gf->keys().size());
std::transform(gf->keys().begin(), gf->keys().end(), slots.begin(), [&](const gtsam::Key key) { return block_slots[key]; });
if (is_omp_default()) {
std::unordered_map<std::pair<int, int>, BlockMatrix, PairHasher<int>> A_blocks;
std::unordered_map<int, BlockVector> b_blocks;

for (const auto& gf : gfg) {
std::vector<BlockSlot> slots(gf->keys().size());
std::transform(gf->keys().begin(), gf->keys().end(), slots.begin(), [&](const gtsam::Key key) { return block_slots[key]; });

const auto augmented = gf->augmentedInformation();
const int N = augmented.rows() - 1;

const auto augmented = gf->augmentedInformation();
const int N = augmented.rows() - 1;
c += augmented(augmented.rows() - 1, augmented.cols() - 1);

c += augmented(augmented.rows() - 1, augmented.cols() - 1);
for (int i = 0, index_i = 0; i < gf->keys().size(); index_i += slots[i++].dim) {
const int dim_i = slots[i].dim;
const int global_index_i = slots[i].index;

for (int i = 0, index_i = 0; i < gf->keys().size(); index_i += slots[i++].dim) {
const int dim_i = slots[i].dim;
const int global_index_i = slots[i].index;
BlockVector b_i;
if constexpr (BLOCK_SIZE < 0) {
b_i = augmented.block(index_i, N, dim_i, 1);
} else {
b_i = augmented.block<BLOCK_SIZE, 1>(index_i, N);
}

BlockVector b_i;
if constexpr (BLOCK_SIZE < 0) {
b_i = augmented.block(index_i, N, dim_i, 1);
} else {
b_i = augmented.block<BLOCK_SIZE, 1>(index_i, N);
auto found = b_blocks.find(global_index_i);
if (found == b_blocks.end()) {
b_blocks.emplace_hint(found, global_index_i, b_i);
} else {
found->second += b_i;
}
}

auto found = b_blocks.find(global_index_i);
if (found == b_blocks.end()) {
b_blocks.emplace_hint(found, global_index_i, b_i);
} else {
found->second += b_i;
for (int i = 0, index_i = 0; i < gf->keys().size(); index_i += slots[i++].dim) {
for (int j = i, index_j = index_i; j < gf->keys().size(); index_j += slots[j++].dim) {
const int dim_i = slots[i].dim;
const int dim_j = slots[j].dim;
int global_index_i = slots[i].index;
int global_index_j = slots[j].index;

BlockMatrix H_ij;
if constexpr (BLOCK_SIZE < 0) {
H_ij = augmented.block(index_i, index_j, dim_i, dim_j);
} else {
H_ij = augmented.block<BLOCK_SIZE, BLOCK_SIZE>(index_i, index_j);
}

if (global_index_i < global_index_j) {
std::swap(global_index_i, global_index_j);
H_ij = H_ij.transpose().eval();
}

auto found = A_blocks.find({global_index_i, global_index_j});
if (found == A_blocks.end()) {
A_blocks.emplace_hint(found, std::make_pair(global_index_i, global_index_j), H_ij);
} else {
found->second += H_ij;
}
}
}
}

for (int i = 0, index_i = 0; i < gf->keys().size(); index_i += slots[i++].dim) {
for (int j = i, index_j = index_i; j < gf->keys().size(); index_j += slots[j++].dim) {
std::vector<Eigen::Triplet<double>> triplets;
triplets.reserve(A_blocks.size() * 6 * 6);

for (const auto& [index, H_ij] : A_blocks) {
const auto [index_i, index_j] = index;
Eigen::SparseMatrix<double> sH_ij = H_ij.sparseView();

for (int k = 0; k < sH_ij.outerSize(); k++) {
for (Eigen::SparseMatrix<double>::InnerIterator it(sH_ij, k); it; ++it) {
triplets.emplace_back(index_i + it.row(), index_j + it.col(), it.value());
}
}
}

A.resize(total_dim, total_dim);
A.setFromTriplets(triplets.begin(), triplets.end());

b.resize(total_dim);
for (const auto& [index_i, b_i] : b_blocks) {
b.block(index_i, 0, b_i.size(), 1) = b_i;
}
} else {
#ifdef GTSAM_POINTS_USE_TBB
using BlockMatrixMap = tbb::concurrent_hash_map<std::uint64_t, BlockMatrix>;
using BlockVectorMap = tbb::concurrent_hash_map<std::uint64_t, BlockVector>;
BlockMatrixMap A_blocks;
BlockVectorMap b_blocks;
std::vector<double> all_c(gfg.size(), 0.0);

const auto perfactor_task = [&](const gtsam::GaussianFactor& gf) {
std::vector<BlockSlot> slots(gf.keys().size());
std::transform(gf.keys().begin(), gf.keys().end(), slots.begin(), [&](const gtsam::Key key) { return block_slots[key]; });

const auto augmented = gf.augmentedInformation();
const int N = augmented.rows() - 1;

// Add b
for (int i = 0, index_i = 0; i < gf.keys().size(); index_i += slots[i++].dim) {
const int dim_i = slots[i].dim;
const int dim_j = slots[j].dim;
int global_index_i = slots[i].index;
int global_index_j = slots[j].index;
const int global_index_i = slots[i].index;

BlockMatrix H_ij;
BlockVector b_i;
if constexpr (BLOCK_SIZE < 0) {
H_ij = augmented.block(index_i, index_j, dim_i, dim_j);
b_i = augmented.block(index_i, N, dim_i, 1);
} else {
H_ij = augmented.block<BLOCK_SIZE, BLOCK_SIZE>(index_i, index_j);
b_i = augmented.block<BLOCK_SIZE, 1>(index_i, N);
}

if (global_index_i < global_index_j) {
std::swap(global_index_i, global_index_j);
H_ij = H_ij.transpose().eval();
typename BlockVectorMap::accessor a;
if (!b_blocks.insert(a, global_index_i)) {
a->second += b_i;
} else {
a->second = b_i;
}
}

auto found = A_blocks.find({global_index_i, global_index_j});
if (found == A_blocks.end()) {
A_blocks.emplace_hint(found, std::make_pair(global_index_i, global_index_j), H_ij);
} else {
found->second += H_ij;
// Add A
for (int i = 0, index_i = 0; i < gf.keys().size(); index_i += slots[i++].dim) {
for (int j = i, index_j = index_i; j < gf.keys().size(); index_j += slots[j++].dim) {
const int dim_i = slots[i].dim;
const int dim_j = slots[j].dim;
int global_index_i = slots[i].index;
int global_index_j = slots[j].index;

BlockMatrix H_ij;
if constexpr (BLOCK_SIZE < 0) {
H_ij = augmented.block(index_i, index_j, dim_i, dim_j);
} else {
H_ij = augmented.block<BLOCK_SIZE, BLOCK_SIZE>(index_i, index_j);
}

if (global_index_i < global_index_j) {
std::swap(global_index_i, global_index_j);
H_ij = H_ij.transpose().eval();
}

const size_t key = (static_cast<std::uint64_t>(global_index_i) << 32) | global_index_j;

typename BlockMatrixMap::accessor a;
if (!A_blocks.insert(a, key)) {
a->second += H_ij;
} else {
a->second = H_ij;
}
}
}
}
}

std::vector<Eigen::Triplet<double>> triplets;
triplets.reserve(A_blocks.size() * 6 * 6);
return augmented(augmented.rows() - 1, augmented.cols() - 1);
};

for (const auto& [index, H_ij] : A_blocks) {
const auto [index_i, index_j] = index;
Eigen::SparseMatrix<double> sH_ij = H_ij.sparseView();
tbb::parallel_for(static_cast<size_t>(0), gfg.size(), [&](size_t i) { all_c[i] = perfactor_task(*gfg[i]); });
c = std::accumulate(all_c.begin(), all_c.end(), 0.0);

for (int k = 0; k < sH_ij.outerSize(); k++) {
for (Eigen::SparseMatrix<double>::InnerIterator it(sH_ij, k); it; ++it) {
triplets.emplace_back(index_i + it.row(), index_j + it.col(), it.value());
std::vector<Eigen::Triplet<double>> triplets;
triplets.reserve(A_blocks.size() * 6 * 6);

for (auto itr = A_blocks.begin(); itr != A_blocks.end(); itr++) {
const size_t key = itr->first;
const int index_i = key >> 32;
const int index_j = key & ((1ull << 32) - 1);
const BlockMatrix& H_ij = itr->second;

Eigen::SparseMatrix<double> sH_ij = H_ij.sparseView();

for (int k = 0; k < sH_ij.outerSize(); k++) {
for (Eigen::SparseMatrix<double>::InnerIterator it(sH_ij, k); it; ++it) {
triplets.emplace_back(index_i + it.row(), index_j + it.col(), it.value());
}
}
}
}

A.resize(total_dim, total_dim);
A.setFromTriplets(triplets.begin(), triplets.end());
A.resize(total_dim, total_dim);
A.setFromTriplets(triplets.begin(), triplets.end());

b.resize(total_dim);
for (auto itr = b_blocks.begin(); itr != b_blocks.end(); itr++) {
b.block(itr->first, 0, itr->second.size(), 1) = itr->second;
}

b.resize(total_dim);
for (const auto& [index_i, b_i] : b_blocks) {
b.block(index_i, 0, b_i.size(), 1) = b_i;
#else
std::cerr << "error : gtsam_points is not built with TBB!!" << std::endl;
#endif
}
}

Expand Down