Skip to content

Commit 1e72eb1

Browse files
committed
perf: TBB parallelization for HessianFactor.
Parallelize the updateHessian operation when forming A'*A in the HessianFactor merge constructor. For large matrices (>50 rows), the work is split across columns using TBB parallel_for, with each thread updating a disjoint set of block columns. Key changes: - Add column-range overload of updateHessian() to GaussianFactor, HessianFactor, JacobianFactor, and RegularImplicitSchurFactor - Add setZeroColumns() method to SymmetricBlockMatrix for efficient column-wise zeroing using memset In my toy example it reducded updateHessian time from ~72s to ~41s. (with 10 threads and GTSAM_TBB_BOUNDED_MEMORY_GROWTH=OFF)
1 parent e018acc commit 1e72eb1

File tree

10 files changed

+398
-10
lines changed

10 files changed

+398
-10
lines changed

gtsam/base/SymmetricBlockMatrix.h

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
#include <boost/serialization/nvp.hpp>
2626
#endif
2727
#include <cassert>
28+
#include <cstring>
2829
#include <stdexcept>
2930
#include <array>
3031
#include <vector>
@@ -300,6 +301,25 @@ namespace gtsam {
300301
matrix_.setZero();
301302
}
302303

304+
/// Set the block columns between beginCol (inclusive) and endCol (exclusive) to zero.
305+
void setZeroColumns(DenseIndex beginCol, DenseIndex endCol) {
306+
assert(beginCol < endCol);
307+
assert(beginCol >= 0);
308+
assert(endCol >= 0);
309+
assert(beginCol < nBlocks());
310+
assert(endCol <= nBlocks());
311+
static_assert(Matrix::IsRowMajor == 0, "setZeroColumns requires column-major storage.");
312+
313+
const DenseIndex denseBeginCol = offset(beginCol);
314+
const DenseIndex denseEndCol = offset(endCol);
315+
316+
double *begin = matrix_.data() + denseBeginCol * matrix_.rows();
317+
double *end = matrix_.data() + denseEndCol * matrix_.rows();
318+
319+
// Using memset for maximal compiler optimization.
320+
memset(begin, 0, (end - begin) * sizeof(*begin));
321+
}
322+
303323
/// Negate the entire active matrix.
304324
void negate();
305325

gtsam/base/tests/testSymmetricBlockMatrix.cpp

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,28 @@ TEST(SymmetricBlockMatrix, WriteBlocks)
7979
EXPECT(assert_equal(expected3, actual3));
8080
}
8181

82+
/* ************************************************************************* */
83+
TEST(SymmetricBlockMatrix, setZeroColumns) {
84+
// Expected: columns 3 and 4 are zero
85+
Matrix expected = testBlockMatrix.selfadjointView().toDenseMatrix().eval();
86+
expected.col(3).setZero();
87+
expected.col(4).setZero();
88+
expected = expected.triangularView<Eigen::Upper>().toDenseMatrix().eval();
89+
90+
SymmetricBlockMatrix bm = testBlockMatrix;
91+
92+
// Zero out the middle block (block 1, columns 3-4)
93+
bm.setZeroColumns(1, 2);
94+
95+
Matrix result = bm.selfadjointView()
96+
.toDenseMatrix()
97+
.triangularView<Eigen::Upper>()
98+
.toDenseMatrix()
99+
.eval();
100+
101+
EXPECT(assert_equal(expected, result));
102+
}
103+
82104
/* ************************************************************************* */
83105
// Verify block range access.
84106
TEST(SymmetricBlockMatrix, Ranges)

gtsam/linear/GaussianFactor.h

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -150,6 +150,18 @@ namespace gtsam {
150150
virtual void updateHessian(const KeyVector& keys,
151151
SymmetricBlockMatrix* info) const = 0;
152152

153+
/** Update an information matrix by adding the information corresponding to this factor
154+
* (used internally during elimination), restricted to a range of block columns,
155+
* useful for parallelization.
156+
* @param keys The ordered vector of keys for the information matrix to be updated
157+
* @param info The information matrix to be updated
158+
* @param beginCol First block column index (inclusive) in the range to update
159+
* @param endCol Last block column index (exclusive) in the range to update
160+
*/
161+
virtual void updateHessian(const KeyVector& keys,
162+
SymmetricBlockMatrix* info,
163+
DenseIndex beginCol, DenseIndex endCol) const = 0;
164+
153165
/// @}
154166
/// @name Operator interface
155167
/// @{

gtsam/linear/HessianFactor.cpp

Lines changed: 105 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,14 @@
3434
#include <cassert>
3535
#include <limits>
3636

37+
#ifdef GTSAM_USE_TBB
38+
#include <tbb/blocked_range.h>
39+
#include <tbb/parallel_for.h>
40+
#include <oneapi/tbb/global_control.h>
41+
#include <oneapi/tbb/task_arena.h>
42+
#include <algorithm>
43+
#endif
44+
3745
using namespace std;
3846

3947
namespace gtsam {
@@ -241,17 +249,65 @@ HessianFactor::HessianFactor(const GaussianFactorGraph& factors,
241249
const Scatter& scatter) {
242250
gttic(HessianFactor_MergeConstructor);
243251

252+
gttic(Allocate);
244253
Allocate(scatter);
254+
gttoc(Allocate);
255+
256+
// Only parallelize the inner loop if GTSAM_TBB_BOUNDED_MEMORY_GROWTH_FLAG is
257+
// defined. Without this flag, multiple HessianFactor constructions are
258+
// already running in parallel (e.g., when constructing multiple factors
259+
// concurrently), so there's no need to parallelize the inner
260+
// updateHessian loop here as well.
261+
#if defined(GTSAM_USE_TBB) && defined(GTSAM_TBB_BOUNDED_MEMORY_GROWTH_FLAG)
262+
constexpr DenseIndex kParallelThresholdHeuristic = 50;
263+
if (info_.rows() > kParallelThresholdHeuristic) {
264+
gttic(updateHessian_TBB);
265+
266+
const DenseIndex M = info_.nBlocks();
267+
268+
auto numThreads = std::min(
269+
static_cast<int>(oneapi::tbb::global_control::active_value(
270+
oneapi::tbb::global_control::max_allowed_parallelism)),
271+
static_cast<int>(oneapi::tbb::this_task_arena::max_concurrency()));
272+
273+
if (numThreads > 1) {
274+
DenseIndex grain = std::max<DenseIndex>(1, M / (2 * numThreads));
275+
tbb::parallel_for(tbb::blocked_range<DenseIndex>(0, M, grain),
276+
[&, M](const tbb::blocked_range<DenseIndex>& range) {
277+
// reverse the range to start from the end because
278+
// matrix is upper triangular and therefore end is
279+
// larger than begin so we would like to start with
280+
// the last column that is the most work and go to the
281+
// first column that is least.
282+
DenseIndex beginCol = M - range.end();
283+
DenseIndex endCol = M - range.begin();
284+
info_.setZeroColumns(beginCol, endCol);
285+
for (const auto& factor : factors) {
286+
if (factor) {
287+
factor->updateHessian(keys_, &info_, beginCol,
288+
endCol);
289+
}
290+
}
291+
});
292+
return;
293+
}
294+
}
295+
#endif
296+
gttic(setAllZero);
297+
info_.setAllZero();
298+
gttoc(setAllZero);
299+
300+
gttic(updateHessian);
245301

246302
// Form A' * A
247-
gttic(update);
248-
info_.setAllZero();
249-
for(const auto& factor: factors)
250-
if (factor)
303+
for (const auto& factor : factors) {
304+
if (factor) {
251305
factor->updateHessian(keys_, &info_);
252-
gttoc(update);
306+
}
307+
}
253308
}
254309

310+
255311
/* ************************************************************************* */
256312
void HessianFactor::print(const std::string& s,
257313
const KeyFormatter& formatter) const {
@@ -352,14 +408,58 @@ void HessianFactor::updateHessian(const KeyVector& infoKeys,
352408
gttic(updateHessian_HessianFactor);
353409
const DenseIndex nrVariablesInThisFactor = size();
354410

411+
gttic(slots);
355412
vector<DenseIndex> slots(nrVariablesInThisFactor + 1);
356413
for (DenseIndex j = 0; j < nrVariablesInThisFactor; ++j)
357414
slots[j] = Slot(infoKeys, keys_[j]);
415+
358416
slots[nrVariablesInThisFactor] = info->nBlocks() - 1;
417+
gttoc(slots);
359418

360419
info->updateFromMappedBlocks(info_, slots);
361420
}
362421

422+
/* ************************************************************************* */
423+
void HessianFactor::updateHessian(const KeyVector& infoKeys,
424+
SymmetricBlockMatrix* info,
425+
DenseIndex beginCol,
426+
DenseIndex endCol) const {
427+
assert(info);
428+
const DenseIndex nrVariablesInThisFactor = size();
429+
430+
vector<DenseIndex> slots;
431+
slots.reserve(nrVariablesInThisFactor + 1);
432+
433+
for (DenseIndex j = 0; j < nrVariablesInThisFactor; ++j) {
434+
slots.push_back(Slot(infoKeys, keys_[j]));
435+
}
436+
slots.push_back(info->nBlocks() - 1);
437+
438+
for (DenseIndex j = 0; j <= nrVariablesInThisFactor; ++j) {
439+
const DenseIndex J = slots[j];
440+
// Update diagonal block if J is in range
441+
if (J >= beginCol && J < endCol) {
442+
info->updateDiagonalBlock(J, info_.diagonalBlock(j));
443+
}
444+
445+
// Update off-diagonal blocks where column max(I, J) is in range
446+
// Note: We process all blocks and let the maxCol check filter them,
447+
// because I and J may be in different orders (slots are not necessarily sorted)
448+
for (DenseIndex i = 0; i < j; ++i) {
449+
const DenseIndex I = slots[i];
450+
assert(i < j);
451+
assert(I != J);
452+
453+
// The physical column index in the symmetric matrix is max(I, J)
454+
const DenseIndex maxCol = std::max(I, J);
455+
456+
if (maxCol >= beginCol && maxCol < endCol) {
457+
info->updateOffDiagonalBlock(I, J, info_.aboveDiagonalBlock(i, j));
458+
}
459+
}
460+
}
461+
}
462+
363463
/* ************************************************************************* */
364464
GaussianFactor::shared_ptr HessianFactor::negate() const {
365465
shared_ptr result = std::make_shared<This>(*this);

gtsam/linear/HessianFactor.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -326,6 +326,17 @@ namespace gtsam {
326326
*/
327327
void updateHessian(const KeyVector& keys, SymmetricBlockMatrix* info) const override;
328328

329+
/** Update an information matrix by adding the information corresponding to this factor
330+
* (used internally during elimination), restricted to a range of block columns,
331+
* useful for parallelization.
332+
* @param keys The ordered vector of keys for the information matrix to be updated
333+
* @param info The information matrix to be updated
334+
* @param beginCol First block column index (inclusive) in the range to update
335+
* @param endCol Last block column index (exclusive) in the range to update
336+
*/
337+
void updateHessian(const KeyVector& keys, SymmetricBlockMatrix* info,
338+
DenseIndex beginCol, DenseIndex endCol) const override;
339+
329340
/** Update another Hessian factor
330341
* @param other the HessianFactor to be updated
331342
*/

gtsam/linear/JacobianFactor.cpp

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -630,6 +630,72 @@ void JacobianFactor::updateHessian(const KeyVector& infoKeys,
630630
}
631631
}
632632

633+
/* ************************************************************************* */
634+
static void whitenedUpdateHessian(SymmetricBlockMatrix* info, std::vector<DenseIndex> slots,
635+
const VerticalBlockMatrix& Ab_, DenseIndex beginCol, DenseIndex endCol) {
636+
const DenseIndex n = Ab_.nBlocks() - 1;
637+
638+
for (DenseIndex j = 0; j <= n; ++j) {
639+
const DenseIndex J = slots[j];
640+
Eigen::Block<const Matrix> Ab_j = Ab_(j);
641+
642+
// Update diagonal block if J is in range
643+
if (J >= beginCol && J < endCol) {
644+
info->diagonalBlock(J).rankUpdate(Ab_j.transpose());
645+
}
646+
647+
// Fill off-diagonal blocks with Ai'*Aj where column max(I, J) is in range
648+
for (DenseIndex i = 0; i < j; ++i) {
649+
const DenseIndex I = slots[i];
650+
651+
// The physical column index in the symmetric matrix is max(I, J)
652+
const DenseIndex maxCol = std::max(I, J);
653+
if (maxCol >= beginCol && maxCol < endCol) {
654+
// Pass original indices - updateOffDiagonalBlock handles swapping internally
655+
info->updateOffDiagonalBlock(I, J, Ab_(i).transpose() * Ab_j);
656+
}
657+
}
658+
}
659+
}
660+
661+
void JacobianFactor::updateHessian(const KeyVector& infoKeys,
662+
SymmetricBlockMatrix* info,
663+
DenseIndex beginCol, DenseIndex endCol) const {
664+
if (rows() == 0) return;
665+
666+
// Ab_ is the augmented Jacobian matrix A, and we perform I += A'*A below.
667+
DenseIndex n = Ab_.nBlocks() - 1;
668+
669+
// Pre-calculate slots
670+
vector<DenseIndex> slots;
671+
slots.reserve(n + 1);
672+
bool foundCol = false;
673+
for (DenseIndex j = 0; j < n; ++j) {
674+
slots.push_back(Slot(infoKeys, keys_[j]));
675+
if (slots[j] >= beginCol && slots[j] < endCol) {
676+
foundCol = true;
677+
}
678+
}
679+
slots.push_back(info->nBlocks() - 1);
680+
if (slots[n] >= beginCol && slots[n] < endCol) {
681+
foundCol = true;
682+
}
683+
if (!foundCol) return;
684+
685+
// Whiten the factor if it has a noise model
686+
const SharedDiagonal& model = get_model();
687+
if (model && !model->isUnit()) {
688+
if (model->isConstrained())
689+
throw invalid_argument(
690+
"JacobianFactor::updateHessian: cannot update information with "
691+
"constrained noise model");
692+
JacobianFactor whitenedFactor = whiten();
693+
whitenedUpdateHessian(info, std::move(slots), whitenedFactor.Ab_, beginCol, endCol);
694+
} else {
695+
whitenedUpdateHessian(info, std::move(slots), Ab_, beginCol, endCol);
696+
}
697+
}
698+
633699
/* ************************************************************************* */
634700
Vector JacobianFactor::operator*(const VectorValues& x) const {
635701
Vector Ax(Ab_.rows());

gtsam/linear/JacobianFactor.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -331,6 +331,17 @@ namespace gtsam {
331331
*/
332332
void updateHessian(const KeyVector& keys, SymmetricBlockMatrix* info) const override;
333333

334+
/** Update an information matrix by adding the information corresponding to this factor
335+
* (used internally during elimination), restricted to a range of block columns,
336+
* useful for parallelization.
337+
* @param keys The ordered vector of keys for the information matrix to be updated
338+
* @param info The information matrix to be updated
339+
* @param beginCol First block column index (inclusive) in the range to update
340+
* @param endCol Last block column index (exclusive) in the range to update
341+
*/
342+
void updateHessian(const KeyVector& keys, SymmetricBlockMatrix* info,
343+
DenseIndex beginCol, DenseIndex endCol) const override;
344+
334345
/** Return A*x */
335346
Vector operator*(const VectorValues& x) const;
336347

0 commit comments

Comments
 (0)