|
34 | 34 | #include <cassert> |
35 | 35 | #include <limits> |
36 | 36 |
|
| 37 | +#ifdef GTSAM_USE_TBB |
| 38 | + #include <tbb/blocked_range.h> |
| 39 | + #include <tbb/parallel_for.h> |
| 40 | + #include <oneapi/tbb/global_control.h> |
| 41 | + #include <oneapi/tbb/task_arena.h> |
| 42 | + #include <algorithm> |
| 43 | +#endif |
| 44 | + |
37 | 45 | using namespace std; |
38 | 46 |
|
39 | 47 | namespace gtsam { |
@@ -241,17 +249,65 @@ HessianFactor::HessianFactor(const GaussianFactorGraph& factors, |
241 | 249 | const Scatter& scatter) { |
242 | 250 | gttic(HessianFactor_MergeConstructor); |
243 | 251 |
|
| 252 | + gttic(Allocate); |
244 | 253 | Allocate(scatter); |
| 254 | + gttoc(Allocate); |
| 255 | + |
| 256 | + // Only parallelize the inner loop if GTSAM_TBB_BOUNDED_MEMORY_GROWTH_FLAG is |
| 257 | + // defined. Without this flag, multiple HessianFactor constructions are |
| 258 | + // already running in parallel (e.g., when constructing multiple factors |
| 259 | + // concurrently), so there's no need to parallelize the inner |
| 260 | + // updateHessian loop here as well. |
| 261 | +#if defined(GTSAM_USE_TBB) && defined(GTSAM_TBB_BOUNDED_MEMORY_GROWTH_FLAG) |
| 262 | + constexpr DenseIndex kParallelThresholdHeuristic = 50; |
| 263 | + if (info_.rows() > kParallelThresholdHeuristic) { |
| 264 | + gttic(updateHessian_TBB); |
| 265 | + |
| 266 | + const DenseIndex M = info_.nBlocks(); |
| 267 | + |
| 268 | + auto numThreads = std::min( |
| 269 | + static_cast<int>(oneapi::tbb::global_control::active_value( |
| 270 | + oneapi::tbb::global_control::max_allowed_parallelism)), |
| 271 | + static_cast<int>(oneapi::tbb::this_task_arena::max_concurrency())); |
| 272 | + |
| 273 | + if (numThreads > 1) { |
| 274 | + DenseIndex grain = std::max<DenseIndex>(1, M / (2 * numThreads)); |
| 275 | + tbb::parallel_for(tbb::blocked_range<DenseIndex>(0, M, grain), |
| 276 | + [&, M](const tbb::blocked_range<DenseIndex>& range) { |
| 277 | + // reverse the range to start from the end because |
| 278 | + // matrix is upper triangular and therefore end is |
| 279 | + // larger than begin so we would like to start with |
| 280 | + // the last column that is the most work and go to the |
| 281 | + // first column that is least. |
| 282 | + DenseIndex beginCol = M - range.end(); |
| 283 | + DenseIndex endCol = M - range.begin(); |
| 284 | + info_.setZeroColumns(beginCol, endCol); |
| 285 | + for (const auto& factor : factors) { |
| 286 | + if (factor) { |
| 287 | + factor->updateHessian(keys_, &info_, beginCol, |
| 288 | + endCol); |
| 289 | + } |
| 290 | + } |
| 291 | + }); |
| 292 | + return; |
| 293 | + } |
| 294 | + } |
| 295 | +#endif |
| 296 | + gttic(setAllZero); |
| 297 | + info_.setAllZero(); |
| 298 | + gttoc(setAllZero); |
| 299 | + |
| 300 | + gttic(updateHessian); |
245 | 301 |
|
246 | 302 | // Form A' * A |
247 | | - gttic(update); |
248 | | - info_.setAllZero(); |
249 | | - for(const auto& factor: factors) |
250 | | - if (factor) |
| 303 | + for (const auto& factor : factors) { |
| 304 | + if (factor) { |
251 | 305 | factor->updateHessian(keys_, &info_); |
252 | | - gttoc(update); |
| 306 | + } |
| 307 | + } |
253 | 308 | } |
254 | 309 |
|
| 310 | + |
255 | 311 | /* ************************************************************************* */ |
256 | 312 | void HessianFactor::print(const std::string& s, |
257 | 313 | const KeyFormatter& formatter) const { |
@@ -352,14 +408,58 @@ void HessianFactor::updateHessian(const KeyVector& infoKeys, |
352 | 408 | gttic(updateHessian_HessianFactor); |
353 | 409 | const DenseIndex nrVariablesInThisFactor = size(); |
354 | 410 |
|
| 411 | + gttic(slots); |
355 | 412 | vector<DenseIndex> slots(nrVariablesInThisFactor + 1); |
356 | 413 | for (DenseIndex j = 0; j < nrVariablesInThisFactor; ++j) |
357 | 414 | slots[j] = Slot(infoKeys, keys_[j]); |
| 415 | + |
358 | 416 | slots[nrVariablesInThisFactor] = info->nBlocks() - 1; |
| 417 | + gttoc(slots); |
359 | 418 |
|
360 | 419 | info->updateFromMappedBlocks(info_, slots); |
361 | 420 | } |
362 | 421 |
|
| 422 | +/* ************************************************************************* */ |
| 423 | +void HessianFactor::updateHessian(const KeyVector& infoKeys, |
| 424 | + SymmetricBlockMatrix* info, |
| 425 | + DenseIndex beginCol, |
| 426 | + DenseIndex endCol) const { |
| 427 | + assert(info); |
| 428 | + const DenseIndex nrVariablesInThisFactor = size(); |
| 429 | + |
| 430 | + vector<DenseIndex> slots; |
| 431 | + slots.reserve(nrVariablesInThisFactor + 1); |
| 432 | + |
| 433 | + for (DenseIndex j = 0; j < nrVariablesInThisFactor; ++j) { |
| 434 | + slots.push_back(Slot(infoKeys, keys_[j])); |
| 435 | + } |
| 436 | + slots.push_back(info->nBlocks() - 1); |
| 437 | + |
| 438 | + for (DenseIndex j = 0; j <= nrVariablesInThisFactor; ++j) { |
| 439 | + const DenseIndex J = slots[j]; |
| 440 | + // Update diagonal block if J is in range |
| 441 | + if (J >= beginCol && J < endCol) { |
| 442 | + info->updateDiagonalBlock(J, info_.diagonalBlock(j)); |
| 443 | + } |
| 444 | + |
| 445 | + // Update off-diagonal blocks where column max(I, J) is in range |
| 446 | + // Note: We process all blocks and let the maxCol check filter them, |
| 447 | + // because I and J may be in different orders (slots are not necessarily sorted) |
| 448 | + for (DenseIndex i = 0; i < j; ++i) { |
| 449 | + const DenseIndex I = slots[i]; |
| 450 | + assert(i < j); |
| 451 | + assert(I != J); |
| 452 | + |
| 453 | + // The physical column index in the symmetric matrix is max(I, J) |
| 454 | + const DenseIndex maxCol = std::max(I, J); |
| 455 | + |
| 456 | + if (maxCol >= beginCol && maxCol < endCol) { |
| 457 | + info->updateOffDiagonalBlock(I, J, info_.aboveDiagonalBlock(i, j)); |
| 458 | + } |
| 459 | + } |
| 460 | + } |
| 461 | +} |
| 462 | + |
363 | 463 | /* ************************************************************************* */ |
364 | 464 | GaussianFactor::shared_ptr HessianFactor::negate() const { |
365 | 465 | shared_ptr result = std::make_shared<This>(*this); |
|
0 commit comments