diff --git a/doc/CovarianceRecovery.pdf b/doc/CovarianceRecovery.pdf new file mode 100644 index 0000000000..d61938f3a5 Binary files /dev/null and b/doc/CovarianceRecovery.pdf differ diff --git a/gtsam/inference/BayesTree-inst.h b/gtsam/inference/BayesTree-inst.h index be466e43df..dd37a64c95 100644 --- a/gtsam/inference/BayesTree-inst.h +++ b/gtsam/inference/BayesTree-inst.h @@ -364,6 +364,14 @@ namespace gtsam { return std::make_shared(*jointBayesNet(j1, j2, function)); } + /* ************************************************************************* */ + template + typename BayesTree::sharedFactorGraph BayesTree::joint( + const KeyVector& keys, const Eliminate& function) const { + gttic(BayesTree_joint); + return std::make_shared(*jointBayesNet(keys, function)); + } + /* ************************************************************************* */ // Find the lowest common ancestor of two cliques // TODO(Varun): consider implementing this as a Range Minimum Query @@ -387,6 +395,21 @@ namespace gtsam { return nullptr; // Return nullptr if no common ancestor is found } + /* ************************************************************************* */ + template + static std::shared_ptr findLowestCommonAncestor( + const std::vector>& cliques) { + if (cliques.empty()) { + return nullptr; + } + + std::shared_ptr lca = cliques.front(); + for (size_t i = 1; i < cliques.size() && lca; ++i) { + lca = findLowestCommonAncestor(lca, cliques[i]); + } + return lca; + } + /* ************************************************************************* */ // Given the clique P(F:S) and the ancestor clique B // Return the Bayes tree P(S\B | S \cap B), where \cap is intersection @@ -408,6 +431,157 @@ namespace gtsam { Ordering(S_setminus_B), eliminate); return bayesTree; } + + /* ************************************************************************* */ + template + static KeyVector uniqueKeys(const KeyVector& keys) { + KeyVector unique = keys; + std::sort(unique.begin(), unique.end()); 
+ unique.erase(std::unique(unique.begin(), unique.end()), unique.end()); + return unique; + } + + /* ************************************************************************* */ + template + static std::vector> uniqueCliquesFromKeys( + const BayesTree& tree, const KeyVector& keys) { + std::vector> queryCliques; + queryCliques.reserve(keys.size()); + std::unordered_set> seen; + + for (Key key : keys) { + auto clique = tree.clique(key); + if (seen.insert(clique).second) { + queryCliques.push_back(clique); + } + } + return queryCliques; + } + + /* ************************************************************************* */ + template + static std::shared_ptr rootClique( + const std::shared_ptr& clique) { + auto current = clique; + while (current && current->parent()) { + current = current->parent(); + } + return current; + } + + /* ************************************************************************* */ + template + static std::unordered_set> collectSupportCliques( + const std::vector>& queryCliques, + const std::shared_ptr& root) { + std::unordered_set> support; + if (!root) { + return support; + } + + support.insert(root); + for (const auto& clique : queryCliques) { + for (auto current = clique; current && current != root; + current = current->parent()) { + support.insert(current); + } + } + return support; + } + + /* ************************************************************************* */ + template + static std::unordered_map, size_t> + countSupportChildren( + const std::unordered_set>& support, + const std::shared_ptr& root) { + std::unordered_map, size_t> supportChildren; + for (const auto& clique : support) { + supportChildren[clique] = 0; + } + + for (const auto& clique : support) { + if (clique == root) { + continue; + } + auto parent = clique->parent(); + if (parent && support.count(parent)) { + ++supportChildren[parent]; + } + } + return supportChildren; + } + + /* ************************************************************************* */ + 
template + static std::unordered_set> collectEssentialCliques( + const std::vector>& queryCliques, + const std::unordered_set>& support, + const std::unordered_map, size_t>& supportChildren, + const std::shared_ptr& root) { + std::unordered_set> essential; + if (root) { + essential.insert(root); + } + + std::unordered_set> querySet(queryCliques.begin(), + queryCliques.end()); + for (const auto& clique : support) { + const auto childCount = supportChildren.find(clique); + const size_t numSupportChildren = + childCount == supportChildren.end() ? 0 : childCount->second; + if (querySet.count(clique) || numSupportChildren > 1) { + essential.insert(clique); + } + } + return essential; + } + + /* ************************************************************************* */ + template + static std::shared_ptr descendToNextEssentialClique( + const std::shared_ptr& child, + const std::unordered_set>& support, + const std::unordered_set>& essential) { + auto current = child; + while (current && !essential.count(current)) { + std::shared_ptr next; + for (const auto& grandChild : current->children) { + if (support.count(grandChild)) { + next = grandChild; + break; + } + } + current = next; + } + return current; + } + + /* ************************************************************************* */ + template + static void appendCompressedSupport( + const std::shared_ptr& ancestor, + const std::unordered_set>& support, + const std::unordered_set>& essential, + typename CLIQUE::FactorGraphType* factorGraph, + const typename CLIQUE::FactorGraphType::Eliminate& eliminate) { + for (const auto& child : ancestor->children) { + if (!support.count(child)) { + continue; + } + + auto nextEssential = + descendToNextEssentialClique(child, support, essential); + if (!nextEssential) { + continue; + } + + factorGraph->push_back(*factorInto(nextEssential, ancestor, eliminate)); + factorGraph->push_back(nextEssential->conditional()); + appendCompressedSupport(nextEssential, support, essential, 
factorGraph, + eliminate); + } + } /* ************************************************************************* */ template @@ -447,6 +621,58 @@ namespace gtsam { return p_BC1C2.marginalMultifrontalBayesNet(Ordering{j1, j2}, eliminate); } + /* ************************************************************************* */ + template + typename BayesTree::sharedBayesNet BayesTree::jointBayesNet( + const KeyVector& keys, const Eliminate& eliminate) const { + gttic(BayesTree_jointBayesNet); + + const KeyVector queryKeys = uniqueKeys(keys); + if (queryKeys.empty()) { + return std::make_shared(); + } + if (queryKeys.size() == 1) { + auto bayesNet = std::make_shared(); + bayesNet->push_back(marginalFactor(queryKeys.front(), eliminate)); + return bayesNet; + } + if (queryKeys.size() == 2) { + return jointBayesNet(queryKeys[0], queryKeys[1], eliminate); + } + + const auto queryCliques = uniqueCliquesFromKeys(*this, queryKeys); + std::unordered_map, KeyVector> keysByRoot; + for (Key key : queryKeys) { + keysByRoot[rootClique(this->clique(key))].push_back(key); + } + if (keysByRoot.size() > 1) { + FactorGraphType disjointJoint; + for (const auto& [rootClique, groupKeys] : keysByRoot) { + (void)rootClique; + disjointJoint.push_back(*jointBayesNet(groupKeys, eliminate)); + } + return disjointJoint.marginalMultifrontalBayesNet(Ordering(queryKeys), + eliminate); + } + + const auto root = findLowestCommonAncestor(queryCliques); + if (!root) { + return std::make_shared(); + } + + const auto support = collectSupportCliques(queryCliques, root); + const auto supportChildren = countSupportChildren(support, root); + const auto essential = + collectEssentialCliques(queryCliques, support, supportChildren, root); + + FactorGraphType reducedJoint; + reducedJoint.push_back(root->marginal2(eliminate)); + appendCompressedSupport(root, support, essential, &reducedJoint, eliminate); + + return reducedJoint.marginalMultifrontalBayesNet(Ordering(queryKeys), + eliminate); + } + /* 
************************************************************************* */ template void BayesTree::clear() { diff --git a/gtsam/inference/BayesTree.h b/gtsam/inference/BayesTree.h index 02c10bcd88..12ba27033b 100644 --- a/gtsam/inference/BayesTree.h +++ b/gtsam/inference/BayesTree.h @@ -180,12 +180,22 @@ namespace gtsam { */ sharedFactorGraph joint(Key j1, Key j2, const Eliminate& function = EliminationTraitsType::DefaultEliminate) const; + /** Return a joint factor graph on an arbitrary set of variables. */ + sharedFactorGraph joint( + const KeyVector& keys, + const Eliminate& function = EliminationTraitsType::DefaultEliminate) const; + /** * return joint on two variables as a BayesNet * Limitation: can only calculate joint if cliques are disjoint or one of them is root */ sharedBayesNet jointBayesNet(Key j1, Key j2, const Eliminate& function = EliminationTraitsType::DefaultEliminate) const; + /** Return a joint marginal on an arbitrary set of variables as a BayesNet. */ + sharedBayesNet jointBayesNet( + const KeyVector& keys, + const Eliminate& function = EliminationTraitsType::DefaultEliminate) const; + /// @} /// @name Graph Display /// @{ diff --git a/gtsam/linear/GaussianBayesTree.cpp b/gtsam/linear/GaussianBayesTree.cpp index b35252e729..c3dc340be0 100644 --- a/gtsam/linear/GaussianBayesTree.cpp +++ b/gtsam/linear/GaussianBayesTree.cpp @@ -117,10 +117,16 @@ namespace gtsam { } /* ************************************************************************* */ - Matrix GaussianBayesTree::marginalCovariance(Key key) const - { - return marginalFactor(key)->information().inverse(); + Matrix GaussianBayesTree::marginalCovariance(Key key) const { + const Matrix information = marginalFactor(key)->information(); + if (!information.allFinite()) { + return Matrix::Zero(information.rows(), information.cols()); + } + Eigen::LLT llt(information.selfadjointView()); + Matrix covariance = + Matrix::Identity(information.rows(), information.cols()); + 
llt.solveInPlace(covariance); + return covariance; } - } // \namespace gtsam diff --git a/gtsam/linear/linear.i b/gtsam/linear/linear.i index 1b7ce58a98..3b8afd9df0 100644 --- a/gtsam/linear/linear.i +++ b/gtsam/linear/linear.i @@ -719,7 +719,10 @@ virtual class GaussianBayesTree { gtsam::Matrix marginalCovariance(gtsam::Key key) const; gtsam::GaussianConditional* marginalFactor(gtsam::Key key) const; gtsam::GaussianFactorGraph* joint(gtsam::Key key1, gtsam::Key key2) const; + gtsam::GaussianFactorGraph* joint(const gtsam::KeyVector& queryKeys) const; gtsam::GaussianBayesNet* jointBayesNet(gtsam::Key key1, gtsam::Key key2) const; + gtsam::GaussianBayesNet* jointBayesNet(const gtsam::KeyVector& queryKeys) const; + void deleteCachedShortcuts(); }; #include diff --git a/gtsam/nonlinear/Marginals.cpp b/gtsam/nonlinear/Marginals.cpp index 15653ba8ac..6ba6fd559d 100644 --- a/gtsam/nonlinear/Marginals.cpp +++ b/gtsam/nonlinear/Marginals.cpp @@ -13,18 +13,181 @@ * @file Marginals.cpp * @brief * @author Richard Roberts + * @author Frank Dellaert * @date May 14, 2012 */ #include #include #include +#include #include +#include + using namespace std; namespace gtsam { +namespace { + +/* ************************************************************************* */ +// Return a sorted key list with duplicates removed. +KeyVector uniqueSortedKeys(const KeyVector& keys) { + KeyVector result = keys; + std::sort(result.begin(), result.end()); + result.erase(std::unique(result.begin(), result.end()), result.end()); + return result; +} + +/* ************************************************************************* */ +// Return keys in first-seen order with duplicates removed. 
+KeyVector uniqueStableKeys(const KeyVector& keys) { + KeyVector result; + result.reserve(keys.size()); + KeySet seen; + for (Key key : keys) { + if (seen.insert(key).second) { + result.push_back(key); + } + } + return result; +} + +/* ************************************************************************* */ +// Look up the block dimension of each requested variable. +std::vector dimsForKeys(const KeyVector& keys, const Values& values) { + std::vector dims; + dims.reserve(keys.size()); + for (Key key : keys) { + dims.push_back(values.at(key).dim()); + } + return dims; +} + +/* ************************************************************************* */ +// Convert per-variable dimensions into cumulative block offsets. +std::vector blockOffsets(const std::vector& dims) { + std::vector offsets(dims.size() + 1, 0); + for (size_t i = 0; i < dims.size(); ++i) { + offsets[i + 1] = offsets[i] + dims[i]; + } + return offsets; +} + +/* ************************************************************************* */ +// Recover a covariance matrix from a symmetric information matrix. +Matrix informationToCovariance(const Matrix& information) { + if (!information.allFinite()) { + return Matrix::Zero(information.rows(), information.cols()); + } + + Eigen::LLT llt(information.selfadjointView()); + Matrix covariance = Matrix::Identity(information.rows(), information.cols()); + llt.solveInPlace(covariance); + return covariance; +} + +// Build the reduced joint factor graph for the requested variable set. 
+GaussianFactorGraph reducedJointFactorGraph( + const GaussianBayesTree& bayesTree, Marginals::Factorization factorization, + const KeyVector& variables) { + if (factorization == Marginals::CHOLESKY) { + return *bayesTree.joint(variables, EliminatePreferCholesky); + } else if (factorization == Marginals::QR) { + return *bayesTree.joint(variables, EliminateQR); + } + throw std::runtime_error( + "Marginals::reducedJointFactorGraph: Unknown factorization"); +} + +/* ************************************************************************* */ +// Eliminate a reduced factor graph into a Bayes net ordered by the query keys. +GaussianBayesNet queryBayesNet(const GaussianFactorGraph& factorGraph, + Marginals::Factorization factorization, + const KeyVector& variables) { + if (factorization == Marginals::CHOLESKY) { + return *factorGraph.marginalMultifrontalBayesNet(Ordering(variables), + EliminatePreferCholesky); + } else if (factorization == Marginals::QR) { + return *factorGraph.marginalMultifrontalBayesNet(Ordering(variables), + EliminateQR); + } + throw std::runtime_error("Marginals::queryBayesNet: Unknown factorization"); +} + +/* ************************************************************************* */ +// Recover only the covariance columns associated with selected variable blocks. 
+Matrix covarianceColumns(const GaussianBayesNet& bayesNet, + const KeyVector& orderedKeys, + const std::vector& dims, + const std::vector& selectedBlocks) { + const auto [R, rhs] = bayesNet.matrix(Ordering(orderedKeys)); + (void)rhs; + + const std::vector offsets = blockOffsets(dims); + const size_t totalDim = offsets.back(); + + size_t selectedDim = 0; + for (size_t blockIndex : selectedBlocks) { + selectedDim += dims.at(blockIndex); + } + + Matrix selectors = Matrix::Zero(totalDim, selectedDim); + size_t selectedOffset = 0; + for (size_t blockIndex : selectedBlocks) { + const size_t begin = offsets[blockIndex]; + const size_t dim = dims[blockIndex]; + selectors.block(begin, selectedOffset, dim, dim).setIdentity(); + selectedOffset += dim; + } + + R.transpose().triangularView().solveInPlace(selectors); + R.triangularView().solveInPlace(selectors); + return selectors; +} + +/* ************************************************************************* */ +// Assemble a left-by-right block from packed selected covariance columns. 
+Matrix assembleCrossBlock(const Matrix& selectedColumns, + const KeyVector& orderedKeys, + const std::vector& dims, + const KeyVector& left, const KeyVector& right) { + FastMap keyIndex = Ordering(orderedKeys).invert(); + const std::vector offsets = blockOffsets(dims); + + size_t leftDim = 0; + for (Key key : left) { + leftDim += dims.at(keyIndex.at(key)); + } + size_t rightDim = 0; + for (Key key : right) { + rightDim += dims.at(keyIndex.at(key)); + } + + Matrix result(leftDim, rightDim); + size_t rowOffset = 0; + for (Key leftKey : left) { + const size_t leftBlock = keyIndex.at(leftKey); + const size_t leftSize = dims[leftBlock]; + const size_t leftBegin = offsets[leftBlock]; + size_t columnOffset = 0; + size_t selectedOffset = 0; + for (Key rightKey : right) { + const size_t rightBlock = keyIndex.at(rightKey); + const size_t rightSize = dims[rightBlock]; + result.block(rowOffset, columnOffset, leftSize, rightSize) = + selectedColumns.block(leftBegin, selectedOffset, leftSize, rightSize); + columnOffset += rightSize; + selectedOffset += rightSize; + } + rowOffset += leftSize; + } + return result; +} + +} // namespace + /* ************************************************************************* */ Marginals::Marginals(const NonlinearFactorGraph& graph, const Values& solution, Factorization factorization) : values_(solution), factorization_(factorization) { @@ -144,63 +307,66 @@ Matrix Marginals::marginalInformation(Key variable) const { /* ************************************************************************* */ Matrix Marginals::marginalCovariance(Key variable) const { Matrix info = marginalInformation(variable); - - // Deterministic constraints (e.g., NonlinearEquality) produce infinite - // information for the constrained variable. Treat those as zero covariance - // rather than attempting to invert an infinite matrix. 
- if (!info.allFinite()) - return Matrix::Zero(info.rows(), info.cols()); - - return info.inverse(); + return informationToCovariance(info); } /* ************************************************************************* */ -JointMarginal Marginals::jointMarginalCovariance(const KeyVector& variables) const { - JointMarginal info = jointMarginalInformation(variables); - info.blockMatrix_.invertInPlace(); - return info; +JointMarginal Marginals::jointMarginalCovariance( + const KeyVector& variables) const { + // Normalize the query key set so block order is deterministic. + const KeyVector variablesSorted = uniqueSortedKeys(variables); + if (variablesSorted.size() == 1) { + Matrix covariance = marginalCovariance(variablesSorted.front()); + return JointMarginal(covariance, {static_cast(covariance.rows())}, + variablesSorted); + } + + // Reduce the Bayes tree query to a Bayes net on just the requested variables. + const GaussianFactorGraph jointFG = + reducedJointFactorGraph(bayesTree_, factorization_, variablesSorted); + const GaussianBayesNet bayesNet = + queryBayesNet(jointFG, factorization_, variablesSorted); + const std::vector dims = dimsForKeys(variablesSorted, values_); + + // Recover every covariance block by selecting every column block. + std::vector allBlocks(variablesSorted.size()); + std::iota(allBlocks.begin(), allBlocks.end(), 0); + Matrix covariance = + covarianceColumns(bayesNet, variablesSorted, dims, allBlocks); + return JointMarginal(covariance, dims, variablesSorted); } /* ************************************************************************* */ -JointMarginal Marginals::jointMarginalInformation(const KeyVector& variables) const { +JointMarginal Marginals::jointMarginalInformation( + const KeyVector& variables) const { + // Normalize the query key set so the returned block layout is stable. 
+ const KeyVector variablesSorted = uniqueSortedKeys(variables); + if (variablesSorted.empty()) { + return JointMarginal(Matrix(), std::vector(), variablesSorted); + } - // If 2 variables, we can use the BayesTree::joint function, otherwise we - // have to use sequential elimination. - if(variables.size() == 1) - { - Matrix info = marginalInformation(variables.front()); + // A single-variable query can reuse the existing marginal-factor path. + if (variablesSorted.size() == 1) { + Matrix info = marginalInformation(variablesSorted.front()); std::vector dims; dims.push_back(info.rows()); - return JointMarginal(info, dims, variables); - } - else - { - // Compute joint marginal factor graph. - GaussianFactorGraph jointFG; - if(variables.size() == 2) { - if(factorization_ == CHOLESKY) - jointFG = *bayesTree_.joint(variables[0], variables[1], EliminatePreferCholesky); - else if(factorization_ == QR) - jointFG = *bayesTree_.joint(variables[0], variables[1], EliminateQR); - } else { - if(factorization_ == CHOLESKY) - jointFG = GaussianFactorGraph(*graph_.marginalMultifrontalBayesTree(variables, EliminatePreferCholesky)); - else if(factorization_ == QR) - jointFG = GaussianFactorGraph(*graph_.marginalMultifrontalBayesTree(variables, EliminateQR)); - } - - // Get information matrix - Matrix augmentedInfo = jointFG.augmentedHessian(); - Matrix info = augmentedInfo.topLeftCorner(augmentedInfo.rows()-1, augmentedInfo.cols()-1); - - // Information matrix will be returned with sorted keys - KeyVector variablesSorted = variables; - std::sort(variablesSorted.begin(), variablesSorted.end()); - - // Get dimensions from factor graph + return JointMarginal(info, dims, variablesSorted); + } else { + // Reduce the Bayes tree query and then materialize the reduced Hessian. 
+ GaussianFactorGraph jointFG = + reducedJointFactorGraph(bayesTree_, factorization_, variablesSorted); + GaussianBayesNet bayesNet = + queryBayesNet(jointFG, factorization_, variablesSorted); + + // Get information matrix on the reduced query system. + const auto [R, rhs] = bayesNet.matrix(Ordering(variablesSorted)); + (void)rhs; + Matrix info = R.transpose() * R; + + // Record block dimensions in the same key order used for the Hessian. std::vector dims; dims.reserve(variablesSorted.size()); - for(const auto& key: variablesSorted) { + for (const auto& key : variablesSorted) { dims.push_back(values_.at(key).dim()); } @@ -208,6 +374,46 @@ JointMarginal Marginals::jointMarginalInformation(const KeyVector& variables) co } } +/* ************************************************************************* */ +Matrix Marginals::crossCovariance(const KeyVector& left, + const KeyVector& right) const { + // Preserve the requested left/right block order while removing duplicates. + const KeyVector leftUnique = uniqueStableKeys(left); + const KeyVector rightUnique = uniqueStableKeys(right); + KeyVector unionKeys = leftUnique; + unionKeys.insert(unionKeys.end(), rightUnique.begin(), rightUnique.end()); + unionKeys = uniqueSortedKeys(unionKeys); + + if (unionKeys.empty()) { + return Matrix(); + } + + if (unionKeys.size() == 1 && leftUnique == rightUnique) { + return marginalCovariance(unionKeys.front()); + } + + // Reduce to the union query once, then recover only the requested RHS blocks. + const GaussianFactorGraph jointFG = + reducedJointFactorGraph(bayesTree_, factorization_, unionKeys); + const GaussianBayesNet bayesNet = + queryBayesNet(jointFG, factorization_, unionKeys); + const std::vector dims = dimsForKeys(unionKeys, values_); + FastMap keyIndex = Ordering(unionKeys).invert(); + + // Map each requested right-hand variable to its block in the reduced system. 
+ std::vector rightBlocks; + rightBlocks.reserve(rightUnique.size()); + for (Key key : rightUnique) { + rightBlocks.push_back(keyIndex.at(key)); + } + + // Recover the selected columns and pack them back into the requested block layout. + const Matrix selectedColumns = + covarianceColumns(bayesNet, unionKeys, dims, rightBlocks); + return assembleCrossBlock(selectedColumns, unionKeys, dims, leftUnique, + rightUnique); +} + /* ************************************************************************* */ VectorValues Marginals::optimize() const { return bayesTree_.optimize(); diff --git a/gtsam/nonlinear/Marginals.h b/gtsam/nonlinear/Marginals.h index 6e2b316912..e57d76c18a 100644 --- a/gtsam/nonlinear/Marginals.h +++ b/gtsam/nonlinear/Marginals.h @@ -130,6 +130,9 @@ class GTSAM_EXPORT Marginals { /** Compute the joint marginal information of several variables */ JointMarginal jointMarginalInformation(const KeyVector& variables) const; + /** Compute the cross-covariance block between two variable sets. 
*/ + Matrix crossCovariance(const KeyVector& left, const KeyVector& right) const; + /** Delete cached Bayes tree shortcuts created while computing marginals */ void deleteCachedShortcuts(); diff --git a/gtsam/nonlinear/nonlinear.i b/gtsam/nonlinear/nonlinear.i index aaa73b4d28..31d7af12e6 100644 --- a/gtsam/nonlinear/nonlinear.i +++ b/gtsam/nonlinear/nonlinear.i @@ -170,6 +170,8 @@ class Marginals { const gtsam::KeyVector& variables) const; gtsam::JointMarginal jointMarginalInformation( const gtsam::KeyVector& variables) const; + gtsam::Matrix crossCovariance(const gtsam::KeyVector& left, + const gtsam::KeyVector& right) const; void deleteCachedShortcuts(); }; diff --git a/python/gtsam/tests/test_GaussianBayesTree.py b/python/gtsam/tests/test_GaussianBayesTree.py new file mode 100644 index 0000000000..b330ff7789 --- /dev/null +++ b/python/gtsam/tests/test_GaussianBayesTree.py @@ -0,0 +1,105 @@ +""" +GTSAM Copyright 2010-2026, Georgia Tech Research Corporation, +Atlanta, Georgia 30332-0415 +All Rights Reserved + +See LICENSE for the license information. + +Unit tests for wrapped GaussianBayesTree methods added for covariance recovery. 
+Author: Codex, prompted by Frank Dellaert +""" + +# pylint: disable=invalid-name, no-name-in-module, no-member + +import unittest + +import numpy as np + +import gtsam +from gtsam.symbol_shorthand import X +from gtsam.utils.test_case import GtsamTestCase + + +def create_linear_chain(): + """Create a small anchored linear chain and its key tuple.""" + graph = gtsam.GaussianFactorGraph() + + x0 = X(0) + x1 = X(1) + x2 = X(2) + x3 = X(3) + + prior_noise = gtsam.noiseModel.Diagonal.Sigmas(np.ones(1)) + between_noise = gtsam.noiseModel.Diagonal.Sigmas(np.ones(1)) + + graph.add(x0, np.eye(1), np.zeros(1), prior_noise) + graph.add(x1, np.eye(1), x0, -np.eye(1), np.ones(1), between_noise) + graph.add(x2, np.eye(1), x1, -np.eye(1), 2 * np.ones(1), between_noise) + graph.add(x3, np.eye(1), x2, -np.eye(1), 3 * np.ones(1), between_noise) + + return graph, (x0, x1, x2, x3) + + +def subset_vector_values(values, keys): + """Extract a VectorValues subset in the requested key order.""" + result = gtsam.VectorValues() + for key in keys: + result.insert(key, values.at(key)) + return result + + +class TestGaussianBayesTree(GtsamTestCase): + """Tests for GaussianBayesTree wrapper extensions.""" + + def test_joint_key_vector(self): + """joint(KeyVector) should preserve the queried joint covariance.""" + graph, keys = create_linear_chain() + bayes_tree = graph.eliminateMultifrontal() + solution = graph.optimize() + query_keys = [keys[0], keys[2], keys[3]] + + expected = ( + gtsam.Marginals(graph, solution) + .jointMarginalCovariance(query_keys) + .fullMatrix() + ) + + joint_graph = bayes_tree.joint(query_keys) + query_solution = subset_vector_values(solution, query_keys) + actual = ( + gtsam.Marginals(joint_graph, query_solution) + .jointMarginalCovariance(query_keys) + .fullMatrix() + ) + + np.testing.assert_allclose(actual, expected, atol=1e-9) + + def test_joint_bayes_net_key_vector(self): + """jointBayesNet(KeyVector) should optimize to the queried solution.""" + graph, keys = 
create_linear_chain() + bayes_tree = graph.eliminateMultifrontal() + solution = graph.optimize() + query_keys = [keys[0], keys[2], keys[3]] + + joint_bayes_net = bayes_tree.jointBayesNet(query_keys) + actual = joint_bayes_net.optimize() + + for key in query_keys: + np.testing.assert_allclose(actual.at(key), solution.at(key), atol=1e-9) + + def test_delete_cached_shortcuts(self): + """deleteCachedShortcuts should clear caches created by query shortcuts.""" + graph, keys = create_linear_chain() + bayes_tree = graph.eliminateMultifrontal() + query_keys = [keys[0], keys[2], keys[3]] + + self.assertEqual(bayes_tree.numCachedSeparatorMarginals(), 0) + bayes_tree.jointBayesNet(query_keys) + self.assertGreater(bayes_tree.numCachedSeparatorMarginals(), 0) + + bayes_tree.deleteCachedShortcuts() + self.assertEqual(bayes_tree.numCachedSeparatorMarginals(), 0) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/gtsam/tests/test_Marginals.py b/python/gtsam/tests/test_Marginals.py new file mode 100644 index 0000000000..824bfdea16 --- /dev/null +++ b/python/gtsam/tests/test_Marginals.py @@ -0,0 +1,97 @@ +""" +GTSAM Copyright 2010-2026, Georgia Tech Research Corporation, +Atlanta, Georgia 30332-0415 +All Rights Reserved + +See LICENSE for the license information. + +Unit tests for wrapped Marginals methods added for covariance recovery. 
+Author: Codex, prompted by Frank Dellaert +""" + +# pylint: disable=invalid-name, no-name-in-module, no-member + +import unittest + +import numpy as np + +import gtsam +from gtsam.symbol_shorthand import L, X +from gtsam.utils.test_case import GtsamTestCase + + +def create_planar_slam_problem(): + """Create a small nonlinear SLAM problem with pose and landmark variables.""" + graph = gtsam.NonlinearFactorGraph() + + x1 = X(1) + x2 = X(2) + x3 = X(3) + l1 = L(1) + l2 = L(2) + + prior_noise = gtsam.noiseModel.Diagonal.Sigmas(np.array([0.3, 0.3, 0.1])) + odometry_noise = gtsam.noiseModel.Diagonal.Sigmas(np.array([0.2, 0.2, 0.1])) + measurement_noise = gtsam.noiseModel.Diagonal.Sigmas(np.array([0.1, 0.2])) + + graph.addPriorPose2(x1, gtsam.Pose2(0.0, 0.0, 0.0), prior_noise) + graph.add( + gtsam.BetweenFactorPose2(x1, x2, gtsam.Pose2(2.0, 0.0, 0.0), odometry_noise) + ) + graph.add( + gtsam.BetweenFactorPose2(x2, x3, gtsam.Pose2(2.0, 0.0, 0.0), odometry_noise) + ) + + graph.add( + gtsam.BearingRangeFactor2D( + x1, l1, gtsam.Rot2.fromDegrees(45), np.sqrt(8.0), measurement_noise + ) + ) + graph.add( + gtsam.BearingRangeFactor2D( + x2, l1, gtsam.Rot2.fromDegrees(90), 2.0, measurement_noise + ) + ) + graph.add( + gtsam.BearingRangeFactor2D( + x3, l2, gtsam.Rot2.fromDegrees(90), 2.0, measurement_noise + ) + ) + + values = gtsam.Values() + values.insert(x1, gtsam.Pose2(0.0, 0.0, 0.0)) + values.insert(x2, gtsam.Pose2(2.0, 0.0, 0.0)) + values.insert(x3, gtsam.Pose2(4.0, 0.0, 0.0)) + values.insert(l1, gtsam.Point2(2.0, 2.0)) + values.insert(l2, gtsam.Point2(4.0, 2.0)) + + return graph, values, (x1, x2, x3, l1, l2) + + +class TestMarginals(GtsamTestCase): + """Tests for Marginals wrapper extensions.""" + + def test_cross_covariance(self): + """crossCovariance should match blocks from the full joint covariance.""" + graph, values, (x1, _, x3, l1, l2) = create_planar_slam_problem() + marginals = gtsam.Marginals(graph, values) + + left = [l2, x3] + right = [l1, x1] + union_keys = 
[x1, l1, l2, x3] + + actual = marginals.crossCovariance(left, right) + joint = marginals.jointMarginalCovariance(union_keys) + + expected = np.vstack( + [ + np.hstack([joint.at(l2, l1), joint.at(l2, x1)]), + np.hstack([joint.at(x3, l1), joint.at(x3, x1)]), + ] + ) + + np.testing.assert_allclose(actual, expected, atol=1e-8) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/testGaussianBayesTreeB.cpp b/tests/testGaussianBayesTreeB.cpp index bb1180903d..83920762a7 100644 --- a/tests/testGaussianBayesTreeB.cpp +++ b/tests/testGaussianBayesTreeB.cpp @@ -259,16 +259,14 @@ TEST( GaussianBayesTree, balanced_smoother_joint ) GaussianBayesNet actual3 = *bayesTree.jointBayesNet(X(1),X(4)); EXPECT(assert_equal(expected3,actual3,tol)); - // // Check the joint density P(x4,x1), i.e. with a root variable, factored the other way - // GaussianBayesNet expected4; - // GaussianConditional::shared_ptr - // parent4(new GaussianConditional(X(1), Z_2x1, -1.0*I/sigmax1, ones(2))); - // expected4.push_front(parent4); - // double sig41 = 0.668096; - // Matrix A41 = -0.055794*I; - // push_front(expected4,X(4), Z_2x1, I/sig41, X(1), A41/sig41, sigma); - // GaussianBayesNet actual4 = *bayesTree.jointBayesNet(X(4),X(1)); - // EXPECT(assert_equal(expected4,actual4,tol)); + // Check the joint density P(x4,x1), i.e. 
when the first queried clique is the ancestor + double sig41 = 0.668096; + Matrix A41 = -0.055794*I; + GaussianBayesNet expected4; + expected4.emplace_shared(X(4), Z_2x1, I/sig41, X(1), A41/sig41); + expected4.emplace_shared(X(1), Z_2x1, -1.0*I/sigmax1); + GaussianBayesNet actual4 = *bayesTree.jointBayesNet(X(4),X(1)); + EXPECT(assert_equal(expected4, actual4, tol)); } /* ************************************************************************* */ @@ -319,6 +317,63 @@ TEST(GaussianBayesTree, shortcut_overlapping_separator) EXPECT(assert_equal(expectedJointJ, actualJointJ)); } +/* ************************************************************************* */ +TEST(GaussianBayesTree, multiKeyJointMatchesPairwise) { + GaussianFactorGraph fg; + noiseModel::Diagonal::shared_ptr model = noiseModel::Unit::Create(1); + fg.add(1, (Matrix(1, 1) << 1.0).finished(), 3, + (Matrix(1, 1) << 2.0).finished(), 5, + (Matrix(1, 1) << 3.0).finished(), (Vector(1) << 4.0).finished(), model); + fg.add(1, (Matrix(1, 1) << 5.0).finished(), (Vector(1) << 6.0).finished(), model); + fg.add(2, (Matrix(1, 1) << 7.0).finished(), 4, + (Matrix(1, 1) << 8.0).finished(), 5, + (Matrix(1, 1) << 9.0).finished(), (Vector(1) << 10.0).finished(), model); + fg.add(2, (Matrix(1, 1) << 11.0).finished(), (Vector(1) << 12.0).finished(), model); + fg.add(5, (Matrix(1, 1) << 13.0).finished(), 6, + (Matrix(1, 1) << 14.0).finished(), (Vector(1) << 15.0).finished(), model); + fg.add(6, (Matrix(1, 1) << 17.0).finished(), 7, + (Matrix(1, 1) << 18.0).finished(), (Vector(1) << 19.0).finished(), model); + fg.add(7, (Matrix(1, 1) << 20.0).finished(), (Vector(1) << 21.0).finished(), model); + + Ordering ordering(fg.keys()); + GaussianBayesTree bt = *fg.eliminateMultifrontal(ordering); + + const GaussianFactorGraph pairwiseJoint = *bt.joint(1, 2, EliminateQR); + const GaussianFactorGraph multiKeyJoint = *bt.joint(KeyVector{2, 1}, EliminateQR); + EXPECT(assert_equal(pairwiseJoint.augmentedHessian(), + 
multiKeyJoint.augmentedHessian(), 1e-9)); +} + +/* ************************************************************************* */ +TEST(GaussianBayesTree, multiKeyJointMatchesLegacyMarginalization) { + GaussianFactorGraph fg; + noiseModel::Diagonal::shared_ptr model = noiseModel::Unit::Create(1); + fg.add(1, (Matrix(1, 1) << 1.0).finished(), 3, + (Matrix(1, 1) << 2.0).finished(), 5, + (Matrix(1, 1) << 3.0).finished(), (Vector(1) << 4.0).finished(), model); + fg.add(1, (Matrix(1, 1) << 5.0).finished(), (Vector(1) << 6.0).finished(), model); + fg.add(2, (Matrix(1, 1) << 7.0).finished(), 4, + (Matrix(1, 1) << 8.0).finished(), 5, + (Matrix(1, 1) << 9.0).finished(), (Vector(1) << 10.0).finished(), model); + fg.add(2, (Matrix(1, 1) << 11.0).finished(), (Vector(1) << 12.0).finished(), model); + fg.add(5, (Matrix(1, 1) << 13.0).finished(), 6, + (Matrix(1, 1) << 14.0).finished(), (Vector(1) << 15.0).finished(), model); + fg.add(6, (Matrix(1, 1) << 17.0).finished(), 7, + (Matrix(1, 1) << 18.0).finished(), (Vector(1) << 19.0).finished(), model); + fg.add(7, (Matrix(1, 1) << 20.0).finished(), (Vector(1) << 21.0).finished(), model); + + Ordering ordering(fg.keys()); + GaussianBayesTree bt = *fg.eliminateMultifrontal(ordering); + + const KeyVector query{7, 2, 1}; + const GaussianFactorGraph actualJoint = *bt.joint(query, EliminateQR); + const GaussianFactorGraph legacyJoint = + GaussianFactorGraph(*fg.marginalMultifrontalBayesTree(KeyVector{1, 2, 7}, + EliminateQR)); + EXPECT(assert_equal(legacyJoint.augmentedHessian(), + actualJoint.augmentedHessian(), 1e-9)); +} + /* ************************************************************************* */ int main() { TestResult tr; return TestRegistry::runAllTests(tr);} /* ************************************************************************* */ diff --git a/tests/testMarginals.cpp b/tests/testMarginals.cpp index 344a1f56c5..e439602ff3 100644 --- a/tests/testMarginals.cpp +++ b/tests/testMarginals.cpp @@ -27,6 +27,7 @@ // for modeling 
measurement uncertainty - all models included here #include +#include // add in headers for specific factors #include @@ -37,6 +38,202 @@ using namespace std; using namespace gtsam; +namespace { + +/* ************************************************************************* */ +KeyVector uniqueSortedKeys(const KeyVector& keys) { + KeyVector result = keys; + std::sort(result.begin(), result.end()); + result.erase(std::unique(result.begin(), result.end()), result.end()); + return result; +} + +/* ************************************************************************* */ +KeyVector uniqueStableKeys(const KeyVector& keys) { + KeyVector result; + KeySet seen; + for (Key key : keys) { + if (seen.insert(key).second) { + result.push_back(key); + } + } + return result; +} + +/* ************************************************************************* */ +std::vector dimsForKeys(const KeyVector& keys, const Values& values) { + std::vector dims; + dims.reserve(keys.size()); + for (Key key : keys) { + dims.push_back(values.at(key).dim()); + } + return dims; +} + +/* ************************************************************************* */ +std::vector blockOffsets(const std::vector& dims) { + std::vector offsets(dims.size() + 1, 0); + for (size_t i = 0; i < dims.size(); ++i) { + offsets[i + 1] = offsets[i] + dims[i]; + } + return offsets; +} + +/* ************************************************************************* */ +GaussianFactorGraph legacyJointFactorGraph( + const GaussianFactorGraph& graph, const Ordering& ordering, + const KeyVector& variables, Marginals::Factorization factorization) { + const KeyVector sortedVariables = uniqueSortedKeys(variables); + if (factorization == Marginals::CHOLESKY) { + GaussianBayesTree bayesTree = + *graph.eliminateMultifrontal(ordering, EliminatePreferCholesky); + if (sortedVariables.size() == 2) { + return *bayesTree.joint(sortedVariables[0], sortedVariables[1], + EliminatePreferCholesky); + } + return 
GaussianFactorGraph(*graph.marginalMultifrontalBayesTree( + sortedVariables, EliminatePreferCholesky)); + } else { + GaussianBayesTree bayesTree = + *graph.eliminateMultifrontal(ordering, EliminateQR); + if (sortedVariables.size() == 2) { + return *bayesTree.joint(sortedVariables[0], sortedVariables[1], + EliminateQR); + } + return GaussianFactorGraph( + *graph.marginalMultifrontalBayesTree(sortedVariables, EliminateQR)); + } +} + +/* ************************************************************************* */ +Matrix legacyJointCovariance(const GaussianFactorGraph& graph, + const Ordering& ordering, + const KeyVector& variables, + Marginals::Factorization factorization) { + const KeyVector sortedVariables = uniqueSortedKeys(variables); + if (sortedVariables.size() == 1) { + GaussianBayesTree bayesTree = + factorization == Marginals::CHOLESKY + ? *graph.eliminateMultifrontal(ordering, EliminatePreferCholesky) + : *graph.eliminateMultifrontal(ordering, EliminateQR); + return bayesTree.marginalFactor(sortedVariables.front()) + ->information() + .inverse(); + } + + const GaussianFactorGraph jointFG = + legacyJointFactorGraph(graph, ordering, sortedVariables, factorization); + const Matrix augmentedInfo = jointFG.augmentedHessian(); + const Matrix information = augmentedInfo.topLeftCorner( + augmentedInfo.rows() - 1, augmentedInfo.cols() - 1); + return information.inverse(); +} + +/* ************************************************************************* */ +Matrix extractCrossBlock(const Matrix& fullCovariance, + const KeyVector& orderedKeys, const Values& values, + const KeyVector& left, const KeyVector& right) { + const std::vector dims = dimsForKeys(orderedKeys, values); + const std::vector offsets = blockOffsets(dims); + const FastMap keyIndex = Ordering(orderedKeys).invert(); + + size_t leftDim = 0; + for (Key key : left) { + leftDim += dims.at(keyIndex.at(key)); + } + size_t rightDim = 0; + for (Key key : right) { + rightDim += dims.at(keyIndex.at(key)); + } + 
+ Matrix result(leftDim, rightDim); + size_t rowOffset = 0; + for (Key leftKey : left) { + const size_t leftBlock = keyIndex.at(leftKey); + const size_t leftSize = dims[leftBlock]; + const size_t leftBegin = offsets[leftBlock]; + size_t columnOffset = 0; + for (Key rightKey : right) { + const size_t rightBlock = keyIndex.at(rightKey); + const size_t rightSize = dims[rightBlock]; + const size_t rightBegin = offsets[rightBlock]; + result.block(rowOffset, columnOffset, leftSize, rightSize) = + fullCovariance.block(leftBegin, rightBegin, leftSize, rightSize); + columnOffset += rightSize; + } + rowOffset += leftSize; + } + return result; +} + +/* ************************************************************************* */ +Matrix packedSelectedColumns(const Matrix& fullCovariance, + const KeyVector& orderedKeys, const Values& values, + const KeyVector& selectedKeys) { + const std::vector dims = dimsForKeys(orderedKeys, values); + const std::vector offsets = blockOffsets(dims); + const FastMap keyIndex = Ordering(orderedKeys).invert(); + + size_t selectedDim = 0; + for (Key key : selectedKeys) { + selectedDim += dims.at(keyIndex.at(key)); + } + + Matrix result(fullCovariance.rows(), selectedDim); + size_t columnOffset = 0; + for (Key key : selectedKeys) { + const size_t block = keyIndex.at(key); + const size_t dim = dims.at(block); + const size_t begin = offsets.at(block); + result.block(0, columnOffset, fullCovariance.rows(), dim) = + fullCovariance.block(0, begin, fullCovariance.rows(), dim); + columnOffset += dim; + } + return result; +} + +/* ************************************************************************* */ +Matrix extractPackedCrossBlock(const Matrix& selectedColumns, + const KeyVector& orderedKeys, + const Values& values, const KeyVector& left, + const KeyVector& right) { + const std::vector dims = dimsForKeys(orderedKeys, values); + const std::vector offsets = blockOffsets(dims); + const FastMap keyIndex = Ordering(orderedKeys).invert(); + + size_t 
leftDim = 0; + for (Key key : left) { + leftDim += dims.at(keyIndex.at(key)); + } + size_t rightDim = 0; + for (Key key : right) { + rightDim += dims.at(keyIndex.at(key)); + } + + Matrix result(leftDim, rightDim); + size_t rowOffset = 0; + for (Key leftKey : left) { + const size_t leftBlock = keyIndex.at(leftKey); + const size_t leftSize = dims[leftBlock]; + const size_t leftBegin = offsets[leftBlock]; + size_t columnOffset = 0; + size_t selectedOffset = 0; + for (Key rightKey : right) { + const size_t rightBlock = keyIndex.at(rightKey); + const size_t rightSize = dims[rightBlock]; + (void)rightBlock; + result.block(rowOffset, columnOffset, leftSize, rightSize) = + selectedColumns.block(leftBegin, selectedOffset, leftSize, rightSize); + columnOffset += rightSize; + selectedOffset += rightSize; + } + rowOffset += leftSize; + } + return result; +} + +} // namespace + TEST(Marginals, planarSLAMmarginals) { // Taken from PlanarSLAMSelfContained_advanced @@ -189,6 +386,25 @@ TEST(Marginals, planarSLAMmarginals) { marginals = Marginals(gfg, soln_lin, Marginals::QR); testMarginals(marginals); testJointMarginals(marginals); + + const Ordering ordering = Ordering::Colamd(gfg); + for (const auto factorization : {Marginals::CHOLESKY, Marginals::QR}) { + Marginals orderedMarginals(gfg, soln_lin, ordering, factorization); + const KeyVector unorderedVariables{x3, l2, x1}; + const Matrix actual = + orderedMarginals.jointMarginalCovariance(unorderedVariables) + .fullMatrix(); + const Matrix expected = + legacyJointCovariance(gfg, ordering, unorderedVariables, factorization); + EXPECT(assert_equal(expected, actual, 1e-8)); + + const Matrix expectedCross = + extractCrossBlock(expected, uniqueSortedKeys(unorderedVariables), soln, + KeyVector{l2, x3}, KeyVector{x1}); + const Matrix actualCross = + orderedMarginals.crossCovariance(KeyVector{l2, x3}, KeyVector{x1}); + EXPECT(assert_equal(expectedCross, actualCross, 1e-8)); + } } /* 
************************************************************************* */ @@ -258,6 +474,142 @@ TEST(Marginals, order) { testMarginals(marginals, set); } +/* ************************************************************************* */ +TEST(Marginals, crossCovarianceOrderAndLegacyAgreement) { + NonlinearFactorGraph graph; + graph.addPrior(0, Pose2(), noiseModel::Unit::Create(3)); + graph.emplace_shared>(0, 1, Pose2(1, 0, 0), + noiseModel::Unit::Create(3)); + graph.emplace_shared>(1, 2, Pose2(1, 0, 0), + noiseModel::Unit::Create(3)); + graph.emplace_shared>(2, 3, Pose2(1, 0, 0), + noiseModel::Unit::Create(3)); + + Values values; + values.insert(0, Pose2()); + values.insert(1, Pose2(1, 0, 0)); + values.insert(2, Pose2(2, 0, 0)); + values.insert(3, Pose2(3, 0, 0)); + values.insert(100, Point2(0, 1)); + values.insert(101, Point2(1, 1)); + + graph.emplace_shared>( + 0, 100, values.at(0).bearing(values.at(100)), + values.at(0).range(values.at(100)), + noiseModel::Unit::Create(2)); + graph.emplace_shared>( + 2, 101, values.at(2).bearing(values.at(101)), + values.at(2).range(values.at(101)), + noiseModel::Unit::Create(2)); + + GaussianFactorGraph linearGraph = *graph.linearize(values); + const Ordering ordering = Ordering::Colamd(linearGraph); + const KeyVector left{101, 1}; + const KeyVector right{100, 3}; + const KeyVector unionKeys{101, 1, 100, 3}; + + for (const auto factorization : {Marginals::CHOLESKY, Marginals::QR}) { + Marginals marginals(linearGraph, values, ordering, factorization); + const Matrix actual = marginals.crossCovariance(left, right); + + const Matrix legacyFull = + legacyJointCovariance(linearGraph, ordering, unionKeys, factorization); + const Matrix expected = + extractCrossBlock(legacyFull, uniqueSortedKeys(unionKeys), values, + uniqueStableKeys(left), uniqueStableKeys(right)); + EXPECT(assert_equal(expected, actual, 1e-8)); + } +} + +/* ************************************************************************* */ +TEST(Marginals, 
packedCrossBlockRegression) { + NonlinearFactorGraph graph; + graph.addPrior(0, Pose2(), noiseModel::Unit::Create(3)); + graph.emplace_shared>(0, 1, Pose2(1, 0, 0), + noiseModel::Unit::Create(3)); + graph.emplace_shared>(1, 2, Pose2(1, 0, 0), + noiseModel::Unit::Create(3)); + graph.emplace_shared>(2, 3, Pose2(1, 0, 0), + noiseModel::Unit::Create(3)); + + Values values; + values.insert(0, Pose2()); + values.insert(1, Pose2(1, 0, 0)); + values.insert(2, Pose2(2, 0, 0)); + values.insert(3, Pose2(3, 0, 0)); + values.insert(100, Point2(0, 1)); + values.insert(101, Point2(1, 1)); + + graph.emplace_shared>( + 0, 100, values.at(0).bearing(values.at(100)), + values.at(0).range(values.at(100)), + noiseModel::Unit::Create(2)); + graph.emplace_shared>( + 2, 101, values.at(2).bearing(values.at(101)), + values.at(2).range(values.at(101)), + noiseModel::Unit::Create(2)); + + GaussianFactorGraph linearGraph = *graph.linearize(values); + const Ordering ordering = Ordering::Colamd(linearGraph); + const KeyVector unionKeys{101, 1, 100, 3}; + const KeyVector orderedKeys = uniqueSortedKeys(unionKeys); + const KeyVector left = uniqueStableKeys(KeyVector{101, 1}); + const KeyVector right = uniqueStableKeys(KeyVector{100, 3}); + + const Matrix fullCovariance = legacyJointCovariance( + linearGraph, ordering, unionKeys, Marginals::CHOLESKY); + const Matrix expected = + extractCrossBlock(fullCovariance, orderedKeys, values, left, right); + const Matrix packedColumns = + packedSelectedColumns(fullCovariance, orderedKeys, values, right); + const Matrix actual = + extractPackedCrossBlock(packedColumns, orderedKeys, values, left, right); + + EXPECT(assert_equal(expected, actual, 1e-8)); + const Matrix incorrect = + extractPackedCrossBlock(fullCovariance, orderedKeys, values, left, right); + EXPECT((incorrect - expected).norm() > 1e-3); +} + +#ifdef GTSAM_SUPPORT_NESTED_DISSECTION +/* ************************************************************************* */ +TEST(Marginals, 
orderingEquivalence) { + NonlinearFactorGraph graph; + graph.addPrior(0, Pose2(), noiseModel::Unit::Create(3)); + graph.emplace_shared>(0, 1, Pose2(1, 0, 0), + noiseModel::Unit::Create(3)); + graph.emplace_shared>(1, 2, Pose2(1, 0, 0), + noiseModel::Unit::Create(3)); + graph.emplace_shared>(2, 3, Pose2(1, 0, 0), + noiseModel::Unit::Create(3)); + graph.emplace_shared>(3, 4, Pose2(1, 0, 0), + noiseModel::Unit::Create(3)); + + Values values; + values.insert(0, Pose2()); + values.insert(1, Pose2(1, 0, 0)); + values.insert(2, Pose2(2, 0, 0)); + values.insert(3, Pose2(3, 0, 0)); + values.insert(4, Pose2(4, 0, 0)); + + GaussianFactorGraph linearGraph = *graph.linearize(values); + const Ordering colamd = Ordering::Create(Ordering::COLAMD, linearGraph); + const Ordering metis = Ordering::Create(Ordering::METIS, linearGraph); + const KeyVector query{4, 1, 3}; + + for (const auto factorization : {Marginals::CHOLESKY, Marginals::QR}) { + Marginals colamdMarginals(linearGraph, values, colamd, factorization); + Marginals metisMarginals(linearGraph, values, metis, factorization); + EXPECT(assert_equal( + colamdMarginals.jointMarginalCovariance(query).fullMatrix(), + metisMarginals.jointMarginalCovariance(query).fullMatrix(), 1e-8)); + EXPECT(assert_equal( + colamdMarginals.crossCovariance(KeyVector{4}, KeyVector{1, 3}), + metisMarginals.crossCovariance(KeyVector{4}, KeyVector{1, 3}), 1e-8)); + } +} +#endif + /* ************************************************************************* */ int main() { TestResult tr; return TestRegistry::runAllTests(tr);} /* ************************************************************************* */ diff --git a/timing/README.md b/timing/README.md new file mode 100644 index 0000000000..def0006d2c --- /dev/null +++ b/timing/README.md @@ -0,0 +1,102 @@ +# Timing Benchmarks + +This directory contains timing executables and helper scripts for GTSAM. 
+ +## Bayes-Tree Covariance Results + +The Bayes-tree covariance paper uses generated benchmark output rather than +checked-in CSV files. The files under `timing/results/` can be regenerated from +the commands below and do not need to be committed. + +### Build + +From the build directory: + +```bash +make -j6 timeBayesTreeCovariance +make -j6 exportBayesTreeCovarianceVisuals +``` + +### Generate benchmark CSVs + +Run from `build/`: + +```bash +./timing/timeBayesTreeCovariance \ + --datasets w100.graph,w10000.graph,w20000.txt \ + --repeats 10 \ + --output-dir ../timing/results/bayes_tree_covariance +``` + +This writes: + +- `timing/results/bayes_tree_covariance/raw.csv` +- `timing/results/bayes_tree_covariance/per_query.csv` +- `timing/results/bayes_tree_covariance/summary.csv` + +The generated CSVs now include the small-query families used to benchmark the +legacy common cases: + +- `single_pose` for `Q = 1` +- `pair_pose` for `Q = 2` + +These are timed with the same benchmark executable and appear alongside the +larger `local_window`, `wide_separated`, `overlap_window`, and +`selected_cross` workloads. + +### Export `w100` visual data + +Run from `build/`: + +```bash +./timing/exportBayesTreeCovarianceVisuals \ + --dataset w100.graph \ + --output-dir ../timing/results/bayes_tree_covariance/visuals +``` + +This writes the CSV files used for the `w100` query and covariance figures. 
+ +### Generate figures + +Run from the repository root: + +```bash +python3 timing/plot_bayes_tree_covariance.py \ + --input timing/results/bayes_tree_covariance/summary.csv \ + --output-dir ../BayesTreeCovariance/figures/generated \ + --copy-csv-dir ../BayesTreeCovariance/data \ + --visual-data-dir timing/results/bayes_tree_covariance/visuals +``` + +This generates: + +- `results-smallq.pdf` +- `results-ablation.pdf` +- `results-ordering.pdf` +- `results-structure.pdf` +- `results-cross.pdf` +- `results-w100-queries.pdf` +- `results-w100-covariance.pdf` + +## Notes + +- The benchmark timings measure covariance-query work after optimization and + linearization. +- Each distinct query is run once as an untimed warmup and then `--repeats` + times as measured repetitions. +- `raw.csv` stores one row per measured repetition. +- `per_query.csv` stores one row per distinct query, aggregating repeated + timings by per-query means. +- `summary.csv` stores one row per query family bucket, aggregating the + per-query means and structural statistics by medians across the sampled + queries. +- The `results-smallq.pdf` figure summarizes the `Q = 1` and `Q = 2` query + families; the larger performance figures focus on `Q > 2`, where Steiner + localization changes the asymptotic behavior. +- The benchmark compares four variants: + - `legacy_dense` + - `steiner_dense` + - `legacy_solve` + - `steiner_solve` +- If the generated results become stale, it is safe to delete + `timing/results/bayes_tree_covariance/` and regenerate it. 
diff --git a/timing/exportBayesTreeCovarianceVisuals.cpp b/timing/exportBayesTreeCovarianceVisuals.cpp new file mode 100644 index 0000000000..483b21c021 --- /dev/null +++ b/timing/exportBayesTreeCovarianceVisuals.cpp @@ -0,0 +1,193 @@ +/* ---------------------------------------------------------------------------- + + * GTSAM Copyright 2010, Georgia Tech Research Corporation, + * Atlanta, Georgia 30332-0415 + * All Rights Reserved + * Authors: Frank Dellaert, et al. (see THANKS for the full author list) + + * See LICENSE for the license information + * -------------------------------------------------------------------------- */ + +/** + * @file exportBayesTreeCovarianceVisuals.cpp + * @brief Export representative pose-graph query and covariance visuals. + * @date March 2026 + * @author Frank Dellaert + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace gtsam; +using namespace std; + +namespace { + +/// Return all Pose2 keys in sorted order. +KeyVector poseKeys(const Values& values) { + KeyVector keys; + for (const auto& keyValue : values.extract()) { + keys.push_back(keyValue.first); + } + sort(keys.begin(), keys.end()); + return keys; +} + +/// Select a contiguous window near the center of the trajectory. +KeyVector centeredWindow(const KeyVector& keys, size_t querySize) { + if (keys.size() <= querySize) { + return keys; + } + const size_t start = (keys.size() - querySize) / 2; + return KeyVector(keys.begin() + start, keys.begin() + start + querySize); +} + +/// Select approximately evenly spaced keys across the full trajectory. +KeyVector evenlySpaced(const KeyVector& keys, size_t querySize) { + if (keys.size() <= querySize) { + return keys; + } + + KeyVector query; + query.reserve(querySize); + for (size_t i = 0; i < querySize; ++i) { + const double alpha = + querySize == 1 ? 
0.0 : double(i) / double(querySize - 1); + const size_t index = + static_cast(llround(alpha * double(keys.size() - 1))); + query.push_back(keys.at(index)); + } + query.erase(unique(query.begin(), query.end()), query.end()); + return query; +} + +/// Write a one-column key list CSV. +void writeKeyListCsv(const filesystem::path& path, const KeyVector& keys) { + ofstream stream(path); + stream << "key\n"; + for (Key key : keys) { + stream << key << '\n'; + } +} + +/// Write a dense matrix to CSV with fixed precision. +void writeMatrixCsv(const filesystem::path& path, const Matrix& matrix) { + ofstream stream(path); + stream << fixed << setprecision(12); + for (Eigen::Index row = 0; row < matrix.rows(); ++row) { + for (Eigen::Index column = 0; column < matrix.cols(); ++column) { + if (column) { + stream << ','; + } + stream << matrix(row, column); + } + stream << '\n'; + } +} + +/// Export optimized Pose2 values together with query membership flags. +void writePoseCsv(const filesystem::path& path, const Values& values, + const KeyVector& localQuery, const KeyVector& wideQuery) { + const set localKeys(localQuery.begin(), localQuery.end()); + const set wideKeys(wideQuery.begin(), wideQuery.end()); + + ofstream stream(path); + stream << "key,x,y,theta,in_local,in_wide\n"; + stream << fixed << setprecision(9); + for (Key key : poseKeys(values)) { + const Pose2 pose = values.at(key); + stream << key << ',' << pose.x() << ',' << pose.y() << ',' << pose.theta() + << ',' << (localKeys.count(key) ? 1 : 0) << ',' + << (wideKeys.count(key) ? 1 : 0) << '\n'; + } +} + +/// Export the unique Pose2 measurement edges in the factor graph. 
+void writeEdgeCsv(const filesystem::path& path, + const NonlinearFactorGraph& graph) { + set> edges; + for (const auto& factor : graph) { + auto between = dynamic_pointer_cast>(factor); + if (!between) { + continue; + } + const Key key1 = between->key1(); + const Key key2 = between->key2(); + const auto edge = minmax(key1, key2); + edges.insert(edge); + } + + ofstream stream(path); + stream << "key1,key2\n"; + for (const auto& [key1, key2] : edges) { + stream << key1 << ',' << key2 << '\n'; + } +} + +/// Read a string argument from argv or return a default value. +string argumentOrDefault(char** begin, char** end, const string& flag, + const string& defaultValue) { + for (auto it = begin; it != end; ++it) { + if (string(*it) == flag && it + 1 != end) { + return *(it + 1); + } + } + return defaultValue; +} + +} // namespace + +int main(int argc, char** argv) { + const string datasetName = + argumentOrDefault(argv, argv + argc, "--dataset", "w100.graph"); + const filesystem::path outputDir = + argumentOrDefault(argv, argv + argc, "--output-dir", + (filesystem::path("timing") / "results" / + "bayes_tree_covariance" / "visuals") + .string()); + filesystem::create_directories(outputDir); + + const auto [graphPtr, initialPtr] = load2D(findExampleDataFile(datasetName)); + const KeyVector initialPoseKeys = poseKeys(*initialPtr); + if (!initialPoseKeys.empty()) { + graphPtr->addPrior(initialPoseKeys.front(), + initialPtr->at(initialPoseKeys.front()), + noiseModel::Diagonal::Sigmas( + (Vector(3) << 1e-6, 1e-6, 1e-6).finished())); + } + + LevenbergMarquardtOptimizer optimizer(*graphPtr, *initialPtr); + const Values result = optimizer.optimize(); + const KeyVector queryCandidates = poseKeys(result); + const KeyVector localQuery = centeredWindow(queryCandidates, 10); + const KeyVector wideQuery = evenlySpaced(queryCandidates, 10); + + const Marginals marginals(*graphPtr, result, Marginals::CHOLESKY); + const Matrix localCovariance = + 
marginals.jointMarginalCovariance(localQuery).fullMatrix(); + const Matrix wideCovariance = + marginals.jointMarginalCovariance(wideQuery).fullMatrix(); + + writePoseCsv(outputDir / "w100_poses.csv", result, localQuery, wideQuery); + writeEdgeCsv(outputDir / "w100_edges.csv", *graphPtr); + writeKeyListCsv(outputDir / "w100_local_keys.csv", localQuery); + writeKeyListCsv(outputDir / "w100_wide_keys.csv", wideQuery); + writeMatrixCsv(outputDir / "w100_local_covariance.csv", localCovariance); + writeMatrixCsv(outputDir / "w100_wide_covariance.csv", wideCovariance); + + return 0; +} diff --git a/timing/plot_bayes_tree_covariance.py b/timing/plot_bayes_tree_covariance.py new file mode 100644 index 0000000000..5d824607b3 --- /dev/null +++ b/timing/plot_bayes_tree_covariance.py @@ -0,0 +1,801 @@ +#!/usr/bin/env python3 +""" +GTSAM Copyright 2010, Georgia Tech Research Corporation, +Atlanta, Georgia 30332-0415 +All Rights Reserved + +See LICENSE for the license information + +Generate paper-ready plots for the Bayes-tree covariance benchmarks. 
+Author: Codex 5.4, prompted by Frank Dellaert +""" + +import argparse +import csv +import math +import shutil +from collections import defaultdict +from pathlib import Path + +import matplotlib.pyplot as plt +import numpy as np +from matplotlib.patches import Ellipse + + +def load_rows(path): + """Load a CSV file into a list of row dictionaries.""" + with open(path, newline="") as handle: + return list(csv.DictReader(handle)) + + +def to_float(rows, fields): + """Convert numeric CSV fields to float or integer types in place.""" + for row in rows: + for field in fields: + row[field] = float(row[field]) + row["query_size"] = int(row["query_size"]) + row["queries"] = int(row["queries"]) + row["repeats"] = int(row["repeats"]) + row["support_cliques"] = float(row["support_cliques"]) + row["compressed_cliques"] = float(row["compressed_cliques"]) + row["reduced_state_dim"] = float(row["reduced_state_dim"]) + return rows + + +def baseline_speedups(rows): + """Annotate each row with its speedup relative to the legacy dense baseline.""" + baselines = {} + for row in rows: + if row["variant"] == "legacy_dense": + baselines[ + ( + row["dataset"], + row["ordering"], + row["query_family"], + row["mode"], + row["query_size"], + ) + ] = row["median_total_ms"] + + for row in rows: + key = ( + row["dataset"], + row["ordering"], + row["query_family"], + row["mode"], + row["query_size"], + ) + baseline = baselines.get(key, row["median_total_ms"]) + row["speedup"] = ( + baseline / row["median_total_ms"] if row["median_total_ms"] else 1.0 + ) + return rows + + +def filter_rows(rows, **criteria): + """Return rows whose named fields match the provided values.""" + result = [] + for row in rows: + if all(row[key] == value for key, value in criteria.items()): + result.append(row) + return result + + +def grouped(rows, *keys): + """Group rows by a tuple of field values.""" + groups = defaultdict(list) + for row in rows: + groups[tuple(row[key] for key in keys)].append(row) + return groups + 
+ +def dataset_label(dataset): + """Strip dataset file extensions for presentation.""" + return dataset.removesuffix(".graph").removesuffix(".txt") + + +def plot_ablation(rows, output_path): + """Plot the four-way ablation for local-window joint queries.""" + joint_rows = [ + row + for row in rows + if row["mode"] == "joint" + and row["query_family"] == "local_window" + and row["dataset"] in {"w10000.graph", "w20000.txt"} + ] + variants = ["legacy_dense", "steiner_dense", "legacy_solve", "steiner_solve"] + variant_styles = { + "legacy_dense": { + "color": "#228b22", + "linestyle": "--", + "marker": "s", + "linewidth": 2.0, + }, + "steiner_dense": { + "color": "#ff8c00", + "linestyle": "-", + "marker": "o", + "linewidth": 2.0, + }, + "legacy_solve": { + "color": "#1f77b4", + "linestyle": "-.", + "marker": "^", + "linewidth": 2.0, + }, + "steiner_solve": { + "color": "#b22222", + "linestyle": "-", + "marker": "D", + "linewidth": 2.2, + }, + } + figure, axes = plt.subplots(2, 2, figsize=(12, 8), sharex=True, sharey=True) + for ax, (dataset, ordering) in zip( + axes.flat, + [ + ("w10000.graph", "COLAMD"), + ("w10000.graph", "METIS"), + ("w20000.txt", "COLAMD"), + ("w20000.txt", "METIS"), + ], + ): + subset = filter_rows(joint_rows, dataset=dataset, ordering=ordering) + for variant in variants: + series = sorted( + filter_rows(subset, variant=variant), key=lambda row: row["query_size"] + ) + if not series: + continue + style = variant_styles[variant] + ax.plot( + [row["query_size"] for row in series], + [row["speedup"] for row in series], + label=variant.replace("_", " "), + markersize=6, + zorder=3 if variant == "legacy_dense" else 4, + **style, + ) + ax.set_title(f"{dataset_label(dataset)} / {ordering}") + ax.set_xlabel("Query size") + ax.set_ylabel("Speedup vs legacy_dense") + ax.set_yscale("log") + ax.grid(True, alpha=0.3) + handles, labels = axes[0, 0].get_legend_handles_labels() + figure.legend(handles, labels, loc="upper center", ncol=4, frameon=False) + 
figure.tight_layout(rect=(0, 0, 1, 0.94)) + figure.savefig(output_path) + plt.close(figure) + + +def plot_small_queries(rows, output_path): + """Plot timing for the legacy single-pose and pair-pose query families.""" + subset = [ + row + for row in rows + if row["mode"] == "joint" + and row["query_family"] in {"single_pose", "pair_pose"} + and row["dataset"] in {"w10000.graph", "w20000.txt"} + ] + families = [("single_pose", "Q = 1"), ("pair_pose", "Q = 2")] + groups = [ + ("w10000.graph", "COLAMD"), + ("w10000.graph", "METIS"), + ("w20000.txt", "COLAMD"), + ("w20000.txt", "METIS"), + ] + variants = ["legacy_dense", "steiner_dense", "legacy_solve", "steiner_solve"] + colors = { + "legacy_dense": "#228b22", + "steiner_dense": "#ff8c00", + "legacy_solve": "#1f77b4", + "steiner_solve": "#b22222", + } + figure, axes = plt.subplots(1, 2, figsize=(12, 4), sharey=True) + width = 0.18 + x_positions = np.arange(len(groups)) + + for ax, (family, title) in zip(axes, families): + family_rows = [row for row in subset if row["query_family"] == family] + for index, variant in enumerate(variants): + heights = [] + for dataset, ordering in groups: + matches = [ + row["median_total_ms"] + for row in family_rows + if row["dataset"] == dataset + and row["ordering"] == ordering + and row["variant"] == variant + ] + heights.append(matches[0] if matches else math.nan) + ax.bar( + x_positions + (index - 1.5) * width, + heights, + width=width, + color=colors[variant], + label=variant.replace("_", " "), + ) + ax.set_title(title) + ax.set_xticks( + x_positions, + [ + "w10000\nCOLAMD", + "w10000\nMETIS", + "w20000\nCOLAMD", + "w20000\nMETIS", + ], + ) + ax.set_ylabel("Median per-query mean time (ms)") + ax.set_yscale("log") + ax.grid(True, axis="y", alpha=0.3) + handles, labels = axes[0].get_legend_handles_labels() + figure.legend(handles, labels, loc="upper center", ncol=4, frameon=False) + figure.tight_layout(rect=(0, 0, 1, 0.88)) + figure.savefig(output_path) + plt.close(figure) + + +def 
plot_ordering(rows, output_path): + """Compare ordering sensitivity for the dense baseline and localized method.""" + target_rows = [ + row + for row in rows + if row["mode"] == "joint" + and row["variant"] in {"legacy_dense", "steiner_solve"} + and row["dataset"] in {"w10000.graph", "w20000.txt"} + and row["query_family"] == "local_window" + ] + variant_styles = { + ("legacy_dense", "COLAMD"): { + "color": "#228b22", + "linestyle": "--", + "marker": "s", + "linewidth": 2.0, + }, + ("legacy_dense", "METIS"): { + "color": "#66a61e", + "linestyle": ":", + "marker": "s", + "linewidth": 2.0, + }, + ("steiner_solve", "COLAMD"): { + "color": "#b22222", + "linestyle": "-", + "marker": "D", + "linewidth": 2.2, + }, + ("steiner_solve", "METIS"): { + "color": "#ff8c00", + "linestyle": "-.", + "marker": "o", + "linewidth": 2.2, + }, + } + figure, axes = plt.subplots(1, 2, figsize=(11, 4), sharey=True) + for ax, dataset in zip(axes, ["w10000.graph", "w20000.txt"]): + subset = [row for row in target_rows if row["dataset"] == dataset] + query_sizes = sorted({row["query_size"] for row in subset}) + for variant, ordering in [ + ("legacy_dense", "COLAMD"), + ("legacy_dense", "METIS"), + ("steiner_solve", "COLAMD"), + ("steiner_solve", "METIS"), + ]: + series = sorted( + [ + row + for row in subset + if row["variant"] == variant and row["ordering"] == ordering + ], + key=lambda row: row["query_size"], + ) + if not series: + continue + style = variant_styles[(variant, ordering)] + ax.plot( + [row["query_size"] for row in series], + [row["median_total_ms"] for row in series], + label=f"{ordering} / {variant.replace('_', ' ')}", + markersize=6, + **style, + ) + ax.set_title(dataset_label(dataset)) + ax.set_xlabel("Local-window query size") + ax.set_xticks(query_sizes, [str(size) for size in query_sizes]) + ax.set_ylabel("Median per-query mean time (ms)") + ax.set_yscale("log") + ax.grid(True, axis="y", alpha=0.3) + handles, labels = axes[0].get_legend_handles_labels() + 
figure.legend(handles, labels, loc="upper center", ncol=2, frameon=False) + figure.tight_layout(rect=(0, 0, 1, 0.9)) + figure.savefig(output_path) + plt.close(figure) + + +def plot_structure(rows, output_path): + """Plot support size and reduced state statistics against query size.""" + subset = [ + row + for row in rows + if row["mode"] == "joint" + and row["variant"] == "steiner_solve" + and row["query_family"] == "local_window" + and row["dataset"] in {"w10000.graph", "w20000.txt"} + and row["ordering"] == "COLAMD" + ] + figure, axes = plt.subplots(1, 2, figsize=(11, 4), sharey=False) + for ax, dataset in zip(axes, ["w10000.graph", "w20000.txt"]): + series = sorted( + [row for row in subset if row["dataset"] == dataset], + key=lambda row: row["query_size"], + ) + x = [row["query_size"] for row in series] + ax.plot( + x, + [row["support_cliques"] for row in series], + marker="o", + label="support cliques", + ) + ax.plot( + x, + [row["compressed_cliques"] for row in series], + marker="s", + label="compressed cliques", + ) + ax.plot( + x, + [row["reduced_state_dim"] for row in series], + marker="^", + label="reduced-state dimension", + ) + ax.set_title(dataset_label(dataset)) + ax.set_xlabel("Query size") + ax.set_ylabel("Median count / dimension") + ax.grid(True, alpha=0.3) + handles, labels = axes[0].get_legend_handles_labels() + figure.legend(handles, labels, loc="upper center", ncol=3, frameon=False) + figure.tight_layout(rect=(0, 0, 1, 0.9)) + figure.savefig(output_path) + plt.close(figure) + + +def plot_selected_cross(rows, output_path): + """Plot selected cross-covariance speedups for all benchmark variants.""" + subset = [ + row + for row in rows + if row["mode"] == "cross" + and row["query_family"] == "selected_cross" + and row["dataset"] in {"w10000.graph", "w20000.txt"} + ] + variants = ["legacy_dense", "steiner_dense", "legacy_solve", "steiner_solve"] + variant_styles = { + "legacy_dense": { + "color": "#228b22", + "linestyle": "--", + "marker": "s", + 
"linewidth": 2.0, + }, + "steiner_dense": { + "color": "#ff8c00", + "linestyle": "-", + "marker": "o", + "linewidth": 2.0, + }, + "legacy_solve": { + "color": "#1f77b4", + "linestyle": "-.", + "marker": "^", + "linewidth": 2.0, + }, + "steiner_solve": { + "color": "#b22222", + "linestyle": "-", + "marker": "D", + "linewidth": 2.2, + }, + } + figure, axes = plt.subplots(1, 2, figsize=(11, 4), sharey=True) + for ax, dataset in zip(axes, ["w10000.graph", "w20000.txt"]): + for variant in variants: + series = sorted( + [ + row + for row in subset + if row["dataset"] == dataset + and row["ordering"] == "COLAMD" + and row["variant"] == variant + ], + key=lambda row: row["query_size"], + ) + if not series: + continue + style = variant_styles[variant] + ax.plot( + [row["query_size"] for row in series], + [row["speedup"] for row in series], + label=variant.replace("_", " "), + markersize=6, + zorder=3 if variant == "legacy_dense" else 4, + **style, + ) + ax.set_title(f"{dataset_label(dataset)} / COLAMD") + ax.set_xlabel("Query size") + ax.set_ylabel("Speedup vs legacy_dense") + ax.set_yscale("log") + ax.grid(True, alpha=0.3) + handles, labels = axes[0].get_legend_handles_labels() + figure.legend(handles, labels, loc="upper center", ncol=4, frameon=False) + figure.tight_layout(rect=(0, 0, 1, 0.88)) + figure.savefig(output_path) + plt.close(figure) + + +def load_key_list(path): + """Load a single-column key CSV.""" + with open(path, newline="") as handle: + reader = csv.DictReader(handle) + return [int(row["key"]) for row in reader] + + +def load_pose_rows(path): + """Load pose metadata used by the visual summary figures.""" + with open(path, newline="") as handle: + reader = csv.DictReader(handle) + rows = list(reader) + for row in rows: + row["key"] = int(row["key"]) + row["x"] = float(row["x"]) + row["y"] = float(row["y"]) + row["theta"] = float(row["theta"]) + row["in_local"] = int(row["in_local"]) + row["in_wide"] = int(row["in_wide"]) + return rows + + +def 
load_edge_rows(path): + """Load pose-graph edges used by the visual summary figures.""" + with open(path, newline="") as handle: + reader = csv.DictReader(handle) + rows = list(reader) + for row in rows: + row["key1"] = int(row["key1"]) + row["key2"] = int(row["key2"]) + return rows + + +def plot_w100_queries(data_dir, output_path): + """Render representative local and wide-separated query selections on w100.""" + pose_rows = load_pose_rows(data_dir / "w100_poses.csv") + edge_rows = load_edge_rows(data_dir / "w100_edges.csv") + local_keys = load_key_list(data_dir / "w100_local_keys.csv") + wide_keys = load_key_list(data_dir / "w100_wide_keys.csv") + pose_by_key = {row["key"]: row for row in pose_rows} + + figure, axes = plt.subplots(1, 2, figsize=(11.5, 4.8), sharex=True, sharey=True) + query_specs = [ + ("Local 10-pose query", local_keys, "#b22222"), + ("Wide-separated 10-pose query", wide_keys, "#1f5aa6"), + ] + + for ax, (title, query_keys, color) in zip(axes, query_specs): + for edge in edge_rows: + pose1 = pose_by_key[edge["key1"]] + pose2 = pose_by_key[edge["key2"]] + ax.plot( + [pose1["x"], pose2["x"]], + [pose1["y"], pose2["y"]], + color="#c8c8c8", + linewidth=0.7, + alpha=0.7, + zorder=1, + ) + + ax.scatter( + [row["x"] for row in pose_rows], + [row["y"] for row in pose_rows], + s=10, + color="#666666", + alpha=0.65, + zorder=2, + ) + + query_points = [pose_by_key[key] for key in query_keys] + ax.plot( + [row["x"] for row in query_points], + [row["y"] for row in query_points], + color=color, + linewidth=2.2, + alpha=0.95, + zorder=3, + ) + ax.scatter( + [row["x"] for row in query_points], + [row["y"] for row in query_points], + s=36, + color=color, + edgecolor="white", + linewidth=0.6, + zorder=4, + ) + + ax.scatter( + [query_points[0]["x"]], + [query_points[0]["y"]], + s=70, + marker="s", + color=color, + edgecolor="white", + linewidth=0.8, + zorder=5, + ) + ax.scatter( + [query_points[-1]["x"]], + [query_points[-1]["y"]], + s=70, + marker="D", + 
color=color, + edgecolor="white", + linewidth=0.8, + zorder=5, + ) + + for key in query_keys[:: max(1, len(query_keys) // 5)]: + row = pose_by_key[key] + ax.text( + row["x"] + 0.12, + row["y"] + 0.12, + str(key), + fontsize=8, + color=color, + zorder=6, + ) + + ax.set_title(title) + ax.set_aspect("equal", adjustable="box") + ax.grid(True, alpha=0.18) + ax.set_xlabel("x") + axes[0].set_ylabel("y") + figure.tight_layout() + figure.savefig(output_path) + plt.close(figure) + + +def pose_block(covariance, keys, key): + """Extract the 3x3 Pose2 marginal block for one key.""" + block_index = keys.index(key) + start = 3 * block_index + return covariance[start : start + 3, start : start + 3] + + +def representative_pose_key(keys, covariance): + """Select the queried pose with the largest translational uncertainty.""" + scored = [] + for key in keys: + marginal = pose_block(covariance, keys, key) + translation_area = np.linalg.det(marginal[:2, :2]) + scored.append((translation_area, marginal[2, 2], key)) + scored.sort(reverse=True) + return scored[0][2] + + +def sample_tangent_gaussian(rng, covariance, size): + """Sample a Gaussian in Pose2 tangent coordinates.""" + symmetric_covariance = 0.5 * (covariance + covariance.T) + eigenvalues, eigenvectors = np.linalg.eigh(symmetric_covariance) + clamped = np.clip(eigenvalues, 0.0, None) + sqrt_covariance = eigenvectors @ np.diag(np.sqrt(clamped)) + standard = rng.standard_normal((size, covariance.shape[0])) + return np.einsum("nj,ij->ni", standard, sqrt_covariance) + + +def exp_se2(tangent): + """Evaluate the SE(2) exponential map for one tangent vector.""" + vx, vy, omega = tangent + if abs(omega) < 1e-9: + return np.array([vx, vy]), omega + + sin_omega = math.sin(omega) + cos_omega = math.cos(omega) + a_term = sin_omega / omega + b_term = (1.0 - cos_omega) / omega + v_matrix = np.array([[a_term, -b_term], [b_term, a_term]]) + return v_matrix @ np.array([vx, vy]), omega + + +def retract_pose(mean_pose, tangent): + """Push a 
tangent perturbation forward from a mean Pose2 state.""" + translation, omega = exp_se2(tangent) + mean_theta = mean_pose["theta"] + rotation = np.array( + [ + [math.cos(mean_theta), -math.sin(mean_theta)], + [math.sin(mean_theta), math.cos(mean_theta)], + ] + ) + point = np.array([mean_pose["x"], mean_pose["y"]]) + rotation @ translation + return point[0], point[1], mean_theta + omega + + +def wrap_degrees(angle_radians): + """Wrap an angle in radians to the range [-180, 180) degrees.""" + return ((np.rad2deg(angle_radians) + 180.0) % 360.0) - 180.0 + + +def draw_heading_arrow(ax, pose_row, length=1.0): + """Draw a heading arrow for a mean pose.""" + dx = length * math.cos(pose_row["theta"]) + dy = length * math.sin(pose_row["theta"]) + ax.arrow( + pose_row["x"], + pose_row["y"], + dx, + dy, + width=0.03, + head_width=0.22, + head_length=0.28, + color="black", + length_includes_head=True, + zorder=5, + ) + + +def draw_world_sample_ellipse(ax, positions): + """Draw the empirical 2-sigma ellipse of sampled world-frame positions.""" + covariance_xy = np.cov(positions[:, :2].T) + center = positions[:, :2].mean(axis=0) + eigenvalues, eigenvectors = np.linalg.eigh(covariance_xy) + order = np.argsort(eigenvalues)[::-1] + eigenvalues = eigenvalues[order] + eigenvectors = eigenvectors[:, order] + angle = np.degrees(np.arctan2(eigenvectors[1, 0], eigenvectors[0, 0])) + width, height = 4.0 * np.sqrt(np.maximum(eigenvalues, 0.0)) + ellipse = Ellipse( + xy=(center[0], center[1]), + width=width, + height=height, + angle=angle, + edgecolor="black", + facecolor="none", + linestyle="--", + linewidth=1.2, + alpha=0.9, + zorder=4, + ) + ax.add_patch(ellipse) + + +def plot_pose2_marginals(data_dir, output_path): + """Render representative Pose2 marginals as pushed-forward sample clouds.""" + local_keys = load_key_list(data_dir / "w100_local_keys.csv") + wide_keys = load_key_list(data_dir / "w100_wide_keys.csv") + pose_rows = {row["key"]: row for row in load_pose_rows(data_dir / 
"w100_poses.csv")} + local_covariance = np.loadtxt(data_dir / "w100_local_covariance.csv", delimiter=",") + wide_covariance = np.loadtxt(data_dir / "w100_wide_covariance.csv", delimiter=",") + local_key = representative_pose_key(local_keys, local_covariance) + wide_key = representative_pose_key(wide_keys, wide_covariance) + rng = np.random.default_rng(7) + + figure, axes = plt.subplots( + 1, + 3, + figsize=(12.0, 4.8), + gridspec_kw={"width_ratios": [1.0, 1.0, 0.05]}, + ) + heatmap_axes = axes[:2] + colorbar_axis = axes[2] + specifications = [ + ( + "Representative local Pose2 marginal", + local_key, + local_keys, + local_covariance, + ), + ( + "Representative wide-separated Pose2 marginal", + wide_key, + wide_keys, + wide_covariance, + ), + ] + + for ax, (title, key, keys, covariance) in zip(heatmap_axes, specifications): + marginal = pose_block(covariance, keys, key) + mean_pose = pose_rows[key] + samples_tangent = sample_tangent_gaussian(rng, marginal, size=4000) + positions = np.array( + [retract_pose(mean_pose, tangent) for tangent in samples_tangent] + ) + headings = wrap_degrees(positions[:, 2]) + heatmap = ax.scatter( + positions[:, 0], + positions[:, 1], + c=headings, + s=6, + alpha=0.32, + cmap="turbo", + vmin=-180.0, + vmax=180.0, + linewidths=0.0, + zorder=2, + ) + draw_world_sample_ellipse(ax, positions) + ax.scatter( + [mean_pose["x"]], + [mean_pose["y"]], + color="black", + marker="x", + s=65, + linewidths=2.0, + zorder=6, + ) + draw_heading_arrow(ax, mean_pose) + ax.text( + mean_pose["x"] + 0.35, + mean_pose["y"] + 0.35, + f"key {key}", + fontsize=9, + color="black", + zorder=7, + ) + ax.set_title(title) + ax.set_aspect("equal", adjustable="box") + ax.grid(True, alpha=0.18) + ax.set_xlabel("x") + ax.set_ylabel("y") + colorbar = figure.colorbar(heatmap, cax=colorbar_axis) + colorbar.set_label("heading (deg)") + figure.subplots_adjust(left=0.08, right=0.94, bottom=0.2, top=0.88, wspace=0.32) + figure.savefig(output_path) + plt.close(figure) + + +def 
main(): + """Parse arguments and generate all requested benchmark figures.""" + parser = argparse.ArgumentParser() + parser.add_argument("--input", required=True) + parser.add_argument("--output-dir", required=True) + parser.add_argument("--copy-csv-dir", default="") + parser.add_argument("--visual-data-dir", default="") + args = parser.parse_args() + + input_path = Path(args.input) + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + rows = baseline_speedups( + to_float( + load_rows(input_path), + [ + "median_total_ms", + "sum_query_mean_total_ms", + "median_reduction_ms", + "median_extraction_ms", + "support_cliques", + "compressed_cliques", + "reduced_state_dim", + ], + ) + ) + + plot_ablation(rows, output_dir / "results-ablation.pdf") + plot_small_queries(rows, output_dir / "results-smallq.pdf") + plot_ordering(rows, output_dir / "results-ordering.pdf") + plot_structure(rows, output_dir / "results-structure.pdf") + plot_selected_cross(rows, output_dir / "results-cross.pdf") + + if args.visual_data_dir: + visual_data_dir = Path(args.visual_data_dir) + plot_w100_queries(visual_data_dir, output_dir / "results-w100-queries.pdf") + plot_pose2_marginals( + visual_data_dir, output_dir / "results-w100-covariance.pdf" + ) + + if args.copy_csv_dir: + copy_dir = Path(args.copy_csv_dir) + copy_dir.mkdir(parents=True, exist_ok=True) + shutil.copy2(input_path, copy_dir / input_path.name) + + +if __name__ == "__main__": + main() diff --git a/timing/timeBayesTreeCovariance.cpp b/timing/timeBayesTreeCovariance.cpp new file mode 100644 index 0000000000..404a4eb42f --- /dev/null +++ b/timing/timeBayesTreeCovariance.cpp @@ -0,0 +1,1082 @@ +/* ---------------------------------------------------------------------------- + + * GTSAM Copyright 2010, Georgia Tech Research Corporation, + * Atlanta, Georgia 30332-0415 + * All Rights Reserved + * Authors: Frank Dellaert, et al. 
(see THANKS for the full author list) + + * See LICENSE for the license information + * -------------------------------------------------------------------------- */ + +/** + * @file timeBayesTreeCovariance.cpp + * @brief Benchmark Bayes-tree covariance recovery variants. + * @date March 2026 + * @author Codex 5.4, prompted by Frank Dellaert + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace gtsam; +using namespace std; + +namespace { + +/// Covariance-recovery variant used in the benchmark. +enum class Variant { + LegacyDense, + SteinerDense, + LegacySolve, + SteinerSolve, +}; + +/// One benchmark query together with any selected-block split. +struct QueryCase { + string family; + size_t querySize; + KeyVector keys; + KeyVector left; + KeyVector right; + bool crossCovariance = false; +}; + +/// Structural statistics for the support touched by a query. +struct SupportStats { + size_t supportCliques = 0; + size_t compressedCliques = 0; +}; + +/// Timing result for one query under one variant. +struct RawResult { + string dataset; + string ordering; + string family; + string mode; + string variant; + size_t querySize = 0; + size_t queryIndex = 0; + size_t repeatIndex = 0; + double totalMs = 0.0; + double reductionMs = 0.0; + double extractionMs = 0.0; + size_t supportCliques = 0; + size_t compressedCliques = 0; + size_t reducedStateDim = 0; +}; + +/// Grouping key for per-query aggregation across repeated timings. 
struct PerQueryKey {
  std::string dataset;
  std::string ordering;
  std::string family;
  std::string mode;
  std::string variant;
  size_t querySize = 0;
  size_t queryIndex = 0;

  bool operator==(const PerQueryKey& other) const {
    return dataset == other.dataset && ordering == other.ordering &&
           family == other.family && mode == other.mode &&
           variant == other.variant && querySize == other.querySize &&
           queryIndex == other.queryIndex;
  }
};

/// Hash function for PerQueryKey (boost-style hash_combine over all fields).
struct PerQueryKeyHash {
  size_t operator()(const PerQueryKey& key) const {
    // NOTE(review): the std::hash template arguments were stripped during
    // extraction; reconstructed as hash<string> / hash<size_t> to match the
    // member types being hashed.
    size_t seed = std::hash<std::string>()(key.dataset);
    seed ^= std::hash<std::string>()(key.ordering) + 0x9e3779b9 + (seed << 6) +
            (seed >> 2);
    seed ^= std::hash<std::string>()(key.family) + 0x9e3779b9 + (seed << 6) +
            (seed >> 2);
    seed ^= std::hash<std::string>()(key.mode) + 0x9e3779b9 + (seed << 6) +
            (seed >> 2);
    seed ^= std::hash<std::string>()(key.variant) + 0x9e3779b9 + (seed << 6) +
            (seed >> 2);
    seed ^= std::hash<size_t>()(key.querySize) + 0x9e3779b9 + (seed << 6) +
            (seed >> 2);
    seed ^= std::hash<size_t>()(key.queryIndex) + 0x9e3779b9 + (seed << 6) +
            (seed >> 2);
    return seed;
  }
};

/// Aggregated result for one fixed query across repeated timing samples.
struct PerQueryResult {
  std::string dataset;
  std::string ordering;
  std::string family;
  std::string mode;
  std::string variant;
  size_t querySize = 0;
  size_t queryIndex = 0;
  size_t repeats = 0;  // number of timing repetitions aggregated
  double meanTotalMs = 0.0;
  double meanReductionMs = 0.0;
  double meanExtractionMs = 0.0;
  double medianTotalMs = 0.0;
  double medianReductionMs = 0.0;
  double medianExtractionMs = 0.0;
  double stddevTotalMs = 0.0;
  double stddevReductionMs = 0.0;
  double stddevExtractionMs = 0.0;
  size_t supportCliques = 0;
  size_t compressedCliques = 0;
  size_t reducedStateDim = 0;
};

/// Grouping key for aggregating repeated benchmark queries.
+struct SummaryKey { + string dataset; + string ordering; + string family; + string mode; + string variant; + size_t querySize = 0; + + bool operator==(const SummaryKey& other) const { + return dataset == other.dataset && ordering == other.ordering && + family == other.family && mode == other.mode && + variant == other.variant && querySize == other.querySize; + } +}; + +/// Hash function for SummaryKey. +struct SummaryKeyHash { + size_t operator()(const SummaryKey& key) const { + size_t seed = std::hash()(key.dataset); + seed ^= std::hash()(key.ordering) + 0x9e3779b9 + (seed << 6) + + (seed >> 2); + seed ^= std::hash()(key.family) + 0x9e3779b9 + (seed << 6) + + (seed >> 2); + seed ^= + std::hash()(key.mode) + 0x9e3779b9 + (seed << 6) + (seed >> 2); + seed ^= std::hash()(key.variant) + 0x9e3779b9 + (seed << 6) + + (seed >> 2); + seed ^= std::hash()(key.querySize) + 0x9e3779b9 + (seed << 6) + + (seed >> 2); + return seed; + } +}; + +/// Aggregated timing samples for one SummaryKey. +struct SummaryValue { + vector queryMeanTotalMs; + vector queryMeanReductionMs; + vector queryMeanExtractionMs; + vector supportCliques; + vector compressedCliques; + vector reducedStateDim; + size_t repeats = 0; +}; + +/// Return a deduplicated, sorted key list. +KeyVector uniqueSortedKeys(const KeyVector& keys) { + KeyVector result = keys; + sort(result.begin(), result.end()); + result.erase(unique(result.begin(), result.end()), result.end()); + return result; +} + +/// Return all Pose2 keys in sorted order. +KeyVector poseKeys(const Values& values) { + KeyVector keys; + for (const auto& keyValue : values.extract()) { + keys.push_back(keyValue.first); + } + sort(keys.begin(), keys.end()); + return keys; +} + +/// Return the variable dimensions for a key list. 
+vector dimsForKeys(const KeyVector& keys, const Values& values) { + vector dims; + dims.reserve(keys.size()); + for (Key key : keys) { + dims.push_back(values.at(key).dim()); + } + return dims; +} + +/// Convert block dimensions to prefix offsets. +vector blockOffsets(const vector& dims) { + vector offsets(dims.size() + 1, 0); + for (size_t i = 0; i < dims.size(); ++i) { + offsets[i + 1] = offsets[i] + dims[i]; + } + return offsets; +} + +/// Sum the dimensions of a key set. +size_t totalDimension(const KeySet& keys, const Values& values) { + size_t dim = 0; + for (Key key : keys) { + dim += values.at(key).dim(); + } + return dim; +} + +/// Compute the lowest common ancestor of two Bayes-tree cliques. +template +shared_ptr findLowestCommonAncestor(const shared_ptr& lhs, + const shared_ptr& rhs) { + unordered_set> ancestors; + for (auto current = lhs; current; current = current->parent()) { + ancestors.insert(current); + } + for (auto current = rhs; current; current = current->parent()) { + if (ancestors.count(current)) { + return current; + } + } + return nullptr; +} + +/// Estimate the support size and compressed size for a query. 
+SupportStats analyzeSupport(const GaussianBayesTree& bayesTree, + const KeyVector& queryKeys) { + vector queryCliques; + unordered_set seen; + for (Key key : queryKeys) { + auto clique = bayesTree.clique(key); + if (seen.insert(clique).second) { + queryCliques.push_back(clique); + } + } + if (queryCliques.empty()) { + return {}; + } + + auto root = queryCliques.front(); + for (size_t i = 1; i < queryCliques.size(); ++i) { + root = findLowestCommonAncestor(root, queryCliques[i]); + } + + unordered_set support; + support.insert(root); + for (const auto& clique : queryCliques) { + for (auto current = clique; current && current != root; + current = current->parent()) { + support.insert(current); + } + } + + unordered_map supportChildren; + for (const auto& clique : support) { + supportChildren[clique] = 0; + } + for (const auto& clique : support) { + if (clique == root) { + continue; + } + auto parent = clique->parent(); + if (parent && support.count(parent)) { + ++supportChildren[parent]; + } + } + + unordered_set querySet(queryCliques.begin(), + queryCliques.end()); + size_t compressed = 1; + for (const auto& clique : support) { + if (clique == root) { + continue; + } + if (querySet.count(clique) || supportChildren[clique] > 1) { + ++compressed; + } + } + + return {support.size(), compressed}; +} + +/// Build the legacy reduced factor graph by marginal re-elimination. +GaussianFactorGraph legacyReducedFactorGraph(const GaussianFactorGraph& graph, + const Ordering& fullOrdering, + const KeyVector& queryKeys) { + (void)fullOrdering; + return GaussianFactorGraph( + *graph.marginalMultifrontalBayesTree(queryKeys, EliminatePreferCholesky)); +} + +/// Build the localized reduced factor graph using Bayes-tree joint extraction. 
+GaussianFactorGraph steinerReducedFactorGraph( + const GaussianBayesTree& bayesTree, const KeyVector& queryKeys) { + return *bayesTree.joint(queryKeys, EliminatePreferCholesky); +} + +/// Build the legacy two-key reduced factor graph using the pairwise joint path. +GaussianFactorGraph pairReducedFactorGraph(const GaussianBayesTree& bayesTree, + const KeyVector& queryKeys) { + return *bayesTree.joint(queryKeys[0], queryKeys[1], EliminatePreferCholesky); +} + +/// Eliminate a reduced factor graph to a Bayes net ordered by the query keys. +GaussianBayesNet queryBayesNet(const GaussianFactorGraph& graph, + const KeyVector& queryKeys) { + return *graph.marginalMultifrontalBayesNet(Ordering(queryKeys), + EliminatePreferCholesky); +} + +/// Recover selected covariance columns using triangular solves. +Matrix covarianceColumns(const GaussianBayesNet& bayesNet, + const KeyVector& orderedKeys, + const vector& dims, + const vector& selectedBlocks) { + const auto [R, rhs] = bayesNet.matrix(Ordering(orderedKeys)); + (void)rhs; + const vector offsets = blockOffsets(dims); + const size_t totalDim = offsets.back(); + + size_t selectedDim = 0; + for (size_t blockIndex : selectedBlocks) { + selectedDim += dims.at(blockIndex); + } + + Matrix selectors = Matrix::Zero(totalDim, selectedDim); + size_t selectedOffset = 0; + for (size_t blockIndex : selectedBlocks) { + const size_t begin = offsets[blockIndex]; + const size_t dim = dims[blockIndex]; + selectors.block(begin, selectedOffset, dim, dim).setIdentity(); + selectedOffset += dim; + } + + R.transpose().triangularView().solveInPlace(selectors); + R.triangularView().solveInPlace(selectors); + return selectors; +} + +/// Extract a left-by-right cross-covariance block from packed selected columns. 
+Matrix extractPackedCrossBlock(const Matrix& selectedColumns, + const KeyVector& orderedKeys, + const vector& dims, + const KeyVector& left, const KeyVector& right) { + const FastMap keyIndex = Ordering(orderedKeys).invert(); + const vector offsets = blockOffsets(dims); + size_t leftDim = 0; + for (Key key : left) { + leftDim += dims.at(keyIndex.at(key)); + } + size_t rightDim = 0; + for (Key key : right) { + rightDim += dims.at(keyIndex.at(key)); + } + + Matrix result(leftDim, rightDim); + size_t rowOffset = 0; + for (Key leftKey : left) { + const size_t leftBlock = keyIndex.at(leftKey); + const size_t leftDimBlock = dims[leftBlock]; + const size_t leftBegin = offsets[leftBlock]; + size_t columnOffset = 0; + size_t selectedOffset = 0; + for (Key rightKey : right) { + const size_t rightBlock = keyIndex.at(rightKey); + const size_t rightDimBlock = dims[rightBlock]; + result.block(rowOffset, columnOffset, leftDimBlock, rightDimBlock) = + selectedColumns.block(leftBegin, selectedOffset, leftDimBlock, + rightDimBlock); + columnOffset += rightDimBlock; + selectedOffset += rightDimBlock; + } + rowOffset += leftDimBlock; + } + return result; +} + +/// Extract a left-by-right cross-covariance block from a full covariance +/// matrix. 
+Matrix extractDenseCrossBlock(const Matrix& fullCovariance, + const KeyVector& orderedKeys, + const vector& dims, const KeyVector& left, + const KeyVector& right) { + const FastMap keyIndex = Ordering(orderedKeys).invert(); + const vector offsets = blockOffsets(dims); + size_t leftDim = 0; + for (Key key : left) { + leftDim += dims.at(keyIndex.at(key)); + } + size_t rightDim = 0; + for (Key key : right) { + rightDim += dims.at(keyIndex.at(key)); + } + + Matrix result(leftDim, rightDim); + size_t rowOffset = 0; + for (Key leftKey : left) { + const size_t leftBlock = keyIndex.at(leftKey); + const size_t leftDimBlock = dims[leftBlock]; + const size_t leftBegin = offsets[leftBlock]; + size_t columnOffset = 0; + for (Key rightKey : right) { + const size_t rightBlock = keyIndex.at(rightKey); + const size_t rightDimBlock = dims[rightBlock]; + const size_t rightBegin = offsets[rightBlock]; + result.block(rowOffset, columnOffset, leftDimBlock, rightDimBlock) = + fullCovariance.block(leftBegin, rightBegin, leftDimBlock, + rightDimBlock); + columnOffset += rightDimBlock; + } + rowOffset += leftDimBlock; + } + return result; +} + +/// Convert a Variant enum to a CSV-friendly name. +string variantName(Variant variant) { + switch (variant) { + case Variant::LegacyDense: + return "legacy_dense"; + case Variant::SteinerDense: + return "steiner_dense"; + case Variant::LegacySolve: + return "legacy_solve"; + case Variant::SteinerSolve: + return "steiner_solve"; + } + return "unknown"; +} + +/// Convert an ordering type to a human-readable name. +string orderingName(Ordering::OrderingType orderingType) { + return orderingType == Ordering::METIS ? "METIS" : "COLAMD"; +} + +/// Compute the median of a small sample vector. 
double median(std::vector<double> values) {
  // Takes the sample by value because computing the median requires sorting.
  // Returns 0.0 for an empty sample.
  if (values.empty()) {
    return 0.0;
  }
  std::sort(values.begin(), values.end());
  const size_t mid = values.size() / 2;
  if (values.size() % 2 == 0) {
    return 0.5 * (values[mid - 1] + values[mid]);
  }
  return values[mid];
}

/// Compute the arithmetic mean of a sample vector (0.0 for an empty sample).
double mean(const std::vector<double>& values) {
  if (values.empty()) {
    return 0.0;
  }
  const double sum = std::accumulate(values.begin(), values.end(), 0.0);
  return sum / static_cast<double>(values.size());
}

/// Compute the population standard deviation of a sample vector
/// (divides by N, not N-1; returns 0.0 for fewer than two samples).
double stddev(const std::vector<double>& values) {
  if (values.size() < 2) {
    return 0.0;
  }
  const double sampleMean = mean(values);
  double sumSquares = 0.0;
  for (double value : values) {
    const double delta = value - sampleMean;
    sumSquares += delta * delta;
  }
  return std::sqrt(sumSquares / static_cast<double>(values.size()));
}

/// Sample representative starting indices across a window range.
/// Returns at most maxQueries stride-spaced starts in [0, maxStartInclusive],
/// appending maxStartInclusive when there is room and it was not reached.
std::vector<size_t> sampledStarts(size_t maxStartInclusive, size_t maxQueries,
                                  size_t stride = 1) {
  std::vector<size_t> starts;
  if (maxStartInclusive == 0) {
    starts.push_back(0);
    return starts;
  }

  // Guard against an accidental zero stride, which would loop forever.
  if (stride == 0) {
    stride = 1;
  }

  for (size_t start = 0;
       start <= maxStartInclusive && starts.size() < maxQueries;
       start += stride) {
    starts.push_back(start);
  }
  // The loop can never exceed maxQueries entries (the original's resize and
  // downsampling branches were unreachable), so only the "ran out of range
  // early" case needs patching: cover the last admissible start if missed.
  if (starts.size() < maxQueries && !starts.empty() &&
      starts.back() != maxStartInclusive) {
    starts.push_back(maxStartInclusive);
  }
  return starts;
}

/// Generate contiguous local-window benchmark queries.
+vector generateLocalWindows(const KeyVector& poseKeys, + size_t querySize, size_t maxQueries) { + vector queries; + if (poseKeys.size() < querySize) { + return queries; + } + const size_t maxStart = poseKeys.size() - querySize; + for (size_t start : + sampledStarts(maxStart, maxQueries, max(1, querySize / 2))) { + QueryCase query; + query.family = "local_window"; + query.querySize = querySize; + query.keys.assign(poseKeys.begin() + start, + poseKeys.begin() + start + querySize); + queries.push_back(query); + } + return queries; +} + +/// Generate single-pose queries that exercise the legacy marginal path. +vector generateSinglePoseQueries(const KeyVector& poseKeys, + size_t maxQueries) { + vector queries; + if (poseKeys.empty()) { + return queries; + } + for (size_t start : + sampledStarts(poseKeys.size() - 1, maxQueries, + max(1, poseKeys.size() / maxQueries))) { + QueryCase query; + query.family = "single_pose"; + query.querySize = 1; + query.keys = {poseKeys[start]}; + queries.push_back(query); + } + return queries; +} + +/// Generate adjacent-pair queries that exercise the legacy pairwise path. +vector generatePairPoseQueries(const KeyVector& poseKeys, + size_t maxQueries) { + vector queries; + if (poseKeys.size() < 2) { + return queries; + } + for (size_t start : sampledStarts(poseKeys.size() - 2, maxQueries, 1)) { + QueryCase query; + query.family = "pair_pose"; + query.querySize = 2; + query.keys = {poseKeys[start], poseKeys[start + 1]}; + queries.push_back(query); + } + return queries; +} + +/// Generate highly overlapping windows to test repeated local queries. 
+vector generateRepeatedOverlap(const KeyVector& poseKeys, + size_t querySize, size_t maxQueries) { + vector queries; + if (poseKeys.size() < querySize) { + return queries; + } + const size_t maxStart = poseKeys.size() - querySize; + for (size_t start = 0; start <= maxStart && queries.size() < maxQueries; + ++start) { + QueryCase query; + query.family = "overlap_window"; + query.querySize = querySize; + query.keys.assign(poseKeys.begin() + start, + poseKeys.begin() + start + querySize); + queries.push_back(query); + } + return queries; +} + +/// Generate wide-separated queries spread across the trajectory. +vector generateWideSeparated(const KeyVector& poseKeys, + size_t querySize, size_t maxQueries) { + vector queries; + if (poseKeys.size() < querySize) { + return queries; + } + + const double gap = querySize == 1 + ? 0.0 + : double(poseKeys.size() - 1) / double(querySize - 1); + for (size_t sample = 0; sample < maxQueries; ++sample) { + const double offset = + maxQueries == 0 ? 0.0 + : (gap / max(1.0, double(maxQueries))) * sample; + QueryCase query; + query.family = "wide_separated"; + query.querySize = querySize; + size_t previous = 0; + for (size_t j = 0; j < querySize; ++j) { + size_t index = static_cast(llround(offset + gap * j)); + index = min(index, poseKeys.size() - querySize + j); + if (j > 0) { + index = max(index, previous + 1); + } + previous = index; + query.keys.push_back(poseKeys[index]); + } + queries.push_back(query); + } + return queries; +} + +/// Generate selected cross-covariance queries from overlapping windows. 
+vector generateSelectedCross(const KeyVector& poseKeys, + size_t querySize, size_t maxQueries) { + vector queries; + if (poseKeys.size() < querySize || querySize < 2) { + return queries; + } + const size_t split = querySize / 2; + for (QueryCase query : + generateRepeatedOverlap(poseKeys, querySize, maxQueries)) { + query.family = "selected_cross"; + query.crossCovariance = true; + query.left.assign(query.keys.begin(), query.keys.begin() + split); + query.right.assign(query.keys.begin() + split, query.keys.end()); + queries.push_back(query); + } + return queries; +} + +/// Build the full benchmark workload across all query families. +vector buildWorkload(const KeyVector& poseKeyList) { + vector queries; + auto singlePose = generateSinglePoseQueries(poseKeyList, 32); + queries.insert(queries.end(), singlePose.begin(), singlePose.end()); + + auto pairPose = generatePairPoseQueries(poseKeyList, 32); + queries.insert(queries.end(), pairPose.begin(), pairPose.end()); + + for (size_t querySize : {size_t(3), size_t(5), size_t(10), size_t(20)}) { + auto local = generateLocalWindows(poseKeyList, querySize, 16); + queries.insert(queries.end(), local.begin(), local.end()); + + auto wide = generateWideSeparated(poseKeyList, querySize, 8); + queries.insert(queries.end(), wide.begin(), wide.end()); + } + + auto overlap = generateRepeatedOverlap(poseKeyList, 10, 32); + queries.insert(queries.end(), overlap.begin(), overlap.end()); + + for (size_t querySize : {size_t(10), size_t(20)}) { + auto selected = generateSelectedCross(poseKeyList, querySize, 24); + queries.insert(queries.end(), selected.begin(), selected.end()); + } + return queries; +} + +/// Time one query under one covariance-recovery variant. 
+RawResult benchmarkQuery(const string& datasetName, const string& orderingLabel, + Variant variant, + const GaussianFactorGraph& linearGraph, + const GaussianBayesTree& bayesTree, + const Ordering& fullOrdering, const Values& values, + const QueryCase& query, size_t queryIndex, + size_t repeatIndex) { + const KeyVector orderedKeys = uniqueSortedKeys(query.keys); + const SupportStats supportStats = analyzeSupport(bayesTree, orderedKeys); + + if (orderedKeys.size() == 1) { + const auto reductionStart = chrono::steady_clock::now(); + const auto marginalFactor = + bayesTree.marginalFactor(orderedKeys.front(), EliminatePreferCholesky); + const Matrix information = marginalFactor->information(); + const auto reductionEnd = chrono::steady_clock::now(); + + const auto extractionStart = chrono::steady_clock::now(); + Matrix recovered; + if (variant == Variant::LegacyDense || variant == Variant::SteinerDense) { + recovered = information.inverse(); + } else { + Eigen::LLT llt(information.selfadjointView()); + recovered = Matrix::Identity(information.rows(), information.cols()); + llt.solveInPlace(recovered); + } + const auto extractionEnd = chrono::steady_clock::now(); + + volatile double checksum = recovered.sum(); + (void)checksum; + + RawResult result; + result.dataset = datasetName; + result.ordering = orderingLabel; + result.family = query.family; + result.mode = "joint"; + result.variant = variantName(variant); + result.querySize = query.querySize; + result.queryIndex = queryIndex; + result.repeatIndex = repeatIndex; + result.reductionMs = + chrono::duration(reductionEnd - reductionStart).count(); + result.extractionMs = + chrono::duration(extractionEnd - extractionStart) + .count(); + result.totalMs = result.reductionMs + result.extractionMs; + result.supportCliques = 1; + result.compressedCliques = 1; + result.reducedStateDim = information.rows(); + return result; + } + + const auto reductionStart = chrono::steady_clock::now(); + GaussianFactorGraph reducedGraph = + 
orderedKeys.size() == 2 + ? pairReducedFactorGraph(bayesTree, orderedKeys) + : ((variant == Variant::LegacyDense || + variant == Variant::LegacySolve) + ? legacyReducedFactorGraph(linearGraph, fullOrdering, + orderedKeys) + : steinerReducedFactorGraph(bayesTree, orderedKeys)); + const auto reductionEnd = chrono::steady_clock::now(); + + const auto extractionStart = chrono::steady_clock::now(); + GaussianBayesNet reducedBayesNet = queryBayesNet(reducedGraph, orderedKeys); + const vector dims = dimsForKeys(orderedKeys, values); + Matrix recovered; + if (query.crossCovariance) { + const FastMap keyIndex = Ordering(orderedKeys).invert(); + vector rightBlocks; + rightBlocks.reserve(query.right.size()); + for (Key key : query.right) { + rightBlocks.push_back(keyIndex.at(key)); + } + + Matrix selectedColumns; + if (variant == Variant::LegacyDense || variant == Variant::SteinerDense) { + const auto [R, rhs] = reducedBayesNet.matrix(Ordering(orderedKeys)); + (void)rhs; + const Matrix information = R.transpose() * R; + const Matrix covariance = information.inverse(); + recovered = extractDenseCrossBlock(covariance, orderedKeys, dims, + query.left, query.right); + } else { + selectedColumns = + covarianceColumns(reducedBayesNet, orderedKeys, dims, rightBlocks); + recovered = extractPackedCrossBlock(selectedColumns, orderedKeys, dims, + query.left, query.right); + } + } else { + if (variant == Variant::LegacyDense || variant == Variant::SteinerDense) { + const auto [R, rhs] = reducedBayesNet.matrix(Ordering(orderedKeys)); + (void)rhs; + const Matrix information = R.transpose() * R; + recovered = information.inverse(); + } else { + vector allBlocks(orderedKeys.size()); + iota(allBlocks.begin(), allBlocks.end(), 0); + recovered = + covarianceColumns(reducedBayesNet, orderedKeys, dims, allBlocks); + } + } + const auto extractionEnd = chrono::steady_clock::now(); + + volatile double checksum = recovered.sum(); + (void)checksum; + + RawResult result; + result.dataset = datasetName; 
+ result.ordering = orderingLabel; + result.family = query.family; + result.mode = query.crossCovariance ? "cross" : "joint"; + result.variant = variantName(variant); + result.querySize = query.querySize; + result.queryIndex = queryIndex; + result.repeatIndex = repeatIndex; + result.reductionMs = + chrono::duration(reductionEnd - reductionStart).count(); + result.extractionMs = + chrono::duration(extractionEnd - extractionStart).count(); + result.totalMs = result.reductionMs + result.extractionMs; + result.supportCliques = supportStats.supportCliques; + result.compressedCliques = supportStats.compressedCliques; + result.reducedStateDim = totalDimension(reducedGraph.keys(), values); + return result; +} + +/// Write per-query timing results to CSV. +void writeRawCsv(const filesystem::path& path, + const vector& results) { + ofstream os(path); + os << "dataset,ordering,query_family,mode,variant,query_size,query_index," + "repeat_index," + "total_ms,reduction_ms,extraction_ms,support_cliques,compressed_" + "cliques," + "reduced_state_dim\n"; + os << fixed << setprecision(6); + for (const auto& result : results) { + os << result.dataset << ',' << result.ordering << ',' << result.family + << ',' << result.mode << ',' << result.variant << ',' << result.querySize + << ',' << result.queryIndex << ',' << result.repeatIndex << ',' + << result.totalMs << ',' << result.reductionMs << ',' + << result.extractionMs << ',' << result.supportCliques << ',' + << result.compressedCliques << ',' << result.reducedStateDim << '\n'; + } +} + +/// Aggregate repeated raw timings into one row per distinct query. 
+vector aggregatePerQueryResults( + const vector& results) { + struct PerQuerySamples { + vector totalMs; + vector reductionMs; + vector extractionMs; + size_t supportCliques = 0; + size_t compressedCliques = 0; + size_t reducedStateDim = 0; + }; + + unordered_map perQuery; + for (const auto& result : results) { + const PerQueryKey key{result.dataset, result.ordering, result.family, + result.mode, result.variant, result.querySize, + result.queryIndex}; + auto& value = perQuery[key]; + value.totalMs.push_back(result.totalMs); + value.reductionMs.push_back(result.reductionMs); + value.extractionMs.push_back(result.extractionMs); + value.supportCliques = result.supportCliques; + value.compressedCliques = result.compressedCliques; + value.reducedStateDim = result.reducedStateDim; + } + + vector aggregated; + aggregated.reserve(perQuery.size()); + for (const auto& [key, value] : perQuery) { + PerQueryResult result; + result.dataset = key.dataset; + result.ordering = key.ordering; + result.family = key.family; + result.mode = key.mode; + result.variant = key.variant; + result.querySize = key.querySize; + result.queryIndex = key.queryIndex; + result.repeats = value.totalMs.size(); + result.meanTotalMs = mean(value.totalMs); + result.meanReductionMs = mean(value.reductionMs); + result.meanExtractionMs = mean(value.extractionMs); + result.medianTotalMs = median(value.totalMs); + result.medianReductionMs = median(value.reductionMs); + result.medianExtractionMs = median(value.extractionMs); + result.stddevTotalMs = stddev(value.totalMs); + result.stddevReductionMs = stddev(value.reductionMs); + result.stddevExtractionMs = stddev(value.extractionMs); + result.supportCliques = value.supportCliques; + result.compressedCliques = value.compressedCliques; + result.reducedStateDim = value.reducedStateDim; + aggregated.push_back(result); + } + return aggregated; +} + +/// Write per-query aggregated timing results to CSV. 
+void writePerQueryCsv(const filesystem::path& path, + const vector& results) { + ofstream os(path); + os << "dataset,ordering,query_family,mode,variant,query_size,query_index," + "repeats,mean_total_ms,mean_reduction_ms,mean_extraction_ms," + "median_total_ms,median_reduction_ms,median_extraction_ms," + "stddev_total_ms,stddev_reduction_ms,stddev_extraction_ms," + "support_cliques,compressed_cliques,reduced_state_dim\n"; + os << fixed << setprecision(6); + for (const auto& result : results) { + os << result.dataset << ',' << result.ordering << ',' << result.family + << ',' << result.mode << ',' << result.variant << ',' << result.querySize + << ',' << result.queryIndex << ',' << result.repeats << ',' + << result.meanTotalMs << ',' << result.meanReductionMs << ',' + << result.meanExtractionMs << ',' << result.medianTotalMs << ',' + << result.medianReductionMs << ',' << result.medianExtractionMs << ',' + << result.stddevTotalMs << ',' << result.stddevReductionMs << ',' + << result.stddevExtractionMs << ',' << result.supportCliques << ',' + << result.compressedCliques << ',' << result.reducedStateDim << '\n'; + } +} + +/// Write family-level summary statistics aggregated from per-query results. 
+void writeSummaryCsv(const filesystem::path& path, + const vector& results) { + unordered_map summary; + for (const auto& result : results) { + const SummaryKey key{result.dataset, result.ordering, result.family, + result.mode, result.variant, result.querySize}; + auto& value = summary[key]; + value.queryMeanTotalMs.push_back(result.meanTotalMs); + value.queryMeanReductionMs.push_back(result.meanReductionMs); + value.queryMeanExtractionMs.push_back(result.meanExtractionMs); + value.supportCliques.push_back(static_cast(result.supportCliques)); + value.compressedCliques.push_back( + static_cast(result.compressedCliques)); + value.reducedStateDim.push_back( + static_cast(result.reducedStateDim)); + value.repeats = result.repeats; + } + + ofstream os(path); + os << "dataset,ordering,query_family,mode,variant,query_size,queries,repeats," + "median_total_ms,sum_query_mean_total_ms,median_reduction_ms," + "median_extraction_ms,support_cliques,compressed_cliques," + "reduced_state_dim\n"; + os << fixed << setprecision(6); + for (const auto& [key, value] : summary) { + const double totalTime = accumulate(value.queryMeanTotalMs.begin(), + value.queryMeanTotalMs.end(), 0.0); + os << key.dataset << ',' << key.ordering << ',' << key.family << ',' + << key.mode << ',' << key.variant << ',' << key.querySize << ',' + << value.queryMeanTotalMs.size() << ',' << value.repeats << ',' + << median(value.queryMeanTotalMs) << ',' << totalTime << ',' + << median(value.queryMeanReductionMs) << ',' + << median(value.queryMeanExtractionMs) << ',' + << median(value.supportCliques) << ',' << median(value.compressedCliques) + << ',' << median(value.reducedStateDim) << '\n'; + } +} + +/// Read a string argument from argv or return a default value. 
+string argumentOrDefault(char** begin, char** end, const string& flag, + const string& defaultValue) { + for (auto it = begin; it != end; ++it) { + if (string(*it) == flag && it + 1 != end) { + return *(it + 1); + } + } + return defaultValue; +} + +/// Read an unsigned integer argument from argv or return a default value. +size_t sizeTArgumentOrDefault(char** begin, char** end, const string& flag, + size_t defaultValue) { + return static_cast( + stoull(argumentOrDefault(begin, end, flag, to_string(defaultValue)))); +} + +/// Split a comma-separated argument into individual values. +vector splitCommaSeparated(const string& input) { + vector values; + size_t start = 0; + while (start < input.size()) { + const size_t comma = input.find(',', start); + if (comma == string::npos) { + values.push_back(input.substr(start)); + break; + } + values.push_back(input.substr(start, comma - start)); + start = comma + 1; + } + return values; +} + +} // namespace + +int main(int argc, char** argv) { + const filesystem::path outputDir = argumentOrDefault( + argv, argv + argc, "--output-dir", + (filesystem::path("timing") / "results" / "bayes_tree_covariance") + .string()); + const vector datasets = splitCommaSeparated(argumentOrDefault( + argv, argv + argc, "--datasets", "w100.graph,w10000.graph,w20000.txt")); + const size_t repeats = + sizeTArgumentOrDefault(argv, argv + argc, "--repeats", 10); + + filesystem::create_directories(outputDir); + + vector rawResults; + + for (const string& datasetName : datasets) { + cout << "Loading " << datasetName << endl; + const auto [graphPtr, initialPtr] = + load2D(findExampleDataFile(datasetName)); + const KeyVector anchoredPoseKeys = poseKeys(*initialPtr); + if (!anchoredPoseKeys.empty()) { + graphPtr->addPrior(anchoredPoseKeys.front(), + initialPtr->at(anchoredPoseKeys.front()), + noiseModel::Diagonal::Sigmas( + (Vector(3) << 1e-6, 1e-6, 1e-6).finished())); + } + + cout << "Optimizing " << datasetName << endl; + LevenbergMarquardtOptimizer 
optimizer(*graphPtr, *initialPtr); + Values result = optimizer.optimize(); + GaussianFactorGraph linearGraph = *graphPtr->linearize(result); + const KeyVector poseKeyList = poseKeys(result); + const vector workload = buildWorkload(poseKeyList); + + for (const auto orderingType : {Ordering::COLAMD, Ordering::METIS}) { +#ifndef GTSAM_SUPPORT_NESTED_DISSECTION + if (orderingType == Ordering::METIS) { + continue; + } +#endif + const Ordering ordering = Ordering::Create(orderingType, linearGraph); + cout << " Ordering " << orderingName(orderingType) << " with " + << workload.size() << " queries" << endl; + GaussianBayesTree bayesTree = + *linearGraph.eliminateMultifrontal(ordering, EliminatePreferCholesky); + + size_t queryIndex = 0; + for (const QueryCase& query : workload) { + if (query.querySize == 1) { + (void)bayesTree.marginalFactor(query.keys.front(), + EliminatePreferCholesky); + } else if (query.querySize == 2) { + (void)bayesTree.joint(query.keys[0], query.keys[1], + EliminatePreferCholesky); + } + for (const Variant variant : + {Variant::LegacyDense, Variant::SteinerDense, Variant::LegacySolve, + Variant::SteinerSolve}) { + try { + (void)benchmarkQuery(datasetName, orderingName(orderingType), + variant, linearGraph, bayesTree, ordering, + result, query, queryIndex, 0); + for (size_t repeatIndex = 0; repeatIndex < repeats; ++repeatIndex) { + rawResults.push_back(benchmarkQuery( + datasetName, orderingName(orderingType), variant, linearGraph, + bayesTree, ordering, result, query, queryIndex, repeatIndex)); + } + } catch (const std::exception& error) { + cerr << "Failure for dataset=" << datasetName + << " ordering=" << orderingName(orderingType) + << " variant=" << variantName(variant) + << " family=" << query.family << " query_index=" << queryIndex + << ": " << error.what() << endl; + throw; + } + } + ++queryIndex; + } + } + } + + writeRawCsv(outputDir / "raw.csv", rawResults); + const vector perQueryResults = + aggregatePerQueryResults(rawResults); + 
writePerQueryCsv(outputDir / "per_query.csv", perQueryResults); + writeSummaryCsv(outputDir / "summary.csv", perQueryResults); + cout << "Wrote benchmark results to " << outputDir << endl; + return 0; +}