Impute per-dim empirical mean (not zero) for missing features in HeterogeneousMTGP (#3294)

Carl Hvarfner · facebook-github-bot · commit ea00d744f7f3 · 2026-04-28T07:51:22.000-07:00
Summary:

HeterogeneousMTGP zero-pads missing per-task feature columns, which risks heavily skewing the ranges of parameters whose values lie far from zero and rendering these dimensions very difficult to fit.

Instead, impute each missing dim with the empirical mean of that dim, taken over the train_Xs
of all tasks that contain it, keeping padded values inside the model's input range. Falls back to 0 if no task provides data for a dim.

Reviewed By: saitcakmak

Differential Revision: D102390346
diff --git a/botorch/models/heterogeneous_mtgp.py b/botorch/models/heterogeneous_mtgp.py
@@ -110,8 +110,21 @@ def __init__(
         """
         self.full_feature_dim = full_feature_dim
         self.feature_indices = feature_indices
+        imputation_values = self._compute_imputation_values(
+            train_Xs=train_Xs,
+            feature_indices=feature_indices,
+            full_feature_dim=full_feature_dim,
+        )
+        # The first time we map to full tensor, we have to pass in the imputation values
+        # as they have not yet been registered as buffers - this has to wait until after
+        # super().__init__.
         full_X = torch.cat(
-            [self.map_to_full_tensor(X=X, task_index=i) for i, X in enumerate(train_Xs)]
+            [
+                self.map_to_full_tensor(
+                    X=X, task_index=i, imputation_values=imputation_values
+                )
+                for i, X in enumerate(train_Xs)
+            ]
         )
         full_Y = torch.cat(train_Ys)
         full_Yvar = None if train_Yvars is None else torch.cat(train_Yvars)
@@ -139,6 +152,7 @@ def __init__(
             outcome_transform=outcome_transform,
             validate_task_values=validate_task_values,
         )
+        self.register_buffer("feature_imputation_values", imputation_values)
 
     @classmethod
     def get_all_tasks(
@@ -158,36 +172,80 @@ def get_all_tasks(
             all_tasks_inferred = [0] + all_tasks_inferred
         return all_tasks_inferred, task_feature, num_non_task_features
 
-    def map_to_full_tensor(self, X: Tensor, task_index: int) -> Tensor:
+    def map_to_full_tensor(
+        self,
+        X: Tensor,
+        task_index: int,
+        imputation_values: Tensor | None = None,
+    ) -> Tensor:
         """Map a tensor of task-specific features to the full tensor of features,
         utilizing the feature indices to map each feature to its corresponding
         position in the full tensor. Also append the task index as the last column.
-        The columns of the full tensor that are not used by the given task will be
-        filled with zeros.
+        The columns of the full tensor that are not used by the given task are
+        filled with the per-dimension empirical mean computed across all tasks
+        that contain that dimension (see ``_compute_imputation_values``). This
+        avoids out-of-domain padding values that would otherwise be squashed by
+        an input transform with fixed bounds (e.g. ``Normalize``).
 
         Args:
             X: A tensor of shape ``(n x d_i)`` where ``d_i`` is the number of features
                 in the original task dataset.
             task_index: The index of the task whose features are being mapped.
+            imputation_values: Optional pre-computed imputation values. If not
+                provided, uses ``self.feature_imputation_values``.
 
         Returns:
             A tensor of shape ``(n x (self.full_feature_dim + 1))`` containing the
             mapped features.
 
         Example:
-            >>> # Suppose full feature dim is 3 and the feature indices for
-            >>> # task 5 are [2, 0].
+            >>> # Suppose full feature dim is 3, the feature indices for task 5
+            >>> # are [2, 0], and the empirical mean for missing dim 1 is 7.0.
             >>> X = torch.tensor([[1.0, 2.0], [3.0, 4.0]])
             >>> X_full = self.map_to_full_tensor(X=X, task_index=5)
-            >>> # X_full = torch.tensor([[2.0, 0.0, 1.0, 5.0], [4.0, 0.0, 3.0, 5.0]])
+            >>> # X_full = torch.tensor([[2.0, 7.0, 1.0, 5.0], [4.0, 7.0, 3.0, 5.0]])
         """
+        if imputation_values is None:
+            imputation_values = self.feature_imputation_values
         X_full = torch.zeros(
             *X.shape[:-1], self.full_feature_dim + 1, dtype=X.dtype, device=X.device
         )
+        X_full[..., : self.full_feature_dim] = imputation_values
         X_full[..., self.feature_indices[task_index]] = X
         X_full[..., -1] = task_index
         return X_full
 
+    @staticmethod
+    def _compute_imputation_values(
+        train_Xs: list[Tensor],
+        feature_indices: list[list[int]],
+        full_feature_dim: int,
+    ) -> Tensor:
+        """Compute per-dimension empirical mean across all tasks that contain
+        each dimension of the joint feature space.
+
+        For each dimension ``d`` in ``[0, full_feature_dim)``, collects the values
+        from every task's ``train_X`` column that maps to ``d`` and takes the mean.
+        These values are used by ``map_to_full_tensor`` to impute missing dims when
+        embedding a per-task ``X`` into the full feature space.
+
+        Returns:
+            A tensor of shape ``(full_feature_dim,)`` with the per-dim mean. A
+            dimension that no task contains (which should not occur under the
+            constructor's invariants) or whose tasks are all empty defaults to 0.
+        """
+        dtype = train_Xs[0].dtype
+        device = train_Xs[0].device
+        imputation = torch.zeros(full_feature_dim, dtype=dtype, device=device)
+        for d in range(full_feature_dim):
+            values: list[Tensor] = []
+            for indices, X in zip(feature_indices, train_Xs):
+                if d in indices and X.numel() > 0:
+                    values.append(X[..., indices.index(d)].reshape(-1))
+            if values:
+                imputation[d] = torch.cat(values).mean()
+        return imputation
+
     def posterior(
         self,
         X: Tensor,
diff --git a/test/models/test_heterogeneous_mtgp.py b/test/models/test_heterogeneous_mtgp.py
@@ -122,6 +122,23 @@ def test_standard_heterogeneous_mtgp(self) -> None:
                 model.likelihood.noise_covar.noise.shape[-1], model.num_tasks
             )
 
+        with self.subTest("imputation_uses_per_dim_empirical_mean"):
+            # Full feature space is [x1, x2, x3, x4, x5]. x3 is only in task 0,
+            # x4 and x5 are only in task 2. Imputation values for missing dims
+            # should equal the empirical mean of those columns across tasks.
+            expected_x3_mean = self.ds1.X[:, 2].mean()
+            expected_x4_mean = self.ds3.X[:, 2].mean()
+            expected_x5_mean = self.ds3.X[:, 3].mean()
+            self.assertAllClose(model.feature_imputation_values[2], expected_x3_mean)
+            self.assertAllClose(model.feature_imputation_values[3], expected_x4_mean)
+            self.assertAllClose(model.feature_imputation_values[4], expected_x5_mean)
+            # Task 1 (ds2) does not have x3, x4, x5 -- those columns in the
+            # full training tensor must equal the imputation values, not zero.
+            task1_rows = model.train_inputs[0][model.train_inputs[0][:, -1] == 1]
+            self.assertAllClose(task1_rows[:, 2], expected_x3_mean.expand(3))
+            self.assertAllClose(task1_rows[:, 3], expected_x4_mean.expand(3))
+            self.assertAllClose(task1_rows[:, 4], expected_x5_mean.expand(3))
+
         # Evaluate the posterior (task column required).
         with self.assertRaisesRegex(UnsupportedError, "output_indices"):
             model.posterior(self.ds1.X, output_indices=[0, 1])