Simplify task value remapping API (meta-pytorch#3163)

Carl Hvarfner · facebook-github-bot · commit 60aff2b7a34c · 2026-02-12T12:01:26.000-08:00
Summary: X-link: facebook/Ax#4860 Simplifies the get_task_value_remapping() API from 4 parameters to 2 (all_task_values and dtype), addressing confusion reported in meta-pytorch#3085. The observed_task_values parameter is removed because the parent diff (D90769576) now makes MultiTaskGP track observed/unobserved tasks internally via _observed_task_indices and _unobserved_task_indices. The default_task_value parameter is removed because the previous behavior—silently mapping unknown tasks to an arbitrary fallback—was confusing and error-prone; instead, task values not in all_task_values now map to NaN, an explicit sentinel, and StratifiedStandardize emits a warning and applies an identity transform (mean=0, stdv=1) when it encounters one. Differential Revision: D90998243
diff --git a/botorch/models/multitask.py b/botorch/models/multitask.py
@@ -336,14 +336,10 @@ def __init__(
 
         self.covar_module = data_covar_module * task_covar_module
         task_mapper = get_task_value_remapping(
-            observed_task_values=torch.tensor(
-                all_tasks_inferred, dtype=torch.long, device=train_X.device
-            ),
             all_task_values=torch.tensor(
                 sorted(all_tasks), dtype=torch.long, device=train_X.device
             ),
             dtype=train_X.dtype,
-            default_task_value=None if output_tasks is None else output_tasks[0],
         )
         self.register_buffer("_task_mapper", task_mapper)
         self._expected_task_values = set(all_tasks)
diff --git a/botorch/models/transforms/outcome.py b/botorch/models/transforms/outcome.py
@@ -22,6 +22,7 @@
 
 from __future__ import annotations
 
+import warnings
 from abc import ABC, abstractmethod
 from collections import OrderedDict
 
@@ -511,12 +512,10 @@ class StratifiedStandardize(Standardize):
     def __init__(
         self,
         stratification_idx: int,
-        observed_task_values: Tensor,
         all_task_values: Tensor,
         batch_shape: torch.Size = torch.Size(),  # noqa: B008
         min_stdv: float = 1e-8,
         dtype: torch.dtype = torch.double,
-        default_task_value: int | None = None,
     ) -> None:
         r"""Standardize outcomes (zero mean, unit variance) along stratification dim.
 
@@ -526,28 +525,22 @@ def __init__(
         Args:
             stratification_idx: The index of the stratification dimension in the
                 input tensor X.
-            observed_task_values: ``t``-dim tensor of task values that were actually
-                observed in the training data.
             all_task_values: ``t``-dim tensor of all possible task values that could
                 appear in the dataset.
             batch_shape: The batch_shape of the training targets.
             min_stdv: The minimum standard deviation for which to perform
                 standardization (if lower, only de-mean the data).
             dtype: The data type for internal computations.
-            default_task_value: The default task value that unexpected tasks are
-                mapped to. This is used in ``get_task_value_remapping``.
         """
         OutcomeTransform.__init__(self)
         self._stratification_idx = stratification_idx
-        observed_task_values = observed_task_values.unique(sorted=True)
+        all_task_values = all_task_values.unique(sorted=True)
         self.strata_mapping = get_task_value_remapping(
-            observed_task_values=observed_task_values,
-            all_task_values=all_task_values.unique(sorted=True),
+            all_task_values=all_task_values,
             dtype=dtype,
-            default_task_value=default_task_value,
         )
         if self.strata_mapping is None:
-            self.strata_mapping = observed_task_values
+            self.strata_mapping = all_task_values
         n_strata = self.strata_mapping.shape[0]
         self._min_stdv = min_stdv
         self.register_buffer("means", torch.zeros(*batch_shape, n_strata, 1))
@@ -629,7 +622,20 @@ def _get_per_input_means_stdvs(
             - The per-input stdvs squared.
         """
         strata = X[..., self._stratification_idx].long()
-        mapped_strata = self.strata_mapping[strata].unsqueeze(-1).long()
+        mapped_strata_float = self.strata_mapping[strata]
+        # Check for unobserved tasks (mapped to NaN) and warn
+        unobserved_mask = torch.isnan(mapped_strata_float)
+        if unobserved_mask.any():
+            warnings.warn(
+                "Predictions are being made for tasks that were not observed "
+                "during training. These tasks will use an identity transform "
+                "(mean=0, stdv=1).",
+                stacklevel=3,
+            )
+            # Map unobserved tasks to index 0 temporarily for gather operation
+            mapped_strata_float = mapped_strata_float.clone()
+            mapped_strata_float[unobserved_mask] = 0.0
+        mapped_strata = mapped_strata_float.unsqueeze(-1).long()
         # get means and stdvs for each strata
         n_extra_batch_dims = mapped_strata.ndim - 2 - len(self._batch_shape)
         expand_shape = mapped_strata.shape[:n_extra_batch_dims] + self.means.shape
@@ -643,12 +649,22 @@ def _get_per_input_means_stdvs(
             dim=-2,
             index=mapped_strata,
         )
+        # Apply identity transform (mean=0, stdv=1) for unobserved tasks
+        if unobserved_mask.any():
+            unobserved_mask_expanded = unobserved_mask.unsqueeze(-1)
+            means = means.clone()
+            stdvs = stdvs.clone()
+            means[unobserved_mask_expanded] = 0.0
+            stdvs[unobserved_mask_expanded] = 1.0
         if include_stdvs_sq:
             stdvs_sq = torch.gather(
                 input=self._stdvs_sq.expand(expand_shape),
                 dim=-2,
                 index=mapped_strata,
             )
+            if unobserved_mask.any():
+                stdvs_sq = stdvs_sq.clone()
+                stdvs_sq[unobserved_mask_expanded] = 1.0
         else:
             stdvs_sq = None
         return means, stdvs, stdvs_sq
diff --git a/botorch/models/utils/assorted.py b/botorch/models/utils/assorted.py
@@ -412,59 +412,51 @@ class fantasize(_Flag):
 
 
 def get_task_value_remapping(
-    observed_task_values: Tensor,
     all_task_values: Tensor,
     dtype: torch.dtype,
-    default_task_value: int | None,
 ) -> Tensor | None:
-    """Construct an mapping of observed task values to contiguous int-valued floats.
+    """Construct a mapping of task values to contiguous int-valued floats.
+
+    This function creates a mapping tensor that remaps task indices. All tasks
+    in ``all_task_values`` are mapped to contiguous integers starting from 0.
+    Task values not in ``all_task_values`` are mapped to NaN.
 
     Args:
-        observed_task_values: A sorted long-valued tensor of task values.
-        all_task_values: A sorted long-valued tensor of task values.
+        all_task_values: A sorted long-valued tensor of all possible task values
+            in the full task space.
         dtype: The dtype of the model inputs (e.g. ``X``), which the new
             task values should have mapped to (e.g. float, double).
-        default_task_value: The default task value to use for missing task values.
 
     Returns:
-        A tensor of shape ``task_values.max() + 1`` that maps task values
+        A tensor of shape ``all_task_values.max() + 1`` that maps task values
         to new task values. The indexing operation ``mapper[task_value]``
         will produce a tensor of new task values, of the same shape as
-        the original. The elements of the ``mapper`` tensor that do not
-        appear in the original ``task_values`` are mapped to ``nan``. The
-        return value will be ``None``, when the task values are contiguous
-        integers starting from zero.
+        the original. All task values in ``all_task_values`` are mapped to
+        contiguous integers [0, 1, ..., n-1] where n is the number of tasks.
+        Task values not in ``all_task_values`` are mapped to NaN. Returns
+        ``None`` when ``all_task_values`` equals [0, 1, ..., n-1].
     """
     if dtype not in (torch.float, torch.double):
         raise ValueError(f"dtype must be torch.float or torch.double, but got {dtype}.")
     task_range = torch.arange(
-        len(observed_task_values),
+        len(all_task_values),
         dtype=all_task_values.dtype,
         device=all_task_values.device,
     )
     mapper = None
 
-    if default_task_value is None:
-        fill_value = float("nan")
-    else:
-        mask = observed_task_values == default_task_value
-        if not mask.any():
-            fill_value = float("nan")
-        else:
-            idx = mask.nonzero().item()
-            fill_value = task_range[idx]
-    # if not all tasks are observed or they are not contiguous integers
+    # if task values are not contiguous integers starting from 0,
     # then map them to contiguous integers
     if not torch.equal(task_range, all_task_values):
         # Create a tensor that maps task values to new task values.
         # The number of tasks should be small, so this should be quite efficient.
         mapper = torch.full(
             (int(all_task_values.max().item()) + 1,),
-            fill_value,
+            float("nan"),
             dtype=dtype,
             device=all_task_values.device,
         )
-        mapper[observed_task_values] = task_range.to(dtype=dtype)
+        mapper[all_task_values] = task_range.to(dtype=dtype)
     return mapper
 
 
diff --git a/test/models/test_fully_bayesian_multitask.py b/test/models/test_fully_bayesian_multitask.py
@@ -517,6 +517,43 @@ def test_fit_model_infer_noise(self):
     def test_fit_model_with_outcome_transform(self):
         self.test_fit_model(use_outcome_transform=True)
 
+    def test_fit_model_with_unobserved_tasks(self) -> None:
+        """Test fitting and predicting when some tasks have no training data."""
+        dtype = torch.double
+        tkwargs = {"device": self.device, "dtype": dtype}
+        # Tasks 0 and 2 observed; task 1 has no training data
+        _, _, _, model = self._get_data_and_model(
+            infer_noise=True,
+            use_outcome_transform=True,
+            output_tasks=[2],
+            observed_task_values=[0, 2],
+            all_tasks=[0, 1, 2],
+            validate_task_values=False,
+            **tkwargs,
+        )
+        # Contiguous all_tasks → no mapper needed
+        self.assertIsNone(model._task_mapper)
+        self.assertEqual(model.pyro_model.num_tasks, 3)
+
+        fit_fully_bayesian_model_nuts(
+            model, warmup_steps=8, num_samples=5, thinning=2, disable_progbar=True
+        )
+        self.assertIsNotNone(model.mean_module)
+
+        # Predict for observed tasks
+        test_X = torch.rand(3, 4, **tkwargs)
+        posterior = model.posterior(test_X)
+        self.assertIsInstance(posterior, GaussianMixturePosterior)
+        # output_tasks=[2] → single output
+        self.assertEqual(posterior.mean.shape[-1], 1)
+
+        # Predict for the UNOBSERVED task (task 1)
+        test_X_unobs = torch.cat(
+            [torch.rand(3, 4, **tkwargs), torch.ones(3, 1, **tkwargs)], dim=-1
+        )
+        posterior_unobs = model.posterior(test_X_unobs)
+        self.assertIsInstance(posterior_unobs, GaussianMixturePosterior)
+
     def test_transforms(self, infer_noise: bool = False):
         tkwargs = {"device": self.device, "dtype": torch.double}
         train_X, train_Y, train_Yvar, test_X = self._get_unnormalized_data(**tkwargs)
diff --git a/test/models/test_multitask.py b/test/models/test_multitask.py
diff --git a/test/models/transforms/test_outcome.py b/test/models/transforms/test_outcome.py