cornellius-gp
diff --git a/gpytorch/kernels/rff_kernel.py b/gpytorch/kernels/rff_kernel.py
Lines changed: 12 additions & 11 deletions
diff --git a/gpytorch/models/exact_gp.py b/gpytorch/models/exact_gp.py
Lines changed: 106 additions & 34 deletions
@@ -1,8 +1,7 @@
 #!/usr/bin/env python3
 
-from __future__ import annotations
-
 import math
+from typing import Optional
 
 import torch
 from linear_operator.operators import LowRankRootLinearOperator, MatmulLinearOperator, RootLinearOperator
@@ -93,14 +92,14 @@ class RFFKernel(Kernel):
 
     has_lengthscale = True
 
-    def __init__(self, num_samples: int, num_dims: int | None = None, **kwargs):
+    def __init__(self, num_samples: int, num_dims: Optional[int] = None, **kwargs):
         super().__init__(**kwargs)
         self.num_samples = num_samples
         if num_dims is not None:
             self._init_weights(num_dims, num_samples)
 
     def _init_weights(
-        self, num_dims: int | None = None, num_samples: int | None = None, randn_weights: Tensor | None = None
+        self, num_dims: Optional[int] = None, num_samples: Optional[int] = None, randn_weights: Optional[Tensor] = None
     ):
         if num_dims is not None and num_samples is not None:
             d = num_dims
@@ -120,22 +119,24 @@ def forward(self, x1: Tensor, x2: Tensor, diag: bool = False, last_dim_is_batch:
         if not hasattr(self, "randn_weights"):
             self._init_weights(num_dims, self.num_samples)
         x1_eq_x2 = torch.equal(x1, x2)
-        z1 = self._featurize(x1, normalize=False)
+        # Always use normalized features (scaled by 1/sqrt(D)) to ensure consistent
+        # feature matrices regardless of whether x1 == x2 or not. This is important
+        # for LinearPredictionStrategy, which extracts features from the LinearOperator.
+        z1 = self._featurize(x1, normalize=True)
         if not x1_eq_x2:
-            z2 = self._featurize(x2, normalize=False)
+            z2 = self._featurize(x2, normalize=True)
         else:
             z2 = z1
-        D = float(self.num_samples)
         if diag:
-            return (z1 * z2).sum(-1) / D
+            return (z1 * z2).sum(-1)
         if x1_eq_x2:
             # Exploit low rank structure, if there are fewer features than data points
             if z1.size(-1) < z2.size(-2):
-                return LowRankRootLinearOperator(z1 / math.sqrt(D))
+                return LowRankRootLinearOperator(z1)
             else:
-                return RootLinearOperator(z1 / math.sqrt(D))
+                return RootLinearOperator(z1)
         else:
-            return MatmulLinearOperator(z1 / D, z2.transpose(-1, -2))
+            return MatmulLinearOperator(z1, z2.transpose(-1, -2))
 
     def _featurize(self, x: Tensor, normalize: bool = False) -> Tensor:
         # Recompute division each time to allow backprop through lengthscale
 
@@ -8,8 +8,11 @@
 from copy import deepcopy
 
 import torch
+from linear_operator.operators import LinearOperator
 from torch import Tensor
 
+from gpytorch.distributions import Distribution
+
 from .. import settings
 from ..distributions import MultitaskMultivariateNormal, MultivariateNormal
 from ..likelihoods import _GaussianLikelihoodBase
@@ -300,7 +303,7 @@ def __call__(self, *args, **kwargs):
 
             # Get the terms that only depend on training data
             if self.prediction_strategy is None:
-                train_output = super().__call__(*train_inputs, **kwargs)
+                train_output = self._get_train_prior_distribution(train_inputs, **kwargs)
 
                 # Create the prediction strategy for
                 self.prediction_strategy = prediction_strategy(
@@ -309,41 +312,110 @@ def __call__(self, *args, **kwargs):
                     train_labels=self.train_targets,
                     likelihood=self.likelihood,
                 )
-
-            # Concatenate the input to the training input
-            full_inputs = []
-            batch_shape = train_inputs[0].shape[:-2]
-            for train_input, input in length_safe_zip(train_inputs, inputs):
-                # Make sure the batch shapes agree for training/test data
-                if batch_shape != train_input.shape[:-2]:
-                    batch_shape = torch.broadcast_shapes(batch_shape, train_input.shape[:-2])
-                    train_input = train_input.expand(*batch_shape, *train_input.shape[-2:])
-                if batch_shape != input.shape[:-2]:
-                    batch_shape = torch.broadcast_shapes(batch_shape, input.shape[:-2])
-                    train_input = train_input.expand(*batch_shape, *train_input.shape[-2:])
-                    input = input.expand(*batch_shape, *input.shape[-2:])
-                full_inputs.append(torch.cat([train_input, input], dim=-2))
-
-            # Get the joint distribution for training/test data
-            full_output = super().__call__(*full_inputs, **kwargs)
-            if settings.debug.on():
-                if not isinstance(full_output, MultivariateNormal):
-                    raise RuntimeError("ExactGP.forward must return a MultivariateNormal")
-            full_mean, full_covar = full_output.loc, full_output.lazy_covariance_matrix
-
-            # Determine the shape of the joint distribution
-            batch_shape = full_output.batch_shape
-            joint_shape = full_output.event_shape
-            tasks_shape = joint_shape[1:]  # For multitask learning
-            test_shape = torch.Size([joint_shape[0] - self.prediction_strategy.train_shape[0], *tasks_shape])
-
+            (
+                test_mean,
+                test_test_covar,
+                test_train_covar,
+                batch_shape,
+                test_shape,
+                posterior_class,
+            ) = self._get_test_prior_mean_and_covariances(train_inputs=train_inputs, test_inputs=inputs, **kwargs)
             # Make the prediction
             with settings.cg_tolerance(settings.eval_cg_tolerance.value()):
-                (
-                    predictive_mean,
-                    predictive_covar,
-                ) = self.prediction_strategy.exact_prediction(full_mean, full_covar)
+                (predictive_mean, predictive_covar,) = self.prediction_strategy.exact_prediction(
+                    test_mean=test_mean,
+                    test_test_covar=test_test_covar,
+                    test_train_covar=test_train_covar,
+                )
 
             # Reshape predictive mean to match the appropriate event shape
             predictive_mean = predictive_mean.view(*batch_shape, *test_shape).contiguous()
-            return full_output.__class__(predictive_mean, predictive_covar)
+            return posterior_class(predictive_mean, predictive_covar)
+
+    def _get_train_prior_distribution(self, train_inputs: Iterable[Tensor], **kwargs) -> MultivariateNormal:
+        """Evaluate the model prior at the training inputs.
+
+        This is the hook ``__call__`` uses to obtain the train-train prior when it
+        builds the prediction strategy; subclasses may override it to change how
+        that prior is computed.
+
+        Args:
+            train_inputs: The inputs in the training set, unpacked positionally
+                into the parent class's ``__call__``.
+            kwargs: Extra keyword arguments forwarded unchanged to that call.
+
+        Returns:
+            The prior over the training inputs, as returned by the parent ``__call__``.
+        """
+        train_prior = super().__call__(*train_inputs, **kwargs)
+        return train_prior
+
+    def _get_test_prior_mean_and_covariances(
+        self,
+        train_inputs: Iterable[Tensor | LinearOperator],
+        test_inputs: Iterable[Tensor | LinearOperator],
+        **kwargs,
+    ) -> tuple[Tensor, LinearOperator, LinearOperator, torch.Size, torch.Size, type[Distribution]]:
+        """Computes the prior mean and covariances on the test set.
+
+        Override this method to customize test-set covariance computations, e.g.,
+        for models with partial observations or per-component additive inference.
+
+        The returned covariances may have additional leading batch dimensions
+        (e.g., for additive component-wise inference). The prediction strategy
+        handles broadcasting with the train-train covariance.
+
+        Note: ``self.prediction_strategy`` must already be set; its ``train_shape``
+        and ``num_train`` are used to locate the test entries. Slicing the lazy
+        joint covariance is intended to avoid ever computing K(train, train).
+
+        Args:
+            train_inputs: The training inputs (indexed, so pass a sequence).
+            test_inputs: The test inputs.
+            kwargs: Additional keyword arguments passed to the model's forward.
+
+        Returns:
+            A tuple of (test_mean, test_test_covar, test_train_covar, batch_shape,
+            test_shape, posterior_class).
+
+        Raises:
+            RuntimeError: In debug mode, if the joint output is not a MultivariateNormal.
+        """
+        # Concatenate the input to the training input
+        full_inputs = []
+        batch_shape = train_inputs[0].shape[:-2]
+        for train_input, test_input in length_safe_zip(train_inputs, test_inputs):
+            # Make sure the batch shapes agree for training/test data
+            if batch_shape != train_input.shape[:-2]:
+                batch_shape = torch.broadcast_shapes(batch_shape, train_input.shape[:-2])
+                train_input = train_input.expand(*batch_shape, *train_input.shape[-2:])
+            if batch_shape != test_input.shape[:-2]:
+                batch_shape = torch.broadcast_shapes(batch_shape, test_input.shape[:-2])
+                train_input = train_input.expand(*batch_shape, *train_input.shape[-2:])
+                test_input = test_input.expand(*batch_shape, *test_input.shape[-2:])
+            full_inputs.append(torch.cat([train_input, test_input], dim=-2))
+
+        # Get joint distribution (lazy when settings.lazily_evaluate_kernels is True)
+        full_output = super().__call__(*full_inputs, **kwargs)
+        if settings.debug.on() and not isinstance(full_output, MultivariateNormal):
+            raise RuntimeError("ExactGP.forward must return a MultivariateNormal")
+        joint_mean, joint_covar = full_output.loc, full_output.lazy_covariance_matrix
+
+        # Determine the shape of the joint distribution
+        batch_shape = full_output.batch_shape
+        joint_shape = full_output.event_shape
+        tasks_shape = joint_shape[1:]  # For multitask learning
+        test_shape = torch.Size([joint_shape[0] - self.prediction_strategy.train_shape[0], *tasks_shape])
+
+        # Find the components of the distribution that contain test data
+        num_train = self.prediction_strategy.num_train
+        test_mean = joint_mean[..., num_train:]
+
+        # Extract test covariances. Slicing is lazy; K(train, train) is never computed.
+        # evaluate_kernel() converts to the linear operator type needed by prediction.
+        # NOTE: We must slice row and column indices together (not sequentially) for
+        # compatibility with BlockInterleavedLinearOperator used in multitask GPs.
+        test_test_covar = joint_covar[..., num_train:, num_train:].evaluate_kernel()
+        test_train_covar = joint_covar[..., num_train:, :num_train].evaluate_kernel()
+        posterior_class = full_output.__class__
+        return (test_mean, test_test_covar, test_train_covar, batch_shape, test_shape, posterior_class)