Skip to content

Commit a0d5c48

Browse files
committed
Merge branch 'main' into joer/static-temporal-only
2 parents 174963a + 231384a commit a0d5c48

15 files changed

Lines changed: 720 additions & 117 deletions

olmoearth_pretrain/evals/datasets/__init__.py

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
"""OlmoEarth Pretrain eval datasets."""
22

33
import logging
4+
from typing import Any
45

56
from olmo_core.config import StrEnum
67
from torch.utils.data import Dataset
@@ -14,6 +15,7 @@
1415
from .mados_dataset import MADOSDataset
1516
from .normalize import NormMethod
1617
from .pastis_dataset import PASTISRDataset
18+
from .pretrain_subset import PretrainSubsetDataset
1719
from .rslearn_dataset import from_registry_entry
1820

1921
logger = logging.getLogger(__name__)
@@ -40,9 +42,19 @@ def get_eval_dataset(
4042
# Default to 2std no clip - this matches what our model sees in pretraining,
4143
# so when using dataset stats (e.g. for MADOS) consistency is important.
4244
norm_method: str = NormMethod.NORM_NO_CLIP_2_STD,
45+
**kwargs: Any,
4346
) -> Dataset:
4447
"""Retrieve an eval dataset from the dataset name."""
45-
if eval_dataset.startswith("m-"):
48+
if eval_dataset == "pretrain_subset":
49+
return PretrainSubsetDataset(
50+
h5py_dir=kwargs["h5py_dir"],
51+
training_modalities=kwargs.get("training_modalities", input_modalities),
52+
max_samples=kwargs.get("max_samples", 512),
53+
patch_size=kwargs.get("pretrain_patch_size", 4),
54+
hw_p=kwargs.get("pretrain_hw_p", 8),
55+
seed=kwargs.get("pretrain_seed", 42),
56+
)
57+
elif eval_dataset.startswith("m-"):
4658
# m- == "modified for geobench"
4759
return GeobenchDataset(
4860
geobench_dir=paths.GEOBENCH_DIR,

olmoearth_pretrain/evals/datasets/configs.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,18 @@ def from_dict(cls, d: dict[str, Any]) -> "EvalDatasetConfig":
4949

5050

5151
DATASET_TO_CONFIG = {
52+
# Dummy config — only used for embedding diagnostics, not actual classification.
53+
"pretrain_subset": EvalDatasetConfig(
54+
task_type=TaskType.CLASSIFICATION,
55+
imputes=[],
56+
num_classes=1,
57+
is_multilabel=False,
58+
supported_modalities=[
59+
Modality.SENTINEL2_L2A.name,
60+
Modality.SENTINEL1.name,
61+
Modality.LANDSAT.name,
62+
],
63+
),
5264
"m-eurosat": EvalDatasetConfig(
5365
task_type=TaskType.CLASSIFICATION,
5466
imputes=[],
Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
"""Eval dataset adapter that loads a subset of pretraining data.
2+
3+
Wraps OlmoEarthDataset to expose the eval dataset interface
4+
(returns MaskedOlmoEarthSample, dummy_label) so it can be used
5+
with the downstream evaluator callback for embedding diagnostics.
6+
"""
7+
8+
from __future__ import annotations
9+
10+
import logging
11+
12+
import numpy as np
13+
import torch
14+
from torch.utils.data import Dataset
15+
from upath import UPath
16+
17+
from olmoearth_pretrain.data.dataset import GetItemArgs, OlmoEarthDataset
18+
from olmoearth_pretrain.datatypes import MaskedOlmoEarthSample
19+
20+
logger = logging.getLogger(__name__)
21+
22+
DEFAULT_PATCH_SIZE = 4
23+
DEFAULT_HW_P = 8
24+
DEFAULT_MAX_SAMPLES = 512
25+
26+
27+
class PretrainSubsetDataset(Dataset):
    """Adapter exposing OlmoEarthDataset through the eval-dataset interface.

    Each item is a (MaskedOlmoEarthSample, dummy_label) pair so the
    downstream evaluator callback can consume pretraining data for
    embedding diagnostics. A fixed, seeded subset of indices keeps
    results reproducible across runs.
    """

    def __init__(
        self,
        h5py_dir: str,
        training_modalities: list[str],
        max_samples: int = DEFAULT_MAX_SAMPLES,
        patch_size: int = DEFAULT_PATCH_SIZE,
        hw_p: int = DEFAULT_HW_P,
        seed: int = 42,
    ) -> None:
        """Initialize with a fixed reproducible subset of training indices."""
        self.patch_size = patch_size
        self.hw_p = hw_p
        self.max_samples = max_samples

        self._dataset = OlmoEarthDataset(
            h5py_dir=UPath(h5py_dir),
            training_modalities=training_modalities,
            dtype=np.float32,
            normalize=True,
        )
        self._dataset.prepare()

        # Seeded legacy RandomState so the sampled subset is identical
        # across runs for the same (dataset, seed) pair.
        total = len(self._dataset)
        subset_size = min(max_samples, total)
        rng = np.random.RandomState(seed)
        self._indices = rng.choice(total, size=subset_size, replace=False).tolist()

    def __len__(self) -> int:
        """Return the number of samples in the fixed subset."""
        return len(self._indices)

    def __getitem__(self, idx: int) -> tuple[MaskedOlmoEarthSample, torch.Tensor]:
        """Return (MaskedOlmoEarthSample, dummy_label) for the given index."""
        item_args = GetItemArgs(
            idx=self._indices[idx],
            patch_size=self.patch_size,
            sampled_hw_p=self.hw_p,
        )
        _, sample = self._dataset[item_args]
        # Label is a placeholder: this dataset only feeds embedding
        # diagnostics, never an actual classification head.
        return (
            MaskedOlmoEarthSample.from_olmoearthsample(sample),
            torch.tensor(0, dtype=torch.long),
        )
Lines changed: 203 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,203 @@
1+
"""Embedding quality diagnostics for detecting representation collapse.
2+
3+
Computes geometry metrics on embedding matrices to diagnose failure modes
4+
in self-supervised pretraining (dimensional collapse, crowding, etc.).
5+
6+
Supports two embedding shapes:
7+
- [N, D]: image-level (classification). One embedding per sample.
8+
- [N, P, D] or [N, H, W, D]: patch-level (segmentation). Multiple patches per sample.
9+
Computes global, inter-sample, and intra-sample diagnostics.
10+
11+
Can be used standalone on any embedding tensor, or integrated
12+
into the eval pipeline via the evaluator callback.
13+
"""
14+
15+
from __future__ import annotations
16+
17+
import logging
18+
19+
import torch
20+
from torch import Tensor
21+
22+
logger = logging.getLogger(__name__)
23+
24+
MAX_PAIRWISE_SAMPLES = 2048
25+
MAX_SVD_SAMPLES = 4096
26+
MAX_INTRA_SAMPLE_IMAGES = 256
27+
28+
29+
def effective_rank(embeddings: Tensor) -> float:
    """Effective rank via Shannon entropy of singular values.

    Returns a value between 1 (full collapse) and min(N, D) (maximally spread).
    Roy & Bhattacharyya (2007).
    """
    num_rows = embeddings.shape[0]
    # Cap the SVD cost on very large batches by subsampling rows.
    if num_rows > MAX_SVD_SAMPLES:
        keep = torch.randperm(num_rows, device=embeddings.device)[:MAX_SVD_SAMPLES]
        embeddings = embeddings[keep]

    singular = torch.linalg.svdvals(embeddings.float())
    singular = singular[singular > 0]
    if singular.numel() == 0:
        return 0.0

    # exp(entropy) of the normalized spectrum is the effective rank.
    probs = singular / singular.sum()
    entropy = -(probs * probs.log()).sum()
    return entropy.exp().item()
46+
47+
48+
def uniformity(embeddings: Tensor, t: float = 2.0) -> float:
    """Uniformity metric (Wang & Isola 2020). More negative = more uniform."""
    unit = torch.nn.functional.normalize(embeddings.float(), dim=-1)
    count = unit.shape[0]
    # Subsample to bound the O(n^2) pairwise-distance cost.
    if count > MAX_PAIRWISE_SAMPLES:
        keep = torch.randperm(count, device=unit.device)[:MAX_PAIRWISE_SAMPLES]
        unit = unit[keep]
        count = MAX_PAIRWISE_SAMPLES

    # Unique pairs only: strictly-upper-triangular mask over the
    # squared-distance matrix.
    pair_sq = torch.cdist(unit, unit, p=2).pow(2)
    upper = torch.triu(
        torch.ones(count, count, device=unit.device, dtype=torch.bool), diagonal=1
    )
    return torch.log(torch.exp(-t * pair_sq[upper]).mean()).item()
60+
61+
62+
def pairwise_cosine_stats(embeddings: Tensor) -> dict[str, float]:
    """Pairwise cosine similarity stats. High mean + low std = crowding."""
    unit = torch.nn.functional.normalize(embeddings.float(), dim=-1)
    count = unit.shape[0]
    # Subsample to bound the O(n^2) similarity matrix.
    if count > MAX_PAIRWISE_SAMPLES:
        keep = torch.randperm(count, device=unit.device)[:MAX_PAIRWISE_SAMPLES]
        unit = unit[keep]
        count = MAX_PAIRWISE_SAMPLES

    # Cosine similarity of unit vectors is a plain dot product;
    # keep only the strictly-upper-triangular (unique) pairs.
    upper = torch.triu(
        torch.ones(count, count, device=unit.device, dtype=torch.bool), diagonal=1
    )
    pair_sims = (unit @ unit.T)[upper]
    return {
        "cosine_sim_mean": pair_sims.mean().item(),
        "cosine_sim_std": pair_sims.std().item(),
        "cosine_sim_min": pair_sims.min().item(),
        "cosine_sim_max": pair_sims.max().item(),
    }
79+
80+
81+
def embedding_norm_stats(embeddings: Tensor) -> dict[str, float]:
    """L2 norm statistics across samples."""
    lengths = embeddings.float().norm(dim=-1)
    reductions = {
        "norm_mean": lengths.mean(),
        "norm_std": lengths.std(),
        "norm_min": lengths.min(),
        "norm_max": lengths.max(),
    }
    return {key: value.item() for key, value in reductions.items()}
90+
91+
92+
def compute_embedding_diagnostics(embeddings: Tensor) -> dict[str, float]:
    """Compute all embedding quality diagnostics on [N, D] embeddings."""
    if embeddings.ndim != 2:
        raise ValueError(f"Expected 2D embeddings [N, D], got shape {embeddings.shape}")
    num_samples, dim = embeddings.shape
    if num_samples < 2:
        logger.warning("Need at least 2 samples for embedding diagnostics")
        return {}

    metrics: dict[str, float] = {
        "effective_rank": effective_rank(embeddings),
        "embedding_dim": float(dim),
        "num_samples": float(num_samples),
        **embedding_norm_stats(embeddings),
    }

    # Pairwise metrics are only computed with at least 4 samples.
    if num_samples >= 4:
        metrics["uniformity"] = uniformity(embeddings)
        metrics.update(pairwise_cosine_stats(embeddings))

    return metrics
112+
113+
114+
def _compute_intra_sample_diagnostics(embeddings: Tensor) -> dict[str, float]:
    """Compute per-image patch diagnostics, averaged across images.

    Args:
        embeddings: [N, P, D] tensor where P is patches per image.

    Returns:
        Averaged per-image cosine-similarity and norm-spread metrics, or {}
        when there are no images or fewer than 2 patches per image.

    Measures whether patches within an image are diverse (good for segmentation)
    or collapsed (all patches identical = segmentation impossible).
    """
    n, p, _ = embeddings.shape
    if p < 2:
        logger.warning("Need at least 2 patches per image for intra-sample diagnostics")
        return {}
    if n == 0:
        # Reducing over an empty batch would yield NaN stats.
        logger.warning("Need at least 1 image for intra-sample diagnostics")
        return {}

    num_images = min(n, MAX_INTRA_SAMPLE_IMAGES)
    if num_images < n:
        idx = torch.randperm(n, device=embeddings.device)[:num_images]
        embeddings = embeddings[idx]

    # Batch cosine sim: normalize then bmm → [num_images, P, P]
    z = torch.nn.functional.normalize(embeddings.float(), dim=-1)
    sim_matrices = torch.bmm(z, z.transpose(1, 2))
    tri_mask = torch.triu(
        torch.ones(p, p, device=z.device, dtype=torch.bool), diagonal=1
    )

    # Vectorized over images: masking the last two dims yields
    # [num_images, num_pairs]. Avoids a Python loop with .item()
    # host syncs per image.
    per_image_sims = sim_matrices[:, tri_mask]
    cosine_sim_mean = per_image_sims.mean(dim=1).mean().item()
    cosine_sim_std = per_image_sims.std(dim=1).mean().item()

    # Spread of patch L2 norms within each image, averaged across images.
    norms = embeddings.float().norm(dim=-1)  # [num_images, P]
    norm_std = norms.std(dim=1).mean().item()  # [num_images] -> scalar

    return {
        "norm_std": norm_std,
        "num_patches": float(p),
        "num_images_sampled": float(num_images),
        "cosine_sim_mean": cosine_sim_mean,
        "cosine_sim_std": cosine_sim_std,
    }
160+
161+
162+
def compute_spatial_embedding_diagnostics(embeddings: Tensor) -> dict[str, float]:
    """Compute diagnostics for spatial (patch-level) embeddings.

    Accepts [N, *, D] where * is one or more spatial dims (e.g. [N, H, W, D]
    or [N, P, D]). Returns metrics with flat prefixes (global_, inter_, intra_)
    to avoid deep nesting in wandb.
    """
    if embeddings.ndim < 3:
        raise ValueError(
            f"Expected 3+ dim embeddings [N, *, D], got shape {embeddings.shape}"
        )

    num_images = embeddings.shape[0]
    dim = embeddings.shape[-1]
    patches = embeddings.reshape(num_images, -1, dim)
    patches_per_image = patches.shape[1]

    if num_images < 2:
        logger.warning("Need at least 2 samples for spatial embedding diagnostics")
        return {}

    metrics: dict[str, float] = {}

    # Global view: treat every patch as an independent sample,
    # subsampling when the flattened set is huge.
    flat = patches.reshape(-1, dim)
    if flat.shape[0] > MAX_SVD_SAMPLES:
        keep = torch.randperm(flat.shape[0], device=flat.device)[:MAX_SVD_SAMPLES]
        flat = flat[keep]
    metrics.update(
        {f"global_{key}": val for key, val in compute_embedding_diagnostics(flat).items()}
    )

    # Inter-sample view: mean-pool each image's patches to one [N, D] vector.
    pooled = patches.float().mean(dim=1)
    metrics.update(
        {f"inter_{key}": val for key, val in compute_embedding_diagnostics(pooled).items()}
    )

    # Intra-sample view: patch diversity within each image.
    if patches_per_image >= 2:
        metrics.update(
            {
                f"intra_{key}": val
                for key, val in _compute_intra_sample_diagnostics(patches).items()
            }
        )

    return metrics

0 commit comments

Comments
 (0)