Skip to content

Commit d307fa5

Browse files
committed
improve performance notebook
1 parent d812baa commit d307fa5

File tree

4 files changed

+1041
-375
lines changed

4 files changed

+1041
-375
lines changed

docs/ndindex_performance.ipynb

Lines changed: 748 additions & 371 deletions
Large diffs are not rendered by default.
Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
"""Benchmark utilities for linked_indices performance testing.
2+
3+
This module provides helpers for rigorous performance measurement using
4+
Python's timeit module.
5+
"""
6+
7+
from __future__ import annotations

import timeit
from collections.abc import Callable
from typing import Any

import numpy as np
14+
__all__ = ["timeit_benchmark"]
15+
16+
17+
def timeit_benchmark(
    stmt: str | Callable[[], Any],
    setup: str = "pass",
    globals: dict[str, Any] | None = None,  # noqa: A002 — mirrors timeit.Timer's own parameter name
    repeat: int = 7,
) -> dict[str, float | int]:
    """
    Benchmark a statement or callable using timeit with automatic loop count.

    Uses timeit's autorange to determine an appropriate number of loops,
    then runs multiple trials to get reliable timing statistics.

    Parameters
    ----------
    stmt : str or callable
        The statement or zero-argument callable to benchmark.
    setup : str
        Setup code to run once before the benchmark. Default: "pass"
    globals : dict, optional
        Global namespace for the benchmark. Required when stmt uses
        variables from the calling scope.
    repeat : int
        Number of trials to run. Must be at least 1.
        Default: 7 (timeit default)

    Returns
    -------
    dict
        Dictionary with timing statistics:
        - best_ms: Minimum time per call in milliseconds (best measure of algorithm cost)
        - mean_ms: Mean time per call in milliseconds (typical real-world performance)
        - std_ms: Standard deviation of times in milliseconds
        - n_loops: Number of loops per trial (determined by autorange)

    Raises
    ------
    ValueError
        If ``repeat`` is less than 1 (statistics over zero trials are
        undefined).

    Examples
    --------
    >>> from linked_indices.benchmark_utils import timeit_benchmark
    >>> import numpy as np
    >>> arr = np.random.randn(1000)
    >>> result = timeit_benchmark(
    ...     lambda: np.sum(arr),
    ...     globals={"np": np, "arr": arr}
    ... )
    >>> result["best_ms"] < 1.0  # Should be fast
    True
    """
    if repeat < 1:
        raise ValueError(f"repeat must be >= 1, got {repeat}")

    timer = timeit.Timer(stmt, setup=setup, globals=globals)

    # Let timeit determine an appropriate number of loops
    # (autorange targets roughly 0.2 s of total work per trial)
    n_loops, _ = timer.autorange()

    # Run multiple trials; each entry is the total time for n_loops calls
    times = timer.repeat(repeat=repeat, number=n_loops)
    times_per_call = np.array(times) / n_loops

    return {
        "best_ms": float(times_per_call.min() * 1000),
        "mean_ms": float(times_per_call.mean() * 1000),
        "std_ms": float(times_per_call.std() * 1000),
        "n_loops": n_loops,
    }

src/linked_indices/example_data.py

Lines changed: 213 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,11 @@
2525
"multi_interval_dataset",
2626
"onset_duration_dataset",
2727
"trial_based_dataset",
28+
# NDIndex benchmark dataset generators
29+
"create_trial_ndindex_dataset",
30+
"create_diagonal_dataset",
31+
"create_radial_dataset",
32+
"create_jittered_dataset",
2833
]
2934

3035

@@ -680,3 +685,211 @@ def trial_based_dataset(
680685
)
681686

682687
return ds
688+
689+
690+
# =============================================================================
691+
# NDIndex benchmark dataset generators
692+
# =============================================================================
693+
694+
695+
def create_trial_ndindex_dataset(n_trials: int, n_times: int) -> "xr.Dataset":
    """
    Build a trial-structured dataset where abs_time = trial_onset + rel_time.

    This models the typical neuroscience recording layout: every trial
    shares the same relative time axis, while each trial occupies its own
    range of absolute time. The returned dataset already has an NDIndex
    attached to the 2-D abs_time coordinate.

    Parameters
    ----------
    n_trials : int
        Number of trials.
    n_times : int
        Number of time points per trial.

    Returns
    -------
    xr.Dataset
        Dataset with NDIndex set on the abs_time coordinate.

    Examples
    --------
    >>> from linked_indices.example_data import create_trial_ndindex_dataset
    >>> ds = create_trial_ndindex_dataset(10, 100)
    >>> ds.sel(abs_time=0.5, method="nearest")  # Select by absolute time
    """
    import xarray as xr

    from linked_indices import NDIndex

    # Evenly spaced onsets tile absolute time trial-after-trial.
    onsets = np.arange(n_trials) * n_times * 0.01
    within_trial = np.linspace(0, n_times * 0.01, n_times)

    # Broadcast (n_trials, 1) + (1, n_times) -> (n_trials, n_times).
    absolute = onsets[:, np.newaxis] + within_trial[np.newaxis, :]
    values = np.random.randn(n_trials, n_times)

    coords = {
        "trial": np.arange(n_trials),
        "rel_time": within_trial,
        "abs_time": (["trial", "rel_time"], absolute),
    }
    dataset = xr.Dataset({"data": (["trial", "rel_time"], values)}, coords=coords)
    return dataset.set_xindex(["abs_time"], NDIndex)
739+
740+
741+
def create_diagonal_dataset(ny: int, nx: int) -> "xr.Dataset":
    """
    Build an image-like dataset carrying a diagonal-gradient coordinate.

    Mirrors the slicing-gallery example: derived[y, x] = y_offset[y] + x_coord[x],
    structurally similar to trial data but with image scale/semantics.
    The returned dataset already has an NDIndex attached to ``derived``.

    Parameters
    ----------
    ny : int
        Number of y (row) coordinates.
    nx : int
        Number of x (column) coordinates.

    Returns
    -------
    xr.Dataset
        Dataset with NDIndex set on the derived coordinate.

    Examples
    --------
    >>> from linked_indices.example_data import create_diagonal_dataset
    >>> ds = create_diagonal_dataset(100, 100)
    >>> ds.sel(derived=50, method="nearest")
    """
    import xarray as xr

    from linked_indices import NDIndex

    rows = np.arange(ny)
    cols = np.arange(nx)

    # Each successive row is offset by 2, producing a diagonal gradient.
    gradient = np.add.outer(rows * 2, cols)
    values = np.random.randn(ny, nx)

    dataset = xr.Dataset(
        {"data": (["y", "x"], values)},
        coords={
            "y": rows,
            "x": cols,
            "derived": (["y", "x"], gradient),
        },
    )
    return dataset.set_xindex(["derived"], NDIndex)
788+
789+
790+
def create_radial_dataset(ny: int, nx: int) -> "xr.Dataset":
    """
    Build an image-like dataset carrying a radial (non-linear 2-D) coordinate.

    Exercises performance with a non-monotonic coordinate pattern: the
    ``radius`` coordinate is each pixel's distance from the array center.
    The returned dataset already has an NDIndex attached to ``radius``.

    Parameters
    ----------
    ny : int
        Number of y (row) coordinates.
    nx : int
        Number of x (column) coordinates.

    Returns
    -------
    xr.Dataset
        Dataset with NDIndex set on the radius coordinate.

    Examples
    --------
    >>> from linked_indices.example_data import create_radial_dataset
    >>> ds = create_radial_dataset(100, 100)
    >>> ds.sel(radius=slice(10, 20))  # Select an annulus
    """
    import xarray as xr

    from linked_indices import NDIndex

    # Signed offsets from the array center, shaped for broadcasting
    # (rows as a column vector, columns as a row vector).
    row_off = (np.arange(ny) - ny // 2)[:, np.newaxis]
    col_off = (np.arange(nx) - nx // 2)[np.newaxis, :]
    distance = np.sqrt(col_off**2 + row_off**2)
    values = np.random.randn(ny, nx)

    dataset = xr.Dataset(
        {"data": (["y", "x"], values)},
        coords={
            "y": np.arange(ny),
            "x": np.arange(nx),
            "radius": (["y", "x"], distance),
        },
    )
    return dataset.set_xindex(["radius"], NDIndex)
834+
835+
836+
def create_jittered_dataset(
    n_trials: int, n_times: int, jitter_std: float = 0.1
) -> "xr.Dataset":
    """
    Create trial dataset with per-trial timing jitter.

    More realistic: trial onsets have random variation, and sampling
    times have small per-sample jitter (like real physiological recordings).
    Returns a dataset with NDIndex already set on abs_time.

    The dataset is reproducible: a fixed-seed local random generator is
    used, so repeated calls with the same arguments yield the same data.

    Parameters
    ----------
    n_trials : int
        Number of trials.
    n_times : int
        Number of time points per trial.
    jitter_std : float
        Standard deviation of timing jitter. Default: 0.1

    Returns
    -------
    xr.Dataset
        Dataset with NDIndex set on abs_time coordinate.

    Examples
    --------
    >>> from linked_indices.example_data import create_jittered_dataset
    >>> ds = create_jittered_dataset(10, 100, jitter_std=0.2)
    >>> ds.sel(abs_time=0.5, method="nearest")
    """
    import xarray as xr

    from linked_indices import NDIndex

    # Local seeded Generator instead of np.random.seed(42): results stay
    # reproducible without clobbering the *global* NumPy RNG state, which
    # would silently affect every later np.random call made by the caller.
    rng = np.random.default_rng(42)

    # Trial onsets with jitter
    base_onsets = np.arange(n_trials) * n_times * 0.01
    trial_onsets = base_onsets + rng.standard_normal(n_trials) * jitter_std
    trial_onsets[0] = 0  # First trial starts at 0

    # Per-sample timing jitter within each trial
    base_rel_time = np.linspace(0, n_times * 0.01, n_times)
    rel_time_jitter = rng.standard_normal((n_trials, n_times)) * (jitter_std * 0.01)

    # 2D absolute time with jitter
    abs_time = (
        trial_onsets[:, np.newaxis] + base_rel_time[np.newaxis, :] + rel_time_jitter
    )
    data = rng.standard_normal((n_trials, n_times))

    ds = xr.Dataset(
        {"data": (["trial", "rel_time"], data)},
        coords={
            "trial": np.arange(n_trials),
            "rel_time": base_rel_time,
            "abs_time": (["trial", "rel_time"], abs_time),
        },
    )
    return ds.set_xindex(["abs_time"], NDIndex)

src/linked_indices/nd_index.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -192,15 +192,15 @@ def _find_indices_for_value(
192192
# Find flat index of closest value
193193
flat_idx = np.argmin(np.abs(values - value))
194194
else:
195-
# Exact match required
196-
matches = np.where(values == value)
197-
if len(matches[0]) == 0:
195+
# Exact match required - use flatnonzero for efficiency
196+
flat_matches = np.flatnonzero(values == value)
197+
if len(flat_matches) == 0:
198198
raise KeyError(
199199
f"Value {value!r} not found in coordinate {coord_name!r}. "
200200
f"Use method='nearest' for approximate matching."
201201
)
202202
# Use the first match
203-
flat_idx = np.ravel_multi_index(tuple(m[0] for m in matches), values.shape)
203+
flat_idx = flat_matches[0]
204204

205205
# Convert to multi-dimensional indices
206206
indices = np.unravel_index(flat_idx, values.shape)

0 commit comments

Comments
 (0)