
Commit 4daa523

jeffkbkim authored and facebook-github-bot committed
RecMetricModule: apply reduction function before gloo all gathers (meta-pytorch#3593)
Summary:

metric_module's get_pre_compute_states() provides an API to perform gloo all gathers instead of the default torchmetrics.Metric sync_dist (nccl). However, the mechanism calls a gloo all gather for each element in a list of tensors. This is problematic because:

- AUC's 3 state tensors each hold a list of tensors, not a single tensor.
- The size of the tensor list is theoretically unbounded (in practice, it can grow to the order of 100K).
- gloo all gathers are inherently much slower.

Instead, this patch:

- applies the reduction function prior to the all gather when processing a tensor list
- enforces that the reduction_fn does not rely on ordering

Differential Revision: D88297404
1 parent 1b69fd6 commit 4daa523
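
The "does not rely on ordering" requirement can be made concrete with a small standalone sketch (illustrative only, not part of the patch): an associative, order-insensitive reduction such as sum gives the same result whether it is applied once over all shards or locally per rank and again over the gathered results, while a mean does not.

# Illustrative sketch (not part of the patch): the reduction applied locally
# and again on the gathered results must match reducing everything at once.
import torch

rank0 = [torch.tensor([1.0, 2.0]), torch.tensor([3.0])]  # rank 0's tensor-list state
rank1 = [torch.tensor([10.0])]                           # rank 1's tensor-list state

def sum_reduction(tensors):
    # Associative and commutative: safe to apply before the all gather.
    return torch.cat(tensors).sum().reshape(1)

direct = sum_reduction(rank0 + rank1)                                   # 16.0
two_level = sum_reduction([sum_reduction(rank0), sum_reduction(rank1)])
assert torch.equal(direct, two_level)

def mean_reduction(tensors):
    # NOT associative across unevenly sized shards: a mean of per-rank means
    # is wrong, so such a reduction must not be applied before the gather.
    return torch.cat(tensors).mean().reshape(1)

assert not torch.equal(
    mean_reduction(rank0 + rank1),                                      # 16 / 4 = 4.0
    mean_reduction([mean_reduction(rank0), mean_reduction(rank1)]),     # (2 + 10) / 2 = 6.0
)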

File tree

2 files changed: +420 -27 lines

torchrec/metrics/metric_module.py

Lines changed: 141 additions & 23 deletions
@@ -21,9 +21,24 @@
 import torch.nn as nn
 from torch.distributed.tensor import DeviceMesh
 from torch.profiler import record_function
+from torchmetrics.utilities.data import (
+    dim_zero_cat,
+    dim_zero_max,
+    dim_zero_mean,
+    dim_zero_min,
+    dim_zero_sum,
+)
 from torchrec.metrics.accuracy import AccuracyMetric
-from torchrec.metrics.auc import AUCMetric
-from torchrec.metrics.auprc import AUPRCMetric
+from torchrec.metrics.auc import (
+    _grouping_keys_state_reduction,
+    _state_reduction,
+    AUCMetric,
+)
+from torchrec.metrics.auprc import (
+    _grouping_keys_state_reduction as auprc_grouping_keys_state_reduction,
+    _state_reduction as auprc_state_reduction,
+    AUPRCMetric,
+)
 from torchrec.metrics.average import AverageMetric
 from torchrec.metrics.cali_free_ne import CaliFreeNEMetric
 from torchrec.metrics.calibration import CalibrationMetric
@@ -58,23 +73,92 @@
 from torchrec.metrics.output import OutputMetric
 from torchrec.metrics.precision import PrecisionMetric
 from torchrec.metrics.precision_session import PrecisionSessionMetric
-from torchrec.metrics.rauc import RAUCMetric
+from torchrec.metrics.rauc import (
+    _grouping_keys_state_reduction as rauc_grouping_keys_state_reduction,
+    _state_reduction as rauc_state_reduction,
+    RAUCMetric,
+)
 from torchrec.metrics.rec_metric import RecMetric, RecMetricException, RecMetricList
 from torchrec.metrics.recall import RecallMetric
 from torchrec.metrics.recall_session import RecallSessionMetric
 from torchrec.metrics.scalar import ScalarMetric
-from torchrec.metrics.segmented_ne import SegmentedNEMetric
+from torchrec.metrics.segmented_ne import _state_reduction_sum, SegmentedNEMetric
 from torchrec.metrics.serving_calibration import ServingCalibrationMetric
 from torchrec.metrics.serving_ne import ServingNEMetric
 from torchrec.metrics.tensor_weighted_avg import TensorWeightedAvgMetric
 from torchrec.metrics.throughput import ThroughputMetric
-from torchrec.metrics.tower_qps import TowerQPSMetric
+from torchrec.metrics.tower_qps import _max_reduction, TowerQPSMetric
 from torchrec.metrics.unweighted_ne import UnweightedNEMetric
 from torchrec.metrics.weighted_avg import WeightedAvgMetric
 from torchrec.metrics.xauc import XAUCMetric


 logger: logging.Logger = logging.getLogger(__name__)
+
+# TorchRec-specific custom reduction functions.
+# These work correctly with the local+global reduction pattern.
+# Requirements: associative AND (commutative OR post-processing makes the result order-invariant).
+SAFE_CALLABLE_REDUCTIONS: frozenset[Any] = frozenset(
+    {
+        _state_reduction,  # Concatenation + AUC sorts data, making the final result order-invariant
+        _grouping_keys_state_reduction,  # Concatenation along dim=0 + sorting makes the result order-invariant
+        auprc_state_reduction,
+        auprc_grouping_keys_state_reduction,
+        rauc_state_reduction,
+        rauc_grouping_keys_state_reduction,
+        _state_reduction_sum,  # Sum along dimension 0.
+        _max_reduction,  # Max is associative and commutative.
+    }
+)
+
+# torchmetrics.Metric built-in reduction functions.
+# All dim_zero_* functions are both associative and commutative (dim_zero_cat is not
+# commutative, but torchmetrics.Metric also reduces before sync_dist to reduce the
+# number of collectives).
+TORCHMETRICS_REDUCTIONS: frozenset[Any] = frozenset(
+    {
+        dim_zero_sum,
+        dim_zero_mean,
+        dim_zero_max,
+        dim_zero_min,
+        dim_zero_cat,
+    }
+)
+
+
+def _validate_reduction_function(
+    reduction_fn: Union[str, Any, None],
+    state_name: str,
+    metric_namespace: str,
+) -> None:
+    """
+    Validate that a reduction function is safe for the local+global reduction pattern.
+
+    Only validates custom reduction functions. TorchMetrics built-in functions
+    (dim_zero_*) are skipped as they're safe by construction (all are associative
+    and commutative).
+
+    Mathematical Requirements:
+    1. **Associativity**: f([f([a,b]), f([c,d])]) = f([a,b,c,d])
+       - Required so local reduction + global reduction = direct reduction
+
+    2. **Commutativity**: f([a, b]) = f([b, a])
+       - Required so rank ordering doesn't affect the result
+       - OR the metric's computation must make the final result order-invariant
+         (e.g., AUC concatenates in rank order but sorts before computing, making
+         the final result order-invariant)
+    """
+    # Skip validation for None and torchmetrics.Metric built-in functions (safe by construction)
+    if reduction_fn is None or reduction_fn in TORCHMETRICS_REDUCTIONS:
+        return
+
+    # Validate custom callable reductions
+    if callable(reduction_fn):
+        if reduction_fn not in SAFE_CALLABLE_REDUCTIONS:
+            raise RecMetricException(
+                f"Unknown custom reduction '{reduction_fn}' for state '{state_name}' in '{metric_namespace}'. "
+                f"Must be associative: f([f([a,b]), f([c,d])]) == f([a,b,c,d]) "
+                f"AND commutative: f([a,b]) == f([b,a]) (or the metric makes the result order-invariant). "
+                f"Known safe custom reductions: {[f for f in SAFE_CALLABLE_REDUCTIONS if f not in TORCHMETRICS_REDUCTIONS]}. "
+                f"Add to SAFE_CALLABLE_REDUCTIONS if verified safe."
+            )

 REC_METRICS_MAPPING: Dict[RecMetricEnumBase, Type[RecMetric]] = {
     RecMetricEnum.NE: NEMetric,
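
A hypothetical usage sketch of the validator above (state and namespace names are made up; the functions come from the imports in this file): built-ins and allow-listed reductions pass, while any other callable is rejected.

# Hypothetical usage sketch; state/namespace names are illustrative only.
_validate_reduction_function(dim_zero_sum, "some_state", "ne")        # built-in: passes
_validate_reduction_function(None, "some_state", "ne")                # no reduction: passes
_validate_reduction_function(_state_reduction, "predictions", "auc")  # allow-listed: passes

try:
    # An ad-hoc callable is rejected even if it happens to be mathematically
    # safe, because membership in SAFE_CALLABLE_REDUCTIONS is the only proof
    # the validator accepts.
    _validate_reduction_function(lambda ts: ts, "some_state", "custom_metric")
except RecMetricException as exc:
    print(exc)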
@@ -218,6 +302,8 @@ def __init__(
         self.oom_count = 0
         self.compute_count = 0

+        self._validate_all_reduction_functions()
+
         self.compute_interval_steps = compute_interval_steps
         self.min_compute_interval = min_compute_interval
         self.max_compute_interval = max_compute_interval
@@ -240,6 +326,20 @@ def __init__(

         self._register_load_state_dict_pre_hook(self.load_state_dict_hook)

+    def _validate_all_reduction_functions(self) -> None:
+        """
+        Validate all reduction functions in rec_metrics during initialization.
+        This ensures that all reduction functions are safe for the local+global reduction pattern.
+        """
+        for metric in self.rec_metrics.rec_metrics:
+            for computation in metric._metrics_computations:  # pyre-ignore[16]
+                for state_name, reduction_fn in computation._reductions.items():  # pyre-ignore[16]
+                    _validate_reduction_function(
+                        reduction_fn,
+                        state_name,
+                        metric._namespace.value,  # pyre-ignore[16]
+                    )
+
     def load_state_dict_hook(
         self,
         state_dict: OrderedDict[str, torch.Tensor],
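
A toy sketch of the traversal this method performs, using stand-in objects instead of real RecMetrics (illustrative only):

# Toy sketch (illustrative only): stand-in objects mimic the attributes the
# traversal reads from real RecMetrics.
class FakeComputation:
    _reductions = {"predictions": _state_reduction, "labels": _state_reduction}

class FakeMetric:
    _metrics_computations = [FakeComputation()]
    class _namespace:  # stand-in for the real enum member
        value = "fake_auc"

for metric in [FakeMetric()]:
    for computation in metric._metrics_computations:
        for state_name, reduction_fn in computation._reductions.items():
            _validate_reduction_function(reduction_fn, state_name, metric._namespace.value)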
@@ -408,22 +508,24 @@ def _get_metric_states(
             # pyre-fixme[16]: Item `Tensor` of `Tensor | Module` has no attribute
             # `items`.
             for state_name, reduction_fn in computation._reductions.items():
-                tensor_or_list: Union[List[torch.Tensor], torch.Tensor] = getattr(
-                    computation, state_name
-                )
-
-                if isinstance(tensor_or_list, list):
-                    gathered = _all_gather_tensor_list(
-                        tensor_or_list, world_size, process_group
-                    )
-                else:
-                    gathered = torch.stack(
-                        _all_gather_tensor(tensor_or_list, world_size, process_group)
+                with record_function(f"## RecMetricModule: {state_name} all gather ##"):
+                    tensor_or_list: Union[List[torch.Tensor], torch.Tensor] = getattr(
+                        computation, state_name
                     )
-                reduced = (
-                    reduction_fn(gathered) if reduction_fn is not None else gathered
-                )
-                result[task.name][state_name] = reduced
+
+                    if isinstance(tensor_or_list, list):
+                        local_reduced = reduction_fn(tensor_or_list)
+                        gathered = _all_gather_tensor_list(
+                            local_reduced, world_size, process_group
+                        )
+                    else:
+                        gathered = torch.stack(
+                            _all_gather_tensor(
+                                tensor_or_list, world_size, process_group
+                            )
+                        )
+                    global_reduced = reduction_fn(gathered)
+                    result[task.name][state_name] = global_reduced

         return result
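
A single-process sketch of the new list-state path (the gloo gather is simulated, and cat_reduction stands in for the AUC-style _state_reduction; sizes are illustrative): the local reduction collapses the unbounded tensor list to one tensor per rank, so only one all gather is needed per state.

# Single-process sketch of the new flow for a list-valued state; the gloo
# gather is simulated. Sizes are illustrative.
import torch

def cat_reduction(state):              # stands in for an AUC-style _state_reduction
    return [torch.cat(state, dim=-1)]  # tensor list in, single-element list out

rank_states = [
    [torch.rand(1, 4) for _ in range(1_000)],  # previously: 1,000 gloo all gathers
    [torch.rand(1, 4) for _ in range(500)],    # previously: 500 more
]

# New path: each rank reduces first, so only one tensor per rank is gathered.
locally_reduced = [cat_reduction(s)[0] for s in rank_states]
global_reduced = cat_reduction(locally_reduced)[0]
assert global_reduced.shape == (1, 6_000)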

@@ -472,7 +574,8 @@ def get_pre_compute_states(
         # throughput metric requires special handling, since it's not a RecMetric
         throughput_metric = self.throughput_metric
         if throughput_metric is not None:
-            aggregated_states[throughput_metric._namespace.value] = (
+            # Merge in case there are rec metric namespaces that overlap with throughput metric namespace
+            aggregated_states.setdefault(throughput_metric._namespace.value, {}).update(
                 self._get_throughput_metric_states(throughput_metric)
             )
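
The setdefault/update merge matters when a rec metric namespace collides with the throughput namespace; plain assignment would silently drop the rec metric's states. A minimal illustration with made-up keys:

# Why setdefault(...).update(...) instead of plain assignment: if a rec metric
# already wrote states under the same namespace, assignment would silently
# drop them, while merging keeps both. Keys below are illustrative.
aggregated_states = {"throughput": {"rec_metric_state": 1}}
throughput_states = {"total_examples": 2}

aggregated_states.setdefault("throughput", {}).update(throughput_states)
assert aggregated_states == {
    "throughput": {"rec_metric_state": 1, "total_examples": 2}
}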

@@ -666,8 +769,23 @@ def _all_gather_tensor_list(
     world_size: int,
     pg: Union[dist.ProcessGroup, DeviceMesh],
 ) -> List[torch.Tensor]:
-    """All-gather every tensor in a list and flatten the result."""
-    gathered: List[torch.Tensor] = []  # pragma: no cover
+    """
+    All-gather every tensor in a list and flatten the result.
+
+    Note: In the current implementation with local reduction in _get_metric_states,
+    this function should only receive a list with at most 1 tensor after local reduction.
+    """
+    if not tensors:
+        return []
+
+    # After local reduction in _get_metric_states, tensors should contain at most 1 element
+    if len(tensors) > 1:
+        raise ValueError(
+            f"_all_gather_tensor_list expected at most 1 tensor after local reduction, "
+            f"but received {len(tensors)} tensors. This indicates a bug in _get_metric_states."
+        )
+
+    gathered: List[torch.Tensor] = []
     for t in tensors:
         gathered.extend(_all_gather_tensor(t, world_size, pg))
     return gathered
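
A sanity sketch of the invariant the new guard enforces, assuming (as the guard implies) that list-state reductions such as the AUC-style _state_reduction return a single-element list:

# Assumption: _state_reduction (imported from torchrec.metrics.auc above)
# collapses a tensor list into a single-element list, so the gather loop in
# _all_gather_tensor_list runs at most once per state.
import torch

state = [torch.rand(1, 4) for _ in range(3)]
reduced = _state_reduction(state)
assert len(reduced) == 1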
