Skip to content

Commit ef7c8ee

Browse files
jeffkbkim authored and meta-codesync[bot] committed
CPUOffloadedRecMetricModule: DtoHs in the update thread (meta-pytorch#3658)
Summary: Pull Request resolved: meta-pytorch#3658 CPUOffloadedRecMetricModule currently performs DtoH (nonblocking) from the main thread. This can start to become quite expensive when the order of magnitude of the model_out dict size is in the thousands, where each key stores a tensor with 1000+ elements. Instead of the main thread launching the DtoHs, have the update thread be responsible. This will free the main thread to continue training. Differential Revision: D87800947
1 parent e1d87a1 commit ef7c8ee

File tree

7 files changed

+149
-69
lines changed

7 files changed

+149
-69
lines changed

torchrec/metrics/auprc.py

Lines changed: 1 addition & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -8,12 +8,12 @@
88
# pyre-strict
99

1010
import logging
11-
from functools import partial
1211
from typing import Any, cast, Dict, List, Optional, Type
1312

1413
import torch
1514
import torch.distributed as dist
1615
import torch.nn.functional as F
16+
from torchrec.metrics.auc import _grouping_keys_state_reduction, _state_reduction
1717
from torchrec.metrics.metrics_config import RecComputeMode, RecTaskInfo
1818
from torchrec.metrics.metrics_namespace import MetricName, MetricNamespace, MetricPrefix
1919
from torchrec.metrics.rec_metric import (
@@ -157,14 +157,6 @@ def compute_auprc_per_group(
157157
return torch.cat(auprcs)
158158

159159

160-
def _state_reduction(state: List[torch.Tensor], dim: int = 1) -> List[torch.Tensor]:
161-
return [torch.cat(state, dim=dim)]
162-
163-
164-
# pyre-ignore
165-
_grouping_keys_state_reduction = partial(_state_reduction, dim=0)
166-
167-
168160
LIFETIME_WEIGHTED_AUPRC = "lifetime_weighted_auprc"
169161
LIFETIME_WEIGHT = "lifetime_weight"
170162

torchrec/metrics/cpu_offloaded_metric_module.py

Lines changed: 16 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,9 @@ def __init__(
105105
self.update_thread.start()
106106
self.compute_thread.start()
107107

108+
self.update_job_time_logger: PercentileLogger = PercentileLogger(
109+
metric_name="update_job_time_ms", log_interval=1000
110+
)
108111
self.update_queue_size_logger: PercentileLogger = PercentileLogger(
109112
metric_name="update_queue_size", log_interval=1000
110113
)
@@ -144,15 +147,9 @@ def _update_rec_metrics(
144147
raise self._captured_exception
145148

146149
try:
147-
cpu_model_out, transfer_completed_event = (
148-
self._transfer_to_cpu(model_out)
149-
if self._model_out_device == torch.device("cuda")
150-
else (model_out, None)
151-
)
152150
self.update_queue.put_nowait(
153151
MetricUpdateJob(
154-
model_out=cpu_model_out,
155-
transfer_completed_event=transfer_completed_event,
152+
model_out=model_out,
156153
kwargs=kwargs,
157154
)
158155
)
@@ -206,11 +203,17 @@ def _process_metric_update_job(self, metric_update_job: MetricUpdateJob) -> None
206203
"""
207204

208205
with record_function("## CPUOffloadedRecMetricModule:update ##"):
209-
if metric_update_job.transfer_completed_event is not None:
210-
metric_update_job.transfer_completed_event.synchronize()
206+
start_ms = time.time()
207+
cpu_model_out, transfer_completed_event = (
208+
self._transfer_to_cpu(metric_update_job.model_out)
209+
if self._model_out_device == torch.device("cuda")
210+
else (metric_update_job.model_out, None)
211+
)
212+
if transfer_completed_event is not None:
213+
transfer_completed_event.synchronize()
211214
labels, predictions, weights, required_inputs = parse_task_model_outputs(
212215
self.rec_tasks,
213-
metric_update_job.model_out,
216+
cpu_model_out,
214217
self.get_required_inputs(),
215218
)
216219
if required_inputs:
@@ -226,6 +229,8 @@ def _process_metric_update_job(self, metric_update_job: MetricUpdateJob) -> None
226229
if self.throughput_metric:
227230
self.throughput_metric.update()
228231

232+
self.update_job_time_logger.add((time.time() - start_ms) * 1000)
233+
229234
@override
230235
def shutdown(self) -> None:
231236
"""
@@ -240,6 +245,7 @@ def shutdown(self) -> None:
240245
if self.compute_thread.is_alive():
241246
self.compute_thread.join(timeout=30.0)
242247

248+
self.update_job_time_logger.log_percentiles()
243249
self.update_queue_size_logger.log_percentiles()
244250
self.compute_queue_size_logger.log_percentiles()
245251
self.compute_job_time_logger.log_percentiles()

torchrec/metrics/metric_job_types.py

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
# pyre-strict
99

1010
import concurrent
11-
from typing import Any, Dict, Optional
11+
from typing import Any, Dict
1212

1313
import torch
1414
from torchrec.metrics.metric_module import MetricValue
@@ -21,12 +21,11 @@ class MetricUpdateJob:
2121
update each metric state tensors with intermediate model outputs
2222
"""
2323

24-
__slots__ = ["model_out", "transfer_completed_event", "kwargs"]
24+
__slots__ = ["model_out", "kwargs"]
2525

2626
def __init__(
2727
self,
2828
model_out: Dict[str, torch.Tensor],
29-
transfer_completed_event: Optional[torch.cuda.Event],
3029
kwargs: Dict[str, Any],
3130
) -> None:
3231
"""
@@ -37,9 +36,6 @@ def __init__(
3736
"""
3837

3938
self.model_out: Dict[str, torch.Tensor] = model_out
40-
self.transfer_completed_event: Optional[torch.cuda.Event] = (
41-
transfer_completed_event
42-
)
4339
self.kwargs: Dict[str, Any] = kwargs
4440

4541

torchrec/metrics/metric_module.py

Lines changed: 6 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -34,11 +34,7 @@
3434
_state_reduction,
3535
AUCMetric,
3636
)
37-
from torchrec.metrics.auprc import (
38-
_grouping_keys_state_reduction as auprc_grouping_keys_state_reduction,
39-
_state_reduction as auprc_state_reduction,
40-
AUPRCMetric,
41-
)
37+
from torchrec.metrics.auprc import AUPRCMetric
4238
from torchrec.metrics.average import AverageMetric
4339
from torchrec.metrics.cali_free_ne import CaliFreeNEMetric
4440
from torchrec.metrics.calibration import CalibrationMetric
@@ -73,11 +69,7 @@
7369
from torchrec.metrics.output import OutputMetric
7470
from torchrec.metrics.precision import PrecisionMetric
7571
from torchrec.metrics.precision_session import PrecisionSessionMetric
76-
from torchrec.metrics.rauc import (
77-
_grouping_keys_state_reduction as rauc_grouping_keys_state_reduction,
78-
_state_reduction as rauc_state_reduction,
79-
RAUCMetric,
80-
)
72+
from torchrec.metrics.rauc import RAUCMetric
8173
from torchrec.metrics.rec_metric import RecMetric, RecMetricException, RecMetricList
8274
from torchrec.metrics.recall import RecallMetric
8375
from torchrec.metrics.recall_session import RecallSessionMetric
@@ -99,12 +91,8 @@
9991
# Requirements: Associative AND (Commutative OR post-processing makes result order-invariant)
10092
SAFE_CALLABLE_REDUCTIONS: frozenset[Any] = frozenset(
10193
{
102-
_state_reduction, # Concatenation + AUC sorts data, making final result order-invariant
94+
_state_reduction, # Concatenation + AUC/AUPRC/RAUC sorts data, making final result order-invariant
10395
_grouping_keys_state_reduction, # Concatenation along dim=0 + sorting makes result order-invariant
104-
auprc_state_reduction,
105-
auprc_grouping_keys_state_reduction,
106-
rauc_state_reduction,
107-
rauc_grouping_keys_state_reduction,
10896
_state_reduction_sum, # Sum on dimension 0.
10997
_max_reduction, # Max is associative and commutative.
11098
}
@@ -382,6 +370,9 @@ def _update_rec_metrics(
382370
**kwargs,
383371
)
384372

373+
if self.throughput_metric:
374+
self.throughput_metric.update()
375+
385376
def update(self, model_out: Dict[str, torch.Tensor], **kwargs: Any) -> None:
386377
r"""update() is called per batch, usually right after forward() to
387378
update the local states of metrics based on the model_output.
@@ -391,8 +382,6 @@ def update(self, model_out: Dict[str, torch.Tensor], **kwargs: Any) -> None:
391382
"""
392383
with record_function("## RecMetricModule:update ##"):
393384
self._update_rec_metrics(model_out, **kwargs)
394-
if self.throughput_metric:
395-
self.throughput_metric.update()
396385
self.trained_batches += 1
397386

398387
def _adjust_compute_interval(self) -> None:

torchrec/metrics/rauc.py

Lines changed: 1 addition & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -8,12 +8,12 @@
88
# pyre-strict
99

1010
import logging
11-
from functools import partial
1211
from typing import Any, Callable, cast, Dict, List, Optional, Tuple, Type
1312

1413
import torch
1514
import torch.distributed as dist
1615
from torchmetrics.utilities.distributed import gather_all_tensors
16+
from torchrec.metrics.auc import _grouping_keys_state_reduction, _state_reduction
1717
from torchrec.metrics.metrics_config import RecComputeMode, RecTaskInfo
1818
from torchrec.metrics.metrics_namespace import MetricName, MetricNamespace, MetricPrefix
1919
from torchrec.metrics.rec_metric import (
@@ -201,14 +201,6 @@ def compute_rauc_per_group(
201201
return torch.cat(raucs)
202202

203203

204-
def _state_reduction(state: List[torch.Tensor], dim: int = 1) -> List[torch.Tensor]:
205-
return [torch.cat(state, dim=dim)]
206-
207-
208-
# pyre-ignore
209-
_grouping_keys_state_reduction = partial(_state_reduction, dim=0)
210-
211-
212204
class RAUCMetricComputation(RecMetricComputation):
213205
r"""
214206
This class implements the RecMetricComputation for RAUC, i.e. Regression AUC.

0 commit comments

Comments (0)