Skip to content

Commit 657b7cb

Browse files
jeffkbkim authored and facebook-github-bot committed
CPUOffloadedRecMetricModule: DtoHs in the update thread (#3658)
Summary: CPUOffloadedRecMetricModule currently performs DtoH (nonblocking) from the main thread. This can start to become quite expensive when the order of magnitude of the model_out dict size is in the thousands, where each key stores a tensor with 1000+ elements. Instead of the main thread launching the DtoHs, have the update thread be responsible. This will free the main thread to continue training. Differential Revision: D87800947
1 parent f4a2668 commit 657b7cb

File tree

7 files changed

+144
-69
lines changed

7 files changed

+144
-69
lines changed

torchrec/metrics/auprc.py

Lines changed: 1 addition & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -8,12 +8,12 @@
88
# pyre-strict
99

1010
import logging
11-
from functools import partial
1211
from typing import Any, cast, Dict, List, Optional, Type
1312

1413
import torch
1514
import torch.distributed as dist
1615
import torch.nn.functional as F
16+
from torchrec.metrics.auc import _grouping_keys_state_reduction, _state_reduction
1717
from torchrec.metrics.metrics_config import RecComputeMode, RecTaskInfo
1818
from torchrec.metrics.metrics_namespace import MetricName, MetricNamespace, MetricPrefix
1919
from torchrec.metrics.rec_metric import (
@@ -157,14 +157,6 @@ def compute_auprc_per_group(
157157
return torch.cat(auprcs)
158158

159159

160-
def _state_reduction(state: List[torch.Tensor], dim: int = 1) -> List[torch.Tensor]:
161-
return [torch.cat(state, dim=dim)]
162-
163-
164-
# pyre-ignore
165-
_grouping_keys_state_reduction = partial(_state_reduction, dim=0)
166-
167-
168160
LIFETIME_WEIGHTED_AUPRC = "lifetime_weighted_auprc"
169161
LIFETIME_WEIGHT = "lifetime_weight"
170162

torchrec/metrics/cpu_offloaded_metric_module.py

Lines changed: 16 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,9 @@ def __init__(
105105
self.update_thread.start()
106106
self.compute_thread.start()
107107

108+
self.update_job_time_logger: PercentileLogger = PercentileLogger(
109+
metric_name="update_job_time_ms", log_interval=1000
110+
)
108111
self.update_queue_size_logger: PercentileLogger = PercentileLogger(
109112
metric_name="update_queue_size", log_interval=1000
110113
)
@@ -144,15 +147,9 @@ def _update_rec_metrics(
144147
raise self._captured_exception
145148

146149
try:
147-
cpu_model_out, transfer_completed_event = (
148-
self._transfer_to_cpu(model_out)
149-
if self._model_out_device == torch.device("cuda")
150-
else (model_out, None)
151-
)
152150
self.update_queue.put_nowait(
153151
MetricUpdateJob(
154-
model_out=cpu_model_out,
155-
transfer_completed_event=transfer_completed_event,
152+
model_out=model_out,
156153
kwargs=kwargs,
157154
)
158155
)
@@ -206,11 +203,17 @@ def _process_metric_update_job(self, metric_update_job: MetricUpdateJob) -> None
206203
"""
207204

208205
with record_function("## CPUOffloadedRecMetricModule:update ##"):
209-
if metric_update_job.transfer_completed_event is not None:
210-
metric_update_job.transfer_completed_event.synchronize()
206+
start_ms = time.time()
207+
cpu_model_out, transfer_completed_event = (
208+
self._transfer_to_cpu(metric_update_job.model_out)
209+
if self._model_out_device == torch.device("cuda")
210+
else (metric_update_job.model_out, None)
211+
)
212+
if transfer_completed_event is not None:
213+
transfer_completed_event.synchronize()
211214
labels, predictions, weights, required_inputs = parse_task_model_outputs(
212215
self.rec_tasks,
213-
metric_update_job.model_out,
216+
cpu_model_out,
214217
self.get_required_inputs(),
215218
)
216219
if required_inputs:
@@ -226,6 +229,8 @@ def _process_metric_update_job(self, metric_update_job: MetricUpdateJob) -> None
226229
if self.throughput_metric:
227230
self.throughput_metric.update()
228231

232+
self.update_job_time_logger.add((time.time() - start_ms) * 1000)
233+
229234
@override
230235
def shutdown(self) -> None:
231236
"""
@@ -240,6 +245,7 @@ def shutdown(self) -> None:
240245
if self.compute_thread.is_alive():
241246
self.compute_thread.join(timeout=30.0)
242247

248+
self.update_job_time_logger.log_percentiles()
243249
self.update_queue_size_logger.log_percentiles()
244250
self.compute_queue_size_logger.log_percentiles()
245251
self.compute_job_time_logger.log_percentiles()

torchrec/metrics/metric_job_types.py

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
# pyre-strict
99

1010
import concurrent
11-
from typing import Any, Dict, Optional
11+
from typing import Any, Dict
1212

1313
import torch
1414
from torchrec.metrics.metric_module import MetricValue
@@ -21,12 +21,11 @@ class MetricUpdateJob:
2121
update each metric state tensors with intermediate model outputs
2222
"""
2323

24-
__slots__ = ["model_out", "transfer_completed_event", "kwargs"]
24+
__slots__ = ["model_out", "kwargs"]
2525

2626
def __init__(
2727
self,
2828
model_out: Dict[str, torch.Tensor],
29-
transfer_completed_event: Optional[torch.cuda.Event],
3029
kwargs: Dict[str, Any],
3130
) -> None:
3231
"""
@@ -37,9 +36,6 @@ def __init__(
3736
"""
3837

3938
self.model_out: Dict[str, torch.Tensor] = model_out
40-
self.transfer_completed_event: Optional[torch.cuda.Event] = (
41-
transfer_completed_event
42-
)
4339
self.kwargs: Dict[str, Any] = kwargs
4440

4541

torchrec/metrics/metric_module.py

Lines changed: 6 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -34,11 +34,7 @@
3434
_state_reduction,
3535
AUCMetric,
3636
)
37-
from torchrec.metrics.auprc import (
38-
_grouping_keys_state_reduction as auprc_grouping_keys_state_reduction,
39-
_state_reduction as auprc_state_reduction,
40-
AUPRCMetric,
41-
)
37+
from torchrec.metrics.auprc import AUPRCMetric
4238
from torchrec.metrics.average import AverageMetric
4339
from torchrec.metrics.cali_free_ne import CaliFreeNEMetric
4440
from torchrec.metrics.calibration import CalibrationMetric
@@ -74,11 +70,7 @@
7470
from torchrec.metrics.output import OutputMetric
7571
from torchrec.metrics.precision import PrecisionMetric
7672
from torchrec.metrics.precision_session import PrecisionSessionMetric
77-
from torchrec.metrics.rauc import (
78-
_grouping_keys_state_reduction as rauc_grouping_keys_state_reduction,
79-
_state_reduction as rauc_state_reduction,
80-
RAUCMetric,
81-
)
73+
from torchrec.metrics.rauc import RAUCMetric
8274
from torchrec.metrics.rec_metric import RecMetric, RecMetricException, RecMetricList
8375
from torchrec.metrics.recall import RecallMetric
8476
from torchrec.metrics.recall_session import RecallSessionMetric
@@ -100,12 +92,8 @@
10092
# Requirements: Associative AND (Commutative OR post-processing makes result order-invariant)
10193
SAFE_CALLABLE_REDUCTIONS: frozenset[Any] = frozenset(
10294
{
103-
_state_reduction, # Concatenation + AUC sorts data, making final result order-invariant
95+
_state_reduction, # Concatenation + AUC/AUPRC/RAUC sorts data, making final result order-invariant
10496
_grouping_keys_state_reduction, # Concatenation along dim=0 + sorting makes result order-invariant
105-
auprc_state_reduction,
106-
auprc_grouping_keys_state_reduction,
107-
rauc_state_reduction,
108-
rauc_grouping_keys_state_reduction,
10997
_state_reduction_sum, # Sum on dimension 0.
11098
_max_reduction, # Max is associative and commutative.
11199
}
@@ -384,6 +372,9 @@ def _update_rec_metrics(
384372
**kwargs,
385373
)
386374

375+
if self.throughput_metric:
376+
self.throughput_metric.update()
377+
387378
def update(self, model_out: Dict[str, torch.Tensor], **kwargs: Any) -> None:
388379
r"""update() is called per batch, usually right after forward() to
389380
update the local states of metrics based on the model_output.
@@ -393,8 +384,6 @@ def update(self, model_out: Dict[str, torch.Tensor], **kwargs: Any) -> None:
393384
"""
394385
with record_function("## RecMetricModule:update ##"):
395386
self._update_rec_metrics(model_out, **kwargs)
396-
if self.throughput_metric:
397-
self.throughput_metric.update()
398387
self.trained_batches += 1
399388

400389
def _adjust_compute_interval(self) -> None:

torchrec/metrics/rauc.py

Lines changed: 1 addition & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -8,12 +8,12 @@
88
# pyre-strict
99

1010
import logging
11-
from functools import partial
1211
from typing import Any, Callable, cast, Dict, List, Optional, Tuple, Type
1312

1413
import torch
1514
import torch.distributed as dist
1615
from torchmetrics.utilities.distributed import gather_all_tensors
16+
from torchrec.metrics.auc import _grouping_keys_state_reduction, _state_reduction
1717
from torchrec.metrics.metrics_config import RecComputeMode, RecTaskInfo
1818
from torchrec.metrics.metrics_namespace import MetricName, MetricNamespace, MetricPrefix
1919
from torchrec.metrics.rec_metric import (
@@ -201,14 +201,6 @@ def compute_rauc_per_group(
201201
return torch.cat(raucs)
202202

203203

204-
def _state_reduction(state: List[torch.Tensor], dim: int = 1) -> List[torch.Tensor]:
205-
return [torch.cat(state, dim=dim)]
206-
207-
208-
# pyre-ignore
209-
_grouping_keys_state_reduction = partial(_state_reduction, dim=0)
210-
211-
212204
class RAUCMetricComputation(RecMetricComputation):
213205
r"""
214206
This class implements the RecMetricComputation for RAUC, i.e. Regression AUC.

torchrec/metrics/tests/test_cpu_offloaded_metric_module.py

Lines changed: 109 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,6 @@ def setUp(self) -> None:
7575
model_out_device=torch.device("cpu"),
7676
batch_size=self.batch_size,
7777
world_size=self.world_size,
78-
device=torch.device("cpu"),
7978
rec_tasks=self.tasks,
8079
rec_metrics=self.rec_metrics,
8180
throughput_metric=ThroughputMetric(
@@ -158,7 +157,6 @@ def test_update_rec_metrics_queue_full(self) -> None:
158157
model_out_device=torch.device("cuda"),
159158
batch_size=self.batch_size,
160159
world_size=self.world_size,
161-
device=torch.device("cuda"),
162160
rec_tasks=self.tasks,
163161
rec_metrics=self.rec_metrics,
164162
update_queue_size=1, # Small queue size
@@ -389,12 +387,11 @@ def test_state_dict_save_load(self) -> None:
389387
model_out_device=torch.device("cuda"),
390388
batch_size=self.batch_size,
391389
world_size=self.world_size,
392-
device=torch.device("cuda"),
393390
rec_tasks=self.tasks,
394391
rec_metrics=RecMetricList([offloaded_metric]),
395392
)
396393

397-
# Update comms module with new state tensors. Offloaded module is untouched.
394+
# Update comms module with new state tensors
398395
comms_metric = cast(
399396
MockRecMetric, offloaded_module.comms_module.rec_metrics.rec_metrics[0]
400397
)
@@ -457,7 +454,6 @@ def test_sync(self) -> None:
457454
model_out_device=torch.device("cuda"),
458455
batch_size=self.batch_size,
459456
world_size=self.world_size,
460-
device=torch.device("cuda"),
461457
rec_tasks=self.tasks,
462458
rec_metrics=RecMetricList([offloaded_metric]),
463459
)
@@ -484,11 +480,6 @@ def test_sync(self) -> None:
484480
},
485481
)
486482

487-
# pyre-ignore[56]
488-
@unittest.skipIf(
489-
torch.cuda.device_count() < 1,
490-
"Not enough GPUs, this test requires at least one GPU",
491-
)
492483
def test_flush_remaining_work(self) -> None:
493484
"""Test _flush_remaining_work() processes all items in queue during shutdown."""
494485
test_queue = queue.Queue()
@@ -498,7 +489,6 @@ def test_flush_remaining_work(self) -> None:
498489
"task1-label": torch.tensor([0.7]),
499490
"task1-weight": torch.tensor([1.0]),
500491
},
501-
transfer_completed_event=torch.cuda.Event(),
502492
kwargs={},
503493
)
504494

@@ -510,6 +500,114 @@ def test_flush_remaining_work(self) -> None:
510500
self.assertEqual(items_processed, 2)
511501
self.assertTrue(test_queue.empty())
512502

503+
def _run_dtoh_transfer_test(self, use_cuda: bool) -> None:
504+
"""
505+
Helper to test DtoH transfer behavior based on device type.
506+
507+
When use_cuda=True:
508+
- Module is initialized with device=cuda
509+
- _transfer_to_cpu should be called from the 'metric_update' thread
510+
- Input tensors start on GPU, end up on CPU
511+
512+
When use_cuda=False:
513+
- Module is initialized with device=cpu
514+
- _transfer_to_cpu should NOT be called
515+
- Input tensors stay on CPU
516+
"""
517+
offloaded_metric = MockRecMetric(
518+
world_size=self.world_size,
519+
my_rank=self.my_rank,
520+
batch_size=self.batch_size,
521+
tasks=self.tasks,
522+
initial_states=self.initial_states,
523+
)
524+
525+
device = torch.device("cuda") if use_cuda else torch.device("cpu")
526+
offloaded_module = CPUOffloadedRecMetricModule(
527+
model_out_device=device,
528+
batch_size=self.batch_size,
529+
world_size=self.world_size,
530+
rec_tasks=self.tasks,
531+
rec_metrics=RecMetricList([offloaded_metric]),
532+
)
533+
534+
# Track _transfer_to_cpu calls and which thread made the call
535+
transfer_call_info: list = []
536+
original_transfer_to_cpu = offloaded_module._transfer_to_cpu
537+
538+
def tracking_transfer_to_cpu(model_out: dict) -> tuple:
539+
transfer_call_info.append(threading.current_thread().name)
540+
return original_transfer_to_cpu(model_out)
541+
542+
# Create tensors on the appropriate device
543+
model_out = {
544+
"task1-prediction": torch.tensor([0.5, 0.7]),
545+
"task1-label": torch.tensor([0.0, 1.0]),
546+
"task1-weight": torch.tensor([1.0, 1.0]),
547+
}
548+
if use_cuda:
549+
model_out = {k: v.to("cuda:0") for k, v in model_out.items()}
550+
for tensor in model_out.values():
551+
self.assertEqual(tensor.device.type, "cuda")
552+
553+
with patch.object(
554+
offloaded_module,
555+
"_transfer_to_cpu",
556+
side_effect=tracking_transfer_to_cpu,
557+
):
558+
offloaded_module.update(model_out)
559+
wait_until_true(offloaded_metric.update_called)
560+
561+
if use_cuda:
562+
# For CUDA: verify _transfer_to_cpu was called from the update thread
563+
self.assertEqual(
564+
len(transfer_call_info),
565+
1,
566+
"_transfer_to_cpu should be called exactly once for CUDA device",
567+
)
568+
self.assertEqual(
569+
transfer_call_info[0],
570+
"metric_update",
571+
f"DtoH transfer should happen in 'metric_update' thread, "
572+
f"but was called from '{transfer_call_info[0]}'",
573+
)
574+
else:
575+
# For CPU: verify _transfer_to_cpu was NOT called
576+
self.assertEqual(
577+
len(transfer_call_info),
578+
0,
579+
"_transfer_to_cpu should NOT be called when device is CPU",
580+
)
581+
582+
# Verify tensors received by the mock metric are on CPU
583+
self.assertTrue(offloaded_metric.predictions_update_calls is not None)
584+
for predictions in offloaded_metric.predictions_update_calls:
585+
for task_name, tensor in predictions.items():
586+
self.assertEqual(
587+
tensor.device.type,
588+
"cpu",
589+
f"Tensor for {task_name} should be on CPU",
590+
)
591+
592+
offloaded_module.shutdown()
593+
594+
# pyre-ignore[56]
595+
@unittest.skipIf(
596+
torch.cuda.device_count() < 1,
597+
"Not enough GPUs, this test requires at least one GPU",
598+
)
599+
def test_dtoh_transfer_in_update_thread_for_cuda_device(self) -> None:
600+
"""
601+
Test that DtoH transfer happens in the update thread when device=cuda.
602+
"""
603+
self._run_dtoh_transfer_test(use_cuda=True)
604+
605+
def test_no_dtoh_transfer_for_cpu_device(self) -> None:
606+
"""
607+
Test that _transfer_to_cpu is NOT called when device=cpu.
608+
"""
609+
self._run_dtoh_transfer_test(use_cuda=False)
610+
513611

514612
@skip_if_asan_class
515613
class CPUOffloadedMetricModuleDistributedTest(MultiProcessTestBase):
@@ -615,7 +713,6 @@ def _compare_metric_results_worker(
615713
model_out_device=torch.device("cuda"),
616714
batch_size=batch_size,
617715
world_size=world_size,
618-
device=torch.device("cuda"),
619716
rec_tasks=tasks,
620717
rec_metrics=RecMetricList([offloaded_metric]),
621718
).to(device)

0 commit comments

Comments
 (0)