
Commit 050e753

jeffkbkim authored and meta-codesync[bot] committed
CPUOffloadedRecMetricModule: DtoHs in the update thread (meta-pytorch#3658)
Summary:
Pull Request resolved: meta-pytorch#3658

CPUOffloadedRecMetricModule currently performs the nonblocking DtoH transfers from the main thread. This becomes expensive once the model_out dict grows to thousands of keys, each storing a tensor with 1000+ elements. Instead of having the main thread launch the DtoH transfers, make the update thread responsible for them. This frees the main thread to continue training.

Differential Revision: D87800947
1 parent 62ae1fa · commit 050e753
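To make the change concrete, here is a minimal, self-contained sketch of the pattern this commit adopts: the trainer thread enqueues GPU tensors untouched, and the update worker launches the nonblocking DtoH copies and waits on a CUDA event itself. This is an illustration only; update_queue, update_worker, and the transfer logic are assumptions modeled on the diff, not TorchRec's actual API (the thread name "metric_update" is borrowed from the tests below).

    import queue
    import threading
    from typing import Dict, Optional

    import torch

    # The trainer enqueues GPU tensors as-is; the update worker owns the DtoH copy.
    update_queue: "queue.Queue[Optional[Dict[str, torch.Tensor]]]" = queue.Queue()


    def update_worker() -> None:
        while (model_out := update_queue.get()) is not None:
            # Launch nonblocking DtoH copies from this thread, not the trainer.
            # (Copies overlap best when the CPU destination is pinned memory.)
            cpu_out = {k: v.to("cpu", non_blocking=True) for k, v in model_out.items()}
            done = torch.cuda.Event()
            done.record()       # marks the point after the copies on the current stream
            done.synchronize()  # blocks this worker only; the trainer keeps going
            _ = cpu_out         # ...run metric updates on the CPU tensors here...


    if torch.cuda.is_available():
        worker = threading.Thread(target=update_worker, name="metric_update")
        worker.start()
        update_queue.put({"task1-prediction": torch.rand(1024, device="cuda")})
        update_queue.put(None)  # sentinel: stop after draining
        worker.join()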

File tree

5 files changed: +138 −30 lines


torchrec/metrics/cpu_offloaded_metric_module.py

Lines changed: 16 additions & 10 deletions
@@ -105,6 +105,9 @@ def __init__(
         self.update_thread.start()
         self.compute_thread.start()

+        self.update_job_time_logger: PercentileLogger = PercentileLogger(
+            metric_name="update_job_time_ms", log_interval=1000
+        )
         self.update_queue_size_logger: PercentileLogger = PercentileLogger(
             metric_name="update_queue_size", log_interval=1000
         )
@@ -144,15 +147,9 @@ def _update_rec_metrics(
             raise self._captured_exception

         try:
-            cpu_model_out, transfer_completed_event = (
-                self._transfer_to_cpu(model_out)
-                if self._device == torch.device("cuda")
-                else (model_out, None)
-            )
             self.update_queue.put_nowait(
                 MetricUpdateJob(
-                    model_out=cpu_model_out,
-                    transfer_completed_event=transfer_completed_event,
+                    model_out=model_out,
                     kwargs=kwargs,
                 )
             )
@@ -206,11 +203,17 @@ def _process_metric_update_job(self, metric_update_job: MetricUpdateJob) -> None
         """

         with record_function("## CPUOffloadedRecMetricModule:update ##"):
-            if metric_update_job.transfer_completed_event is not None:
-                metric_update_job.transfer_completed_event.synchronize()
+            start_ms = time.time()
+            cpu_model_out, transfer_completed_event = (
+                self._transfer_to_cpu(metric_update_job.model_out)
+                if self._device == torch.device("cuda")
+                else (metric_update_job.model_out, None)
+            )
+            if transfer_completed_event is not None:
+                transfer_completed_event.synchronize()
             labels, predictions, weights, required_inputs = parse_task_model_outputs(
                 self.rec_tasks,
-                metric_update_job.model_out,
+                cpu_model_out,
                 self.get_required_inputs(),
             )
             if required_inputs:
@@ -226,6 +229,8 @@ def _process_metric_update_job(self, metric_update_job: MetricUpdateJob) -> None
             if self.throughput_metric:
                 self.throughput_metric.update()

+            self.update_job_time_logger.add((time.time() - start_ms) * 1000)
+
     @override
     def shutdown(self) -> None:
         """
@@ -240,6 +245,7 @@ def shutdown(self) -> None:
         if self.compute_thread.is_alive():
             self.compute_thread.join(timeout=30.0)

+        self.update_job_time_logger.log_percentiles()
         self.update_queue_size_logger.log_percentiles()
         self.compute_queue_size_logger.log_percentiles()
         self.compute_job_time_logger.log_percentiles()
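The diff above also adds latency instrumentation for update jobs: each job is timed with time.time(), the elapsed time is converted to milliseconds and fed to a PercentileLogger, and percentiles are flushed once more at shutdown. As a rough illustration, here is a hedged sketch of what such a logger might do; PercentileLoggerSketch is hypothetical, and TorchRec's real PercentileLogger may differ in both API and behavior.

    import logging
    from typing import List


    class PercentileLoggerSketch:
        """Hypothetical stand-in for TorchRec's PercentileLogger; illustrative only."""

        def __init__(self, metric_name: str, log_interval: int = 1000) -> None:
            self.metric_name = metric_name
            self.log_interval = log_interval
            self.samples: List[float] = []

        def add(self, value: float) -> None:
            # Record one sample (e.g. an update-job duration in ms) and log
            # periodically so long runs still produce intermediate readings.
            self.samples.append(value)
            if len(self.samples) % self.log_interval == 0:
                self.log_percentiles()

        def log_percentiles(self) -> None:
            if not self.samples:
                return
            ordered = sorted(self.samples)

            def pct(q: float) -> float:
                return ordered[min(int(q * len(ordered)), len(ordered) - 1)]

            logging.info(
                "%s: p50=%.2f p95=%.2f p99=%.2f (n=%d)",
                self.metric_name, pct(0.50), pct(0.95), pct(0.99), len(ordered),
            )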

torchrec/metrics/metric_job_types.py

Lines changed: 2 additions & 6 deletions
@@ -8,7 +8,7 @@
 # pyre-strict

 import concurrent
-from typing import Any, Dict, Optional
+from typing import Any, Dict

 import torch
 from torchrec.metrics.metric_module import MetricValue
@@ -21,12 +21,11 @@ class MetricUpdateJob:
     update each metric state tensors with intermediate model outputs
     """

-    __slots__ = ["model_out", "transfer_completed_event", "kwargs"]
+    __slots__ = ["model_out", "kwargs"]

     def __init__(
         self,
         model_out: Dict[str, torch.Tensor],
-        transfer_completed_event: Optional[torch.cuda.Event],
         kwargs: Dict[str, Any],
     ) -> None:
         """
@@ -37,9 +36,6 @@ def __init__(
         """

         self.model_out: Dict[str, torch.Tensor] = model_out
-        self.transfer_completed_event: Optional[torch.cuda.Event] = (
-            transfer_completed_event
-        )
         self.kwargs: Dict[str, Any] = kwargs

torchrec/metrics/metric_module.py

Lines changed: 3 additions & 2 deletions
@@ -366,6 +366,9 @@ def _update_rec_metrics(
             **kwargs,
         )

+        if self.throughput_metric:
+            self.throughput_metric.update()
+
     def update(self, model_out: Dict[str, torch.Tensor], **kwargs: Any) -> None:
         r"""update() is called per batch, usually right after forward() to
         update the local states of metrics based on the model_output.
@@ -375,8 +378,6 @@ def update(self, model_out: Dict[str, torch.Tensor], **kwargs: Any) -> None:
         """
         with record_function("## RecMetricModule:update ##"):
             self._update_rec_metrics(model_out, **kwargs)
-            if self.throughput_metric:
-                self.throughput_metric.update()
             self.trained_batches += 1

     def _adjust_compute_interval(self) -> None:
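Why move throughput_metric.update() out of update() and into _update_rec_metrics()? In the base RecMetricModule, _update_rec_metrics runs synchronously on the calling thread, so the throughput update still happens where it did before. CPUOffloadedRecMetricModule, however, overrides _update_rec_metrics to merely enqueue the job, and its worker calls throughput_metric.update() after the real metric update (see the first file's diff). A hedged sketch of the resulting control flow, with illustrative bodies that do not reproduce the real TorchRec code:

    from typing import Any, Dict

    import torch


    class RecMetricModuleFlow:
        """Illustrative control flow only, not the real TorchRec classes."""

        def update(self, model_out: Dict[str, torch.Tensor], **kwargs: Any) -> None:
            self._update_rec_metrics(model_out, **kwargs)
            self.trained_batches += 1  # assumes the counter exists from __init__

        def _update_rec_metrics(self, model_out: Dict[str, torch.Tensor], **kwargs: Any) -> None:
            # Base class: metrics update synchronously on the calling thread,
            # so the throughput update belongs right here.
            # ... parse model_out and update self.rec_metrics ...
            if self.throughput_metric:
                self.throughput_metric.update()


    class CPUOffloadedFlow(RecMetricModuleFlow):
        def _update_rec_metrics(self, model_out: Dict[str, torch.Tensor], **kwargs: Any) -> None:
            # Offloaded subclass: only enqueue; the update worker thread performs
            # the metric update and then calls throughput_metric.update().
            self.update_queue.put_nowait((model_out, kwargs))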

torchrec/metrics/tests/test_cpu_offloaded_metric_module.py

Lines changed: 108 additions & 6 deletions
@@ -480,11 +480,6 @@ def test_sync(self) -> None:
             },
         )

-    # pyre-ignore[56]
-    @unittest.skipIf(
-        torch.cuda.device_count() < 1,
-        "Not enough GPUs, this test requires at least one GPU",
-    )
     def test_flush_remaining_work(self) -> None:
         """Test _flush_remaining_work() processes all items in queue during shutdown."""
         test_queue = queue.Queue()
@@ -494,7 +489,6 @@ def test_flush_remaining_work(self) -> None:
                 "task1-label": torch.tensor([0.7]),
                 "task1-weight": torch.tensor([1.0]),
             },
-            transfer_completed_event=torch.cuda.Event(),
             kwargs={},
         )

@@ -506,6 +500,114 @@ def test_flush_remaining_work(self) -> None:
         self.assertEqual(items_processed, 2)
         self.assertTrue(test_queue.empty())

+    def _run_dtoh_transfer_test(self, use_cuda: bool) -> None:
+        """
+        Helper to test DtoH transfer behavior based on device type.
+
+        When use_cuda=True:
+        - Module is initialized with device=cuda
+        - _transfer_to_cpu should be called from the 'metric_update' thread
+        - Input tensors start on GPU, end up on CPU
+
+        When use_cuda=False:
+        - Module is initialized with device=cpu
+        - _transfer_to_cpu should NOT be called
+        - Input tensors stay on CPU
+        """
+        offloaded_metric = MockRecMetric(
+            world_size=self.world_size,
+            my_rank=self.my_rank,
+            batch_size=self.batch_size,
+            tasks=self.tasks,
+            initial_states=self.initial_states,
+        )
+
+        device = torch.device("cuda") if use_cuda else torch.device("cpu")
+        offloaded_module = CPUOffloadedRecMetricModule(
+            batch_size=self.batch_size,
+            world_size=self.world_size,
+            device=device,
+            rec_tasks=self.tasks,
+            rec_metrics=RecMetricList([offloaded_metric]),
+        )
+
+        # Track _transfer_to_cpu calls and which thread made the call
+        transfer_call_info: list = []
+        original_transfer_to_cpu = offloaded_module._transfer_to_cpu
+
+        def tracking_transfer_to_cpu(model_out: dict) -> tuple:
+            transfer_call_info.append(threading.current_thread().name)
+            return original_transfer_to_cpu(model_out)
+
+        # Create tensors on the appropriate device
+        model_out = {
+            "task1-prediction": torch.tensor([0.5, 0.7]),
+            "task1-label": torch.tensor([0.0, 1.0]),
+            "task1-weight": torch.tensor([1.0, 1.0]),
+        }
+        if use_cuda:
+            model_out = {k: v.to("cuda:0") for k, v in model_out.items()}
+            for tensor in model_out.values():
+                self.assertEqual(tensor.device.type, "cuda")
+
+        with patch.object(
+            offloaded_module,
+            "_transfer_to_cpu",
+            side_effect=tracking_transfer_to_cpu,
+        ):
+            offloaded_module.update(model_out)
+            wait_until_true(offloaded_metric.update_called)
+
+        if use_cuda:
+            # For CUDA: verify _transfer_to_cpu was called from the update thread
+            self.assertEqual(
+                len(transfer_call_info),
+                1,
+                "_transfer_to_cpu should be called exactly once for CUDA device",
+            )
+            self.assertEqual(
+                transfer_call_info[0],
+                "metric_update",
+                f"DtoH transfer should happen in 'metric_update' thread, "
+                f"but was called from '{transfer_call_info[0]}'",
+            )
+        else:
+            # For CPU: verify _transfer_to_cpu was NOT called
+            self.assertEqual(
+                len(transfer_call_info),
+                0,
+                "_transfer_to_cpu should NOT be called when device is CPU",
+            )
+
+        # Verify tensors received by the mock metric are on CPU
+        self.assertTrue(offloaded_metric.predictions_update_calls is not None)
+        for predictions in offloaded_metric.predictions_update_calls:
+            for task_name, tensor in predictions.items():
+                self.assertEqual(
+                    tensor.device.type,
+                    "cpu",
+                    f"Tensor for {task_name} should be on CPU",
+                )
+
+        offloaded_module.shutdown()
+
+    # pyre-ignore[56]
+    @unittest.skipIf(
+        torch.cuda.device_count() < 1,
+        "Not enough GPUs, this test requires at least one GPU",
+    )
+    def test_dtoh_transfer_in_update_thread_for_cuda_device(self) -> None:
+        """
+        Test that DtoH transfer happens in the update thread when device=cuda.
+        """
+        self._run_dtoh_transfer_test(use_cuda=True)
+
+    def test_no_dtoh_transfer_for_cpu_device(self) -> None:
+        """
+        Test that _transfer_to_cpu is NOT called when device=cpu.
+        """
+        self._run_dtoh_transfer_test(use_cuda=False)
+

 @skip_if_asan_class
 class CPUOffloadedMetricModuleDistributedTest(MultiProcessTestBase):

torchrec/metrics/tests/test_metric_module.py

Lines changed: 9 additions & 6 deletions
@@ -90,12 +90,15 @@ def __init__(
     def _update_rec_metrics(
         self, model_out: Dict[str, torch.Tensor], **kwargs: Any
     ) -> None:
-        if isinstance(model_out, MagicMock):
-            return
-        labels, predictions, weights, _ = parse_task_model_outputs(
-            self.rec_tasks, model_out
-        )
-        self.rec_metrics.update(predictions=predictions, labels=labels, weights=weights)
+        if not isinstance(model_out, MagicMock):
+            labels, predictions, weights, _ = parse_task_model_outputs(
+                self.rec_tasks, model_out
+            )
+            self.rec_metrics.update(
+                predictions=predictions, labels=labels, weights=weights
+            )
+        if self.throughput_metric:
+            self.throughput_metric.update()


 class MetricModuleTest(unittest.TestCase):
