Skip to content

Commit 48b36a3

Browse files
nipung90 authored and facebook-github-bot committed
Add waitcounter around recmetrics (#3677)
Summary: Add waitcounter around recmetrics Differential Revision: D89328312
1 parent 4e862c0 commit 48b36a3

File tree

2 files changed

+45
-42
lines changed

2 files changed

+45
-42
lines changed

torchrec/metrics/cpu_offloaded_metric_module.py

Lines changed: 28 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515

1616
import torch
1717
from torch import distributed as dist
18+
from torch.monitor import _WaitCounter
1819
from torch.profiler import record_function
1920
from torchrec.metrics.cpu_comms_metric_module import CPUCommsRecMetricModule
2021
from torchrec.metrics.metric_job_types import (
@@ -311,35 +312,36 @@ def _process_metric_compute_job(
311312
3. Compute metrics via comms module
312313
"""
313314

314-
with record_function("## CPUOffloadedRecMetricModule:compute ##"):
315-
start_ms = time.time()
316-
self.comms_module.load_local_metric_state_snapshot(
317-
metric_compute_job.metric_state_snapshot
318-
)
319-
320-
with record_function("## cpu_all_gather ##"):
321-
# Manual distributed sync (replaces TorchMetrics.metric.Metric.sync())
322-
all_gather_start_ms = time.time()
323-
aggregated_states = self.comms_module.get_pre_compute_states(
324-
self.cpu_process_group
325-
)
326-
self.all_gather_time_logger.add(
327-
(time.time() - all_gather_start_ms) * 1000
315+
with _WaitCounter("pytorch.wait_counter.rec_metrics.compute_job").guard():
316+
with record_function("## CPUOffloadedRecMetricModule:compute ##"):
317+
start_ms = time.time()
318+
self.comms_module.load_local_metric_state_snapshot(
319+
metric_compute_job.metric_state_snapshot
328320
)
329321

330-
with record_function("## cpu_load_states ##"):
331-
self.comms_module.load_pre_compute_states(aggregated_states)
322+
with record_function("## cpu_all_gather ##"):
323+
# Manual distributed sync (replaces TorchMetrics.metric.Metric.sync())
324+
all_gather_start_ms = time.time()
325+
aggregated_states = self.comms_module.get_pre_compute_states(
326+
self.cpu_process_group
327+
)
328+
self.all_gather_time_logger.add(
329+
(time.time() - all_gather_start_ms) * 1000
330+
)
331+
332+
with record_function("## cpu_load_states ##"):
333+
self.comms_module.load_pre_compute_states(aggregated_states)
332334

333-
with record_function("## metric_compute ##"):
334-
compute_start_ms = time.time()
335-
computed_metrics = self.comms_module.compute()
336-
self.compute_job_time_logger.add((time.time() - start_ms) * 1000)
337-
self.compute_metrics_time_logger.add(
338-
(time.time() - compute_start_ms) * 1000
339-
)
340-
self.compute_count += 1
341-
self._adjust_compute_interval()
342-
return computed_metrics
335+
with record_function("## metric_compute ##"):
336+
compute_start_ms = time.time()
337+
computed_metrics = self.comms_module.compute()
338+
self.compute_job_time_logger.add((time.time() - start_ms) * 1000)
339+
self.compute_metrics_time_logger.add(
340+
(time.time() - compute_start_ms) * 1000
341+
)
342+
self.compute_count += 1
343+
self._adjust_compute_interval()
344+
return computed_metrics
343345

344346
def _update_loop(self) -> None:
345347
"""

torchrec/metrics/metric_module.py

Lines changed: 17 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
import torch.distributed as dist
2121
import torch.nn as nn
2222
from torch.distributed.tensor import DeviceMesh
23+
from torch.monitor import _WaitCounter
2324
from torch.profiler import record_function
2425
from torchrec.metrics.accuracy import AccuracyMetric
2526
from torchrec.metrics.auc import AUCMetric
@@ -74,7 +75,6 @@
7475
from torchrec.metrics.weighted_avg import WeightedAvgMetric
7576
from torchrec.metrics.xauc import XAUCMetric
7677

77-
7878
logger: logging.Logger = logging.getLogger(__name__)
7979

8080
REC_METRICS_MAPPING: Dict[RecMetricEnumBase, Type[RecMetric]] = {
@@ -345,21 +345,22 @@ def compute(self) -> Dict[str, MetricValue]:
345345
"""
346346
self.compute_count += 1
347347
ret: Dict[str, MetricValue] = {}
348-
with record_function("## RecMetricModule:compute ##"):
349-
if self.rec_metrics:
350-
self._adjust_compute_interval()
351-
ret.update(self.rec_metrics.compute())
352-
if self.throughput_metric:
353-
ret.update(self.throughput_metric.compute())
354-
if self.state_metrics:
355-
for namespace, component in self.state_metrics.items():
356-
ret.update(
357-
{
358-
f"{compose_customized_metric_key(namespace, metric_name)}": metric_value
359-
for metric_name, metric_value in component.get_metrics().items()
360-
}
361-
)
362-
return ret
348+
with _WaitCounter("pytorch.wait_counter.rec_metrics.compute_job").guard():
349+
with record_function("## RecMetricModule:compute ##"):
350+
if self.rec_metrics:
351+
self._adjust_compute_interval()
352+
ret.update(self.rec_metrics.compute())
353+
if self.throughput_metric:
354+
ret.update(self.throughput_metric.compute())
355+
if self.state_metrics:
356+
for namespace, component in self.state_metrics.items():
357+
ret.update(
358+
{
359+
f"{compose_customized_metric_key(namespace, metric_name)}": metric_value
360+
for metric_name, metric_value in component.get_metrics().items()
361+
}
362+
)
363+
return ret
363364

364365
def local_compute(self) -> Dict[str, MetricValue]:
365366
r"""local_compute() is called when per-trainer metrics are required. It's

0 commit comments

Comments (0)