@@ -57,25 +57,14 @@ def __init__(
         self.logger: logging.Logger = logging.getLogger(__name__)
         self.logger.setLevel(logging.INFO)
 
-    def report_stats(
+    def extract_params(
         self,
         embedding_op: SplitTableBatchedEmbeddingBagsCodegen,
         indices: torch.Tensor,
         offsets: torch.Tensor,
         per_sample_weights: Optional[torch.Tensor] = None,
         batch_size_per_feature_per_rank: Optional[List[List[int]]] = None,
-    ) -> None:
-        """
-        Print input stats (for debugging purpose only)
-
-        Args:
-            indices (Tensor): Input indices
-            offsets (Tensor): Input offsets
-            per_sample_weights (Optional[Tensor]): Input per
-                sample weights
-        """
-        if embedding_op.iter.item() % self.report_interval == 0:
-            pass
+    ) -> TBEDataConfig:
 
         # Transfer indices back to CPU for EEG analysis
         indices_cpu = indices.cpu()
@@ -89,12 +78,12 @@ def report_stats(
 
         # Set T to be the number of features we are looking at
         T = len(embedding_op.feature_table_map)
-        # Set E to be the median of the rowcounts to avoid biasing the
+        # Set E to be the mean of the rowcounts to avoid biasing
        E = rowcounts[0] if len(set(rowcounts)) == 1 else np.ceil((np.mean(rowcounts)))
         # Set mixed_dim to be True if there are multiple dims
         mixed_dim = len(set(dims)) > 1
-        # Set D to be the median of the dims to avoid biasing
-        D = dims[0] if mixed_dim else np.ceil((np.mean(dims)))
+        # Set D to be the mean of the dims to avoid biasing
+        D = dims[0] if not mixed_dim else np.ceil((np.mean(dims)))
 
         # Compute indices distribution parameters
         heavy_hitters, q, s, _, _ = torch.ops.fbgemm.tbe_estimate_indices_distribution(
@@ -123,15 +112,15 @@ def report_stats(
         )
 
         # Compute pooling parameters
-        bag_sizes = offsets[1:] - offsets[:-1]
+        bag_sizes = (offsets[1:] - offsets[:-1]).tolist()
         mixed_bag_sizes = len(set(bag_sizes)) > 1
         pooling_params = PoolingParams(
             L=np.ceil(np.mean(bag_sizes)) if mixed_bag_sizes else bag_sizes[0],
             sigma_L=(np.ceil(np.std(bag_sizes)) if mixed_bag_sizes else None),
             length_distribution=("normal" if mixed_bag_sizes else None),
         )
 
-        config = TBEDataConfig(
+        return TBEDataConfig(
             T=T,
             E=E,
             D=D,
@@ -143,8 +132,31 @@ def report_stats(
             use_cpu=(not torch.cuda.is_available()),
         )
 
-        # Write the TBE config to FileStore
-        self.filestore.write(
-            f"tbe-{embedding_op.uuid}-config-estimation-{embedding_op.iter.item()}.json",
-            io.BytesIO(config.json(format=True).encode()),
-        )
+    def report_stats(
+        self,
+        embedding_op: SplitTableBatchedEmbeddingBagsCodegen,
+        indices: torch.Tensor,
+        offsets: torch.Tensor,
+        per_sample_weights: Optional[torch.Tensor] = None,
+        batch_size_per_feature_per_rank: Optional[List[List[int]]] = None,
+    ) -> None:
+        """
+        Print input stats (for debugging purpose only)
+
+        Args:
+            indices (Tensor): Input indices
+            offsets (Tensor): Input offsets
+            per_sample_weights (Optional[Tensor]): Input per
+                sample weights
+        """
+        if embedding_op.iter.item() % self.report_interval == 0:
+            # Extract TBE config
+            config = self.extract_params(
+                embedding_op, indices, offsets, per_sample_weights
+            )
+
+            # Write the TBE config to FileStore
+            self.filestore.write(
+                f"tbe-{embedding_op.uuid}-config-estimation-{embedding_op.iter.item()}.json",
+                io.BytesIO(config.json(format=True).encode()),
+            )
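A minimal sketch (not part of the PR) of how the pooling parameters in extract_params are derived from the TBE offsets; the offsets values below are made-up example inputs, and the PoolingParams construction itself is elided.

import numpy as np
import torch

# Hypothetical offsets for four pooled bags (example values only)
offsets = torch.tensor([0, 2, 2, 5, 9])

# Per-bag lengths, computed as in extract_params; .tolist() yields plain ints
bag_sizes = (offsets[1:] - offsets[:-1]).tolist()  # [2, 0, 3, 4]
mixed_bag_sizes = len(set(bag_sizes)) > 1          # True

# Pooling parameters: ceil of mean/std when bag lengths vary, else the constant length
L = np.ceil(np.mean(bag_sizes)) if mixed_bag_sizes else bag_sizes[0]  # 3.0
sigma_L = np.ceil(np.std(bag_sizes)) if mixed_bag_sizes else None     # 2.0
length_distribution = "normal" if mixed_bag_sizes else None

The .tolist() added in extract_params presumably matters here: iterating a tensor yields 0-d tensors that hash by identity, so set() would not deduplicate equal bag sizes, whereas a plain Python list deduplicates by value and keeps bag_sizes[0] an int.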