 
 from fbgemm_gpu.utils.loader import load_torch_module, load_torch_module_bc
 
+from torch.autograd.profiler import record_function
+
 try:
     load_torch_module(
         "//deeplearning/fbgemm/fbgemm_gpu/codegen:embedding_ops_training_gpu",
@@ -626,6 +628,7 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
     lxu_cache_locations_list: List[Tensor]
     lxu_cache_locations_empty: Tensor
     timesteps_prefetched: List[int]
+    prefetched_info: List[Tuple[Tensor, Tensor]]
     record_cache_metrics: RecordCacheMetrics
     # pyre-fixme[13]: Attribute `uvm_cache_stats` is never initialized.
     uvm_cache_stats: torch.Tensor
@@ -690,6 +693,8 @@ def __init__(  # noqa C901
         embedding_table_index_type: torch.dtype = torch.int64,
         embedding_table_offset_type: torch.dtype = torch.int64,
         embedding_shard_info: Optional[List[Tuple[int, int, int, int]]] = None,
+        enable_raw_embedding_streaming: bool = False,
+        res_params: Optional[RESParams] = None,
     ) -> None:
         super(SplitTableBatchedEmbeddingBagsCodegen, self).__init__()
         self.uuid = str(uuid.uuid4())
@@ -700,6 +705,7 @@ def __init__(  # noqa C901
         )
 
         self.logging_table_name: str = self.get_table_name_for_logging(table_names)
+        self.enable_raw_embedding_streaming: bool = enable_raw_embedding_streaming
         self.pooling_mode = pooling_mode
         self.is_nobag: bool = self.pooling_mode == PoolingMode.NONE
 
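For reference, the `RESParams` bundle taken by the new `res_params` argument is assumed to look roughly like the sketch below. The field names are inferred from the accesses later in this diff (`res_store_shards`, `res_server_port`, `table_names`, `table_offsets`, `table_sizes`); the defaults and comments are illustrative, and the real definition lives elsewhere in fbgemm_gpu.

```python
# Assumed shape of RESParams, inferred from the fields this diff reads;
# not the authoritative definition.
from dataclasses import dataclass, field
from typing import List


@dataclass
class RESParams:
    res_server_port: int = 0   # port of the raw embedding stream server
    res_store_shards: int = 1  # number of shards in the remote store
    table_names: List[str] = field(default_factory=list)
    table_offsets: List[int] = field(default_factory=list)
    table_sizes: List[int] = field(default_factory=list)
```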
@@ -1460,6 +1466,30 @@ def __init__(  # noqa C901
         )
         self.embedding_table_offset_type: torch.dtype = embedding_table_offset_type
 
+        self.prefetched_info: List[Tuple[Tensor, Tensor]] = torch.jit.annotate(
+            List[Tuple[Tensor, Tensor]], []
+        )
+        if self.enable_raw_embedding_streaming:
+            self.res_params: RESParams = res_params or RESParams()
+            self.res_params.table_sizes = [0] + list(accumulate(rows))
+            res_port_from_env = os.getenv("LOCAL_RES_PORT")
+            self.res_params.res_server_port = (
+                int(res_port_from_env) if res_port_from_env else 0
+            )
+            # pyre-fixme[4]: Attribute must be annotated.
+            self._raw_embedding_streamer = torch.classes.fbgemm.RawEmbeddingStreamer(
+                self.uuid,
+                self.enable_raw_embedding_streaming,
+                self.res_params.res_store_shards,
+                self.res_params.res_server_port,
+                self.res_params.table_names,
+                self.res_params.table_offsets,
+                self.res_params.table_sizes,
+            )
+            logging.info(
+                f"{self.uuid} raw embedding streaming enabled with {self.res_params=}"
+            )
+
     @torch.jit.ignore
     def log(self, msg: str) -> None:
         """
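The setup above derives `table_sizes` as a prefix sum over the per-table row counts and reads the server port from the `LOCAL_RES_PORT` environment variable. A standalone sketch of that derivation, with illustrative row counts:

```python
# Standalone sketch of the parameter derivation above; the row counts
# are illustrative. The prefix sum gives each table's starting offset
# in the linearized row space.
import os
from itertools import accumulate

rows = [1000, 2000, 500]  # rows per embedding table (example values)
table_sizes = [0] + list(accumulate(rows))  # -> [0, 1000, 3000, 3500]

res_port_from_env = os.getenv("LOCAL_RES_PORT")
res_server_port = int(res_port_from_env) if res_port_from_env else 0
```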
@@ -2521,7 +2551,13 @@ def _prefetch(
         self.local_uvm_cache_stats.zero_()
         self._report_io_size_count("prefetch_input", indices)
 
+        # Stream out the previously prefetched rows before the cache is updated.
+        self.raw_embedding_stream()
+
         final_lxu_cache_locations = torch.empty_like(indices, dtype=torch.int32)
+        linear_cache_indices_merged = torch.zeros(
+            0, dtype=indices.dtype, device=indices.device
+        )
         for (
             partial_indices,
             partial_lxu_cache_locations,
@@ -2537,6 +2573,9 @@ def _prefetch(
                 vbe_metadata.max_B if vbe_metadata is not None else -1,
                 base_offset,
             )
+            linear_cache_indices_merged = torch.cat(
+                [linear_cache_indices_merged, linear_cache_indices]
+            )
 
             if (
                 self.record_cache_metrics.record_cache_miss_counter
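Each pass through the loop contributes one chunk of linearized cache indices; starting from an empty tensor and concatenating keeps the merged tensor's dtype and device consistent with the inputs. A minimal illustration of the pattern, using toy chunks:

```python
# Toy version of the merge above: accumulate per-partition index
# chunks into one 1-D tensor.
import torch

merged = torch.zeros(0, dtype=torch.int64)
for chunk in (torch.tensor([3, 7, 7]), torch.tensor([1, 3])):
    merged = torch.cat([merged, chunk])
# merged is now tensor([3, 7, 7, 1, 3])
```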
@@ -2617,6 +2656,23 @@ def _prefetch(
         if self.should_log():
             self.print_uvm_cache_stats(use_local_cache=False)
 
+        if self.enable_raw_embedding_streaming:
+            with record_function(
+                "## uvm_save_prefetched_rows {} {} ##".format(self.timestep, self.uuid)
+            ):
+                (
+                    linear_unique_indices,
+                    linear_unique_indices_length,
+                    _,
+                ) = torch.ops.fbgemm.get_unique_indices(
+                    linear_cache_indices_merged,
+                    self.total_cache_hash_size,
+                    compute_count=False,
+                )
+                self.prefetched_info.append(
+                    (linear_unique_indices, linear_unique_indices_length)
+                )
+
     def should_log(self) -> bool:
         """Determines if we should log for this step, using exponentially decreasing frequency.
 
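`torch.ops.fbgemm.get_unique_indices` deduplicates the merged indices and returns the unique ids together with a length tensor, so downstream kernels can operate on a fixed-size buffer. Conceptually (setting the fixed-size-buffer detail aside) it behaves like this `torch.unique`-based stand-in:

```python
# Conceptual stand-in for get_unique_indices; the real op also pads to
# a fixed-size buffer and can optionally compute per-id counts.
import torch

merged = torch.tensor([3, 7, 7, 1, 3])
unique_indices = torch.unique(merged)                   # tensor([1, 3, 7])
unique_length = torch.tensor([unique_indices.numel()])  # count of valid entries
```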
@@ -3829,6 +3885,55 @@ def _debug_print_input_stats_factory_null(
             return _debug_print_input_stats_factory_impl
         return _debug_print_input_stats_factory_null
 
+    @torch.jit.ignore
+    def raw_embedding_stream(self) -> None:
+        if not self.enable_raw_embedding_streaming:
+            return
+        # When pipelining is enabled, the prefetch for iteration i runs before
+        # the sparse backward of iteration i - 1, so the embeddings for the ids
+        # changed in iteration i - 1 have not been updated yet; only the
+        # indices from iteration i - 2 can be fetched safely.
+        # When pipelining is disabled, the prefetch for iteration i runs before
+        # the forward of iteration i, so the ids changed in iteration i - 1 can
+        # be fetched safely.
+        target_prev_iter = 1
+        if self.prefetch_pipeline:
+            target_prev_iter = 2
+        if len(self.prefetched_info) < target_prev_iter:
+            return
+        with record_function(
+            "## uvm_lookup_prefetched_rows {} {} ##".format(self.timestep, self.uuid)
+        ):
+            updated_indices, updated_count = self.prefetched_info.pop(0)
+            updated_locations = torch.ops.fbgemm.lxu_cache_lookup(
+                updated_indices,
+                self.lxu_cache_state,
+                self.total_cache_hash_size,
+                gather_cache_stats=False,  # not collecting cache stats
+                num_uniq_cache_indices=updated_count,
+            )
+            updated_weights = torch.empty(
+                [updated_indices.size(0), self.max_D_cache],
+                # pyre-ignore[6]: `dtype` expects `Optional[dtype]`.
+                dtype=self.lxu_cache_weights.dtype,
+                # pyre-ignore[6]: `device` expects `Union[None, int, str, device]`.
+                device=self.lxu_cache_weights.device,
+            )
+            torch.ops.fbgemm.masked_index_select(
+                updated_weights,
+                updated_locations,
+                self.lxu_cache_weights,
+                updated_count,
+            )
+            # Stream the updated weights out to the remote store.
+            self._raw_embedding_streamer.stream(
+                updated_indices.to(device=torch.device("cpu")),
+                updated_weights.to(device=torch.device("cpu")),
+                updated_count.to(device=torch.device("cpu")),
+                False,  # require_tensor_copy
+                False,  # blocking_tensor_copy
+            )
+
 
 class DenseTableBatchedEmbeddingBagsCodegen(nn.Module):
     """
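The safety rule described in the comments above reduces to a queue-depth check: the prefetched-id queue must hold at least `target_prev_iter` entries before the oldest entry may be streamed. A minimal sketch of that rule; `ready_to_stream` is a hypothetical helper for illustration, not part of the module above:

```python
# Minimal sketch of the iteration-lag rule; ready_to_stream is a
# hypothetical name, not an fbgemm API.
from collections import deque
from typing import Deque, Tuple

import torch


def ready_to_stream(
    prefetched_info: Deque[Tuple[torch.Tensor, torch.Tensor]],
    prefetch_pipeline: bool,
) -> bool:
    # With pipelining, ids from iteration i - 1 are still being updated,
    # so one extra entry must sit in the queue before streaming starts.
    target_prev_iter = 2 if prefetch_pipeline else 1
    return len(prefetched_info) >= target_prev_iter
```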