Skip to content

Commit a77cf3b

Browse files
Xinyi Wang and meta-codesync[bot]
authored and committed
Implement a write method in DMP (#3801)
Summary: Pull Request resolved: #3801 TorchRec allows users to create embeddings with custom input. This was done in D78749760. In this diff I expose this method to DistributedDataParallel (DMP), so that for modules with config enable_embedding_update = True, DMP will be able to update the embeddings with custom input. **Approach** We recursively initialize writable modules in `_init_dmp` method and when callers call `write` update all found modules with provided kjt Reviewed By: kausv Differential Revision: D93914739 fbshipit-source-id: 38f2019c079df325dbe246b7dea79de80a6f113f
1 parent bbcc55c commit a77cf3b

File tree

4 files changed

+273
-47
lines changed

4 files changed

+273
-47
lines changed

torchrec/distributed/batched_embedding_kernel.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,6 @@
7474
GroupedEmbeddingConfig,
7575
ShardedEmbeddingTable,
7676
)
77-
from torchrec.distributed.model_tracker.types import IndexedLookup
7877
from torchrec.distributed.shards_wrapper import LocalShardsWrapper
7978
from torchrec.distributed.types import (
8079
LazyAwaitable,

torchrec/distributed/embedding.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -498,6 +498,7 @@ def __init__(
498498
self._write_splits: List[int] = []
499499
self._feature_splits: List[int] = []
500500
self._features_order: List[int] = []
501+
self._writable_embedding_names: set[str] = set()
501502

502503
self._has_uninitialized_input_dist: bool = True
503504
logger.info(f"EC index dedup enabled: {self._use_index_dedup}.")
@@ -1685,6 +1686,7 @@ def _create_write_dist(self) -> None:
16851686
if sharding.enable_embedding_update:
16861687
self._write_dists.append(sharding.create_write_dist())
16871688
self._write_splits.append(sharding._get_num_writable_features())
1689+
self._writable_embedding_names.update(sharding.embedding_names())
16881690

16891691
# pyrefly: ignore[bad-override]
16901692
def write_dist(
@@ -1694,6 +1696,10 @@ def write_dist(
16941696
raise ValueError("enable_embedding_update is False for this collection")
16951697
if not self._write_dists:
16961698
self._create_write_dist()
1699+
if set(embeddings.keys()) != self._writable_embedding_names:
1700+
raise ValueError(
1701+
f"write_dist feature names {embeddings.keys()} do not match expected {self._writable_embedding_names}"
1702+
)
16971703
with torch.no_grad():
16981704
embeddings_by_shards = embeddings.split(self._write_splits)
16991705
awaitables = []

torchrec/distributed/model_parallel.py

Lines changed: 50 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
import copy
1212
import logging as logger
1313
from collections import defaultdict, OrderedDict
14+
from functools import wraps
1415
from typing import Any, Callable, cast, Dict, Iterator, List, Optional, Set, Tuple, Type
1516

1617
import torch
@@ -30,6 +31,7 @@
3031
from torch.nn.parallel import DistributedDataParallel
3132
from torchrec.distributed.collective_utils import create_on_rank_and_share_result
3233
from torchrec.distributed.comm import get_local_size
34+
from torchrec.distributed.embedding import ShardedEmbeddingCollection
3335
from torchrec.distributed.model_tracker.model_delta_tracker import (
3436
ModelDeltaTracker,
3537
ModelDeltaTrackerTrec,
@@ -40,7 +42,6 @@
4042
ModelTrackerConfigs,
4143
RawIdTrackerConfig,
4244
Trackers,
43-
UniqueRows,
4445
)
4546
from torchrec.distributed.planner import EmbeddingShardingPlanner, Topology
4647
from torchrec.distributed.sharding_plan import get_default_sharders
@@ -61,7 +62,6 @@
6162
append_prefix,
6263
copy_to_device,
6364
filter_state_dict,
64-
none_throws,
6565
sharded_model_copy,
6666
)
6767
from torchrec.optim.fused import FusedOptimizerModule
@@ -77,6 +77,40 @@
7777
_DDP_STATE_DICT_PREFIX = "module."
7878

7979

80+
def _populate_updatable_modules(
81+
func: Callable[..., nn.Module],
82+
) -> Callable[..., nn.Module]:
83+
"""
84+
Decorator that populates the list of modules that can be updated with kjt.
85+
Specifically, modules with enable_embedding_update flag set to True.
86+
87+
Applied to _shard_modules_impl to automatically process returned modules.
88+
"""
89+
90+
@wraps(func)
91+
def wrapper(
92+
self: "DistributedModelParallel",
93+
module: nn.Module,
94+
path: str = "",
95+
module_id_cache: Optional[Dict[str, "ShardedModule"]] = None,
96+
) -> nn.Module:
97+
result = func(self, module, path, module_id_cache)
98+
99+
module_id = id(result)
100+
if module_id_cache and module_id in module_id_cache:
101+
# skip adding duplicate one
102+
return result
103+
104+
if isinstance(result, ShardedEmbeddingCollection) and getattr(
105+
result, "enable_embedding_update", False
106+
):
107+
self._writable_sharded_modules.append(result)
108+
109+
return result
110+
111+
return wrapper
112+
113+
80114
class DataParallelWrapper(abc.ABC):
81115
"""
82116
Interface implemented by custom data parallel wrappers.
@@ -297,6 +331,7 @@ def __init__(
297331
# pyrefly: ignore[bad-argument-type, missing-argument]
298332
plan = planner.plan(module, self.sharders)
299333
self._plan: ShardingPlan = plan
334+
self._writable_sharded_modules: list[ShardedEmbeddingCollection] = []
300335
self._dmp_wrapped_module: nn.Module = self._init_dmp(module)
301336
self._optim: CombinedOptimizer = self._init_optim(self._dmp_wrapped_module)
302337

@@ -462,6 +497,7 @@ def _fused_optim_impl(
462497
)
463498
return fused_optims
464499

500+
@_populate_updatable_modules
465501
def _shard_modules_impl(
466502
self,
467503
module: nn.Module,
@@ -613,6 +649,18 @@ def load_state_dict(
613649
) -> _IncompatibleKeys:
614650
return self._load_state_dict(self, state_dict, prefix, strict)
615651

652+
def write(self, *input, **kwargs) -> None:
653+
"""
654+
Write features to the sharded module if it has enable_embedding_update flag.
655+
"""
656+
if len(self._writable_sharded_modules) == 0:
657+
raise RuntimeError(
658+
"No writable sharded modules found. Please check `enable_embedding_update` flag in your embedding config"
659+
)
660+
661+
for module in self._writable_sharded_modules:
662+
module.write(*input, **kwargs)
663+
616664
def _load_state_dict(
617665
self,
618666
module: nn.Module,

0 commit comments

Comments
 (0)