Megatron-LM/megatron/core/transformer/moe/moe_layer.py at a8861740df5e73babc38dfd9273a5c55277682d4 · Wohox/Megatron-LM · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.

from __future__ import annotations

import warnings
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import Optional, Protocol, Union

import torch

from megatron.core import parallel_state, tensor_parallel, utils
from megatron.core.process_groups_config import ProcessGroupCollection
from megatron.core.transformer.module import MegatronModule
from megatron.core.transformer.moe.moe_utils import (
    MoECudaGraphPartialCaptureSignal,
    MoECudaGraphTensorStore,
    get_default_pg_collection,
    maybe_skip_or_early_return_by_cudagraph,
)
from megatron.core.transformer.moe.router import TopKRouter
from megatron.core.transformer.moe.token_dispatcher import (
    MoEAllGatherTokenDispatcher,
    MoEAlltoAllTokenDispatcher,
    MoEFlexTokenDispatcher,
    MoETokenDispatcher,
)
from megatron.core.transformer.moe.token_dispatcher_inference import (
    InferenceCUDAGraphTokenDispatcher,
)
from megatron.core.transformer.spec_utils import ModuleSpec, build_module
from megatron.core.transformer.transformer_config import TransformerConfig
from megatron.core.typed_torch import apply_module
from megatron.core.utils import internal_api

try:
    import flashinfer  # pylint: disable=unused-import

    HAVE_FLASHINFER = True
except ImportError:
    HAVE_FLASHINFER = False

if HAVE_FLASHINFER:
    try:
        import flashinfer_cubin  # pylint: disable=unused-import
        import flashinfer_jit_cache  # pylint: disable=unused-import

        HAVE_FLASHINFER_CUBIN_AND_JIT_CACHE = True
    except ImportError:
        HAVE_FLASHINFER_CUBIN_AND_JIT_CACHE = False

try:
    import transformer_engine as te  # pylint: disable=unused-import

    from megatron.core.extensions.transformer_engine import TELinear, te_checkpoint

    HAVE_TE = True
except ImportError:
    HAVE_TE = False


class RouterInterface(Protocol):
    """Interface for the router used in an MoELayer."""

    def forward(self, input: torch.Tensor, /) -> tuple[torch.Tensor, torch.Tensor]:
        """Forward pass of the router.

        Returns:
            A tuple of (probabilities, routing_map).
        """
        ...

    def set_layer_number(self, layer_number: int) -> None:
        """Set the layer number for the router.

        Called from transformer_layer during initialization.
        """
        ...


class RouterBuilder(Protocol):
    """Protocol for building a Router."""

    def __call__(
        self, /, *, config: TransformerConfig, pg_collection: ProcessGroupCollection | None
    ) -> RouterInterface: ...


@dataclass
class MoESubmodules:
    """MoE Layer Submodule spec"""

    experts: Union[ModuleSpec, type] = None
    shared_experts: Union[ModuleSpec, type] = None
    router: RouterBuilder = TopKRouter


class BaseMoELayer(MegatronModule, ABC):
    """Base class for a mixture of experts layer.

    Args:
        config (TransformerConfig): Configuration object for the transformer model.
    """

    def __init__(
        self,
        config: TransformerConfig,
        layer_number: Optional[int] = None,
        pg_collection: Optional[ProcessGroupCollection] = None,
        is_mtp_layer: bool = False,
    ):
        super(BaseMoELayer, self).__init__(config)
        self.config = config
        self.layer_number = layer_number
        self.is_mtp_layer = is_mtp_layer
        self.ep_group = pg_collection.ep
        # use pg_collection.expt_tp_group as tensor parallel group in this module.
        self.attn_tp_group = pg_collection.tp
        ep_size = utils.get_pg_size(self.ep_group)
        ep_rank = utils.get_pg_rank(self.ep_group)
        assert ep_size > 0, "Expected non-negative expert parallel size"

        assert self.config.num_moe_experts % ep_size == 0
        self.num_local_experts = self.config.num_moe_experts // ep_size
        local_expert_indices_offset = ep_rank * self.num_local_experts

        self.use_shared_expert = self.config.moe_shared_expert_intermediate_size is not None
        self.shared_expert_overlap = self.config.moe_shared_expert_overlap

        self.local_expert_indices = [
            local_expert_indices_offset + i for i in range(self.num_local_experts)
        ]
        assert all(map(lambda x: x < self.config.num_moe_experts, self.local_expert_indices))
        self.router: RouterInterface = None
        self.experts = None
        self.shared_experts = None
        self.token_dispatcher: Optional[MoETokenDispatcher] = None
        self.layer_number = layer_number

    @abstractmethod
    def forward(self, hidden_states):
        """Forward method for the MoE layer."""
        pass

    def set_layer_number(self, layer_number: int):
        """Set the layer number for the MoE layer."""
        self.layer_number = layer_number
        self.router.set_layer_number(layer_number)


class MoELayer(BaseMoELayer):
    """Mixture of Experts layer.

    This layer implements a Mixture of Experts model, where each token is routed to a
    subset of experts. This implementation supports different token dispatching
    strategies such as All-to-All and All-Gather.
    """

    def __init__(
        self,
        config: TransformerConfig,
        submodules: Optional[MoESubmodules] = None,
        layer_number: Optional[int] = None,
        pg_collection: Optional[ProcessGroupCollection] = None,
        is_mtp_layer: bool = False,
    ):
        self.submodules = submodules
        # TODO(Hepteract): delete the usage of the global parallel_state.
        # Initialize process groups with the global parallel_state.
        if pg_collection is None:
            pg_collection = get_default_pg_collection()
        super(MoELayer, self).__init__(
            config=config,
            layer_number=layer_number,
            pg_collection=pg_collection,
            is_mtp_layer=is_mtp_layer,
        )
        # If using mcore cudagraphs, recompute is handled by transformer_layer.MoETransformerLayer
        self.moe_layer_recompute = (
            config.recompute_granularity == 'selective'
            and "moe" in config.recompute_modules
            and config.cuda_graph_impl != 'local'
        )
        self.shared_experts_recompute = (
            config.recompute_granularity == 'selective'
            and "shared_experts" in config.recompute_modules
        )

        self.tp_group = pg_collection.tp

        # Initialize router.
        self.router = submodules.router(
            config=self.config, pg_collection=pg_collection, is_mtp_layer=is_mtp_layer
        )
        self.tp_group = pg_collection.tp

        # Initialize latent projections.
        if self.config.moe_latent_size:
            assert HAVE_TE, "TransformerEngine is required for MoE latent projections."
            self.fc1_latent_proj = TELinear(
                self.config.hidden_size,
                self.config.moe_latent_size,
                parallel_mode="duplicated",
                config=self.config,
                init_method=self.config.init_method,
                bias=self.config.add_bias_linear,
                skip_bias_add=False,
                skip_weight_param_allocation=False,
                is_expert=False,
            )
            self.fc2_latent_proj = TELinear(
                self.config.moe_latent_size,
                self.config.hidden_size,
                parallel_mode="duplicated",
                config=self.config,
                init_method=self.config.output_layer_init_method,
                bias=self.config.add_bias_linear,
                skip_bias_add=False,
                skip_weight_param_allocation=False,
                is_expert=False,
            )

        # Initialize token dispatcher
        if config.moe_token_dispatcher_type == "allgather":
            self.token_dispatcher = MoEAllGatherTokenDispatcher(
                self.num_local_experts,
                self.local_expert_indices,
                config=self.config,
                pg_collection=pg_collection,
            )
        elif config.moe_token_dispatcher_type == "alltoall":
            self.token_dispatcher = MoEAlltoAllTokenDispatcher(
                self.num_local_experts,
                self.local_expert_indices,
                config=self.config,
                pg_collection=pg_collection,
            )
        elif config.moe_token_dispatcher_type == "flex":
            self.token_dispatcher = MoEFlexTokenDispatcher(
                self.num_local_experts,
                self.local_expert_indices,
                config=self.config,
                pg_collection=pg_collection,
            )
        else:
            raise ValueError(
                f"Unsupported token dispatcher type: {config.moe_token_dispatcher_type}"
            )

        # Initialize experts
        self.experts = build_module(
            self.submodules.experts,
            self.num_local_experts,
            self.config,
            pg_collection=pg_collection,
        )

        # Initialize shared experts
        if self.use_shared_expert:
            self.shared_experts = build_module(
                self.submodules.shared_experts,
                config=self.config,
                pg_collection=pg_collection,
                gate=self.config.moe_shared_expert_gate,
            )
            if self.shared_expert_overlap:
                self.token_dispatcher.set_shared_experts(self.shared_experts)

        # Inference-optimized mode setup
        if config.transformer_impl == "inference_optimized":
            assert (
                HAVE_FLASHINFER
            ), "flashinfer-python is required for inference-optimized MoE implementation."
            if not HAVE_FLASHINFER_CUBIN_AND_JIT_CACHE:
                warnings.warn(
                    "flashinfer-cubin and/or flashinfer-jit-cache not found. "
                    "The FlashInfer cutlass kernel will be JIT compiled,"
                    "which may take a long time."
                )
            self._setup_inference_mode(pg_collection)

        # Cudagraph tensor store for resuming the forward pass from the end of the cudagraph.
        self.cudagraph_tensor_store = MoECudaGraphTensorStore()
        self.fwd_execution_map = ["route", "expert_compute", "postprocess"]

        # Delay wgrad computation for TE grouped GEMM
        self._delayed_wgrad_event: Optional[torch.cuda.Event] = None
        self._delayed_wgrad_stream: Optional[torch.cuda.Stream] = None
        self._process_expert_grads_fn = None
        if self.config.delay_wgrad_compute_for_te_grouped_gemm:
            self._delayed_wgrad_event = torch.cuda.Event()
            self._delayed_wgrad_stream = torch.cuda.Stream(device="cuda")

    def _setup_inference_mode(self, pg_collection):
        """Set up inference-optimized token dispatcher and state.

        Called from __init__ when config.transformer_impl == "inference_optimized".
        Creates an InferenceCUDAGraphTokenDispatcher alongside the standard dispatcher,
        which is swapped in during CUDA-graphed forward passes.
        """

        assert self.config.moe_token_dispatcher_type == "alltoall", (
            f"Inference-optimized MoE requires 'alltoall' dispatcher, "
            f"got '{self.config.moe_token_dispatcher_type}'"
        )
        self.is_inference_cuda_graphed_iteration = False
        self._inference_token_dispatcher = InferenceCUDAGraphTokenDispatcher(
            self.num_local_experts,
            self.local_expert_indices,
            config=self.config,
            pg_collection=pg_collection,
        )

    def set_inference_cuda_graphed_iteration(self):
        """Enable CUDA-graphed iteration mode on this layer, its router, and its experts.

        Swaps in the inference-optimized token dispatcher and disables
        shared expert overlap.
        """
        self.is_inference_cuda_graphed_iteration = True
        if hasattr(self.router, "set_inference_cuda_graphed_iteration"):
            self.router.set_inference_cuda_graphed_iteration()
        if hasattr(self.experts, "set_inference_cuda_graphed_iteration"):
            self.experts.set_inference_cuda_graphed_iteration()

        if self._inference_token_dispatcher is not None:
            self._saved_token_dispatcher = self.token_dispatcher
            self.token_dispatcher = self._inference_token_dispatcher
            self._saved_shared_expert_overlap = self.shared_expert_overlap
            self.shared_expert_overlap = False

    def unset_inference_cuda_graphed_iteration(self):
        """Disable CUDA-graphed iteration mode on this layer, its router, and its experts.

        Restores the standard token dispatcher and shared expert overlap setting.
        """
        self.is_inference_cuda_graphed_iteration = False
        if hasattr(self.router, "unset_inference_cuda_graphed_iteration"):
            self.router.unset_inference_cuda_graphed_iteration()
        if hasattr(self.experts, "unset_inference_cuda_graphed_iteration"):
            self.experts.unset_inference_cuda_graphed_iteration()

        if hasattr(self, "_saved_token_dispatcher"):
            self.token_dispatcher = self._saved_token_dispatcher
            self.shared_expert_overlap = self._saved_shared_expert_overlap

    @maybe_skip_or_early_return_by_cudagraph("route")
    def route(self, hidden_states: torch.Tensor, padding_mask: Optional[torch.Tensor] = None):
        """Compute token routing for preprocessing.

        This method uses the router to determine which experts to send each token to,
        producing routing probabilities and a mapping.
        """
        probs, routing_map = apply_module(self.router)(hidden_states, padding_mask)
        return probs, routing_map

    @maybe_skip_or_early_return_by_cudagraph("preprocess")
    def preprocess(
        self, hidden_states: torch.Tensor, probs: torch.Tensor, routing_map: torch.Tensor
    ):
        """Preprocess token routing for dispatch.

        This method preprocesses the hidden states and routing probabilities for the token
        dispatcher.
        """
        # Project the hidden_states from hidden dimension down to latent dimenion.
        if self.config.moe_latent_size:
            assert (
                not self.shared_expert_overlap
            ), "Shared expert overlap not supported when MoE latent projections are used."
            hidden_states, _ = self.fc1_latent_proj(hidden_states)
        hidden_states, probs = self.token_dispatcher.dispatch_preprocess(
            hidden_states, routing_map, probs
        )
        return hidden_states, probs

    def dispatch(self, hidden_states: torch.Tensor, probs: torch.Tensor):
        """Dispatches tokens to assigned expert ranks via communication.

        This method performs the actual communication (e.g., All-to-All) to distribute
        tokens and their associated probabilities to the devices hosting their assigned
        experts.
        """
        if self.config.delay_wgrad_compute_for_te_grouped_gemm:
            hidden_states = _RegisterDelayedWgradForExperts.apply(self, hidden_states)
        return self.token_dispatcher.token_dispatch(hidden_states, probs)

    @maybe_skip_or_early_return_by_cudagraph("shared_experts_compute")
    def shared_experts_compute(self, hidden_states: torch.Tensor):
        """Computes the output of the shared experts.

        If a shared expert is configured and not overlapped with communication,
        it is computed here.
        """
        shared_expert_output = None
        if self.use_shared_expert and not self.shared_expert_overlap:
            # Compute the shared expert separately when not overlapped with communication.
            if self.shared_experts_recompute:
                if self.config.fp8 or self.config.fp4:
                    shared_expert_output = te_checkpoint(
                        self.shared_experts,
                        False,
                        tensor_parallel.random.get_cuda_rng_tracker,
                        parallel_state.get_tensor_model_parallel_group(),
                        hidden_states,
                    )
                else:
                    shared_expert_output = tensor_parallel.checkpoint(
                        self.shared_experts, False, hidden_states
                    )
            else:
                shared_expert_output = self.shared_experts(hidden_states)

        return shared_expert_output

    @internal_api
    def routed_experts_compute(self, hidden_states: torch.Tensor, probs: torch.Tensor):
        """Computes the output of the routed experts on the dispatched tokens.

        This method first post-processes the dispatched input to get permuted tokens
        for each expert. It then passes the tokens through the local experts.
        The output from the experts is preprocessed for the combine step.
        """
        if self.config.delay_wgrad_compute_for_te_grouped_gemm:
            hidden_states = _RecordExpertDgradCompletion.apply(
                self._delayed_wgrad_event, hidden_states
            )
        dispatched_input, tokens_per_expert, permuted_probs = (
            self.token_dispatcher.dispatch_postprocess(hidden_states, probs)
        )
        if (
            hasattr(self, "_inference_token_dispatcher")
            and self.is_inference_cuda_graphed_iteration
        ):
            routing_map = self.token_dispatcher.routing_map
            expert_output, mlp_bias = self.experts(
                dispatched_input, tokens_per_expert, permuted_probs, routing_map=routing_map
            )
        else:
            expert_output, mlp_bias = self.experts(
                dispatched_input, tokens_per_expert, permuted_probs
            )
        assert mlp_bias is None, f"mlp_bias is not supported for {type(self.token_dispatcher)}"
        output = self.token_dispatcher.combine_preprocess(expert_output)

        return output, mlp_bias

    def combine(self, output: torch.Tensor):
        """Combines expert outputs via communication and adds shared expert output.

        This method uses the token dispatcher to combine the outputs from different
        experts (e.g., via an All-to-All communication).
        """
        output = self.token_dispatcher.token_combine(output)
        return output

    def postprocess(self, output: torch.Tensor, shared_expert_output: Optional[torch.Tensor]):
        """Project the output back from latent dimension to hidden dimension after combine
        in latent dimension if needed. Combine expert output with shared_experts if needed."""

        output = self.token_dispatcher.combine_postprocess(output)
        if self.config.moe_latent_size:
            output, _ = self.fc2_latent_proj(output)

        if shared_expert_output is not None:
            output = output + shared_expert_output
        return output

    def router_and_preprocess(self, hidden_states: torch.Tensor):
        """This method is a combined method of route and preprocess. Deprecated."""

        probs, routing_map = self.route(hidden_states)
        hidden_states, probs, residual = self.preprocess(hidden_states, probs, routing_map)
        return hidden_states, probs, residual

    def forward(
        self,
        hidden_states: torch.Tensor,
        intermediate_tensors=None,
        padding_mask: Optional[torch.Tensor] = None,
    ):
        """Forward pass for the MoE layer.

        The forward pass comprises four main steps:
        1. Routing & Preprocessing: Route tokens to the assigned experts and prepare for dispatch.
        2. Dispatch: Tokens are sent to the expert devices using communication collectives.
        3. Expert Computation: Experts process the dispatched tokens.
        4. Combine: The outputs from the experts are combined and returned.

        Args:
            hidden_states (torch.Tensor): The input tensor shape [seq_length, bsz, hidden_size].
            padding_mask (torch.Tensor, optional): Boolean mask indicating non-padding tokens.
                                                   Shape [seq_length, bsz]. True for valid tokens,
                                                   False for padding tokens. Defaults to None.
        Returns:
            A tuple containing the output tensor and the MLP bias, if any.
        """
        if self.training and self.attn_tp_group.size() > 1 and not self.config.sequence_parallel:
            raise ValueError(
                "During training, performance may degrade if MoE and tensor parallelism"
                "are enabled without also enabling sequence parallelism."
            )
        # Transpose from [bsz, seq_length] to [seq_length, bsz] to align with hidden_states
        if padding_mask is not None:
            padding_mask = padding_mask.transpose(0, 1).bool()

        # MoE forward: route -> dispatch -> compute -> combine
        def custom_forward(hidden_states, intermediate_tensors=None, padding_mask=None):
            try:
                if "route" in self.fwd_execution_map:
                    shared_expert_output = self.shared_experts_compute(hidden_states)
                    probs, routing_map = self.route(hidden_states, padding_mask)
                    hidden_states, probs = self.preprocess(hidden_states, probs, routing_map)

                    if intermediate_tensors is not None:
                        return hidden_states, probs, shared_expert_output

            except MoECudaGraphPartialCaptureSignal as e:
                # This signal is raised from the maybe_skip_or_early_return_by_cudagraph decorator.
                # It means we should early-return from the MoE layer forward pass.
                # This happens when we are partially capturing the CUDA graph of the MoE layer,
                # like cuda_graph_scope=["moe_router", "moe_preprocess"].
                # We need to return the intermediate tensors as CUDA graph outputs.
                return e.get_early_return_outputs(hidden_states, shared_expert_output)

            if "expert_compute" in self.fwd_execution_map:
                if intermediate_tensors is not None:
                    hidden_states, probs = intermediate_tensors

                dispatched_input, probs = self.dispatch(hidden_states, probs)
                output, mlp_bias = self.routed_experts_compute(dispatched_input, probs)
                assert (
                    mlp_bias is None
                ), f"mlp_bias is not supported for {type(self.token_dispatcher)}"
                output = self.combine(output)

                if intermediate_tensors is not None:
                    return output, mlp_bias

            if "postprocess" in self.fwd_execution_map:
                if intermediate_tensors is not None:
                    output, shared_expert_output = intermediate_tensors

                output = self.postprocess(output, shared_expert_output)

                if intermediate_tensors is not None:
                    return output

            return output, mlp_bias

        if self.moe_layer_recompute and self.training:
            if self.config.fp8 or self.config.fp4:
                outputs = te_checkpoint(
                    custom_forward,
                    False,
                    tensor_parallel.random.get_cuda_rng_tracker,
                    parallel_state.get_tensor_model_parallel_group(),
                    hidden_states,
                    intermediate_tensors,
                    padding_mask,
                )
            else:
                outputs = tensor_parallel.checkpoint(
                    custom_forward, False, hidden_states, intermediate_tensors, padding_mask
                )
        else:
            outputs = custom_forward(hidden_states, intermediate_tensors, padding_mask)

        return outputs

    def backward_dw(self, routed_experts: bool = True, shared_experts: bool = False):
        """Compute weight gradients for experts and shared experts."""
        # TODO(Wohox): replace the "routed_experts" and "shared_experts" arguments with better
        # naming to better explain that they are actually from different fine-grained callables,
        # or use scanning to decide which backward_dw should be called.
        if routed_experts:
            self.experts.backward_dw()
            if self.config.moe_latent_size:
                # TODO(Wohox): fc2_latent_proj forward and backward are executed in comm stream,
                # so we execute its backward_dw in the comm stream too. But this may harm the
                # EP overlap performance. Better to check if there is a better way to handle this.
                from megatron.core.pipeline_parallel.utils import get_comm_stream

                comm_stream = get_comm_stream()
                with torch.cuda.stream(comm_stream):
                    self.fc2_latent_proj.backward_dw()
        if shared_experts:
            if self.use_shared_expert and not self.shared_expert_overlap:
                self.shared_experts.backward_dw()
            if self.config.moe_latent_size:
                self.fc1_latent_proj.backward_dw()

    def set_for_recompute_pre_mlp_layernorm(self):
        """Set the MoE layer for recompute pre_mlp_layernorm. Only needed for fp8/fp4."""
        # If shared_experts_recompute is used, nothing needs to be done because the checkpoint
        # function will save the original input tensors.
        if self.shared_experts is not None and not self.shared_experts_recompute:
            from megatron.core.extensions.transformer_engine import set_save_original_input

            set_save_original_input(self.shared_experts.linear_fc1)

    def register_process_expert_grads_fn(self, fn):
        """Register a callback to process expert gradients after delayed wgrad computation.

        This is used by FSDP to defer the reduce-scatter of expert parameter
        gradients until the delayed wgrad computation has completed.

        Args:
            fn: A callable that processes expert gradients (e.g., triggers
                FSDP reduce-scatter for expert parameters).
        """
        self._process_expert_grads_fn = fn


class _RecordExpertDgradCompletion(torch.autograd.Function):
    """Autograd function that records a CUDA event when expert data gradients finish.

    Placed in the forward graph just before the expert computation so that during
    the backward pass, when the expert dgrad completes, we record an event. The
    subsequent ``_RegisterDelayedWgradForExperts`` waits on this event before
    launching the delayed wgrad computation on a separate CUDA stream.
    """

    @staticmethod
    def forward(ctx, event: torch.cuda.Event, *inputs):
        """Forward pass that stores the event and passes through inputs unchanged."""
        ctx.event = event
        return inputs[0] if len(inputs) == 1 else inputs

    @staticmethod
    def backward(ctx, *grad_outputs):
        """Backward pass that records the event when expert dgrad completes."""
        ctx.event.record(torch.cuda.current_stream())
        ctx.event = None
        return (None,) + grad_outputs


class _RegisterDelayedWgradForExperts(torch.autograd.Function):
    """Autograd function that orchestrates delayed wgrad computation for MoE experts.

    Placed in the forward graph at the dispatch boundary. During the backward pass,
    this function:
      1. Records an event on the current (backward) stream to signal the dgrad is done.
      2. Executes the delayed wgrad computation on a dedicated CUDA stream.
      3. Waits for the wgrad computation to complete.
      4. Invokes the registered gradient processing callback (e.g., FSDP reduce-scatter).
    """

    @staticmethod
    def forward(ctx, module: MoELayer, *inputs):
        """Forward pass that stores the MoE module and passes through inputs unchanged."""
        ctx.module = module
        return inputs[0] if len(inputs) == 1 else inputs

    @staticmethod
    def backward(ctx, *grad_outputs):
        """Backward pass that executes delayed wgrad computation on a separate stream."""
        module = ctx.module
        event = module._delayed_wgrad_event
        wgrad_stream = module._delayed_wgrad_stream

        wgrad_stream.wait_event(event)
        with torch.cuda.stream(wgrad_stream):
            with torch.cuda.nvtx.range("delayed_expert_wgrad"):
                module.backward_dw(routed_experts=True, shared_experts=False)
            event.record(wgrad_stream)

        torch.cuda.current_stream().wait_event(event)

        if module._process_expert_grads_fn is not None:
            module._process_expert_grads_fn()

        ctx.module = None
        return (None,) + grad_outputs