diff --git a/3rdparty/CMakeLists.txt b/3rdparty/CMakeLists.txt
index af2ccf9d693a..6cb42d012b8f 100644
--- a/3rdparty/CMakeLists.txt
+++ b/3rdparty/CMakeLists.txt
@@ -55,16 +55,15 @@ foreach(DEP_IDX RANGE ${DEP_COUNT_MINUS_ONE})
     endif()
 
     if(DEP_PATCH_FILE AND NOT DEP_PATCH_FILE STREQUAL "")
+        set(_patch_file "${CMAKE_CURRENT_SOURCE_DIR}/${DEP_PATCH_FILE}")
         list(
             APPEND
             FETCH_ARGS
             PATCH_COMMAND
-            patch
-            -p1
-            --forward
-            --batch
-            -i
-            "${CMAKE_CURRENT_SOURCE_DIR}/${DEP_PATCH_FILE}")
+            bash
+            -c
+            "patch -p1 --forward --batch --dry-run -i '${_patch_file}' && patch -p1 --forward --batch -i '${_patch_file}' || echo 'Patch already applied, skipping.'"
+        )
     endif()
 
     FetchContent_Declare(${FETCH_ARGS})
diff --git a/cpp/include/tensorrt_llm/executor/types.h b/cpp/include/tensorrt_llm/executor/types.h
index 89618dce540f..77f910455c57 100644
--- a/cpp/include/tensorrt_llm/executor/types.h
+++ b/cpp/include/tensorrt_llm/executor/types.h
@@ -243,6 +243,9 @@ enum class ContextChunkingPolicy
     /// @brief Iterate through each context request in sequence and attempt to increase its chunk
    /// count until the constraint is exceeded.
     kEQUAL_PROGRESS = 1,
+
+    /// @brief Force every context request to have a chunk size of `unit_size` or 0, unless it is the last chunk (which may be smaller).
+    kFORCE_CHUNK = 2,
 };
 
 std::ostream& operator<<(std::ostream& os, ContextChunkingPolicy policy);
diff --git a/cpp/tensorrt_llm/batch_manager/microBatchScheduler.cpp b/cpp/tensorrt_llm/batch_manager/microBatchScheduler.cpp
index 92d4589e0b66..40b760c3cb0a 100644
--- a/cpp/tensorrt_llm/batch_manager/microBatchScheduler.cpp
+++ b/cpp/tensorrt_llm/batch_manager/microBatchScheduler.cpp
@@ -217,16 +217,52 @@ void MicroBatchScheduler::setCtxRequestsChunkSize
+template <>
+void MicroBatchScheduler::setCtxRequestsChunkSize<ContextChunkingPolicy::kFORCE_CHUNK>(
+    RequestVector& contextsToBeChunked, std::optional<SizeType32> ctxTokensCapacity, SizeType32 const chunkUnitSize,
+    std::optional<SizeType32> const& maxContextLength)
+{
+    if (maxContextLength && maxContextLength.value() < chunkUnitSize)
+    {
+        TLLM_THROW(
+            "The forced chunk size (%d) exceeds the max context length (%d)", chunkUnitSize, maxContextLength.value());
+    }
+    SizeType32 totalTokens{0};
+    for (auto& llmReq : contextsToBeChunked)
+    {
+        SizeType32 const chunkSize = std::min(llmReq->getContextRemainingLength(), chunkUnitSize);
+        if (ctxTokensCapacity && totalTokens + chunkSize > ctxTokensCapacity.value())
+        {
+            llmReq->setContextChunkSize(0);
+        }
+        else
+        {
+            llmReq->setContextChunkSize(chunkSize);
+            totalTokens += llmReq->getContextChunkSize();
+        }
+    }
+}
+
 // Entry point for chunk-size assignment. Resets all chunk sizes to zero, then
 // dispatches to the appropriate policy-specific implementation:
 //
-// kEQUAL_PROGRESS — all requests advance together one chunkUnitSize at a time.
+// kEQUAL_PROGRESS          — all requests advance together one chunkUnitSize at a time.
 // kFIRST_COME_FIRST_SERVED — requests are served greedily in order until the budget
 // is exhausted.
+// kFORCE_CHUNK             — every request gets exactly min(remaining, chunkUnitSize)
+//                            tokens; budget is charged at face value (no reuse discount).
 //
-// Both policies are compute-aware: tokens covered by the reusable KV-cache prefix are
-// not charged against ctxTokensCapacity. See the individual template specialisations
-// above for full details.
+// EQUAL_PROGRESS and FIRST_COME_FIRST_SERVED are compute-aware: tokens covered by the
+// reusable KV-cache prefix are not charged against ctxTokensCapacity.
+// FORCE_CHUNK intentionally skips reuse accounting.
+// See the individual template specialisations above for full details.
 void MicroBatchScheduler::setCtxRequestsChunkSize(RequestVector& contextsToBeChunked,
     ContextChunkingPolicy const ctxChunkPolicy, std::optional<SizeType32> ctxTokensCapacity,
     SizeType32 const chunkUnitSize, std::optional<SizeType32> const& maxContextLength)
@@ -245,6 +281,10 @@ void MicroBatchScheduler::setCtxRequestsChunkSize(RequestVector& contextsToBeChu
         setCtxRequestsChunkSize<ContextChunkingPolicy::kFIRST_COME_FIRST_SERVED>(
             contextsToBeChunked, ctxTokensCapacity, chunkUnitSize, maxContextLength);
         break;
+    case ContextChunkingPolicy::kFORCE_CHUNK:
+        setCtxRequestsChunkSize<ContextChunkingPolicy::kFORCE_CHUNK>(
+            contextsToBeChunked, ctxTokensCapacity, chunkUnitSize, maxContextLength);
+        break;
     default: TLLM_THROW("The chunked scheduling type `NO_CHUNKING` cannot be performed.");
     }
@@ -384,6 +424,12 @@ std::tuple<RequestVector, RequestVector> MicroBatchScheduler::operator()(Request
         allContextRequestsFit = false;
     }
 
+    // For FORCE_CHUNK policy, always re-chunk regardless of whether all contexts fit.
+    if (mCtxChunkConfig && mCtxChunkConfig.value().chunkingPolicy == ContextChunkingPolicy::kFORCE_CHUNK)
+    {
+        allContextRequestsFit = false;
+    }
+
     // 2. If not all contexts fit into the batch, the chunk size should be adjusted accordingly.
     if (!allContextRequestsFit)
     {
diff --git a/cpp/tensorrt_llm/executor/types.cpp b/cpp/tensorrt_llm/executor/types.cpp
index 86b1b3d38312..e07c759e1b4d 100644
--- a/cpp/tensorrt_llm/executor/types.cpp
+++ b/cpp/tensorrt_llm/executor/types.cpp
@@ -38,6 +38,7 @@ std::ostream& operator<<(std::ostream& os, ContextChunkingPolicy policy)
     {
     case ContextChunkingPolicy::kEQUAL_PROGRESS: os << "EQUAL_PROGRESS"; break;
     case ContextChunkingPolicy::kFIRST_COME_FIRST_SERVED: os << "FIRST_COME_FIRST_SERVED"; break;
+    case ContextChunkingPolicy::kFORCE_CHUNK: os << "FORCE_CHUNK"; break;
     }
     return os;
 }
diff --git a/cpp/tensorrt_llm/nanobind/executor/bindings.cpp b/cpp/tensorrt_llm/nanobind/executor/bindings.cpp
index 4f873e2ed1b2..78c90a86ca37 100644
--- a/cpp/tensorrt_llm/nanobind/executor/bindings.cpp
+++ b/cpp/tensorrt_llm/nanobind/executor/bindings.cpp
@@ -94,7 +94,8 @@ void initBindings(nb::module_& m)
 
     nb::enum_<tle::ContextChunkingPolicy>(m, "ContextChunkingPolicy")
         .value("EQUAL_PROGRESS", tle::ContextChunkingPolicy::kEQUAL_PROGRESS)
-        .value("FIRST_COME_FIRST_SERVED", tle::ContextChunkingPolicy::kFIRST_COME_FIRST_SERVED);
+        .value("FIRST_COME_FIRST_SERVED", tle::ContextChunkingPolicy::kFIRST_COME_FIRST_SERVED)
+        .value("FORCE_CHUNK", tle::ContextChunkingPolicy::kFORCE_CHUNK);
 
     nb::enum_<tle::CommunicationType>(m, "CommunicationType").value("MPI", tle::CommunicationType::kMPI);
 
diff --git a/cpp/tensorrt_llm/thop/fp4BlockScaleMoe.cpp b/cpp/tensorrt_llm/thop/fp4BlockScaleMoe.cpp
index 3440a59737fc..35486c3be937 100644
--- a/cpp/tensorrt_llm/thop/fp4BlockScaleMoe.cpp
+++ b/cpp/tensorrt_llm/thop/fp4BlockScaleMoe.cpp
@@ -410,9 +410,11 @@ std::vector<torch::Tensor> run_fp4_block_scale_moe_runner(torch::optional