xinhe-nv · pull · Mar 9, 2026 · Mar 9, 2026 · Mar 9, 2026 · Mar 9, 2026
diff --git a/3rdparty/CMakeLists.txt b/3rdparty/CMakeLists.txt
@@ -30,6 +30,7 @@ foreach(DEP_IDX RANGE ${DEP_COUNT_MINUS_ONE})
   string(JSON DEP_SOURCE_SUBDIR          ERROR_VARIABLE _err GET "${DEP_OBJECT}" "source_subdir")
   string(JSON DEP_GIT_SUBMODULES_RECURSE ERROR_VARIABLE _err GET "${DEP_OBJECT}" "git_submodules_recurse")
   string(JSON DEP_USE_URL                ERROR_VARIABLE _err GET "${DEP_OBJECT}" "use_url")
+  string(JSON DEP_PATCH_FILE            ERROR_VARIABLE _err GET "${DEP_OBJECT}" "patch_file")
   # cmake-format: on
 
   # Build FetchContent_Declare arguments
@@ -53,6 +54,30 @@ foreach(DEP_IDX RANGE ${DEP_COUNT_MINUS_ONE})
     list(APPEND FETCH_ARGS SOURCE_SUBDIR "${DEP_SOURCE_SUBDIR}")
   endif()
 
+  # Optional per-dependency patch: when the JSON entry has a "patch_file"
+  # (path relative to this directory), apply it to the fetched sources.
+  # NOTE(review): `patch --forward` exits non-zero when the patch is already
+  # applied, so a re-run of the patch step on an already-patched tree may fail
+  # -- confirm against the FetchContent update/patch behaviour in use.
+  if(DEP_PATCH_FILE AND NOT DEP_PATCH_FILE STREQUAL "")
+    set(DEP_PATCH_PATH "${CMAKE_CURRENT_SOURCE_DIR}/${DEP_PATCH_FILE}")
+    # Fail at configure time with a clear message instead of a late, cryptic
+    # failure inside the patch step.
+    if(NOT EXISTS "${DEP_PATCH_PATH}")
+      message(FATAL_ERROR "patch_file not found: ${DEP_PATCH_PATH}")
+    endif()
+    list(
+      APPEND
+      FETCH_ARGS
+      PATCH_COMMAND
+      patch
+      -p1
+      --forward
+      --batch
+      -i
+      "${DEP_PATCH_PATH}")
+  endif()
+
   FetchContent_Declare(${FETCH_ARGS})
 
   # Special handling: Export deep_ep commit to global property

diff --git a/3rdparty/fetch_content.json b/3rdparty/fetch_content.json
@@ -93,9 +93,10 @@
     {
       "name": "xgrammar",
       "git_repository": "https://github.com/mlc-ai/xgrammar",
-      "git_tag": "v0.1.25",
+      "git_tag": "v0.1.32",
       "git_shallow": true,
-      "source_subdir": "dont-add-this-project-with-add-subdirectory"
+      "source_subdir": "dont-add-this-project-with-add-subdirectory",
+      "patch_file": "patches/xgrammar_constexpr.patch"
     }
   ]
 }
diff --git a/3rdparty/patches/xgrammar_constexpr.patch b/3rdparty/patches/xgrammar_constexpr.patch
@@ -0,0 +1,19 @@
+--- a/cpp/grammar_functor.cc
++++ b/cpp/grammar_functor.cc
+@@ -1750,11 +1750,11 @@
+   void Apply(Grammar* grammar);
+   static std::optional<uint64_t> HashSequence(const Grammar& grammar, int32_t sequence_id);
+
+-  static const int16_t kNotEndStateFlag = -0x100;
+-  static const int16_t kEndStateFlag = -0x200;
+-  static const int16_t kSelfRecursionFlag = -0x300;
+-  static const int16_t kSimpleCycleFlag = -0x400;
+-  static const int16_t kUnKnownFlag = -0x500;
++  static constexpr int16_t kNotEndStateFlag = -0x100;
++  static constexpr int16_t kEndStateFlag = -0x200;
++  static constexpr int16_t kSelfRecursionFlag = -0x300;
++  static constexpr int16_t kSimpleCycleFlag = -0x400;
++  static constexpr int16_t kUnKnownFlag = -0x500;
+
+  private:
+   Grammar* grammar_;
diff --git a/ATTRIBUTIONS-Python.md b/ATTRIBUTIONS-Python.md
@@ -63471,7 +63471,7 @@ SOFTWARE.
   - `Homepage`: https://github.com/akshaynagpal/w2n
 
 
-## xgrammar (0.1.25)
+## xgrammar (0.1.32)
 
 ### Licenses
 License: `Apache 2.0`

diff --git a/cpp/kernels/fmha_v2/setup.py b/cpp/kernels/fmha_v2/setup.py
@@ -3333,11 +3333,14 @@ def use_cubin_header(sm,
                      head_size,
                      dtype,
                      output_dtype=None,
-                     enable_skip_softmax=False):
+                     enable_skip_softmax=False,
+                     attention_mask_type=None):
     if enable_skip_softmax:
         return False
     if 'e4m3' in dtype and output_dtype in ['bf16', 'fp16']:
         return False
+    if attention_mask_type == AttentionMaskType.BIDIRECTIONAL_SLIDING_WINDOW:
+        return False
     return (sm == 90 and head_size == 128) or (sm == 89 and 'e4m3' in dtype)
 
 
@@ -3349,9 +3352,11 @@ def get_cubin_header(kernel_traits, specs_names):
     cubin_lens_dict = {}
     launchers_dict = {}
     for kspec, fname, lname, kname in specs_names:
+        mask_type = AttentionMaskType.BIDIRECTIONAL_SLIDING_WINDOW \
+            if '_bidirectional_sliding_window' in kname else None
         if generate_cu_trtllm and not use_cubin_header(
                 kspec.sm, kspec.head_size, kspec.dtype, kspec.output_dtype,
-                kspec.enable_skip_softmax):
+                kspec.enable_skip_softmax, mask_type):
             continue
         name = fname.replace('.', '_')
         data = 'extern unsigned char cubin_{name}_cubin[];'.format(name=name)
@@ -3487,7 +3492,8 @@ def get_cubin_header(kernel_traits, specs_names):
         return_softmax_stats_flag = pythonBoolean2cpp[sm != '90' or (
             sm == '90' and '_softmax' in kname)]
 
-        enable_skip_softmax_flag = pythonBoolean2cpp['_skipSoftmax' in kname]
+        enable_skip_softmax = '_skipSoftmax' in kname
+        enable_skip_softmax_flag = pythonBoolean2cpp[enable_skip_softmax]
 
         # meta_unroll_step
         meta_unroll_step = unroll_step if ('_nl' in kname
@@ -3516,7 +3522,8 @@ def get_cubin_header(kernel_traits, specs_names):
                 def get_lname_from_kname(kname: str) -> str:
                     if use_cubin_header(int(sm), int(head_size), prec.lower(),
                                         output_prec.lower(),
-                                        enable_skip_softmax_flag):
+                                        enable_skip_softmax,
+                                        attention_mask_type):
                         return 'nullptr'
                     lname = kname.replace('_kernel', '')
                     mask_types = [
@@ -3537,9 +3544,9 @@ def get_lname_from_kname(kname: str) -> str:
 {cubin_name}_len, \"{kname}\", {smem}, {threads}, {meta_unroll_step}, {attention_mask_type_value}, \
 {attention_input_layout_value}, {is_il}, {is_flash_atten}, {is_warp_specialization}, {is_fp32_accu}, \
 {is_alibi_supported}, {is_tiled}, {has_softcapping_scale}, {return_softmax_stats_flag}, {enable_skip_softmax_flag}, {lname}}}\
-'''.format(**locals()) if use_cubin_header(int(sm), int(head_size),
-                                           prec.lower(), output_prec.lower(),
-                                           enable_skip_softmax_flag) else '''\
+'''.format(**locals()) if use_cubin_header(
+                    int(sm), int(head_size), prec.lower(), output_prec.lower(),
+                    enable_skip_softmax, attention_mask_type) else '''\
 {{ DATA_TYPE_{prec}, DATA_TYPE_{output_prec}, {seq_len}, {q_step}, {kv_step}, {head_size}, {head_size_v}, \
 {sage_block_sizes[0]}, {sage_block_sizes[1]}, {sage_block_sizes[2]}, kSM_{sm}, nullptr, \
 0, \"{kname}\", {smem}, {threads}, {meta_unroll_step}, {attention_mask_type_value}, \

diff --git a/examples/auto_deploy/model_registry/models.yaml b/examples/auto_deploy/model_registry/models.yaml
@@ -181,8 +181,6 @@ models:
   yaml_extra: ['dashboard_default.yaml', 'world_size_8.yaml', 'simple_shard_only.yaml']
 - name: deepseek-ai/DeepSeek-R1
   yaml_extra: ['dashboard_default.yaml', 'world_size_8.yaml', 'num_hidden_layers_5.yaml']
-- name: deepseek-ai/DeepSeek-V3
-  yaml_extra: ['dashboard_default.yaml', 'world_size_8.yaml', 'num_hidden_layers_5.yaml']
 - name: deepseek-ai/DeepSeek-Coder-V2-Instruct
   yaml_extra: ['dashboard_default.yaml', 'world_size_8.yaml']
 - name: Qwen/Qwen3-VL-8B-Instruct

diff --git a/requirements.txt b/requirements.txt
@@ -56,7 +56,7 @@ patchelf
 einops
 flashinfer-python==0.6.4
 opencv-python-headless
-xgrammar==0.1.25
+xgrammar==0.1.32
 llguidance==0.7.29
 jsonschema
 backoff

diff --git a/scripts/attribution/data/dependency_metadata.yml b/scripts/attribution/data/dependency_metadata.yml
@@ -90,7 +90,7 @@ ucx/1.20:
 ucxx/16eaa57c8d98c8ef54d666a2d2b11e76cfa565f5:
   license: 759cb066f14805ef4068f633d9071e1d
   source: https://github.com/rapidsai/ucxx/tree/16eaa57c8d98c8ef54d666a2d2b11e76cfa565f5
-xgrammar/v0.1.25:
+xgrammar/v0.1.32:
   copyright: 989a9441d689f61fba9f797cc253e51b
   license: 8e1c96809a7467593130ecc62ae12be9
 zeromq/4.3.4-3.el8:

diff --git a/scripts/attribution/data/files_to_dependency.yml b/scripts/attribution/data/files_to_dependency.yml
@@ -7507,7 +7507,7 @@ ucxx/16eaa57c8d98c8ef54d666a2d2b11e76cfa565f5:
 - cb11c17f716ae644b2d74652d3d3232b
 - e10a9fefe2ef09b9560cc204bd54f728
 - f75f9c7cefa54d6626032daa93ef2549
-xgrammar/v0.1.25:
+xgrammar/v0.1.32:
 - 0e2b512f384e122c3b8243ae00256e06
 - 1a6e20d89e227a29d674e12e18b5e9e7
 - 1acd98aa4050fd0b8cda58d5d4f6ef78