
Commit bfe6991

upd
1 parent 2491b2b commit bfe6991

2 files changed: 20 additions & 21 deletions


csrc/gdn_prefill_sm90_kernel_inst.jinja

Lines changed: 5 additions & 13 deletions
@@ -29,18 +29,10 @@
 namespace flat {
 
 // Explicit template instantiation for launch_delta_rule_prefill_kernel_gbai
-template void launch_delta_rule_prefill_kernel_gbai<
-    /*IsGVA=*/{{ is_gva }},
-    /*NeedsBeta=*/{{ needs_beta }},
-    /*NeedsAlpha=*/{{ needs_alpha }},
-    /*InitStateFromInput=*/{{ init_state }},
-    cutlass::arch::Sm90,
-    {{ dtype }}, {{ dtype }}, float>(
-    cudaStream_t stream, {{ dtype }}* output, float* output_state,
-    {{ dtype }} const* q, {{ dtype }} const* k, {{ dtype }} const* v,
-    float const* input_state, float const* alpha, float const* beta,
-    int64_t const* cu_seqlens, int32_t num_seqs, int32_t num_q_heads,
-    int32_t num_k_heads, int32_t num_v_heads, int32_t num_o_heads,
-    int32_t head_size, int64_t total_seqlen, float scale, int32_t sm_count);
+// Parameter types must exactly match the extern template declaration in prefill_kernel_delta_rule_sm90_extern.inc
+template void launch_delta_rule_prefill_kernel_gbai<{{ is_gva }}, {{ needs_beta }}, {{ needs_alpha }}, {{ init_state }}, cutlass::arch::Sm90, {{ dtype }}, {{ dtype }}, float>(
+    cudaStream_t, {{ dtype }}*, float*, {{ dtype }} const*, {{ dtype }} const*, {{ dtype }} const*,
+    float const*, float const*, float const*, int64_t const*, int32_t, int32_t,
+    int32_t, int32_t, int32_t, int32_t, int64_t, float, int32_t);
 
 } // namespace flat
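
Since this file is a Jinja template, the {{ ... }} placeholders in the instantiation above are filled in when the JIT layer renders the template. Below is a minimal rendering sketch; the placeholder values (the true/false flags and cutlass::half_t) are purely illustrative and not taken from this commit:

    # Hypothetical rendering of the instantiation template; the values passed
    # to render() are illustrative, not taken from this commit.
    from jinja2 import Template

    inst = Template(
        "template void launch_delta_rule_prefill_kernel_gbai<"
        "{{ is_gva }}, {{ needs_beta }}, {{ needs_alpha }}, {{ init_state }}, "
        "cutlass::arch::Sm90, {{ dtype }}, {{ dtype }}, float>(...);"
    )

    print(
        inst.render(
            is_gva="true",
            needs_beta="true",
            needs_alpha="false",
            init_state="true",
            dtype="cutlass::half_t",
        )
    )
    # -> template void launch_delta_rule_prefill_kernel_gbai<true, true, false, true,
    #    cutlass::arch::Sm90, cutlass::half_t, cutlass::half_t, float>(...);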

flashinfer/jit/core.py

Lines changed: 15 additions & 8 deletions
@@ -416,16 +416,29 @@ def gen_jit_spec(
     verbose_env = os.environ.get("FLASHINFER_JIT_VERBOSE", "0")
     debug = (debug_env if debug_env is not None else verbose_env) == "1"
 
-    cflags = ["-std=c++17", "-Wno-switch-bool"]
+    # Only add default C++ standard if not specified in extra flags
+    cflags_has_std = extra_cflags is not None and any(
+        f.startswith("-std=") for f in extra_cflags
+    )
+    cuda_cflags_has_std = extra_cuda_cflags is not None and any(
+        f.startswith("-std=") for f in extra_cuda_cflags
+    )
+
+    cflags = ["-Wno-switch-bool"]
+    if not cflags_has_std:
+        cflags.insert(0, "-std=c++17")
+
     cuda_cflags = [
-        "-std=c++17",
         f"--threads={os.environ.get('FLASHINFER_NVCC_THREADS', '1')}",
         "-use_fast_math",
         "-DFLASHINFER_ENABLE_F16",
         "-DFLASHINFER_ENABLE_BF16",
         "-DFLASHINFER_ENABLE_FP8_E4M3",
         "-DFLASHINFER_ENABLE_FP8_E5M2",
     ]
+    if not cuda_cflags_has_std:
+        cuda_cflags.insert(0, "-std=c++17")
+
     if debug:
         cflags += ["-O0", "-g"]
         cuda_cflags += [
@@ -446,14 +459,8 @@ def gen_jit_spec(
         cuda_cflags += ["-lineinfo"]
 
     if extra_cflags is not None:
-        # If extra_cflags contains a -std flag, remove the default one to avoid conflicts
-        if any(f.startswith("-std=") for f in extra_cflags):
-            cflags = [f for f in cflags if not f.startswith("-std=")]
         cflags += extra_cflags
     if extra_cuda_cflags is not None:
-        # If extra_cuda_cflags contains a -std flag, remove the default one to avoid conflicts
-        if any(f.startswith("-std=") for f in extra_cuda_cflags):
-            cuda_cflags = [f for f in cuda_cflags if not f.startswith("-std=")]
         cuda_cflags += extra_cuda_cflags
 
     spec = JitSpec(
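
Net effect of the core.py change: instead of always seeding cflags and cuda_cflags with "-std=c++17" and stripping it again later when the caller passes its own -std flag, the default standard is now only inserted when the extra flags do not already pin one. A standalone sketch of the same pattern follows; build_cflags is a hypothetical helper for illustration, not part of flashinfer:

    # Standalone sketch of the flag handling introduced above; build_cflags is
    # a hypothetical helper, not part of flashinfer/jit/core.py.
    def build_cflags(extra_cflags=None):
        # Only add the default C++ standard when the caller has not pinned one.
        has_std = extra_cflags is not None and any(
            f.startswith("-std=") for f in extra_cflags
        )
        cflags = ["-Wno-switch-bool"]
        if not has_std:
            cflags.insert(0, "-std=c++17")
        if extra_cflags is not None:
            cflags += extra_cflags
        return cflags

    assert build_cflags() == ["-std=c++17", "-Wno-switch-bool"]
    assert build_cflags(["-std=c++20"]) == ["-Wno-switch-bool", "-std=c++20"]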
