fix: pass sinks argument to contrib.custom_ops attention calls (#416)

rebel-jaehunryu · web-flow · commit 6f788a835b0c · 2026-02-27T15:23:21.000+09:00
diff --git a/vllm_rbln/v1/attention/backends/flash_attention.py b/vllm_rbln/v1/attention/backends/flash_attention.py
@@ -1474,7 +1474,7 @@ def forward(
                         )
 
                 if q_len == 1:
-                    attn_output = causal_attention_naive_decode(  # noqa: E501
+                    decode_args = [
                         query,
                         key,
                         value,
@@ -1483,9 +1483,14 @@ def forward(
                         self.scale,
                         attn_metadata.block_tables.to(torch.int16),
                         self.scale,  # dummy (required by rbln_triton_ops signature)
+                    ]
+                    if not envs.VLLM_RBLN_USE_CUSTOM_KERNEL:
+                        decode_args.append(self.sinks)
+                    attn_output = causal_attention_naive_decode(  # noqa: E501
+                        *decode_args,
                     )
                 else:
-                    attn_output = causal_attention_naive_prefill(  # noqa: E501
+                    prefill_args = [
                         query,
                         key,
                         value,
@@ -1494,6 +1499,11 @@ def forward(
                         self.scale,
                         attn_metadata.block_tables.to(torch.int16),
                         self.scale,  # dummy (required by rbln_triton_ops signature)
+                    ]
+                    if not envs.VLLM_RBLN_USE_CUSTOM_KERNEL:
+                        prefill_args.append(self.sinks)
+                    attn_output = causal_attention_naive_prefill(  # noqa: E501
+                        *prefill_args,
                     )
             else:
                 if envs.VLLM_RBLN_COMPILE_MODEL:
@@ -1524,7 +1534,7 @@ def forward(
                 # * otherwise         - seq_lens[B, P] == dyn_size_for_partitions,
                 #   dynamic size for each partition
                 if q_len == 1:
-                    attn_output = flash_causal_attention_naive_decode(  # noqa: E501
+                    decode_args = [
                         query,
                         key,
                         value,
@@ -1533,9 +1543,14 @@ def forward(
                         attn_metadata.seq_lens.to(torch.int16),
                         attn_metadata.block_tables.to(torch.int16),
                         self.scale,  # dummy
+                    ]
+                    if not envs.VLLM_RBLN_USE_CUSTOM_KERNEL:
+                        decode_args.append(self.sinks)
+                    attn_output = flash_causal_attention_naive_decode(  # noqa: E501
+                        *decode_args,
                     )
                 else:
-                    attn_output = flash_causal_attention_naive_prefill(  # noqa: E501
+                    prefill_args = [
                         query,
                         key,
                         value,
@@ -1544,6 +1559,11 @@ def forward(
                         attn_metadata.seq_lens.to(torch.int16),
                         attn_metadata.block_tables.to(torch.int16),
                         self.scale,  # dummy
+                    ]
+                    if not envs.VLLM_RBLN_USE_CUSTOM_KERNEL:
+                        prefill_args.append(self.sinks)
+                    attn_output = flash_causal_attention_naive_prefill(  # noqa: E501
+                        *prefill_args,
                     )
         else:
             if self.is_normal:
@@ -1568,7 +1588,7 @@ def forward(
                         )
 
                 if q_len == 1:
-                    attn_output = attention_naive_decode(  # noqa: E501
+                    decode_args = [
                         query,
                         key,
                         value,
@@ -1578,9 +1598,14 @@ def forward(
                         self.scale,
                         attn_metadata.block_tables.to(torch.int16),
                         self.scale,  # dummy (required by rbln_triton_ops signature)
+                    ]
+                    if not envs.VLLM_RBLN_USE_CUSTOM_KERNEL:
+                        decode_args.append(self.sinks)
+                    attn_output = attention_naive_decode(  # noqa: E501
+                        *decode_args,
                     )
                 else:
-                    attn_output = attention_naive_prefill(  # noqa: E501
+                    prefill_args = [
                         query,
                         key,
                         value,
@@ -1590,6 +1615,11 @@ def forward(
                         self.scale,
                         attn_metadata.block_tables.to(torch.int16),
                         self.scale,  # dummy (required by rbln_triton_ops signature)
+                    ]
+                    if not envs.VLLM_RBLN_USE_CUSTOM_KERNEL:
+                        prefill_args.append(self.sinks)
+                    attn_output = attention_naive_prefill(  # noqa: E501
+                        *prefill_args,
                     )
             else:
                 if envs.VLLM_RBLN_COMPILE_MODEL:
@@ -1612,7 +1642,7 @@ def forward(
                     flash_attention_naive_decode = flash_attention_naive_decode_impl
 
                 if q_len == 1:
-                    attn_output = flash_attention_naive_decode(  # noqa: E501
+                    decode_args = [
                         query,
                         key,
                         value,
@@ -1622,9 +1652,14 @@ def forward(
                         attn_metadata.seq_lens.to(torch.int16),
                         attn_metadata.block_tables.to(torch.int16),
                         self.scale,  # dummy
+                    ]
+                    if not envs.VLLM_RBLN_USE_CUSTOM_KERNEL:
+                        decode_args.append(self.sinks)
+                    attn_output = flash_attention_naive_decode(  # noqa: E501
+                        *decode_args,
                     )
                 else:
-                    attn_output = flash_attention_naive_prefill(  # noqa: E501
+                    prefill_args = [
                         query,
                         key,
                         value,
@@ -1634,6 +1669,11 @@ def forward(
                         attn_metadata.seq_lens.to(torch.int16),
                         attn_metadata.block_tables.to(torch.int16),
                         self.scale,  # dummy
+                    ]
+                    if not envs.VLLM_RBLN_USE_CUSTOM_KERNEL:
+                        prefill_args.append(self.sinks)
+                    attn_output = flash_attention_naive_prefill(  # noqa: E501
+                        *prefill_args,
                     )
 
         # 2. attention output reshape for attention backend return