onnxruntime/core/providers/cuda/llm
1 file changed, 2 insertions(+), 2 deletions(-)

```diff
@@ -205,7 +205,7 @@ Status Attention<T>::ComputeInternal(OpKernelContext* context) const {
   gqa_parameters.num_heads = parameters.q_num_heads;
   gqa_parameters.head_size = parameters.head_size;
   gqa_parameters.v_head_size = parameters.v_head_size;
-  gqa_parameters.kv_hidden_size = parameters.kv_num_heads * parameters.head_size;
+  gqa_parameters.kv_hidden_size = parameters.kv_num_heads * parameters.v_head_size;
   gqa_parameters.kv_num_heads = parameters.kv_num_heads;
   gqa_parameters.rotary_dim = 0;  // New Attention op doesn't use rotary embeddings directly
   gqa_parameters.is_unidirectional = parameters.is_causal;
@@ -218,7 +218,7 @@ Status Attention<T>::ComputeInternal(OpKernelContext* context) const {
   gqa_parameters.rotary_interleaved = false;
   gqa_parameters.use_smooth_softmax = false;
   gqa_parameters.scale = parameters.scale;
-  gqa_parameters.softcap = 0.0f;
+  gqa_parameters.softcap = parameters.softcap;
   gqa_parameters.mask_type = onnxruntime::contrib::AttentionMaskType::MASK_NONE;
   gqa_parameters.qkv_format = contribop_parameters.qkv_format;
   gqa_parameters.past_kv_format = onnxruntime::contrib::AttentionQkvFormat::Q_K_V_BNSH;
```
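
The first hunk corrects the KV hidden size for models whose Q/K and V head sizes differ: the value side of the KV layout is sized per V head, so deriving `kv_hidden_size` from `head_size` mis-sizes it whenever `head_size != v_head_size`. Below is a minimal sketch of the shape arithmetic; the parameter names mirror the diff, and the concrete values are hypothetical, chosen so the two head sizes differ:

```cpp
#include <cassert>

int main() {
  // Hypothetical GQA shape parameters where head_size != v_head_size.
  const int kv_num_heads = 4;   // shared K/V heads (grouped-query attention)
  const int head_size = 128;    // per-head size of Q and K
  const int v_head_size = 96;   // per-head size of V

  // Before the fix: V-side buffers were sized with the Q/K head size.
  const int kv_hidden_size_old = kv_num_heads * head_size;    // 512
  // After the fix: V-side buffers use the V head size.
  const int kv_hidden_size_new = kv_num_heads * v_head_size;  // 384

  // The two only agree when head_size == v_head_size, which is why the
  // bug is invisible on the common equal-head-size configurations.
  assert(kv_hidden_size_old != kv_hidden_size_new);
  return 0;
}
```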
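
The second hunk stops discarding the operator's softcap attribute: instead of hard-coding `0.0f` (the conventional "disabled" value), the parsed `parameters.softcap` is forwarded to the GQA parameters. Logit soft-capping bounds attention scores before softmax, so dropping it silently changes numerics for models that use it (e.g. Gemma-style architectures). A hedged sketch of the common tanh formulation follows; the exact kernel-side handling in ONNX Runtime may differ:

```cpp
#include <cmath>

// Standard logit soft-capping: scores are squashed into (-softcap, softcap)
// via tanh. A softcap of 0 conventionally means "disabled", which is why
// the old hard-coded 0.0f silently turned the feature off.
float ApplySoftcap(float score, float softcap) {
  if (softcap == 0.0f) {
    return score;  // soft-capping disabled
  }
  return softcap * std::tanh(score / softcap);
}
```

For example, with `softcap = 50.0f` a raw score of `200.0f` maps to `50 * tanh(4.0) ≈ 49.97`, keeping extreme logits bounded while leaving small scores nearly unchanged.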