@@ -278,11 +278,7 @@ def __post_init__(self):
278278 assert not self .swiglu
279279 self .gated_linear_unit = True
280280 self .activation_func = quick_gelu
281- _origin_rotary_interleaved = self .rotary_interleaved
282- if self .multi_latent_attention and self .rotary_interleaved :
283- self .rotary_interleaved = False
284281 super ().__post_init__ ()
285- self .rotary_interleaved = _origin_rotary_interleaved
286282 self ._check_npu ()
287283 self .variable_seq_lengths = True
288284
@@ -481,8 +477,6 @@ def convert_hf_config(config) -> Dict[str, Any]:
481477 res .pop ('num_query_groups' , None )
482478 if llm_model_type == 'glm_moe_dsa' :
483479 res ['experimental_attention_variant' ] = 'dsa'
484- # https://github.com/modelscope/ms-swift/pull/8085
485- # res['rotary_interleaved'] = False
486480 elif llm_model_type == 'qwen3_next' or hf_model_type in {'qwen3_5' , 'qwen3_5_moe' }:
487481 use_mcore_gdn = get_env_args ('SWIFT_USE_MCORE_GDN' , bool , False )
488482 if use_mcore_gdn and llm_model_type == 'qwen3_next' :
@@ -525,10 +519,6 @@ def convert_hf_config(config) -> Dict[str, Any]:
525519 mrope_interleaved = rope_scaling .get ('mrope_interleaved' , False ) or rope_scaling .get ('interleaved' , False )
526520 res ['mrope_interleaved' ] = mrope_interleaved
527521
528- if res .get ('multi_latent_attention' ) and res .get ('position_embedding_type' ) in {
529- 'rope' , None
530- } and 'rotary_interleaved' not in res :
531- res ['rotary_interleaved' ] = True
532522 if first_k_dense_replace is not None :
533523 res ['moe_layer_freq' ] = f'[0]*{ first_k_dense_replace } +[1]*{ res ["num_layers" ] - first_k_dense_replace } '
534524 if res .get ('moe_router_score_function' , 'softmax' ) == 'sigmoid' and 'moe_router_enable_expert_bias' not in res :
0 commit comments