You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
Please help me create a configuration file to run Llama 4 on 2x CMP 50HX (10 GB each). With the current config below, the memory is only enough to load 40 layers:
reacted with thumbs up emoji reacted with thumbs down emoji reacted with laugh emoji reacted with hooray emoji reacted with confused emoji reacted with heart emoji reacted with rocket emoji reacted with eyes emoji
-
Please help me create a configuration file to run Llama 4 on 2x CMP 50HX (10 GB each). With the current config below, the memory is only enough to load 40 layers:
match:
name: "^model\.layers\.(0|1[0-6]|[1-9])\."
class: ktransformers.models.modeling_llama4.Llama4TextRotaryEmbedding
replace:
class: ktransformers.operators.RoPE.KLlama4RotaryEmbedding
kwargs:
generate_device: "cuda:0"
prefill_device: "cuda:0"
match:
name: "^model\.layers\.(1[7-9]|[2-9][0-9]|\d{3,})\."
class: ktransformers.models.modeling_llama4.Llama4TextRotaryEmbedding
replace:
class: ktransformers.operators.RoPE.KLlama4RotaryEmbedding
kwargs:
generate_device: "cuda:0"
prefill_device: "cuda:0"
match:
name: "^lm_head$"
class: torch.nn.Linear
replace:
class: ktransformers.operators.linear.KTransformersLinear
kwargs:
generate_device: "cuda:0"
prefill_device: "cuda:0"
generate_op: "KLinearTorch"
prefill_op: "KLinearTorch"
match:
name: "^model\.layers\.(0|1[0-6]|[1-9])\.(?!.feed_forward\.shared_expert_gate).$"
class: torch.nn.Linear
replace:
class: ktransformers.operators.linear.KTransformersLinear
kwargs:
generate_device: "cuda:0"
prefill_device: "cuda:0"
generate_op: "KLinearTorch"
prefill_op: "KLinearTorch"
match:
name: "^model\.layers\.(1[7-9]|[2-9][0-9]|\d{3,})\.(?!.feed_forward\.shared_expert_gate).$"
class: torch.nn.Linear
replace:
class: ktransformers.operators.linear.KTransformersLinear
kwargs:
generate_device: "cuda:1"
prefill_device: "cuda:1"
generate_op: "KLinearTorch"
prefill_op: "KLinearTorch"
match:
name: "^model\.layers\.(0|1[0-6]|[1-9])\.feed_forward$"
class: ktransformers.models.modeling_llama4.Llama4TextMoe
replace:
class: ktransformers.operators.experts.KLlama4Moe
kwargs:
generate_device: "cuda:0"
prefill_device: "cuda:0"
match:
name: "^model\.layers\.(1[7-9]|[2-9][0-9]|\d{3,})\.feed_forward$"
class: ktransformers.models.modeling_llama4.Llama4TextMoe
replace:
class: ktransformers.operators.experts.KLlama4Moe
kwargs:
generate_device: "cuda:1"
prefill_device: "cuda:1"
match:
name: "^model\.layers\.(0|[1-9]|1[0-6])\.feed_forward\.experts$"
replace:
class: ktransformers.operators.experts.KLlama4Experts
kwargs:
prefill_device: "cuda:0"
prefill_op: None
generate_device: "cpu"
generate_op: "KExpertsCPU"
out_device: "cuda:0"
recursive: False
match:
name: "^model\.layers\.(1[7-9]|[2-9]\d|\d{3,})\.feed_forward\.experts$"
replace:
class: ktransformers.operators.experts.KLlama4Experts
kwargs:
prefill_device: "cuda:1"
prefill_op: None
generate_device: "cpu"
generate_op: "KExpertsCPU"
out_device: "cuda:1"
recursive: False
match:
name: "^model\.layers\.(0|1[0-6]|[1-9])\.self_attn$"
replace:
class: ktransformers.operators.balance_serve_attention.KLlama4Attention
kwargs:
generate_device: "cuda:0"
prefill_device: "cuda:0"
match:
name: "^model\.layers\.(1[7-9]|[2-9][0-9]|\d{3,})\.self_attn$"
replace:
class: ktransformers.operators.balance_serve_attention.KLlama4Attention
kwargs:
generate_device: "cuda:1"
prefill_device: "cuda:1"
match:
name: "^model$"
replace:
class: "ktransformers.operators.models.KLlama4Model"
kwargs:
per_layer_prefill_intput_threshold: 0
transfer_map:
17: "cuda:1"
match:
name: "^model.embed_tokens"
replace:
class: "default"
kwargs:
generate_device: "cpu"
prefill_device: "cpu"
match:
name: "^model\.layers\.(0|1[0-6]|[1-9])\."
class: ktransformers.models.modeling_llama4.Llama4TextRMSNorm
replace:
class: ktransformers.operators.layernorm.KLlama4RMSNorm
kwargs:
generate_device: "cuda:0"
prefill_device: "cuda:0"
match:
name: "^model\.layers\.(1[7-9]|[2-9][0-9]|\d{3,})\."
class: ktransformers.models.modeling_llama4.Llama4TextRMSNorm
replace:
class: ktransformers.operators.layernorm.KLlama4RMSNorm
kwargs:
generate_device: "cuda:1"
prefill_device: "cuda:1"
match:
name: "^model\.layers\.(0|1[0-6]|[1-9])\."
class: ktransformers.models.modeling_llama4.Llama4TextMLP
replace:
class: ktransformers.operators.mlp.KLlama4MLP
kwargs:
generate_device: "cuda:0"
prefill_device: "cuda:0"
match:
name: "^model\.layers\.(1[7-9]|[2-9][0-9]|\d{3,})\."
class: ktransformers.models.modeling_llama4.Llama4TextMLP
replace:
class: ktransformers.operators.mlp.KLlama4MLP
kwargs:
generate_device: "cuda:1"
prefill_device: "cuda:1"
Beta Was this translation helpful? Give feedback.
All reactions