Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 52 additions & 0 deletions autotest/config-npu.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -203,3 +203,55 @@ case:
loss/reduced_llm_loss: 0.01
lr: 0.01
timeout: 10800

# Cross-platform parity case: GRPO RL training of Qwen3-30B-A3B on Ascend NPU
# with the vLLM backend; metrics are compared against a recorded GPU baseline.
npu-qwen3-rl-vllm-vs-gpu:
  -
    type: rl
    parameters:
      config: autotest/config/rl_qwen3_30B_grpo_gpu_vs_npu.py
      infer_backend: vllm
      # NOTE(review): "acceleator" looks like a typo for "accelerator" —
      # confirm which spelling the test harness actually reads before fixing.
      acceleator: NPU
      # NOTE(review): path segment "qa-llm-cic" differs from "qa-llm-cicd"
      # used elsewhere in this repo — verify it is intentional.
      output_path: /mnt/hwfile/llmrazor/qa-llm-cic/qa-llm-cicd/test_output
    resource:
      image: ccr-hw/910c:rl_s1_vllm
      envs:
        - MODEL_PATH=/mnt/hwfile/llmrazor/qa-llm-cicd/qa_test_models/Qwen3-30B-A3B
        - DATA_PATH=/mnt/hwfile/llmrazor/qa-llm-cicd/xtuner_resource/datasets/gsm8k/train-mini.jsonl
        - EVAL_DATA_PATH=/mnt/hwfile/llmrazor/qa-llm-cicd/xtuner_resource/datasets/gsm8k/test.jsonl
        - XTUNER_DETERMINISTIC=true
        # Backend selectors: vLLM on, LMDeploy/SGLang off.
        - XTUNER_USE_LMDEPLOY=0
        - XTUNER_USE_VLLM=1
        - XTUNER_USE_SGLANG=0
        - XTUNER_USE_FA3=0
        - XTUNER_ACTIVATION_OFFLOAD=1
        - VLLM_VERSION=0.11.0
        - VLLM_USE_V1=1
    assert_info:
      # Baseline tracker.jsonl recorded at commit 00f7e16; new runs are
      # checked against it metric-by-metric below.
      base_metric: npu-qwen3-rl-vllm-vs-gpu/00f7e16/tracker.jsonl
      check_metrics:
        -
          metric: eval/accuracy
          threshold: 0.05
          method: absolute
          operator: <
        -
          metric: response/rewards/mean
          threshold: 0.1
          method: absolute
          operator: <
        -
          metric: mismatch/mismatch_k3_kl
          threshold: 0.0001
          method: absolute
          operator: <=
        -
          metric: response/response_len/mean
          threshold: 0.12
          method: relative
          operator: <
        -
          metric: time/step
          threshold: 10
          method: absolute
          operator: <
    timeout: 5500
93 changes: 93 additions & 0 deletions autotest/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -632,3 +632,96 @@ case:
method: absolute
operator: <
timeout: 4200

# GRPO RL training of Qwen3-30B-A3B on GSM8K with the SGLang inference backend.
qwen3-rl-sglang:
  -
    type: rl
    parameters:
      config: autotest/config/rl_qwen3_30B_gsm8k_grpo.py
      infer_backend: sglang
      output_path: /mnt/shared-storage-user/llmrazor-share/qa-llm-cicd/test_output
    resource:
      envs:
        - MODEL_PATH=/mnt/shared-storage-user/llmrazor-share/model/Qwen3-30B-A3B
        - DATA_PATH=/mnt/shared-storage-user/llmrazor-share/data/gsm8k/train-mini.jsonl
        - EVAL_DATA_PATH=/mnt/shared-storage-user/llmrazor-share/data/gsm8k/test.jsonl
        - XTUNER_DETERMINISTIC=true
        # Backend selectors: SGLang on, LMDeploy/vLLM off.
        - XTUNER_USE_LMDEPLOY=0
        - XTUNER_USE_VLLM=0
        - XTUNER_USE_SGLANG=1
        # Disable TF32 matmuls for bit-level reproducibility of the baseline.
        - TORCH_ALLOW_TF32_CUBLAS_OVERRIDE=0
        - XTUNER_USE_FA3=0
    assert_info:
      # Baseline tracker.jsonl recorded at commit cf46114.
      base_metric: qwen3-rl-sglang/cf46114/tracker.jsonl
      check_metrics:
        -
          metric: eval/accuracy
          threshold: 0.05
          method: absolute
          operator: <
        -
          metric: response/rewards/mean
          threshold: 0.1
          method: absolute
          operator: <
        -
          metric: mismatch/mismatch_k3_kl
          threshold: 0.0001
          method: absolute
          operator: <=
        -
          metric: response/response_len/mean
          threshold: 0.12
          method: relative
          operator: <
        -
          metric: time/step
          threshold: 10
          method: absolute
          operator: <
    timeout: 4500

# DAPO RL training of the Qwen3.5-35B vision-language model with the LMDeploy
# backend. Uses a pinned LMDeploy checkout via PYTHONPATH.
qwen3-5-rl-vl-lmdeploy:
  -
    type: rl
    parameters:
      config: autotest/config/rl_qwen3_5_vl_dapo.py
      infer_backend: lmdeploy
      output_path: /mnt/shared-storage-user/llmrazor-share/qa-llm-cicd/test_output
    resource:
      memory_per_task: 1200
      envs:
        - MODEL_PATH=/mnt/shared-storage-user/llmrazor-share/model/Qwen3.5-35B-A3B
        - DATA_PATH=/mnt/shared-storage-user/llmrazor-share/data/rl_vl_meta.jsonl
        # Pinned LMDeploy source tree takes precedence over any installed copy.
        - PYTHONPATH=/mnt/shared-storage-user/llmrazor-share/qa-caif-cicd/lmdeploy:$PYTHONPATH
        - XTUNER_USE_LMDEPLOY=1
        - XTUNER_DETERMINISTIC=true
    assert_info:
      # Baseline tracker.jsonl recorded at commit 00f7e16. Thresholds are
      # looser than the text-only cases (VL training is noisier).
      base_metric: qwen3-5-rl-vl-lmdeploy/00f7e16/tracker.jsonl
      check_metrics:
        -
          metric: eval/accuracy
          threshold: 0.1
          method: absolute
          operator: <
        -
          metric: response/rewards/mean
          threshold: 0.1
          method: absolute
          operator: <
        -
          metric: mismatch/mismatch_k3_kl
          threshold: 0.0002
          method: absolute
          operator: <=
        -
          metric: response/response_len/mean
          threshold: 0.25
          method: relative
          operator: <
        -
          metric: time/step
          threshold: 10
          method: absolute
          operator: <
    timeout: 7200
165 changes: 165 additions & 0 deletions autotest/config/rl_qwen3_30B_grpo_gpu_vs_npu.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
import os
from copy import deepcopy
from pathlib import Path
from transformers import AutoTokenizer
from xtuner.v1.config import (
AdamWConfig,
FSDPConfig,
LRConfig,
)
from xtuner.v1.data_proto.rl_data import SampleParams
from xtuner.v1.datasets import RLTokenizeFnConfig
from xtuner.v1.datasets.config import DataloaderConfig, DatasetConfig
from xtuner.v1.model.moe.qwen3 import Qwen3MoE30BA3Config
from xtuner.v1.ray.base import AcceleratorResourcesConfig
from xtuner.v1.ray.config.worker import RolloutConfig
from xtuner.v1.ray.dataflow import DataFlowConfig, ReplayBufferConfig
from xtuner.v1.ray.evaluator import EvaluatorConfig
from xtuner.v1.ray.judger.controller import JudgerConfig
from xtuner.v1.ray.judger.gsm8k import GSM8KJudgerConfig
from xtuner.v1.rl.base import WorkerConfig
from xtuner.v1.rl.grpo import GRPOLossConfig
from xtuner.v1.train.rl_trainer import RLTrainerConfig
from xtuner.v1.model import get_model_config_from_hf

# Required run-time locations — fail fast (KeyError) if the job is misconfigured.
work_dir = os.environ["WORK_DIR"]
model_path = os.environ["MODEL_PATH"]
data_path = os.environ["DATA_PATH"]
# Evaluation is optional: an empty EVAL_DATA_PATH disables it. Use .get() so an
# *unset* variable behaves the same as an explicitly empty one instead of raising.
eval_data_path = os.environ.get("EVAL_DATA_PATH", "")
enable_return_routed_experts = os.environ.get("ENABLE_RETURN_ROUTED_EXPERTS", '1')
# Non-empty string -> truthy; replaces the verbose `True if ... else False`.
enable_evaluate = bool(eval_data_path)

# basic settings
experimental_name = "grpo_gsm8k"
total_epochs = 2
global_batch_size = 64        # prompts consumed per RL step
prompt_repeat_k = 5           # rollouts sampled per prompt (GRPO group size)
rollout_tp_size = 4
rollout_dp_size = 4
rollout_ep_size = 1
max_prompt_length = 512
max_response_length = 1024
pack_max_length = 32768
train_optimizer_steps = 1
hf_interval = 100             # steps between HF-format checkpoint exports
enable_initial_evaluate = True
evaluate_step = 10            # steps between evaluations


# 1. resources: 16 NPU workers, 6 CPUs and 16 GB host memory each.
resources = AcceleratorResourcesConfig(
    accelerator="NPU",
    num_workers=16,
    num_cpus_per_worker=6,
    cpu_memory_per_worker=16 * 1024**3,  # 16 GB
)

# 2. rollout: inference-engine configuration for response generation.
rollout_config = RolloutConfig(
    env=experimental_name,
    device=resources.accelerator,
    model_path=model_path,
    dtype="bfloat16",
    tensor_parallel_size=rollout_tp_size,
    data_parallel_size=rollout_dp_size,
    expert_parallel_size=rollout_ep_size,
    gpu_memory_utilization=0.85,
    # Engine context window must cover prompt + generated response.
    context_length=max_response_length + max_prompt_length,
    rollout_max_batch_size_per_instance=2048,
    # Env var is a string flag; direct comparison replaces the
    # `True if ... else False` ternary.
    enable_return_routed_experts=enable_return_routed_experts == "1",
)

# sampling params: stochastic defaults for training rollouts, greedy
# (temperature 0, top_k 1) for deterministic evaluation.
training_sample_params = SampleParams(
    max_tokens=max_response_length,
)
evaluation_sample_params = deepcopy(training_sample_params)
evaluation_sample_params.top_p = 1.0
evaluation_sample_params.temperature = 0.0
evaluation_sample_params.top_k = 1

# dataset: no changes needed — GSM8K train/eval annotations.
train_dataset = DatasetConfig(name=experimental_name, anno_path=data_path)
eval_dataset = DatasetConfig(name=experimental_name, anno_path=eval_data_path) if enable_evaluate else None
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
tokenizer_config = RLTokenizeFnConfig(max_length=max_prompt_length)

train_dataset_cfg = [{"dataset": train_dataset, "tokenize_fn": tokenizer_config}]
eval_dataset_cfg = [{"dataset": eval_dataset, "tokenize_fn": tokenizer_config}] if enable_evaluate else []

dataloader_config = DataloaderConfig(pack_max_length=pack_max_length, collator="fake_collator", pack_level="none")

# 3. judger: rule-based GSM8K answer checking supplies the reward signal.
# (Renamed from "dapomath_judger_config": it configures the GSM8K judger.)
gsm8k_judger_config = GSM8KJudgerConfig(judger_name="openai/gsm8k")
judger_cfg = JudgerConfig(reward_judger_configs=[gsm8k_judger_config])

# 4. dataflow and evaluator
dataflow_config = DataFlowConfig(
    env=experimental_name,
    prompt_repeat_k=prompt_repeat_k,
    global_batch_size=global_batch_size,
    sample_params=training_sample_params,
)

evaluator_cfg = EvaluatorConfig(
    enable_evaluate=enable_evaluate,
    enable_initial_evaluate=enable_initial_evaluate,
    dataset_cfg=eval_dataset_cfg,
    tokenizer=tokenizer,
    evaluate_step=evaluate_step,
    compute_metric_func=None,  # None -> evaluator's default metric computation
    sample_params=evaluation_sample_params,
) if enable_evaluate else None

# replay buffer config: no changes needed.
replay_buffer_cfg = ReplayBufferConfig(
    dataset_cfg=train_dataset_cfg, dataloader_cfg=dataloader_config, tokenizer=tokenizer
)

# 5. Train worker
# NOTE: modify model_cfg
# Model architecture is derived from the HF checkpoint on disk.
model_cfg = get_model_config_from_hf(Path(model_path))
# foreach=False disables the multi-tensor AdamW path — presumably for NPU
# compatibility / determinism; confirm before changing.
optim_cfg = AdamWConfig(lr=1e-6, foreach=False)
loss_cfg = GRPOLossConfig(
    policy_loss_cfg=dict(
        # Symmetric PPO-style clipping at +/-0.2.
        cliprange_high=0.2,
        cliprange_low=0.2,
        loss_type="vanilla",
    ),
    ignore_idx=-100,  # label id excluded from the loss (prompt/padding tokens)
    use_kl_loss=True,
    kl_loss_coef=0.001,
    kl_loss_type="low_var_kl",
    mode="chunk",     # chunked loss computation; chunk_size bounds peak memory
    chunk_size=512,
)
lr_cfg = LRConfig(lr_type="constant", warmup_ratio=0, lr_min=1e-6)
fsdp_cfg = FSDPConfig(torch_compile=False, cpu_offload=False, ep_size=1)
train_worker_cfg: WorkerConfig = WorkerConfig(
    model_cfg=model_cfg,
    load_from=model_path,
    optim_cfg=optim_cfg,
    loss_cfg=loss_cfg,
    lr_cfg=lr_cfg,
    fsdp_cfg=fsdp_cfg,
    sp_size=1,  # no sequence parallelism
    optimizer_steps=train_optimizer_steps,
    pack_max_length=pack_max_length,
)

# 6. RL Trainer: top-level config tying together resources, rollout, dataflow,
# judger, replay buffer, optional evaluator and the train worker. The test
# harness imports this module and reads the `trainer` object.
trainer = RLTrainerConfig(
    load_from=model_path,
    resources=resources,
    rollout_config=rollout_config,
    dataflow_config=dataflow_config,
    judger_config=judger_cfg,
    replay_buffer_config=replay_buffer_cfg,
    evaluator_config=evaluator_cfg,  # None when evaluation is disabled
    train_worker_config=train_worker_cfg,
    tokenizer_path=model_path,
    work_dir=work_dir,
    total_epochs=total_epochs,
    hf_interval=hf_interval,
    # JSONL tracker output is what the autotest metric assertions read.
    exp_tracker="jsonl",
)
Loading
Loading