File tree: 2 files changed (+5 -1 lines changed)
verl/workers/rollout/trtllm_rollout: 2 files changed (+5 -1 lines changed)
Original file line number | Diff line number | Diff line change
@@ -14,6 +14,7 @@ export RAY_DEDUP_LOGS=0
1414TP=${1:- 4}
1515PROJECT_NAME=${PROJECT_NAME:- " verl_grpo_example_gsm8k_math" }
1616EXP_NAME=trtllm-qwen2-7b-tp${TP} -8gpus${EXP_NAME_SUFFIX: +" -" }${EXP_NAME_SUFFIX}
17+ NODES=${NODES:- 1}
1718
1819if [ $TP -eq 4 ]; then
1920 MAX_BATCH_SIZE=1024
@@ -81,7 +82,7 @@ python3 -m verl.trainer.main_ppo \
8182 trainer.project_name=" ${PROJECT_NAME} " \
8283 trainer.experiment_name=${EXP_NAME} \
8384 trainer.n_gpus_per_node=8 \
84- trainer.nnodes=1 \
85+ trainer.nnodes=${NODES} \
8586 trainer.save_freq=-1 \
8687 trainer.test_freq=5 \
8788 trainer.resume_mode=disable \
Original file line number | Diff line number | Diff line change
@@ -98,6 +98,8 @@ def get_server_address(self):
9898
9999 async def launch_server (self ):
100100 from tensorrt_llm import AsyncLLM
101+ import tensorrt_llm .logger as trtllm_logger
102+ trtllm_logger .set_level ('info' )
101103 from tensorrt_llm .llmapi import CudaGraphConfig , KvCacheConfig
102104 from tensorrt_llm .serve import OpenAIServer
103105
@@ -134,6 +136,7 @@ async def launch_server(self):
134136 "sampler_type" : "TRTLLMSampler" ,
135137 ** engine_kwargs ,
136138 }
139+ print (f"llm_kwargs: { llm_kwargs } " )
137140
138141 self .llm = await AsyncLLM (** llm_kwargs )
139142
You can’t perform that action at this time.
0 commit comments