-
Notifications
You must be signed in to change notification settings - Fork 3.4k
Description
System Info
使用的是 A100,运行的实验是 verl 的 PPO,参数是从 verl 快速启动文档中直接复制的。
报错如下:
Error executing job with overrides: ['data.train_files=/data/data/gsm8k/train.parquet', 'data.val_files=/data/data/gsm8k/test.parquet', 'data.train_batch_size=256', 'data.max_prompt_length=512', 'data.max_response_length=512', 'actor_rollout_ref.model.path=/data/qwen/qwen2.5-0.5B', 'actor_rollout_ref.actor.optim.lr=1e-6', 'actor_rollout_ref.actor.ppo_mini_batch_size=64', 'actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4', 'actor_rollout_ref.rollout.name=vllm', 'actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=8', 'actor_rollout_ref.rollout.tensor_model_parallel_size=1', 'actor_rollout_ref.rollout.gpu_memory_utilization=0.4', 'actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4', 'critic.optim.lr=1e-5', 'critic.model.path=/data/qwen/qwen2.5-0.5B', 'critic.ppo_micro_batch_size_per_gpu=4', 'algorithm.kl_ctrl.kl_coef=0.001', 'trainer.logger=[console,tensorboard]', 'trainer.val_before_train=False', 'trainer.n_gpus_per_node=1', 'trainer.nnodes=1', 'trainer.save_freq=-1', 'trainer.test_freq=10', 'trainer.total_epochs=15']
ray::TaskRunner.run() (pid=40293, ip=10.230.51.88, actor_id=56d37ca4409f57f7278484da01000000, repr=<main_ppo.TaskRunner object at 0x785b2a4d5e20>)
^^^^^^^^^^^^^^^^^^^^^^^^^
File "<string>", line 30, in __init__
File "/data/lgj/verl/verl/workers/config/critic.py", line 204, in __post_init__
super().__post_init__()
File "/data/lgj/verl/verl/workers/config/critic.py", line 96, in __post_init__
self.model_config = HFModelConfig(
^^^^^^^^^^^^^^
File "<string>", line 36, in __init__
File "/data/lgj/verl/verl/workers/config/model.py", line 157, in __post_init__
self.tokenizer = hf_tokenizer(self.local_tokenizer_path, trust_remote_code=self.trust_remote_code)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/data/lgj/verl/verl/utils/tokenizer.py", line 98, in hf_tokenizer
tokenizer = AutoTokenizer.from_pretrained(name_or_path, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/data/miniconda/envs/verl2/lib/python3.12/site-packages/transformers/models/auto/tokenization_auto.py", line 1156, in from_pretrained
return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/data/miniconda/envs/verl2/lib/python3.12/site-packages/transformers/tokenization_utils_base.py", line 2112, in from_pretrained
return cls._from_pretrained(
^^^^^^^^^^^^^^^^^^^^^
File "/data/miniconda/envs/verl2/lib/python3.12/site-packages/transformers/tokenization_utils_base.py", line 2419, in _from_pretrained
if _is_local and _config.model_type not in [
^^^^^^^^^^^^^^^^^^
AttributeError: 'dict' object has no attribute 'model_type'
The above exception was the direct cause of the following exception:
ray::TaskRunner.run() (pid=40293, ip=10.230.51.88, actor_id=56d37ca4409f57f7278484da01000000, repr=<main_ppo.TaskRunner object at 0x785b2a4d5e20>)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/data/lgj/verl/verl/trainer/main_ppo.py", line 299, in run
validate_config(
File "/data/lgj/verl/verl/utils/config.py", line 174, in validate_config
critic_config = omega_conf_to_dataclass(config.critic)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/data/lgj/verl/verl/utils/config.py", line 50, in omega_conf_to_dataclass
return instantiate(config, convert="partial")
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/data/miniconda/envs/verl2/lib/python3.12/site-packages/hydra/_internal/instantiate/_instantiate2.py", line 226, in instantiate
return instantiate_node(
^^^^^^^^^^^^^^^^^
File "/data/miniconda/envs/verl2/lib/python3.12/site-packages/hydra/_internal/instantiate/_instantiate2.py", line 347, in instantiate_node
return _call_target(target, partial, args, kwargs, full_key)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/data/miniconda/envs/verl2/lib/python3.12/site-packages/hydra/_internal/instantiate/_instantiate2.py", line 97, in _call_target
raise InstantiationException(msg) from e
hydra.errors.InstantiationException: Error in call to target 'verl.workers.config.critic.FSDPCriticConfig':
AttributeError("'dict' object has no attribute 'model_type'")
full_key: critic
Set the environment variable HYDRA_FULL_ERROR=1 for a complete stack trace.
Information
- The official example scripts
- My own modified scripts
Tasks
- An officially supported task in the `examples` folder (such as GLUE/SQuAD, ...)
- My own task or dataset (give details below)
Reproduction
def __post_init__(self):
"""Validate critic configuration parameters."""
assert self.strategy != MISSING
if self.model_config is None:
warnings.warn("using model in Critic Config is deprecated, please use model_config instead", stacklevel=2)
self.model_config = HFModelConfig(
path=self.model.path,
tokenizer_path=self.model.tokenizer_path,
override_config=self.model.override_config,
external_lib=self.model.external_lib,
trust_remote_code=self.model.trust_remote_code,
)
critic.py 第 96 行这里会在 model_config 为 None 时强制将其设置为 HFModelConfig,而构造 HFModelConfig 时会加载 tokenizer,从而触发上面的 AttributeError。
Expected behavior
而 dp_critic.yaml 中使用的又是 `_target_: verl.workers.config.FSDPCriticConfig`。