wellkilo
diff --git a/config/config.yaml b/config/config.yaml
Lines changed: 1 addition & 0 deletions
diff --git a/config/config_single.yaml b/config/config_single.yaml
Lines changed: 1 addition & 0 deletions
diff --git a/config/halfcheetah.yaml b/config/halfcheetah.yaml
Lines changed: 8 additions & 7 deletions
diff --git a/config/halfcheetah_single.yaml b/config/halfcheetah_single.yaml
Lines changed: 10 additions & 9 deletions
diff --git a/config/walker2d.yaml b/config/walker2d.yaml
Lines changed: 9 additions & 8 deletions
diff --git a/config/walker2d_single.yaml b/config/walker2d_single.yaml
Lines changed: 11 additions & 10 deletions
diff --git a/drl/config.py b/drl/config.py
Lines changed: 1 addition & 0 deletions
@@ -21,4 +21,5 @@ use_mps: true
 max_grad_norm: 1.0
 target_kl: 0.02
 lr_schedule: "linear"
+clip_ratio_value: 0.2
 ray_address:
@@ -21,4 +21,5 @@ use_mps: true
 max_grad_norm: 1.0
 target_kl: 0.02
 lr_schedule: "linear"
+clip_ratio_value: 0.2
 ray_address:
@@ -1,24 +1,25 @@
 env_name: HalfCheetah-v5
 num_actors: 8
 replay_buffer_capacity: 200000
-batch_size: 64
+batch_size: 256
 gamma: 0.99
 gae_lambda: 0.95
-lr: 0.0003
-clip_ratio: 0.2
+lr: 0.0001
+clip_ratio: 0.15
 vf_coef: 0.5
-ent_coef: 0.02
+ent_coef: 0.005
 rollout_length: 2048
 actor_update_interval: 1
-learner_updates_per_iter: 4
+learner_updates_per_iter: 10
 max_iters: 3000
 log_interval: 10
 metrics_path: output/halfcheetah/metrics.csv
-hidden_sizes: [64, 64]
+hidden_sizes: [256, 256]
 seed: 42
 use_cuda: true
 use_mps: true
-max_grad_norm: 1.0
+max_grad_norm: 0.5
 target_kl: 0.02
 lr_schedule: "linear"
+clip_ratio_value: 0.2
 ray_address:
@@ -1,24 +1,25 @@
 env_name: HalfCheetah-v5
 num_actors: 1
 replay_buffer_capacity: 200000
-batch_size: 64
+batch_size: 256
 gamma: 0.99
 gae_lambda: 0.95
-lr: 0.0003
-clip_ratio: 0.2
-vf_coef: 1.0
-ent_coef: 0.02
-rollout_length: 4096
+lr: 0.0001
+clip_ratio: 0.15
+vf_coef: 0.5
+ent_coef: 0.005
+rollout_length: 8192
 actor_update_interval: 1
-learner_updates_per_iter: 4
+learner_updates_per_iter: 15
 max_iters: 3000
 log_interval: 10
 metrics_path: output/halfcheetah/metrics_single.csv
-hidden_sizes: [64, 64]
+hidden_sizes: [256, 256]
 seed: 42
 use_cuda: true
 use_mps: true
-max_grad_norm: 1.0
+max_grad_norm: 0.5
 target_kl: 0.02
 lr_schedule: "linear"
+clip_ratio_value: 0.2
 ray_address:
@@ -1,24 +1,25 @@
 env_name: Walker2d-v5
 num_actors: 8
 replay_buffer_capacity: 200000
-batch_size: 64
+batch_size: 256
 gamma: 0.99
 gae_lambda: 0.95
-lr: 0.0003
-clip_ratio: 0.2
+lr: 0.0001
+clip_ratio: 0.1
 vf_coef: 0.5
-ent_coef: 0.02
+ent_coef: 0.01
 rollout_length: 2048
 actor_update_interval: 1
-learner_updates_per_iter: 4
+learner_updates_per_iter: 10
 max_iters: 3000
 log_interval: 10
 metrics_path: output/walker2d/metrics.csv
-hidden_sizes: [64, 64]
+hidden_sizes: [256, 256]
 seed: 42
 use_cuda: true
 use_mps: true
-max_grad_norm: 1.0
-target_kl: 0.02
+max_grad_norm: 0.5
+target_kl: 0.015
 lr_schedule: "linear"
+clip_ratio_value: 0.2
 ray_address:
@@ -1,24 +1,25 @@
 env_name: Walker2d-v5
 num_actors: 1
 replay_buffer_capacity: 200000
-batch_size: 64
+batch_size: 256
 gamma: 0.99
 gae_lambda: 0.95
-lr: 0.0003
-clip_ratio: 0.2
-vf_coef: 1.0
-ent_coef: 0.02
-rollout_length: 4096
+lr: 0.0001
+clip_ratio: 0.1
+vf_coef: 0.5
+ent_coef: 0.01
+rollout_length: 8192
 actor_update_interval: 1
-learner_updates_per_iter: 4
+learner_updates_per_iter: 15
 max_iters: 3000
 log_interval: 10
 metrics_path: output/walker2d/metrics_single.csv
-hidden_sizes: [64, 64]
+hidden_sizes: [256, 256]
 seed: 42
 use_cuda: true
 use_mps: true
-max_grad_norm: 1.0
-target_kl: 0.02
+max_grad_norm: 0.5
+target_kl: 0.015
 lr_schedule: "linear"
+clip_ratio_value: 0.2
 ray_address:
@@ -52,3 +52,4 @@ class Config:
     max_grad_norm: float = 0.5       # 梯度裁剪阈值（防止梯度爆炸）
     target_kl: float = 0.015         # KL 散度早停阈值（防止过度更新）
     lr_schedule: str = "linear"       # 学习率调度：'constant' 或 'linear'
+    clip_ratio_value: float = 0.2    # 价值函数裁剪比率（防止价值预测剧变导致 GAE 失真）