# loop.yaml
# Configuration for the "loop" run: file locations, oracle / acquisition
# function / reward model selection, and per-component hyperparameters.
#
# NOTE(review): the nesting in this file was reconstructed from a copy whose
# indentation had been lost (every key at column 0, which made section keys
# null and repeated keys such as `seed`, `max_iterations`, `argmax_tol` and
# `max_length` in a single mapping). The hierarchy is inferred from key order
# and the `*_type` selectors -- confirm it against the config loader.

# Locations for inputs, outputs, logs and experiment tracking.
env_local_path: .env.local
base_output_dir: datasets/tulu_3/actives/dpo
base_trainer_dir: trainer_output/loop
base_logs_dir: logs/loop
base_wandb_dir: wandb/loop
base_wandb_project: loop
inputs_path: datasets/ultrafeedback/qwen_3_235b

# Component selection. Presumably `acquisition_function_type` picks one of the
# sub-sections of `acquisition_function` and `reward_model_type` picks the
# `enn` section below -- TODO confirm.
oracle_name: ultrafeedback
acquisition_function_type: deltaquantile
reward_model_type: enn

# Run-wide settings. `seed` and `max_length` are anchored and reused below.
debug: false
seed: &seed 4
max_length: &max_length 4096
outer_loop_batch_size: 64
save_every_n_outer_batches: 100
replay_buffer_factor: 100

# One sub-section per acquisition function, plus `beta`.
# NOTE(review): `beta` is assumed to belong directly to this section, and the
# sub-sections are assumed to be nested here rather than at top level.
acquisition_function:
  beta: 1.0
  random:
    seed: *seed
  ultrafeedback:
    seed: *seed
  dts:
    max_iterations: 30
  drts:
    max_iterations: 30
  ids:
    argmax_tol: 0.0001
    decision_buffer: 0.0
    use_candidate_set: false
  rucb:
    argmax_tol: 0.0001
    decision_buffer: 0.0
    use_candidate_set: false
  maxminlcb:
    argmax_tol: 0.0001
    decision_buffer: 0.0
    use_candidate_set: false
    seed: *seed
  deltaquantile:
    quantile: 0.05
    epsilon: 0.0

# Settings for the `enn` reward model (matches `reward_model_type` above).
enn:
  previous_checkpoint_path: null
  effective_batch_size: 64
  inference_batch_size: 8
  # Anchored so that trainer.save_steps stays equal to it.
  max_steps: &max_steps 100
  model:
    base_model_name_or_path: Skywork/Skywork-Reward-V2-Qwen3-4B
    num_heads: 20
    head_num_layers: 2
    head_hidden_dim: 128
    head_initialization_xavier_gain: 1.0
    freeze_base_model: true
    feature_extraction_layer: last_hidden_state
  regularization:
    # Anchored so that trainer.regularization_towards_initial_weights
    # starts from the same value.
    initial_value: &regularization_initial_value 1.0
    decay_type: exponential
    exponential_decay_base: 0.9
    exponential_decay_scaler: 4308
  trainer:
    lr_scheduler_type: cosine
    learning_rate: 0.00005
    warmup_ratio: 0.0
    num_train_epochs: 1
    max_length: *max_length
    center_rewards_coefficient: 0.01
    regularization_towards_initial_weights: *regularization_initial_value
    precompute_features: true
    bf16: true
    disable_tqdm: true
    # The string "none", not a YAML null; quoted to make that explicit.
    report_to: "none"
    # NOTE(review): save_strategy is "no" while save_steps is also set --
    # confirm which of the two the trainer honors.
    save_strategy: "no"
    save_steps: *max_steps
    logging_strategy: "steps"
    logging_steps: 1