-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtest_reward_model_separate_dataset.py
124 lines (105 loc) · 3.27 KB
/
test_reward_model_separate_dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
# To run this script: `echo test_reward_model_separate_dataset.py | entr -s "uv run test_reward_model_separate_dataset.py"`
import torch
import json
from datasets import load_from_disk, DatasetDict, load_dataset
from transformers import (
AutoModelForSequenceClassification,
AutoTokenizer,
)
from trl import RewardTrainer, RewardConfig
from peft.tuners.lora import LoraConfig
from peft.mapping import get_peft_model
import wandb
import os
from huggingface_hub import HfApi
from dotenv import load_dotenv
# Load environment variables from the workspace .env file before HF_TOKEN is
# read below.
load_dotenv("/workspace/.env")

# Add logging to check HF_TOKEN
hf_token = os.environ.get("HF_TOKEN")
if hf_token:
    print("HF_TOKEN is set in the environment.")
    api = HfApi()
    try:
        user_info = api.whoami(token=hf_token)
        print(f"Logged in as: {user_info['name']}")
    except Exception as e:
        # Best-effort diagnostic: a failed identity check is reported but does
        # not stop the run.
        print(f"Error verifying HuggingFace login: {e}")
else:
    print("HF_TOKEN is not set in the environment.")

# Print only the variable NAMES. Dumping os.environ itself would write the
# values of secrets such as HF_TOKEN into the logs.
print("All variables: ", sorted(os.environ))
# Configuration
# Base checkpoint to fine-tune and the directory the results are written to.
model_name: str = "unsloth/Llama-3.2-3B"
output_dir: str = "./reward_model_output"

# Sequence-length budget, passed to both the tokenizer and the trainer config.
max_length: int = 8192

# Optimization schedule.
num_epochs: int = 2
learning_rate: float = 5e-5
# TODO: For some reason making this larger doesn't help training time, why?
batch_size: int = 1
gradient_accumulation_steps: int = 4
# Start the Weights & Biases run; the training config reports to wandb.
wandb.init(project="ensure_same_perf")

print("Loading dataset...")
# The trainer reads the "train" and "test" splits of this dataset.
dataset_id = "OpenPipe/test-reward-dataset-tmp-delete-me"
dataset: DatasetDict = load_dataset(dataset_id)
print("Loading tokenizer and model...")
# NOTE(review): padding/truncation/max_length are normally call-time tokenizer
# arguments; confirm they have any effect when passed to from_pretrained.
tokenizer = AutoTokenizer.from_pretrained(
    model_name, padding=True, truncation=True, max_length=max_length
)
# num_labels=1 gives the classification head a single output, used here as the
# scalar reward score.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=1,
    device_map="auto",
    # attn_implementation="flash_attention_2",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
)

# Log the padding setup exactly as loaded, before any adjustment below.
print(f"Tokenizer padding token: {tokenizer.pad_token}")
print(f"Model padding token: {model.config.pad_token_id}")

# Some checkpoints ship without a pad token. Without one the model config would
# receive pad_token_id=None and padded batches could not be processed, so fall
# back to the EOS token. This is a no-op when a pad token is already defined.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Keep the model and tokenizer in agreement about which id marks padding.
model.config.pad_token_id = tokenizer.pad_token_id
tokenizer.padding_side = "right"
print("Configuring LoRA...")
# Adapter hyper-parameters for the sequence-classification task.
lora_settings = {
    "task_type": "SEQ_CLS",
    "r": 8,
    "lora_alpha": 16,
    "lora_dropout": 0,
}
peft_config = LoraConfig(**lora_settings)

# Wrap the base model with the LoRA adapters; `model` now refers to the
# PEFT-wrapped model.
model = get_peft_model(model, peft_config)
print("Model config: ", model.config)
# Configure training arguments
training_args = RewardConfig(
    output_dir=output_dir,
    num_train_epochs=num_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    learning_rate=learning_rate,
    weight_decay=0,
    # `eval_strategy` replaces the deprecated `evaluation_strategy`, which
    # later transformers releases no longer accept.
    eval_strategy="steps",
    # Per the TrainingArguments docs, a float below 1 is a fraction of the total
    # training steps, so this evaluates every 10% of training.
    eval_steps=0.1,
    logging_steps=5,
    save_strategy="steps",
    save_steps=500,
    # load_best_model_at_end=True,
    max_length=max_length,
    report_to="wandb",
    # NOTE(review): `no_cuda` is a legacy flag name; confirm it is still
    # accepted by the installed transformers version.
    no_cuda=False,
    bf16=True,
    use_liger_kernel=False,
    warmup_steps=10,
    optim="adamw_bnb_8bit",
    gradient_accumulation_steps=gradient_accumulation_steps,
)
print("Initializing RewardTrainer...")
train_split = dataset["train"]
eval_split = dataset["test"]  # the "test" split is used for validation
trainer = RewardTrainer(
    model=model,
    args=training_args,
    train_dataset=train_split,
    eval_dataset=eval_split,
    tokenizer=tokenizer,
)

# Dump the trainer's arguments as JSON so the run's settings appear in the log.
print("Trainer args for model:")
resolved_args = trainer.args.to_dict()
print(json.dumps(resolved_args, indent=2))

print("Starting model training...")
trainer.train()

# Write the trained model and the tokenizer to the same output directory.
print("Saving final model...")
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)

print("Reward model training complete")