[feat] GRPO with distributed implementation #6230
Merged (37 commits, Apr 21, 2025)
Commits
8e6c9a4
add reward related function
TongLi3701 Feb 23, 2025
ffd3878
add simple grpo
TongLi3701 Feb 23, 2025
f736d74
update grpo
TongLi3701 Feb 25, 2025
070907d
polish
TongLi3701 Feb 28, 2025
c15225b
modify data loader
Mar 6, 2025
b96d690
grpo consumer
Mar 6, 2025
678f5a9
update loss
Mar 6, 2025
d03cdea
update reward fn
Mar 6, 2025
7f2ceac
update example
Mar 6, 2025
812f4b7
update loader
Mar 6, 2025
0f566cc
add algo selection
Mar 6, 2025
ab5b6d8
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 6, 2025
0cc0c84
add save
Mar 6, 2025
0590f10
update select algo
Mar 6, 2025
22cc155
Merge branch 'grpo-latest' of github.com:hpcaitech/ColossalAI into gr…
Mar 6, 2025
eb6337f
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 6, 2025
9d9d516
update grpo
Mar 10, 2025
754b16d
update reward fn
Mar 10, 2025
71a0181
update reward
Mar 10, 2025
abca66e
fix reward score
Mar 11, 2025
47d6493
add response length
Mar 11, 2025
704866a
detach
Mar 11, 2025
131eece
fix tp bug
Mar 13, 2025
afddfde
fix consumer
Mar 13, 2025
4702d57
convert to 8 generation
Mar 13, 2025
45ac6c6
print results
Mar 13, 2025
57b49da
setup update
Mar 13, 2025
bc0171d
fix transformers backend
YeAnbang Mar 14, 2025
7795d4c
[Feature] Support Distributed LogProb for GRPO Training (#6247)
duanjunwen Mar 18, 2025
7ee4452
fix vllm
YeAnbang Mar 19, 2025
0472f44
fix logprob, add filtering, temperature annealing, lr descent
YeAnbang Mar 21, 2025
d8eaf0d
simplify vllm preprocessing input ids
YeAnbang Mar 21, 2025
2aa7385
update logging
YeAnbang Mar 21, 2025
489f215
Merge pull request #6250 from hpcaitech/grpo-latest-dev
YeAnbang Mar 21, 2025
5015300
[feat] add microbatch forwarding (#6251)
YeAnbang Mar 28, 2025
ed43a4b
[Distributed RLHF] Integration of PP (#6257)
YeAnbang Apr 9, 2025
9467c10
[hot-fix] Fix memory leakage bug, support TP+PP (#6258)
YeAnbang Apr 10, 2025
Changes from all commits
2 changes: 2 additions & 0 deletions .gitignore
@@ -163,3 +163,5 @@ coverage.xml
# log, test files - ColossalChat
applications/ColossalChat/logs
applications/ColossalChat/tests/logs
applications/ColossalChat/wandb
applications/ColossalChat/model
35 changes: 26 additions & 9 deletions applications/ColossalChat/coati/dataset/loader.py
@@ -356,10 +356,24 @@ def apply_chat_template_and_mask(
truncation: bool = True,
ignore_idx: int = -100,
) -> Dict[str, torch.Tensor]:

system_prompt = "You are a helpful assistant. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think> </think> and<answer> </answer> tags, respectively, i.e., <think> reasoning process here </think><answer> answer here </answer>. Now the user asks you to solve a math problem that involves reasoning. After thinking, when you finally reach a conclusion, clearly output the final answer without explanation within the <answer> </answer> tags, i.e., <answer> 123 </answer>.\n\n"

system_element = {
"role": "system",
"content": system_prompt,
}

# Format for RL.
gt_answer = None
if "messages" in chat and "gt_answer" in chat:
gt_answer = chat["gt_answer"]
chat = [chat["messages"]]

tokens = []
assistant_mask = []
for i, msg in enumerate(chat):
msg_tokens = tokenizer.apply_chat_template([msg], tokenize=True)
msg_tokens = tokenizer.apply_chat_template([system_element, msg], tokenize=True, add_generation_prompt=True)
# remove unexpected bos token
if i > 0 and msg_tokens[0] == tokenizer.bos_token_id:
msg_tokens = msg_tokens[1:]
@@ -372,14 +386,10 @@ def apply_chat_template_and_mask(
if max_length is not None:
if padding and len(tokens) < max_length:
to_pad = max_length - len(tokens)
if tokenizer.padding_side == "right":
tokens.extend([tokenizer.pad_token_id] * to_pad)
assistant_mask.extend([False] * to_pad)
attention_mask.extend([0] * to_pad)
else:
tokens = [tokenizer.pad_token_id] * to_pad + tokens
assistant_mask = [False] * to_pad + assistant_mask
attention_mask = [0] * to_pad + attention_mask
# Left padding for generation.
tokens = [tokenizer.pad_token_id] * to_pad + tokens
assistant_mask = [False] * to_pad + assistant_mask
attention_mask = [0] * to_pad + attention_mask
if truncation and len(tokens) > max_length:
tokens = tokens[:max_length]
assistant_mask = assistant_mask[:max_length]
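Note: the padding branch now always left-pads, because these sequences are prompts for autoregressive generation and the prompt must sit flush against the tokens the model generates next. A small standalone illustration of the resulting layout (pad id 0 is only an example value):

pad_token_id, to_pad = 0, 3
tokens = [11, 22, 33]                            # prompt token ids
tokens = [pad_token_id] * to_pad + tokens        # [0, 0, 0, 11, 22, 33]
assistant_mask = [False] * to_pad + [False] * 3  # padding never contributes to labels
attention_mask = [0] * to_pad + [1] * 3          # [0, 0, 0, 1, 1, 1]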
@@ -389,6 +399,13 @@ def apply_chat_template_and_mask(
labels = input_ids.clone()
labels[~torch.tensor(assistant_mask, dtype=torch.bool)] = ignore_idx

if gt_answer is not None:
gt_answer = tokenizer.encode(
gt_answer, padding="max_length", truncation=True, max_length=128, return_tensors="pt"
)
gt_answer = gt_answer.squeeze(1)
return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels, "gt_answer": gt_answer}

return {
"input_ids": input_ids,
"attention_mask": attention_mask,
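Taken together, the loader now prefixes a reasoning system prompt, left-pads for generation, and returns the tokenized ground-truth answer alongside the prompt. A minimal usage sketch (assuming the ColossalChat package is importable as coati, and using an arbitrary chat-template tokenizer as a stand-in):

from transformers import AutoTokenizer

from coati.dataset.loader import apply_chat_template_and_mask

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-3B-Instruct")  # example model only
sample = {
    "messages": {"role": "user", "content": "What is 13 * 17?"},  # a single user turn
    "gt_answer": "221",
}
out = apply_chat_template_and_mask(tokenizer=tokenizer, chat=sample, max_length=512)
# Left-padded, system-prompt-prefixed prompt plus the encoded reference answer.
print(out["input_ids"].shape, out["gt_answer"].shape)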
30 changes: 22 additions & 8 deletions applications/ColossalChat/coati/distributed/consumer.py
@@ -1,3 +1,4 @@
import os
from contextlib import nullcontext
from typing import Any, Dict, Optional

@@ -33,6 +34,8 @@ def __init__(
model_config: Dict[str, Any],
plugin_config: Dict[str, Any],
microbatch_size: int = 1,
save_interval: int = 100,
save_dir: str = "./model",
):
self.num_producers = num_producers
self.num_episodes = num_episodes
@@ -44,14 +47,16 @@ def __init__(
self.num_recv_per_update = num_recv_per_update
self.batch_size = batch_size
self.microbatch_size = microbatch_size
self.save_interval = save_interval
self.save_dir = save_dir
assert batch_size % microbatch_size == 0, "batch_size should be divisible by microbatch_size"
self.num_microbatches = batch_size // microbatch_size

self.model_config = model_config
self.plugin_config = plugin_config
assert self.plugin_config.get("pp_size", 1) == 1, "pp_size > 1 is not supported now"

self.device = get_current_device()
self.lr_scheduler = None

def setup(self) -> None:
for i in range(self.num_producers):
@@ -60,18 +65,15 @@ def setup(self) -> None:
cc.init_collective_group(self.num_producers + 1, self.num_producers, group_name="sync_model")
launch(self.rank, self.world_size, self.master_addr, self.master_port, local_rank=0)

plugin_config = dict(
tp_size=1,
pp_size=1,
precision="bf16",
zero_stage=1,
)
plugin_config = dict(tp_size=1, pp_size=1, precision="bf16", zero_stage=2)
if self.plugin_config.get("pp_size", 1) > 1 and "num_microbatches" not in self.plugin_config:
plugin_config["microbatch_size"] = self.microbatch_size
plugin_config.update(self.plugin_config)
self.plugin = HybridParallelPlugin(**plugin_config)
self.booster = Booster(plugin=self.plugin)
self.dp_rank = dist.get_rank(self.plugin.dp_group)
self.tp_rank = dist.get_rank(self.plugin.tp_group)

self.dp_size = dist.get_world_size(self.plugin.dp_group)

self.buffer = []
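The setup path now defaults to ZeRO stage 2 and merges the caller's plugin_config on top, falling back to the consumer's microbatch size when pipeline parallelism is requested without an explicit schedule. A plain-dict illustration of the precedence (the caller values are hypothetical):

defaults = dict(tp_size=1, pp_size=1, precision="bf16", zero_stage=2)
caller_cfg = {"tp_size": 2, "pp_size": 2, "zero_stage": 1}  # hypothetical plugin_config
if caller_cfg.get("pp_size", 1) > 1 and "num_microbatches" not in caller_cfg:
    defaults["microbatch_size"] = 1  # stands in for self.microbatch_size
defaults.update(caller_cfg)  # caller-provided values win over the defaults
print(defaults)
# {'tp_size': 2, 'pp_size': 2, 'precision': 'bf16', 'zero_stage': 1, 'microbatch_size': 1}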
@@ -94,7 +96,6 @@ def loop(self) -> None:
i = 0
for _ in range(self.num_recv_per_update):
# receive data from producers

for r in range(self.num_producers):
print(f"[T{dist.get_rank()}] Recv data episode {episode} step {step} from {r}")
self.buffer.extend(
@@ -116,13 +117,26 @@ def loop(self) -> None:
pbar.set_postfix({"loss": loss})
i += 1
assert len(self.buffer) == 0
if self.lr_scheduler is not None:
self.lr_scheduler.step()
if (step + 1) % self.save_interval == 0:
if self.rank == 0:
print(f"Start saving policy model at step {step + 1}.")
save_path = os.path.join(self.save_dir, f"modeling-step-{step + 1}")
self.booster.save_model(self.policy_model, save_path, shard=True)
if self.rank == 0:
print(f"Saved model checkpoint at step {step + 1} in folder {save_path}")

if episode != self.num_episodes - 1 or step != self.num_update_per_episode - 1:
print(f"[T{dist.get_rank()}] Sync model episode {episode} step {step}")
torch.cuda.empty_cache()
state_dict = self.state_dict()
if self.rank == 0:
ray_broadcast_tensor_dict(
state_dict, src=self.num_producers, device=self.device, group_name="sync_model"
)
del state_dict
torch.cuda.empty_cache()


@ray.remote
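The diff is truncated at the @ray.remote decorator above; the GRPO loss itself is implemented elsewhere in this PR. As background for what the reward commits ("convert to 8 generation", "fix reward score") feed into, here is a generic sketch of the group-relative advantage that GRPO normalizes within each prompt's group of sampled responses (standard formulation, not the exact ColossalChat code):

import torch

def group_relative_advantages(rewards: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
    # rewards: (num_prompts, group_size), e.g. 8 sampled responses per prompt
    mean = rewards.mean(dim=1, keepdim=True)
    std = rewards.std(dim=1, keepdim=True)
    return (rewards - mean) / (std + eps)

rewards = torch.tensor([[1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0]])  # one prompt, group of 8
print(group_relative_advantages(rewards))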