@@ -52,7 +52,7 @@ def __init__(self, **kwargs):
5252 super ().__init__ (** kwargs )
5353 self .beta = beta
5454
55- self .model_wrapped = self .model_wrapped .to (torch .bfloat16 )
55+ # self.model_wrapped = self.model_wrapped.to(torch.bfloat16)
5656
5757 def _generate_and_score_completions (
5858 self , inputs : list [dict [str , Union [torch .Tensor , Any ]]]
@@ -153,7 +153,7 @@ def _generate_and_score_completions(
153153 prompt_ids ,
154154 prompt_mask ,
155155 )
156-
156+ # breakpoint()
157157 # Rollout
158158 with (
159159 profiling_context (self , "transformers.generate" ),
@@ -189,10 +189,9 @@ def _generate_and_score_completions(
189189 use_scheduler = self .args .use_scheduler ,
190190 )
191191 logger .info ("Rollout completed" )
192- if (
193- self .args .torch_empty_cache_steps is not None
194- and self .state .global_step % self .args .torch_empty_cache_steps == 0
195- ):
192+
193+ # Let DeepSpeed manage the CUDA cache itself
194+ if self .accelerator .distributed_type != DistributedType .DEEPSPEED :
196195 torch .cuda .empty_cache ()
197196
198197 # Compute prompt length and extract completion ids
@@ -243,22 +242,12 @@ def _generate_and_score_completions(
243242 example ["student_logprob" ] = logprob
244243
245244 with torch .no_grad ():
246- # If the generation and optimization steps are misaligned—i.e., if generation does not occur at the end of
247- # a full optimizer step (when gradient_accumulation_steps is not a multiple of generate_every)—then the
248- # **samples** may come from an earlier version of the model. In that case, we need to track old_per_token_logps
249- # for importance sampling. If the steps are aligned, importance sampling isn't necessary and we set
250- # old_per_token_logps to None.
251- # This will only run when self._step % generate_every == 0 or self._buffered_inputs is None
252- # generate_every = (
253- # self.args.steps_per_generation * self.num_iterations
254- # ) # generation frequency
245+ # In the diffusion setting we already have per-token log-probs for the rollout trajectory (`sequence_logp`)
246+ # computed in `generate`. We always reuse them as `old_per_token_logps` so that we can explicitly
247+ # measure and correct any on/off-policy mismatch during replay.
255248 old_per_token_logps = sequence_logp .clone ().detach ()
256- # if self.args.gradient_accumulation_steps % generate_every != 0:
257- # old_per_token_logps = sequence_logp.clone().detach()
258- # else:
259- # old_per_token_logps = None
260249
261- # Compute the per-token log probabilities for the reference model
250+ # Compute the per-token log probabilities for the reference model when KL regularization is enabled.
262251 if self .beta != 0.0 :
263252 ref_per_token_logps = old_per_token_logps .clone ().detach ()
264253 else :
@@ -452,16 +441,10 @@ def _compute_loss(self, model, inputs, num_items_in_batch):
452441 inputs ["completion_mask" ],
453442 )
454443 input_ids = torch .cat ([prompt_ids , completion_ids ], dim = 1 )
455- # Replay must mirror the rollout mask: during generation only the prompt tokens
456- # were marked as valid, so keep zeros on the completion portion.
457- prompt_only_mask = torch .ones_like (prompt_ids , dtype = prompt_mask .dtype )
458- attention_mask = torch .cat (
459- [
460- prompt_only_mask ,
461- torch .zeros_like (completion_ids , dtype = prompt_mask .dtype ),
462- ],
463- dim = 1 ,
464- )
444+ # Replay must mirror the rollout mask used during generation:
445+ # prompt padding is preserved, completion tokens are treated as valid (non-padding) tokens.
446+ attention_mask = torch .ones_like (input_ids , dtype = prompt_mask .dtype )
447+ attention_mask [:, :prompt_len ] = prompt_mask
465448 sampling_traj = inputs ["sampling_traj" ]
466449 x0_hist = inputs ["x0_hist" ]
467450 all_advantages = inputs ["advantages" ]
@@ -483,17 +466,13 @@ def _compute_loss(self, model, inputs, num_items_in_batch):
483466
484467 all_traj_len = self .accelerator .gather (
485468 torch .tensor (traj_len , device = input_ids_batch .device )
486- )
469+ )
487470 max_traj_len = all_traj_len .max ().item ()
488471
489472 mask_id = self .args .mask_id
490473 cur_input = input_ids_batch .clone ()
491474 cur_input [:, prompt_len :] = mask_id
492- if (
493- self .args .torch_empty_cache_steps is not None
494- and self .state .global_step % self .args .torch_empty_cache_steps == 0
495- ):
496- torch .cuda .empty_cache ()
475+ # torch.cuda.empty_cache()
497476 for step in tqdm (range (max_traj_len ), desc = "Computing per-token logps" ):
498477 # logger.info(f"Step {step} of {traj_len}")
499478 # running the model in batches per step
@@ -546,14 +525,19 @@ def _compute_loss(self, model, inputs, num_items_in_batch):
546525 cur_logp = torch .zeros_like (
547526 unmasking_prob [batch ], dtype = torch .float32
548527 ).unsqueeze (0 )
528+ EPS = 1e-6
529+ clamped_prob = torch .clamp (unmasking_prob [batch ], min = EPS , max = 1.0 - EPS )
549530 if len (cur_traj [batch ][step ]) > 0 :
531+ # Use log1p for log(1-p) when p is small
550532 cur_logp [:, keep_mask_index_mask ] = torch .log1p (
551- - unmasking_prob [ batch , keep_mask_index_mask ]
533+ - clamped_prob [ keep_mask_index_mask ]
552534 )
535+ # Use log for log(p), now safe due to clamping
553536 cur_logp [:, unmasking_index_mask ] = (
554- torch .log (unmasking_prob [ batch , unmasking_index_mask ])
537+ torch .log (clamped_prob [ unmasking_index_mask ])
555538 + x0_logp [batch , unmasking_index_mask ]
556539 )
540+
557541 if (
558542 torch .isnan (cur_logp ).sum () > 0
559543 or not torch .isfinite (cur_logp ).all ()
@@ -620,17 +604,15 @@ def _compute_loss(self, model, inputs, num_items_in_batch):
620604 # Two-sided clipping
621605 if self .args .delta is not None :
622606 coef_1 = torch .clamp (coef_1 , max = self .args .delta )
623- advantages = torch .where (
624- advantages < self .args .advantage_min_clip ,
625- torch .zeros_like (
626- advantages
627- ), # ignores advantages below a threshold
628- advantages ,
607+ advantages = torch .clamp (
608+ advantages , min = self .args .advantage_min_clip
629609 )
630610
631611 per_token_loss1 = coef_1 * advantages .unsqueeze (1 )
632612 per_token_loss2 = coef_2 * advantages .unsqueeze (1 )
633613 per_token_loss = - torch .min (per_token_loss1 , per_token_loss2 )
614+ # if entropy_mask is not None:
615+ # per_token_loss = per_token_loss * entropy_mask
634616
635617 if self .beta != 0.0 :
636618 per_token_loss = per_token_loss + self .beta * per_token_kl
@@ -640,21 +622,28 @@ def _compute_loss(self, model, inputs, num_items_in_batch):
640622 / per_token_loss .size (0 )
641623 / self .max_completion_length
642624 )
643- loss = loss / self . current_gradient_accumulation_steps
625+
644626 if loss .grad_fn is None :
645627 # this means that no token is unmasked; this can happen because the generated completion rollout is split into smaller batches
646628 # raise ValueError("No gradient found")
647629 loss = logits .exp ().sum () * 0.0 + unmasking_prob .sum () * 0.0
648630 loss_list .append (loss .item ())
649631 # print(f"Loss: {loss}")
650- # Backward pass
651632 if bad_flag or loss .isnan ():
652633 accel_break (bad_process_index )
653- # logger.info(f"[Rank {self.accelerator.process_index}]Loss: {loss}")
654- self .backward (loss , num_items_in_batch )
634+
635+ # Backward pass: accumulate gradients over diffusion steps but only let DeepSpeed
636+ # take an optimizer step on the final (chunk, step) pair.
637+ force_deepspeed_step = False
638+ if self .accelerator .distributed_type == DistributedType .DEEPSPEED :
639+ is_last_chunk = start + batch_size == input_ids .size (0 )
640+ is_last_step = step == max_traj_len - 1
641+ force_deepspeed_step = is_last_chunk and is_last_step
642+ self .backward (loss , num_items_in_batch , force_deepspeed_step = force_deepspeed_step )
655643 return_loss += loss .detach ()
656644
657645 del cur_input
646+ # torch.cuda.empty_cache() # to reduce memory usage but will make things super slow
658647 cur_input = next_input
659648
660649 # Log the metrics
@@ -758,11 +747,31 @@ def compute_loss(
758747 else :
759748 return self ._compute_loss (model , inputs , num_items_in_batch )
760749
761- def backward (self , loss : torch .Tensor , num_items_in_batch ):
750+ def backward (self , loss : torch .Tensor , num_items_in_batch , force_deepspeed_step = False ):
751+ if (force_deepspeed_step and self .accelerator .distributed_type != DistributedType .DEEPSPEED ):
752+ raise ValueError ("force_deepspeed_step should only be true during DeepSpeed runs" )
753+
762754 kwargs = {}
755+
756+
757+ # since we don't want deepspeed to step the optimizer
758+ # every single time we unmask a chunk, we force the gradient sync
759+ # flag to be false here until we're ready to step (after the full rollout)
760+ if self .accelerator .distributed_type == DistributedType .DEEPSPEED :
761+ orig_sync = getattr (self .accelerator , "sync_gradients" , True )
762+ self .accelerator .sync_gradients = force_deepspeed_step
763+ self .accelerator .backward (loss , ** kwargs )
764+ self .accelerator .sync_gradients = orig_sync
765+ return # exit early so the torch.cuda.empty_cache() below never runs while gradients are still accumulating
766+
767+ if (
768+ self .args .torch_empty_cache_steps is not None
769+ and self .state .global_step % self .args .torch_empty_cache_steps == 0
770+ ):
771+ torch .cuda .empty_cache ()
763772
764773 if self .args .n_gpu > 1 :
765- loss = loss .mean () # mean() to average on multi-gpu parallel training
774+ loss = loss .mean () # mean() to average on multi-gpu parallel training (non-deepspeed)
766775
767776 # Finally we need to normalize the loss for reporting if GA loss bug is not fixed during compute loss
768777 if (
@@ -771,9 +780,4 @@ def backward(self, loss: torch.Tensor, num_items_in_batch):
771780 # If the model does not accept loss kwargs, we need to normalize the loss by the number of gradient accumulation steps
772781 loss = loss / self .current_gradient_accumulation_steps
773782
774- # Turning off loss scaling w.r.t. gradient accumulation when DeepSpeed is enabled
775- # https://github.com/huggingface/transformers/pull/35808
776- if self .accelerator .distributed_type == DistributedType .DEEPSPEED :
777- kwargs ["scale_wrt_gas" ] = False
778-
779783 self .accelerator .backward (loss , ** kwargs )
0 commit comments