Adding DPO training #7

Open
Goekdeniz-Guelmez wants to merge 24 commits into main from adding-dpo-training
Commits (24)
85f6b5d
udpate lora_config.yaml
Goekdeniz-Guelmez Mar 14, 2025
e22beba
update datasets.py
Goekdeniz-Guelmez Mar 14, 2025
db5fc85
update lora.py
Goekdeniz-Guelmez Mar 14, 2025
3d75414
update lora_config.yaml + add dpo_trainer.py
Goekdeniz-Guelmez Mar 14, 2025
03eb6bf
update LORA.md
Goekdeniz-Guelmez Mar 14, 2025
2b91010
fix typo
Goekdeniz-Guelmez Mar 14, 2025
4c2c2f7
fix lora.py
Goekdeniz-Guelmez Mar 14, 2025
4623aa2
update acknowledgements.md + nits
Goekdeniz-Guelmez Mar 14, 2025
72dd971
Merge branch 'ml-explore:main' into adding-dpo-training
Goekdeniz-Guelmez Mar 17, 2025
3a18487
Merge branch 'ml-explore:main' into adding-dpo-training
Goekdeniz-Guelmez Mar 18, 2025
dd2eb66
Merge branch 'main' into adding-dpo-training
Goekdeniz-Guelmez Mar 18, 2025
a73ecc6
formatting
Goekdeniz-Guelmez Mar 18, 2025
26c45b4
Merge branch 'main' into adding-dpo-training
Goekdeniz-Guelmez Mar 19, 2025
d277bfa
makiing key names customizable
Goekdeniz-Guelmez Mar 19, 2025
b41b8f9
Merge branch 'ml-explore:main' into adding-dpo-training
Goekdeniz-Guelmez Mar 24, 2025
e4d993a
Merge branch 'ml-explore:main' into adding-dpo-training
Goekdeniz-Guelmez Mar 25, 2025
79f0d2e
nits
Goekdeniz-Guelmez Mar 25, 2025
6d437b7
nits
Goekdeniz-Guelmez Mar 25, 2025
652bcbe
Merge branch 'ml-explore:main' into adding-dpo-training
Goekdeniz-Guelmez Mar 27, 2025
1af2e9a
Merge branch 'main' into adding-dpo-training
Goekdeniz-Guelmez Mar 27, 2025
97fc155
fix
Goekdeniz-Guelmez Mar 27, 2025
4281a8d
Merge branch 'main' into adding-dpo-training
Goekdeniz-Guelmez Mar 31, 2025
1305482
Merge branch 'ml-explore:main' into adding-dpo-training
Goekdeniz-Guelmez Apr 17, 2025
70e3eea
Merge branch 'main' into adding-dpo-training
Goekdeniz-Guelmez Apr 21, 2025
2 changes: 1 addition & 1 deletion ACKNOWLEDGMENTS.md
@@ -9,4 +9,4 @@ MLX LM was developed with contributions from the following individuals:

- Shunta Saito: Added support for PLaMo models.
- Prince Canuma: Helped add support for `Starcoder2` models.
- Gökdeniz Gülmez: Added support for the following architectures: OpenBMB's `MiniCPM` and `MiniCPM3`, Kyutai's `Helium`, State-Space's`Mamba v1`, Z.ai & THUKEG's `GLM4`, and Allenai's `OLMoE`; Added support for the following training algorithms: `full-fine-tuning`; Added support for the following other features: `Multiple Optimizers to choose for training`.
- Gökdeniz Gülmez: Added support for the following architectures: OpenBMB's `MiniCPM` and `MiniCPM3`, Kyutai's `Helium`, State-Space's`Mamba v1`, Z.ai & THUKEG's `GLM4`, and Allenai's `OLMoE`; Added support for the following training algorithms: `full-fine-tuning`, and `Direct Preference Optimization (DPO)`; Added support for the following other features: `Multiple Optimizers to choose for training`.
33 changes: 33 additions & 0 deletions mlx_lm/LORA.md
@@ -18,6 +18,7 @@ LoRA (QLoRA).[^qlora] LoRA fine-tuning works with the following model families:

- [Run](#Run)
- [Fine-tune](#Fine-tune)
- [DPO-Training](#DPO-Training)
- [Evaluate](#Evaluate)
- [Generate](#Generate)
- [Fuse](#Fuse)
@@ -84,6 +85,38 @@ ignore the prompt and compute loss for just the completion by passing
datasets. For `chat` datasets the final message in the message list is
considered the completion. See the [dataset section](#Data) for more details.

### DPO Training

Direct Preference Optimization (DPO) fine-tunes a model on human preference data: for each prompt, a preferred (chosen) and a less preferred (rejected) response. To use DPO training, set the training mode to `dpo`:

```shell
mlx_lm.lora \
--model <path_to_model> \
--train \
--training-mode dpo \
--data <path_to_data> \
--beta 0.1
```

DPO training accepts the following additional parameters; a sketch of how they enter the loss follows the list:

- `--beta`: Controls the strength of the DPO loss (default: 0.1)
- `--dpo-loss-type`: Choose between "sigmoid" (default), "hinge", "ipo", or "dpop" loss functions
- `--delta`: Margin parameter for hinge loss (default: 50.0)
- `--reference-model-path`: Path to a reference model for DPO training
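
The sketch below illustrates how `--beta`, `--delta`, and `--dpo-loss-type` might enter the preference loss. It is written against public descriptions of DPO, IPO, and DPOP rather than the exact code in `dpo_trainer.py`, and it follows the CLI help in treating `--delta` as a DPOP penalty weight. The inputs are the summed log-probabilities of the chosen and rejected completions under the policy and the frozen reference model.

```python
import mlx.core as mx

def dpo_style_loss(pi_chosen, pi_rejected, ref_chosen, ref_rejected,
                   beta=0.1, delta=50.0, loss_type="sigmoid"):
    # Preference margin: how much more the policy favors the chosen response
    # over the rejected one, relative to the reference model.
    logits = (pi_chosen - ref_chosen) - (pi_rejected - ref_rejected)
    if loss_type == "sigmoid":
        # Standard DPO: -log(sigmoid(beta * logits)), written stably.
        return mx.logaddexp(0.0, -beta * logits)
    if loss_type == "hinge":
        return mx.maximum(0.0, 1.0 - beta * logits)
    if loss_type == "ipo":
        # IPO regresses the margin toward 1 / (2 * beta).
        return (logits - 1.0 / (2.0 * beta)) ** 2
    if loss_type == "dpop":
        # DPOP adds a penalty, weighted by delta, when the chosen response's
        # log-probability drops below the reference model's.
        penalty = mx.maximum(0.0, ref_chosen - pi_chosen)
        return mx.logaddexp(0.0, -(beta * logits - delta * penalty))
    raise ValueError(f"Unknown loss type: {loss_type}")
```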

For DPO training, the data should be in JSONL format with the following structure:

```jsonl
{"prompt": "User prompt", "chosen": "Preferred response", "rejected": "Less preferred response"}
```

If the prompt template accepts a system message, you can extend the dataset with an additional `"system"` field:

```jsonl
{"system": "You are a helpful assistant", "prompt": "User prompt", "chosen": "Preferred response", "rejected": "Less preferred response"}
```
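
For reference, here is a minimal sketch of how one preference record could be turned into the token sequences a DPO loss compares. It assumes an HF-style tokenizer exposing `apply_chat_template`; the field names follow the defaults above, and the PR's actual dataset code in `datasets.py` may build the sequences differently.

```python
import json

def encode_preference_record(record: dict, tokenizer) -> dict:
    # Shared conversation prefix: optional system message plus the user prompt.
    messages = []
    if "system" in record:
        messages.append({"role": "system", "content": record["system"]})
    messages.append({"role": "user", "content": record["prompt"]})

    def encode(completion: str) -> list[int]:
        full = messages + [{"role": "assistant", "content": completion}]
        return tokenizer.apply_chat_template(full, tokenize=True)

    # The loss compares the two continuations of the same prompt.
    return {"chosen": encode(record["chosen"]), "rejected": encode(record["rejected"])}

# Usage with one JSONL line:
# record = json.loads('{"prompt": "...", "chosen": "...", "rejected": "..."}')
# tokens = encode_preference_record(record, tokenizer)
```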

### Evaluate

To compute test set perplexity use:
18 changes: 17 additions & 1 deletion mlx_lm/examples/lora_config.yaml
@@ -7,6 +7,18 @@ train: true
# The fine-tuning method: "lora", "dora", or "full".
fine_tune_type: lora

# The training mode: "normal" or "dpo".
training_mode: normal

# If you set training_mode to "dpo"
# beta: 0.1
# The dpo-loss-type: "sigmoid", "hinge", "ipo", or "dpop"
# dpo_loss_type: "sigmoid"
# is_reference_free: False
# delta: 50.0
# If reference_model_path is not given, the same model is used as the reference
# reference_model_path: "mlx_model"

# The Optimizer with its possible inputs
optimizer: adamw
# optimizer_config:
@@ -86,4 +98,8 @@ lora_parameters:
# valid_split: "train[-100:]"
# prompt_feature: "text"
# completion_feature: "summary"

# for DPO training
# prompt_feature: "text"
# system_feature: "system"
# chosen_feature: "chosen"
# rejected_feature: "rejected"
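
As the commented keys above suggest, DPO datasets whose columns use different names can be remapped. The sketch below shows what such remapping amounts to; the exact handling lives in the PR's `datasets.py` and may differ.

```python
def remap_dpo_record(record, prompt_feature="prompt", system_feature="system",
                     chosen_feature="chosen", rejected_feature="rejected"):
    # Translate a dataset row with custom column names into the canonical
    # prompt / system / chosen / rejected fields used for DPO training.
    out = {
        "prompt": record[prompt_feature],
        "chosen": record[chosen_feature],
        "rejected": record[rejected_feature],
    }
    if system_feature in record:
        out["system"] = record[system_feature]
    return out
```
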
163 changes: 131 additions & 32 deletions mlx_lm/lora.py
@@ -15,6 +15,7 @@

from .tokenizer_utils import TokenizerWrapper
from .tuner.datasets import load_dataset
from .tuner.dpo_trainer import DPOTrainingArgs, evaluate_dpo, train_dpo
from .tuner.trainer import TrainingArgs, TrainingCallback, evaluate, train
from .tuner.utils import (
build_schedule,
@@ -44,6 +45,7 @@
"model": "mlx_model",
"train": False,
"fine_tune_type": "lora",
"training_mode": "normal",
"optimizer": "adam",
"optimizer_config": {
"adam": {},
@@ -69,6 +71,11 @@
"lr_schedule": None,
"lora_parameters": {"rank": 8, "dropout": 0.0, "scale": 10.0},
"mask_prompt": False,
# DPO args
"beta": 0.1,
"dpo_loss_type": "sigmoid",
"delta": 50.0,
"reference_model_path": None,
}


@@ -101,6 +108,12 @@ def build_parser():
choices=["lora", "dora", "full"],
help="Type of fine-tuning to perform: lora, dora, or full.",
)
parser.add_argument(
"--training-mode",
type=str,
choices=["normal", "dpo"],
help="Training mode: normal or DPO",
)
parser.add_argument(
"--optimizer",
type=str,
@@ -181,6 +194,30 @@ def build_parser():
default=None,
)
parser.add_argument("--seed", type=int, help="The PRNG seed")

# DPO args
parser.add_argument(
"--beta",
type=float,
help="Temperature parameter for DPO training.",
default=0.1,
)
parser.add_argument(
"--dpo-loss-type",
type=str,
help="DPO loss type: 'sigmoid', 'hinge', 'ipo', or 'dpop'.",
choices=["sigmoid", "hinge", "ipo", "dpop"],
default="sigmoid",
)
parser.add_argument(
"--delta", type=float, help="Delta parameter for DPOP loss type.", default=50.0
)
parser.add_argument(
"--reference-model-path",
type=str,
help="Path to reference model weights. If None, uses the same model.",
default=None,
)
return parser


@@ -227,18 +264,7 @@ def train_model(
adapter_file = adapter_path / "adapters.safetensors"
save_config(vars(args), adapter_path / "adapter_config.json")

# init training args
training_args = TrainingArgs(
batch_size=args.batch_size,
iters=args.iters,
val_batches=args.val_batches,
steps_per_report=args.steps_per_report,
steps_per_eval=args.steps_per_eval,
steps_per_save=args.save_every,
adapter_file=adapter_file,
max_seq_length=args.max_seq_length,
grad_checkpoint=args.grad_checkpoint,
)
model.train()

# Initialize the selected optimizer
lr = build_schedule(args.lr_schedule) if args.lr_schedule else args.learning_rate
@@ -255,31 +281,104 @@

opt = opt_class(learning_rate=lr, **optimizer_config)

# Train model
train(
model=model,
tokenizer=tokenizer,
args=training_args,
optimizer=opt,
train_dataset=train_set,
val_dataset=valid_set,
training_callback=training_callback,
)
if args.training_mode == "dpo":
training_args = DPOTrainingArgs(
batch_size=args.batch_size,
iters=args.iters,
val_batches=args.val_batches,
steps_per_report=args.steps_per_report,
steps_per_eval=args.steps_per_eval,
steps_per_save=args.save_every,
adapter_file=adapter_file,
max_seq_length=args.max_seq_length,
grad_checkpoint=args.grad_checkpoint,
beta=args.beta,
loss_type=args.dpo_loss_type,
delta=args.delta,
reference_model_path=args.reference_model_path,
)

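# Use a separate frozen reference model when a path is given; otherwise load a
# second copy of the base model to serve as the frozen reference.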
if args.reference_model_path:
reference_model, _ = load(args.reference_model_path)
else:
reference_model, _ = load(args.model)

train_dpo(
model=model,
ref_model=reference_model.freeze(),
tokenizer=tokenizer,
optimizer=opt,
train_dataset=train_set,
val_dataset=valid_set,
args=training_args,
training_callback=training_callback,
)
else:
training_args = TrainingArgs(
batch_size=args.batch_size,
iters=args.iters,
val_batches=args.val_batches,
steps_per_report=args.steps_per_report,
steps_per_eval=args.steps_per_eval,
steps_per_save=args.save_every,
adapter_file=adapter_file,
max_seq_length=args.max_seq_length,
grad_checkpoint=args.grad_checkpoint,
)

# Train model
train(
model=model,
tokenizer=tokenizer,
args=training_args,
optimizer=opt,
train_dataset=train_set,
val_dataset=valid_set,
training_callback=training_callback,
)


def evaluate_model(args, model: nn.Module, tokenizer: TokenizerWrapper, test_set):
test_loss = evaluate(
model=model,
dataset=test_set,
tokenizer=tokenizer,
batch_size=args.batch_size,
num_batches=args.test_batches,
max_seq_length=args.max_seq_length,
)
model.eval()

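# DPO evaluation also needs a reference model; without a separate path, the
# policy model itself is reused (frozen) as the reference.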
if args.training_mode == "dpo":
if args.reference_model_path:
reference_model, _ = load(args.reference_model_path)
else:
reference_model = model

test_loss, _, _, test_metrics = evaluate_dpo(
model=model,
ref_model=reference_model.freeze(),
dataset=test_set,
batch_size=args.batch_size,
num_batches=args.test_batches,
max_seq_length=args.max_seq_length,
beta=args.beta,
delta=args.delta,
loss_type=args.dpo_loss_type,
)

test_ppl = math.exp(test_loss)

print(f"Test loss {test_loss:.3f}, Test ppl {test_ppl:.3f}")
print("DPO Test Metrics:")
for metric_name, metric_value in test_metrics.items():
print(f" {metric_name}: {float(metric_value):.3f}")

else:
test_loss = evaluate(
model=model,
dataset=test_set,
tokenizer=tokenizer,
batch_size=args.batch_size,
num_batches=args.test_batches,
max_seq_length=args.max_seq_length,
)

test_ppl = math.exp(test_loss)
test_ppl = math.exp(test_loss)

print(f"Test loss {test_loss:.3f}, Test ppl {test_ppl:.3f}.")
print(f"Test loss {test_loss:.3f}, Test ppl {test_ppl:.3f}.")


def run(args, training_callback: TrainingCallback = None):