Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 67 additions & 5 deletions QEfficient/cloud/finetune_experimental.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,10 @@
from pathlib import Path
from typing import Any, Dict, List, Tuple

import torch
import torch.distributed as dist
from accelerate.utils import ParallelismConfig

from QEfficient.finetune.experimental.core.callbacks import replace_progress_callback
from QEfficient.finetune.experimental.core.component_registry import ComponentFactory
from QEfficient.finetune.experimental.core.config_manager import (
Expand All @@ -26,6 +30,7 @@
from QEfficient.finetune.experimental.core.trainer import sft_trainer # noqa: F401
from QEfficient.finetune.experimental.core.utils.device_map_utils import get_device_map
from QEfficient.finetune.experimental.core.utils.peft_utils import convert_peft_config_to_lora_config
from QEfficient.finetune.experimental.core.utils.tp_peft_utils import apply_peft_to_model
from QEfficient.finetune.experimental.core.utils.training_config_utils import prepare_training_config

logger = Logger(__name__)
Expand Down Expand Up @@ -59,7 +64,9 @@ def __init__(self, config_manager: ConfigManager):

# Prepare training configuration
self.training_config = prepare_training_config(config_manager=self.config_manager)

self.tp_enabled = self.training_config["tp_degree"] > 1
if self.tp_enabled:
self._initialize_dist_tp()
# Create datasets
logger.log_rank_zero("Creating datasets...")
self.train_dataset, self.eval_dataset = self._create_datasets()
Expand Down Expand Up @@ -111,6 +118,21 @@ def _setup_environment(self) -> None:
os.environ["TRACKIO_DIR"] = str(self.output_dir / "trackio_logs")
os.environ["TENSORBOARD_LOGGING_DIR"] = str(self.output_dir)

def _initialize_dist_tp(self) -> None:
    """Initialize the ``torch.distributed`` process group for tensor parallelism.

    Reads the process-group topology from the standard torchrun environment
    variables (``WORLD_SIZE``, ``RANK``). Selects the NCCL backend when
    training on CUDA; otherwise registers a composite backend so that gloo
    handles CPU tensors and QCCL handles QAIC tensors.

    Raises:
        RuntimeError: propagated from ``dist.init_process_group`` if the
            rendezvous environment (e.g. MASTER_ADDR/MASTER_PORT) is not set.
    """
    world_size = int(os.getenv("WORLD_SIZE", "1"))
    # Use the *global* RANK, not LOCAL_RANK: LOCAL_RANK is per-node and would
    # collide across nodes in multi-node runs. Fall back to LOCAL_RANK for
    # launchers that only export it (single-node case, where they coincide).
    rank = int(os.getenv("RANK", os.getenv("LOCAL_RANK", "0")))

    if self.training_config["device"] == "cuda":
        backend = "nccl"
    else:
        # Composite backend: gloo serves CPU tensors, QCCL serves QAIC devices.
        backend = "cpu:gloo,qaic:qccl"

    # Guard against double initialization (e.g. re-entrant setup paths).
    if not dist.is_initialized():
        dist.init_process_group(
            backend=backend,
            world_size=world_size,  # total number of processes
            rank=rank,  # unique global ID for this process
        )

def _create_datasets(self) -> Tuple[Any, Any]:
"""
Create training and evaluation datasets.
Expand Down Expand Up @@ -161,8 +183,11 @@ def _create_model(self) -> Any:
model_name = model_config.pop("model_name")

# Get training config for PP settings
training_config = self.config.training
pp_degree = training_config.get("pp_degree", 1)
# training_config = self.config.training
training_config = self.training_config

pp_degree = self.training_config.get("pp_degree", 1)

device = training_config.get("device", "qaic")

# Generate device_map for pipeline parallelism if pp_degree > 1
Expand All @@ -176,11 +201,38 @@ def _create_model(self) -> Any:
model_config["device_map"] = device_map
logger.log_rank_zero(f"Pipeline Parallelism enabled: Using device_map for {pp_degree} stages")

tp_degree = training_config.pop("tp_degree", 1)

if tp_degree > 1:
pc = training_config.get("parallelism_config")
if not isinstance(pc, ParallelismConfig):
raise TypeError(f"Expected ParallelismConfig, got {type(pc).__name__}")
device_mesh = pc.build_device_mesh(device)
tp_mesh = device_mesh["tp"]
model_config["tp_plan"] = "auto"
model_config["tp_size"] = tp_degree
model_config["device_mesh"] = tp_mesh

# Filter out PEFT-related fields, these shouldn't be passed to model creation
excluded_keys = {"use_peft", "peft_config"}
model_config_kwargs = {k: v for k, v in model_config.items() if k not in excluded_keys}

model_instance = ComponentFactory.create_model(model_type, model_name, **model_config_kwargs)

if tp_degree > 1:
# Need to explicitly untie the embedding weights here to consider
# this as separate params in further TP processing
model_instance.model.lm_head.weight = torch.nn.Parameter(model_instance.model.lm_head.weight.clone())
peft_config = None
if model_config.get("use_peft", False):
peft_config_dataclass = model_config.get("peft_config")
if peft_config_dataclass is not None:
peft_config = convert_peft_config_to_lora_config(peft_config_dataclass)
# Apply PEFT to the model and include PEFT layers in TP plan
model_instance.model = apply_peft_to_model(
model_instance.model, tp_mesh=tp_mesh, peft_config=peft_config
)

return model_instance

def _create_optimizer(self) -> Tuple[Any, Dict[str, Any]]:
Expand Down Expand Up @@ -245,15 +297,18 @@ def _create_trainer(
# Get PEFT config if enabled
model_config_dict = self.config_manager.get_model_config()
peft_config = None
if model_config_dict.get("use_peft", False):
if model_config_dict.get("use_peft", False) and not (
self.config_manager.config.training.get("tp_degree", 1) > 1
):
peft_config_dataclass = model_config_dict.get("peft_config")
if peft_config_dataclass is not None:
peft_config = convert_peft_config_to_lora_config(peft_config_dataclass)

# Build dependencies for trainer configuration
dependencies = {}
if peft_config is not None:
if peft_config is not None and not (self.config_manager.config.training.get("tp_degree", 1) > 1):
dependencies["peft_config"] = peft_config

trainer_cls, args_cls, additional_kwargs = ComponentFactory.create_trainer_config(trainer_type, **dependencies)

# Clean up training config: remove fields that shouldn't be passed to TrainingArguments
Expand All @@ -264,6 +319,13 @@ def _create_trainer(
# Remove PP-specific fields as they're handled via device_map in model loading
training_config.pop("pp_degree", None)

training_config.pop("tp_degree", None)
training_config.pop("ddp_degree", None)

# Before constructing SFTConfig/TrainingArguments
if training_config.get("report_to") is None:
training_config["report_to"] = "tensorboard"

# Create trainer arguments instance
args = args_cls(**training_config)
dataset_config_dict = self.config_manager.get_dataset_config()
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# -----------------------------------------------------------------------------
#
# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
# SPDX-License-Identifier: BSD-3-Clause
#
# -----------------------------------------------------------------------------
# Model configuration
model:
  model_type: "hf" # Hugging Face model
  auto_class_name: "AutoModelForCausalLM" # Auto class to load the model with
  model_name: "HuggingFaceTB/SmolLM-135M" # Pretrained model name
  use_peft: true # Enable PEFT (Parameter Efficient Fine-Tuning)
  peft_config:
    lora_r: 8 # LoRA rank
    lora_alpha: 16 # LoRA alpha scaling parameter
    lora_dropout: 0 # Dropout probability for LoRA layers
    target_modules: ["k_proj","gate_proj","q_proj","up_proj","v_proj","down_proj"] # Target modules for LoRA
    task_type: "CAUSAL_LM" # Options: CAUSAL_LM, SEQ_2_SEQ_LM, etc.
    peft_type: "LORA" # Options: LORA, IA3, etc.

# Dataset configuration
dataset:
  dataset_type: "sft_dataset" # Dataset component type to instantiate
  dataset_name: "openai/gsm8k" # Dataset name from Hugging Face Hub
  prompt_template: "Solve the following math problem step by step.\n\n### Question:\n{question}\n\n### Answer:\n" # Template to create prompt from dataset fields
  completion_template: "{answer}" # Model will be trained on this part.
  config_name: "main" # Config name for the dataset
  data_seed: 42 # Random seed for dataset shuffling


# Training configuration
training:
  type: "sft" # Trainer type (supervised fine-tuning)
  gradient_accumulation_steps: 1 # Number of steps to accumulate gradients
  per_device_train_batch_size: 1 # Batch size per device during training
  num_train_epochs: 1 # Total number of training epochs
  torch_compile: False # Whether to use torch.compile
  tp_degree: 2 # Tensor parallelism degree; set > 1 to enable TP
  ddp_degree: 2 # Data parallelism degree; set > 1 to enable DDP

# Optimizer configuration
optimizers:
  optimizer_name: "adamw" # Optimizer to use
  lr: 1e-4 # Learning rate

scheduler:
  scheduler_name: "cosine" # Learning-rate scheduler type

callbacks:
  early_stopping:
    early_stopping_patience: 3 # Number of epochs to wait before stopping training
    early_stopping_threshold: 0.001 # Minimum change in metric to qualify as improvement
52 changes: 52 additions & 0 deletions QEfficient/finetune/experimental/configs/sft_tp_gsm8k_config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# -----------------------------------------------------------------------------
#
# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
# SPDX-License-Identifier: BSD-3-Clause
#
# -----------------------------------------------------------------------------
# Model configuration
model:
  model_type: "hf" # Hugging Face model
  auto_class_name: "AutoModelForCausalLM" # Auto class to load the model with
  model_name: "HuggingFaceTB/SmolLM-135M" # Pretrained model name
  use_peft: true # Enable PEFT (Parameter Efficient Fine-Tuning)
  peft_config:
    lora_r: 8 # LoRA rank
    lora_alpha: 16 # LoRA alpha scaling parameter
    lora_dropout: 0 # Dropout probability for LoRA layers
    target_modules: ["k_proj","gate_proj","q_proj","up_proj","v_proj","down_proj"] # Target modules for LoRA
    task_type: "CAUSAL_LM" # Options: CAUSAL_LM, SEQ_2_SEQ_LM, etc.
    peft_type: "LORA" # Options: LORA, IA3, etc.

# Dataset configuration
dataset:
  dataset_type: "sft_dataset" # Dataset component type to instantiate
  dataset_name: "openai/gsm8k" # Dataset name from Hugging Face Hub
  prompt_template: "Solve the following math problem step by step.\n\n### Question:\n{question}\n\n### Answer:\n" # Template to create prompt from dataset fields
  completion_template: "{answer}" # Model will be trained on this part.
  config_name: "main" # Config name for the dataset
  data_seed: 42 # Random seed for dataset shuffling


# Training configuration
training:
  type: "sft" # Trainer type (supervised fine-tuning)
  gradient_accumulation_steps: 1 # Number of steps to accumulate gradients
  per_device_train_batch_size: 1 # Batch size per device during training
  num_train_epochs: 1 # Total number of training epochs
  torch_compile: False # Whether to use torch.compile
  tp_degree: 2 # Tensor parallelism degree; set > 1 to enable TP
  ddp_degree: 1 # Data parallelism degree; 1 = TP only, no DDP replicas

# Optimizer configuration
optimizers:
  optimizer_name: "adamw" # Optimizer to use
  lr: 1e-4 # Learning rate

scheduler:
  scheduler_name: "cosine" # Learning-rate scheduler type

callbacks:
  early_stopping:
    early_stopping_patience: 3 # Number of epochs to wait before stopping training
    early_stopping_threshold: 0.001 # Minimum change in metric to qualify as improvement
Original file line number Diff line number Diff line change
Expand Up @@ -225,7 +225,7 @@ def create_trainer_config(name: str, **dependencies) -> tuple:
for kwarg, default in config["required_kwargs"].items():
if kwarg in dependencies:
additional_kwargs[kwarg] = dependencies[kwarg]
elif default != "REQUIRED":
elif default != "REQUIRED" and not isinstance(default, type):
additional_kwargs[kwarg] = default

# Check for missing required arguments
Expand Down
24 changes: 16 additions & 8 deletions QEfficient/finetune/experimental/core/config_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -242,10 +242,10 @@ class ModelConfig:
default="AutoModelForCausalLM",
metadata={"help": "The AutoClass name to load the model (e.g., 'AutoModelForCausalLM')."},
)
load_in_4bit: bool = field(
default=False,
metadata={"help": "Whether to load the model in 4-bit quantization."},
)
# load_in_4bit: bool = field(
# default=False,
# metadata={"help": "Whether to load the model in 4-bit quantization."},
# )
use_peft: bool = field(
default=True,
metadata={"help": "Whether to use PEFT (Parameter-Efficient Fine-Tuning)."},
Expand Down Expand Up @@ -330,10 +330,10 @@ class TrainingConfig:
default="./training_results",
metadata={"help": "The output directory where the model predictions and checkpoints will be written."},
)
overwrite_output_dir: bool = field(
default=False,
metadata={"help": "Whether to overwrite the output directory."},
)
# overwrite_output_dir: bool = field(
# default=False,
# metadata={"help": "Whether to overwrite the output directory."},
# )
seed: int = field(
default=42,
metadata={"help": "Random seed for reproducibility."},
Expand Down Expand Up @@ -476,6 +476,14 @@ class TrainingConfig:
default=1,
metadata={"help": "Pipeline parallelism degree (number of pipeline stages). Set > 1 to enable PP."},
)
# NOTE(review): help strings below were copy-pasted from pp_degree and wrongly
# described TP/DDP degrees as "number of pipeline stages" — corrected here.
tp_degree: int = field(
    default=1,
    metadata={"help": "Tensor parallelism degree (number of tensor-parallel ranks). Set > 1 to enable TP."},
)
ddp_degree: int = field(
    default=1,
    metadata={"help": "Data parallelism degree (number of data-parallel replicas). Set > 1 to enable DDP."},
)


@dataclass
Expand Down
Loading
Loading