Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 67 additions & 5 deletions QEfficient/cloud/finetune_experimental.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,10 @@
from pathlib import Path
from typing import Any, Dict, List, Tuple

import torch
import torch.distributed as dist
from accelerate.utils import ParallelismConfig

from QEfficient.finetune.experimental.core.callbacks import replace_progress_callback
from QEfficient.finetune.experimental.core.component_registry import ComponentFactory
from QEfficient.finetune.experimental.core.config_manager import (
Expand All @@ -26,6 +30,7 @@
from QEfficient.finetune.experimental.core.trainer import sft_trainer # noqa: F401
from QEfficient.finetune.experimental.core.utils.device_map_utils import get_device_map
from QEfficient.finetune.experimental.core.utils.peft_utils import convert_peft_config_to_lora_config
from QEfficient.finetune.experimental.core.utils.tp_peft_utils import apply_peft_to_model
from QEfficient.finetune.experimental.core.utils.training_config_utils import prepare_training_config

logger = Logger(__name__)
Expand Down Expand Up @@ -59,7 +64,9 @@ def __init__(self, config_manager: ConfigManager):

# Prepare training configuration
self.training_config = prepare_training_config(config_manager=self.config_manager)

self.tp_enabled = self.training_config["tp_degree"] > 1
if self.tp_enabled:
self._initialize_dist_tp()
# Create datasets
logger.log_rank_zero("Creating datasets...")
self.train_dataset, self.eval_dataset = self._create_datasets()
Expand Down Expand Up @@ -111,6 +118,21 @@ def _setup_environment(self) -> None:
os.environ["TRACKIO_DIR"] = str(self.output_dir / "trackio_logs")
os.environ["TENSORBOARD_LOGGING_DIR"] = str(self.output_dir)

def _initialize_dist_tp(self) -> None:
    """Initialize the ``torch.distributed`` process group for tensor parallelism.

    Reads the process-group topology from the standard torchrun environment
    variables (``WORLD_SIZE``, ``RANK``). Selects the NCCL backend when
    training on CUDA; otherwise registers a composite backend so that gloo
    handles CPU tensors and QCCL handles QAIC tensors.

    Raises:
        RuntimeError: propagated from ``dist.init_process_group`` if the
            rendezvous environment (e.g. MASTER_ADDR/MASTER_PORT) is not set.
    """
    world_size = int(os.getenv("WORLD_SIZE", "1"))
    # Use the *global* RANK, not LOCAL_RANK: LOCAL_RANK is per-node and would
    # collide across nodes in multi-node runs. Fall back to LOCAL_RANK for
    # launchers that only export it (single-node case, where they coincide).
    rank = int(os.getenv("RANK", os.getenv("LOCAL_RANK", "0")))

    if self.training_config["device"] == "cuda":
        backend = "nccl"
    else:
        # Composite backend: gloo serves CPU tensors, QCCL serves QAIC devices.
        backend = "cpu:gloo,qaic:qccl"

    # Guard against double initialization (e.g. re-entrant setup paths).
    if not dist.is_initialized():
        dist.init_process_group(
            backend=backend,
            world_size=world_size,  # total number of processes
            rank=rank,  # unique global ID for this process
        )

def _create_datasets(self) -> Tuple[Any, Any]:
"""
Create training and evaluation datasets.
Expand Down Expand Up @@ -161,8 +183,11 @@ def _create_model(self) -> Any:
model_name = model_config.pop("model_name")

# Get training config for PP settings
training_config = self.config.training
pp_degree = training_config.get("pp_degree", 1)
# training_config = self.config.training
training_config = self.training_config

pp_degree = self.training_config.get("pp_degree", 1)

device = training_config.get("device", "qaic")

# Generate device_map for pipeline parallelism if pp_degree > 1
Expand All @@ -176,11 +201,38 @@ def _create_model(self) -> Any:
model_config["device_map"] = device_map
logger.log_rank_zero(f"Pipeline Parallelism enabled: Using device_map for {pp_degree} stages")

tp_degree = training_config.pop("tp_degree", 1)

if tp_degree > 1:
pc = training_config.get("parallelism_config")
if not isinstance(pc, ParallelismConfig):
raise TypeError(f"Expected ParallelismConfig, got {type(pc).__name__}")
device_mesh = pc.build_device_mesh(device)
tp_mesh = device_mesh["tp"]
model_config["tp_plan"] = "auto"
model_config["tp_size"] = tp_degree
model_config["device_mesh"] = tp_mesh

# Filter out PEFT-related fields, these shouldn't be passed to model creation
excluded_keys = {"use_peft", "peft_config"}
model_config_kwargs = {k: v for k, v in model_config.items() if k not in excluded_keys}

model_instance = ComponentFactory.create_model(model_type, model_name, **model_config_kwargs)

if tp_degree > 1:
# Need to explicitly untie the embedding weights here to consider
# this as separate params in further TP processing
model_instance.model.lm_head.weight = torch.nn.Parameter(model_instance.model.lm_head.weight.clone())
peft_config = None
if model_config.get("use_peft", False):
peft_config_dataclass = model_config.get("peft_config")
if peft_config_dataclass is not None:
peft_config = convert_peft_config_to_lora_config(peft_config_dataclass)
# Apply PEFT to the model and include PEFT layers in TP plan
model_instance.model = apply_peft_to_model(
model_instance.model, tp_mesh=tp_mesh, peft_config=peft_config
)

return model_instance

def _create_optimizer(self) -> Tuple[Any, Dict[str, Any]]:
Expand Down Expand Up @@ -245,15 +297,18 @@ def _create_trainer(
# Get PEFT config if enabled
model_config_dict = self.config_manager.get_model_config()
peft_config = None
if model_config_dict.get("use_peft", False):
if model_config_dict.get("use_peft", False) and not (
self.config_manager.config.training.get("tp_degree", 1) > 1
):
peft_config_dataclass = model_config_dict.get("peft_config")
if peft_config_dataclass is not None:
peft_config = convert_peft_config_to_lora_config(peft_config_dataclass)

# Build dependencies for trainer configuration
dependencies = {}
if peft_config is not None:
if peft_config is not None and not (self.config_manager.config.training.get("tp_degree", 1) > 1):
dependencies["peft_config"] = peft_config

trainer_cls, args_cls, additional_kwargs = ComponentFactory.create_trainer_config(trainer_type, **dependencies)

# Clean up training config: remove fields that shouldn't be passed to TrainingArguments
Expand All @@ -264,6 +319,13 @@ def _create_trainer(
# Remove PP-specific fields as they're handled via device_map in model loading
training_config.pop("pp_degree", None)

training_config.pop("tp_degree", None)
training_config.pop("ddp_degree", None)

# Before constructing SFTConfig/TrainingArguments
if training_config.get("report_to") is None:
training_config["report_to"] = "tensorboard"

# Create trainer arguments instance
args = args_cls(**training_config)
dataset_config_dict = self.config_manager.get_dataset_config()
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# -----------------------------------------------------------------------------
#
# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
# SPDX-License-Identifier: BSD-3-Clause
#
# -----------------------------------------------------------------------------
# Model configuration
model:
  model_type: "hf" # Hugging Face model
  auto_class_name: "AutoModelForCausalLM" # Auto class to load the model with
  model_name: "HuggingFaceTB/SmolLM-135M" # Pretrained model name
  use_peft: true # Enable PEFT (Parameter Efficient Fine-Tuning)
  peft_config:
    lora_r: 8 # LoRA rank
    lora_alpha: 16 # LoRA alpha scaling parameter
    lora_dropout: 0 # Dropout probability for LoRA layers
    target_modules: ["k_proj","gate_proj","q_proj","up_proj","v_proj","down_proj"] # Target modules for LoRA
    task_type: "CAUSAL_LM" # Options: CAUSAL_LM, SEQ_2_SEQ_LM, etc.
    peft_type: "LORA" # Options: LORA, IA3, etc.

# Dataset configuration
dataset:
  dataset_type: "sft_dataset" # Dataset component type to instantiate
  dataset_name: "openai/gsm8k" # Dataset name from Hugging Face Hub
  prompt_template: "Solve the following math problem step by step.\n\n### Question:\n{question}\n\n### Answer:\n" # Template to create prompt from dataset fields
  completion_template: "{answer}" # Model will be trained on this part.
  config_name: "main" # Config name for the dataset
  data_seed: 42 # Random seed for dataset shuffling


# Training configuration
training:
  type: "sft" # Trainer type (supervised fine-tuning)
  gradient_accumulation_steps: 1 # Number of steps to accumulate gradients
  per_device_train_batch_size: 1 # Batch size per device during training
  num_train_epochs: 1 # Total number of training epochs
  torch_compile: False # Whether to use torch.compile
  tp_degree: 2 # Tensor parallelism degree; set > 1 to enable TP
  ddp_degree: 2 # Data parallelism degree; set > 1 to enable DDP

# Optimizer configuration
optimizers:
  optimizer_name: "adamw" # Optimizer to use
  lr: 1e-4 # Learning rate

scheduler:
  scheduler_name: "cosine" # Learning-rate scheduler type

callbacks:
  early_stopping:
    early_stopping_patience: 3 # Number of epochs to wait before stopping training
    early_stopping_threshold: 0.001 # Minimum change in metric to qualify as improvement
52 changes: 52 additions & 0 deletions QEfficient/finetune/experimental/configs/sft_tp_gsm8k_config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# -----------------------------------------------------------------------------
#
# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
# SPDX-License-Identifier: BSD-3-Clause
#
# -----------------------------------------------------------------------------
# Model configuration
model:
  model_type: "hf" # Hugging Face model
  auto_class_name: "AutoModelForCausalLM" # Auto class to load the model with
  model_name: "HuggingFaceTB/SmolLM-135M" # Pretrained model name
  use_peft: true # Enable PEFT (Parameter Efficient Fine-Tuning)
  peft_config:
    lora_r: 8 # LoRA rank
    lora_alpha: 16 # LoRA alpha scaling parameter
    lora_dropout: 0 # Dropout probability for LoRA layers
    target_modules: ["k_proj","gate_proj","q_proj","up_proj","v_proj","down_proj"] # Target modules for LoRA
    task_type: "CAUSAL_LM" # Options: CAUSAL_LM, SEQ_2_SEQ_LM, etc.
    peft_type: "LORA" # Options: LORA, IA3, etc.

# Dataset configuration
dataset:
  dataset_type: "sft_dataset" # Dataset component type to instantiate
  dataset_name: "openai/gsm8k" # Dataset name from Hugging Face Hub
  prompt_template: "Solve the following math problem step by step.\n\n### Question:\n{question}\n\n### Answer:\n" # Template to create prompt from dataset fields
  completion_template: "{answer}" # Model will be trained on this part.
  config_name: "main" # Config name for the dataset
  data_seed: 42 # Random seed for dataset shuffling


# Training configuration
training:
  type: "sft" # Trainer type (supervised fine-tuning)
  gradient_accumulation_steps: 1 # Number of steps to accumulate gradients
  per_device_train_batch_size: 1 # Batch size per device during training
  num_train_epochs: 1 # Total number of training epochs
  torch_compile: False # Whether to use torch.compile
  tp_degree: 2 # Tensor parallelism degree; set > 1 to enable TP
  ddp_degree: 1 # Data parallelism degree; 1 = TP only, no DDP replicas

# Optimizer configuration
optimizers:
  optimizer_name: "adamw" # Optimizer to use
  lr: 1e-4 # Learning rate

scheduler:
  scheduler_name: "cosine" # Learning-rate scheduler type

callbacks:
  early_stopping:
    early_stopping_patience: 3 # Number of epochs to wait before stopping training
    early_stopping_threshold: 0.001 # Minimum change in metric to qualify as improvement
Original file line number Diff line number Diff line change
Expand Up @@ -225,7 +225,7 @@ def create_trainer_config(name: str, **dependencies) -> tuple:
for kwarg, default in config["required_kwargs"].items():
if kwarg in dependencies:
additional_kwargs[kwarg] = dependencies[kwarg]
elif default != "REQUIRED":
elif default != "REQUIRED" and not isinstance(default, type):
additional_kwargs[kwarg] = default

# Check for missing required arguments
Expand Down
24 changes: 16 additions & 8 deletions QEfficient/finetune/experimental/core/config_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -242,10 +242,10 @@ class ModelConfig:
default="AutoModelForCausalLM",
metadata={"help": "The AutoClass name to load the model (e.g., 'AutoModelForCausalLM')."},
)
load_in_4bit: bool = field(
default=False,
metadata={"help": "Whether to load the model in 4-bit quantization."},
)
# load_in_4bit: bool = field(
# default=False,
# metadata={"help": "Whether to load the model in 4-bit quantization."},
# )
use_peft: bool = field(
default=True,
metadata={"help": "Whether to use PEFT (Parameter-Efficient Fine-Tuning)."},
Expand Down Expand Up @@ -330,10 +330,10 @@ class TrainingConfig:
default="./training_results",
metadata={"help": "The output directory where the model predictions and checkpoints will be written."},
)
overwrite_output_dir: bool = field(
default=False,
metadata={"help": "Whether to overwrite the output directory."},
)
# overwrite_output_dir: bool = field(
# default=False,
# metadata={"help": "Whether to overwrite the output directory."},
# )
seed: int = field(
default=42,
metadata={"help": "Random seed for reproducibility."},
Expand Down Expand Up @@ -476,6 +476,14 @@ class TrainingConfig:
default=1,
metadata={"help": "Pipeline parallelism degree (number of pipeline stages). Set > 1 to enable PP."},
)
# NOTE(review): help strings below were copy-pasted from pp_degree and wrongly
# described TP/DDP degrees as "number of pipeline stages" — corrected here.
tp_degree: int = field(
    default=1,
    metadata={"help": "Tensor parallelism degree (number of tensor-parallel ranks). Set > 1 to enable TP."},
)
ddp_degree: int = field(
    default=1,
    metadata={"help": "Data parallelism degree (number of data-parallel replicas). Set > 1 to enable DDP."},
)


@dataclass
Expand Down
Loading
Loading