Hi,
I've run into a problem where processing speed degrades as training progresses.
At the beginning of training I get about 5 iterations per second, but by the middle of the epoch it drops to roughly 3 seconds per iteration.
11%|████████▍ | 26373/231300 [10:16:50<147:18:07, 3.01s/it]
Waiting 147 hours is far too long for me.
Some details about my setup (an AWS g6e.xlarge instance):
GPU: NVIDIA L40S (48GB VRAM) // 43GB utilized during training
✓ Train dataset : 5,921,185 rows of pairs [sentence1, sentence2]
✓ Validation dataset: 623,518 rows
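For reference, here's a quick sanity check of where the 231,300 total steps come from (assuming batch size 128 and 5 epochs, as in the run script below):

import math

train_rows = 5_921_185
batch_size = 128
epochs = 5

steps_per_epoch = math.ceil(train_rows / batch_size)  # 46,260
total_steps = steps_per_epoch * epochs                # 231,300 -- matches the progress bar
remaining_steps = total_steps - 26_373                # 204,927
print(f"~{remaining_steps * 3.01 / 3600:.0f} h left at 3.01 s/it")  # roughly 171 h at the current rate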
I also see in nvidia-smi that the GPU is not fully utilized:
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 570.172.08 Driver Version: 570.172.08 CUDA Version: 12.8 |
|-----------------------------------------+------------------------+----------------------+
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+========================+======================|
| 0 NVIDIA L40S On | 00000000:30:00.0 Off | 0 |
| N/A 47C P0 96W / 350W | 43117MiB / 46068MiB | 0% Default |
| | | N/A |
+-----------------------------------------+------------------------+----------------------+
+-----------------------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=========================================================================================|
| 0 N/A N/A 24938 C python3 43108MiB |
+-----------------------------------------------------------------------------------------+
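Since GPU-Util shows 0% in that snapshot, I suspect the time may be going into batch preparation (tokenization/collation) on the CPU rather than into the forward/backward pass, but I'm not sure. A minimal sketch of what I'm considering trying, assuming the standard transformers dataloader options apply here (SentenceTransformerTrainingArguments subclasses transformers TrainingArguments, so dataloader_num_workers / dataloader_pin_memory should be accepted; the values below are just guesses):

from sentence_transformers.training_args import SentenceTransformerTrainingArguments, BatchSamplers

training_args = SentenceTransformerTrainingArguments(
    output_dir="./models/e5-multilingual-encoder",  # same output dir as my run script
    per_device_train_batch_size=128,
    bf16=True,
    batch_sampler=BatchSamplers.NO_DUPLICATES,
    dataloader_num_workers=4,    # prepare batches in worker processes instead of the main process
    dataloader_pin_memory=True,  # pinned host memory for faster host-to-device copies
)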
Here is my training code:
"""
Train a bi-encoder model using intfloat/multilingual-e5-base with MultipleNegativesRankingLoss.
Usage:
python train_bi_encoder.py --output_dir ./models/e5-multilingual-encoder
"""
import argparse
import logging
from pathlib import Path
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, SentenceTransformerTrainer
from sentence_transformers.losses import MultipleNegativesRankingLoss
from sentence_transformers.training_args import SentenceTransformerTrainingArguments, BatchSamplers
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
import torch
# Setup logging
logging.basicConfig(
format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
datefmt='%m/%d/%Y %H:%M:%S',
level=logging.INFO
)
logger = logging.getLogger(__name__)
def load_parquet_dataset(data_dir: str):
    """Load training and validation datasets from parquet files."""
    logger.info(f"Loading datasets from {data_dir}")
    train_dataset = load_dataset(
        "parquet",
        data_files=str(Path(data_dir) / "train-00000-of-00001.parquet"),
        split="train"
    )
    eval_dataset = load_dataset(
        "parquet",
        data_files=str(Path(data_dir) / "val-00000-of-00001.parquet"),
        split="train"
    )
    logger.info(f"Loaded {len(train_dataset)} training samples")
    logger.info(f"Loaded {len(eval_dataset)} validation samples")
    logger.info(f"Dataset columns: {train_dataset.column_names}")
    return train_dataset, eval_dataset


def create_evaluator(eval_dataset, batch_size: int = 128):
    """Create an evaluator for validation during training."""
    # Sample a subset for faster evaluation (10k samples)
    eval_subset = eval_dataset.shuffle(seed=42).select(range(min(10000, len(eval_dataset))))
    sentences1 = eval_subset["sentence1"]
    sentences2 = eval_subset["sentence2"]
    # Create similarity scores (1.0 for positive pairs in our case)
    scores = [1.0] * len(sentences1)
    evaluator = EmbeddingSimilarityEvaluator(
        sentences1=sentences1,
        sentences2=sentences2,
        scores=scores,
        batch_size=batch_size,
        name="validation",
        show_progress_bar=True
    )
    return evaluator
def main():
    parser = argparse.ArgumentParser(description="Train bi-encoder")
    parser.add_argument(
        "--model_name",
        type=str,
        default="intfloat/multilingual-e5-base",
        help="Base model to fine-tune"
    )
    parser.add_argument(
        "--data_dir",
        type=str,
        default="data",
        help="Directory containing train and val parquet files"
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        default="e5-multilingual-category-encoder",
        help="Directory to save the trained model"
    )
    parser.add_argument(
        "--num_epochs",
        type=int,
        default=5,
        help="Number of training epochs"
    )
    parser.add_argument(
        "--batch_size",
        type=int,
        default=128,
        help="Training batch size per device"
    )
    parser.add_argument(
        "--learning_rate",
        type=float,
        default=2e-5,
        help="Learning rate"
    )
    parser.add_argument(
        "--warmup_ratio",
        type=float,
        default=0.1,
        help="Warmup ratio of total training steps"
    )
    parser.add_argument(
        "--eval_steps",
        type=int,
        default=5000,
        help="Evaluate every N steps"
    )
    parser.add_argument(
        "--save_steps",
        type=int,
        default=5000,
        help="Save checkpoint every N steps"
    )
    parser.add_argument(
        "--logging_steps",
        type=int,
        default=500,
        help="Log every N steps"
    )
    parser.add_argument(
        "--max_seq_length",
        type=int,
        default=512,
        help="Maximum sequence length"
    )
    parser.add_argument(
        "--test_run",
        action="store_true",
        help="Run a quick test with limited data"
    )
    args = parser.parse_args()
    # Log GPU information
    if torch.cuda.is_available():
        logger.info(f"CUDA available: {torch.cuda.is_available()}")
        logger.info(f"CUDA device: {torch.cuda.get_device_name(0)}")
        logger.info(f"CUDA memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")
    else:
        logger.warning("CUDA not available, training will be slow!")

    # Load datasets
    train_dataset, eval_dataset = load_parquet_dataset(args.data_dir)

    # For test run, use only a small subset
    if args.test_run:
        logger.info("TEST RUN MODE: Using limited dataset")
        train_dataset = train_dataset.shuffle(seed=42).select(range(1000))
        eval_dataset = eval_dataset.shuffle(seed=42).select(range(500))
        args.num_epochs = 1
        args.eval_steps = 50
        args.save_steps = 50
        args.logging_steps = 10

    # Load model
    logger.info(f"Loading model: {args.model_name}")
    model = SentenceTransformer(
        args.model_name,
        device="cuda" if torch.cuda.is_available() else "cpu"
    )

    # Set max sequence length
    model.max_seq_length = args.max_seq_length
    logger.info(f"Max sequence length: {model.max_seq_length}")

    # Define loss function
    # MultipleNegativesRankingLoss uses in-batch negatives
    # With batch_size=128, each sample has 127 negatives
    loss = MultipleNegativesRankingLoss(model=model)
    logger.info(f"Using MultipleNegativesRankingLoss with batch size {args.batch_size}")
    logger.info(f"Effective negatives per sample: {args.batch_size - 1}")

    # Create evaluator
    evaluator = create_evaluator(eval_dataset, batch_size=args.batch_size)

    # Configure training arguments
    training_args = SentenceTransformerTrainingArguments(
        output_dir=args.output_dir,
        num_train_epochs=args.num_epochs,
        per_device_train_batch_size=args.batch_size,
        per_device_eval_batch_size=args.batch_size,
        learning_rate=args.learning_rate,
        warmup_ratio=args.warmup_ratio,
        fp16=False,  # Don't use fp16
        bf16=True,   # Use bfloat16 for better numerical stability
        batch_sampler=BatchSamplers.NO_DUPLICATES,  # Critical for MultipleNegativesRankingLoss
        eval_strategy="epoch",
        save_strategy="epoch",
        save_total_limit=3,  # Keep only 3 best checkpoints
        logging_steps=args.logging_steps,
        logging_first_step=True,
        report_to="trackio",
        run_name=f"e5-multilingual-{args.num_epochs}epochs",
        seed=42,
        data_seed=42,
        load_best_model_at_end=True,
        metric_for_best_model="validation_spearman_cosine",
        greater_is_better=True,
    )

    # Log training configuration
    logger.info("=" * 80)
    logger.info("TRAINING CONFIGURATION")
    logger.info("=" * 80)
    logger.info(f"Model: {args.model_name}")
    logger.info(f"Training samples: {len(train_dataset):,}")
    logger.info(f"Validation samples: {len(eval_dataset):,}")
    logger.info(f"Epochs: {args.num_epochs}")
    logger.info(f"Batch size: {args.batch_size}")
    logger.info(f"Learning rate: {args.learning_rate}")
    logger.info(f"Warmup ratio: {args.warmup_ratio}")
    logger.info(f"Max sequence length: {args.max_seq_length}")
    logger.info(f"Using bfloat16: {training_args.bf16}")
    logger.info(f"Output directory: {args.output_dir}")
    logger.info("=" * 80)

    # Calculate steps
    steps_per_epoch = len(train_dataset) // args.batch_size
    total_steps = steps_per_epoch * args.num_epochs
    logger.info(f"Steps per epoch: {steps_per_epoch:,}")
    logger.info(f"Total training steps: {total_steps:,}")
    logger.info("=" * 80)

    # Create trainer
    trainer = SentenceTransformerTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        loss=loss,
        evaluator=evaluator,
    )

    # Train
    logger.info("Starting training...")
    trainer.train()

    # Save final model
    final_model_path = Path(args.output_dir) / "final"
    logger.info(f"Saving final model to {final_model_path}")
    model.save_pretrained(str(final_model_path))

    logger.info("Training completed successfully!")
    logger.info(f"Model saved to: {final_model_path}")

    # Final evaluation
    logger.info("Running final evaluation on full validation set...")
    final_evaluator = create_evaluator(eval_dataset, batch_size=args.batch_size)
    final_score = final_evaluator(model)
    logger.info(f"Final validation score: {final_score}")


if __name__ == "__main__":
    main()

And here is how I run it:
#!/bin/bash
# Full training script for bi-encoder model
# Trains on ~6M samples for 5 epochs with batch size 128
set -e
echo "Starting FULL training run..."
echo "Model: intfloat/multilingual-e5-base"
echo "Training samples: ~5.9M"
echo "Validation samples: ~623K"
echo "Epochs: 5"
echo "Batch size: 128 (with bfloat16)"
echo ""
python3 train_bi_encoder.py \
  --data_dir data \
  --model_name intfloat/multilingual-e5-base \
  --output_dir ./models/e5-multilingual-encoder \
  --num_epochs 5 \
  --batch_size 128 \
  --learning_rate 5e-5 \
  --warmup_ratio 0.1 \
  --max_seq_length 512 \
  --logging_steps 500
echo ""
echo "Training completed!"
echo "Model saved to: ./models/e5-multilingual-encoder/final"