Skip to content

Speed degradation over training #3568

@ZeusFSX

Description

@ZeusFSX

Hi,
I am facing a problem where data-processing speed degrades as training progresses.
At the beginning of training I get about 5 iterations per second; by the middle of the epoch it has dropped to 3 seconds per iteration.

 11%|████████▍                                                                 | 26373/231300 [10:16:50<147:18:07,  3.01s/it]

Waiting 147 hours is far too long for me.
Some details about my setup — it is an AWS g6e.xlarge instance:

GPU: NVIDIA L40S (48GB VRAM)   // 43GB utilize during training
✓ Train dataset : 5,921,185 rows of pairs [sentence1, sentence2]
✓ Validation dataset: 623,518 rows

Also, nvidia-smi shows that the GPU is not fully utilized:

+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 570.172.08             Driver Version: 570.172.08     CUDA Version: 12.8     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|=========================================+========================+======================|
|   0  NVIDIA L40S                    On  |   00000000:30:00.0 Off |                    0 |
| N/A   47C    P0             96W /  350W |   43117MiB /  46068MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                                                         
+-----------------------------------------------------------------------------------------+
| Processes:                                                                              |
|  GPU   GI   CI              PID   Type   Process name                        GPU Memory |
|        ID   ID                                                               Usage      |
|=========================================================================================|
|    0   N/A  N/A           24938      C   python3                               43108MiB |
+-----------------------------------------------------------------------------------------+

Here is my training code:

"""
Train a bi-encoder model using intfloat/multilingual-e5-base with MultipleNegativesRankingLoss.

Usage:
    python train_bi_encoder.py --output_dir ./models/e5-multilingual-encoder
"""

import argparse
import logging
from pathlib import Path
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, SentenceTransformerTrainer
from sentence_transformers.losses import MultipleNegativesRankingLoss
from sentence_transformers.training_args import SentenceTransformerTrainingArguments, BatchSamplers
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
import torch

# Setup logging
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
    datefmt='%m/%d/%Y %H:%M:%S',
    level=logging.INFO
)
logger = logging.getLogger(__name__)


def load_parquet_dataset(data_dir: str):
    """Load training and validation datasets from parquet files."""
    logger.info(f"Loading datasets from {data_dir}")

    train_dataset = load_dataset(
        "parquet",
        data_files=str(Path(data_dir) / "train-00000-of-00001.parquet"),
        split="train"
    )

    eval_dataset = load_dataset(
        "parquet",
        data_files=str(Path(data_dir) / "val-00000-of-00001.parquet"),
        split="train"
    )

    logger.info(f"Loaded {len(train_dataset)} training samples")
    logger.info(f"Loaded {len(eval_dataset)} validation samples")
    logger.info(f"Dataset columns: {train_dataset.column_names}")

    return train_dataset, eval_dataset


def create_evaluator(eval_dataset, batch_size: int = 128):
    """Create an evaluator for validation during training."""
    # Sample a subset for faster evaluation (10k samples)
    eval_subset = eval_dataset.shuffle(seed=42).select(range(min(10000, len(eval_dataset))))

    sentences1 = eval_subset["sentence1"]
    sentences2 = eval_subset["sentence2"]

    # Create similarity scores (1.0 for positive pairs in our case)
    scores = [1.0] * len(sentences1)

    evaluator = EmbeddingSimilarityEvaluator(
        sentences1=sentences1,
        sentences2=sentences2,
        scores=scores,
        batch_size=batch_size,
        name="validation",
        show_progress_bar=True
    )

    return evaluator


def main():
    parser = argparse.ArgumentParser(description="Train bi-encoder")
    parser.add_argument(
        "--model_name",
        type=str,
        default="intfloat/multilingual-e5-base",
        help="Base model to fine-tune"
    )
    parser.add_argument(
        "--data_dir",
        type=str,
        default="data",
        help="Directory containing train and val parquet files"
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        default="e5-multilingual-category-encoder",
        help="Directory to save the trained model"
    )
    parser.add_argument(
        "--num_epochs",
        type=int,
        default=5,
        help="Number of training epochs"
    )
    parser.add_argument(
        "--batch_size",
        type=int,
        default=128,
        help="Training batch size per device"
    )
    parser.add_argument(
        "--learning_rate",
        type=float,
        default=2e-5,
        help="Learning rate"
    )
    parser.add_argument(
        "--warmup_ratio",
        type=float,
        default=0.1,
        help="Warmup ratio of total training steps"
    )
    parser.add_argument(
        "--eval_steps",
        type=int,
        default=5000,
        help="Evaluate every N steps"
    )
    parser.add_argument(
        "--save_steps",
        type=int,
        default=5000,
        help="Save checkpoint every N steps"
    )
    parser.add_argument(
        "--logging_steps",
        type=int,
        default=500,
        help="Log every N steps"
    )
    parser.add_argument(
        "--max_seq_length",
        type=int,
        default=512,
        help="Maximum sequence length"
    )
    parser.add_argument(
        "--test_run",
        action="store_true",
        help="Run a quick test with limited data"
    )

    args = parser.parse_args()

    # Log GPU information
    if torch.cuda.is_available():
        logger.info(f"CUDA available: {torch.cuda.is_available()}")
        logger.info(f"CUDA device: {torch.cuda.get_device_name(0)}")
        logger.info(f"CUDA memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")
    else:
        logger.warning("CUDA not available, training will be slow!")

    # Load datasets
    train_dataset, eval_dataset = load_parquet_dataset(args.data_dir)

    # For test run, use only a small subset
    if args.test_run:
        logger.info("TEST RUN MODE: Using limited dataset")
        train_dataset = train_dataset.shuffle(seed=42).select(range(1000))
        eval_dataset = eval_dataset.shuffle(seed=42).select(range(500))
        args.num_epochs = 1
        args.eval_steps = 50
        args.save_steps = 50
        args.logging_steps = 10

    # Load model
    logger.info(f"Loading model: {args.model_name}")
    model = SentenceTransformer(
        args.model_name,
        device="cuda" if torch.cuda.is_available() else "cpu"
    )

    # Set max sequence length
    model.max_seq_length = args.max_seq_length
    logger.info(f"Max sequence length: {model.max_seq_length}")

    # Define loss function
    # MultipleNegativesRankingLoss uses in-batch negatives
    # With batch_size=128, each sample has 127 negatives
    loss = MultipleNegativesRankingLoss(model=model)
    logger.info(f"Using MultipleNegativesRankingLoss with batch size {args.batch_size}")
    logger.info(f"Effective negatives per sample: {args.batch_size - 1}")

    # Create evaluator
    evaluator = create_evaluator(eval_dataset, batch_size=args.batch_size)

    # Configure training arguments
    training_args = SentenceTransformerTrainingArguments(
        output_dir=args.output_dir,
        num_train_epochs=args.num_epochs,
        per_device_train_batch_size=args.batch_size,
        per_device_eval_batch_size=args.batch_size,
        learning_rate=args.learning_rate,
        warmup_ratio=args.warmup_ratio,
        fp16=False,  # Don't use fp16
        bf16=True,   # Use bfloat16 for better numerical stability
        batch_sampler=BatchSamplers.NO_DUPLICATES,  # Critical for MultipleNegativesRankingLoss
        eval_strategy="epoch",
        save_strategy="epoch",
        save_total_limit=3,  # Keep only 3 best checkpoints
        logging_steps=args.logging_steps,
        logging_first_step=True,
        report_to="trackio",
        run_name=f"e5-multilingual-{args.num_epochs}epochs",
        seed=42,
        data_seed=42,
        load_best_model_at_end=True,
        metric_for_best_model="validation_spearman_cosine",
        greater_is_better=True,
    )

    # Log training configuration
    logger.info("=" * 80)
    logger.info("TRAINING CONFIGURATION")
    logger.info("=" * 80)
    logger.info(f"Model: {args.model_name}")
    logger.info(f"Training samples: {len(train_dataset):,}")
    logger.info(f"Validation samples: {len(eval_dataset):,}")
    logger.info(f"Epochs: {args.num_epochs}")
    logger.info(f"Batch size: {args.batch_size}")
    logger.info(f"Learning rate: {args.learning_rate}")
    logger.info(f"Warmup ratio: {args.warmup_ratio}")
    logger.info(f"Max sequence length: {args.max_seq_length}")
    logger.info(f"Using bfloat16: {training_args.bf16}")
    logger.info(f"Output directory: {args.output_dir}")
    logger.info("=" * 80)

    # Calculate steps
    steps_per_epoch = len(train_dataset) // args.batch_size
    total_steps = steps_per_epoch * args.num_epochs
    logger.info(f"Steps per epoch: {steps_per_epoch:,}")
    logger.info(f"Total training steps: {total_steps:,}")
    logger.info("=" * 80)

    # Create trainer
    trainer = SentenceTransformerTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        loss=loss,
        evaluator=evaluator,
    )

    # Train
    logger.info("Starting training...")
    trainer.train()

    # Save final model
    final_model_path = Path(args.output_dir) / "final"
    logger.info(f"Saving final model to {final_model_path}")
    model.save_pretrained(str(final_model_path))

    logger.info("Training completed successfully!")
    logger.info(f"Model saved to: {final_model_path}")

    # Final evaluation
    logger.info("Running final evaluation on full validation set...")
    final_evaluator = create_evaluator(eval_dataset, batch_size=args.batch_size)
    final_score = final_evaluator(model)
    logger.info(f"Final validation score: {final_score}")


if __name__ == "__main__":
    main()

and how I run it:

#!/bin/bash
# Full training script for bi-encoder model
# Trains on ~6M samples for 5 epochs with batch size 128

set -e

echo "Starting FULL training run..."
echo "Model: intfloat/multilingual-e5-base"
echo "Training samples: ~5.9M"
echo "Validation samples: ~623K"
echo "Epochs: 5"
echo "Batch size: 128 (with bfloat16)"
echo ""

python3 train_bi_encoder.py \
    --data_dir=data
    --model_name intfloat/multilingual-e5-base \
    --output_dir ./models/e5-multilingual-encoder \
    --num_epochs 5 \
    --batch_size 128 \
    --learning_rate 5e-5 \
    --warmup_ratio 0.1 \
    --max_seq_length 512 \
    --logging_steps 500

echo ""
echo "Training completed!"
echo "Model saved to: ./models/e5-multilingual-encoder/final"

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions