diff --git a/docs/throughput_metrics.md b/docs/throughput_metrics.md new file mode 100644 index 000000000..f48e8450e --- /dev/null +++ b/docs/throughput_metrics.md @@ -0,0 +1,123 @@ +# Throughput Metrics Documentation + +This document describes the inference throughput metrics automatically logged by LMMS-Eval chat models during evaluation. + +## Overview + +LMMS-Eval chat models automatically log detailed timing metrics during inference to help users understand model performance characteristics. These metrics are logged at the INFO level and provide insights into end-to-end latency, token generation speed, and other performance indicators. + +## Metrics Explained + +### Core Timing Metrics + +- **E2E (End-to-End Latency)**: Total time from request submission to response completion (in seconds) +- **TTFT (Time to First Token)**: Time from request submission until the first token is generated (in seconds) +- **TPOT (Time Per Output Token)**: Average time to generate each output token after the first (in seconds) +- **Speed (Inference Speed)**: Token generation rate calculated as 1/TPOT (tokens per second) +- **Output Tokens**: Number of tokens generated in the response + +### Batch Metrics + +For models that process multiple requests in batches: + +- **Batch Summary**: Aggregated metrics across all outputs in a batch +- **Total Time**: Total batch processing time +- **Total Tokens**: Sum of all output tokens in the batch +- **Avg Speed**: Average throughput across the entire batch (tokens/s) + +## Log Format Examples + +### Individual Output Metrics +``` +Output 0 - E2E: 2.145s, TTFT: 0.215s, TPOT: 0.048s, Speed: 20.8 tokens/s, Output tokens: 42 +``` + +### Batch Summary Metrics +``` +Batch summary - Total time: 2.145s, Total tokens: 128, Avg speed: 59.7 tokens/s +``` + +### Single Request Metrics (Non-batched) +``` +Inference metrics - E2E: 1.823s, TTFT: 0.182s, TPOT: 0.052s, Speed: 19.2 tokens/s, Output tokens: 32 +``` + +## Supported Models + +The following chat models automatically log throughput metrics: + +- **sglang_runtime** (`/lmms_eval/models/chat/sglang.py`) +- **vllm_chat** (`/lmms_eval/models/chat/vllm.py`) +- **llava_hf_chat** (`/lmms_eval/models/chat/llava_hf.py`) +- **openai_compatible_chat** (`/lmms_eval/models/chat/openai_compatible.py`) +- **qwen2_5_vl_chat** (`/lmms_eval/models/chat/qwen2_5_vl.py`) +- **huggingface_chat** (`/lmms_eval/models/chat/huggingface.py`) + +## Usage + +Throughput metrics are automatically logged during evaluation - no additional configuration is required. To view the metrics: + +1. **Command Line Output**: Metrics appear in real-time during evaluation +2. **Log Files**: Metrics are written to log files if logging is configured +3. **Log Level**: Ensure logging level is set to INFO or lower to see metrics + +### Example Evaluation Command +```bash +python -m lmms_eval \ + --model sglang_runtime \ + --model_args model=Qwen/Qwen2.5-VL-3B-Instruct \ + --tasks mme \ + --batch_size 4 \ + --log_samples \ + --output_path ./results +``` + +## Metric Calculation Details + +### TTFT Calculation +- **Available from model**: Uses actual first token timestamp when provided +- **Estimated**: When unavailable, estimated as 10% of total inference time + +### TPOT Calculation +- **Formula**: `(E2E_latency - TTFT) / (output_tokens - 1)` +- **Single token responses**: Uses full E2E latency as TPOT +- **Batch processing**: Divides total batch time by number of outputs + +### Speed Calculation +- **Formula**: `1 / TPOT` (when TPOT > 0) +- **Edge cases**: Set to 0 for single-token responses or zero TPOT + +## Performance Analysis + +### Interpreting Metrics + +- **High TTFT**: May indicate model loading, prompt processing, or scheduling delays +- **High TPOT**: Suggests slower token generation, possibly due to model size or hardware limitations +- **Low Speed**: Indicates throughput bottlenecks in token generation +- **E2E vs TTFT+TPOT**: Large differences may suggest batching overhead or system delays + +### Optimization Insights + +- **Reduce TTFT**: Optimize prompt processing, use model caching, improve scheduling +- **Reduce TPOT**: Use faster hardware, optimize model inference, adjust batch sizes +- **Batch Efficiency**: Compare individual vs batch metrics to assess batching benefits + +## Troubleshooting + +### Missing Metrics +- Ensure model supports throughput logging (see supported models list) +- Check log level is set to INFO or lower +- Verify model implementation includes timing instrumentation + +### Inaccurate Metrics +- TTFT estimates may be imprecise when actual timing unavailable +- Batch metrics average across multiple outputs, individual variance not captured +- Network latency may affect metrics for API-based models + +## Implementation Notes + +Throughput metrics are implemented consistently across chat models using: +- `time.time()` for wall-clock timing measurements +- Model-specific metadata when available (e.g., SGLang, VLLM native metrics) +- Fallback estimation methods for missing timing data +- Structured logging format for consistent parsing and analysis \ No newline at end of file diff --git a/examples/models/vllm_qwen2vl.sh b/examples/models/vllm_qwen2vl.sh index 33568a28b..e192f0a36 100644 --- a/examples/models/vllm_qwen2vl.sh +++ b/examples/models/vllm_qwen2vl.sh @@ -10,7 +10,7 @@ export NCCL_DEBUG=DEBUG python3 -m lmms_eval \ --model vllm \ - --model_args model_version=Qwen/Qwen2-VL-7B-Instruct,tensor_parallel_size=4 \ + --model_args model=Qwen/Qwen2-VL-7B-Instruct,tensor_parallel_size=4 \ --tasks mme,gsm8k_cot_self_consistency,mmmu_val \ --batch_size 64 \ --log_samples \ diff --git a/lmms_eval/models/chat/huggingface.py b/lmms_eval/models/chat/huggingface.py index 863bac6e7..a055a8309 100644 --- a/lmms_eval/models/chat/huggingface.py +++ b/lmms_eval/models/chat/huggingface.py @@ -1,5 +1,6 @@ import base64 import re +import time from io import BytesIO from typing import List, Optional, Tuple, Union @@ -16,6 +17,7 @@ from lmms_eval.api.instance import Instance from lmms_eval.api.model import lmms from lmms_eval.api.registry import register_model +from lmms_eval.models.model_utils.gen_metrics import log_metrics from lmms_eval.models.model_utils.reasoning_model_utils import ( parse_reasoning_model_answer, ) @@ -188,6 +190,8 @@ def _collate(x): chunks = re_ords.get_batched(n=self.batch_size, batch_fn=None) num_iters = len(requests) // self.batch_size if len(requests) % self.batch_size == 0 else len(requests) // self.batch_size + 1 pbar = tqdm(total=num_iters, disable=(self.rank != 0), desc="Model Responding") + e2e_latency = 0 + total_tokens = 0 for chunk in chunks: ctx, doc_to_messages, all_gen_kwargs, doc_id, task, split = zip(*chunk) chat_messages = [doc_to_messages[0](self.task_dict[task][split][ids]) for ids, task, split in zip(doc_id, task, split)] @@ -242,6 +246,7 @@ def _collate(x): current_gen_kwargs["temperature"] = None current_gen_kwargs["top_p"] = None + start_time = time.time() cont = self.model.generate( **inputs, eos_token_id=self.tokenizer.eos_token_id, @@ -253,10 +258,15 @@ def _collate(x): max_new_tokens=current_gen_kwargs["max_new_tokens"], use_cache=self.use_cache, ) + end_time = time.time() generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, cont)] answers = self.processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False) + # Calculate timing metrics for batch + e2e_latency += end_time - start_time + total_tokens += sum(len(ids) for ids in generated_ids_trimmed) + for ans, context in zip(answers, texts): clean_ans = parse_reasoning_model_answer(ans) res.append(clean_ans) @@ -268,6 +278,15 @@ def _collate(x): eval_logger.debug(f"Model Clean Response: {clean_ans}") # reorder this group of results back to original unsorted form res = re_ords.get_original(res) + # Calculate average speed + avg_speed = total_tokens / e2e_latency if e2e_latency > 0 else 0 + # Log metrics + metric_dict = { + "total_tokens": total_tokens, + "e2e_latency": e2e_latency, + "avg_speed": avg_speed, + } + log_metrics(**metric_dict) pbar.close() return res diff --git a/lmms_eval/models/chat/llava_hf.py b/lmms_eval/models/chat/llava_hf.py index 97bad3918..480caf81e 100644 --- a/lmms_eval/models/chat/llava_hf.py +++ b/lmms_eval/models/chat/llava_hf.py @@ -1,24 +1,14 @@ +import time import warnings from typing import List, Optional, Tuple, Union -import numpy as np -import PIL -import torch -from accelerate import Accelerator, DistributedType -from accelerate.state import AcceleratorState -from decord import VideoReader, cpu from tqdm import tqdm -from transformers import ( - AutoConfig, - AutoProcessor, - LlavaForConditionalGeneration, - LlavaNextForConditionalGeneration, -) from lmms_eval import utils from lmms_eval.api.instance import Instance from lmms_eval.api.model import lmms from lmms_eval.api.registry import register_model +from lmms_eval.models.model_utils.gen_metrics import log_metrics from lmms_eval.protocol import ChatMessages warnings.filterwarnings("ignore") @@ -53,6 +43,8 @@ def _collate(x): chunks = re_ords.get_batched(n=self.batch_size, batch_fn=None) num_iters = len(requests) // self.batch_size if len(requests) % self.batch_size == 0 else len(requests) // self.batch_size + 1 pbar = tqdm(total=num_iters, disable=(self.rank != 0), desc="Model Responding") + e2e_latency = 0 + total_tokens = 0 for chunk in chunks: ctx, doc_to_messages, all_gen_kwargs, doc_id, task, split = zip(*chunk) task = task[0] @@ -94,6 +86,7 @@ def _collate(x): gen_kwargs["num_beams"] = 1 do_sample = True if gen_kwargs["temperature"] > 0 else False try: + start_time = time.time() cont = self.model.generate( **inputs, do_sample=do_sample, @@ -105,11 +98,21 @@ def _collate(x): pad_token_id=self.eot_token_id, eos_token_id=self.eot_token_id, ) + end_time = time.time() cont = cont[:, inputs["input_ids"].shape[-1] :] + + # Calculate timing metrics + e2e_latency += end_time - start_time + total_tokens += cont.shape[-1] if len(cont.shape) > 1 else len(cont) + except Exception as e: eval_logger.error(f"Error {e} in generating") cont = "" - text_outputs = self.tokenizer.batch_decode(cont, skip_special_tokens=True)[0] + e2e_latency += 0 + total_tokens += 0 + + text_outputs = self.tokenizer.batch_decode(cont, skip_special_tokens=True)[0] if cont != "" else "" + if self.accelerator.is_main_process and doc_id[0] % 100 == 0: eval_logger.debug(f"Generated text for doc ID {doc_id[0]}:\n\n{text_outputs}\n") @@ -119,5 +122,15 @@ def _collate(x): # reorder this group of results back to original unsorted form res = re_ords.get_original(res) + metric_dict = { + "total_tokens": total_tokens, + "e2e_latency": e2e_latency, + "avg_speed": total_tokens / e2e_latency if e2e_latency > 0 else 0, + "additional_metrics": { + "rank": self.rank, + }, + } + log_metrics(**metric_dict) + pbar.close() return res diff --git a/lmms_eval/models/chat/openai_compatible.py b/lmms_eval/models/chat/openai_compatible.py index 7196d72e7..d66755d46 100644 --- a/lmms_eval/models/chat/openai_compatible.py +++ b/lmms_eval/models/chat/openai_compatible.py @@ -24,6 +24,7 @@ from openai import AzureOpenAI, OpenAI from PIL import Image +from lmms_eval.models.model_utils.gen_metrics import log_metrics from lmms_eval.models.simple.openai_compatible import ( OpenAICompatible as OpenAICompatibleSimple, ) @@ -40,6 +41,8 @@ def generate_until(self, requests) -> List[str]: res = [] pbar = tqdm(total=len(requests), disable=(self.rank != 0), desc="Model Responding") + e2e_latency = 0 + total_tokens = 0 for ctx, doc_to_messages, gen_kwargs, doc_id, task, split in [reg.args for reg in requests]: if self.continual_mode is True and self.cache_mode == "resume": doc_uuid = f"{task}___{split}___{doc_id}" @@ -81,8 +84,22 @@ def generate_until(self, requests) -> List[str]: for attempt in range(self.max_retries): try: + start_time = time.time() response = self.client.chat.completions.create(**payload) + end_time = time.time() + response_text = response.choices[0].message.content + + # Calculate timing metrics + e2e_latency += end_time - start_time + + # Get token counts from response if available + if hasattr(response, "usage"): + total_tokens += response.usage.completion_tokens + else: + # Approximate token count if not provided + total_tokens += len(response_text.split()) + break # If successful, break out of the loop except Exception as e: @@ -105,5 +122,15 @@ def generate_until(self, requests) -> List[str]: with open(self.response_persistent_file, "w") as f: json.dump(self.response_cache, f) + # Calculate average speed + avg_speed = total_tokens / e2e_latency if e2e_latency > 0 else 0 + # Log metrics + metric_dict = { + "total_tokens": total_tokens, + "e2e_latency": e2e_latency, + "avg_speed": avg_speed, + } + log_metrics(**metric_dict) + pbar.close() return res diff --git a/lmms_eval/models/chat/qwen2_5_vl.py b/lmms_eval/models/chat/qwen2_5_vl.py index f74ce3e5c..be0afee90 100644 --- a/lmms_eval/models/chat/qwen2_5_vl.py +++ b/lmms_eval/models/chat/qwen2_5_vl.py @@ -1,3 +1,4 @@ +import time from typing import List, Optional, Tuple, Union import numpy as np @@ -8,6 +9,7 @@ from lmms_eval import utils from lmms_eval.api.instance import Instance from lmms_eval.api.registry import register_model +from lmms_eval.models.model_utils.gen_metrics import log_metrics from lmms_eval.models.model_utils.reasoning_model_utils import ( parse_reasoning_model_answer, ) @@ -38,6 +40,8 @@ def _collate(x): chunks = re_ords.get_batched(n=self.batch_size, batch_fn=None) num_iters = len(requests) // self.batch_size if len(requests) % self.batch_size == 0 else len(requests) // self.batch_size + 1 pbar = tqdm(total=num_iters, disable=(self.rank != 0), desc="Model Responding") + e2e_latency = 0 + total_tokens = 0 for chunk in chunks: ctx, doc_to_messages, all_gen_kwargs, doc_id, task, split = zip(*chunk) chat_messages = [doc_to_messages[0](self.task_dict[task][split][ids]) for ids, task, split in zip(doc_id, task, split)] @@ -88,6 +92,7 @@ def _collate(x): current_gen_kwargs["temperature"] = None current_gen_kwargs["top_p"] = None + start_time = time.time() cont = self.model.generate( **inputs, eos_token_id=self.tokenizer.eos_token_id, @@ -99,10 +104,15 @@ def _collate(x): max_new_tokens=current_gen_kwargs["max_new_tokens"], use_cache=self.use_cache, ) + end_time = time.time() generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, cont)] answers = self.processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False) + # Calculate timing metrics for batch + e2e_latency += end_time - start_time + total_tokens += sum(len(ids) for ids in generated_ids_trimmed) + for ans, context in zip(answers, texts): clean_ans = parse_reasoning_model_answer(ans) res.append(clean_ans) @@ -115,5 +125,18 @@ def _collate(x): # reorder this group of results back to original unsorted form res = re_ords.get_original(res) + # Calculate average speed + avg_speed = total_tokens / e2e_latency if e2e_latency > 0 else 0 + # Log metrics + metric_dict = { + "total_tokens": total_tokens, + "e2e_latency": e2e_latency, + "avg_speed": avg_speed, + "additional_metrics": { + "rank": self.rank, + }, + } + log_metrics(**metric_dict) + pbar.close() return res diff --git a/lmms_eval/models/chat/sglang.py b/lmms_eval/models/chat/sglang.py index 451cae185..92c477914 100644 --- a/lmms_eval/models/chat/sglang.py +++ b/lmms_eval/models/chat/sglang.py @@ -1,17 +1,17 @@ import json +import time import warnings from typing import List, Optional, Tuple, Union -import PIL from accelerate import Accelerator, DistributedType from sglang import Engine from tqdm import tqdm from transformers import AutoProcessor -from lmms_eval import utils from lmms_eval.api.instance import Instance from lmms_eval.api.model import lmms from lmms_eval.api.registry import register_model +from lmms_eval.models.model_utils.gen_metrics import log_metrics from lmms_eval.models.model_utils.load_video import load_video_decord from lmms_eval.protocol import ChatMessages @@ -26,7 +26,7 @@ class Sglang(lmms): def __init__( self, - model_version: str = "Qwen/Qwen2.5-VL-3B-Instruct", + model: str = "Qwen/Qwen2.5-VL-3B-Instruct", tensor_parallel_size: int = 1, gpu_memory_utilization: float = 0.8, batch_size: int = 1, @@ -40,7 +40,7 @@ def __init__( # Manually set a image token for GPT4V so that we can search for it # and split the text and image # Here we just use the same token as llava for convenient - self.model_version = model_version + self._model = model self.max_frame_num = max_frame_num self.threads = threads self.chat_template = chat_template @@ -53,9 +53,9 @@ def __init__( except json.JSONDecodeError: eval_logger.warning(f"Failed to parse JSON-like string for argument '{key}': {value}") - # Set up vllm client - self.client = Engine(model_path=model_version, tp_size=tensor_parallel_size, mem_fraction_static=gpu_memory_utilization, **kwargs) - self.processor = AutoProcessor.from_pretrained(model_version) + # Set up sglang client + self.client = Engine(model_path=model, tp_size=tensor_parallel_size, mem_fraction_static=gpu_memory_utilization, **kwargs) + self.processor = AutoProcessor.from_pretrained(model) accelerator = Accelerator() if accelerator.num_processes > 1: @@ -120,6 +120,8 @@ def generate_until(self, requests) -> List[str]: batch_size = self.batch_size_per_gpu batched_requests = [requests[i : i + batch_size] for i in range(0, len(requests), batch_size)] + total_tokens = 0 + e2e_latency = 0 for batch_requests in batched_requests: batched_messages = [] image_data = [] @@ -160,13 +162,37 @@ def generate_until(self, requests) -> List[str]: tokenize=False, add_generation_prompt=True, ) + + start_time = time.time() outputs = self.client.generate(texts, params, image_data=image_data) + end_time = time.time() response_text = [o["text"] for o in outputs] + # Calculate timing metrics for batch + e2e_latency += end_time - start_time + + for output_idx, output in enumerate(outputs): + # Get token count from output + if "meta_info" in output and "completion_tokens" in output["meta_info"]: + output_tokens = output["meta_info"]["completion_tokens"] + else: + output_tokens = len(output["text"].split()) + + total_tokens += output_tokens + + if len(outputs) > 1: + avg_speed = total_tokens / e2e_latency if e2e_latency > 0 else 0 + assert len(response_text) == len(batch_requests) res.extend(response_text) pbar.update(len(batch_requests)) + metric_dict = { + "total_tokens": total_tokens, + "e2e_latency": e2e_latency, + "avg_speed": avg_speed, + } + log_metrics(**metric_dict) pbar.close() return res diff --git a/lmms_eval/models/chat/vllm.py b/lmms_eval/models/chat/vllm.py index c64895dc4..4538e3fa3 100644 --- a/lmms_eval/models/chat/vllm.py +++ b/lmms_eval/models/chat/vllm.py @@ -1,24 +1,11 @@ -import asyncio -import base64 -import json -import os import time -from concurrent.futures import ThreadPoolExecutor -from copy import deepcopy -from io import BytesIO -from multiprocessing import cpu_count from typing import List, Optional, Tuple, Union -import numpy as np -from accelerate import Accelerator, DistributedType -from decord import VideoReader, cpu -from loguru import logger as eval_logger -from PIL import Image from tqdm import tqdm from lmms_eval.api.instance import Instance -from lmms_eval.api.model import lmms from lmms_eval.api.registry import register_model +from lmms_eval.models.model_utils.gen_metrics import log_metrics from lmms_eval.models.simple.vllm import VLLM as VLLMSimple from lmms_eval.protocol import ChatMessages @@ -40,6 +27,7 @@ def generate_until(self, requests) -> List[str]: batch_size = self.batch_size_per_gpu batched_requests = [requests[i : i + batch_size] for i in range(0, len(requests), batch_size)] + e2e_latency = 0 for batch_requests in batched_requests: batched_messages = [] for idx in range(len(batch_requests)): @@ -65,18 +53,39 @@ def generate_until(self, requests) -> List[str]: sampling_params = SamplingParams(**params) + start_time = time.time() if self.chat_template is not None: with open(self.chat_template, "r") as f: chat_template = f.read() response = self.client.chat(sampling_params=sampling_params, messages=batched_messages, chat_template=chat_template) else: response = self.client.chat(sampling_params=sampling_params, messages=batched_messages) + end_time = time.time() + response_text = [o.outputs[0].text for o in response] + # Calculate timing metrics for batch + e2e_latency += end_time - start_time + assert len(response_text) == len(batch_requests) res.extend(response_text) pbar.update(len(batch_requests)) + metrics = self.get_format_metrics() + total_tokens = metrics["generation_tokens"] + avg_speed = total_tokens / e2e_latency if e2e_latency > 0 else 0 + metric_dict = { + "total_tokens": total_tokens, + "e2e_latency": e2e_latency, + "avg_speed": avg_speed, + "additional_metrics": { + "ttft": metrics["ttft"], + "tpot": metrics["tpot"], + "rank": self.rank, + }, + } + log_metrics(**metric_dict) + pbar.close() return res @@ -86,3 +95,25 @@ def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]: def generate_until_multi_round(self, requests) -> List[str]: raise NotImplementedError("TODO: Implement multi-round generation") + + def get_format_metrics(self): + metrics = self.client.get_metrics() + ttft = 0 + tpot = 0 + generation_tokens = 0 + for metric in metrics: + name = metric.name + if "time_to_first_token" in name: + ttft = metric.sum / metric.count + if "time_per_output_token_seconds" in name: + tpot = metric.sum / metric.count + if name == "vllm:generation_tokens": + generation_tokens = metric.value + + metrics = { + "ttft": ttft, + "tpot": tpot, + "generation_tokens": generation_tokens, + } + + return metrics diff --git a/lmms_eval/models/model_utils/gen_metrics.py b/lmms_eval/models/model_utils/gen_metrics.py new file mode 100644 index 000000000..aff03cb27 --- /dev/null +++ b/lmms_eval/models/model_utils/gen_metrics.py @@ -0,0 +1,90 @@ +import time +from typing import Any, Callable, Dict, List + +import torch +from loguru import logger as eval_logger + + +def space_tokenizer(text: str) -> int: + """ + A simple tokenizer that counts the token by the splitted space + Then a rough estimate of the token count is returned. (times 1.5) + Args: + text (str): The input text to tokenize. + """ + return len(text.split(" ")) * 1.5 + + +def calculate_token_throughput(token_count: int, duration: float) -> float: + """ + Calculate the token throughput. + + Args: + token_count (int): The number of tokens processed. + duration (float): The time taken to process the tokens in seconds. + + Returns: + float: The token throughput in tokens per second. + """ + if duration <= 0: + return 0.0 + return token_count / duration + + +def log_metrics(e2e_latency: float, total_tokens: int, avg_speed: float, additional_metrics: Dict[str, Any] = None): + """ + Log the metrics in a structured format. + + Args: + e2e_latency (float): The end-to-end latency in seconds. + total_tokens (int): The total number of tokens processed. + avg_speed (float): The average speed in tokens per second. + additional_metrics (Dict[str, Any]): Additional metrics to log. + """ + required_stats = f"Metric summary - Total time: {e2e_latency:.3f}s, Total tokens: {total_tokens}, Avg speed: {avg_speed:.1f} tokens/s" + if additional_metrics is not None: + required_stats += ", Additional metrics: " + required_stats += ", ".join(f"{k}: {v:.4f}" if isinstance(v, float) else f"{k}: {v}" for k, v in additional_metrics.items()) + eval_logger.info(required_stats) + + +class GenMetrics: + """ + A class to manage the generation of metrics for model evaluation. + """ + + def __init__(self, tokenize_fn: Callable = space_tokenizer): + self.tokenize_fn = tokenize_fn + + def __enter__(self): + """ + Initialize the context manager. + """ + self.metrics = {} + self.start_time = time.perf_counter() + return self + + def stop_timer(self): + self.end_time = time.perf_counter() + + def log_metric(self, content: List[Any], additional_metrics: Dict[str, Any] = None): + num_tokens = sum(self.tokenize_fn(item) for item in content) + duration = self.end_time - self.start_time + throughput = calculate_token_throughput(num_tokens, duration) + self.metrics = { + "num_tokens": num_tokens, + "duration": duration, + "throughput": throughput, + } + if additional_metrics: + self.metrics.update(additional_metrics) + + log_metrics(self.metrics) + + def __exit__(self, exc_type, exc_value, traceback): + """ + Finalize the context manager and return the collected metrics. + """ + self.end_time = time.perf_counter() + self.metrics["duration"] = self.end_time - self.start_time + return self.metrics diff --git a/lmms_eval/models/simple/vllm.py b/lmms_eval/models/simple/vllm.py index 6b3aba9c3..26bfb9853 100644 --- a/lmms_eval/models/simple/vllm.py +++ b/lmms_eval/models/simple/vllm.py @@ -53,7 +53,7 @@ def __init__( self.chat_template = chat_template self.min_image_pixels = min_image_pixels # Qwen 2/2.5-VL models enforce minimum image dimensions - self._enforce_image_resize = self._is_qwen_vl_model(model_version) + self._enforce_image_resize = self._is_qwen_vl_model(model) # Convert any string arguments that start with { and end with } to dictionaries for key, value in kwargs.items(): @@ -70,6 +70,7 @@ def __init__( tensor_parallel_size=tensor_parallel_size, gpu_memory_utilization=gpu_memory_utilization, trust_remote_code=trust_remote_code, + disable_log_stats=False, **kwargs, ) diff --git a/test/test_throughput_metrics.py b/test/test_throughput_metrics.py new file mode 100644 index 000000000..790825564 --- /dev/null +++ b/test/test_throughput_metrics.py @@ -0,0 +1,89 @@ +#!/usr/bin/env python3 +""" +Test script to verify the inference throughput metrics implementation. +This script demonstrates how the metrics are calculated and logged. +""" + +import time + + +# Simple logger replacement for testing +class MockLogger: + def info(self, msg): + print(f"INFO: {msg}") + + +eval_logger = MockLogger() + + +def test_tpot_calculation(): + """Test TPOT and inference speed calculation with example values.""" + + # Example metrics + e2e_latency = 2.5 # seconds + ttft = 0.25 # seconds (Time to First Token) + num_output_tokens = 50 + + # Calculate TPOT using the implemented formula + if num_output_tokens > 1: + tpot = (e2e_latency - ttft) / (num_output_tokens - 1) + inference_speed = 1 / tpot if tpot > 0 else 0 + else: + tpot = e2e_latency + inference_speed = 0 + + print(f"Test Metrics:") + print(f"E2E Latency: {e2e_latency:.3f}s") + print(f"TTFT: {ttft:.3f}s") + print(f"Output tokens: {num_output_tokens}") + print(f"TPOT: {tpot:.3f}s") + print(f"Inference Speed: {inference_speed:.1f} tokens/s") + print(f"Expected TPOT: {(2.5-0.25)/(50-1):.3f}s") + print(f"Expected Speed: {1/((2.5-0.25)/(50-1)):.1f} tokens/s") + + return tpot, inference_speed + + +def simulate_model_inference(): + """Simulate a model inference with timing measurements.""" + + print("\nSimulating model inference...") + + # Simulate generation time + start_time = time.time() + time.sleep(0.1) # Simulate inference delay + end_time = time.time() + + # Simulate output + generated_tokens = 25 + e2e_latency = end_time - start_time + ttft = e2e_latency * 0.1 # Estimate TTFT as 10% of total time + + # Calculate metrics + if generated_tokens > 1: + tpot = (e2e_latency - ttft) / (generated_tokens - 1) + inference_speed = 1 / tpot if tpot > 0 else 0 + else: + tpot = e2e_latency + inference_speed = 0 + + # Log using same format as implementation + eval_logger.info(f"Inference metrics - E2E: {e2e_latency:.3f}s, TTFT: {ttft:.3f}s, TPOT: {tpot:.3f}s, Speed: {inference_speed:.1f} tokens/s, Output tokens: {generated_tokens}") + + +if __name__ == "__main__": + print("Testing TPOT and Inference Speed Calculation") + print("=" * 50) + + # Test the calculation logic + test_tpot_calculation() + + # Simulate actual inference + simulate_model_inference() + + print("\nImplementation Summary:") + print("- Added timing measurement around model.generate() calls") + print("- Calculate TPOT = (e2e_latency - TTFT) / (num_output_tokens - 1)") + print("- Calculate inference_speed = 1 / TPOT") + print("- Log metrics for each inference request") + print("- Modified all chat models: openai_compatible, vllm, sglang, huggingface, llava_hf, qwen2_5_vl")