
Rebase May 07 #1220

Open
wants to merge 34 commits into habana_main from rebase_May_07
Commits (34)
d419aa5
[V1] Enable TPU V1 backend by default (#17673)
mgoin May 6, 2025
a6fed02
[V1][PP] Support PP for MultiprocExecutor (#14219)
bigPYJ1151 May 6, 2025
cba31c4
[v1] AttentionMetadata for each layer (#17394)
heheda12345 May 6, 2025
175bda6
[Feat] Add deprecated=True to CLI args (#17426)
aarnphm May 6, 2025
0d11546
[Docs] Use gh-file to add links to tool_calling.md (#17709)
windsonsea May 6, 2025
aabcd2c
[v1] Introduce KVCacheBlocks as interface between Scheduler and KVCac…
heheda12345 May 6, 2025
7525d5f
[doc] Add RAG Integration example (#17692)
reidliu41 May 6, 2025
5b8c390
[Bugfix] Fix modality limits in vision language example (#17721)
DarkLight1337 May 6, 2025
6115b11
Make right sidebar more readable in "Supported Models" (#17723)
hmellor May 6, 2025
621ca2c
[TPU] Increase block size and reset block shapes (#16458)
bythew3i May 6, 2025
d456aea
[Misc] Add Next Edit Prediction (NEP) datasets support in `benchmark_…
dtransposed May 6, 2025
de906b9
[Bugfix] Fix for the condition to accept empty encoder inputs for mll…
gshtras May 6, 2025
2f925e5
[Kernel] Unified Triton kernel that doesn't distinguish between prefi…
tdoublep May 6, 2025
022afbe
Fix doc build performance (#17748)
hmellor May 7, 2025
ed3a1d2
[ROCm] fix num_stages for default moe config to avoid triton OutOfRes…
hongxiayang May 7, 2025
6de3e13
Add logging for torch nightly version (#17669)
yangw-dev May 7, 2025
18dd5e0
[Model] Mamba2 causal conv1d Refactor to Split Prefill and Decode Req…
cyang49 May 7, 2025
a17cef7
Removed unused marlin cuda code (#17684)
mgoin May 7, 2025
e50a1f1
[TPU] Add kernel test for moe_pallas (#17496)
mgoin May 7, 2025
950b711
Replace lm-eval bash script with pytest and use enforce_eager for fas…
mgoin May 7, 2025
8d84d83
[BugFix][Spec Decode] Fix hidden size mismatch between target and eag…
WoosukKwon May 7, 2025
822de7f
[Misc] Split model loader (#17712)
jeejeelee May 7, 2025
c3e9d50
[Misc] Use `apply_rotary_emb` from vllm_flash_attn for Qwen2-VL visio…
Isotr0py May 7, 2025
1a45a61
[Kernel] GGUF MoeVec kernel (#16780)
SzymonOzog May 7, 2025
f80ae5b
[Kernel] Use fused rmsnorm for some models like qwen3 series (#17735)
Eviannn May 7, 2025
ba7703e
[Misc] Remove qlora_adapter_name_or_path (#17699)
jeejeelee May 7, 2025
043e4c4
Add NeuronxDistributedInference support, Speculative Decoding, Dynami…
aws-satyajith May 7, 2025
8a15c26
[Frontend] Add missing chat templates for various MLLMs (#17758)
DarkLight1337 May 7, 2025
324a311
Fix test_memory_usage_no_spec (#17754)
sarckk May 7, 2025
98c89e1
Make key optional for rotary embedding (#17566)
sarckk May 7, 2025
f265865
Merge remote-tracking branch 'upstream/main' into habana_main
michalkuligowski May 7, 2025
82872a0
Update rotary_embedding.py
michalkuligowski May 7, 2025
ed57c46
Merge branch 'habana_main' into rebase_May_07
michalkuligowski May 9, 2025
0513cc3
Update rotary_embedding.py
michalkuligowski May 9, 2025
39 changes: 39 additions & 0 deletions .buildkite/lm-eval-harness/conftest.py
@@ -0,0 +1,39 @@
# SPDX-License-Identifier: Apache-2.0
from pathlib import Path

import pytest


def pytest_addoption(parser):
    parser.addoption(
        "--config-list-file",
        action="store",
        help="Path to the file listing model config YAMLs (one per line)")
    parser.addoption("--tp-size",
                     action="store",
                     default="1",
                     help="Tensor parallel size to use for evaluation")


@pytest.fixture(scope="session")
def config_list_file(pytestconfig, config_dir):
    rel_path = pytestconfig.getoption("--config-list-file")
    return config_dir / rel_path


@pytest.fixture(scope="session")
def tp_size(pytestconfig):
    return pytestconfig.getoption("--tp-size")


def pytest_generate_tests(metafunc):
    if "config_filename" in metafunc.fixturenames:
        rel_path = metafunc.config.getoption("--config-list-file")
        config_list_file = Path(rel_path).resolve()
        config_dir = config_list_file.parent
        with open(config_list_file, encoding="utf-8") as f:
            configs = [
                config_dir / line.strip() for line in f
                if line.strip() and not line.startswith("#")
            ]
        metafunc.parametrize("config_filename", configs)
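
For context, each non-comment line of the file passed via --config-list-file becomes one parametrized test case, resolved relative to the list file's directory. A minimal sketch of a hypothetical configs/models-small.txt (entries are illustrative, not a listing from the repo):

# one model config YAML per line; blank lines and "#" comments are skipped
Meta-Llama-3-8B-Instruct.yaml
Qwen2.5-1.5B-Instruct.yaml

The matching invocation, as used in .buildkite/test-pipeline.yaml below, is:
pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-small.txt --tp-size=1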
59 changes: 0 additions & 59 deletions .buildkite/lm-eval-harness/run-tests.sh

This file was deleted.

41 changes: 11 additions & 30 deletions .buildkite/lm-eval-harness/test_lm_eval_correctness.py
@@ -3,67 +3,48 @@
LM eval harness on model to compare vs HF baseline computed offline.
Configs are found in configs/$MODEL.yaml

* export LM_EVAL_TEST_DATA_FILE=configs/Meta-Llama-3-70B-Instruct.yaml
* export LM_EVAL_TP_SIZE=4
* pytest -s test_lm_eval_correctness.py
pytest -s -v test_lm_eval_correctness.py \
--config-list-file=configs/models-small.txt \
--tp-size=1
"""

import os
from pathlib import Path

import lm_eval
import numpy
import pytest
import numpy as np
import yaml

RTOL = 0.08
TEST_DATA_FILE = os.environ.get(
"LM_EVAL_TEST_DATA_FILE",
".buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml")

TP_SIZE = os.environ.get("LM_EVAL_TP_SIZE", 1)


def launch_lm_eval(eval_config):
def launch_lm_eval(eval_config, tp_size):
trust_remote_code = eval_config.get('trust_remote_code', False)

model_args = f"pretrained={eval_config['model_name']}," \
f"tensor_parallel_size={TP_SIZE}," \
f"tensor_parallel_size={tp_size}," \
f"enforce_eager=true," \
f"add_bos_token=true," \
f"trust_remote_code={trust_remote_code}"

results = lm_eval.simple_evaluate(
model="vllm",
model_args=model_args,
tasks=[task["name"] for task in eval_config["tasks"]],
num_fewshot=eval_config["num_fewshot"],
limit=eval_config["limit"],
batch_size="auto")

return results


def test_lm_eval_correctness():
eval_config = yaml.safe_load(
Path(TEST_DATA_FILE).read_text(encoding="utf-8"))

if eval_config[
"model_name"] == "nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform": #noqa: E501
pytest.skip("FBGEMM is currently failing on main.")
def test_lm_eval_correctness_param(config_filename, tp_size):
eval_config = yaml.safe_load(config_filename.read_text(encoding="utf-8"))

# Launch eval requests.
results = launch_lm_eval(eval_config)
results = launch_lm_eval(eval_config, tp_size)

# Confirm scores match ground truth.
success = True
for task in eval_config["tasks"]:
for metric in task["metrics"]:
ground_truth = metric["value"]
measured_value = results["results"][task["name"]][metric["name"]]
print(f'{task["name"]} | {metric["name"]}: '
f'ground_truth={ground_truth} | measured={measured_value}')
success = success and numpy.isclose(
success = success and np.isclose(
ground_truth, measured_value, rtol=RTOL)

# Assert at the end, print all scores even on failure for debugging.
assert success
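
For reference, the per-model YAML read by test_lm_eval_correctness_param needs at least the keys accessed above: model_name, tasks (each with a name and a list of metrics carrying the ground-truth value), num_fewshot, limit, and optionally trust_remote_code. A minimal sketch with illustrative values, not copied from the repo's configs:

model_name: "meta-llama/Meta-Llama-3-8B-Instruct"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.75
num_fewshot: 5
limit: 250
trust_remote_code: false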
4 changes: 3 additions & 1 deletion .buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
@@ -47,7 +47,9 @@ docker run --privileged --net host --shm-size=16G -it \
&& echo TEST_10 \
&& pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py \
&& echo TEST_11 \
&& pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py" \
&& pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py \
&& echo TEST_12 \
&& pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py" \


# TODO: This test fails because it uses RANDOM_SEED sampling
4 changes: 2 additions & 2 deletions .buildkite/test-pipeline.yaml
@@ -408,7 +408,7 @@ steps:
- vllm/model_executor/layers/quantization
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- bash ./run-tests.sh -c configs/models-small.txt -t 1
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-small.txt --tp-size=1

- label: OpenAI API correctness
source_file_dependencies:
@@ -713,4 +713,4 @@ steps:
- vllm/model_executor/layers/quantization
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- bash ./run-tests.sh -c configs/models-large.txt -t 4
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
88 changes: 88 additions & 0 deletions benchmarks/benchmark_dataset.py
@@ -887,6 +887,94 @@ def sample(self,
return sampled_requests


# -----------------------------------------------------------------------------
# Next Edit Prediction Dataset Implementation
# -----------------------------------------------------------------------------


zeta_prompt = """### Instruction:
You are a code completion assistant and your task is to analyze user edits and then rewrite an excerpt that the user provides, suggesting the appropriate edits within the excerpt, taking into account the cursor location.

### User Edits:

{}

### User Excerpt:

{}

### Response:

""" # noqa: E501


def _format_zeta_prompt(
        sample: dict,
        original_start_marker: str = "<|editable_region_start|>") -> dict:
    """Format the zeta prompt for the Next Edit Prediction (NEP) dataset.

    This function formats examples from the NEP dataset
    into prompts and expected outputs. It could be
    further extended to support more NEP datasets.

    Args:
        sample: The dataset sample containing events,
            inputs, and outputs.
        original_start_marker: The marker indicating the
            start of the editable region. Defaults to
            "<|editable_region_start|>".

    Returns:
        A dictionary with the formatted prompts and expected outputs.
    """
    events = sample["events"]
    input = sample["input"]
    output = sample["output"]
    prompt = zeta_prompt.format(events, input)

    # following the original implementation, extract the focused region
    # from the raw output
    output_start_index = output.find(original_start_marker)
    output_focused_region = output[output_start_index:]
    expected_output = output_focused_region

    return {"prompt": prompt, "expected_output": expected_output}


class NextEditPredictionDataset(HuggingFaceDataset):
    """
    Dataset class for processing a Next Edit Prediction dataset.
    """

    SUPPORTED_DATASET_PATHS = {
        "zed-industries/zeta",
    }
    MAPPING_PROMPT_FUNCS = {
        "zed-industries/zeta": _format_zeta_prompt,
    }

    def sample(self, tokenizer: PreTrainedTokenizerBase, num_requests: int,
               **kwargs):
        formatting_prompt_func = self.MAPPING_PROMPT_FUNCS.get(
            self.dataset_path)
        if formatting_prompt_func is None:
            raise ValueError(f"Unsupported dataset path: {self.dataset_path}")
        samples = []
        for sample in self.data:
            sample = formatting_prompt_func(sample)
            samples.append(
                SampleRequest(
                    prompt=sample["prompt"],
                    prompt_len=len(tokenizer(sample["prompt"]).input_ids),
                    expected_output_len=len(
                        tokenizer(sample["expected_output"]).input_ids),
                ))
            if len(samples) >= num_requests:
                break
        self.maybe_oversample_requests(samples, num_requests)
        return samples


# -----------------------------------------------------------------------------
# ASR Dataset Implementation
# -----------------------------------------------------------------------------
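To make the NEP preprocessing above concrete, here is a small usage sketch (not part of the PR) that assumes _format_zeta_prompt from benchmarks/benchmark_dataset.py is in scope; the record is invented and only mirrors the keys the function reads (events, input, output):

# hypothetical record shaped like one zed-industries/zeta row
sample = {
    "events": "User renamed `total` to `subtotal` in cart.py",
    "input": "<|editable_region_start|>\ndef total(items):\n    return sum(items)\n",
    "output": "...\n<|editable_region_start|>\ndef subtotal(items):\n    return sum(items)\n",
}

formatted = _format_zeta_prompt(sample)
# formatted["prompt"] embeds the edit events and the excerpt into zeta_prompt;
# formatted["expected_output"] keeps only the text starting at the first
# "<|editable_region_start|>" marker of the raw output.
assert formatted["expected_output"].startswith("<|editable_region_start|>")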
8 changes: 6 additions & 2 deletions benchmarks/benchmark_serving.py
@@ -53,8 +53,9 @@
from benchmark_dataset import (AIMODataset, ASRDataset, BurstGPTDataset,
ConversationDataset, HuggingFaceDataset,
InstructCoderDataset, MTBenchDataset,
RandomDataset, SampleRequest, ShareGPTDataset,
SonnetDataset, VisionArenaDataset)
NextEditPredictionDataset, RandomDataset,
SampleRequest, ShareGPTDataset, SonnetDataset,
VisionArenaDataset)
from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json

MILLISECONDS_TO_SECONDS_CONVERSION = 1000
@@ -603,6 +604,9 @@ def main(args: argparse.Namespace):
elif args.dataset_path in AIMODataset.SUPPORTED_DATASET_PATHS:
dataset_class = AIMODataset
args.hf_split = "train"
elif args.dataset_path in NextEditPredictionDataset.SUPPORTED_DATASET_PATHS: # noqa: E501
dataset_class = NextEditPredictionDataset
args.hf_split = "train"
elif args.dataset_path in ASRDataset.SUPPORTED_DATASET_PATHS:
dataset_class = ASRDataset
args.hf_split = "train"
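
With the dispatch above in place, the zeta dataset should be reachable from the serving benchmark. A hypothetical invocation sketch, assuming benchmark_serving.py's existing HuggingFace-dataset flags (--dataset-name hf, --dataset-path) and a vLLM server already running the target model:

python benchmarks/benchmark_serving.py \
    --model meta-llama/Meta-Llama-3-8B-Instruct \
    --dataset-name hf \
    --dataset-path zed-industries/zeta \
    --num-prompts 100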