Skip to content

Commit 5ff5046

Browse files
authored
Merge pull request #409 from NishantSinghhhhh/Restoration_single_task_bench_with_compression
[LFX Term 1 2026] Restoring LLM Edge Benchmark Suite Single Task Bench With Compression
2 parents bd88857 + 059c0d8 commit 5ff5046

4 files changed

Lines changed: 123 additions & 46 deletions

File tree

Lines changed: 56 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,57 @@
1-
Large Language Model Edge Benchmark Suite: Implementation on KubeEdge-lanvs
1+
# llm-edge-benchmark-suite single_task_bench_with_compression
22

3+
This guide outlines the complete setup, configuration, and execution process for running the compression-enabled Large Language Model (LLM) benchmarking suite using the [Ianvs](https://github.com/kubeedge/ianvs) edge computing framework.
4+
5+
> **CRITICAL FIRST STEPS: ABSOLUTE PATHS & DEPENDENCIES**
6+
> 1. **Correct all paths:** You **must** change every relative path (e.g., `models/qwen/...` or `dataset/...`) in all your `.yaml` configuration files to **absolute paths** (e.g., `/home/user/ianvs/models/qwen/...`). Ianvs will crash if it encounters relative paths.
7+
> 2. **Dependencies:** You must install the necessary packages via `requirements.txt` before executing any runs.
8+
9+
---
10+
11+
## Step 1: Environment Setup
12+
13+
First, ensure your Ianvs virtual environment is active:
14+
```bash
15+
source /path/to/your/ianvs_env/bin/activate
16+
```
17+
18+
Install the dependencies listed in `requirements.txt`:
19+
```bash
20+
pip install -r ianvs/examples/llm-edge-benchmark-suite/single_task_bench_with_compression/requirements.txt
21+
```
22+
23+
---
24+
25+
## Step 2: Shared Model Acquisition
26+
27+
This benchmark shares the same `.gguf` model file as the standard suite. Ensure the `Qwen1.5-0.5B-Chat` model exists in your central models directory.
28+
29+
If it is missing, download it using a resumable command:
30+
```bash
31+
mkdir -p /ianvs/models/qwen
32+
wget -c -O /ianvs/models/qwen/qwen_1_5_0_5b.gguf https://huggingface.co/Qwen/Qwen1.5-0.5B-Chat-GGUF/resolve/main/qwen1_5-0_5b-chat-q4_k_m.gguf
33+
```
34+
35+
---
36+
37+
## Step 3: Configuration Alignment (Fixing the YAMLs)
38+
39+
You must manually update the YAML configuration files to remove relative paths and satisfy the framework's strict naming requirements.
40+
41+
### 1. Test Environment (`testenv/testenv.yaml`)
42+
Update the dataset location to an absolute path:
43+
```yaml
44+
dataset:
45+
  train_data: "/home/user/ianvs/dataset/data.jsonl"
46+
```
47+
48+
## Step 4: Execution
49+
50+
Once all paths are absolute and the script is updated, execute the benchmark:
51+
52+
```bash
53+
ianvs -f ianvs/examples/llm-edge-benchmark-suite/single_task_bench_with_compression/benchmarkingjob.yaml
54+
```
55+
56+
### Expected Output
57+
Ianvs will execute the benchmark and generate a `workspace` directory. You will see a successful run log and a final table detailing latency, throughput, and prefill latency.
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
# LLM Core Execution
2+
llama-cpp-python>=0.2.20
3+
4+
# Machine Learning & Neural Network Basics
5+
torch>=2.0.0
6+
transformers>=4.35.0
7+
numpy>=1.24.0
8+
9+
# Ianvs Utilities & Data Handling
10+
pyyaml>=6.0
11+
pandas>=2.0.0
12+
requests>=2.31.0

examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testalgorithms/algorithm.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
algorithm:
2-
paradigm_type: "singletasklearning_with_compression"
2+
paradigm_type: "singletasklearning"
33
mode: "with_compression"
44
initial_model_url: "models/qwen/qwen_1_5_0_5b.gguf"
55
quantization_type: "q8_0"
Lines changed: 54 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
11
from sedna.common.class_factory import ClassFactory, ClassType
22
from llama_cpp import Llama
3-
from contextlib import redirect_stderr
43
import os
54
import psutil
65
import time
7-
import io
8-
import statistics
6+
import logging
7+
8+
logging.getLogger().setLevel(logging.INFO)
99

1010
@ClassFactory.register(ClassType.GENERAL, alias="LlamaCppModel")
1111
class LlamaCppModel:
@@ -16,7 +16,11 @@ def __init__(self, **kwargs):
1616
model_path = kwargs.get("model_path")
1717
if not model_path:
1818
raise ValueError("Model path is required.")
19+
1920
quantization_type = kwargs.get("quantization_type", None)
21+
if quantization_type:
22+
logging.info(f"Using quantization type: {quantization_type}")
23+
2024
# Init LLM model
2125
self.model = Llama(
2226
model_path=model_path,
@@ -30,37 +34,47 @@ def __init__(self, **kwargs):
3034
embedding=kwargs.get("embedding", False),
3135
)
3236

37+
# 1. FIXED: Optional arguments for Ianvs pipeline
38+
def preprocess(self, data=None, **kwargs):
39+
"""
40+
Pass-through for text data.
41+
"""
42+
return data
43+
3344
def predict(self, data, input_shape=None, **kwargs):
3445
data = data[:10]
3546
process = psutil.Process(os.getpid())
36-
start_time = time.time()
3747

3848
results = []
39-
total_times = []
40-
prefill_latencies = []
41-
mem_usages = []
4249

4350
for prompt in data:
4451
prompt_start_time = time.time()
4552

46-
f = io.StringIO()
47-
with redirect_stderr(f):
48-
output = self.model(
49-
prompt=prompt,
50-
max_tokens=kwargs.get("max_tokens", 32),
51-
stop=kwargs.get("stop", ["Q:", "\n"]),
52-
echo=kwargs.get("echo", True),
53-
temperature=kwargs.get("temperature", 0.8),
54-
top_p=kwargs.get("top_p", 0.95),
55-
top_k=kwargs.get("top_k", 40),
56-
repeat_penalty=kwargs.get("repeat_penalty", 1.1),
57-
)
58-
stdout_output = f.getvalue()
59-
60-
# parse timing info
61-
timings = self._parse_timings(stdout_output)
62-
prefill_latency = timings.get('prompt_eval_time', 0.0) # ms
63-
generated_text = output['choices'][0]['text']
53+
# Run model with stream=True to measure exact TTFT
54+
output_stream = self.model(
55+
prompt=prompt,
56+
max_tokens=kwargs.get("max_tokens", 32),
57+
stop=kwargs.get("stop", ["Q:", "\n"]),
58+
echo=kwargs.get("echo", True),
59+
temperature=kwargs.get("temperature", 0.8),
60+
top_p=kwargs.get("top_p", 0.95),
61+
top_k=kwargs.get("top_k", 40),
62+
repeat_penalty=kwargs.get("repeat_penalty", 1.1),
63+
stream=True # <--- TTFT Magic Flag
64+
)
65+
66+
generated_text = ""
67+
prefill_latency = 0.0
68+
first_token = True
69+
70+
# Iterate through the stream as the model generates it
71+
for chunk in output_stream:
72+
if first_token:
73+
prefill_latency = (time.time() - prompt_start_time) * 1000
74+
first_token = False
75+
76+
if "text" in chunk["choices"][0]:
77+
generated_text += chunk["choices"][0]["text"]
6478

6579
prompt_end_time = time.time()
6680
prompt_total_time = (prompt_end_time - prompt_start_time) * 1000 # convert to ms
@@ -69,29 +83,19 @@ def predict(self, data, input_shape=None, **kwargs):
6983
"generated_text": generated_text,
7084
"total_time": prompt_total_time,
7185
"prefill_latency": prefill_latency,
72-
"mem_usage":process.memory_info().rss,
86+
"mem_usage": process.memory_info().rss,
7387
}
7488

7589
results.append(result_with_time)
7690

77-
predict_dict = {
78-
"results": results,
79-
}
91+
return {"results": results}
8092

81-
return predict_dict
82-
83-
def _parse_timings(self, stdout_output):
84-
import re
85-
timings = {}
86-
for line in stdout_output.split('\n'):
87-
match = re.match(r'llama_print_timings:\s*(.+?)\s*=\s*([0-9\.]+)\s*ms', line)
88-
if match:
89-
key = match.group(1).strip()
90-
value = float(match.group(2))
91-
92-
key = key.lower().replace(' ', '_')
93-
timings[key] = value
94-
return timings
93+
# 2. FIXED: Optional arguments for Ianvs pipeline
94+
def postprocess(self, predict_output=None, **kwargs):
95+
"""
96+
Pass-through for prediction output.
97+
"""
98+
return predict_output
9599

96100
def evaluate(self, data, model_path=None, **kwargs):
97101
"""
@@ -125,5 +129,11 @@ def save(self, model_path):
125129
def load(self, model_url):
126130
pass
127131

132+
# 3. FIXED: Safe no-op for training pre-trained models
128133
def train(self, train_data, valid_data=None, **kwargs):
129-
return
134+
"""
135+
Dummy train method.
136+
Returns the model path to satisfy Ianvs pipeline requirements.
137+
"""
138+
logging.info("Training step bypassed: Using pre-trained weights for LLM inference.")
139+
return kwargs.get("model_path", "")

0 commit comments

Comments
 (0)