@@ -1,2 +1,57 @@
Large Language Model Edge Benchmark Suite: Implementation on KubeEdge-Ianvs
# llm-edge-benchmark-suite single_task_bench_with_compression

This guide outlines the complete setup, configuration, and execution process for running the **compression** variant of the Large Language Model (LLM) edge benchmark suite on the [Ianvs](https://github.com/kubeedge/ianvs) benchmarking framework for KubeEdge.

> **CRITICAL FIRST STEPS: ABSOLUTE PATHS & DEPENDENCIES**
> 1. **Correct all paths:** You **must** change every relative path (e.g., `models/qwen/...` or `dataset/...`) in all your `.yaml` configuration files to **absolute paths** (e.g., `/home/user/ianvs/models/qwen/...`). Ianvs will crash if it encounters relative paths (see the audit snippet below).
> 2. **Dependencies:** You must install the necessary packages via `requirements.txt` before executing any runs.
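
To catch any relative paths you may have missed, run a quick audit before the first run. The command below is a sketch: it assumes the example lives at the default repository location and that offending paths start with `models/` or `dataset/`.

```bash
# Hypothetical audit: list YAML lines that still reference relative model/dataset paths
grep -rnE '"(models|dataset)/' --include="*.yaml" \
  ianvs/examples/llm-edge-benchmark-suite/single_task_bench_with_compression/
```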

---

## Step 1: Environment Setup

First, ensure your Ianvs virtual environment is active:
```bash
source /path/to/your/ianvs_env/bin/activate
```
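
If the environment is healthy, the `ianvs` CLI should already be on your `PATH`. Assuming the CLI follows its usual argparse conventions, a quick check is:

```bash
# Should print the CLI usage text rather than "command not found"
ianvs -h
```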

Install the dependencies from `requirements.txt`:
```bash
pip install -r ianvs/examples/llm-edge-benchmark-suite/single_task_bench_with_compression/requirements.txt
```
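
Building `llama-cpp-python` is the step most likely to fail, so it is worth a quick import check before proceeding:

```bash
# Verify llama-cpp-python compiled and imports cleanly
python -c "import llama_cpp; print(llama_cpp.__version__)"
```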

---

## Step 2: Shared Model Acquisition

This benchmark shares the same `.gguf` model file as the standard suite. Ensure the `Qwen1.5-0.5B-Chat` model exists in your central models directory.

If it is missing, download it using a resumable command:
```bash
mkdir -p ianvs/models/qwen
wget -c -O ianvs/models/qwen/qwen_1_5_0_5b.gguf https://huggingface.co/Qwen/Qwen1.5-0.5B-Chat-GGUF/resolve/main/qwen1_5-0_5b-chat-q4_k_m.gguf
```
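
Before moving on, confirm the download completed. The size below is approximate and depends on the quantization:

```bash
# The Q4_K_M build of Qwen1.5-0.5B-Chat is roughly 400 MB
ls -lh ianvs/models/qwen/qwen_1_5_0_5b.gguf
```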

---

## Step 3: Configuration Alignment (Fixing the YAMLs)

You must manually update three YAML files to replace relative paths with absolute ones and to match the framework's strict naming requirements.

### 1. Test Environment (`testenv/testenv.yaml`)
Update the dataset location to an absolute path:
```yaml
dataset:
  train_data: "/home/user/ianvs/dataset/data.jsonl"  # must be an absolute path on your machine
```
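
If you prefer to script the edit, a `sed` one-liner can swap in the absolute path; the target path here is illustrative and must match your machine:

```bash
# Illustrative only: substitute your own absolute dataset path
sed -i 's#"ianvs/dataset/data.jsonl"#"/home/user/ianvs/dataset/data.jsonl"#' testenv/testenv.yaml
```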

## Step 4: Execution

Once all paths are absolute and the script is updated, execute the benchmark:

```bash
ianvs -f ianvs/examples/llm-edge-benchmark-suite/single_task_bench_with_compression/benchmarkingjob.yaml
```

### Expected Output
Ianvs will execute the benchmark and generate a `workspace` directory. You will see a successful run log and a final table detailing latency, throughput, and prefill latency.
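
The exact workspace layout depends on your `benchmarkingjob.yaml` settings, but a quick listing (hypothetical location) shows where the result and rank files typically land:

```bash
# Hypothetical: inspect the generated workspace for result and rank files
ls -R ./workspace | head -n 20
```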
@@ -0,0 +1,12 @@
# LLM Core Execution
llama-cpp-python>=0.2.20

# Machine Learning & Neural Network Basics
torch>=2.0.0
transformers>=4.35.0
numpy>=1.24.0

# Ianvs Utilities & Data Handling
pyyaml>=6.0
pandas>=2.0.0
requests>=2.31.0
@@ -1,5 +1,5 @@
algorithm:
  paradigm_type: "singletasklearning"
  mode: "with_compression"
  initial_model_url: "models/qwen/qwen_1_5_0_5b.gguf"
  quantization_type: "q8_0"
@@ -1,11 +1,11 @@
from sedna.common.class_factory import ClassFactory, ClassType
from llama_cpp import Llama
from contextlib import redirect_stderr
import os
import psutil
import time
import io
import statistics
import logging

logging.getLogger().setLevel(logging.INFO)


@ClassFactory.register(ClassType.GENERAL, alias="LlamaCppModel")
class LlamaCppModel:
Expand All @@ -16,7 +16,11 @@ def __init__(self, **kwargs):
model_path = kwargs.get("model_path")
if not model_path:
raise ValueError("Model path is required.")

quantization_type = kwargs.get("quantization_type", None)
if quantization_type:
logging.info(f"Using quantization type: {quantization_type}")
Comment thread
NishantSinghhhhh marked this conversation as resolved.

# Init LLM model
self.model = Llama(
model_path=model_path,
Expand All @@ -30,37 +34,47 @@ def __init__(self, **kwargs):
embedding=kwargs.get("embedding", False),
)

    # 1. FIXED: Optional arguments for Ianvs pipeline
    def preprocess(self, data=None, **kwargs):
        """
        Pass-through for text data.
        """
        return data

    def predict(self, data, input_shape=None, **kwargs):
        data = data[:10]  # benchmark only the first 10 prompts to keep runs short
        process = psutil.Process(os.getpid())
        start_time = time.time()

        results = []
        total_times = []
        prefill_latencies = []
        mem_usages = []

        for prompt in data:
            prompt_start_time = time.time()

            # Run model with stream=True to measure exact TTFT
            output_stream = self.model(
                prompt=prompt,
                max_tokens=kwargs.get("max_tokens", 32),
                stop=kwargs.get("stop", ["Q:", "\n"]),
                echo=kwargs.get("echo", True),
                temperature=kwargs.get("temperature", 0.8),
                top_p=kwargs.get("top_p", 0.95),
                top_k=kwargs.get("top_k", 40),
                repeat_penalty=kwargs.get("repeat_penalty", 1.1),
                stream=True,  # <--- TTFT magic flag
            )

            generated_text = ""
            prefill_latency = 0.0
            first_token = True

            # Iterate through the stream as the model generates it
            for chunk in output_stream:
                if first_token:
                    prefill_latency = (time.time() - prompt_start_time) * 1000
                    first_token = False

                if "text" in chunk["choices"][0]:
                    generated_text += chunk["choices"][0]["text"]

            prompt_end_time = time.time()
            prompt_total_time = (prompt_end_time - prompt_start_time) * 1000  # convert to ms
@@ -69,29 +83,19 @@ def predict(self, data, input_shape=None, **kwargs):
                "generated_text": generated_text,
                "total_time": prompt_total_time,
                "prefill_latency": prefill_latency,
                "mem_usage": process.memory_info().rss,
            }

            results.append(result_with_time)

        return {"results": results}

    # 2. FIXED: Optional arguments for Ianvs pipeline
    def postprocess(self, predict_output=None, **kwargs):
        """
        Pass-through for prediction output.
        """
        return predict_output

    def evaluate(self, data, model_path=None, **kwargs):
        """
@@ -125,5 +129,11 @@ def save(self, model_path):
    def load(self, model_url):
        pass

    # 3. FIXED: Safe no-op for training pre-trained models
    def train(self, train_data, valid_data=None, **kwargs):
        """
        Dummy train method.
        Returns the model path to satisfy Ianvs pipeline requirements.
        """
        logging.info("Training step bypassed: Using pre-trained weights for LLM inference.")
        return kwargs.get("model_path", "")
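
As a standalone sanity check of the streaming time-to-first-token (TTFT) technique used in `predict` above, the sketch below exercises `llama-cpp-python` directly. The model path is an assumption and the timing is machine-dependent:

```bash
# Hypothetical smoke test: measure TTFT via llama-cpp-python streaming
python - <<'EOF'
import time
from llama_cpp import Llama

llm = Llama(model_path="ianvs/models/qwen/qwen_1_5_0_5b.gguf", verbose=False)  # path is an assumption
t0 = time.time()
text = ""
for i, chunk in enumerate(llm("Q: What is edge computing? A:", max_tokens=16, stream=True)):
    if i == 0:
        # the first chunk arrives only after the whole prompt has been evaluated (prefill)
        print(f"TTFT: {(time.time() - t0) * 1000:.1f} ms")
    text += chunk["choices"][0].get("text", "")
print(text)
EOF
```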