Skip to content

Commit 5ff5046

Browse files
authored
Merge pull request #409 from NishantSinghhhhh/Restoration_single_task_bench_with_compression
[LFX Term 1 2026] Restoring LLM Edge Benchmark Suite Single Task Bench With Compression
2 parents bd88857 + 059c0d8 commit 5ff5046

4 files changed

Lines changed: 123 additions & 46 deletions

File tree

Lines changed: 56 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,57 @@
1-
Large Language Model Edge Benchmark Suite: Implementation on KubeEdge-lanvs
1+
# llm-edge-benchmark-suite single_task_bench_with_compression
22

3+
This guide outlines the complete setup, configuration, and execution process for running the compression-enabled Large Language Model (LLM) benchmarking suite using the [Ianvs](https://github.com/kubeedge/ianvs) edge computing framework.
4+
5+
> **CRITICAL FIRST STEPS: ABSOLUTE PATHS & DEPENDENCIES**
6+
> 1. **Correct all paths:** You **must** change every relative path (e.g., `models/qwen/...` or `dataset/...`) in all your `.yaml` configuration files to **absolute paths** (e.g., `/home/user/ianvs/models/qwen/...`). Ianvs will crash if it encounters relative paths.
7+
> 2. **Dependencies:** You must install the necessary packages via `requirements.txt` before executing any runs.
8+
9+
---
10+
11+
## Step 1: Environment Setup
12+
13+
First, ensure your Ianvs virtual environment is active:
14+
```bash
15+
source /path/to/your/ianvs_env/bin/activate
16+
```
17+
18+
Install the dependencies listed in `requirements.txt`:
19+
```bash
20+
pip install -r ianvs/examples/llm-edge-benchmark-suite/single_task_bench_with_compression/requirements.txt
21+
```
22+
23+
---
24+
25+
## Step 2: Shared Model Acquisition
26+
27+
This benchmark shares the same `.gguf` model file as the standard suite. Ensure the `Qwen1.5-0.5B-Chat` model exists in your central models directory.
28+
29+
If it is missing, download it using a resumable command:
30+
```bash
31+
mkdir -p /ianvs/models/qwen
32+
wget -c -O /ianvs/models/qwen/qwen_1_5_0_5b.gguf https://huggingface.co/Qwen/Qwen1.5-0.5B-Chat-GGUF/resolve/main/qwen1_5-0_5b-chat-q4_k_m.gguf
33+
```
34+
35+
---
36+
37+
## Step 3: Configuration Alignment (Fixing the YAMLs)
38+
39+
You must manually update the YAML configuration files to remove relative paths and satisfy the framework's strict naming requirements.
40+
41+
### 1. Test Environment (`testenv/testenv.yaml`)
42+
Update the dataset location to an absolute path:
43+
```yaml
44+
dataset:
45+
  train_data: "/home/user/ianvs/dataset/data.jsonl"
46+
```
47+
48+
## Step 4: Execution
49+
50+
Once all paths are absolute and the script is updated, execute the benchmark:
51+
52+
```bash
53+
ianvs -f ianvs/examples/llm-edge-benchmark-suite/single_task_bench_with_compression/benchmarkingjob.yaml
54+
```
55+
56+
### Expected Output
57+
Ianvs will execute the benchmark and generate a `workspace` directory. You will see a successful run log and a final table detailing latency, throughput, and prefill latency.
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
# LLM Core Execution
2+
llama-cpp-python>=0.2.20
3+
4+
# Machine Learning & Neural Network Basics
5+
torch>=2.0.0
6+
transformers>=4.35.0
7+
numpy>=1.24.0
8+
9+
# Ianvs Utilities & Data Handling
10+
pyyaml>=6.0
11+
pandas>=2.0.0
12+
requests>=2.31.0

examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testalgorithms/algorithm.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
algorithm:
2-
paradigm_type: "singletasklearning_with_compression"
2+
paradigm_type: "singletasklearning"
33
mode: "with_compression"
44
initial_model_url: "models/qwen/qwen_1_5_0_5b.gguf"
55
quantization_type: "q8_0"
Lines changed: 54 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
11
from sedna.common.class_factory import ClassFactory, ClassType
22
from llama_cpp import Llama
3-
from contextlib import redirect_stderr
43
import os
54
import psutil
65
import time
7-
import io
8-
import statistics
6+
import logging
7+
8+
logging.getLogger().setLevel(logging.INFO)
99

1010
@ClassFactory.register(ClassType.GENERAL, alias="LlamaCppModel")
1111
class LlamaCppModel:
@@ -16,7 +16,11 @@ def __init__(self, **kwargs):
1616
model_path = kwargs.get("model_path")
1717
if not model_path:
1818
raise ValueError("Model path is required.")
19+
1920
quantization_type = kwargs.get("quantization_type", None)
21+
if quantization_type:
22+
logging.info(f"Using quantization type: {quantization_type}")
23+
2024
# Init LLM model
2125
self.model = Llama(
2226
model_path=model_path,
@@ -30,37 +34,47 @@ def __init__(self, **kwargs):
3034
embedding=kwargs.get("embedding", False),
3135
)
3236

37+
# 1. FIXED: Optional arguments for Ianvs pipeline
38+
def preprocess(self, data=None, **kwargs):
39+
"""
40+
Pass-through for text data.
41+
"""
42+
return data
43+
3344
def predict(self, data, input_shape=None, **kwargs):
3445
data = data[:10]
3546
process = psutil.Process(os.getpid())
36-
start_time = time.time()
3747

3848
results = []
39-
total_times = []
40-
prefill_latencies = []
41-
mem_usages = []
4249

4350
for prompt in data:
4451
prompt_start_time = time.time()
4552

46-
f = io.StringIO()
47-
with redirect_stderr(f):
48-
output = self.model(
49-
prompt=prompt,
50-
max_tokens=kwargs.get("max_tokens", 32),
51-
stop=kwargs.get("stop", ["Q:", "\n"]),
52-
echo=kwargs.get("echo", True),
53-
temperature=kwargs.get("temperature", 0.8),
54-
top_p=kwargs.get("top_p", 0.95),
55-
top_k=kwargs.get("top_k", 40),
56-
repeat_penalty=kwargs.get("repeat_penalty", 1.1),
57-
)
58-
stdout_output = f.getvalue()
59-
60-
# parse timing info
61-
timings = self._parse_timings(stdout_output)
62-
prefill_latency = timings.get('prompt_eval_time', 0.0) # ms
63-
generated_text = output['choices'][0]['text']
53+
# Run model with stream=True to measure exact TTFT
54+
output_stream = self.model(
55+
prompt=prompt,
56+
max_tokens=kwargs.get("max_tokens", 32),
57+
stop=kwargs.get("stop", ["Q:", "\n"]),
58+
echo=kwargs.get("echo", True),
59+
temperature=kwargs.get("temperature", 0.8),
60+
top_p=kwargs.get("top_p", 0.95),
61+
top_k=kwargs.get("top_k", 40),
62+
repeat_penalty=kwargs.get("repeat_penalty", 1.1),
63+
stream=True # <--- TTFT Magic Flag
64+
)
65+
66+
generated_text = ""
67+
prefill_latency = 0.0
68+
first_token = True
69+
70+
# Iterate through the stream as the model generates it
71+
for chunk in output_stream:
72+
if first_token:
73+
prefill_latency = (time.time() - prompt_start_time) * 1000
74+
first_token = False
75+
76+
if "text" in chunk["choices"][0]:
77+
generated_text += chunk["choices"][0]["text"]
6478

6579
prompt_end_time = time.time()
6680
prompt_total_time = (prompt_end_time - prompt_start_time) * 1000 # convert to ms
@@ -69,29 +83,19 @@ def predict(self, data, input_shape=None, **kwargs):
6983
"generated_text": generated_text,
7084
"total_time": prompt_total_time,
7185
"prefill_latency": prefill_latency,
72-
"mem_usage":process.memory_info().rss,
86+
"mem_usage": process.memory_info().rss,
7387
}
7488

7589
results.append(result_with_time)
7690

77-
predict_dict = {
78-
"results": results,
79-
}
91+
return {"results": results}
8092

81-
return predict_dict
82-
83-
def _parse_timings(self, stdout_output):
84-
import re
85-
timings = {}
86-
for line in stdout_output.split('\n'):
87-
match = re.match(r'llama_print_timings:\s*(.+?)\s*=\s*([0-9\.]+)\s*ms', line)
88-
if match:
89-
key = match.group(1).strip()
90-
value = float(match.group(2))
91-
92-
key = key.lower().replace(' ', '_')
93-
timings[key] = value
94-
return timings
93+
# 2. FIXED: Optional arguments for Ianvs pipeline
94+
def postprocess(self, predict_output=None, **kwargs):
95+
"""
96+
Pass-through for prediction output.
97+
"""
98+
return predict_output
9599

96100
def evaluate(self, data, model_path=None, **kwargs):
97101
"""
@@ -125,5 +129,11 @@ def save(self, model_path):
125129
def load(self, model_url):
126130
pass
127131

132+
# 3. FIXED: Safe no-op for training pre-trained models
128133
def train(self, train_data, valid_data=None, **kwargs):
129-
return
134+
"""
135+
Dummy train method.
136+
Returns the model path to satisfy Ianvs pipeline requirements.
137+
"""
138+
logging.info("Training step bypassed: Using pre-trained weights for LLM inference.")
139+
return kwargs.get("model_path", "")

0 commit comments

Comments
 (0)