Merge pull request #158 from danielferr85/main

NotPunchnox · web-flow · commit 3b1d362c46d1 · 2026-06-05T11:29:28.000+02:00
Include vision capabilities for llama.cpp integration. Better error handling. Better documentation in code. Change notation for config.ini for GGUF models
diff --git a/README.md b/README.md
@@ -1,6 +1,6 @@
 # RKLLama: LLM Server and Client for Rockchip 3588/3576
 
-### [Version: 0.0.69](#New-Version)
+### [Version: 0.0.70](#New-Version)
 
 Video demo ( version 0.0.1 ):
 
@@ -533,10 +533,12 @@ The structure of a GGUF model is similar to rkllm models. You need a folder with
        └── qwen3.5-4b:q8_0
            └── model.gguf (can have any name but must end in .gguf)
            └── config.ini (optional)
+           └── mmproj.gguf (optional - can have any name, but it is recommended to include the substring 'mmproj' in the name. Must end in .gguf. Only applies to multimodal models, for vision capabilities)
+           
           
    ```
 
-   The contents of the config.ini are llama.cpp environment vars for RKNPU inference explained by the author of the fork: https://github.com/invisiofficial/rk-llama.cpp/tree/rknpu2/ggml/src/ggml-rknpu2 (RKNPU_DOMAINS variable is skipped because rkllama handles it) and llama.cpp argument for the llama-server process: https://github.com/invisiofficial/rk-llama.cpp/blob/rknpu2/tools/server/README.md
+   The contents of the config.ini are llama.cpp environment vars for RKNPU inference explained by the author of the fork: https://github.com/invisiofficial/rk-llama.cpp/tree/rknpu2/ggml/src/ggml-rknpu2 (RKNPU_DOMAINS variable is skipped because rkllama handles it) and llama.cpp arguments for the llama-server process: https://github.com/invisiofficial/rk-llama.cpp/blob/rknpu2/tools/server/README.md (You are only allowed to use arguments that start with '--'; the leading '--' may be omitted in config.ini, as in the examples below)
 
    Some examples of config.ini files:
 
@@ -549,20 +551,20 @@ The structure of a GGUF model is similar to rkllm models. You need a folder with
    RKNPU_HYBRID=W8A8_HADAMARD
 
    [ARGS]
-   --mmap =
-   --no-repack=
-   --no-warmup=
-   --cache-type-k = q8_0
-   --cache-type-v = q8_0
-   --cache-ram = 2048
-   --batch-size = 2048
-   --ubatch-size = 2048
-   --top-p       = 1.0
-   --top-k       = 0
-   --min-p       = 0.01
-   --temp        = 1.0
-   --chat-template-kwargs = {"reasoning_effort": "low"}
-   --log-file = /opt/rkllama/models/gpt-oss-20b:q8_0/llamacpp.log
+   mmap =
+   no-repack=
+   no-warmup=
+   cache-type-k = q8_0
+   cache-type-v = q8_0
+   cache-ram = 2048
+   batch-size = 2048
+   ubatch-size = 2048
+   top-p       = 1.0
+   top-k       = 0
+   min-p       = 0.01
+   temp        = 1.0
+   chat-template-kwargs = {"reasoning_effort": "low"}
+   log-file = llamacpp.log (can have any name, and can be an absolute path if you don't want logs in the same model folder)
    ```
 
 
@@ -573,20 +575,20 @@ The structure of a GGUF model is similar to rkllm models. You need a folder with
    RKNPU_HYBRID=W4A4_HADAMARD
 
    [ARGS]
-   --mmap =
-   --no-repack=
-   --no-warmup=
-   --cache-type-k = q8_0
-   --cache-type-v = q8_0
-   --cache-ram = 2048
-   --batch-size = 2048
-   --ubatch-size = 2048
-   --ctx-size = 65536
-   --predict = 2048
-   --top-p       = 0.95
-   --top-k       = 64
-   --temp        = 1.0
-   --log-file = /home/orangepi/github/danielferr85/rkllama/gemma-4-26b-a4b-it:ud-iq4_xs/llamacpp.log
+   mmap =
+   no-repack=
+   no-warmup=
+   cache-type-k = q8_0
+   cache-type-v = q8_0
+   cache-ram = 2048
+   batch-size = 2048
+   ubatch-size = 2048
+   ctx-size = 65536
+   predict = 2048
+   top-p       = 0.95
+   top-k       = 64
+   temp        = 1.0
+   log-file = llamacpp.log (can have any name, and can be an absolute path if you don't want logs in the same model folder)
    ```
 
 
@@ -597,15 +599,16 @@ The structure of a GGUF model is similar to rkllm models. You need a folder with
    RKNPU_HYBRID=W8A8_STANDARD
 
    [ARGS]
-   --mmap =
-   --no-repack=
-   --no-warmup=
-   --cache-type-k = q8_0
-   --cache-type-v = q8_0
-   --cache-ram = 2048
-   --batch-size = 2048
-   --ubatch-size = 2048
-   --log-file = /home/orangepi/github/danielferr85/rkllama/qwen3.5-4b:q8_0/llamacpp.log
+   mmap =
+   no-repack=
+   no-warmup=
+   cache-type-k = q8_0
+   cache-type-v = q8_0
+   cache-ram = 2048
+   batch-size = 2048
+   ubatch-size = 2048
+   mmproj = mmproj-F16.gguf (projector for vision capabilities for the model)
+   log-file = llamacpp.log (can have any name, and can be an absolute path if you don't want logs in the same model folder)
    ```
 
    For qwen3.5 follow the recomendations:
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "rkllama"
-version = "0.0.69"
+version = "0.0.70"
 authors = [
     { name="NotPunchnox", email="punchnoxpro@gmail.com" },
     { name="TomJacobsUK", email="tom@tomjacobs.co.uk" },
diff --git a/src/rkllama/api/model_utils.py b/src/rkllama/api/model_utils.py
@@ -5,6 +5,7 @@
 from pathlib import Path
 import rkllama.config
 import time
+import configparser
 
 # Configure logger
 logger = logging.getLogger("rkllama.model_utils")
@@ -569,10 +570,22 @@ def get_gguf_model_path(model_name) -> str:
     
     # Search for the GGUF files
     if os.path.isdir(model_path):
+
+        # Read the config for the GGUF model for llama.cpp (if exists)
+        config_file = os.path.join(model_path, "config.ini")
+        configuration = configparser.ConfigParser()
+        configuration.read(config_file)
+
+        # Read possible projector for vision models
+        expected_mmproj_subname = "mmproj"
+        if configuration is not None and "ARGS" in configuration.keys() and any(x in configuration["ARGS"].keys() for x in ["mmproj","--mmproj"]):
+            expected_mmproj_subname = configuration["ARGS"]["--mmproj"] if "--mmproj" in configuration["ARGS"].keys() else configuration["ARGS"]["mmproj"]
+
+        # Loop over the files in model directory
         for root, dirs, files in os.walk(model_path):
             for file in files:
                 file_path = os.path.join(root, file)
-                if file_path.lower().endswith(".gguf"):
+                if file_path.lower().endswith(".gguf") and expected_mmproj_subname.lower() not in file_path.lower(): # Prevent return projector
                     # return the file
                     return file_path
 
@@ -641,6 +654,7 @@ def wait_for_service(
     Wait until an HTTP service becomes available.
 
     Parameters:
+        process (subprocess.Popen): Popen process to check status
         url (str): URL to check.
         timeout (float): Requests timeout in seconds.
         interval (float): Seconds to wait between retry attempts.
@@ -671,8 +685,8 @@ def wait_for_service(
                 stdout, _ = process.communicate()
 
                 # Kill the process
-                server_process.kill()
-                server_process.wait(timeout=5)
+                process.kill()
+                process.wait(timeout=5)
 
                 # Check if insufficient memory in the current domain
                 if "RKNPU ERROR: Out of memory in allowed IOMMU domains" in stdout: 
@@ -687,6 +701,11 @@ def wait_for_service(
             # requests.get() waits for the server response unless a timeout is set [InlineCitation-1-Guide to Handling Python Requests Timeout](https://oxylabs.io/blog/python-requests-timeout)
             response = requests.get(url, timeout=timeout)
             if response.status_code == expected_status:
+
+                # Wait for warm up subprocess to prevent error: 
+                # requests.exceptions.ConnectionError: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')) 
+                logger.debug(f"Waiting to finish warmup subprocess for llama-server...")
+                time.sleep(5)
                 return True, None
             
         except requests.RequestException:
@@ -697,8 +716,8 @@ def wait_for_service(
             logger.error(f"Timeout waiting for llama-server process to start....")
             
             # Kill the process
-            server_process.kill()
-            server_process.wait(timeout=5)
+            process.kill()
+            process.wait(timeout=5)
 
             # Return not initiated
             return False, False
diff --git a/src/rkllama/api/worker.py b/src/rkllama/api/worker.py
@@ -374,6 +374,8 @@ def run_llama_cpp_model_server(model_name, gguf_model_dir, gguf_model_path, port
     """
     Run an instance of llama.cpp for the required model program using subprocess.run.
     Args:
+        model_name (str): Model Name to load
+        gguf_model_dir (str): Directory where the model resides 
         gguf_model_path (str): GGUF Model to load
         port (int): port to assign to the llama.cpp model
         base_domain_id (int): Domain to execute the llama.cpp server
@@ -391,7 +393,7 @@ def run_llama_cpp_model_server(model_name, gguf_model_dir, gguf_model_path, port
         configuration = configparser.ConfigParser()
         configuration.read(config_file)
 
-        # Read custom environment vars
+        # Read custom environment vars to llama.cpp
         rk_llama_cpp_env = { "RKNPU_DOMAINS": f"{','.join(str(base_domain_id))}"}
         if configuration is not None and "ENV" in configuration.keys():
             for var in configuration["ENV"].keys():
@@ -409,14 +411,26 @@ def run_llama_cpp_model_server(model_name, gguf_model_dir, gguf_model_path, port
         cmd = ["taskset" ,"--cpu-list", cpu , os.path.join(rkllama.config.get_path("llamacpp"), "llama-server"), 
               "--model" , gguf_model_path, 
               "--port" , str(port), 
-              "--threads" , "4"]
+              "--threads" , "4"] # Defauul 4 threads
         
         # Read custom arguments to llama.cpp
         if configuration is not None and "ARGS" in configuration.keys():
             for arg in configuration["ARGS"].keys():
                 arg_value = configuration["ARGS"][arg]
+                
+                # Preparing format of arguments for llama-server
+                if not arg.startswith("--"):
+                    logger.debug(f"Adding -- to the start of the argument '{arg}'")
+                    arg = f"--{arg}"
+
                 # Check that argments are not the required calculated by rkllama
-                if arg not in ["-m", "--port", "--model", "--cpu-list"]:
+                if arg not in ["--port", "--model", "--cpu-list"]:
+
+                    # Checking whether the projector or log-file is given as a relative path in config
+                    if arg in ["--mmproj","--log-file"] and not arg_value.startswith("/"):
+                        logger.debug(f"Adding model directory to argument '{arg}' because currently relative path specified '{arg_value}'")
+                        arg_value = os.path.join(gguf_model_dir, arg_value)
+
                     logger.debug(f"Adding custom argument to llama.cpp '{arg}' with value '{arg_value}'")
                     cmd.append(arg)
                     if arg_value is not None and arg_value:
@@ -432,10 +446,6 @@ def run_llama_cpp_model_server(model_name, gguf_model_dir, gguf_model_path, port
             text=True,
         )
 
-        # Wait for warm up subprocess to prevent error: 
-        # requests.exceptions.ConnectionError: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')) 
-        time.sleep(3) # 3 seconds
-
         # Waiting for service up
         logger.debug(f"Waiting for model {gguf_model_path} Up and running...")
         initialized, need_more_iommu_domains = wait_for_service(server_process, f"http://localhost:{port}/v1/models", max_wait = int(rkllama.config.get('model', 'max_seconds_waiting_worker_response')))
@@ -1515,6 +1525,7 @@ def create_worker_process(self, base_domain_ids, model_path, model_dir, options=
                         # Create the process
                         logger.debug(f"Trying to create the process for the worker for model {self.worker_model_info.model} with IOMMU domains {domains_assigned}")
                         self.process = run_llama_cpp_model_server(self.worker_model_info.model, model_dir,model_path,self.worker_model_info.llama_cpp_port,domains_assigned)
+                        # Check if the load failed because more base domains are needed
                         if isinstance(self.process, int) and self.process == -1:
                             # More base domains needed
                             continue
diff --git a/src/rkllama/server/server.py b/src/rkllama/server/server.py
@@ -1337,7 +1337,7 @@ def embeddings_ollama():
 def ollama_version():
     """Return a dummy version to be compatible with Ollama clients"""
     return jsonify({
-        "version": "0.0.69"
+        "version": "0.0.70"
     }), 200
 
 
@@ -1573,7 +1573,8 @@ def forward_request_to_llama_cpp_worker(is_openai_request,request):
     Route to llama.cpp worker for inference for GGUF models
 
     Args:
-        request : Original request to forward
+        is_openai_request (bool): True for OpenAI-style requests; when False, streamed chunks are converted to Ollama format
+        request (dic): Original request to forward
     """
 
     # Check if llama.cpp directory exists defined
@@ -1627,36 +1628,50 @@ def forward_request_to_llama_cpp_worker(is_openai_request,request):
 
             def generate():
             
-                # Make the call to the llama-server with stream enable
-                with requests.post(
-                    proxy_route_url,
-                    json=data,
-                    headers=headers,
-                    timeout=120,
-                    stream=stream
-                ) as response:
+                try:
+                    # Make the call to the llama-server with stream enable
+                    with requests.post(
+                        proxy_route_url,
+                        json=data,
+                        headers=headers,
+                        timeout=120,
+                        stream=stream
+                    ) as response:
+
+                        # Check for error codes
+                        try:
+                            response.raise_for_status()
+                        except requests.HTTPError as e:
+                            error = f"OpenAI API error for model'{model_name}': {str(e)}"
+                            logger.error(error)
+                            return jsonify({"error": error}), 500
+                            
+                        
+                        # Create a converter to Ollama (if needed)
+                        converter = OpenAIToOllamaStreamConverter()
 
-                    # Check for error codes
-                    try:
-                        response.raise_for_status()
-                    except requests.HTTPError as e:
-                        raise RuntimeError(f"OpenAI API error: {e}") from e
-                    
-                    # Create a converter to Ollama (if needed)
-                    converter = OpenAIToOllamaStreamConverter()
-
-                    # Loop over the chunks returned by llama.cpp
-                    for line in response.iter_lines():
-                        # Decode the bytes line 
-                        line = line.decode("utf-8")
-                       
-                        if not is_openai_request: # Ollama
-                            for chunk in converter.process_line(line):
-                                yield json.dumps(chunk) + "\n"
-                        else: # OpenAI
-                            # Return the chunk to the client of the request
-                            yield f"{line}\n"
+                        # Loop over the chunks returned by llama.cpp
+                        for line in response.iter_lines():
+                            # Decode the bytes line 
+                            line = line.decode("utf-8")
                         
+                            if not is_openai_request: # Ollama
+                                for chunk in converter.process_line(line):
+                                    yield json.dumps(chunk) + "\n"
+                            else: # OpenAI
+                                # Return the chunk to the client of the request
+                                yield f"{line}\n"
+                except Exception as e:
+                    # Log the error
+                    logger.error(f"Streaming error: {e}", exc_info=True)
+                    # Send a JSON error message instead of breaking the stream
+                    yield json.dumps({
+                        "error": {
+                            "code": "INTERNAL_ERROR",
+                            "message": str(e)
+                        }
+                    }) + "\n"    
+
             logger.debug("Making the streaming call to llama-server...")
             return Response(stream_with_context(generate()), mimetype="text/event-stream") # OpenAI