Merge pull request #2792 from danielaskdd/fix/cypher-injection-workspace-label

danielaskdd · web-flow · commit 4dd7579b91f1 · 2026-03-18T15:44:01.000+08:00
fix(api): sanitize workspace from CLI args and HTTP headers to prevent injection
diff --git a/examples/modalprocessors_example.py b/examples/modalprocessors_example.py
@@ -19,11 +19,8 @@
 
 
 def get_llm_model_func(api_key: str, base_url: str = None):
-    return (
-        lambda prompt,
-        system_prompt=None,
-        history_messages=[],
-        **kwargs: openai_complete_if_cache(
+    return lambda prompt, system_prompt=None, history_messages=[], **kwargs: (
+        openai_complete_if_cache(
             "gpt-4o-mini",
             prompt,
             system_prompt=system_prompt,
@@ -41,41 +38,45 @@ def get_vision_model_func(api_key: str, base_url: str = None):
         system_prompt=None,
         history_messages=[],
         image_data=None,
-        **kwargs: openai_complete_if_cache(
-            "gpt-4o",
-            "",
-            system_prompt=None,
-            history_messages=[],
-            messages=[
-                {"role": "system", "content": system_prompt} if system_prompt else None,
-                {
-                    "role": "user",
-                    "content": [
-                        {"type": "text", "text": prompt},
-                        {
-                            "type": "image_url",
-                            "image_url": {
-                                "url": f"data:image/jpeg;base64,{image_data}"
+        **kwargs: (
+            openai_complete_if_cache(
+                "gpt-4o",
+                "",
+                system_prompt=None,
+                history_messages=[],
+                messages=[
+                    {"role": "system", "content": system_prompt}
+                    if system_prompt
+                    else None,
+                    {
+                        "role": "user",
+                        "content": [
+                            {"type": "text", "text": prompt},
+                            {
+                                "type": "image_url",
+                                "image_url": {
+                                    "url": f"data:image/jpeg;base64,{image_data}"
+                                },
                             },
-                        },
-                    ],
-                }
-                if image_data
-                else {"role": "user", "content": prompt},
-            ],
-            api_key=api_key,
-            base_url=base_url,
-            **kwargs,
-        )
-        if image_data
-        else openai_complete_if_cache(
-            "gpt-4o-mini",
-            prompt,
-            system_prompt=system_prompt,
-            history_messages=history_messages,
-            api_key=api_key,
-            base_url=base_url,
-            **kwargs,
+                        ],
+                    }
+                    if image_data
+                    else {"role": "user", "content": prompt},
+                ],
+                api_key=api_key,
+                base_url=base_url,
+                **kwargs,
+            )
+            if image_data
+            else openai_complete_if_cache(
+                "gpt-4o-mini",
+                prompt,
+                system_prompt=system_prompt,
+                history_messages=history_messages,
+                api_key=api_key,
+                base_url=base_url,
+                **kwargs,
+            )
         )
     )
 
@@ -178,14 +179,16 @@ async def initialize_rag(api_key: str, base_url: str = None):
         llm_model_func=lambda prompt,
         system_prompt=None,
         history_messages=[],
-        **kwargs: openai_complete_if_cache(
-            "gpt-4o-mini",
-            prompt,
-            system_prompt=system_prompt,
-            history_messages=history_messages,
-            api_key=api_key,
-            base_url=base_url,
-            **kwargs,
+        **kwargs: (
+            openai_complete_if_cache(
+                "gpt-4o-mini",
+                prompt,
+                system_prompt=system_prompt,
+                history_messages=history_messages,
+                api_key=api_key,
+                base_url=base_url,
+                **kwargs,
+            )
         ),
     )
 
diff --git a/lightrag/api/config.py b/lightrag/api/config.py
@@ -3,6 +3,7 @@
 """
 
 import os
+import re
 import argparse
 import logging
 from dotenv import load_dotenv
@@ -461,6 +462,17 @@ def parse_args() -> argparse.Namespace:
     ollama_server_infos.LIGHTRAG_NAME = args.simulated_model_name
     ollama_server_infos.LIGHTRAG_TAG = args.simulated_model_tag
 
+    # Sanitize workspace: only alphanumeric characters and underscores are allowed
+    if args.workspace:
+        sanitized = re.sub(r"[^a-zA-Z0-9_]", "_", args.workspace)
+        if sanitized != args.workspace:
+            logging.warning(
+                f"Workspace name '{args.workspace}' contains invalid characters. "
+                f"It has been sanitized to '{sanitized}'. "
+                "Only alphanumeric characters and underscores are allowed."
+            )
+            args.workspace = sanitized
+
     return args
 
 
diff --git a/lightrag/api/lightrag_server.py b/lightrag/api/lightrag_server.py
@@ -10,6 +10,7 @@
     get_swagger_ui_oauth2_redirect_html,
 )
 import os
+import re
 import logging
 import logging.config
 import sys
@@ -478,6 +479,14 @@ def get_workspace_from_request(request: Request) -> str | None:
 
         if not workspace:
             workspace = None
+        else:
+            sanitized = re.sub(r"[^a-zA-Z0-9_]", "_", workspace)
+            if sanitized != workspace:
+                logger.warning(
+                    f"Workspace header '{workspace}' contains invalid characters. "
+                    f"Sanitized to '{sanitized}'."
+                )
+                workspace = sanitized
 
         return workspace
 
diff --git a/lightrag/kg/memgraph_impl.py b/lightrag/kg/memgraph_impl.py
@@ -59,8 +59,18 @@ def __init__(self, namespace, global_config, embedding_func, workspace=None):
         self._driver = None
 
     def _get_workspace_label(self) -> str:
-        """Return workspace label (guaranteed non-empty during initialization)"""
-        return self.workspace
+        """Return sanitized workspace label safe for use as a backtick-quoted identifier in Cypher queries.
+
+        Escapes backticks by doubling them to prevent Cypher injection
+        via the LIGHTRAG-WORKSPACE header, while preserving a 1-to-1 mapping
+        for all other characters. The returned value is intended to be used
+        inside backticks (for example, MATCH (n:`{label}`)) and is not
+        validated as a standalone unquoted identifier.
+        """
+        workspace = self.workspace.strip()
+        if not workspace:
+            return "base"
+        return workspace.replace("`", "``")
 
     async def initialize(self):
         async with get_data_init_lock():
diff --git a/lightrag/kg/neo4j_impl.py b/lightrag/kg/neo4j_impl.py
@@ -91,8 +91,18 @@ def __init__(self, namespace, global_config, embedding_func, workspace=None):
         self._driver = None
 
     def _get_workspace_label(self) -> str:
-        """Return workspace label (guaranteed non-empty during initialization)"""
-        return self.workspace
+        """Return sanitized workspace label safe for use as a backtick-quoted identifier in Cypher queries.
+
+        Escapes backticks by doubling them to prevent Cypher injection
+        via the LIGHTRAG-WORKSPACE header, while preserving a 1-to-1 mapping
+        for all other characters. The returned value is intended to be used
+        inside backticks (for example, MATCH (n:`{label}`)) and is not
+        validated as a standalone unquoted identifier.
+        """
+        workspace = self.workspace.strip()
+        if not workspace:
+            return "base"
+        return workspace.replace("`", "``")
 
     def _normalize_index_suffix(self, workspace_label: str) -> str:
         """Normalize workspace label for safe use in index names."""
diff --git a/tests/test_aquery_data_endpoint.py b/tests/test_aquery_data_endpoint.py
@@ -605,7 +605,7 @@ def print_query_results(data: Dict[str, Any]):
             file_path = entity.get("file_path", "Unknown source")
             reference_id = entity.get("reference_id", "No reference")
 
-            print(f"   {i+1}. {entity_name} ({entity_type})")
+            print(f"   {i + 1}. {entity_name} ({entity_type})")
             print(
                 f"      Description: {description[:100]}{'...' if len(description) > 100 else ''}"
             )
@@ -624,7 +624,7 @@ def print_query_results(data: Dict[str, Any]):
             file_path = rel.get("file_path", "Unknown source")
             reference_id = rel.get("reference_id", "No reference")
 
-            print(f"   {i+1}. {src} → {tgt}")
+            print(f"   {i + 1}. {src} → {tgt}")
             print(f"      Keywords: {keywords}")
             print(
                 f"      Description: {description[:100]}{'...' if len(description) > 100 else ''}"
@@ -642,7 +642,7 @@ def print_query_results(data: Dict[str, Any]):
             chunk_id = chunk.get("chunk_id", "Unknown ID")
             reference_id = chunk.get("reference_id", "No reference")
 
-            print(f"   {i+1}. Text chunk ID: {chunk_id}")
+            print(f"   {i + 1}. Text chunk ID: {chunk_id}")
             print(f"      Source: {file_path}")
             print(f"      Reference ID: {reference_id}")
             print(
@@ -656,7 +656,7 @@ def print_query_results(data: Dict[str, Any]):
         for i, ref in enumerate(references):
             reference_id = ref.get("reference_id", "Unknown ID")
             file_path = ref.get("file_path", "Unknown source")
-            print(f"   {i+1}. Reference ID: {reference_id}")
+            print(f"   {i + 1}. Reference ID: {reference_id}")
             print(f"      File Path: {file_path}")
             print()
 
diff --git a/tests/test_lightrag_ollama_chat.py b/tests/test_lightrag_ollama_chat.py
@@ -714,7 +714,7 @@ async def run_concurrent_requests():
 
             for i, result in enumerate(results):
                 if isinstance(result, Exception):
-                    error_messages.append(f"Request {i+1} failed: {str(result)}")
+                    error_messages.append(f"Request {i + 1} failed: {str(result)}")
                 else:
                     success_results.append((i + 1, result))
 
diff --git a/tests/test_qdrant_migration.py b/tests/test_qdrant_migration.py
@@ -80,8 +80,8 @@ async def test_qdrant_migration_trigger(mock_qdrant_client, mock_embedding_func)
 
     # Setup mocks for migration scenario
     # 1. New collection does not exist, only legacy exists
-    mock_qdrant_client.collection_exists.side_effect = (
-        lambda name: name == legacy_collection
+    mock_qdrant_client.collection_exists.side_effect = lambda name: (
+        name == legacy_collection
     )
 
     # 2. Legacy collection exists and has data
@@ -173,8 +173,8 @@ async def test_qdrant_no_migration_needed(mock_qdrant_client, mock_embedding_fun
     )
 
     # Only new collection exists (no legacy collection found)
-    mock_qdrant_client.collection_exists.side_effect = (
-        lambda name: name == storage.final_namespace
+    mock_qdrant_client.collection_exists.side_effect = lambda name: (
+        name == storage.final_namespace
     )
 
     # Initialize
@@ -285,8 +285,8 @@ async def test_scenario_2_legacy_upgrade_migration(
     new_collection = storage.final_namespace
 
     # Case 4: Only legacy collection exists
-    mock_qdrant_client.collection_exists.side_effect = (
-        lambda name: name == legacy_collection
+    mock_qdrant_client.collection_exists.side_effect = lambda name: (
+        name == legacy_collection
     )
 
     # Mock legacy collection info with 1536d vectors
@@ -454,10 +454,13 @@ async def test_case1_empty_legacy_auto_cleanup(mock_qdrant_client, mock_embeddin
     new_collection = storage.final_namespace
 
     # Mock: Both collections exist
-    mock_qdrant_client.collection_exists.side_effect = lambda name: name in [
-        legacy_collection,
-        new_collection,
-    ]
+    mock_qdrant_client.collection_exists.side_effect = lambda name: (
+        name
+        in [
+            legacy_collection,
+            new_collection,
+        ]
+    )
 
     # Mock: Legacy collection is empty (0 records)
     def count_mock(collection_name, exact=True, count_filter=None):
@@ -520,10 +523,13 @@ async def test_case1_nonempty_legacy_warning(mock_qdrant_client, mock_embedding_
     new_collection = storage.final_namespace
 
     # Mock: Both collections exist
-    mock_qdrant_client.collection_exists.side_effect = lambda name: name in [
-        legacy_collection,
-        new_collection,
-    ]
+    mock_qdrant_client.collection_exists.side_effect = lambda name: (
+        name
+        in [
+            legacy_collection,
+            new_collection,
+        ]
+    )
 
     # Mock: Legacy collection has data (50 records)
     def count_mock(collection_name, exact=True, count_filter=None):
diff --git a/tests/test_workspace_isolation.py b/tests/test_workspace_isolation.py
@@ -222,7 +222,7 @@ async def test_lock_mechanism(stress_test_mode, parallel_workers):
     # Support stress testing with configurable number of workers
     num_workers = parallel_workers if stress_test_mode else 3
     parallel_workload = [
-        (f"ws_{chr(97+i)}", f"ws_{chr(97+i)}", "test_namespace")
+        (f"ws_{chr(97 + i)}", f"ws_{chr(97 + i)}", "test_namespace")
         for i in range(num_workers)
     ]
 
@@ -491,7 +491,7 @@ async def use_shared_lock(coroutine_id):
 
     print("✅ PASSED: NamespaceLock Concurrent Reuse")
     print(
-        f"   Same NamespaceLock instance used successfully in {expected_entries//2} concurrent coroutines"
+        f"   Same NamespaceLock instance used successfully in {expected_entries // 2} concurrent coroutines"
     )
 
 
diff --git a/tests/test_workspace_sanitization.py b/tests/test_workspace_sanitization.py

@@ -222,7 +222,7 @@ async def test_lock_mechanism(stress_test_mode, parallel_workers):
     # Support stress testing with configurable number of workers
     num_workers = parallel_workers if stress_test_mode else 3
     parallel_workload = [
-        (f"ws_{chr(97+i)}", f"ws_{chr(97+i)}", "test_namespace")
+        (f"ws_{chr(97 + i)}", f"ws_{chr(97 + i)}", "test_namespace")
         for i in range(num_workers)
     ]
 
@@ -491,7 +491,7 @@ async def use_shared_lock(coroutine_id):
 
     print("✅ PASSED: NamespaceLock Concurrent Reuse")
     print(
-        f"   Same NamespaceLock instance used successfully in {expected_entries//2} concurrent coroutines"
+        f"   Same NamespaceLock instance used successfully in {expected_entries // 2} concurrent coroutines"
     )
 
 