2 changes: 1 addition & 1 deletion .github/workflows/tests.yml
@@ -34,7 +34,7 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install -e ".[api]"
pip install -e ".[api,offline-storage]"
pip install pytest pytest-asyncio

- name: Run offline tests
91 changes: 91 additions & 0 deletions docs/RAGAnythingParserAlignment.md
@@ -0,0 +1,91 @@
## RAG-Anything Parser Alignment Notes

This document summarizes the companion changes made on the `RAG-Anything` side to better align its parser output with the LightRAG multimodal pipeline introduced in this PR.

These notes are provided as reviewer context. The code changes described below live in the `RAG-Anything` repository, mainly in `raganything/parser.py`, rather than in this LightRAG pull request.

## Why This Alignment Was Needed

The LightRAG-side pipeline in this PR expects parser output to preserve heading structure, normalize multimodal block types consistently, and expose enough table metadata to generate correct LightRAG document sidecars.

Without the parser-side alignment, several downstream issues become more likely:

- section headings may be lost before LightRAG sidecar generation
- table dimensions can degrade to `[0, 0]`
- table content may be harder to serialize into stable sidecar payloads
- parser output shape may drift between Docling variants

## RAG-Anything Changes

### 1. Add safe helper functions for parser normalization

Two small helpers were added:

- `_to_int(value, default=0)`
- `_grid_to_rows(grid)`

They make parsing more defensive and the output more consistent when Docling returns numeric fields or table cell structures in slightly varying formats.

### 2. Normalize text labels before branching

Docling text blocks are now normalized through:

- `label = str(block.get("label", "")).strip().lower()`

This avoids relying on a raw case-sensitive label and makes formula / title / section-header detection more stable.
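For instance, a normalized label lets the branching tolerate casing and whitespace variation. The sketch below is illustrative only; `classify_block` is a hypothetical name, not a function in the parser.

```python
def classify_block(block):
    # Normalize first, then branch on the canonical lowercase label.
    label = str(block.get("label", "")).strip().lower()
    if label in ("section_header", "title"):
        return "heading"
    if label == "formula":
        return "formula"
    return "text"
```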

### 3. Preserve section heading structure explicitly

For Docling text blocks, `section_header` and `title` are now emitted as dedicated structured blocks:

- `type: "section_header"` or `type: "title"`
- `text`
- `level`
- `page_idx`

This is important because the LightRAG-side conversion logic uses heading information to:

- propagate `heading`
- build `parent_headings`
- keep multimodal sidecars attached to the correct section context
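A dedicated heading block with the fields listed above could be assembled roughly as follows. This is a hypothetical shape; the exact emission code in `raganything/parser.py` may differ.

```python
def make_heading_block(block, page_idx):
    # Emit a structured heading block instead of a generic text block.
    label = str(block.get("label", "")).strip().lower()
    return {
        "type": "section_header" if label == "section_header" else "title",
        "text": str(block.get("text", "")).strip(),
        "level": int(block.get("level", 1) or 1),
        "page_idx": page_idx,
    }
```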

### 4. Preserve label and level on normal text blocks

For non-heading text blocks, the parser now also retains:

- `label`
- `level`

This gives LightRAG more context when converting parser output into LightRAG document blocks and helps preserve document structure more faithfully.
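A non-heading text block that retains these fields might look like this (hypothetical shape; the exact field set in `raganything/parser.py` may differ):

```python
def make_text_block(block, page_idx):
    # Keep label and level alongside the text so downstream conversion
    # can still reason about document structure.
    return {
        "type": "text",
        "text": str(block.get("text", "")).strip(),
        "label": str(block.get("label", "")).strip().lower(),
        "level": block.get("level"),
        "page_idx": page_idx,
    }
```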

### 5. Improve table normalization for Docling output

Table parsing was expanded to support both:

- dict-style table payloads with `grid`, `num_rows`, `num_cols`
- legacy list-style table payloads

The parser now derives and exposes:

- `table_body`
- `rows`
- `num_rows`
- `num_cols`

This is the key alignment needed for LightRAG-side table sidecar generation, especially to avoid empty dimensions and to keep table content serializable in a stable form.
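The dual-format handling can be sketched as below. This is hypothetical code, not the real parser logic; `_to_int` and `_grid_to_rows` stand in for the helpers described earlier, and `normalize_table` is an assumed name.

```python
def _to_int(value, default=0):
    try:
        return int(value)
    except (TypeError, ValueError):
        return default


def _grid_to_rows(grid):
    return [
        [str(c.get("text", "")) if isinstance(c, dict) else str(c) for c in row]
        for row in (grid or [])
    ]


def normalize_table(payload):
    if isinstance(payload, dict):
        # dict-style payload: grid plus explicit num_rows / num_cols,
        # with dimensions recomputed from the grid as a fallback
        rows = _grid_to_rows(payload.get("grid"))
        num_rows = _to_int(payload.get("num_rows"), len(rows))
        num_cols = _to_int(payload.get("num_cols"), len(rows[0]) if rows else 0)
    else:
        # legacy list-style payload: already rows of cells
        rows = [[str(c) for c in row] for row in (payload or [])]
        num_rows = len(rows)
        num_cols = len(rows[0]) if rows else 0
    table_body = "\n".join(" | ".join(r) for r in rows)
    return {"table_body": table_body, "rows": rows,
            "num_rows": num_rows, "num_cols": num_cols}
```

Deriving dimensions from the grid when the explicit fields are missing is what prevents the `[0, 0]` degradation mentioned above.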

## Practical Impact on This PR

These RAG-Anything parser changes are the external counterpart of the LightRAG work in this PR:

- LightRAG now converts structured parser output into LightRAG document artifacts
- multimodal sidecars depend on parser-side heading and table metadata
- heading propagation and table dimension fixes are more reliable when the parser emits normalized structure upstream

In short, the LightRAG code in this PR can run independently, but the best end-to-end behavior for Docling/RAG-Anything-driven multimodal ingestion depends on this parser alignment on the `RAG-Anything` side as well.

## Scope Note

This document is intentionally limited to parser-alignment notes for `RAG-Anything`.

It does not describe the entity disambiguation experiment, which is explicitly excluded from this PR.
85 changes: 85 additions & 0 deletions env.example
@@ -207,6 +207,46 @@ SUMMARY_LANGUAGE=English
### Maximum token size allowed for entity extraction input context
# MAX_EXTRACT_INPUT_TOKENS=20480

### Use JSON structured output for entity extraction (default: true)
# ENTITY_EXTRACTION_USE_JSON=true

### Multimodal parsing/analyze integration
### Optional parser routing rules, for example:
### pdf:mineru-iet,docx:docling,pptx:docling,*:native
# LIGHTRAG_PARSER=
### Optional local checkout path of RAG-Anything for parser integration
# RAGANYTHING_ROOT=/path/to/RAG-Anything
### Retry count for multimodal VLM analysis JSON normalization/writeback
# VLM_ANALYZE_RETRIES=2
### Maximum image bytes sent to VLM per multimodal item
# VLM_MAX_IMAGE_BYTES=5242880

### Async parser service protocol (optional)
### Configure these when using remote MinerU/Docling async services
# MINERU_ENDPOINT=http://localhost:8000/api/v1/task
# MINERU_POLL_ENDPOINT=http://localhost:8000/api/v1/task/{trace_id}
# MINERU_POLL_METHOD=GET
# MINERU_ID_FIELD=trace_id
# MINERU_STATUS_FIELD=status
# MINERU_RESULT_URL_FIELD=result_url
# MINERU_CONTENT_FIELD=content
# MINERU_SUCCESS_VALUES=done,success,completed
# MINERU_FAILED_VALUES=failed,error,cancelled
# MINERU_POLL_INTERVAL_SECONDS=2
# MINERU_MAX_POLLS=180

# DOCLING_ENDPOINT=http://localhost:8081/v1/convert/file/async
# DOCLING_POLL_ENDPOINT=http://localhost:8081/v1/convert/file/async/{task_id}
# DOCLING_POLL_METHOD=GET
# DOCLING_ID_FIELD=task_id
# DOCLING_STATUS_FIELD=status
# DOCLING_RESULT_URL_FIELD=result_url
# DOCLING_CONTENT_FIELD=content
# DOCLING_SUCCESS_VALUES=done,success,completed
# DOCLING_FAILED_VALUES=failed,error,cancelled
# DOCLING_POLL_INTERVAL_SECONDS=2
# DOCLING_MAX_POLLS=180

### control the maximum chunk_ids stored in vector and graph db
# MAX_SOURCE_IDS_PER_ENTITY=300
# MAX_SOURCE_IDS_PER_RELATION=300
@@ -228,6 +268,14 @@ SUMMARY_LANGUAGE=English
MAX_ASYNC=4
### Number of parallel processing documents(between 2~10, MAX_ASYNC/3 is recommended)
MAX_PARALLEL_INSERT=2
### Optional per-stage document pipeline concurrency
# MAX_PARALLEL_PARSE_NATIVE=5
# MAX_PARALLEL_PARSE_MINERU=3
# MAX_PARALLEL_PARSE_DOCLING=3
# MAX_PARALLEL_ANALYZE=2
### Optional queue sizes for staged pipeline workers
# QUEUE_SIZE_DEFAULT=100
# QUEUE_SIZE_INSERT=4
### Max concurrency requests for Embedding
# EMBEDDING_FUNC_MAX_ASYNC=8
### Num of chunks send to Embedding in single request
@@ -337,6 +385,43 @@ OLLAMA_LLM_NUM_CTX=32768
# AWS_REGION=us-east-1
# BEDROCK_LLM_TEMPERATURE=1.0

###########################################################################
### Optional role-specific LLM/VLM overrides
### If unset, each role falls back to the base LLM_* configuration above.
### Available roles: EXTRACT, KEYWORD, QUERY, VLM
###########################################################################
### Example: use a dedicated model/provider for entity extraction
# EXTRACT_LLM_BINDING=openai
# EXTRACT_LLM_MODEL=your_extract_model
# EXTRACT_LLM_BINDING_HOST=https://api.example.com/v1
# EXTRACT_LLM_BINDING_API_KEY=your_extract_api_key
# MAX_ASYNC_EXTRACT_LLM=4
# LLM_TIMEOUT_EXTRACT_LLM=180

### Example: use a dedicated model/provider for keyword extraction
# KEYWORD_LLM_BINDING=openai
# KEYWORD_LLM_MODEL=your_keyword_model
# KEYWORD_LLM_BINDING_HOST=https://api.example.com/v1
# KEYWORD_LLM_BINDING_API_KEY=your_keyword_api_key
# MAX_ASYNC_KEYWORD_LLM=4
# LLM_TIMEOUT_KEYWORD_LLM=180

### Example: use a dedicated model/provider for query answering
# QUERY_LLM_BINDING=openai
# QUERY_LLM_MODEL=your_query_model
# QUERY_LLM_BINDING_HOST=https://api.example.com/v1
# QUERY_LLM_BINDING_API_KEY=your_query_api_key
# MAX_ASYNC_QUERY_LLM=4
# LLM_TIMEOUT_QUERY_LLM=180

### Example: use a dedicated model/provider for multimodal analysis
# VLM_LLM_BINDING=openai
# VLM_LLM_MODEL=your_vlm_model
# VLM_LLM_BINDING_HOST=https://api.example.com/v1
# VLM_LLM_BINDING_API_KEY=your_vlm_api_key
# MAX_ASYNC_VLM_LLM=4
# LLM_TIMEOUT_VLM_LLM=180

#######################################################################################
### Embedding Configuration (Should not be changed after the first file processed)
### EMBEDDING_BINDING: ollama, openai, azure_openai, jina, lollms, aws_bedrock
58 changes: 43 additions & 15 deletions lightrag/api/config.py
@@ -3,7 +3,6 @@
"""

import os
import re
import argparse
import logging
from dotenv import load_dotenv
@@ -387,6 +386,48 @@ def parse_args() -> argparse.Namespace:
# PDF decryption password
args.pdf_decrypt_password = get_env_value("PDF_DECRYPT_PASSWORD", None)

# --- Per-role LLM configuration (extract / keyword / query / vlm) ---
ROLE_PREFIXES = ["EXTRACT", "KEYWORD", "QUERY", "VLM"]
for role in ROLE_PREFIXES:
binding_key = f"{role}_LLM_BINDING"
model_key = f"{role}_LLM_MODEL"
host_key = f"{role}_LLM_BINDING_HOST"
apikey_key = f"{role}_LLM_BINDING_API_KEY"
max_async_key = f"MAX_ASYNC_{role}_LLM"
timeout_key = f"LLM_TIMEOUT_{role}_LLM"

role_binding = get_env_value(binding_key, None, special_none=True)
role_model = get_env_value(model_key, None, special_none=True)
role_host = get_env_value(host_key, None, special_none=True)
role_apikey = get_env_value(apikey_key, None, special_none=True)
role_max_async = get_env_value(max_async_key, None, int, special_none=True)
role_timeout = get_env_value(timeout_key, None, int, special_none=True)

attr_prefix = role.lower()
setattr(args, f"{attr_prefix}_llm_binding", role_binding)
setattr(args, f"{attr_prefix}_llm_model", role_model)
setattr(args, f"{attr_prefix}_llm_binding_host", role_host)
setattr(args, f"{attr_prefix}_llm_binding_api_key", role_apikey)
setattr(args, f"{attr_prefix}_llm_max_async", role_max_async)
setattr(args, f"{attr_prefix}_llm_timeout", role_timeout)

# Cross-provider validation
if role_binding and role_binding != args.llm_binding:
missing = []
if not role_model:
missing.append(model_key)
if not role_host:
role_host = get_default_host(role_binding)
setattr(args, f"{attr_prefix}_llm_binding_host", role_host)
if not role_apikey:
missing.append(apikey_key)
if missing:
raise SystemExit(
f"Cross-provider error for role '{role}': "
f"binding={role_binding} differs from base={args.llm_binding}, "
f"but required env vars are missing: {', '.join(missing)}"
)

# Add environment variables that were previously read directly
args.cors_origins = get_env_value("CORS_ORIGINS", "*")
args.summary_language = get_env_value("SUMMARY_LANGUAGE", DEFAULT_SUMMARY_LANGUAGE)
@@ -395,9 +436,7 @@ def parse_args() -> argparse.Namespace:

# For JWT Auth
args.auth_accounts = get_env_value("AUTH_ACCOUNTS", "")
args.token_secret = get_env_value(
"TOKEN_SECRET", "lightrag-jwt-default-secret-key!"
)
args.token_secret = get_env_value("TOKEN_SECRET", "lightrag-jwt-default-secret")
args.token_expire_hours = get_env_value("TOKEN_EXPIRE_HOURS", 48, float)
args.guest_token_expire_hours = get_env_value("GUEST_TOKEN_EXPIRE_HOURS", 24, float)
args.jwt_algorithm = get_env_value("JWT_ALGORITHM", "HS256")
@@ -462,17 +501,6 @@ def parse_args() -> argparse.Namespace:
ollama_server_infos.LIGHTRAG_NAME = args.simulated_model_name
ollama_server_infos.LIGHTRAG_TAG = args.simulated_model_tag

# Sanitize workspace: only alphanumeric characters and underscores are allowed
if args.workspace:
sanitized = re.sub(r"[^a-zA-Z0-9_]", "_", args.workspace)
if sanitized != args.workspace:
logging.warning(
f"Workspace name '{args.workspace}' contains invalid characters. "
f"It has been sanitized to '{sanitized}'. "
"Only alphanumeric characters and underscores are allowed."
)
args.workspace = sanitized

return args

