
Commit cd7bf87

Merge branch 'infiniflow:main' into main

2 parents a66ab16 + 27329b4

115 files changed: 2168 additions & 868 deletions


Dockerfile

Lines changed: 5 additions & 17 deletions
@@ -35,26 +35,14 @@ RUN --mount=type=cache,id=ragflow_apt,target=/var/cache/apt,sharing=locked \
     apt update && \
     apt --no-install-recommends install -y ca-certificates; \
     if [ "$NEED_MIRROR" == "1" ]; then \
-        sed -i 's|http://archive.ubuntu.com/ubuntu|https://mirrors.tuna.tsinghua.edu.cn/ubuntu|g' /etc/apt/sources.list.d/ubuntu.sources; \
-        sed -i 's|http://security.ubuntu.com/ubuntu|https://mirrors.tuna.tsinghua.edu.cn/ubuntu|g' /etc/apt/sources.list.d/ubuntu.sources; \
+        sed -i 's|http://archive.ubuntu.com/ubuntu|https://mirrors.aliyun.com/ubuntu|g' /etc/apt/sources.list.d/ubuntu.sources; \
+        sed -i 's|http://security.ubuntu.com/ubuntu|https://mirrors.aliyun.com/ubuntu|g' /etc/apt/sources.list.d/ubuntu.sources; \
     fi; \
     rm -f /etc/apt/apt.conf.d/docker-clean && \
     echo 'Binary::apt::APT::Keep-Downloaded-Packages "true";' > /etc/apt/apt.conf.d/keep-cache && \
     chmod 1777 /tmp && \
     apt update && \
-    apt install -y build-essential && \
-    apt install -y libglib2.0-0 libglx-mesa0 libgl1 && \
-    apt install -y pkg-config libicu-dev libgdiplus && \
-    apt install -y default-jdk && \
-    apt install -y libatk-bridge2.0-0 && \
-    apt install -y libpython3-dev libgtk-4-1 libnss3 xdg-utils libgbm-dev && \
-    apt install -y libjemalloc-dev && \
-    apt install -y gnupg unzip curl wget git vim less && \
-    apt install -y ghostscript && \
-    apt install -y pandoc && \
-    apt install -y texlive && \
-    apt install -y fonts-freefont-ttf fonts-noto-cjk && \
-    apt install -y postgresql-client
+    apt install -y build-essential libglib2.0-0 libglx-mesa0 libgl1 pkg-config libicu-dev libgdiplus default-jdk libatk-bridge2.0-0 libpython3-dev libgtk-4-1 libnss3 xdg-utils libgbm-dev libjemalloc-dev gnupg unzip curl wget git vim less ghostscript pandoc texlive fonts-freefont-ttf fonts-noto-cjk postgresql-client

 # Download resource from GitHub to /usr/share/infinity
 RUN mkdir -p /usr/share/infinity/resource && \

@@ -165,8 +153,8 @@ RUN --mount=type=cache,id=ragflow_uv,target=/root/.cache/uv,sharing=locked \
 COPY web web
 COPY docs docs
 RUN --mount=type=cache,id=ragflow_npm,target=/root/.npm,sharing=locked \
-    export NODE_OPTIONS="--max-old-space-size=4096" && \
-    cd web && npm install && npm run build
+    cd web && NODE_OPTIONS="--max-old-space-size=8192" npm install && \
+    NODE_OPTIONS="--max-old-space-size=8192" VITE_BUILD_SOURCEMAP=false VITE_MINIFY=esbuild npm run build

 COPY .git /ragflow/.git

api/apps/__init__.py

Lines changed: 8 additions & 3 deletions
@@ -121,11 +121,12 @@ def _load_user():
             g.user = user[0]
             return user[0]
     except Exception as e_auth:
-        logging.warning(f"load_user got exception {e_auth}")
+        logging.warning(f"load_user from jwt got exception {e_auth}")
     try:
         authorization = request.headers.get("Authorization")
         if len(authorization.split()) == 2:
-            objs = APIToken.query(token=authorization.split()[1])
+            token = authorization.split()[1]
+            objs = APIToken.query(token=token)
             if objs:
                 user = UserService.query(id=objs[0].tenant_id, status=StatusEnum.VALID.value)
                 if user:

@@ -134,8 +135,12 @@ def _load_user():
                         return None
                     g.user = user[0]
                     return user[0]
+                else:
+                    logging.warning(f"load_user: No user found for tenant_id={objs[0].tenant_id} from APIToken")
+            else:
+                logging.warning(f"load_user: No APIToken found for token={token[:10]}...")
     except Exception as e_api_token:
-        logging.warning(f"load_user got exception {e_api_token}")
+        logging.warning(f"load_user from api token got exception {e_api_token}")
     # Fallback: try raw authorization value as access_token (for login tokens sent without JWT)
     try:
         authorization = request.headers.get("Authorization")
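
The extra logging assumes the client sends its API token as the second word of the Authorization header, so authorization.split()[1] is the token to look up. A minimal sketch of a matching client call; the host, token value, and the /api/v1/datasets route are illustrative assumptions, not part of this diff:

    import requests

    BASE_URL = "http://localhost:9380"   # hypothetical RAGFlow host
    API_TOKEN = "ragflow-xxxxxx"         # placeholder API token

    # "Bearer <token>" splits into exactly two parts, which is what
    # _load_user checks before querying APIToken.
    resp = requests.get(
        f"{BASE_URL}/api/v1/datasets",
        headers={"Authorization": f"Bearer {API_TOKEN}"},
    )
    print(resp.status_code)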

api/db/services/connector_service.py

Lines changed: 69 additions & 2 deletions
@@ -26,6 +26,7 @@
 from api.db.services.common_service import CommonService
 from api.db.services.document_service import DocumentService
 from api.db.services.document_service import DocMetadataService
+from api.utils.common import hash128
 from common.misc_utils import get_uuid
 from common.constants import TaskStatus
 from common.time_utils import current_timestamp, timestamp_to_date

@@ -78,6 +79,64 @@ def rebuild(cls, kb_id:str, connector_id: str, tenant_id:str):
         SyncLogsService.schedule(connector_id, kb_id, reindex=True)
         return err

+    @classmethod
+    def cleanup_stale_documents_for_task(
+        cls,
+        task_id: str,
+        connector_id: str,
+        kb_id: str,
+        tenant_id: str,
+        file_list,
+        delete_batch_size: int = 100,
+    ):
+        from api.db.services.file_service import FileService
+
+        if not Connector2KbService.query(connector_id=connector_id, kb_id=kb_id):
+            return 0, []
+
+        e, conn = cls.get_by_id(connector_id)
+        if not e:
+            return 0, []
+
+        source_type = f"{conn.source}/{conn.id}"
+        retain_doc_ids = {hash128(file.id) for file in file_list}
+        existing_docs = DocumentService.list_doc_headers_by_kb_and_source_type(
+            kb_id,
+            source_type,
+        )
+        stale_doc_ids = [
+            doc["id"] for doc in existing_docs if doc["id"] not in retain_doc_ids
+        ]
+        if not stale_doc_ids:
+            return 0, []
+
+        stale_doc_id_set = set(stale_doc_ids)
+        errors = []
+        for offset in range(0, len(stale_doc_ids), delete_batch_size):
+            err = FileService.delete_docs(
+                stale_doc_ids[offset : offset + delete_batch_size],
+                tenant_id,
+            )
+            if err:
+                errors.append(err)
+
+        remaining_doc_ids = {
+            doc["id"]
+            for doc in DocumentService.list_doc_headers_by_kb_and_source_type(
+                kb_id,
+                source_type,
+            )
+            if doc["id"] in stale_doc_id_set
+        }
+        removed_count = len(stale_doc_id_set) - len(remaining_doc_ids)
+        SyncLogsService.increase_removed_docs(
+            task_id,
+            removed_count,
+            "\n".join(errors),
+            len(errors),
+        )
+        return removed_count, errors
+

 class SyncLogsService(CommonService):
     model = SyncLogs

@@ -196,6 +255,16 @@ def increase_docs(cls, id, max_update, doc_num, err_msg="", error_count=0):
         )\
         .where(cls.model.id == id).execute()

+    @classmethod
+    def increase_removed_docs(cls, id, removed_count, err_msg="", error_count=0):
+        cls.model.update(
+            docs_removed_from_index=cls.model.docs_removed_from_index + removed_count,
+            error_msg=cls.model.error_msg + err_msg,
+            error_count=cls.model.error_count + error_count,
+            update_time=current_timestamp(),
+            update_date=timestamp_to_date(current_timestamp()),
+        ).where(cls.model.id == id).execute()
+
     @classmethod
     def duplicate_and_parse(cls, kb, docs, tenant_id, src, auto_parse=True):
         from api.db.services.file_service import FileService

@@ -300,5 +369,3 @@ def list_connectors(cls, kb_id):
         ).dicts()
     )
-
-
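
A hedged sketch of how a sync task might call the new cleanup helper. The owning class name (ConnectorService) and the shape of file_list items are assumptions inferred from the code above: each item only needs an .id attribute, which hash128 maps to a document ID.

    # Hypothetical call site inside a connector sync task.
    removed, errors = ConnectorService.cleanup_stale_documents_for_task(
        task_id=task["id"],        # current SyncLogs task
        connector_id=conn.id,
        kb_id=kb.id,
        tenant_id=tenant_id,
        file_list=remote_files,    # items expose .id; hash128(file.id) == doc id
    )
    if errors:
        logging.warning("stale-doc cleanup finished with %d errors", len(errors))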

api/db/services/document_service.py

Lines changed: 19 additions & 0 deletions
@@ -373,6 +373,25 @@ def get_all_doc_ids_by_kb_ids(cls, kb_ids):
             offset += limit
         return res

+    @classmethod
+    @DB.connection_context()
+    def list_doc_headers_by_kb_and_source_type(cls, kb_id, source_type, page_size=500):
+        fields = [cls.model.id, cls.model.kb_id, cls.model.source_type, cls.model.name]
+        docs = cls.model.select(*fields).where(
+            cls.model.kb_id == kb_id,
+            cls.model.source_type == source_type,
+        ).order_by(cls.model.create_time.asc())
+        offset = 0
+        res = []
+        while True:
+            doc_batch = docs.offset(offset).limit(page_size)
+            _temp = list(doc_batch.dicts())
+            if not _temp:
+                break
+            res.extend(_temp)
+            offset += page_size
+        return res
+
     @classmethod
     @DB.connection_context()
     def get_all_docs_by_creator_id(cls, creator_id):
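
The new method pages with a classic offset/limit loop that stops at the first empty batch. A self-contained sketch of the same pattern, with an in-memory list standing in for the peewee query:

    def paginate(fetch_page, page_size=500):
        """Collect all rows by fetching fixed-size pages until one comes back empty."""
        offset, out = 0, []
        while True:
            batch = fetch_page(offset, page_size)
            if not batch:
                break
            out.extend(batch)
            offset += page_size
        return out

    rows = list(range(1234))
    assert paginate(lambda off, n: rows[off:off + n]) == rows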

api/db/services/user_canvas_version.py

Lines changed: 4 additions & 0 deletions
@@ -11,13 +11,15 @@
 class UserCanvasVersionService(CommonService):
     model = UserCanvasVersion

+    # Build a stable display name for saved snapshots.
     @staticmethod
     def build_version_title(user_nickname, agent_title, ts=None):
         tenant = str(user_nickname or "").strip() or "tenant"
         title = str(agent_title or "").strip() or "agent"
         stamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(ts)) if ts is not None else time.strftime("%Y-%m-%d %H:%M:%S")
         return "{0}_{1}_{2}".format(tenant, title, stamp)

+    # Normalize DSL before comparing or writing version content.
     @staticmethod
     def _normalize_dsl(dsl):
         normalized = dsl

@@ -143,6 +145,7 @@ def save_or_replace_latest(cls, user_canvas_id, dsl, title=None, description=Non
             .first()
         )

+        # Repeated saves with the same DSL only refresh the latest snapshot.
         if latest and cls._normalize_dsl(latest.dsl) == normalized_dsl:
             # Protect released version: if latest is released and current is not,
             # create a new version instead of updating

@@ -170,6 +173,7 @@ def save_or_replace_latest(cls, user_canvas_id, dsl, title=None, description=Non
             cls.delete_all_versions(user_canvas_id)
             return latest.id, False

+        # Real content changes create a new snapshot.
         insert_data = {"user_canvas_id": user_canvas_id, "dsl": normalized_dsl}
         if title is not None:
             insert_data["title"] = title

common/data_source/github/connector.py

Lines changed: 57 additions & 10 deletions
@@ -28,14 +28,20 @@
     InsufficientPermissionsError,
     UnexpectedValidationError,
 )
-from common.data_source.interfaces import CheckpointedConnectorWithPermSyncGH, CheckpointOutput
+from common.data_source.interfaces import (
+    CheckpointedConnectorWithPermSyncGH,
+    CheckpointOutput,
+    CheckpointOutputWrapper,
+)
 from common.data_source.models import (
     ConnectorCheckpoint,
     ConnectorFailure,
     Document,
     DocumentFailure,
     ExternalAccess,
+    GenerateSlimDocumentOutput,
     SecondsSinceUnixEpoch,
+    SlimDocument,
 )
 from common.data_source.connector_runner import ConnectorRunner
 from .models import SerializedRepository

@@ -594,14 +600,8 @@ def _fetch_from_github(
         done_with_prs = False
         num_prs = 0
         pr = None
-        print("start: ", start)
         for pr in pr_batch:
             num_prs += 1
-            print("-"*40)
-            print("PR name", pr.title)
-            print("updated at", pr.updated_at)
-            print("-"*40)
-            print("\n")
             # we iterate backwards in time, so at this point we stop processing prs
             if (
                 start is not None

@@ -732,10 +732,10 @@ def _fetch_from_github(

         if checkpoint.cached_repo_ids:
             logging.info(
-                f"{len(checkpoint.cached_repo_ids)} repos remaining (IDs: {checkpoint.cached_repo_ids})"
+                f"{len(checkpoint.cached_repo_ids)} checkpoint repos remaining (IDs: {checkpoint.cached_repo_ids})"
             )
         else:
-            logging.info("No more repos remaining")
+            logging.info("There are no more checkpoint repos left.")

         return checkpoint

@@ -923,6 +923,53 @@ def validate_checkpoint_json(
     ) -> GithubConnectorCheckpoint:
         return GithubConnectorCheckpoint.model_validate_json(checkpoint_json)

+    def retrieve_slim_document(
+        self,
+        start: SecondsSinceUnixEpoch | None = None,
+        end: SecondsSinceUnixEpoch | None = None,
+        callback: Any = None,
+    ) -> GenerateSlimDocumentOutput:
+        start_value = 0.0 if start is None else start
+        end_value = (
+            datetime.now(timezone.utc).timestamp() if end is None else end
+        )
+        checkpoint = self.build_dummy_checkpoint()
+        slim_batch: list[SlimDocument] = []
+
+        while checkpoint.has_more:
+            wrapper = CheckpointOutputWrapper[GithubConnectorCheckpoint]()
+            for document, failure, next_checkpoint in wrapper(
+                self.load_from_checkpoint(start_value, end_value, checkpoint)
+            ):
+                if failure is not None:
+                    logging.warning(
+                        "GitHub connector failure during slim retrieval: %s",
+                        getattr(failure, "failure_message", failure),
+                    )
+                    continue
+
+                if document is not None:
+                    slim_batch.append(SlimDocument(id=document.id))
+                    if len(slim_batch) >= SLIM_BATCH_SIZE:
+                        yield slim_batch
+                        slim_batch = []
+                        if callback:
+                            callback.progress("github_slim_document", 1)
+
+                if next_checkpoint is not None:
+                    checkpoint = next_checkpoint
+
+        if slim_batch:
+            yield slim_batch
+
+    def retrieve_all_slim_docs_perm_sync(
+        self,
+        start: SecondsSinceUnixEpoch | None = None,
+        end: SecondsSinceUnixEpoch | None = None,
+        callback: Any = None,
+    ) -> GenerateSlimDocumentOutput:
+        yield from self.retrieve_slim_document(start=start, end=end, callback=callback)
+
     def build_dummy_checkpoint(self) -> GithubConnectorCheckpoint:
         return GithubConnectorCheckpoint(
             stage=GithubConnectorStage.PRS, curr_page=0, has_more=True, num_retrieved=0

@@ -970,4 +1017,4 @@ def build_dummy_checkpoint(self) -> GithubConnectorCheckpoint:
         if failure:
             print(f"Failure: {failure.failure_message}")
         if next_checkpoint:
-            checkpoint = next_checkpoint
+            checkpoint = next_checkpoint
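
retrieve_slim_document drains the checkpointed loader and re-batches document IDs: it accumulates SlimDocuments, yields whenever a batch reaches SLIM_BATCH_SIZE, and flushes the remainder after the loop. A self-contained sketch of that batch-flush pattern, with the batch size shrunk for illustration:

    def batched(items, size=3):  # the real connector uses SLIM_BATCH_SIZE
        batch = []
        for item in items:
            batch.append(item)
            if len(batch) >= size:
                yield batch
                batch = []
        if batch:  # final partial batch
            yield batch

    assert list(batched(range(7))) == [[0, 1, 2], [3, 4, 5], [6]]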

common/mcp_tool_call_conn.py

Lines changed: 6 additions & 1 deletion
@@ -182,6 +182,8 @@ async def _call_mcp_tool(self, name: str, arguments: dict[str, Any], request_tim
             return f"MCP server error: {result.content}"

         # For now, we only support text content
+        if not result.content:
+            return "MCP server returned empty content."
         if isinstance(result.content[0], TextContent):
             return result.content[0].text
         else:

@@ -214,7 +216,10 @@ def tool_call(self, name: str, arguments: dict[str, Any], timeout: float | int =
         if self._close:
             return "Error: Session is closed"

-        future = asyncio.run_coroutine_threadsafe(self._call_mcp_tool(name, arguments), self._event_loop)
+        future = asyncio.run_coroutine_threadsafe(
+            self._call_mcp_tool(name, arguments, request_timeout=timeout),
+            self._event_loop,
+        )
         try:
             return future.result(timeout=timeout)
         except FuturesTimeoutError:
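
The fix forwards the caller's timeout into the coroutine itself rather than only enforcing it on future.result(). A self-contained sketch of this submit-to-a-loop-in-another-thread pattern; slow_add and _add are illustrative stand-ins for _call_mcp_tool:

    import asyncio
    import threading
    from concurrent.futures import TimeoutError as FuturesTimeoutError

    loop = asyncio.new_event_loop()
    threading.Thread(target=loop.run_forever, daemon=True).start()

    async def _add(a, b):
        await asyncio.sleep(0.1)
        return a + b

    async def slow_add(a, b, request_timeout=5):
        # the coroutine honors the timeout internally, as _call_mcp_tool now does
        return await asyncio.wait_for(_add(a, b), request_timeout)

    future = asyncio.run_coroutine_threadsafe(slow_add(1, 2), loop)
    try:
        print(future.result(timeout=5))  # -> 3
    except FuturesTimeoutError:
        future.cancel()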

common/query_base.py

Lines changed: 3 additions & 1 deletion
@@ -32,7 +32,9 @@ def is_chinese(line):

     @staticmethod
     def sub_special_char(line):
-        return re.sub(r"([:\{\}/\[\]\-\*\?\"\(\)\|\+~\^])", r"\\\1", line).strip()
+        # Strip single quotes first to avoid Infinity's lexer treating them as string delimiters,
+        # then escape remaining Infinity/Lucene special characters.
+        return re.sub(r"([:\{\}/\[\]\-\*\?\"\(\)\|\+~\^])", r"\\\1", line.replace("'", "")).strip()

     @staticmethod
     def rmWWW(txt):
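
A worked example of the new behavior, with the function copied inline so it runs standalone: the single quote is dropped first, then the remaining specials are backslash-escaped.

    import re

    def sub_special_char(line):
        return re.sub(r"([:\{\}/\[\]\-\*\?\"\(\)\|\+~\^])", r"\\\1", line.replace("'", "")).strip()

    print(sub_special_char("what's new (2024)?"))  # whats new \(2024\)\?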

deepdoc/parser/pdf_parser.py

Lines changed: 3 additions & 9 deletions
@@ -38,7 +38,6 @@
 from sklearn.metrics import silhouette_score

 from common.file_utils import get_project_base_directory
-from common.misc_utils import pip_install_torch
 from deepdoc.vision import OCR, AscendLayoutRecognizer, LayoutRecognizer, Recognizer, TableStructureRecognizer
 from rag.nlp import rag_tokenizer
 from rag.prompts.generator import vision_llm_describe_prompt

@@ -91,14 +90,9 @@ def __init__(self, **kwargs):
         self.tbl_det = TableStructureRecognizer()

         self.updown_cnt_mdl = xgb.Booster()
-        try:
-            pip_install_torch()
-            import torch.cuda
-
-            if torch.cuda.is_available():
-                self.updown_cnt_mdl.set_param({"device": "cuda"})
-        except Exception:
-            logging.info("No torch found.")
+        # xgboost model is very small; using CPU explicitly
+        self.updown_cnt_mdl.set_param({"device": "cpu"})
+        logging.info("updown_cnt_mdl initialized on CPU")
         try:
             model_dir = os.path.join(get_project_base_directory(), "rag/res/deepdoc")
             self.updown_cnt_mdl.load_model(os.path.join(model_dir, "updown_concat_xgb.model"))
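
The replacement pins the booster to the CPU and drops the torch dependency entirely. A minimal sketch, assuming xgboost >= 2.0 (where the "device" parameter exists); the model path is taken from the diff:

    import xgboost as xgb

    booster = xgb.Booster()
    booster.set_param({"device": "cpu"})  # model is tiny; GPU gives no speedup
    # booster.load_model("rag/res/deepdoc/updown_concat_xgb.model")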
