
Commit cd7bf87

Merge branch 'infiniflow:main' into main

2 parents a66ab16 + 27329b4

115 files changed: 2168 additions & 868 deletions


Dockerfile

Lines changed: 5 additions & 17 deletions
@@ -35,26 +35,14 @@ RUN --mount=type=cache,id=ragflow_apt,target=/var/cache/apt,sharing=locked \
     apt update && \
     apt --no-install-recommends install -y ca-certificates; \
     if [ "$NEED_MIRROR" == "1" ]; then \
-        sed -i 's|http://archive.ubuntu.com/ubuntu|https://mirrors.tuna.tsinghua.edu.cn/ubuntu|g' /etc/apt/sources.list.d/ubuntu.sources; \
-        sed -i 's|http://security.ubuntu.com/ubuntu|https://mirrors.tuna.tsinghua.edu.cn/ubuntu|g' /etc/apt/sources.list.d/ubuntu.sources; \
+        sed -i 's|http://archive.ubuntu.com/ubuntu|https://mirrors.aliyun.com/ubuntu|g' /etc/apt/sources.list.d/ubuntu.sources; \
+        sed -i 's|http://security.ubuntu.com/ubuntu|https://mirrors.aliyun.com/ubuntu|g' /etc/apt/sources.list.d/ubuntu.sources; \
     fi; \
     rm -f /etc/apt/apt.conf.d/docker-clean && \
     echo 'Binary::apt::APT::Keep-Downloaded-Packages "true";' > /etc/apt/apt.conf.d/keep-cache && \
     chmod 1777 /tmp && \
     apt update && \
-    apt install -y build-essential && \
-    apt install -y libglib2.0-0 libglx-mesa0 libgl1 && \
-    apt install -y pkg-config libicu-dev libgdiplus && \
-    apt install -y default-jdk && \
-    apt install -y libatk-bridge2.0-0 && \
-    apt install -y libpython3-dev libgtk-4-1 libnss3 xdg-utils libgbm-dev && \
-    apt install -y libjemalloc-dev && \
-    apt install -y gnupg unzip curl wget git vim less && \
-    apt install -y ghostscript && \
-    apt install -y pandoc && \
-    apt install -y texlive && \
-    apt install -y fonts-freefont-ttf fonts-noto-cjk && \
-    apt install -y postgresql-client
+    apt install -y build-essential libglib2.0-0 libglx-mesa0 libgl1 pkg-config libicu-dev libgdiplus default-jdk libatk-bridge2.0-0 libpython3-dev libgtk-4-1 libnss3 xdg-utils libgbm-dev libjemalloc-dev gnupg unzip curl wget git vim less ghostscript pandoc texlive fonts-freefont-ttf fonts-noto-cjk postgresql-client

 # Download resource from GitHub to /usr/share/infinity
 RUN mkdir -p /usr/share/infinity/resource && \

@@ -165,8 +153,8 @@ RUN --mount=type=cache,id=ragflow_uv,target=/root/.cache/uv,sharing=locked \
 COPY web web
 COPY docs docs
 RUN --mount=type=cache,id=ragflow_npm,target=/root/.npm,sharing=locked \
-    export NODE_OPTIONS="--max-old-space-size=4096" && \
-    cd web && npm install && npm run build
+    cd web && NODE_OPTIONS="--max-old-space-size=8192" npm install && \
+    NODE_OPTIONS="--max-old-space-size=8192" VITE_BUILD_SOURCEMAP=false VITE_MINIFY=esbuild npm run build

 COPY .git /ragflow/.git

api/apps/__init__.py

Lines changed: 8 additions & 3 deletions
@@ -121,11 +121,12 @@ def _load_user():
             g.user = user[0]
             return user[0]
     except Exception as e_auth:
-        logging.warning(f"load_user got exception {e_auth}")
+        logging.warning(f"load_user from jwt got exception {e_auth}")
     try:
         authorization = request.headers.get("Authorization")
         if len(authorization.split()) == 2:
-            objs = APIToken.query(token=authorization.split()[1])
+            token = authorization.split()[1]
+            objs = APIToken.query(token=token)
             if objs:
                 user = UserService.query(id=objs[0].tenant_id, status=StatusEnum.VALID.value)
                 if user:

@@ -134,8 +135,12 @@ def _load_user():
                         return None
                     g.user = user[0]
                     return user[0]
+                else:
+                    logging.warning(f"load_user: No user found for tenant_id={objs[0].tenant_id} from APIToken")
+            else:
+                logging.warning(f"load_user: No APIToken found for token={token[:10]}...")
     except Exception as e_api_token:
-        logging.warning(f"load_user got exception {e_api_token}")
+        logging.warning(f"load_user from api token got exception {e_api_token}")
     # Fallback: try raw authorization value as access_token (for login tokens sent without JWT)
     try:
         authorization = request.headers.get("Authorization")
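
The extra logging assumes the client sends its API token as the second word of the Authorization header, so authorization.split()[1] is the token to look up. A minimal sketch of a matching client call; the host, token value, and the /api/v1/datasets route are illustrative assumptions, not part of this diff:

    import requests

    BASE_URL = "http://localhost:9380"   # hypothetical RAGFlow host
    API_TOKEN = "ragflow-xxxxxx"         # placeholder API token

    # "Bearer <token>" splits into exactly two parts, which is what
    # _load_user checks before querying APIToken.
    resp = requests.get(
        f"{BASE_URL}/api/v1/datasets",
        headers={"Authorization": f"Bearer {API_TOKEN}"},
    )
    print(resp.status_code)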

api/db/services/connector_service.py

Lines changed: 69 additions & 2 deletions
@@ -26,6 +26,7 @@
 from api.db.services.common_service import CommonService
 from api.db.services.document_service import DocumentService
 from api.db.services.document_service import DocMetadataService
+from api.utils.common import hash128
 from common.misc_utils import get_uuid
 from common.constants import TaskStatus
 from common.time_utils import current_timestamp, timestamp_to_date

@@ -78,6 +79,64 @@ def rebuild(cls, kb_id:str, connector_id: str, tenant_id:str):
         SyncLogsService.schedule(connector_id, kb_id, reindex=True)
         return err

+    @classmethod
+    def cleanup_stale_documents_for_task(
+        cls,
+        task_id: str,
+        connector_id: str,
+        kb_id: str,
+        tenant_id: str,
+        file_list,
+        delete_batch_size: int = 100,
+    ):
+        from api.db.services.file_service import FileService
+
+        if not Connector2KbService.query(connector_id=connector_id, kb_id=kb_id):
+            return 0, []
+
+        e, conn = cls.get_by_id(connector_id)
+        if not e:
+            return 0, []
+
+        source_type = f"{conn.source}/{conn.id}"
+        retain_doc_ids = {hash128(file.id) for file in file_list}
+        existing_docs = DocumentService.list_doc_headers_by_kb_and_source_type(
+            kb_id,
+            source_type,
+        )
+        stale_doc_ids = [
+            doc["id"] for doc in existing_docs if doc["id"] not in retain_doc_ids
+        ]
+        if not stale_doc_ids:
+            return 0, []
+
+        stale_doc_id_set = set(stale_doc_ids)
+        errors = []
+        for offset in range(0, len(stale_doc_ids), delete_batch_size):
+            err = FileService.delete_docs(
+                stale_doc_ids[offset : offset + delete_batch_size],
+                tenant_id,
+            )
+            if err:
+                errors.append(err)
+
+        remaining_doc_ids = {
+            doc["id"]
+            for doc in DocumentService.list_doc_headers_by_kb_and_source_type(
+                kb_id,
+                source_type,
+            )
+            if doc["id"] in stale_doc_id_set
+        }
+        removed_count = len(stale_doc_id_set) - len(remaining_doc_ids)
+        SyncLogsService.increase_removed_docs(
+            task_id,
+            removed_count,
+            "\n".join(errors),
+            len(errors),
+        )
+        return removed_count, errors
+

 class SyncLogsService(CommonService):
     model = SyncLogs

@@ -196,6 +255,16 @@ def increase_docs(cls, id, max_update, doc_num, err_msg="", error_count=0):
         )\
         .where(cls.model.id == id).execute()

+    @classmethod
+    def increase_removed_docs(cls, id, removed_count, err_msg="", error_count=0):
+        cls.model.update(
+            docs_removed_from_index=cls.model.docs_removed_from_index + removed_count,
+            error_msg=cls.model.error_msg + err_msg,
+            error_count=cls.model.error_count + error_count,
+            update_time=current_timestamp(),
+            update_date=timestamp_to_date(current_timestamp()),
+        ).where(cls.model.id == id).execute()
+
     @classmethod
     def duplicate_and_parse(cls, kb, docs, tenant_id, src, auto_parse=True):
         from api.db.services.file_service import FileService

@@ -300,5 +369,3 @@ def list_connectors(cls, kb_id):
         ).dicts()
     )
-
-
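
A hedged sketch of how a sync task might call the new cleanup helper. The owning class name (ConnectorService) and the shape of file_list items are assumptions inferred from the code above: each item only needs an .id attribute, which hash128 maps to a document ID.

    # Hypothetical call site inside a connector sync task.
    removed, errors = ConnectorService.cleanup_stale_documents_for_task(
        task_id=task["id"],        # current SyncLogs task
        connector_id=conn.id,
        kb_id=kb.id,
        tenant_id=tenant_id,
        file_list=remote_files,    # items expose .id; hash128(file.id) == doc id
    )
    if errors:
        logging.warning("stale-doc cleanup finished with %d errors", len(errors))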

api/db/services/document_service.py

Lines changed: 19 additions & 0 deletions
@@ -373,6 +373,25 @@ def get_all_doc_ids_by_kb_ids(cls, kb_ids):
             offset += limit
         return res

+    @classmethod
+    @DB.connection_context()
+    def list_doc_headers_by_kb_and_source_type(cls, kb_id, source_type, page_size=500):
+        fields = [cls.model.id, cls.model.kb_id, cls.model.source_type, cls.model.name]
+        docs = cls.model.select(*fields).where(
+            cls.model.kb_id == kb_id,
+            cls.model.source_type == source_type,
+        ).order_by(cls.model.create_time.asc())
+        offset = 0
+        res = []
+        while True:
+            doc_batch = docs.offset(offset).limit(page_size)
+            _temp = list(doc_batch.dicts())
+            if not _temp:
+                break
+            res.extend(_temp)
+            offset += page_size
+        return res
+
     @classmethod
     @DB.connection_context()
     def get_all_docs_by_creator_id(cls, creator_id):
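
The new method pages with a classic offset/limit loop that stops at the first empty batch. A self-contained sketch of the same pattern, with an in-memory list standing in for the peewee query:

    def paginate(fetch_page, page_size=500):
        """Collect all rows by fetching fixed-size pages until one comes back empty."""
        offset, out = 0, []
        while True:
            batch = fetch_page(offset, page_size)
            if not batch:
                break
            out.extend(batch)
            offset += page_size
        return out

    rows = list(range(1234))
    assert paginate(lambda off, n: rows[off:off + n]) == rows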

api/db/services/user_canvas_version.py

Lines changed: 4 additions & 0 deletions
@@ -11,13 +11,15 @@
 class UserCanvasVersionService(CommonService):
     model = UserCanvasVersion

+    # Build a stable display name for saved snapshots.
     @staticmethod
     def build_version_title(user_nickname, agent_title, ts=None):
         tenant = str(user_nickname or "").strip() or "tenant"
         title = str(agent_title or "").strip() or "agent"
         stamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(ts)) if ts is not None else time.strftime("%Y-%m-%d %H:%M:%S")
         return "{0}_{1}_{2}".format(tenant, title, stamp)

+    # Normalize DSL before comparing or writing version content.
     @staticmethod
     def _normalize_dsl(dsl):
         normalized = dsl

@@ -143,6 +145,7 @@ def save_or_replace_latest(cls, user_canvas_id, dsl, title=None, description=Non
             .first()
         )

+        # Repeated saves with the same DSL only refresh the latest snapshot.
         if latest and cls._normalize_dsl(latest.dsl) == normalized_dsl:
             # Protect released version: if latest is released and current is not,
             # create a new version instead of updating

@@ -170,6 +173,7 @@ def save_or_replace_latest(cls, user_canvas_id, dsl, title=None, description=Non
             cls.delete_all_versions(user_canvas_id)
             return latest.id, False

+        # Real content changes create a new snapshot.
         insert_data = {"user_canvas_id": user_canvas_id, "dsl": normalized_dsl}
         if title is not None:
             insert_data["title"] = title

common/data_source/github/connector.py

Lines changed: 57 additions & 10 deletions
@@ -28,14 +28,20 @@
     InsufficientPermissionsError,
     UnexpectedValidationError,
 )
-from common.data_source.interfaces import CheckpointedConnectorWithPermSyncGH, CheckpointOutput
+from common.data_source.interfaces import (
+    CheckpointedConnectorWithPermSyncGH,
+    CheckpointOutput,
+    CheckpointOutputWrapper,
+)
 from common.data_source.models import (
     ConnectorCheckpoint,
     ConnectorFailure,
     Document,
     DocumentFailure,
     ExternalAccess,
+    GenerateSlimDocumentOutput,
     SecondsSinceUnixEpoch,
+    SlimDocument,
 )
 from common.data_source.connector_runner import ConnectorRunner
 from .models import SerializedRepository

@@ -594,14 +600,8 @@ def _fetch_from_github(
         done_with_prs = False
         num_prs = 0
         pr = None
-        print("start: ", start)
         for pr in pr_batch:
             num_prs += 1
-            print("-"*40)
-            print("PR name", pr.title)
-            print("updated at", pr.updated_at)
-            print("-"*40)
-            print("\n")
             # we iterate backwards in time, so at this point we stop processing prs
             if (
                 start is not None

@@ -732,10 +732,10 @@ def _fetch_from_github(

         if checkpoint.cached_repo_ids:
             logging.info(
-                f"{len(checkpoint.cached_repo_ids)} repos remaining (IDs: {checkpoint.cached_repo_ids})"
+                f"{len(checkpoint.cached_repo_ids)} checkpoint repos remaining (IDs: {checkpoint.cached_repo_ids})"
             )
         else:
-            logging.info("No more repos remaining")
+            logging.info("There are no more checkpoint repos left.")

         return checkpoint

@@ -923,6 +923,53 @@ def validate_checkpoint_json(
     ) -> GithubConnectorCheckpoint:
         return GithubConnectorCheckpoint.model_validate_json(checkpoint_json)

+    def retrieve_slim_document(
+        self,
+        start: SecondsSinceUnixEpoch | None = None,
+        end: SecondsSinceUnixEpoch | None = None,
+        callback: Any = None,
+    ) -> GenerateSlimDocumentOutput:
+        start_value = 0.0 if start is None else start
+        end_value = (
+            datetime.now(timezone.utc).timestamp() if end is None else end
+        )
+        checkpoint = self.build_dummy_checkpoint()
+        slim_batch: list[SlimDocument] = []
+
+        while checkpoint.has_more:
+            wrapper = CheckpointOutputWrapper[GithubConnectorCheckpoint]()
+            for document, failure, next_checkpoint in wrapper(
+                self.load_from_checkpoint(start_value, end_value, checkpoint)
+            ):
+                if failure is not None:
+                    logging.warning(
+                        "GitHub connector failure during slim retrieval: %s",
+                        getattr(failure, "failure_message", failure),
+                    )
+                    continue
+
+                if document is not None:
+                    slim_batch.append(SlimDocument(id=document.id))
+                    if len(slim_batch) >= SLIM_BATCH_SIZE:
+                        yield slim_batch
+                        slim_batch = []
+                        if callback:
+                            callback.progress("github_slim_document", 1)
+
+                if next_checkpoint is not None:
+                    checkpoint = next_checkpoint
+
+        if slim_batch:
+            yield slim_batch
+
+    def retrieve_all_slim_docs_perm_sync(
+        self,
+        start: SecondsSinceUnixEpoch | None = None,
+        end: SecondsSinceUnixEpoch | None = None,
+        callback: Any = None,
+    ) -> GenerateSlimDocumentOutput:
+        yield from self.retrieve_slim_document(start=start, end=end, callback=callback)
+
     def build_dummy_checkpoint(self) -> GithubConnectorCheckpoint:
         return GithubConnectorCheckpoint(
             stage=GithubConnectorStage.PRS, curr_page=0, has_more=True, num_retrieved=0

@@ -970,4 +1017,4 @@ def build_dummy_checkpoint(self) -> GithubConnectorCheckpoint:
         if failure:
             print(f"Failure: {failure.failure_message}")
         if next_checkpoint:
-            checkpoint = next_checkpoint
+            checkpoint = next_checkpoint
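
retrieve_slim_document drains the checkpointed loader and re-batches document IDs: it accumulates SlimDocuments, yields whenever a batch reaches SLIM_BATCH_SIZE, and flushes the remainder after the loop. A self-contained sketch of that batch-flush pattern, with the batch size shrunk for illustration:

    def batched(items, size=3):  # the real connector uses SLIM_BATCH_SIZE
        batch = []
        for item in items:
            batch.append(item)
            if len(batch) >= size:
                yield batch
                batch = []
        if batch:  # final partial batch
            yield batch

    assert list(batched(range(7))) == [[0, 1, 2], [3, 4, 5], [6]]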

common/mcp_tool_call_conn.py

Lines changed: 6 additions & 1 deletion
@@ -182,6 +182,8 @@ async def _call_mcp_tool(self, name: str, arguments: dict[str, Any], request_tim
             return f"MCP server error: {result.content}"

         # For now, we only support text content
+        if not result.content:
+            return "MCP server returned empty content."
         if isinstance(result.content[0], TextContent):
             return result.content[0].text
         else:

@@ -214,7 +216,10 @@ def tool_call(self, name: str, arguments: dict[str, Any], timeout: float | int =
         if self._close:
             return "Error: Session is closed"

-        future = asyncio.run_coroutine_threadsafe(self._call_mcp_tool(name, arguments), self._event_loop)
+        future = asyncio.run_coroutine_threadsafe(
+            self._call_mcp_tool(name, arguments, request_timeout=timeout),
+            self._event_loop,
+        )
         try:
             return future.result(timeout=timeout)
         except FuturesTimeoutError:
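
The fix forwards the caller's timeout into the coroutine itself rather than only enforcing it on future.result(). A self-contained sketch of this submit-to-a-loop-in-another-thread pattern; slow_add and _add are illustrative stand-ins for _call_mcp_tool:

    import asyncio
    import threading
    from concurrent.futures import TimeoutError as FuturesTimeoutError

    loop = asyncio.new_event_loop()
    threading.Thread(target=loop.run_forever, daemon=True).start()

    async def _add(a, b):
        await asyncio.sleep(0.1)
        return a + b

    async def slow_add(a, b, request_timeout=5):
        # the coroutine honors the timeout internally, as _call_mcp_tool now does
        return await asyncio.wait_for(_add(a, b), request_timeout)

    future = asyncio.run_coroutine_threadsafe(slow_add(1, 2), loop)
    try:
        print(future.result(timeout=5))  # -> 3
    except FuturesTimeoutError:
        future.cancel()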

common/query_base.py

Lines changed: 3 additions & 1 deletion
@@ -32,7 +32,9 @@ def is_chinese(line):

     @staticmethod
     def sub_special_char(line):
-        return re.sub(r"([:\{\}/\[\]\-\*\?\"\(\)\|\+~\^])", r"\\\1", line).strip()
+        # Strip single quotes first to avoid Infinity's lexer treating them as string delimiters,
+        # then escape remaining Infinity/Lucene special characters.
+        return re.sub(r"([:\{\}/\[\]\-\*\?\"\(\)\|\+~\^])", r"\\\1", line.replace("'", "")).strip()

     @staticmethod
     def rmWWW(txt):
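
A worked example of the new behavior, with the function copied inline so it runs standalone: the single quote is dropped first, then the remaining specials are backslash-escaped.

    import re

    def sub_special_char(line):
        return re.sub(r"([:\{\}/\[\]\-\*\?\"\(\)\|\+~\^])", r"\\\1", line.replace("'", "")).strip()

    print(sub_special_char("what's new (2024)?"))  # whats new \(2024\)\?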

deepdoc/parser/pdf_parser.py

Lines changed: 3 additions & 9 deletions
@@ -38,7 +38,6 @@
 from sklearn.metrics import silhouette_score

 from common.file_utils import get_project_base_directory
-from common.misc_utils import pip_install_torch
 from deepdoc.vision import OCR, AscendLayoutRecognizer, LayoutRecognizer, Recognizer, TableStructureRecognizer
 from rag.nlp import rag_tokenizer
 from rag.prompts.generator import vision_llm_describe_prompt

@@ -91,14 +90,9 @@ def __init__(self, **kwargs):
         self.tbl_det = TableStructureRecognizer()

         self.updown_cnt_mdl = xgb.Booster()
-        try:
-            pip_install_torch()
-            import torch.cuda
-
-            if torch.cuda.is_available():
-                self.updown_cnt_mdl.set_param({"device": "cuda"})
-        except Exception:
-            logging.info("No torch found.")
+        # xgboost model is very small; using CPU explicitly
+        self.updown_cnt_mdl.set_param({"device": "cpu"})
+        logging.info("updown_cnt_mdl initialized on CPU")
         try:
             model_dir = os.path.join(get_project_base_directory(), "rag/res/deepdoc")
             self.updown_cnt_mdl.load_model(os.path.join(model_dir, "updown_concat_xgb.model"))
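
The replacement pins the booster to the CPU and drops the torch dependency entirely. A minimal sketch, assuming xgboost >= 2.0 (where the "device" parameter exists); the model path is taken from the diff:

    import xgboost as xgb

    booster = xgb.Booster()
    booster.set_param({"device": "cpu"})  # model is tiny; GPU gives no speedup
    # booster.load_model("rag/res/deepdoc/updown_concat_xgb.model")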
