refactor(tools): share cwd-backfill helper + skip redundant scans

martinma51 · martinma51 · commit 2df5151a9c49 · 2026-05-21T12:15:32.000+08:00
Addresses all five Gemini-code-assist review points on PR xorbitsai#460. Centralise the manual file-registration backfill that python_executor and javascript_executor were each doing inline. Both adapters now just wrap the existing ``auto_register_files()`` block in ``self._workspace.backfill_files_from_cwd(working_directory)``. Key changes from the duplicated inline implementations: 1. **Skip-when-redundant fast path.** When the executor's cwd is already inside ``workspace_dir`` (the common case in tests and for tasks whose workspace id matches the executor's working directory), ``auto_register_files`` already covers those files on its own scan. The new helper detects this by resolving both paths and checking ``wd_path.is_relative_to(ws_root)``, and yields immediately without doing any second walk — avoiding the double-scan Gemini called out. 2. **``os.walk`` with in-place dir pruning** replaces ``rglob("*")`` + post-filter. Hidden subdirs, ``__pycache__`` and ``node_modules`` are pruned from ``dirnames`` so the walk never descends into them — significantly faster on trees that contain large ``node_modules`` directories (which is exactly the case for the pptxgenjs-using JS executor). 3. **No more ``p.resolve()`` inside the snapshot loop.** ``os.walk`` already yields absolute paths when given an absolute root, so we resolve the root once outside the walk and reuse it. Each entry in the resulting set is already absolute and comparable across the before/after snapshots without further normalisation. 4. **Module-level logger.** The inline helpers were creating a per-call ``_log`` shadow of the module logger; the shared helper now logs through the module-level ``logger`` of ``workspace.py``, and the adapters no longer log backfill results themselves. 5. **No per-call function redefinition.** The inline ``_scan_cwd`` was being defined fresh on every executor invocation; the extracted ``_scan_user_files`` is a module-level function. 
Also drop the redundant ``try/except`` around ``self.workspace.get_file_id_from_path(...)`` in ``workspace_file_tool.py``: that method already swallows internal exceptions and returns ``None``, so the outer guard was dead code. Tests: 60 tests across javascript_executor / python_executor / workspace / file_analysis suites pass; ruff check + format clean.
diff --git a/src/xagent/core/tools/adapters/vibe/javascript_executor.py b/src/xagent/core/tools/adapters/vibe/javascript_executor.py
@@ -81,51 +81,18 @@ def run_json_sync(self, args: Mapping[str, Any]) -> Any:
         executor = JavaScriptExecutorCore(working_directory)
 
         # Execute code within auto_register context.
-        # Same caveat as python_executor: the executor's working_directory may
-        # not be inside `self._workspace.workspace_dir` (e.g. when agent
-        # workspace id is "67" but executor cwd is "web_task_67/output"). In
-        # that case the built-in scan misses files written via `fs.writeFileSync`,
-        # `pptxgenjs.writeFile`, etc. Belt-and-braces: scan the actual cwd
-        # before/after the call and manually register new files.
+        # The executor's working_directory may sit outside
+        # ``self._workspace.workspace_dir`` (e.g. workspace id "67" but
+        # executor cwd "web_task_67/output"), in which case
+        # ``auto_register_files`` would miss files saved through raw fs
+        # IO (``fs.writeFileSync``, ``pptxgenjs.writeFile``, …). The shared
+        # ``backfill_files_from_cwd`` helper on TaskWorkspace handles the
+        # belt-and-braces snapshot — and is a no-op fast-path when the
+        # cwd is already inside workspace_dir, so we don't double-scan.
         if self._workspace and working_directory:
-            import logging as _logging
-
-            _log = _logging.getLogger(__name__)
-            from pathlib import Path as _Path
-
-            def _scan_cwd() -> set:
-                wd = _Path(working_directory)
-                if not wd.exists():
-                    return set()
-                # Filter on path segments BELOW working_directory only — the
-                # whole tree may live under a hidden parent like `.xagent_data`.
-                wd_resolved = wd.resolve()
-                results = set()
-                for p in wd.rglob("*"):
-                    if not p.is_file():
-                        continue
-                    try:
-                        rel_parts = p.resolve().relative_to(wd_resolved).parts
-                    except ValueError:
-                        continue
-                    if any(part.startswith(".") for part in rel_parts):
-                        continue
-                    if "__pycache__" in rel_parts or "node_modules" in rel_parts:
-                        continue
-                    results.add(p)
-                return results
-
-            files_before = _scan_cwd()
-            with self._workspace.auto_register_files():
-                result = executor.execute_code(exec_args.code, packages=pkg_list)
-            files_after = _scan_cwd()
-            new_files = files_after - files_before
-            for fp in new_files:
-                try:
-                    self._workspace.register_file(str(fp))
-                    _log.info(f"javascript_executor: backfill-registered new file {fp}")
-                except Exception as e:
-                    _log.warning(f"javascript_executor: failed to register {fp}: {e}")
+            with self._workspace.backfill_files_from_cwd(working_directory):
+                with self._workspace.auto_register_files():
+                    result = executor.execute_code(exec_args.code, packages=pkg_list)
         else:
             result = executor.execute_code(exec_args.code, packages=pkg_list)
 
diff --git a/src/xagent/core/tools/adapters/vibe/python_executor.py b/src/xagent/core/tools/adapters/vibe/python_executor.py
@@ -86,56 +86,19 @@ def run_json_sync(self, args: Mapping[str, Any]) -> Any:
 
         # Execute code within auto_register context.
         #
-        # The python_executor's `working_directory` is not always the same as
-        # `self._workspace.workspace_dir` — they can diverge when the agent
-        # workspace is keyed by raw task_id ("67") but the executor cwd lands
-        # in the per-user output tree ("web_task_67/output"). In that case the
-        # built-in `auto_register_files()` scan (which walks workspace_dir)
-        # would miss files saved by openpyxl/pptxgenjs/etc. via raw fs IO.
-        #
-        # Belt-and-braces: snapshot the executor's actual working_directory
-        # before/after the call and manually register any new files. This is
-        # a no-op when the dirs already coincide (file_id lookup deduplicates).
+        # ``working_directory`` is not always identical to
+        # ``self._workspace.workspace_dir`` — they diverge when the agent
+        # workspace is keyed by raw task_id ("67") but the executor cwd
+        # lands in the per-user output tree ("web_task_67/output").
+        # ``auto_register_files`` only walks workspace_dir, so it would
+        # miss files saved by openpyxl / pptxgenjs / etc. via raw fs IO.
+        # The shared ``backfill_files_from_cwd`` helper on TaskWorkspace
+        # snapshots cwd before/after and registers the diff. It's a
+        # no-op fast-path when cwd is already inside workspace_dir.
         if self._workspace and working_directory:
-            import logging as _logging
-
-            _log = _logging.getLogger(__name__)
-            from pathlib import Path as _Path
-
-            def _scan_cwd() -> set:
-                wd = _Path(working_directory)
-                if not wd.exists():
-                    return set()
-                # Only filter on the path SEGMENTS BELOW working_directory.
-                # We cannot reject paths whose parents are hidden (e.g.
-                # `.xagent_data`) because the whole tree lives under one.
-                wd_resolved = wd.resolve()
-                results = set()
-                for p in wd.rglob("*"):
-                    if not p.is_file():
-                        continue
-                    try:
-                        rel_parts = p.resolve().relative_to(wd_resolved).parts
-                    except ValueError:
-                        continue
-                    if any(part.startswith(".") for part in rel_parts):
-                        continue
-                    if "__pycache__" in rel_parts or "node_modules" in rel_parts:
-                        continue
-                    results.add(p)
-                return results
-
-            files_before = _scan_cwd()
-            with self._workspace.auto_register_files():
-                result = executor.execute_code(full_code, exec_args.capture_output)
-            files_after = _scan_cwd()
-            new_files = files_after - files_before
-            for fp in new_files:
-                try:
-                    self._workspace.register_file(str(fp))
-                    _log.info(f"python_executor: backfill-registered new file {fp}")
-                except Exception as e:
-                    _log.warning(f"python_executor: failed to register {fp}: {e}")
+            with self._workspace.backfill_files_from_cwd(working_directory):
+                with self._workspace.auto_register_files():
+                    result = executor.execute_code(full_code, exec_args.capture_output)
         else:
             result = executor.execute_code(full_code, exec_args.capture_output)
 
diff --git a/src/xagent/core/tools/core/workspace_file_tool.py b/src/xagent/core/tools/core/workspace_file_tool.py
@@ -529,13 +529,12 @@ def get_file_info(self, file_path_or_id: str) -> FileInfo:
         stat = resolved_path.stat()
 
         # Look up the registered file_id (UUID) so the LLM can build a chip
-        # link `[name](file:UUID)` in its final answer. If the file was not
-        # auto-registered, this returns None (and the chip won't render).
-        file_id: Optional[str] = None
-        try:
-            file_id = self.workspace.get_file_id_from_path(str(resolved_path))
-        except Exception:
-            file_id = None
+        # link `[name](file:UUID)` in its final answer. ``get_file_id_from_path``
+        # already returns None on any internal lookup failure, so no outer
+        # try/except is needed here.
+        file_id: Optional[str] = self.workspace.get_file_id_from_path(
+            str(resolved_path)
+        )
 
         return FileInfo(
             name=resolved_path.name,
diff --git a/src/xagent/core/workspace.py b/src/xagent/core/workspace.py
@@ -25,6 +25,38 @@
 # Context variable for auto-registration mode
 _auto_register = contextvars.ContextVar("_auto_register", default=False)
 
+# Directory names to skip when walking user-controlled trees. Hidden
+# subdirs (anything starting with ``.``) are also pruned.
+_IGNORED_DIR_NAMES: frozenset[str] = frozenset({"__pycache__", "node_modules"})
+
+
+def _scan_user_files(root: Path) -> set[Path]:
+    """Return every regular file under *root*, with ignored subtrees pruned.
+
+    Uses ``os.walk`` and trims ``dirnames`` in-place so the walk never
+    descends into hidden subdirs, ``__pycache__`` or ``node_modules``
+    (cheaper than ``rglob`` + filter). Hidden files and entries that are
+    not regular files (dangling symlinks, FIFOs, sockets) are skipped.
+
+    Returns absolute :class:`Path` objects when *root* is absolute, so
+    callers can diff snapshots across the same root without resolving
+    each entry inside the loop.
+    """
+    if not root.exists():
+        return set()
+    results: set[Path] = set()
+    for dir_path, dir_names, file_names in os.walk(root):
+        dir_names[:] = [
+            d
+            for d in dir_names
+            if not d.startswith(".") and d not in _IGNORED_DIR_NAMES
+        ]
+        for name in file_names:
+            candidate = Path(dir_path) / name
+            if not name.startswith(".") and candidate.is_file():
+                results.add(candidate)
+    return results
+
 
 def _safe_storage_relative_path(relative_path: str) -> str:
     path = Path(relative_path)
@@ -848,6 +880,50 @@ def auto_register_files(self) -> "Iterator[TaskWorkspace]":
                         f"File exists on disk but is not in database - will require backfill."
                     )
 
+    @contextmanager
+    def backfill_files_from_cwd(
+        self, working_directory: str | Path
+    ) -> "Iterator[TaskWorkspace]":
+        """Register files that appear in *working_directory* while the body runs.
+
+        ``auto_register_files`` only scans ``workspace_dir``, but an
+        executor's cwd can sit outside it (e.g. workspace id ``"67"``
+        with executor cwd ``"web_task_67/output"``). Files written there
+        through raw fs IO (``fs.writeFileSync``, ``pptxgenjs.writeFile``,
+        openpyxl, …) would then go unregistered. This context manager
+        snapshots *working_directory* on entry and on exit and calls
+        ``register_file`` for every path present only in the second scan.
+
+        When the resolved *working_directory* equals ``workspace_dir`` or
+        lies beneath it, nothing is scanned and the body simply runs:
+        ``auto_register_files`` is expected to cover that tree itself.
+
+        The exit scan runs in a ``finally`` block, so files are registered
+        even if the body raises. A failure to register one file is logged
+        as a warning and does not stop the remaining registrations.
+        Yields the workspace itself.
+        """
+        cwd = Path(working_directory).resolve()
+        workspace_root = self.workspace_dir.resolve()
+
+        # ``is_relative_to`` is also true for identical paths, so one check
+        # covers both "cwd is workspace_dir" and "cwd is nested inside it".
+        if cwd.is_relative_to(workspace_root):
+            yield self
+            return
+
+        snapshot_before = _scan_user_files(cwd)
+        try:
+            yield self
+        finally:
+            created = _scan_user_files(cwd) - snapshot_before
+            for path in created:
+                try:
+                    self.register_file(str(path))
+                    logger.info("backfill: registered %s", path)
+                except Exception as err:
+                    logger.warning("backfill: failed to register %s: %s", path, err)
+
     def _scan_all_files(self) -> set[Path]:
         """Scan all files in workspace and return as set."""
         files: set[Path] = set()