Skip to content

Commit 2df5151

Browse files
author
martinma51
committed
refactor(tools): share cwd-backfill helper + skip redundant scans
Addresses all five Gemini-code-assist review points on PR xorbitsai#460. Centralise the manual file-registration backfill that python_executor and javascript_executor were each doing inline. Both adapters now just call ``workspace.backfill_files_from_cwd(working_directory)`` around the existing ``auto_register_files()`` block. Key changes from the duplicated inline implementations: 1. **Skip-when-redundant fast path.** When the executor's cwd is already inside ``workspace_dir`` (the common case in tests and for tasks whose workspace id matches the executor's working directory), ``auto_register_files`` already covers those files on its own scan. The new helper detects this with ``wd.is_relative_to(workspace_dir)`` and yields immediately without doing any second walk — avoiding the double-scan Gemini called out. 2. **``os.walk`` with in-place dir pruning** replaces ``rglob("*")`` + post-filter. Hidden subdirs, ``__pycache__`` and ``node_modules`` are pruned from ``dirnames`` so the walk never descends into them — significantly faster on trees that contain large ``node_modules`` directories (which is exactly the case for the pptxgenjs-using JS executor). 3. **No more ``p.resolve()`` inside the snapshot loop.** ``os.walk`` already yields absolute paths when given an absolute root, so we resolve the root once outside the walk and reuse it. Each entry in the resulting set is already absolute and comparable across the before/after snapshots without further normalisation. 4. **Module-level logger.** The inline helpers were creating a per-call ``_log`` shadow of the module logger; both adapters now use the module-level ``logger`` they already declared at import time. 5. **No per-call function redefinition.** The inline ``_scan_cwd`` was being defined fresh on every executor invocation; the extracted ``_scan_user_files`` is a module-level function. Also drop the redundant ``try/except`` around ``self.workspace.get_file_id_from_path(...)`` in ``workspace_file_tool.py``: that method already swallows internal exceptions and returns ``None``, so the outer guard was dead code. Tests: 60 tests across javascript_executor / python_executor / workspace / file_analysis suites pass; ruff check + format clean.
1 parent 0f3c308 commit 2df5151

4 files changed

Lines changed: 105 additions & 100 deletions

File tree

src/xagent/core/tools/adapters/vibe/javascript_executor.py

Lines changed: 11 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -81,51 +81,18 @@ def run_json_sync(self, args: Mapping[str, Any]) -> Any:
8181
executor = JavaScriptExecutorCore(working_directory)
8282

8383
# Execute code within auto_register context.
84-
# Same caveat as python_executor: the executor's working_directory may
85-
# not be inside `self._workspace.workspace_dir` (e.g. when agent
86-
# workspace id is "67" but executor cwd is "web_task_67/output"). In
87-
# that case the built-in scan misses files written via `fs.writeFileSync`,
88-
# `pptxgenjs.writeFile`, etc. Belt-and-braces: scan the actual cwd
89-
# before/after the call and manually register new files.
84+
# The executor's working_directory may sit outside
85+
# ``self._workspace.workspace_dir`` (e.g. workspace id "67" but
86+
# executor cwd "web_task_67/output"), in which case
87+
# ``auto_register_files`` would miss files saved through raw fs
88+
# IO (``fs.writeFileSync``, ``pptxgenjs.writeFile``, …). The shared
89+
# ``backfill_files_from_cwd`` helper on TaskWorkspace handles the
90+
# belt-and-braces snapshot — and is a no-op fast-path when the
91+
# cwd is already inside workspace_dir, so we don't double-scan.
9092
if self._workspace and working_directory:
91-
import logging as _logging
92-
93-
_log = _logging.getLogger(__name__)
94-
from pathlib import Path as _Path
95-
96-
def _scan_cwd() -> set:
97-
wd = _Path(working_directory)
98-
if not wd.exists():
99-
return set()
100-
# Filter on path segments BELOW working_directory only — the
101-
# whole tree may live under a hidden parent like `.xagent_data`.
102-
wd_resolved = wd.resolve()
103-
results = set()
104-
for p in wd.rglob("*"):
105-
if not p.is_file():
106-
continue
107-
try:
108-
rel_parts = p.resolve().relative_to(wd_resolved).parts
109-
except ValueError:
110-
continue
111-
if any(part.startswith(".") for part in rel_parts):
112-
continue
113-
if "__pycache__" in rel_parts or "node_modules" in rel_parts:
114-
continue
115-
results.add(p)
116-
return results
117-
118-
files_before = _scan_cwd()
119-
with self._workspace.auto_register_files():
120-
result = executor.execute_code(exec_args.code, packages=pkg_list)
121-
files_after = _scan_cwd()
122-
new_files = files_after - files_before
123-
for fp in new_files:
124-
try:
125-
self._workspace.register_file(str(fp))
126-
_log.info(f"javascript_executor: backfill-registered new file {fp}")
127-
except Exception as e:
128-
_log.warning(f"javascript_executor: failed to register {fp}: {e}")
93+
with self._workspace.backfill_files_from_cwd(working_directory):
94+
with self._workspace.auto_register_files():
95+
result = executor.execute_code(exec_args.code, packages=pkg_list)
12996
else:
13097
result = executor.execute_code(exec_args.code, packages=pkg_list)
13198

src/xagent/core/tools/adapters/vibe/python_executor.py

Lines changed: 12 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -86,56 +86,19 @@ def run_json_sync(self, args: Mapping[str, Any]) -> Any:
8686

8787
# Execute code within auto_register context.
8888
#
89-
# The python_executor's `working_directory` is not always the same as
90-
# `self._workspace.workspace_dir` — they can diverge when the agent
91-
# workspace is keyed by raw task_id ("67") but the executor cwd lands
92-
# in the per-user output tree ("web_task_67/output"). In that case the
93-
# built-in `auto_register_files()` scan (which walks workspace_dir)
94-
# would miss files saved by openpyxl/pptxgenjs/etc. via raw fs IO.
95-
#
96-
# Belt-and-braces: snapshot the executor's actual working_directory
97-
# before/after the call and manually register any new files. This is
98-
# a no-op when the dirs already coincide (file_id lookup deduplicates).
89+
# ``working_directory`` is not always identical to
90+
# ``self._workspace.workspace_dir`` — they diverge when the agent
91+
# workspace is keyed by raw task_id ("67") but the executor cwd
92+
# lands in the per-user output tree ("web_task_67/output").
93+
# ``auto_register_files`` only walks workspace_dir, so it would
94+
# miss files saved by openpyxl / pptxgenjs / etc. via raw fs IO.
95+
# The shared ``backfill_files_from_cwd`` helper on TaskWorkspace
96+
# snapshots cwd before/after and registers the diff. It's a
97+
# no-op fast-path when cwd is already inside workspace_dir.
9998
if self._workspace and working_directory:
100-
import logging as _logging
101-
102-
_log = _logging.getLogger(__name__)
103-
from pathlib import Path as _Path
104-
105-
def _scan_cwd() -> set:
106-
wd = _Path(working_directory)
107-
if not wd.exists():
108-
return set()
109-
# Only filter on the path SEGMENTS BELOW working_directory.
110-
# We cannot reject paths whose parents are hidden (e.g.
111-
# `.xagent_data`) because the whole tree lives under one.
112-
wd_resolved = wd.resolve()
113-
results = set()
114-
for p in wd.rglob("*"):
115-
if not p.is_file():
116-
continue
117-
try:
118-
rel_parts = p.resolve().relative_to(wd_resolved).parts
119-
except ValueError:
120-
continue
121-
if any(part.startswith(".") for part in rel_parts):
122-
continue
123-
if "__pycache__" in rel_parts or "node_modules" in rel_parts:
124-
continue
125-
results.add(p)
126-
return results
127-
128-
files_before = _scan_cwd()
129-
with self._workspace.auto_register_files():
130-
result = executor.execute_code(full_code, exec_args.capture_output)
131-
files_after = _scan_cwd()
132-
new_files = files_after - files_before
133-
for fp in new_files:
134-
try:
135-
self._workspace.register_file(str(fp))
136-
_log.info(f"python_executor: backfill-registered new file {fp}")
137-
except Exception as e:
138-
_log.warning(f"python_executor: failed to register {fp}: {e}")
99+
with self._workspace.backfill_files_from_cwd(working_directory):
100+
with self._workspace.auto_register_files():
101+
result = executor.execute_code(full_code, exec_args.capture_output)
139102
else:
140103
result = executor.execute_code(full_code, exec_args.capture_output)
141104

src/xagent/core/tools/core/workspace_file_tool.py

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -529,13 +529,12 @@ def get_file_info(self, file_path_or_id: str) -> FileInfo:
529529
stat = resolved_path.stat()
530530

531531
# Look up the registered file_id (UUID) so the LLM can build a chip
532-
# link `[name](file:UUID)` in its final answer. If the file was not
533-
# auto-registered, this returns None (and the chip won't render).
534-
file_id: Optional[str] = None
535-
try:
536-
file_id = self.workspace.get_file_id_from_path(str(resolved_path))
537-
except Exception:
538-
file_id = None
532+
# link `[name](file:UUID)` in its final answer. ``get_file_id_from_path``
533+
# already returns None on any internal lookup failure, so no outer
534+
# try/except is needed here.
535+
file_id: Optional[str] = self.workspace.get_file_id_from_path(
536+
str(resolved_path)
537+
)
539538

540539
return FileInfo(
541540
name=resolved_path.name,

src/xagent/core/workspace.py

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,38 @@
2525
# Context variable for auto-registration mode
2626
_auto_register = contextvars.ContextVar("_auto_register", default=False)
2727

28+
# Directory names to skip when walking user-controlled trees. Hidden
29+
# subdirs (anything starting with ``.``) are also pruned.
30+
_IGNORED_DIR_NAMES: frozenset[str] = frozenset({"__pycache__", "node_modules"})
31+
32+
33+
def _scan_user_files(root: Path) -> set[Path]:
34+
"""Return every regular file under *root*, with ignored subtrees pruned.
35+
36+
Uses ``os.walk`` and trims ``dirnames`` in-place so we do **not**
37+
descend into hidden subdirs, ``__pycache__``, or ``node_modules`` —
38+
cheaper than ``rglob`` + filter, which still traverses the entire
39+
tree before discarding it. Hidden files are skipped too.
40+
41+
Returns absolute :class:`Path` objects when *root* is absolute, so
42+
callers can diff snapshots across the same root without resolving
43+
each entry inside the loop.
44+
"""
45+
if not root.exists():
46+
return set()
47+
results: set[Path] = set()
48+
for dir_path, dir_names, file_names in os.walk(root):
49+
dir_names[:] = [
50+
d
51+
for d in dir_names
52+
if not d.startswith(".") and d not in _IGNORED_DIR_NAMES
53+
]
54+
for name in file_names:
55+
if name.startswith("."):
56+
continue
57+
results.add(Path(dir_path) / name)
58+
return results
59+
2860

2961
def _safe_storage_relative_path(relative_path: str) -> str:
3062
path = Path(relative_path)
@@ -848,6 +880,50 @@ def auto_register_files(self) -> "Iterator[TaskWorkspace]":
848880
f"File exists on disk but is not in database - will require backfill."
849881
)
850882

883+
@contextmanager
884+
def backfill_files_from_cwd(
885+
self, working_directory: str | Path
886+
) -> "Iterator[TaskWorkspace]":
887+
"""Manually register files created in *working_directory* after the body runs.
888+
889+
The executor's ``working_directory`` is sometimes outside
890+
``workspace_dir`` (e.g. workspace id ``"67"`` but executor cwd
891+
``"web_task_67/output"``). When that happens, ``auto_register_files``
892+
— which only scans ``workspace_dir`` — misses files saved through
893+
raw fs IO (``fs.writeFileSync``, ``pptxgenjs.writeFile``, openpyxl,
894+
etc.). This helper snapshots *working_directory* before/after the
895+
body and manually registers any new files.
896+
897+
**No-op fast path:** when *working_directory* is already inside
898+
the workspace tree, ``auto_register_files`` will see the new
899+
files on its own scan; we skip the manual snapshot entirely to
900+
avoid double-walking the same tree.
901+
902+
Centralised here so the executor adapters (python_executor,
903+
javascript_executor, …) share one implementation instead of
904+
duplicating the scan + diff + register loop.
905+
"""
906+
wd_path = Path(working_directory).resolve()
907+
ws_root = self.workspace_dir.resolve()
908+
909+
# If the executor's cwd already lives under workspace_dir,
910+
# auto_register_files()'s own walk covers it — don't double-scan.
911+
if wd_path == ws_root or wd_path.is_relative_to(ws_root):
912+
yield self
913+
return
914+
915+
files_before = _scan_user_files(wd_path)
916+
try:
917+
yield self
918+
finally:
919+
files_after = _scan_user_files(wd_path)
920+
for fp in files_after - files_before:
921+
try:
922+
self.register_file(str(fp))
923+
logger.info("backfill: registered %s", fp)
924+
except Exception as exc:
925+
logger.warning("backfill: failed to register %s: %s", fp, exc)
926+
851927
def _scan_all_files(self) -> set[Path]:
852928
"""Scan all files in workspace and return as set."""
853929
files: set[Path] = set()

0 commit comments

Comments
 (0)