fix(sync): avoid shell for scan subprocesses (#814)

phernandez · web-flow · commit 55f314237d30 · 2026-05-11T09:20:11.000-05:00
Signed-off-by: phernandez <paul@basicmachines.co>
diff --git a/SECURITY.md b/SECURITY.md
@@ -8,6 +8,71 @@
 
 ## Reporting a Vulnerability
 
-Use this section to tell people how to report a vulnerability.
+If you find a vulnerability, please contact hello@basicmachines.co.
 
-If you find a vulnerability, please contact hello@basicmachines.co
+Please do not open a public GitHub issue for security vulnerabilities. We aim
+to respond within 72 hours and will coordinate a fix and disclosure timeline
+with you.
+
+## Threat Model
+
+Basic Memory is a local-first MCP server that reads and writes markdown files
+inside configured project directories. It runs on your machine with your user
+permissions, so local configuration deserves the same care as any other
+developer tool that can access your files.
+
+### What Basic Memory Controls
+
+- Filesystem-touching tools validate paths against the configured project root
+  with `validate_project_path()`, resolved paths, and `Path.is_relative_to()`.
+  Path traversal attempts such as `../../etc/passwd` are blocked at this layer.
+- Scan optimizations in `sync_service.py` call `find` through
+  `asyncio.create_subprocess_exec()` with explicit argument lists. Project paths
+  are passed as data, not interpolated into shell strings.
+- Auto-update code uses hardcoded commands, list-form arguments, and
+  `stdin=DEVNULL`. User-controlled strings do not reach a shell there.
+
+### MCP Client-Side Risk
+
+Recent MCP ecosystem research has highlighted a client-side pattern where an
+MCP host can be configured to run arbitrary commands as "servers." That risk is
+in the host configuration, not in notes or Basic Memory tool input.
+
+The recommended Basic Memory MCP configuration uses a known command with
+explicit arguments:
+
+```json
+{
+  "mcpServers": {
+    "basic-memory": {
+      "command": "uvx",
+      "args": ["basic-memory", "mcp"]
+    }
+  }
+}
+```
+
+Only add MCP server entries from sources you trust. Avoid inline shell scripts
+or command strings copied from untrusted sources. Treat third-party MCP server
+configuration with the same scrutiny as any locally executed program.
+
+Related ecosystem context:
+
+- OX Security: The Mother of All AI Supply Chains
+- CSO Online: RCE by design: MCP architectural choice haunts AI agent ecosystem
+
+### Out Of Scope
+
+- Basic Memory does not execute note content as code. Notes are returned as
+  data to the LLM.
+- Basic Memory does not open network ports by default. The MCP server uses
+  stdio; the optional REST API is intended for localhost use.
+- Basic Memory is designed for single-user local knowledge bases and does not
+  implement access controls between operating-system users.
+
+## Secure Configuration Checklist
+
+- MCP config `command` points to `uvx` or a trusted binary, not a shell string.
+- Project paths in Basic Memory config come from trusted local configuration.
+- If exposing the REST API, bind it only to localhost.
+- Review any third-party MCP servers before adding them to your host config.
diff --git a/src/basic_memory/sync/sync_service.py b/src/basic_memory/sync/sync_service.py
@@ -1530,12 +1530,36 @@ async def _quick_count_files(self, directory: Path) -> int:
                 count += 1
             return count
 
-        process = await asyncio.create_subprocess_shell(
-            f'find "{directory}" -type f | wc -l',
+        # Trigger: large-project scan optimization needs the OS `find` command.
+        # Why: passing argv directly avoids shell interpretation of configured project paths.
+        # Outcome: quotes and shell metacharacters in paths are treated as data.
+        process = await asyncio.create_subprocess_exec(
+            "find",
+            str(directory),
+            "-type",
+            "f",
+            "-print0",
             stdout=asyncio.subprocess.PIPE,
             stderr=asyncio.subprocess.PIPE,
         )
-        stdout, stderr = await process.communicate()
+
+        count = 0
+        stderr_task = None
+        if process.stderr is not None:
+            stderr_task = asyncio.create_task(process.stderr.read())
+
+        if process.stdout is None:
+            await process.wait()
+        else:
+            # Trigger: `find` can emit one path per file for very large projects.
+            # Why: collecting every path via communicate() scales memory with path bytes.
+            # Outcome: count null-delimited records in fixed-size chunks.
+            while chunk := await process.stdout.read(1024 * 1024):
+                count += chunk.count(b"\0")
+
+            await process.wait()
+
+        stderr = await stderr_task if stderr_task is not None else b""
 
         if process.returncode != 0:
             error_msg = stderr.decode().strip()
@@ -1550,7 +1574,7 @@ async def _quick_count_files(self, directory: Path) -> int:
                 count += 1
             return count
 
-        return int(stdout.strip())
+        return count
 
     async def _scan_directory_modified_since(
         self, directory: Path, since_timestamp: float
@@ -1583,8 +1607,16 @@ async def _scan_directory_modified_since(
         # Convert timestamp to find-compatible format
         since_date = datetime.fromtimestamp(since_timestamp).strftime("%Y-%m-%d %H:%M:%S")
 
-        process = await asyncio.create_subprocess_shell(
-            f'find "{directory}" -type f -newermt "{since_date}"',
+        # Trigger: incremental scans ask `find` to filter by modification time.
+        # Why: passing argv directly avoids shell interpretation of paths and timestamps.
+        # Outcome: optimized scanning keeps its speed without a shell injection boundary.
+        process = await asyncio.create_subprocess_exec(
+            "find",
+            str(directory),
+            "-type",
+            "f",
+            "-newermt",
+            since_date,
             stdout=asyncio.subprocess.PIPE,
             stderr=asyncio.subprocess.PIPE,
         )
diff --git a/tests/sync/test_sync_service_subprocess.py b/tests/sync/test_sync_service_subprocess.py
@@ -0,0 +1,108 @@
+"""Tests for safe subprocess usage in sync scan optimizations."""
+
+from datetime import datetime
+import sys
+
+import pytest
+
+import basic_memory.sync.sync_service as sync_service_module
+
+
+class _FakeProcess:
+    def __init__(self, stdout: bytes, returncode: int = 0):
+        self._stdout = stdout
+        self.returncode = returncode
+
+    async def communicate(self):
+        return self._stdout, b""
+
+
+class _FakeStream:
+    def __init__(self, chunks: list[bytes]):
+        self._chunks = chunks
+
+    async def read(self, _limit: int = -1):
+        if not self._chunks:
+            return b""
+        return self._chunks.pop(0)
+
+
+class _FakeStreamingProcess:
+    def __init__(self, stdout_chunks: list[bytes], returncode: int = 0):
+        self.stdout = _FakeStream(stdout_chunks)
+        self.stderr = _FakeStream([b""])
+        self.returncode = returncode
+
+    async def wait(self):
+        return self.returncode
+
+    async def communicate(self):  # pragma: no cover - failure path for the assertion below
+        raise AssertionError("_quick_count_files must stream stdout instead of buffering it")
+
+
+@pytest.mark.asyncio
+@pytest.mark.skipif(sys.platform == "win32", reason="Windows path uses Python scan fallback")
+async def test_quick_count_files_uses_exec_without_shell(monkeypatch, sync_service, tmp_path):
+    """Directory names with shell metacharacters must be passed as exec args."""
+    directory = tmp_path / 'project "quoted"; echo unsafe'
+    captured_args: list[tuple[str, ...]] = []
+
+    async def fail_if_shell_called(*args, **kwargs):
+        raise AssertionError("create_subprocess_shell must not be used")
+
+    async def fake_create_subprocess_exec(*args, **kwargs):
+        captured_args.append(args)
+        return _FakeStreamingProcess(stdout_chunks=[b"/project/a.md\0", b"/project/b.md\0"])
+
+    monkeypatch.setattr(
+        sync_service_module.asyncio, "create_subprocess_shell", fail_if_shell_called
+    )
+    monkeypatch.setattr(
+        sync_service_module.asyncio,
+        "create_subprocess_exec",
+        fake_create_subprocess_exec,
+    )
+
+    count = await sync_service._quick_count_files(directory)
+
+    assert count == 2
+    assert captured_args == [("find", str(directory), "-type", "f", "-print0")]
+
+
+@pytest.mark.asyncio
+@pytest.mark.skipif(sys.platform == "win32", reason="Windows path uses Python scan fallback")
+async def test_scan_directory_modified_since_uses_exec_without_shell(
+    monkeypatch,
+    sync_service,
+    tmp_path,
+):
+    """Incremental scan should pass the timestamp and directory as exec args."""
+    directory = tmp_path / "project $(echo unsafe)"
+    since_timestamp = 1_700_000_000.0
+    since_date = datetime.fromtimestamp(since_timestamp).strftime("%Y-%m-%d %H:%M:%S")
+    captured_args: list[tuple[str, ...]] = []
+
+    async def fail_if_shell_called(*args, **kwargs):
+        raise AssertionError("create_subprocess_shell must not be used")
+
+    async def fake_create_subprocess_exec(*args, **kwargs):
+        captured_args.append(args)
+        return _FakeProcess(
+            stdout=(f"{directory / 'notes' / 'a.md'}\n{directory / 'notes' / 'b.md'}\n").encode()
+        )
+
+    monkeypatch.setattr(
+        sync_service_module.asyncio, "create_subprocess_shell", fail_if_shell_called
+    )
+    monkeypatch.setattr(
+        sync_service_module.asyncio,
+        "create_subprocess_exec",
+        fake_create_subprocess_exec,
+    )
+
+    paths = await sync_service._scan_directory_modified_since(directory, since_timestamp)
+
+    assert paths == ["notes/a.md", "notes/b.md"]
+    assert captured_args == [
+        ("find", str(directory), "-type", "f", "-newermt", since_date),
+    ]