llm-d · albertoperdomo2 · May 7, 2026 · May 20, 2026 · May 21, 2026
@@ -83,5 +83,6 @@ cufile.json
 # uds tokenizer default tokenizer cache path
 services/uds_tokenizer/tokenizers
 services/uds_tokenizer/.venv
+kv_connectors/llmd_fs_backend/.venv
 
 **/vllm_source
@@ -18,6 +18,10 @@ BUILDER := $(shell command -v buildah >/dev/null 2>&1 && echo buildah || echo $(
 UDS_TOKENIZER_IMAGE ?= llm-d-uds-tokenizer:e2e-test
 FS_BACKEND_NAME ?= llmd-fs-backend
 FS_BACKEND_DEV_IMG ?= $(IMAGE_TAG_BASE)/$(FS_BACKEND_NAME):$(DEV_VERSION)
+FS_BACKEND_DIR := kv_connectors/llmd_fs_backend
+FS_BACKEND_CPU_TESTS ?= $(FS_BACKEND_DIR)/tests/cpu
+FS_BACKEND_VENV_DIR := $(FS_BACKEND_DIR)/.venv
+FS_BACKEND_VENV_BIN := $(FS_BACKEND_VENV_DIR)/bin
 
 # go source files
 SRC = $(shell find . -type f -name '*.go')
@@ -65,13 +69,31 @@ clang:
 test: unit-test e2e-test ## Run all tests (unit + e2e)
 
 .PHONY: unit-test
-unit-test: unit-test-uds  ## Run unit tests
+unit-test: unit-test-uds unit-test-fs-backend-cpu  ## Run unit tests
 
 .PHONY: unit-test-uds
 unit-test-uds: check-go download-zmq ## Run unit tests
 	@printf "\033[33;1m==== Running unit tests ====\033[0m\n"
 	@go test -v ./pkg/...
 
+# Provision the FS backend CPU test virtualenv and install its requirements.
+# The existence check is keyed on the venv's pip rather than its python: a venv
+# whose pip bootstrap failed can still contain bin/python, and such a half-built
+# environment must be recreated instead of being reported as ready.
+# Creation steps are chained with && so a failed step aborts the recipe at once.
+# pip is always invoked as "python -m pip" so the venv interpreter is the one used.
+.PHONY: fs-backend-cpu-install-deps
+fs-backend-cpu-install-deps: ## Set up venv and install FS backend CPU test dependencies
+	@printf "\033[33;1m==== Setting up FS backend CPU test venv ====\033[0m\n"
+	@if [ ! -x "$(FS_BACKEND_VENV_BIN)/pip" ]; then \
+		echo "Creating virtual environment in $(FS_BACKEND_VENV_DIR)..." && \
+		$(PYTHON_EXE) -m venv "$(FS_BACKEND_VENV_DIR)" && \
+		echo "Upgrading pip..." && \
+		"$(FS_BACKEND_VENV_BIN)/python" -m pip install --upgrade pip > /dev/null; \
+	else \
+		echo "Virtual environment already exists"; \
+	fi
+	@"$(FS_BACKEND_VENV_BIN)/python" -m pip install -q -r "$(FS_BACKEND_DIR)/tests/requirements-cpu.txt"
+
+# Runs pytest on $(FS_BACKEND_CPU_TESTS) (overridable, declared with ?=) using
+# the interpreter from the venv that fs-backend-cpu-install-deps provisions.
+.PHONY: unit-test-fs-backend-cpu
+unit-test-fs-backend-cpu: fs-backend-cpu-install-deps ## Run CPU-safe FS backend Python unit tests
+	@printf "\033[33;1m==== Running CPU-safe FS backend unit tests ====\033[0m\n"
+	@$(FS_BACKEND_VENV_BIN)/python -m pytest -q $(FS_BACKEND_CPU_TESTS)
+
 .PHONY: unit-test-race
 unit-test-race: check-go download-zmq ## Run unit tests with Go race detector enabled
 	@printf "\033[33;1m==== Running unit tests with race detector ====\033[0m\n"

@@ -22,8 +22,6 @@
 sys.path.insert(0, str(Path(__file__).parent))
 
 import pytest
-import torch
-from vllm.config import VllmConfig, set_current_vllm_config
 
 
 def pytest_addoption(parser):
@@ -35,20 +33,38 @@ def pytest_addoption(parser):
     parser.addoption("--obj-ca_bundle", default=None)
 
 
-@pytest.fixture(scope="session", autouse=True)
-def require_cuda():
+def pytest_configure(config):
+    """Register the ``no_cuda_required`` marker in pytest's ``markers`` ini list."""
+    config.addinivalue_line(
+        "markers",
+        "no_cuda_required: mark a test as not requiring CUDA setup/teardown",
+    )
+
+
+@pytest.fixture(autouse=True)
+def require_cuda(request):
-    """Skip all tests in this session if CUDA is not available."""
+    """Skip the requesting test when CUDA is not available.
+
+    Tests marked ``no_cuda_required`` return before the check, so ``torch``
+    is only imported for tests that are not marked.
+    """
+    if request.node.get_closest_marker("no_cuda_required"):
+        return
+
+    import torch
+
     if not torch.cuda.is_available():
         pytest.skip("CUDA not available")
 
 
 @pytest.fixture(autouse=True)
-def cuda_teardown():
+def cuda_teardown(request):
     """Ensure CUDA and C++ thread-pool resources from one test are fully
     released before the next test starts. Without this, async destructors
     can cause 'cudaErrorUnknown' or stale file-open errors in subsequent tests.
     """
+    if request.node.get_closest_marker("no_cuda_required"):
+        yield
+        return
+
     yield
+    import torch
+
     gc.collect()  # force Python GC to call C++ destructors immediately
     torch.cuda.synchronize()  # surface any async CUDA errors in the right test
     torch.cuda.empty_cache()  # free cached allocations so next test starts clean
@@ -61,6 +77,8 @@ def default_vllm_config():
     that use get_current_vllm_config() outside of a full engine context.
     This matches vLLM's internal test fixture pattern.
     """
+    from vllm.config import VllmConfig, set_current_vllm_config
+
     # Use empty VllmConfig() which provides sensible defaults
     with set_current_vllm_config(VllmConfig()):
         yield
@@ -20,8 +20,11 @@
 from pathlib import Path
 
 import msgpack
+import pytest
 
-CONNECTOR_ROOT = Path(__file__).resolve().parents[1]
+pytestmark = pytest.mark.no_cuda_required
+
+CONNECTOR_ROOT = Path(__file__).resolve().parents[2]
 
 
 class PrepareStoreOutput:

@@ -0,0 +1,3 @@
+pytest
+msgpack
+pyzmq
-Original file line number
+Diff line change
@@ -0,0 +1,3 @@
+    pytest
+    msgpack
+    pyzmq