Fix code review issues: path validation, dead state, code quality

michaelchu · claude · michaelchu · commit 21f33e4e3aa9 · 2026-02-27T10:29:47.000-05:00
Security:
- Validate file_path in load_csv_data against uploaded_files whitelist
  using os.path.realpath to prevent path traversal
- Thread uploaded_files from agent → execute_tool → handler

Code quality:
- Move `import os` to top of _data_inspector.py
- Replace `assert el.path` with guard clause in app.py
- Derive CSV kwarg keys from datafeeds.default_kwargs (DRY)
- Use tmp_path fixture instead of hardcoded /tmp path in test

Tests:
- Add test_load_csv_rejects_non_uploaded_path
- Add test_load_csv_allows_uploaded_path

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
diff --git a/optopsy/ui/agent.py b/optopsy/ui/agent.py
@@ -701,6 +701,7 @@ async def chat(
                         self.datasets,
                         self.results,
                         dataset_fingerprint=self._dataset_fingerprint,
+                        uploaded_files=self.uploaded_files,
                     ),
                 )
                 # Update dataset and recompute fingerprint.  Always
diff --git a/optopsy/ui/app.py b/optopsy/ui/app.py
@@ -768,7 +768,8 @@ async def on_message(message: cl.Message):
     _upload_contexts: list[str] = []
     for el in csv_elements:
         try:
-            assert el.path is not None
+            if not el.path:
+                continue
             raw = pd.read_csv(el.path, nrows=5)
             label = el.name
             agent.uploaded_files[label] = el.path
diff --git a/optopsy/ui/tools/_data_inspector.py b/optopsy/ui/tools/_data_inspector.py
@@ -1,10 +1,12 @@
 """Data inspection tool handlers: load_csv_data, preview_data, describe_data, suggest_strategy_params."""
 
 import json as _json
+import os
 
 import pandas as pd
 
 import optopsy as op
+from optopsy.datafeeds import default_kwargs
 from optopsy.strategies._helpers import (
     _DEFAULT_ATM_DELTA,
     _DEFAULT_DEEP_ITM_DELTA,
@@ -17,34 +19,30 @@
 from ._helpers import _df_summary, _df_to_markdown
 from ._schemas import CALENDAR_STRATEGIES
 
+# Column kwargs accepted by csv_data(), derived from default_kwargs to stay DRY.
+_CSV_COL_KEYS = tuple(k for k in default_kwargs if k not in ("start_date", "end_date"))
+_CSV_KWARG_KEYS = tuple(default_kwargs.keys())
+
 
 @_register("load_csv_data")
 def _handle_load_csv_data(arguments, dataset, signals, datasets, results, _result):
     file_path = arguments.get("file_path")
     if not file_path:
         return _result("file_path is required.")
 
+    # Validate file_path against uploaded files to prevent path traversal.
+    allowed_paths = arguments.get("_uploaded_file_paths")
+    if allowed_paths is not None:
+        real_path = os.path.realpath(file_path)
+        allowed_real = {os.path.realpath(p) for p in allowed_paths}
+        if real_path not in allowed_real:
+            return _result(
+                f"Access denied: '{file_path}' is not a recognized uploaded file."
+            )
+
     # Build kwargs for csv_data() from the validated arguments.
     csv_kwargs = {}
-    for key in (
-        "start_date",
-        "end_date",
-        "underlying_symbol",
-        "underlying_price",
-        "option_type",
-        "expiration",
-        "quote_date",
-        "strike",
-        "bid",
-        "ask",
-        "delta",
-        "gamma",
-        "theta",
-        "vega",
-        "implied_volatility",
-        "volume",
-        "open_interest",
-    ):
+    for key in _CSV_KWARG_KEYS:
         val = arguments.get(key)
         if val is not None:
             csv_kwargs[key] = val
@@ -54,8 +52,6 @@ def _handle_load_csv_data(arguments, dataset, signals, datasets, results, _resul
     except Exception as e:
         return _result(f"Failed to load CSV: {e}")
 
-    import os
-
     label = os.path.basename(file_path)
     updated_datasets = {**datasets, label: df}
 
diff --git a/optopsy/ui/tools/_executor.py b/optopsy/ui/tools/_executor.py
@@ -210,6 +210,7 @@ def execute_tool(
     datasets: dict[str, pd.DataFrame] | None = None,
     results: dict[str, dict] | None = None,
     dataset_fingerprint: str | None = None,
+    uploaded_files: dict[str, str] | None = None,
 ) -> ToolResult:
     """
     Execute a tool call and return a ToolResult.
@@ -261,9 +262,11 @@ def execute_tool(
                 results=results,
             )
 
-    # Inject dataset fingerprint AFTER validation so Pydantic doesn't strip it.
+    # Inject internal metadata AFTER validation so Pydantic doesn't strip it.
     if dataset_fingerprint and tool_name in _CACHEABLE_TOOLS:
         arguments = {**arguments, "_dataset_fingerprint": dataset_fingerprint}
+    if uploaded_files and tool_name == "load_csv_data":
+        arguments = {**arguments, "_uploaded_file_paths": set(uploaded_files.values())}
 
     # Helper to build a ToolResult that always carries state forward.
     def _result(
diff --git a/tests/test_tools_integration.py b/tests/test_tools_integration.py
@@ -177,15 +177,38 @@ def test_load_csv_wrong_mapping_returns_error(self):
         )
         assert "Failed to load CSV" in r.llm_summary
 
-    def test_load_csv_missing_file(self):
+    def test_load_csv_missing_file(self, tmp_path):
         """Non-existent file returns an error."""
         r = execute_tool(
             "load_csv_data",
-            {"file_path": "/tmp/nonexistent_optopsy_test.csv"},
+            {"file_path": str(tmp_path / "nonexistent.csv")},
             None,
         )
         assert "Failed to load CSV" in r.llm_summary
 
+    def test_load_csv_rejects_non_uploaded_path(self):
+        """file_path not in uploaded_files is rejected when whitelist is present."""
+        csv_path = f"{TEST_DATA_DIR}/data_no_underlying_price.csv"
+        r = execute_tool(
+            "load_csv_data",
+            {"file_path": csv_path},
+            None,
+            uploaded_files={"other.csv": "/tmp/other.csv"},
+        )
+        assert "Access denied" in r.llm_summary
+
+    def test_load_csv_allows_uploaded_path(self):
+        """file_path in uploaded_files is accepted."""
+        csv_path = f"{TEST_DATA_DIR}/data_no_underlying_price.csv"
+        r = execute_tool(
+            "load_csv_data",
+            {"file_path": csv_path},
+            None,
+            uploaded_files={"data.csv": csv_path},
+        )
+        assert r.dataset is not None
+        assert "Access denied" not in r.llm_summary
+
     def test_load_then_preview(self):
         """Full flow: load_csv_data → preview_data on the loaded dataset."""
         csv_path = f"{TEST_DATA_DIR}/data_no_underlying_price.csv"

Original file line number	Diff line number	Diff line change
`@@ -701,6 +701,7 @@ async def chat(`
`701`	`701`	`self.datasets,`
`702`	`702`	`self.results,`
`703`	`703`	`dataset_fingerprint=self._dataset_fingerprint,`
	`704`	`+ uploaded_files=self.uploaded_files,`
`704`	`705`	`),`
`705`	`706`	`)`
`706`	`707`	`# Update dataset and recompute fingerprint. Always`