refactor(internal): parse projection params once; unify history projection; remove stale percentage

SealKan · SealKan · commit d9c98f67da5c · 2026-05-11T21:47:07.000+07:00
- _project_entity now accepts pre-parsed list[str]|None; callers parse
  fields/attribute_keys once before the entity loop (tools_search.py)
- ha_get_state now adds a warning when attribute_keys is supplied but
  'attributes' is not in fields= (previously the keys were silently ignored)
- _fetch_history/_fetch_statistics return unwrapped inner dicts; projection
  (project_fields) and then wrapping (add_timezone_metadata) both happen at
  the ha_get_history call site, consistent with every other tool
- Removed stale "94% token reduction" claim from ha_get_overview docstring
- Updated tests to match new signatures and shapes
diff --git a/src/ha_mcp/tools/tools_history.py b/src/ha_mcp/tools/tools_history.py
@@ -12,7 +12,7 @@
 import logging
 import re
 from datetime import UTC, datetime, timedelta
-from typing import Annotated, Any, Literal, cast
+from typing import Annotated, Any, Literal
 
 from fastmcp import Context
 from fastmcp.exceptions import ToolError
@@ -34,6 +34,7 @@
     build_pagination_metadata,
     coerce_int_param,
     parse_string_list_param,
+    project_fields,
 )
 
 logger = logging.getLogger(__name__)
@@ -339,14 +340,14 @@ async def ha_get_history(
 
             try:
                 if source == "statistics":
-                    result = await _fetch_statistics(
-                        ws_client, self._client, entity_id_list,
+                    inner = await _fetch_statistics(
+                        ws_client, entity_id_list,
                         start_dt, end_dt, period, statistic_types,
                         limit, offset,
                     )
                 else:
-                    result = await _fetch_history(
-                        ws_client, self._client, entity_id_list,
+                    inner = await _fetch_history(
+                        ws_client, entity_id_list,
                         start_dt, end_dt, minimal_response,
                         significant_changes_only, limit, offset,
                         _DEFAULT_HISTORY_LIMIT, _MAX_HISTORY_LIMIT,
@@ -357,16 +358,12 @@ async def ha_get_history(
                     total=3,
                     message="recorder query complete",
                 )
-                if fields is not None:
-                    parsed = parse_string_list_param(fields, "fields", allow_csv=True) or []
-                    keep = set(parsed) | {"success"}
-                    inner = result.get("data", result)
-                    if isinstance(inner, dict):
-                        result = {
-                            **result,
-                            "data": cast(dict[str, Any], {k: v for k, v in inner.items() if k in keep}),
-                        }
-                return result
+                # Project BEFORE wrapping so the helper applies at the same shape
+                # as every other tool (raw response dict). add_timezone_metadata
+                # wraps the result in {"data": ..., "metadata": ...} which would
+                # otherwise force a bespoke unwrap-project-rewrap site.
+                projected = project_fields(inner, fields)
+                return await add_timezone_metadata(self._client, projected)
             finally:
                 if ws_client:
                     await ws_client.disconnect()
@@ -464,7 +461,6 @@ def _parse_time_range(
 
 async def _fetch_history(
     ws_client: Any,
-    client: Any,
     entity_id_list: list[str],
     start_dt: datetime,
     end_dt: datetime,
@@ -475,7 +471,11 @@ async def _fetch_history(
     default_limit: int,
     max_limit: int,
 ) -> dict[str, Any]:
-    """Execute the history/history_during_period WebSocket call."""
+    """Execute the history/history_during_period WebSocket call.
+
+    Returns the unwrapped history dict; the caller is responsible for projection
+    and wrapping with ``add_timezone_metadata``.
+    """
     try:
         effective_limit = coerce_int_param(
             limit,
@@ -588,12 +588,11 @@ async def _fetch_history(
         },
     }
 
-    return await add_timezone_metadata(client, history_data)
+    return history_data
 
 
 async def _fetch_statistics(
     ws_client: Any,
-    client: Any,
     entity_id_list: list[str],
     start_dt: datetime,
     end_dt: datetime,
@@ -602,7 +601,11 @@ async def _fetch_statistics(
     limit: int | str | None,
     offset: int | str | None,
 ) -> dict[str, Any]:
-    """Execute the recorder/statistics_during_period WebSocket call."""
+    """Execute the recorder/statistics_during_period WebSocket call.
+
+    Returns the unwrapped statistics dict; the caller is responsible for projection
+    and wrapping with ``add_timezone_metadata``.
+    """
     try:
         effective_limit = coerce_int_param(
             limit,
@@ -762,4 +765,4 @@ async def _fetch_statistics(
             "These entities may not have state_class attribute or may not have recorded data yet."
         ]
 
-    return await add_timezone_metadata(client, statistics_data)
+    return statistics_data
diff --git a/src/ha_mcp/tools/tools_search.py b/src/ha_mcp/tools/tools_search.py
@@ -153,27 +153,30 @@ async def _exact_match_search(
 
 def _project_entity(
     record: dict[str, Any],
-    fields: str | list[str] | None,
-    attribute_keys: str | list[str] | None,
+    fields: list[str] | None,
+    attribute_keys: list[str] | None,
 ) -> dict[str, Any]:
     """Apply optional field projection to a HA entity record.
 
     ``fields`` filters which top-level keys to keep (e.g. ["state", "attributes"]).
     ``attribute_keys`` further filters the ``attributes`` sub-dict.
     Both default None = full payload (no-op).
-    Accepts a list or a CSV/JSON-array string for both parameters.
+
+    Both parameters are already parsed into ``list[str] | None`` — string/CSV inputs
+    must be normalised at the call site via ``parse_string_list_param`` (see
+    ``ha_get_state`` which parses once before the bulk loop to avoid re-parsing per
+    entity record).
     """
     if not isinstance(record, dict):
         return record  # non-dict (e.g. error path returning None) — skip projection
     if fields is not None:
-        parsed_fields = parse_string_list_param(fields, "fields", allow_csv=True) or []
-        keep = set(parsed_fields)
+        keep = set(fields)
         record = {k: v for k, v in record.items() if k in keep}
     if attribute_keys is not None:
-        parsed_attr_keys = parse_string_list_param(attribute_keys, "attribute_keys", allow_csv=True) or []
         attrs = record.get("attributes")
         if isinstance(attrs, dict):
-            record = {**record, "attributes": {k: v for k, v in attrs.items() if k in parsed_attr_keys}}
+            attr_keep = set(attribute_keys)
+            record = {**record, "attributes": {k: v for k, v in attrs.items() if k in attr_keep}}
     return record
 
 
@@ -920,8 +923,9 @@ async def ha_get_overview(
         Standard/full modes paginate entities (default 200 per page) — use offset
         to fetch more. Use 'domains' filter to narrow scope.
 
-        Use fields= to project the response to only the keys you need — up to 94%
-        token reduction when fetching a single sub-section (e.g. fields=["system_info"]).
+        Use fields= to project the response to only the keys you need — a
+        significantly smaller payload when fetching a single sub-section (e.g.
+        fields=["system_info"] returns just that section instead of the full overview).
         """
         # Validate fields= early so a malformed value returns VALIDATION_INVALID_PARAMETER
         # (ha_get_overview has no outer try/except, so ValueError would escape uncaught)
@@ -1250,17 +1254,61 @@ async def ha_get_state(
         Returns success=True if at least one entity state was retrieved.
         Check 'error_count' for any failed lookups in partial-success scenarios.
 
+        FIELDS PROJECTION:
+        `fields=` projects the per-entity record (`entity_id`, `state`, `attributes`,
+        `last_changed`, `last_updated`, `context`), NOT the outer bulk response wrapper.
+        In single-entity mode it filters keys of the returned record directly. In bulk
+        mode it filters keys of each record inside `states[entity_id]`; outer keys
+        (`success`, `count`, `states`, `errors`, ...) are always preserved.
+        `attribute_keys=` further narrows the `attributes` sub-dict and is only applied
+        when `"attributes"` is in `fields=` (or `fields=None`); otherwise it is a no-op.
+
         EXAMPLES:
         - Single: ha_get_state("light.kitchen")
         - Multiple: ha_get_state(["light.kitchen", "light.living_room", "sensor.temperature"])
         - State only: ha_get_state("light.kitchen", fields=["state"])
         - Slim bulk: ha_get_state(["light.kitchen", "sensor.temperature"], fields=["state", "attributes"], attribute_keys=["brightness"])
         """
+        # Parse projection params once up front so the bulk loop doesn't re-parse
+        # the same string/CSV input per entity (100 entities → 200 parses pre-fix).
+        # parse_string_list_param raises ValueError on bad input; surface as
+        # VALIDATION_INVALID_PARAMETER via the normal ToolError flow.
+        try:
+            parsed_fields = parse_string_list_param(fields, "fields", allow_csv=True)
+            parsed_attribute_keys = parse_string_list_param(
+                attribute_keys, "attribute_keys", allow_csv=True
+            )
+        except ValueError as e:
+            raise_tool_error(
+                create_validation_error(
+                    str(e),
+                    parameter=(
+                        "attribute_keys" if "attribute_keys" in str(e) else "fields"
+                    ),
+                )
+            )
+
+        # `attribute_keys` only takes effect when `attributes` is in the projected
+        # field set (or `fields=None`). Surface a warning rather than silently
+        # ignoring it — caller likely intended to slim attributes and would
+        # otherwise see an unfiltered or absent `attributes` key with no signal.
+        attribute_keys_no_effect = (
+            parsed_attribute_keys is not None
+            and parsed_fields is not None
+            and "attributes" not in parsed_fields
+        )
+
         # Single entity path
         if isinstance(entity_id, str):
             try:
                 result = await client.get_entity_state(entity_id)
-                result = _project_entity(result, fields, attribute_keys)
+                result = _project_entity(result, parsed_fields, parsed_attribute_keys)
+                if attribute_keys_no_effect and isinstance(result, dict):
+                    result["warning"] = (
+                        "attribute_keys was ignored because 'attributes' is not in "
+                        "fields=. Add 'attributes' to fields= (or omit fields=) to "
+                        "apply attribute_keys."
+                    )
                 return await add_timezone_metadata(client, result)
             except ToolError:
                 raise
@@ -1332,7 +1380,9 @@ async def _fetch_state(eid: str) -> dict[str, Any]:
 
             for eid, result in zip(unique_ids, results, strict=True):
                 if result.get("success") is True and "state" in result:
-                    states[eid] = _project_entity(result["state"], fields, attribute_keys)
+                    states[eid] = _project_entity(
+                        result["state"], parsed_fields, parsed_attribute_keys
+                    )
                 else:
                     error_detail = result.get("error")
                     if error_detail is None:
@@ -1353,6 +1403,13 @@ async def _fetch_state(eid: str) -> dict[str, Any]:
                 "states": states,
             }
 
+            if attribute_keys_no_effect:
+                response["warning"] = (
+                    "attribute_keys was ignored because 'attributes' is not in "
+                    "fields=. Add 'attributes' to fields= (or omit fields=) to "
+                    "apply attribute_keys."
+                )
+
             if errors:
                 response["errors"] = errors
                 response["error_count"] = len(errors)
diff --git a/tests/src/unit/test_context_injection.py b/tests/src/unit/test_context_injection.py
@@ -140,7 +140,10 @@ async def test_ha_get_history_works_without_ctx() -> None:
     ):
         result = await history_tool(entity_ids="sensor.test")
 
-    assert result is fake_result
+    # ha_get_history wraps the inner _fetch_history result via add_timezone_metadata
+    # — the inner payload must round-trip unchanged under fields=None.
+    assert result["data"] == fake_result
+    assert "metadata" in result
     fake_ws.disconnect.assert_awaited_once()
 
 
@@ -167,7 +170,8 @@ async def test_ha_get_history_emits_progress_with_ctx() -> None:
     ):
         result = await history_tool(entity_ids="sensor.test", ctx=ctx)
 
-    assert result is fake_result
+    assert result["data"] == fake_result
+    assert "metadata" in result
     ctx.info.assert_awaited()
     # Three events: connect, query dispatch, completion (progress jumps 1 -> 3).
     assert ctx.report_progress.await_count == 3
@@ -526,7 +530,8 @@ async def test_ha_get_history_statistics_emits_progress() -> None:
             entity_ids="sensor.test", source="statistics", period="day", ctx=ctx
         )
 
-    assert result is fake_result
+    assert result["data"] == fake_result
+    assert "metadata" in result
     assert ctx.report_progress.await_count == 3
     messages = _progress_messages(ctx)
     assert "querying recorder (statistics)" in messages[1]
diff --git a/tests/src/unit/test_get_state_fields_projection.py b/tests/src/unit/test_get_state_fields_projection.py
@@ -66,17 +66,8 @@ def test_does_not_mutate_original(self):
         assert "last_changed" in original
         assert original["attributes"]["color_temp"] == 3500
 
-    def test_fields_csv_string_input(self):
-        result = _project_entity(dict(_ENTITY_RECORD), "state,entity_id", None)
-        assert set(result.keys()) == {"state", "entity_id"}
-
-    def test_fields_json_array_string_input(self):
-        result = _project_entity(dict(_ENTITY_RECORD), '["state", "attributes"]', ["brightness"])
-        assert set(result.keys()) == {"state", "attributes"}
-        assert result["attributes"] == {"brightness": 200}
-
-    def test_attribute_keys_csv_string_input(self):
-        result = _project_entity(dict(_ENTITY_RECORD), None, "brightness,color_temp")
-        assert set(result["attributes"].keys()) == {"brightness", "color_temp"}
+    def test_non_dict_record_returned_unchanged(self):
+        # Defensive: error paths may pass None/non-dict; helper must not raise.
+        assert _project_entity(None, ["state"], None) is None  # type: ignore[arg-type]
 
 
diff --git a/tests/src/unit/test_overview_system_info.py b/tests/src/unit/test_overview_system_info.py
@@ -101,7 +101,7 @@ class TestHaGetOverviewFieldsProjection:
 
     Pins the contract from issue #1199: callers that only need one section
     (e.g. system_info) can request it via fields= and receive a response
-    that omits all other top-level keys, reducing token usage by up to 94%.
+    that omits all other top-level keys.
     """
 
     @pytest.fixture
diff --git a/tests/src/unit/test_tools_get_states.py b/tests/src/unit/test_tools_get_states.py
diff --git a/tests/src/unit/test_tools_history.py b/tests/src/unit/test_tools_history.py