Address PR review + publish-pipeline audit hardening

anassg-lago · anassg-lago · commit ec249dba8fd1 · 2026-06-03T06:55:40.000-07:00
Bugs
  - Anthropic messages.create(stream=True) under-billed input tokens. The
    stream wrapper read only the top-level `usage`, which on a basic stream
    appears only on message_delta as {output_tokens: N}. The authoritative
    input/cache counts arrive nested under message.usage on message_start and
    were dropped, so input billed 0. New _merge_stream_usage folds both
    locations (message_start input/cache + message_delta cumulative output)
    across sync and async paths. Fixtures now use the realistic wire shape
    (message_delta carries no input echo), so the stream tests are genuine
    regressions.

  - Legacy google-generativeai SDK silently emitted nothing. The detector
    matched both google-genai and the deprecated google-generativeai, but the
    wrapper only instruments the unified Client.models/.aio surface, so a
    legacy GenerativeModel wrapped nothing. Detector now returns a distinct
    'gemini_legacy' kind and wrap() rejects it with a migrate-to-google-genai
    message. ("genai" is not a substring of "generativeai", so no overlap.)

Docs
  - README: cache_read / audio_input / image_input are subsets of input for
    OpenAI and Gemini, not additive — summing them double-counts.

Publish-workflow hardening
  - Least-privilege default (permissions: contents: read); only publish gets
    id-token: write, only release gets contents: write.
  - All third-party actions pinned to full commit SHAs (version in comment).
  - Added `if: startsWith(github.ref, 'refs/tags/v')` to the publish job as
    defense-in-depth.
  - Added .github/dependabot.yml (github-actions) to keep SHA pins fresh.
  - RELEASING.md documents pypi environment protection (required reviewers +
    protected-tag restriction) as a REQUIRED setup step.

Gate: ruff + format + mypy clean; 319 unit tests pass; coverage 89.27%.
diff --git a/.github/dependabot.yml b/.github/dependabot.yml
@@ -0,0 +1,18 @@
+version: 2
+
+# Keep the GitHub Actions in our workflows up to date. We pin every action to a
+# full commit SHA for supply-chain safety (see .github/workflows/publish.yml),
+# which means SHAs don't auto-update — Dependabot opens PRs that bump the SHA
+# and the trailing version comment together, so the pins stay both fixed and
+# fresh (and pick up upstream security patches).
+updates:
+  - package-ecosystem: "github-actions"
+    directory: "/"
+    schedule:
+      interval: "weekly"
+    commit-message:
+      prefix: "ci"
+    groups:
+      github-actions:
+        patterns:
+          - "*"
diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
@@ -12,29 +12,43 @@ name: publish
 #   Workflow name:     publish.yml
 #   Environment name:  pypi
 #
+# Also configure deployment protection on the `pypi` GitHub environment
+# (required reviewers + restrict to protected v*.*.* tags) — see RELEASING.md.
+#
 # After setup, releasing is one command from your laptop:
 #   git tag v0.1.0 && git push --tags
+#
+# Third-party actions are pinned to full commit SHAs (not tags) so a hijacked
+# or re-pointed tag can't inject code into a job that mints OIDC tokens. The
+# trailing comment records the human-readable version each SHA corresponds to.
 
 on:
   push:
     tags:
       - "v*.*.*"
 
+# Least-privilege default for every job. Jobs that need more (OIDC, release
+# creation) opt in explicitly below.
+permissions:
+  contents: read
+
 jobs:
   # ----------------------------------------------------------------------
   # 1. Run the full CI gate first. If anything is red, abort before build.
   # ----------------------------------------------------------------------
   test:
     name: test (py${{ matrix.python-version }})
     runs-on: ubuntu-latest
+    permissions:
+      contents: read
     strategy:
       fail-fast: true
       matrix:
         python-version: ["3.10", "3.11", "3.12"]
     steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
       - name: Install uv
-        uses: astral-sh/setup-uv@v4
+        uses: astral-sh/setup-uv@38f3f104447c67c051c4a08e39b64a148898af3a # v4.2.0
         with:
           python-version: ${{ matrix.python-version }}
           enable-cache: true
@@ -52,10 +66,12 @@ jobs:
     name: build
     needs: test
     runs-on: ubuntu-latest
+    permissions:
+      contents: read
     steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
       - name: Install uv
-        uses: astral-sh/setup-uv@v4
+        uses: astral-sh/setup-uv@38f3f104447c67c051c4a08e39b64a148898af3a # v4.2.0
         with:
           python-version: "3.12"
       - name: Verify tag matches pyproject version
@@ -72,7 +88,7 @@ jobs:
           uv pip install --system build
           python -m build
       - name: Upload dist artifacts
-        uses: actions/upload-artifact@v4
+        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3
         with:
           name: dist
           path: dist/
@@ -85,22 +101,27 @@ jobs:
     name: publish to PyPI
     needs: build
     runs-on: ubuntu-latest
-    # OIDC requires `id-token: write` permission on the workflow.
+    # OIDC requires `id-token: write`; `contents: read` restates the workflow default.
     permissions:
       id-token: write
+      contents: read
+    # Defense-in-depth: only ever publish from a `v`-prefixed tag ref, even if
+    # the environment's protected-tag rule is removed or misconfigured in the UI.
+    if: startsWith(github.ref, 'refs/tags/v')
     # The `pypi` environment is what you bind to in PyPI's trusted-publisher
-    # config — restricts which workflows can claim the OIDC identity.
+    # config — and where deployment protection rules (required reviewers,
+    # protected-tag restriction) are enforced. See RELEASING.md.
     environment:
       name: pypi
       url: https://pypi.org/p/lago-agent-sdk
     steps:
       - name: Download dist
-        uses: actions/download-artifact@v4
+        uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8
         with:
           name: dist
           path: dist/
       - name: Publish
-        uses: pypa/gh-action-pypi-publish@release/v1
+        uses: pypa/gh-action-pypi-publish@ecb4c3dfd4790f14e30aaeac04855c7413ee9368 # v1.12.2
         # No `password:` — OIDC handles auth automatically.
 
   # ----------------------------------------------------------------------
@@ -113,9 +134,9 @@ jobs:
     permissions:
       contents: write
     steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
       - name: Download dist
-        uses: actions/download-artifact@v4
+        uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8
         with:
           name: dist
           path: dist/
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,6 +4,19 @@ All notable changes to this project will be documented here. Format follows [Kee
 
 ## [Unreleased]
 
+### Fixed
+- **Anthropic `messages.create(stream=True)` under-billed input tokens.** The stream wrapper read only top-level `usage`, which on a basic stream appears only on `message_delta` as `{output_tokens: N}` — the authoritative `input_tokens` / `cache_*` counts arrive nested under `message.usage` on the `message_start` event and were ignored, so input billed 0. The wrapper now merges usage from `message_start` (input/cache) and `message_delta` (cumulative output). Sync + async paths; regression tests use the realistic wire shape (delta carries no input echo).
+- **Legacy `google-generativeai` SDK silently emitted no events.** The detector matched both the new `google-genai` and the deprecated `google-generativeai` SDKs, but the wrapper only instruments the unified `Client.models` / `.aio` surface — a legacy `GenerativeModel` routed through and wrapped nothing. `wrap()` now rejects legacy clients with a clear pointer to migrate to `google-genai`.
+
+### Security
+- Hardened the publish workflow: least-privilege `permissions: contents: read` default (only `publish` gets `id-token: write`, only `release` gets `contents: write`), and every third-party action pinned to a full commit SHA so a re-pointed tag can't inject code into the OIDC-token-minting job.
+- Added `if: startsWith(github.ref, 'refs/tags/v')` to the `publish` job as defense-in-depth — it refuses to run on a non-tag ref even if the environment's protected-tag rule is misconfigured.
+- Added `.github/dependabot.yml` (github-actions ecosystem) so the SHA pins stay fresh — Dependabot bumps the SHA and version comment together rather than letting actions silently age.
+- RELEASING.md now documents `pypi` environment protection (required reviewers + protected-tag restriction) as a **required** setup step, not optional, since trusted publishing is only as strong as that environment's rules.
+
+### Documentation
+- README: clarified that `cache_read`, `audio_input`, and `image_input` are **subsets** of `input` for OpenAI and Gemini (not additive) — summing them with `llm_input_tokens` double-counts.
+
 ### Added
 - Native `google-genai` SDK support covering `client.models.generate_content` + `generate_content_stream`, sync + async (`client.aio.models`).
 - `extract_gemini_native` adapter maps `usage_metadata`: `prompt_token_count → input`, `candidates_token_count → output`, `cached_content_token_count → cache_read`, `thoughts_token_count → reasoning`, `prompt_tokens_details[modality=AUDIO/IMAGE] → audio_input/image_input`, `candidates_tokens_details[modality=AUDIO] → audio_output`, count of `candidates[0].content.parts[].function_call → tool_calls`.
diff --git a/README.md b/README.md
@@ -184,6 +184,9 @@ Backed by `contextvars` for safe propagation across `asyncio` tasks.
 - **OpenAI's `reasoning_tokens` is a SUBSET of `output`** — already counted in `completion_tokens`.
 - **Gemini's `thoughts_token_count` is ADDITIVE to `output`** — `candidates + thoughts = total billable output`.
 
+**Semantic note on input breakdowns (avoid double-counting):**
+For both OpenAI and Gemini, `cache_read`, `audio_input`, and `image_input` are **subsets of `input`**, not additive to it — they are a breakdown of tokens already counted in `llm_input_tokens`. For example, OpenAI reports `cached_tokens` under `prompt_tokens_details` *within* `prompt_tokens`, and Gemini's docs state `prompt_token_count` "includes the number of tokens in the cached content". A billable metric that sums `llm_input_tokens + llm_cached_input_tokens` (or `+ llm_audio_input_tokens`, `+ llm_image_input_tokens`) will **double-count**. Bill on `llm_input_tokens` as the total; use the breakdown fields only for cost attribution or discounted-rate tiers (e.g. cached input billed at a lower rate), subtracting them from `input` rather than adding.
+
 OpenAI's Predicted Outputs tokens (`accepted_prediction_tokens`, `rejected_prediction_tokens`) are not surfaced — see the OpenAI adapter docstring for details on this intentional gap.
 
 ## Error policy
diff --git a/RELEASING.md b/RELEASING.md
@@ -25,6 +25,23 @@ Configure the trusted publisher on PyPI:
 
 Then in this repo: **Settings → Environments → New environment** named `pypi`. (No secrets needed inside it — OIDC handles auth.)
 
+### Environment protection (required before first release)
+
+Trusted publishing is bound to the `pypi` environment, so that environment is the **only** thing standing between a pushed tag and a live PyPI release. A freshly created environment has **no** protection rules by default — until you add them, any successful run publishes immediately. Treat this as a mandatory setup step, not an optional one. Configure it under **Settings → Environments → pypi**:
+
+| Rule | Setting | Why |
+| --- | --- | --- |
+| Required reviewers | Add 1+ maintainers | The publish job pauses for human approval before it can mint the OIDC token and upload — a second pair of eyes on every release. |
+| Deployment branches and tags | **Selected** → add a `v*.*.*` tag rule | Only protected version tags can deploy to `pypi`; a random branch push or arbitrary tag can't trigger a publish. |
+
+With these in place, the `test` and `build` jobs still run on any matching tag, but the `publish` job deploys only for `v*.*.*` tags and blocks until an approver signs off.
+
+The workflow itself adds defense in depth, so a misconfigured environment alone can't publish from the wrong place:
+- Least-privilege `permissions: contents: read` default — only `publish` gets `id-token: write`, only `release` gets `contents: write`.
+- Every third-party action pinned to a full commit SHA so a re-pointed tag can't inject code into the token-minting job (kept fresh by `.github/dependabot.yml`).
+- The `publish` job carries `if: startsWith(github.ref, 'refs/tags/v')`, so even without the environment rule it refuses to run on a non-tag ref.
+- `publish` consumes the exact artifact built and version-checked in the `build` job (it never rebuilds), so the bytes on PyPI match what was tested.
+
 ## Cutting a release
 
 ```bash
diff --git a/src/lago_agent_sdk/detector.py b/src/lago_agent_sdk/detector.py
@@ -37,7 +37,16 @@ def detect_client_kind(client: Any) -> str:
     # Older mistralai versions or aliased imports
     if cls_name == "mistral" and "mistral" in module:
         return "mistral"
-    if "google" in module and ("genai" in module or "generativeai" in module):
+    # New unified google-genai SDK only (`google.genai.Client`). The legacy
+    # google-generativeai SDK (`google.generativeai.GenerativeModel`) has a
+    # different surface — no `.models` / `.aio` — that the gemini wrapper cannot
+    # instrument, so it would silently wrap nothing. Flag it separately so wrap()
+    # can reject it with a clear migration message instead.
+    #
+    # "genai" is not a substring of "generativeai", so these never overlap.
+    if "google" in module and "genai" in module:
         return "gemini"
+    if "google" in module and "generativeai" in module:
+        return "gemini_legacy"
 
     return "unknown"
diff --git a/src/lago_agent_sdk/sdk.py b/src/lago_agent_sdk/sdk.py
@@ -95,6 +95,15 @@ def wrap(
             from .wrappers.gemini import wrap_gemini_client
 
             return wrap_gemini_client(self, client, dimensions=dimensions, subscription=subscription)
+        if kind == "gemini_legacy":
+            raise UnknownClientError(
+                "The legacy google-generativeai SDK "
+                "(`import google.generativeai; genai.GenerativeModel(...)`) is not "
+                "supported — its surface differs from the unified SDK and cannot be "
+                "instrumented. Migrate to google-genai: `pip install google-genai`, "
+                "then `from google import genai; client = genai.Client(...)` and wrap "
+                "the Client. See https://ai.google.dev/gemini-api/docs/migrate."
+            )
         if kind == "unknown":
             raise UnknownClientError(
                 f"Unknown client passed to wrap(): {type(client).__module__}.{type(client).__name__}. "
diff --git a/src/lago_agent_sdk/wrappers/anthropic.py b/src/lago_agent_sdk/wrappers/anthropic.py
@@ -46,6 +46,35 @@ def _is_message_like(obj: Any) -> bool:
         return False
 
 
+def _merge_stream_usage(accumulated: dict[str, Any], payload: Any) -> None:
+    """Fold one streaming event's usage into the running accumulator.
+
+    Anthropic splits authoritative usage across two events:
+      - ``message_start`` carries the input/cache counts nested under
+        ``message.usage`` (with ``output_tokens`` only primed to 1).
+      - ``message_delta`` carries the *cumulative* ``output_tokens`` at the top
+        level (and, in some API shapes, echoes input/cache there too).
+
+    Both locations are merged, nested first, so later top-level values win.
+    ``None`` values are skipped: ``model_dump()`` emits every optional field a
+    delta did not set as ``None``, and a plain ``dict.update`` would let those
+    overwrite the ``message_start`` input counts and bill ``input_tokens=0``.
+
+    Mutates ``accumulated`` in place; non-dict payloads are ignored.
+    """
+    if not isinstance(payload, dict):
+        return
+    message = payload.get("message")
+    nested = message.get("usage") if isinstance(message, dict) else None
+    # Order matters: message.usage (message_start) first, then top-level usage
+    # (message_delta and others) so the cumulative, more recent counts win.
+    for usage in (nested, payload.get("usage")):
+        if isinstance(usage, dict):
+            accumulated.update(
+                {key: value for key, value in usage.items() if value is not None}
+            )
+
+
 def wrap_anthropic_client(
     sdk: Any,
     client: Any,
@@ -95,20 +124,18 @@ def _create(*args: Any, **kwargs: Any) -> Any:
             _emit_from(response, model_id, sub, dims)
             return response
 
-        # Streaming — wrap the iterator to capture the final usage on close.
+        # Streaming — wrap the iterator, merging usage across message_start
+        # (input/cache) and message_delta (cumulative output) before emitting.
         def _wrap_stream(src: Iterator[Any]) -> Iterator[Any]:
-            last_usage: dict[str, Any] | None = None
+            accumulated: dict[str, Any] = {}
             try:
                 for event in src:
                     payload = event.model_dump() if hasattr(event, "model_dump") else event
-                    if isinstance(payload, dict):
-                        usage = payload.get("usage")
-                        if isinstance(usage, dict):
-                            last_usage = {"usage": usage}
+                    _merge_stream_usage(accumulated, payload)
                     yield event
             finally:
-                if last_usage is not None:
-                    _emit_from(last_usage, model_id, sub, dims)
+                if accumulated:
+                    _emit_from({"usage": accumulated}, model_id, sub, dims)
 
         return _wrap_stream(response)
 
@@ -127,18 +154,15 @@ async def _create_async(*args: Any, **kwargs: Any) -> Any:
             return response
 
         async def _wrap_async_stream(src: AsyncIterator[Any]) -> AsyncIterator[Any]:
-            last_usage: dict[str, Any] | None = None
+            accumulated: dict[str, Any] = {}
             try:
                 async for event in src:
                     payload = event.model_dump() if hasattr(event, "model_dump") else event
-                    if isinstance(payload, dict):
-                        usage = payload.get("usage")
-                        if isinstance(usage, dict):
-                            last_usage = {"usage": usage}
+                    _merge_stream_usage(accumulated, payload)
                     yield event
             finally:
-                if last_usage is not None:
-                    _emit_from(last_usage, model_id, sub, dims)
+                if accumulated:
+                    _emit_from({"usage": accumulated}, model_id, sub, dims)
 
         return _wrap_async_stream(response)
 
diff --git a/tests/unit/test_wrapper_anthropic.py b/tests/unit/test_wrapper_anthropic.py
diff --git a/tests/unit/test_wrapper_gemini.py b/tests/unit/test_wrapper_gemini.py