comet-ml
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
Lines changed: 9 additions & 4 deletions
diff --git a/.github/workflows/scout-feedback.yml b/.github/workflows/scout-feedback.yml
Lines changed: 0 additions & 64 deletions
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
Lines changed: 5 additions & 5 deletions
diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml
Lines changed: 1 addition & 1 deletion
diff --git a/.gitignore b/.gitignore
Lines changed: 6 additions & 1 deletion
diff --git a/README-OPIK-INTEGRATION.md b/README-OPIK-INTEGRATION.md
Lines changed: 21 additions & 21 deletions
diff --git a/README.md b/README.md
Lines changed: 48 additions & 4 deletions
diff --git a/action.yml b/action.yml
Lines changed: 6 additions & 9 deletions
diff --git a/actions/feedback/action.yml b/actions/feedback/action.yml
Lines changed: 36 additions & 0 deletions
@@ -18,21 +18,26 @@ jobs:
           cache: pip
 
       - name: Install dev dependencies
-        run: pip install -r requirements-dev.txt
+        run: pip install -e ".[dev]"
 
       - name: Ruff (lint)
         run: ruff check .
 
       - name: Mypy (type check)
-        run: mypy scout.py scout_feedback.py init_scout.py
+        run: mypy src/scout
 
       - name: Validate action + workflow YAML
-        # actionlint (below) checks workflow files but not action.yml, so parse it here.
+        # actionlint (below) checks workflow files but not action.yml files, so parse them here.
         run: |
           python - <<'PY'
           import glob
           import yaml
-          for f in ["action.yml", *sorted(glob.glob(".github/workflows/*.yml"))]:
+          targets = [
+              "action.yml",
+              *sorted(glob.glob("actions/*/action.yml")),
+              *sorted(glob.glob(".github/workflows/*.yml")),
+          ]
+          for f in targets:
               with open(f) as fh:
                   yaml.safe_load(fh)
               print(f"ok: {f}")
 
@@ -25,13 +25,13 @@ jobs:
       - name: Validate secrets
         env:
           ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
-          OPIK_API_KEY: ${{ secrets.SCOUT_OPIK_API_KEY }}
-          OPIK_WORKSPACE: ${{ vars.OPIK_WORKSPACE }}
+          OPIK_API_KEY: ${{ secrets.OPIK_API_KEY }}
+          OPIK_WORKSPACE: ${{ secrets.OPIK_WORKSPACE }}
         run: |
           MISSING=""
           [ -z "$ANTHROPIC_API_KEY" ] && MISSING="$MISSING ANTHROPIC_API_KEY"
           # Opik is a hard requirement — Scout sources its prompt from Opik and traces there.
-          [ -z "$OPIK_API_KEY" ] && MISSING="$MISSING SCOUT_OPIK_API_KEY"
+          [ -z "$OPIK_API_KEY" ] && MISSING="$MISSING OPIK_API_KEY"
           [ -z "$OPIK_WORKSPACE" ] && MISSING="$MISSING OPIK_WORKSPACE"
           if [ -n "$MISSING" ]; then
             echo "::error::Missing required config:$MISSING"
@@ -51,6 +51,6 @@ jobs:
           SCOUT_ESCALATION_TAG: ${{ vars.SCOUT_ESCALATION_TAG || 'Escalated request' }}
           SCOUT_GITHUB_REPO_OWNER: ${{ vars.SCOUT_GITHUB_REPO_OWNER }}
           SCOUT_GITHUB_REPO_NAME: ${{ vars.SCOUT_GITHUB_REPO_NAME }}
-          OPIK_API_KEY: ${{ secrets.SCOUT_OPIK_API_KEY }}
-          OPIK_WORKSPACE: ${{ vars.OPIK_WORKSPACE }}
+          OPIK_API_KEY: ${{ secrets.OPIK_API_KEY }}
+          OPIK_WORKSPACE: ${{ secrets.OPIK_WORKSPACE }}
           ISSUE_NUMBER: ${{ github.event.inputs.issue_number }}
@@ -18,7 +18,7 @@ jobs:
           cache: pip
 
       - name: Install dependencies
-        run: pip install -r requirements-dev.txt
+        run: pip install -e ".[dev]"
 
       - name: Run tests
         run: pytest
@@ -3,4 +3,9 @@
 __pycache__/
 .pytest_cache/
 .mypy_cache/
-.ruff_cache/
+.ruff_cache/
+
+# Packaging artifacts (pip install -e . / python -m build)
+build/
+dist/
+*.egg-info/
@@ -15,26 +15,26 @@ The fix: an in-memory simulator that behaves like GitHub at the seams Scout actu
 ## Architecture
 
 ```
-scout.py                 scout_eval.py
+scout.triage             evals/run_eval.py
    │                          │
    ▼                          ▼
 GitHubProvider           GitHubSimulator      ◄── implements ──┐
    (PyGithub)            (in-memory)                            │
                                                                 │
                                                        RepositoryProvider
-                                                       (providers/base.py)
+                                                  (scout/providers/base.py)
                                                                 ▲
                                                                 │
                                                           agent.run_agent
-                                                          (agent.py)
+                                                       (scout/agent.py)
 ```
 
 `agent.run_agent` is the single agent loop. It depends only on the `RepositoryProvider` protocol — not on PyGithub, not on any module globals. Both backends satisfy the same interface; swapping them changes nothing about the loop, the prompt, the model, or the tool dispatch.
 
 The protocol covers everything Scout's tools (and `main()`) need from "GitHub":
 
 ```python
-# providers/base.py
+# src/scout/providers/base.py
 class RepositoryProvider(Protocol):
     # read
     def get_issue_data(self, issue_number: int) -> dict: ...
@@ -52,7 +52,7 @@ class RepositoryProvider(Protocol):
 
 ## The simulator
 
-`providers/simulator.py` defines `GitHubSimulator` — a real object with mutable state, not a passive fixture. Scenarios build it up with a fluent API; the agent reads and writes against it; assertions inspect both the output text and the resulting state.
+`src/scout/providers/simulator.py` defines `GitHubSimulator` — a real object with mutable state, not a passive fixture. Scenarios build it up with a fluent API; the agent reads and writes against it; assertions inspect both the output text and the resulting state.
 
 ```python
 sim = (
@@ -113,7 +113,7 @@ Example real-GitHub spec:
 
 ## Scenarios — bridging JSON to the simulator
 
-Opik dataset rows are JSON; simulator behavior is Python. `providers/scenarios.py` reconciles them with a small registry:
+Opik dataset rows are JSON; simulator behavior is Python. `src/scout/providers/scenarios.py` reconciles them with a small registry:
 
 ```python
 SCENARIO_BUILDERS: dict[str, Callable[[dict], GitHubSimulator]] = {}
@@ -203,7 +203,7 @@ See `evals/starter_scenarios.py` for five worked examples covering duplicate cit
 
 ## How the eval driver works
 
-`scout_eval.py` is the bridge between Opik's `run_tests` and the agent loop:
+`evals/run_eval.py` is the bridge between Opik's `run_tests` and the agent loop:
 
 ```python
 def task(item: dict) -> dict:
@@ -251,8 +251,8 @@ Assertions can reference any of these. Examples:
 
 Both providers route traces through Opik identically — tracing lives at the agent/tool/LLM layer, not the provider layer. What differs is the project name:
 
-- Production runs (`scout.py` → `GitHubProvider`) trace to `scout:<owner>/<repo>`.
-- Eval runs (`scout_eval.py` → `GitHubSimulator`) trace to `scout-eval` (override with `SCOUT_EVAL_OPIK_PROJECT`).
+- Production runs (`scout.triage` → `GitHubProvider`) trace to `scout:<owner>/<repo>`.
+- Eval runs (`evals/run_eval.py` → `GitHubSimulator`) trace to `scout-eval` (override with `SCOUT_EVAL_OPIK_PROJECT`).
 
 Different projects keep prod triage and eval experiments visually separate in the Opik UI. Eval runs are noisy — you may run a 5-item suite many times while iterating on the prompt — and you don't want that drowning out real triage traces.
 
@@ -276,34 +276,34 @@ ANTHROPIC_API_KEY=... OPIK_API_KEY=... OPIK_WORKSPACE=... \
   GITHUB_TOKEN=unused \
   SCOUT_GITHUB_REPO_OWNER=x SCOUT_GITHUB_REPO_NAME=y \
   SCOUT_EXPERIMENT_NAME=baseline-v1 \
-  python scout_eval.py
+  python -m evals.run_eval
 ```
 
-`GITHUB_TOKEN` must be set because `scout.py` validates it at import time. For all-simulated suites the value is unused — `unused` is fine. **For suites that include real-GitHub-mode scenarios** (specs with no `files` key), it must be a real token with read access to the target repo. `SCOUT_EXPERIMENT_NAME` is treated as a *prefix*: each run gets `{prefix}-YYYY-MM-DD-HH-MM-SS` appended, so re-running without changing the env var produces a fresh, chronologically sortable experiment in the Opik UI.
+`GITHUB_TOKEN` must be set because the triage module validates it at import time. For all-simulated suites the value is unused — `unused` is fine. **For suites that include real-GitHub-mode scenarios** (specs with no `files` key), it must be a real token with read access to the target repo. `SCOUT_EXPERIMENT_NAME` is treated as a *prefix*: each run gets `{prefix}-YYYY-MM-DD-HH-MM-SS` appended, so re-running without changing the env var produces a fresh, chronologically sortable experiment in the Opik UI.
 
 ## Adding a scenario
 
 1. **Open `evals/starter_scenarios.py`** and append a new item to `STARTER_SCENARIOS` following the shape above.
 2. **Write 3–6 specific assertions** that a judge can answer yes/no clearly. Reference the surfaced output keys (`output`, `final_labels`, `applied_labels`, `search_queries`) when behavior matters more than text.
-3. **If your scenario needs programmable behavior** (flaky tools, multi-call state changes), register a new builder with `@register("your-name")` in `providers/scenarios.py` and reference it via `"scenario": "your-name"`. The base `_default` builder is composable — call it inside your builder and then mutate the result.
-4. **Run `pytest test_scout.py -k Starter`** — three parametrized validation tests will check your scenario is structurally valid before you push.
+3. **If your scenario needs programmable behavior** (flaky tools, multi-call state changes), register a new builder with `@register("your-name")` in `src/scout/providers/scenarios.py` and reference it via `"scenario": "your-name"`. The base `_default` builder is composable — call it inside your builder and then mutate the result.
+4. **Run `pytest tests/test_triage.py -k Starter`** — three parametrized validation tests will check your scenario is structurally valid before you push.
 5. **Re-seed and re-run:**
 
    ```bash
    python -m evals.seed_test_suite
-   python scout_eval.py
+   python -m evals.run_eval
    ```
 
 ## Files
 
 | Path | Purpose |
 |---|---|
-| `providers/base.py` | `RepositoryProvider` protocol |
-| `providers/github.py` | Production backend (PyGithub) |
-| `providers/simulator.py` | In-memory simulator with fluent builders |
-| `providers/scenarios.py` | Scenario builder registry + default/search-rate-limited builders |
-| `agent.py` | Agent loop, tool definitions, `make_tools`, `make_client` |
-| `scout.py` | Production entry point (env parsing, prompt loading, `main()`) |
-| `scout_eval.py` | Opik Test Suite driver |
+| `src/scout/providers/base.py` | `RepositoryProvider` protocol |
+| `src/scout/providers/github.py` | Production backend (PyGithub) |
+| `src/scout/providers/simulator.py` | In-memory simulator with fluent builders |
+| `src/scout/providers/scenarios.py` | Scenario builder registry + default/search-rate-limited builders |
+| `src/scout/agent.py` | Agent loop, tool definitions, `make_tools`, `make_client` |
+| `src/scout/triage.py` | Production entry point (env parsing, prompt loading, `main()`) |
+| `evals/run_eval.py` | Opik Test Suite driver |
 | `evals/starter_scenarios.py` | Five worked starter scenarios |
 | `evals/seed_test_suite.py` | Idempotent suite seeder |
@@ -209,9 +209,47 @@ Anyone viewing an issue can rate Scout's triage comment by adding a 👍 or 👎
 - **1.0** = all 👍, **0.0** = all 👎, otherwise the ratio `👍 / (👍 + 👎)` (e.g. 3 👍 and 1 👎 → `0.75`). Reactions other than 👍/👎 are ignored.
 - The score carries a `reason` that attributes the votes by GitHub login, e.g. `👍 2 (alice, bob) / 👎 1 (carol) from GitHub`, so you can see *who* reacted in the Opik UI alongside the trace.
 
-**How it works.** GitHub fires no event when a reaction is added, so a scheduled workflow (`.github/workflows/scout-feedback.yml`, every 30 min) polls recent issues, reads the reaction counts on Scout's comments, and upserts the score. Each Scout comment carries a hidden marker (`<!-- scout-feedback trace_id=… -->`) that maps it back to its Opik trace. The sync is idempotent — re-running simply recomputes the score from current reactions — so feedback lands in Opik within one cron interval and self-corrects as votes change.
+**How it works.** GitHub fires no event when a reaction is added, so a scheduled workflow polls recent issues every 30 minutes, reads the reaction counts on Scout's comments, and upserts the score. Each Scout comment carries a hidden marker (`<!-- scout-feedback trace_id=… -->`) that maps it back to its Opik trace. The sync is idempotent — re-running simply recomputes the score from current reactions — so feedback lands in Opik within one cron interval and self-corrects as votes change.
 
-**Enabling it.** Add `.github/workflows/scout-feedback.yml` to the repo that runs Scout (it reuses the same `SCOUT_OPIK_API_KEY` secret and `OPIK_WORKSPACE` / `SCOUT_GITHUB_REPO_OWNER` / `SCOUT_GITHUB_REPO_NAME` variables). Trigger it manually from the Actions tab for an immediate sync.
+**Enabling it.** The feedback sync ships as a second action published from this repo, `comet-ml/scout-repo-agent/actions/feedback`, alongside the triage action. Add a scheduled workflow to the repo that runs Scout — it reuses the same `OPIK_API_KEY` secret and `OPIK_WORKSPACE` value as the triage action:
+
+```yaml
+name: Scout Feedback Sync
+
+on:
+  schedule:
+    - cron: '*/30 * * * *'  # every 30 minutes
+  workflow_dispatch:
+    inputs:
+      since_days:
+        description: How many days back to scan issues for reactions
+        required: false
+        default: '7'
+        type: string
+
+concurrency:
+  group: scout-feedback-sync
+  cancel-in-progress: false
+
+jobs:
+  sync-feedback:
+    runs-on: ubuntu-latest
+    timeout-minutes: 15
+    permissions:
+      issues: read
+      contents: read
+    steps:
+      - name: Sync reactions to Opik
+        uses: comet-ml/scout-repo-agent/actions/feedback@main
+        with:
+          github_token: ${{ github.token }}
+          since_days: ${{ github.event.inputs.since_days || '7' }}
+        env:
+          OPIK_API_KEY: ${{ secrets.OPIK_API_KEY }}
+          OPIK_WORKSPACE: ${{ vars.OPIK_WORKSPACE }}
+```
+
+Trigger it manually from the Actions tab for an immediate sync.
 
 > **Scan window.** GitHub does not bump an issue's `updated_at` when a reaction is added, so the sync only re-checks issues with other activity within `SCOUT_FEEDBACK_SINCE_DAYS` (default 7). Reactions on otherwise-quiet older issues may be missed — run the workflow manually with a larger `since_days` to backfill. Because the upsert is idempotent, re-syncing is always safe.
 
@@ -221,15 +259,21 @@ Use the manual trigger workflow in this repo's Actions tab (`Test Scout (Manual)
 
 ## Local development
 
+The code is an installable package under `src/scout/`. Install it (with dev extras) in editable mode:
+
 ```bash
-pip install -r requirements.txt
+pip install -e ".[dev]"
 
 # Copy and fill in the template
 cp .env.example .env
 
-python scout.py
+# Run triage locally (console script registered by the install).
+# Equivalent to `python -m scout.triage`.
+scout-triage
 ```
 
+Run the unit tests, linter, and type checker with `pytest`, `ruff check .`, and `mypy src/scout`.
+
 `.env.example`:
 ```
 ANTHROPIC_API_KEY=
 
@@ -26,19 +26,16 @@ runs:
       with:
         python-version: '3.12'
 
-    - name: Cache pip dependencies
-      uses: actions/cache@v4
-      with:
-        path: ~/.cache/pip
-        key: scout-pip-${{ hashFiles(format('{0}/requirements.txt', github.action_path)) }}
-
-    - name: Install dependencies
+    - name: Install Scout
       shell: bash
-      run: pip install -r ${{ github.action_path }}/requirements.txt
+      # ${{ github.action_path }} is this action's directory — the repo root for
+      # the root action. Installing the package pulls runtime deps and registers
+      # the scout-triage console script.
+      run: pip install "${{ github.action_path }}"
 
     - name: Run Scout
       shell: bash
-      run: python ${{ github.action_path }}/scout.py
+      run: scout-triage
       env:
         ANTHROPIC_API_KEY: ${{ inputs.anthropic_api_key }}
         GITHUB_TOKEN: ${{ inputs.github_token }}
 
@@ -0,0 +1,36 @@
+name: Scout Feedback Sync
+description: Sync 👍/👎 reactions on Scout's issue comments to Opik as human feedback scores
+author: Comet (comet.com)
+
+inputs:
+  github_token:
+    description: GitHub token with issues:read permission
+    required: true
+  since_days:
+    description: How many days back to scan issues for reactions
+    required: false
+    default: '7'
+
+runs:
+  using: composite
+  steps:
+    - name: Set up Python
+      uses: actions/setup-python@v5
+      with:
+        python-version: '3.12'
+
+    - name: Install Scout
+      shell: bash
+      # ${{ github.action_path }} is this action's directory (actions/feedback);
+      # ../.. is the repo root, where the installable package lives. Installing it
+      # pulls runtime deps and registers the scout-feedback console script.
+      run: pip install "${{ github.action_path }}/../.."
+
+    - name: Sync reactions to Opik
+      shell: bash
+      run: scout-feedback
+      env:
+        GITHUB_TOKEN: ${{ inputs.github_token }}
+        SCOUT_FEEDBACK_SINCE_DAYS: ${{ inputs.since_days }}
+        # OPIK_API_KEY / OPIK_WORKSPACE are read from the environment the caller
+        # sets on the `uses:` step (see README), mirroring the triage action.