Merge pull request #31 from comet-ml/feat/opik-environment-support

LeoRoccoBreedt · web-flow · commit 48ac30497025 · 2026-06-17T13:59:27.000+02:00
feat: add OPIK_ENVIRONMENT support for dev/test/prod trace separation
diff --git a/.env.example b/.env.example
@@ -21,6 +21,14 @@ ISSUE_NUMBER=
 OPIK_API_KEY=
 OPIK_WORKSPACE=
 
+# Opik environment tag — separates traces by context in the Opik UI.
+# Convention:
+#   dev     — local development and offline eval runs
+#   test    — test suite runs (run_test_suite.py defaults to this unless OPIK_ENVIRONMENT is already exported in the shell)
+#   staging — UAT / pre-production validation
+#   prod    — GitHub Action (production triage)
+OPIK_ENVIRONMENT=dev
+
 # Opik project for eval traces — defaults to scout:{SCOUT_GITHUB_REPO_OWNER}/{SCOUT_GITHUB_REPO_NAME}
 # SCOUT_EVAL_OPIK_PROJECT=
 
diff --git a/README.md b/README.md
@@ -73,6 +73,7 @@ jobs:
           SCOUT_GITHUB_REPO_NAME: ${{ vars.SCOUT_GITHUB_REPO_NAME }}
           OPIK_API_KEY: ${{ secrets.OPIK_API_KEY }}
           OPIK_WORKSPACE: ${{ vars.OPIK_WORKSPACE }}
+          OPIK_ENVIRONMENT: prod
           ISSUE_NUMBER: ${{ github.event.issue.number || github.event.inputs.issue_number }}
 ```
 
@@ -93,6 +94,7 @@ The GitHub App must have these permissions:
 | `SCOUT_ESCALATION_TAG` | no | Label for escalated issues (default: `Escalated request`) |
 | `OPIK_API_KEY` | **yes** | Opik API key. Opik is required — Scout sources its system prompt from Opik and traces every run there. |
 | `OPIK_WORKSPACE` | **yes** | Opik workspace name |
+| `OPIK_ENVIRONMENT` | no | Tags traces by environment in the Opik UI. Convention: `dev` (local), `test` (test suite — set automatically), `staging` (UAT), `prod` (GitHub Action). |
 | `SCOUT_FEEDBACK_SINCE_DAYS` | no | Feedback sync only: how many days back to scan issues for 👍/👎 reactions (default: `7`) |
 | `ISSUE_NUMBER` | no | Override issue number (auto-detected from event payload) |
 | `SCOUT_MODEL` | no | Anthropic model ID (default: `claude-sonnet-4-6`) |
diff --git a/evals/README.md b/evals/README.md
@@ -18,9 +18,10 @@ ANTHROPIC_API_KEY=       # required — Scout agent and LLM judge both use Claud
 GITHUB_TOKEN=            # required — fetching issues and real-GitHub file mode during evals
 OPIK_API_KEY=            # required — logging traces and reading datasets
 OPIK_WORKSPACE=          # required — your Opik workspace name
+OPIK_ENVIRONMENT=dev     # dev (local + offline evals), test (test suite — auto-set), staging (UAT), prod (GitHub Action)
 SCOUT_GITHUB_REPO_OWNER= # required — repo to fetch issues from
 SCOUT_GITHUB_REPO_NAME=  # required — repo to fetch issues from
-SCOUT_EVAL_OPIK_PROJECT= # Optional — defaults to scout:{SCOUT_GITHUB_REPO_OWNER}/{SCOUT_GITHUB_REPO_NAME}
+SCOUT_EVAL_OPIK_PROJECT= # optional — defaults to scout:{SCOUT_GITHUB_REPO_OWNER}/{SCOUT_GITHUB_REPO_NAME}
 ```
 
 ---
@@ -99,6 +100,8 @@ python evals/run_test_suite.py
 
 Pass rate is printed on completion and visible in the Opik dashboard.
 
+> **Environment convention:** `run_test_suite.py` automatically defaults to `OPIK_ENVIRONMENT=test` — no manual setup needed. Offline eval runs use `dev` from `.env`. Override either by setting `OPIK_ENVIRONMENT` in the shell before running (e.g. `OPIK_ENVIRONMENT=staging` for UAT). The GitHub Action sets `prod`.
+
 ---
 
 ## Simulation modes
diff --git a/evals/run_offline_eval.py b/evals/run_offline_eval.py
@@ -36,7 +36,7 @@
 from opik.evaluation.metrics import Usefulness
 from dotenv import load_dotenv
 
-load_dotenv(override=True)
+load_dotenv()
 
 from scout.agent import make_client, run_agent  # noqa: E402
 from scout.providers.scenarios import build  # noqa: E402
@@ -83,7 +83,7 @@ def eval_task(item: dict) -> dict:
         sim = build(scenario, spec)
         logger.info("scenario=%s target=#%d", scenario, target)
 
-        comment, _trace_id = run_agent(
+        comment, _ = run_agent(
             sim,
             target,
             client=client,
diff --git a/evals/run_test_suite.py b/evals/run_test_suite.py
@@ -30,7 +30,8 @@
 import opik
 from dotenv import load_dotenv
 
-load_dotenv(override=True)
+os.environ.setdefault("OPIK_ENVIRONMENT", "test")
+load_dotenv()
 
 from scout.agent import make_client, run_agent  # noqa: E402
 from scout.providers.scenarios import build  # noqa: E402
diff --git a/evals/utils/fetch_github_issues.py b/evals/utils/fetch_github_issues.py
@@ -33,7 +33,7 @@
 from dotenv import load_dotenv
 from github import Github
 
-load_dotenv(override=True)
+load_dotenv()
 
 
 def fetch_issue(issue_obj) -> dict:
diff --git a/evals/utils/seed_dataset.py b/evals/utils/seed_dataset.py
@@ -30,7 +30,7 @@
 import opik
 from dotenv import load_dotenv
 
-load_dotenv(override=True)
+load_dotenv()
 
 logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")
 logger = logging.getLogger(__name__)
diff --git a/evals/utils/seed_test_suite.py b/evals/utils/seed_test_suite.py
@@ -33,7 +33,7 @@
 import opik
 from dotenv import load_dotenv
 
-load_dotenv(override=True)
+load_dotenv()
 
 logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")
 logger = logging.getLogger(__name__)
@@ -77,7 +77,7 @@ def items_from_github(path: str) -> tuple[list[dict], str, str]:
     return items, owner, name
 
 
-def items_from_starter() -> list[dict]:
+def items_from_starter() -> tuple[list[dict], list[str], dict]:
     from evals.utils.starter_scenarios import GLOBAL_ASSERTIONS, GLOBAL_EXECUTION_POLICY, STARTER_SCENARIOS
     items = []
     for s in STARTER_SCENARIOS:
@@ -91,11 +91,11 @@ def items_from_starter() -> list[dict]:
 
 def _get_or_create_suite(client: opik.Opik, global_assertions, global_execution_policy):
     try:
-    	suite = client.get_test_suite(name=SUITE_NAME, project_name=SUITE_PROJECT)
-    	logger.info("Found existing test suite %r — reusing.", SUITE_NAME)
-    	return suite
+        suite = client.get_test_suite(name=SUITE_NAME, project_name=SUITE_PROJECT)
+        logger.info("Found existing test suite %r — reusing.", SUITE_NAME)
+        return suite
     except Exception as e:
-    	logger.info("Test suite %r not found (%s) — creating it.", SUITE_NAME, type(e).__name__)
+        logger.info("Test suite %r not found (%s) — creating it.", SUITE_NAME, type(e).__name__)
 
     return client.create_test_suite(
         name=SUITE_NAME,
@@ -131,8 +131,8 @@ def main() -> None:
     all_items: list[dict] = []
 
     if args.from_github:
-        items, _owner, _name = items_from_github(args.from_github)
-        all_items.extend(items)
+        github_items, *_ = items_from_github(args.from_github)
+        all_items.extend(github_items)
 
     if args.from_starter:
         items, global_assertions, global_execution_policy = items_from_starter()
diff --git a/src/scout/triage.py b/src/scout/triage.py
@@ -281,7 +281,7 @@ def load_system_prompt() -> str:
     try:
         chat = client.get_chat_prompt(
             name=SCOUT_OPIK_PROMPT_NAME,
-            version=version,
+            commit=version,
             project_name=OPIK_PROJECT,
         )
     except PromptTemplateStructureMismatch:

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
|  |  | `@@ -281,7 +281,7 @@ def load_system_prompt() -> str:` |
| `281` | `281` | `try:` |
| `282` | `282` | `chat = client.get_chat_prompt(` |
| `283` | `283` | `name=SCOUT_OPIK_PROMPT_NAME,` |
| `284` |  | `- version=version,` |
|  | `284` | `+ commit=version,` |
| `285` | `285` | `project_name=OPIK_PROJECT,` |
| `286` | `286` | `)` |
| `287` | `287` | `except PromptTemplateStructureMismatch:` |