braintrustdata · Jeff McCollum (jeffmccollum) · May 28, 2026
diff --git a/README.md b/README.md
@@ -4,14 +4,16 @@ A load testing suite for running benchmarks on self-hosted Braintrust data plane
 
 ## Overview
 
-This suite currently supports three types of tests:
+This suite currently supports four types of tests:
 
 - **Load Test**: Spawns simulated users to bombard the data plane with logs, simulating production traffic
 
 - **Large Eval Test**: Generates a large synthetic dataset and runs an eval against it
 
 - **Functional Test**: Exercises core API create/read/delete flows across key Braintrust resources
 
+- **Smoke Test**: Exercises the Topics pipeline on a disposable project with a small synthetic trace set
+
 The suite can be extended to support additional test types in the future, and that is a goal.
 
 Each test is highly configurable via the `braintest.yaml` config file. The tests should be configured to simulate a customer's expected load and usage patterns. We want to ensure that the infra Braintrust is hosted on can handle the customer's use case, and size up components accordingly if the tests fail.
@@ -63,11 +65,20 @@ To override any config value via environment variable, use `__` (double undersco
 | `loadtest.processes` | `LOADTEST__PROCESSES` |
 | `evaltest.trial_count` | `EVALTEST__TRIAL_COUNT` |
 | `functionaltest.name_prefix` | `FUNCTIONALTEST__NAME_PREFIX` |
+| `smoketest.count` | `SMOKETEST__COUNT` |
 
 Example:
 ```bash
 BRAINTRUST__API_URL=https://my-api.example.com LOADTEST__PROCESSES=8 python main.py
 ```
 
+The Topics smoke test is disabled by default because it creates a disposable
+project and exercises model-backed Topics processing. Enable it with
+`smoketest.run: True`, or preview it directly with a dry run (nothing is created):
+
+```bash
+uv run python smoke_test/run.py --dry-run
+```
+
 ## Important Notes
-- No actual LLM calls are made in any of these tests. Everything is mocked. The purpose is to load test Braintrust infra, not the LLM provider.
+- Load, eval, and functional tests do not make actual LLM calls. The Topics smoke test exercises model-backed Topics processing, so keep it disabled unless you intentionally want to test that path.
diff --git a/braintest.yaml b/braintest.yaml
@@ -6,6 +6,26 @@ functionaltest:
   run: True # Whether or not to run this test
   name_prefix: functional-test # Prefix used when creating test resources
 
+smoketest:
+  run: False # Whether or not to run the Topics smoke test. This creates a disposable project.
+  org: null # Optional org name. If null, the script discovers it from BRAINTRUST_API_KEY.
+  api_url: null # Optional dataplane API URL override. If null, the script discovers it from BRAINTRUST_API_KEY.
+  project_prefix: topics-smoke # Created project names are suffixed with a UTC timestamp
+  count: 105 # Topics generation needs at least 100 facet summaries
+  idle_seconds: 10 # Minimum idle window before Topics processing runs
+  topic_window: 1h # Backfill window for Topics automation
+  generation_cadence: 1h # Topics automation rerun cadence
+  relabel_overlap: 10m # Overlap window used for relabeling
+  allow_below_threshold: False # Only set True when intentionally testing pre-generation behavior
+  skip_facet_preflight: False # Skip direct and async-batch preflight calls before bulk insert
+  skip_running_check: False # Skip polling for initial running or processed Topics rows
+  timeout: 60 # Generic API timeout in seconds
+  facet_preflight_timeout: 30 # Timeout for direct Topics preflight calls
+  function_visibility_timeout: 60 # Time to wait for saved function refs to resolve
+  function_visibility_interval: 2 # Poll interval for saved function visibility
+  running_check_timeout: 90 # Time to wait for Topics rows to start running or processing
+  running_check_interval: 3 # Poll interval for Topics status checks
+
 evaltest:
   run: True # Whether or not to run this test
   project_id: null # if null or blank, will create a new project with name above

diff --git a/config.py b/config.py
@@ -27,6 +27,27 @@ class EvalTestConfig(BaseModel):
     dataset: DatasetConfig = DatasetConfig()
 
 
+class SmokeTestConfig(BaseModel):  # Settings for the Topics smoke test (`smoketest` section of braintest.yaml).
+    run: bool = False  # Off by default; running the test creates a disposable project.
+    org: str | None = None  # Optional org name; None means discover it from BRAINTRUST_API_KEY.
+    api_url: str | None = None  # Optional dataplane API URL override; None means discover it.
+    project_prefix: str = "topics-smoke"  # Created project names get a UTC timestamp suffix.
+    count: int = 105  # Traces to insert; Topics generation needs at least 100 facet summaries.
+    idle_seconds: int = 10  # Minimum idle window before Topics processing runs.
+    topic_window: str = "1h"  # Backfill window for Topics automation.
+    generation_cadence: str = "1h"  # Topics automation rerun cadence.
+    relabel_overlap: str = "10m"  # Overlap window used for relabeling.
+    allow_below_threshold: bool = False  # Set True only to test pre-generation behavior.
+    skip_facet_preflight: bool = False  # Skip direct and async-batch preflight calls before bulk insert.
+    skip_running_check: bool = False  # Skip polling for initial running or processed Topics rows.
+    timeout: int = 60  # Generic API timeout, in seconds.
+    facet_preflight_timeout: int = 30  # Timeout for direct Topics preflight calls.
+    function_visibility_timeout: int = 60  # Time to wait for saved function refs to resolve.
+    function_visibility_interval: int = 2  # Poll interval for saved function visibility.
+    running_check_timeout: int = 90  # Time to wait for Topics rows to start running or processing.
+    running_check_interval: int = 3  # Poll interval for Topics status checks.
+
+
 class WaitTimeConfig(BaseModel):
     min: int = 5
     max: int = 10
@@ -81,6 +102,7 @@ class Settings(BaseSettings):
     braintrust: BraintrustConfig = BraintrustConfig()
     functionaltest: FunctionalTestConfig = FunctionalTestConfig()
     evaltest: EvalTestConfig = EvalTestConfig()
+    smoketest: SmokeTestConfig = SmokeTestConfig()
     loadtest: LoadTestConfig = LoadTestConfig()
 
     @classmethod

diff --git a/docs/smoke_test.md b/docs/smoke_test.md
@@ -0,0 +1,151 @@
+# Topics Smoke Test
+
+The `smoke_test/` folder contains a small support-only script for exercising
+Braintrust Topics on a fresh disposable project.
+
+## What It Does
+
+`smoke_test/run.py`:
+
+1. Uses `BRAINTRUST_API_KEY` to call the fixed app URL,
+   `https://www.braintrust.dev/api/apikey/login`.
+2. Discovers the org and dataplane `api_url` from that login response.
+3. Creates a fresh project named like `topics-smoke-20260515-143022`.
+4. Uses the built-in `Task` facet and creates a project-local topic-map function.
+5. Inserts one synthetic root LLM trace and runs direct and async-batch gateway
+   preflights through the normal Topics path.
+6. Inserts the remaining traces to reach 105 total.
+7. Enables a minimal Topics automation using those saved functions.
+8. Queues the Topics automation to run.
+9. Polls until the `Task` facet shows the expected 105 running or processed traces.
+
+The default count is 105 because topic generation needs at least 100 facet
+summaries. One facet keeps the model work and Baseten traffic as small as
+possible while still exercising the pipeline.
+
+The generated spans use `span_attributes.type = "llm"` because Topics builds
+facet input through Brainstore's thread preprocessor, which only includes LLM
+spans or spans with no explicit type.
+
+The script intentionally does not expose facet or model override flags. It uses
+the product's built-in `Task` facet and lets the backend choose the supported
+facet model for the dataplane.
+
+The preflight catches model routing or preprocessor failures before the
+remaining 104 traces are inserted and before Topics is queued. It also calls
+`/function/invoke-async-batch` with the gateway header because that is the path
+used by automation workers. Skip it only when testing the automation path
+without extra preflight calls:
+
+```bash
+uv run python smoke_test/run.py --skip-facet-preflight
+```
+
+## Usage
+
+Dry run without creating projects, traces, or automations:
+
+```bash
+uv run python smoke_test/run.py --dry-run
+```
+
+The dry-run output includes the fixed app URL plus the org and dataplane API URL
+that would be used. If no API key is present, those fields are only populated
+from explicit arguments or environment variables.
+
+Real run:
+
+```bash
+export BRAINTRUST_API_KEY=...
+uv run python smoke_test/run.py
+```
+
+Run through the full suite by enabling the `smoketest` section in
+`braintest.yaml`:
+
+```yaml
+smoketest:
+  run: True
+```
+
+The direct CLI and `main.py` entrypoint both use `braintest.yaml` defaults.
+CLI flags still override those defaults for one-off runs.
+
+The real run prints the app URL, org, and dataplane API URL before creating the
+project. On success, the final line is a clickable Topics results link for the
+created project.
+
+By default, the script waits up to 90 seconds for the initial facet work to show
+the expected running or processed count:
+
+```bash
+uv run python smoke_test/run.py --running-check-timeout 120 --running-check-interval 5
+```
+
+The Topics status query can temporarily move while the automation cursor
+advances. The script keeps polling until the expected traces are either running
+or already processed, and fails if neither count appears before the timeout.
+
+After the running or processed count appears, full topic results are not
+necessarily immediate. With the default `--idle-seconds 10`, the runtime
+rechecks active Topics states about every 10 seconds. A healthy tiny project
+should usually move from `waiting_for_facets` to topic generation and backfill
+within a few minutes, but vendor/model latency and retry behavior can stretch
+that. If it remains in `waiting_for_facets` with `ready_topic_maps: 0` after
+several checks, the facet summaries are not becoming ready.
+
+Skip that verification only when you want the old fire-and-forget behavior:
+
+```bash
+uv run python smoke_test/run.py --skip-running-check
+```
+
+If the API key belongs to multiple orgs:
+
+```bash
+uv run python smoke_test/run.py --org braintrustdata.com
+```
+
+If discovery fails or you need to force a dataplane:
+
+```bash
+uv run python smoke_test/run.py --org braintrustdata.com --api-url https://example.cloudfront.net
+```
+
+Check the project after creation:
+
+```bash
+uv run python smoke_test/run.py status --project topics-smoke-20260515-143022
+```
+
+## Expected States
+
+Topics may move through these states:
+
+- `waiting_for_facets`: facet summaries are still being processed or fewer than
+  100 summaries are ready.
+- `recomputing_topics`: topic maps are being generated from the summaries.
+- `pending_logs_processing`: topics are ready and classifications are being
+  prepared.
+- `processing_logs`: classifications are being written back to logs.
+- `idle`: the automation has no immediate work queued.
+
+## Cost Guardrails
+
+- Default traces: 105.
+- Facet: built-in `Task` only.
+- No facet model override is sent; the backend default is used.
+- Direct and async-batch preflight calls are run before the remaining traces are
+  seeded.
+- Counts below 100 are rejected unless `--allow-below-threshold` is passed.
+- Each run creates a fresh project so results are isolated and easy to inspect.
+- The post-queue running check fails at timeout if rows are still attempted but
+  not completed.
+
+## Local Checks
+
+```bash
+uv run python -m py_compile smoke_test/run.py
+uv run python -m unittest smoke_test/test_topics_smoke.py
+uv run python smoke_test/run.py --dry-run
+```
diff --git a/main.py b/main.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 """
-Main script to orchestrate functionaltest, evaltest, and loadtest execution
-based on braintest.yaml config.
+Main script to orchestrate functionaltest, smoketest, evaltest, and loadtest
+execution based on braintest.yaml config.
 """
 
 import os
@@ -43,6 +43,21 @@ def run_functionaltest(config):
         return False
 
 
+def run_smoketest(config):  # Run smoke_test/run.py as a child process; returns True on exit code 0, False otherwise. `config` is not read in this function.
+    try:
+        subprocess.run(
+            [sys.executable, "smoke_test/run.py"],  # same interpreter as this process; script path is relative to the working directory
+            check=True,  # a non-zero exit raises CalledProcessError, handled below
+            capture_output=False,  # child output is not captured, so it goes to the inherited stdout/stderr
+            env={**os.environ, "PYTHONPATH": "."},  # copy of the current environment with PYTHONPATH set (or replaced) to "."
+        )
+        print("Smoke test completed successfully.")
+        return True
+    except subprocess.CalledProcessError as e:
+        print(f"Smoke test failed with error code {e.returncode}")
+        return False
+
+
 def run_loadtest(config):
     print("Load Test")
 
@@ -211,6 +226,14 @@ def main():
             print("Functional test is not enabled. Skipping...")
             results["functionaltest"] = "SKIPPED"
 
+        if config.get("smoketest", {}).get("run", False):
+            print("\n-----Running Smoke Test-----")
+            smoketest_success = run_smoketest(config)
+            results["smoketest"] = "SUCCESS" if smoketest_success else "FAILED"
+        else:
+            print("\nSmoke test is not enabled. Skipping...")
+            results["smoketest"] = "SKIPPED"
+
         if config.get("evaltest", {}).get("run", False):
             print("\n-----Running Eval Test-----")
             evaltest_success = run_evaltest(config)