Skip to content

Commit acdd4dc

Browse files
Add server perf soak CI harness
1 parent 33a161a commit acdd4dc

5 files changed

Lines changed: 965 additions & 0 deletions

File tree

.github/workflows/server-perf.yml

Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
1+
name: Server Perf
2+
3+
on:
4+
pull_request:
5+
branches: [main]
6+
paths:
7+
- ".github/workflows/server-perf.yml"
8+
- "Dockerfile"
9+
- "docker-compose.yml"
10+
- "app/Support/ServerPollingCache.php"
11+
- "app/Support/WorkflowTaskPoller.php"
12+
- "app/Support/WorkflowTaskPollRequestStore.php"
13+
- "config/server.php"
14+
- "scripts/perf/**"
15+
push:
16+
branches: [main]
17+
paths:
18+
- ".github/workflows/server-perf.yml"
19+
- "Dockerfile"
20+
- "docker-compose.yml"
21+
- "app/Support/ServerPollingCache.php"
22+
- "app/Support/WorkflowTaskPoller.php"
23+
- "app/Support/WorkflowTaskPollRequestStore.php"
24+
- "config/server.php"
25+
- "scripts/perf/**"
26+
schedule:
27+
- cron: "17 7 * * *"
28+
workflow_dispatch:
29+
inputs:
30+
duration_seconds:
31+
description: "Soak duration in seconds"
32+
required: false
33+
default: "7200"
34+
concurrency:
35+
description: "Concurrent long-poll workers"
36+
required: false
37+
default: "24"
38+
grafana_remote_write:
39+
description: "Enable Grafana Cloud remote_write when variables/secrets are configured"
40+
required: false
41+
type: boolean
42+
default: true
43+
44+
permissions:
45+
contents: read
46+
47+
concurrency:
48+
group: server-perf-${{ github.event_name }}-${{ github.ref }}
49+
cancel-in-progress: false
50+
51+
jobs:
52+
smoke:
53+
name: Polling cache bounded-growth smoke
54+
if: github.event_name == 'pull_request' || github.event_name == 'push'
55+
runs-on: ubuntu-latest
56+
timeout-minutes: 45
57+
58+
steps:
59+
- name: Checkout server
60+
uses: actions/checkout@v6
61+
62+
- name: Set up Docker Buildx
63+
uses: docker/setup-buildx-action@v4
64+
65+
- name: Run short perf smoke
66+
env:
67+
DW_PERF_DURATION_SECONDS: "120"
68+
DW_PERF_CONCURRENCY: "8"
69+
DW_PERF_NAMESPACES: "4"
70+
DW_PERF_TASK_QUEUES: "8"
71+
DW_PERF_MAX_SERVER_MEMORY_MB: "768"
72+
DW_PERF_MAX_POLLING_KEYS: "512"
73+
DW_PERF_MAX_FINAL_POLLING_KEYS: "0"
74+
run: scripts/perf/run-server-soak.sh
75+
76+
- name: Upload perf artifacts
77+
if: always()
78+
uses: actions/upload-artifact@v7
79+
with:
80+
name: server-perf-smoke
81+
path: build/perf/
82+
if-no-files-found: warn
83+
84+
soak:
85+
name: Vultr self-hosted polling cache soak
86+
if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch'
87+
runs-on: [self-hosted, linux, x64, vultr-perf, server-perf]
88+
timeout-minutes: 390
89+
90+
steps:
91+
- name: Checkout server
92+
uses: actions/checkout@v6
93+
94+
- name: Run long perf soak
95+
env:
96+
DW_PERF_DURATION_SECONDS: ${{ github.event_name == 'workflow_dispatch' && inputs.duration_seconds || '7200' }}
97+
DW_PERF_CONCURRENCY: ${{ github.event_name == 'workflow_dispatch' && inputs.concurrency || '24' }}
98+
DW_PERF_NAMESPACES: "8"
99+
DW_PERF_TASK_QUEUES: "16"
100+
DW_PERF_MAX_SERVER_MEMORY_MB: "1024"
101+
DW_PERF_MAX_POLLING_KEYS: "2048"
102+
DW_PERF_MAX_FINAL_POLLING_KEYS: "0"
103+
DW_PERF_MAX_SERVER_MEMORY_SLOPE_MB_HOUR: "128"
104+
DW_PERF_GRAFANA_REMOTE_WRITE_ENABLED: ${{ github.event_name != 'workflow_dispatch' || inputs.grafana_remote_write }}
105+
DW_PERF_GRAFANA_REMOTE_WRITE_URL: ${{ vars.DW_PERF_GRAFANA_REMOTE_WRITE_URL }}
106+
DW_PERF_GRAFANA_USERNAME: ${{ vars.DW_PERF_GRAFANA_USERNAME }}
107+
DW_PERF_GRAFANA_API_TOKEN: ${{ secrets.DW_PERF_GRAFANA_API_TOKEN }}
108+
run: scripts/perf/run-server-soak.sh
109+
110+
- name: Upload perf artifacts
111+
if: always()
112+
uses: actions/upload-artifact@v7
113+
with:
114+
name: server-perf-soak
115+
path: build/perf/
116+
if-no-files-found: warn

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,9 @@
44
/public/storage
55
/storage/*.key
66
/vendor
7+
/build/perf
8+
__pycache__/
9+
*.py[cod]
710
.env
811
.env.backup
912
.env.production

docs/perf-runner.md

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
# Server Perf Runner
2+
3+
Issue `zorporation/durable-workflow#461` tracks bounded cache and metric-cardinality discipline. The server repo contributes the CI/perf harness for that work.
4+
5+
## Runner Shape
6+
7+
Use GitHub Actions as the control plane and one trusted self-hosted Vultr runner for long soaks.
8+
9+
Initial Vultr size:
10+
11+
- Product: High Performance Cloud Compute
12+
- Shape: 4 vCPU / 8 GB RAM / NVMe
13+
- Budget: capped at about 48 USD/month, or about 0.071 USD/hour
14+
- Backups: off for the first pass
15+
16+
Vultr bills stopped instances, so destroy the server if the runner is no longer needed.
17+
18+
## GitHub Runner Labels
19+
20+
Register the runner with these labels:
21+
22+
- `self-hosted`
23+
- `linux`
24+
- `x64`
25+
- `vultr-perf`
26+
- `server-perf`
27+
28+
The long soak workflow targets all five labels. Do not attach these labels to general-purpose runners that may execute untrusted pull request code.
29+
30+
Install the current GitHub Actions runner package when provisioning the box. The workflow uses current Node 24-based actions, so the runner must be at least `2.327.1`.
31+
32+
## GitHub Configuration
33+
34+
Required for the soak job:
35+
36+
- A self-hosted runner with the labels above.
37+
38+
Optional for Grafana Cloud remote write:
39+
40+
- Repository variable `DW_PERF_GRAFANA_REMOTE_WRITE_URL`
41+
- Repository variable `DW_PERF_GRAFANA_USERNAME`
42+
- Repository secret `DW_PERF_GRAFANA_API_TOKEN`
43+
44+
When those values are absent, the harness still runs and uploads local JSON, log, and Prometheus exposition artifacts. When all are present, the wrapper starts a short-lived Prometheus sidecar that scrapes the harness endpoint and remote-writes to Grafana Cloud.
45+
46+
## Harness Behavior
47+
48+
The harness starts the production Docker Compose stack with isolated ports and a unique Compose project name, then drives the real worker polling route:
49+
50+
- creates perf namespaces,
51+
- registers workers across multiple task queues,
52+
- repeatedly calls `POST /api/worker/workflow-tasks/poll` with unique `poll_request_id` values,
53+
- samples server, Redis, MySQL, and polling-cache counts,
54+
- waits for the polling-result TTL window to drain,
55+
- fails if cache keys, memory ceiling, request errors, or long-run memory slope exceed the configured budgets.
56+
57+
The short smoke job runs on GitHub-hosted runners and proves the harness plus cache-key drain path. The long soak runs on the Vultr self-hosted runner and enforces the memory slope budget after the run is long enough to make that signal meaningful.
58+
59+
## Local Run
60+
61+
From the server repo:
62+
63+
```bash
64+
DW_PERF_DURATION_SECONDS=120 \
65+
DW_PERF_CONCURRENCY=8 \
66+
scripts/perf/run-server-soak.sh
67+
```
68+
69+
Artifacts land in `build/perf/` by default. The script removes the Compose project and volumes on exit.
70+
71+
## Safety Rules
72+
73+
- Do not run the self-hosted soak job for pull requests from forks.
74+
- Keep the runner dedicated to trusted workflows.
75+
- Keep Docker cleanup in the job even on failure.
76+
- Do not commit Grafana tokens, runner registration tokens, or generated Prometheus configs.

scripts/perf/run-server-soak.sh

Lines changed: 180 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,180 @@
1+
#!/usr/bin/env bash
# Server perf soak wrapper.
#
# This first section resolves the run settings (each one can be overridden
# through the environment variable named in its default expansion), exports
# the server auth settings so child processes such as `docker compose` can
# see them, and writes the Compose override file used for the run.
set -euo pipefail

# Repository root, resolved relative to this script (scripts/perf/ -> ../..).
ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
ARTIFACT_DIR="${DW_PERF_ARTIFACT_DIR:-$ROOT_DIR/build/perf}"
# The epoch suffix keeps re-runs of the same GitHub run id from colliding.
RUN_ID="${GITHUB_RUN_ID:-local}-$(date +%s)"
PROJECT="${DW_PERF_COMPOSE_PROJECT:-dw-server-perf-$RUN_ID}"
SERVER_PORT="${DW_PERF_SERVER_PORT:-18080}"
# MySQL and Redis are not published to the host (see `ports: !override []`
# in the override below), so there are no host port settings for them.
METRICS_PORT="${DW_PERF_METRICS_PORT:-19090}"
AUTH_TOKEN="${DW_PERF_AUTH_TOKEN:-perf-token}"
POLL_TIMEOUT="${DW_PERF_POLL_TIMEOUT:-1}"
PROMETHEUS_CONTAINER="${PROJECT}-prometheus"
# Filled in by maybe_start_prometheus once a sidecar config is generated.
PROMETHEUS_CONFIG_DIR=""

mkdir -p "$ARTIFACT_DIR"

# Generate a throwaway application key unless the caller supplied one.
if [ -z "${APP_KEY:-}" ]; then
  APP_KEY="base64:$(openssl rand -base64 32)"
  export APP_KEY
fi

export APP_VERSION="${APP_VERSION:-2.0.0-perf}"
export DW_AUTH_DRIVER="${DW_AUTH_DRIVER:-token}"
export DW_AUTH_TOKEN="${DW_AUTH_TOKEN:-$AUTH_TOKEN}"
export DW_WORKER_TOKEN="${DW_WORKER_TOKEN:-}"
export DW_OPERATOR_TOKEN="${DW_OPERATOR_TOKEN:-}"
export DW_ADMIN_TOKEN="${DW_ADMIN_TOKEN:-}"
export DW_AUTH_BACKWARD_COMPATIBLE="${DW_AUTH_BACKWARD_COMPATIBLE:-true}"

# Compose override: quieter logs, short poll timings, the server published on
# SERVER_PORT only, and MySQL/Redis left unpublished. The heredoc delimiter is
# unquoted on purpose so $POLL_TIMEOUT and ${SERVER_PORT} are expanded.
OVERRIDE_FILE="$ARTIFACT_DIR/docker-compose.perf.yml"
cat > "$OVERRIDE_FILE" <<YAML
services:
  bootstrap:
    environment:
      LOG_LEVEL: warning
      DW_WORKER_POLL_TIMEOUT: "$POLL_TIMEOUT"
      DW_WORKER_POLL_INTERVAL_MS: "50"
      DW_WORKER_POLL_SIGNAL_CHECK_INTERVAL_MS: "25"
  server:
    ports: !override
      - "${SERVER_PORT}:8080"
    environment:
      LOG_LEVEL: warning
      DW_WORKER_POLL_TIMEOUT: "$POLL_TIMEOUT"
      DW_WORKER_POLL_INTERVAL_MS: "50"
      DW_WORKER_POLL_SIGNAL_CHECK_INTERVAL_MS: "25"
  worker:
    environment:
      LOG_LEVEL: warning
      DW_WORKER_POLL_TIMEOUT: "$POLL_TIMEOUT"
      DW_WORKER_POLL_INTERVAL_MS: "50"
      DW_WORKER_POLL_SIGNAL_CHECK_INTERVAL_MS: "25"
  scheduler:
    environment:
      LOG_LEVEL: warning
  mysql:
    ports: !override []
    healthcheck:
      test: ["CMD", "mysqladmin", "ping", "-h", "localhost"]
      interval: 5s
      timeout: 3s
      retries: 24
      start_period: 30s
  redis:
    ports: !override []
YAML
69+
70+
#######################################
# Exit handler: saves container logs, shuts down the Prometheus sidecar,
# deletes the generated Prometheus config, and tears the Compose project down.
# Every step is best-effort so a failure here never masks the run's status.
# Globals (read): PROJECT, ARTIFACT_DIR, PROMETHEUS_CONTAINER,
#                 PROMETHEUS_CONFIG_DIR, ROOT_DIR, OVERRIDE_FILE
# Exits with the status that was current when the handler was entered.
#######################################
cleanup() {
  # Must be the first statement: any later command would overwrite $?.
  local status=$?
  local service

  # A container may not exist if startup failed, hence `|| true`.
  for service in server worker scheduler mysql redis; do
    docker logs "${PROJECT}-${service}-1" > "$ARTIFACT_DIR/${service}.log" 2>&1 || true
  done

  # Stop gracefully first (SIGTERM, up to 30s) so Prometheus gets a chance to
  # flush queued remote_write samples; `docker rm -f` alone kills it at once.
  docker stop -t 30 "$PROMETHEUS_CONTAINER" >/dev/null 2>&1 || true
  # The sidecar is started with --rm, so this is normally a no-op safety net.
  docker rm -f "$PROMETHEUS_CONTAINER" >/dev/null 2>&1 || true
  if [ -n "$PROMETHEUS_CONFIG_DIR" ]; then
    # The generated config holds the remote_write credentials.
    rm -rf -- "$PROMETHEUS_CONFIG_DIR"
  fi

  docker compose -p "$PROJECT" \
    -f "$ROOT_DIR/docker-compose.yml" \
    -f "$OVERRIDE_FILE" \
    down -v --remove-orphans || true
  exit "$status"
}
87+
trap cleanup EXIT
88+
89+
# Escapes backslashes and double quotes in $1 so the value can be placed
# inside a double-quoted YAML scalar. Writes the result to stdout.
_yaml_dq() {
  printf '%s' "$1" | sed -e 's/[\\"]/\\&/g'
}

#######################################
# Starts a Prometheus sidecar that scrapes the harness metrics endpoint on
# the Docker host and remote-writes the samples to Grafana Cloud.
# Does nothing (apart from a notice on stdout) when remote_write is disabled
# or when any of the three Grafana settings is missing.
# Globals (read):    DW_PERF_GRAFANA_REMOTE_WRITE_ENABLED,
#                    DW_PERF_GRAFANA_REMOTE_WRITE_URL, DW_PERF_GRAFANA_USERNAME,
#                    DW_PERF_GRAFANA_API_TOKEN, DW_PERF_PROMETHEUS_IMAGE,
#                    METRICS_PORT, PROMETHEUS_CONTAINER, GITHUB_REPOSITORY,
#                    GITHUB_WORKFLOW, GITHUB_RUN_ID, RUNNER_NAME
# Globals (written): PROMETHEUS_CONFIG_DIR (temp dir holding prometheus.yml)
# Returns: 0 when skipped or started; the docker/mktemp status on failure.
#######################################
maybe_start_prometheus() {
  if [ "${DW_PERF_GRAFANA_REMOTE_WRITE_ENABLED:-true}" != "true" ]; then
    echo "Grafana Cloud remote_write disabled for this run; writing local perf artifacts only."
    return
  fi

  if [ -z "${DW_PERF_GRAFANA_REMOTE_WRITE_URL:-}" ] \
    || [ -z "${DW_PERF_GRAFANA_USERNAME:-}" ] \
    || [ -z "${DW_PERF_GRAFANA_API_TOKEN:-}" ]; then
    echo "Grafana Cloud remote_write is not configured; writing local perf artifacts only."
    return
  fi

  # Escape every interpolated value up front: a stray `"` or `\` in a token,
  # runner name, or workflow name would otherwise yield an unparsable config.
  local repository workflow run_id runner remote_url username api_token
  repository="$(_yaml_dq "${GITHUB_REPOSITORY:-local}")"
  workflow="$(_yaml_dq "${GITHUB_WORKFLOW:-local}")"
  run_id="$(_yaml_dq "${GITHUB_RUN_ID:-local}")"
  runner="$(_yaml_dq "${RUNNER_NAME:-local}")"
  remote_url="$(_yaml_dq "${DW_PERF_GRAFANA_REMOTE_WRITE_URL}")"
  username="$(_yaml_dq "${DW_PERF_GRAFANA_USERNAME}")"
  api_token="$(_yaml_dq "${DW_PERF_GRAFANA_API_TOKEN}")"

  # The config embeds the API token, so it lives in a private temp dir rather
  # than in the uploaded artifact directory.
  PROMETHEUS_CONFIG_DIR="$(mktemp -d)"
  cat > "$PROMETHEUS_CONFIG_DIR/prometheus.yml" <<YAML
global:
  scrape_interval: 15s
scrape_configs:
  - job_name: durable_workflow_server_perf
    static_configs:
      - targets:
          - host.docker.internal:${METRICS_PORT}
        labels:
          repository: "${repository}"
          workflow: "${workflow}"
          run_id: "${run_id}"
          runner: "${runner}"
remote_write:
  - url: "${remote_url}"
    basic_auth:
      username: "${username}"
      password: "${api_token}"
YAML

  # --add-host maps host.docker.internal to the host gateway so the sidecar
  # can reach the metrics port on the Docker host.
  docker run -d --rm \
    --name "$PROMETHEUS_CONTAINER" \
    --add-host=host.docker.internal:host-gateway \
    -v "$PROMETHEUS_CONFIG_DIR/prometheus.yml:/etc/prometheus/prometheus.yml:ro" \
    "${DW_PERF_PROMETHEUS_IMAGE:-prom/prometheus:v2.55.1}" \
    --config.file=/etc/prometheus/prometheus.yml \
    --storage.tsdb.retention.time=2h \
    --web.enable-lifecycle >/dev/null
}
132+
133+
#######################################
# Prints the base URL the load harness should use to reach the server.
# Candidates, in order:
#   1. http://127.0.0.1:SERVER_PORT, if /api/health answers there;
#   2. the default-route gateway IP on SERVER_PORT, if /api/health answers;
#   3. the server container's own IP on port 8080 (not health-checked);
#   4. http://127.0.0.1:SERVER_PORT as the last resort.
# Globals (read): SERVER_PORT, PROJECT, ROOT_DIR, OVERRIDE_FILE
# Outputs: exactly one URL on stdout.
#######################################
server_base_url() {
  local base_url="http://127.0.0.1:${SERVER_PORT}"
  local gateway_url
  local gateway_ip
  local server_id
  local server_ips
  local server_ip

  if curl -fsS --max-time 2 "$base_url/api/health" >/dev/null 2>&1; then
    echo "$base_url"
    return
  fi

  # `|| true`: under pipefail a missing `ip` binary (or `ip` being cut off
  # when awk exits early) must not be treated as a fatal error.
  gateway_ip="$(ip route 2>/dev/null | awk '/default/ {print $3; exit}')" || true
  if [ -n "$gateway_ip" ]; then
    gateway_url="http://${gateway_ip}:${SERVER_PORT}"
    if curl -fsS --max-time 2 "$gateway_url/api/health" >/dev/null 2>&1; then
      echo "$gateway_url"
      return
    fi
  fi

  server_id="$(docker compose -p "$PROJECT" -f "$ROOT_DIR/docker-compose.yml" -f "$OVERRIDE_FILE" ps -q server)" || true
  # Only the first container id is inspected if several are listed.
  server_id="${server_id%%$'\n'*}"

  server_ips=""
  if [ -n "$server_id" ]; then
    # The trailing space in the template separates the addresses; without it
    # a container attached to several networks yields one fused, bogus IP.
    server_ips="$(docker inspect -f '{{range .NetworkSettings.Networks}}{{.IPAddress}} {{end}}' "$server_id" 2>/dev/null || true)"
  fi

  # Take the first non-empty address.
  server_ip=""
  read -r server_ip _ <<<"$server_ips" || true

  if [ -n "$server_ip" ]; then
    echo "http://${server_ip}:8080"
    return
  fi

  echo "$base_url"
}
164+
165+
# Main flow: bring the stack up from the repository root, optionally start
# the Prometheus sidecar, pick a reachable server URL, then run the load
# harness with the resolved settings passed through its environment.
cd "$ROOT_DIR"

compose_args=(-p "$PROJECT" -f "$ROOT_DIR/docker-compose.yml" -f "$OVERRIDE_FILE")

printf '%s\n' "Starting perf stack with project ${PROJECT} on http://127.0.0.1:${SERVER_PORT}"
docker compose "${compose_args[@]}" up -d --build --wait

maybe_start_prometheus

BASE_URL="$(server_base_url)"
printf '%s\n' "Running perf load against ${BASE_URL}"

# The harness is the last command, so its exit status becomes the script's
# status (the EXIT trap re-raises it after cleanup).
env \
  DW_PERF_BASE_URL="$BASE_URL" \
  DW_PERF_AUTH_TOKEN="$AUTH_TOKEN" \
  DW_PERF_ARTIFACT_DIR="$ARTIFACT_DIR" \
  DW_PERF_COMPOSE_PROJECT="$PROJECT" \
  DW_PERF_METRICS_PORT="$METRICS_PORT" \
  DW_PERF_POLL_TIMEOUT="$POLL_TIMEOUT" \
  "$ROOT_DIR/scripts/perf/server_soak.py"

0 commit comments

Comments
 (0)