Skip to content

Commit e63c340

Browse files
authored
Benchmark minor improvements (#418)
1 parent e62b7ca commit e63c340

3 files changed

Lines changed: 38 additions & 22 deletions

File tree

.github/workflows/benchmark.yml

Lines changed: 20 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,9 @@ permissions:
33
contents: read
44
on:
55
workflow_dispatch:
6+
schedule:
7+
# Every Monday and Thursday at 3 AM UTC+8 (cron is evaluated in UTC: 19:00 on Sunday and Wednesday)
8+
- cron: '0 19 * * 0,3'
69

710
jobs:
811
benchmark:
@@ -25,7 +28,7 @@ jobs:
2528
runner:
2629
- self-hosted
2730
- 1ES.Pool=agl-runner-cpu
28-
timeout: 60
31+
timeout: 45
2932
args: >-
3033
--mode batch
3134
--total-tasks 4096
@@ -40,7 +43,7 @@ jobs:
4043
runner:
4144
- self-hosted
4245
- 1ES.Pool=agl-runner-cpu
43-
timeout: 60
46+
timeout: 45
4447
args: >-
4548
--mode batch
4649
--total-tasks 10000
@@ -70,25 +73,25 @@ jobs:
7073
runner:
7174
- self-hosted
7275
- 1ES.Pool=agl-runner-cpu
73-
timeout: 60
76+
timeout: 120
7477
args: >-
7578
--mode batch
76-
--total-tasks 100000
79+
--total-tasks 50000
7780
--batch-size 8192
7881
--n-runners 256
7982
--max-rounds 6
8083
--sleep-seconds 0.1
8184
- id: scenario-long-queues
8285
display: Long rollout queues
8386
kind: scenario
84-
store_workers: 32
87+
store_workers: 48
8588
runner:
8689
- self-hosted
8790
- 1ES.Pool=agl-runner-cpu
88-
timeout: 60
91+
timeout: 120
8992
args: >-
9093
--mode batch_partial
91-
--total-tasks 100000
94+
--total-tasks 50000
9295
--batch-size 1024
9396
--n-runners 256
9497
--remaining-tasks 4096
@@ -97,14 +100,14 @@ jobs:
97100
- id: scenario-high-concurrency
98101
display: High-throughput concurrent requests
99102
kind: scenario
100-
store_workers: 32
103+
store_workers: 96
101104
runner:
102105
- self-hosted
103106
- 1ES.Pool=agl-runner-cpu
104-
timeout: 60
107+
timeout: 120
105108
args: >-
106109
--mode single
107-
--total-tasks 100000
110+
--total-tasks 50000
108111
--concurrency 2048
109112
--n-runners 256
110113
--max-rounds 2
@@ -172,6 +175,7 @@ jobs:
172175
STORE_URL: http://localhost:4747
173176
STORE_API_URL: http://localhost:4747/v1/agl
174177
PROM_URL: http://localhost:9090
178+
GITHUB_ACTIONS_TIMEOUT_MINUTES: ${{ matrix.workload.timeout }}
175179
WORKLOAD_KIND: ${{ matrix.workload.kind }}
176180
WORKLOAD_ID: ${{ matrix.workload.id }}
177181
BACKEND_ID: ${{ matrix.backend.id }}
@@ -338,27 +342,27 @@ jobs:
338342
runner: ubuntu-latest
339343
workload:
340344
- id: high-insert
341-
total_tasks: 100000
345+
total_tasks: 50000
342346
concurrency: 2048
343347
type: insert
344348
- id: medium-insert
345-
total_tasks: 100000
349+
total_tasks: 50000
346350
concurrency: 128
347351
type: insert
348352
- id: low-insert
349-
total_tasks: 100000
353+
total_tasks: 50000
350354
concurrency: 4
351355
type: insert
352356
- id: high-dequeue
353-
total_tasks: 100000
357+
total_tasks: 50000
354358
concurrency: 2048
355359
type: dequeue
356360
- id: medium-dequeue
357-
total_tasks: 100000
361+
total_tasks: 50000
358362
concurrency: 128
359363
type: dequeue
360364
- id: low-dequeue
361-
total_tasks: 100000
365+
total_tasks: 50000
362366
concurrency: 4
363367
type: dequeue
364368
env:

docker/compose.store.yml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,11 @@ services:
99

1010
command: agl store --host 0.0.0.0 --port 4747
1111

12+
ulimits:
13+
nofile:
14+
soft: 65535
15+
hard: 65535
16+
1217
develop:
1318
watch:
1419
# Sync the working directory with the `/app` directory in the container

tests/benchmark/benchmark_store.py

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
import random
99
import sys
1010
import threading
11+
import time
1112
from typing import Any, Dict, List, Literal, Optional, Sequence, Set, Tuple, cast
1213

1314
from rich.console import Console
@@ -19,7 +20,9 @@
1920

2021
console = Console()
2122

22-
MAX_RUNTIME_SECONDS = 30 * 60
23+
# Subtract 10 minutes from the job timeout to leave time for setting up the environment.
24+
MAX_RUNTIME_SECONDS = (int(os.getenv("GITHUB_ACTIONS_TIMEOUT_MINUTES", "30")) - 10) * 60
25+
MAX_STALE_SECONDS = 300
2326

2427

2528
def _abort_due_to_timeout() -> None:
@@ -157,7 +160,7 @@ async def algorithm_batch(self, total_tasks: int, batch_size: int):
157160

158161
pending = {rollout_id: task_name for rollout_id, task_name in batch_rollouts}
159162
completed_ids: Set[str] = set()
160-
completed_ids_last_updated: int = 0
163+
completed_ids_last_updated: float = time.perf_counter()
161164
while len(completed_ids) < len(batch_rollouts):
162165
finished_rollouts = await store.wait_for_rollouts(
163166
rollout_ids=[rollout_id for rollout_id, _ in batch_rollouts],
@@ -177,13 +180,17 @@ async def algorithm_batch(self, total_tasks: int, batch_size: int):
177180

178181
# Warn about stale rollouts, and abort if progress has stalled for too long
179182
if complete_ids_updated:
180-
completed_ids_last_updated = 0
183+
completed_ids_last_updated = time.perf_counter()
181184
else:
182-
completed_ids_last_updated += 1
183-
if completed_ids_last_updated >= 10:
185+
if time.perf_counter() - completed_ids_last_updated > MAX_STALE_SECONDS / 2:
184186
unfinished_ids = set(rollout_id for rollout_id, _ in batch_rollouts) - completed_ids
185187
print(f"Stale rollouts: {unfinished_ids}")
186-
completed_ids_last_updated = 0
188+
if time.perf_counter() - completed_ids_last_updated > MAX_STALE_SECONDS:
189+
current_workers = await store.query_workers()
190+
console.print(f"Stalled. Current worker status shown below:")
191+
for worker in current_workers:
192+
console.print(f" Worker: {worker}", width=1024) # Avoid wrapping
193+
raise RuntimeError("Rollout progress has stalled for too long")
187194

188195
await asyncio.sleep(5.0)
189196

0 commit comments

Comments
 (0)