
Commit fc4f1ca

address reasonable comments
1 parent 73aae0f commit fc4f1ca

File tree

8 files changed (+90, -100 lines)

src/cloudai/workloads/aiconfig/aiconfigurator.py

Lines changed: 7 additions & 1 deletion
@@ -18,7 +18,7 @@

 from typing import List, Optional, Union

-from pydantic import BaseModel, ConfigDict
+from pydantic import BaseModel, ConfigDict, model_validator

 from cloudai.core import CmdArgs, Installable, TestDefinition

@@ -74,6 +74,12 @@ class AiconfiguratorCmdArgs(CmdArgs):
     agg: Optional[Agg] = None
     disagg: Optional[Disagg] = None

+    @model_validator(mode="after")
+    def _validate_agg_disagg(self) -> "AiconfiguratorCmdArgs":
+        if self.agg is not None and self.disagg is not None:
+            raise ValueError("Only one of 'agg' or 'disagg' may be specified.")
+        return self
+

 class AiconfiguratorTestDefinition(TestDefinition):
     """Test object for running Aiconfigurator predictor as a workload."""

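The added validator makes agg and disagg mutually exclusive at model construction time. For context, a minimal standalone sketch of the same pydantic v2 pattern (simplified stand-in fields, not the project's real Agg/Disagg classes) showing how a mode="after" validator surfaces as a ValidationError:

from typing import Optional

from pydantic import BaseModel, ValidationError, model_validator


class ExampleArgs(BaseModel):
    # Hypothetical stand-ins for the real Agg/Disagg sub-configs.
    agg: Optional[dict] = None
    disagg: Optional[dict] = None

    @model_validator(mode="after")
    def _validate_agg_disagg(self) -> "ExampleArgs":
        if self.agg is not None and self.disagg is not None:
            raise ValueError("Only one of 'agg' or 'disagg' may be specified.")
        return self


try:
    ExampleArgs(agg={}, disagg={})  # both set: pydantic wraps the ValueError
except ValidationError as err:
    print(err)
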
src/cloudai/workloads/aiconfig/predictor.py

Lines changed: 22 additions & 67 deletions
@@ -16,9 +16,15 @@

 from __future__ import annotations

-import sys
 from typing import Any, Dict, Optional, cast

+from aiconfigurator.sdk import common
+from aiconfigurator.sdk import config as aic_config
+from aiconfigurator.sdk import inference_session as aic_inference_session
+from aiconfigurator.sdk import models as aic_models
+from aiconfigurator.sdk import perf_database as aic_perf_database
+from aiconfigurator.sdk.backends import factory as aic_backends_factory
+

 def _to_enum(enum_cls: Any, value_or_name: Any) -> Any:
     """
@@ -39,40 +45,6 @@ def _validate_nextn(nextn: int, nextn_accept_rates: Optional[list[float]]) -> li
     return nextn_accept_rates or []


-def _ensure_aiconfigurator_available(*, need_inference_session: bool) -> dict[str, Any]:
-    """
-    Import required aiconfigurator symbols or raise a consistent ModuleNotFoundError.
-
-    Returns a dict of imported symbols so call sites can stay concise.
-    """
-    try:
-        from aiconfigurator.sdk import common
-        from aiconfigurator.sdk.backends.factory import get_backend
-        from aiconfigurator.sdk.config import ModelConfig, RuntimeConfig
-        from aiconfigurator.sdk.models import get_model
-        from aiconfigurator.sdk.perf_database import get_database
-
-        if need_inference_session:
-            from aiconfigurator.sdk.inference_session import InferenceSession
-        else:
-            InferenceSession = None  # type: ignore[assignment]
-    except ModuleNotFoundError as e:
-        raise ModuleNotFoundError(
-            "Missing dependency 'aiconfigurator'. Install it in the Python environment used for this test. "
-            f"(python={sys.executable})"
-        ) from e
-
-    return {
-        "common": common,
-        "get_backend": get_backend,
-        "ModelConfig": ModelConfig,
-        "RuntimeConfig": RuntimeConfig,
-        "get_model": get_model,
-        "get_database": get_database,
-        "InferenceSession": InferenceSession,
-    }
-
-
 def predict_ifb_single(
     *,
     model_name: str,
@@ -103,22 +75,14 @@ def predict_ifb_single(
     overwrite_num_layers: int = 0,
 ) -> Dict[str, Any]:
     """Predict metrics for a single IFB configuration using the aiconfigurator SDK primitives."""
-    syms = _ensure_aiconfigurator_available(need_inference_session=False)
-    common = syms["common"]
-    get_backend = syms["get_backend"]
-    ModelConfig = syms["ModelConfig"]
-    RuntimeConfig = syms["RuntimeConfig"]
-    get_model = syms["get_model"]
-    get_database = syms["get_database"]
-
-    database = get_database(system=system, backend=backend, version=version)
+    database = aic_perf_database.get_database(system=system, backend=backend, version=version)
     if database is None:
         raise ValueError(f"No perf database found for system={system} backend={backend} version={version}")
-    backend_impl = cast(Any, get_backend(backend))
+    backend_impl = cast(Any, aic_backends_factory.get_backend(backend))

     accept_rates = _validate_nextn(nextn, nextn_accept_rates)

-    mc = ModelConfig(
+    mc = aic_config.ModelConfig(
         tp_size=tp,
         pp_size=pp,
         attention_dp_size=dp,
@@ -133,9 +97,9 @@ def predict_ifb_single(
         nextn_accept_rates=accept_rates,
         overwrite_num_layers=overwrite_num_layers,
     )
-    model = get_model(model_name, mc, backend)
+    model = aic_models.get_model(model_name, mc, backend)

-    rc = RuntimeConfig(batch_size=batch_size, isl=isl, osl=osl)
+    rc = aic_config.RuntimeConfig(batch_size=batch_size, isl=isl, osl=osl)
     summary = backend_impl.run_ifb(model=model, database=database, runtime_config=rc, ctx_tokens=ctx_tokens)
     df = summary.get_summary_df()
     if df is None or df.empty:
@@ -197,24 +161,15 @@ def predict_disagg_single(
     decode_correction_scale: float = 1.0,
 ) -> Dict[str, Any]:
     """Predict metrics for a single disaggregated configuration (explicit prefill/decode workers)."""
-    syms = _ensure_aiconfigurator_available(need_inference_session=True)
-    common = syms["common"]
-    get_backend = syms["get_backend"]
-    ModelConfig = syms["ModelConfig"]
-    RuntimeConfig = syms["RuntimeConfig"]
-    get_model = syms["get_model"]
-    get_database = syms["get_database"]
-    InferenceSession = syms["InferenceSession"]
-
-    perf_db = get_database(system=system, backend=backend, version=version)
+    perf_db = aic_perf_database.get_database(system=system, backend=backend, version=version)
     if perf_db is None:
         raise ValueError(f"No perf database found for system={system} backend={backend} version={version}")

-    perf_backend = cast(Any, get_backend(backend))
+    perf_backend = cast(Any, aic_backends_factory.get_backend(backend))

     accept_rates = _validate_nextn(nextn, nextn_accept_rates)

-    p_mc = ModelConfig(
+    p_mc = aic_config.ModelConfig(
         tp_size=p_tp,
         pp_size=p_pp,
         attention_dp_size=p_dp,
@@ -229,7 +184,7 @@ def predict_disagg_single(
         nextn_accept_rates=accept_rates,
         overwrite_num_layers=overwrite_num_layers,
     )
-    d_mc = ModelConfig(
+    d_mc = aic_config.ModelConfig(
         tp_size=d_tp,
         pp_size=d_pp,
         attention_dp_size=d_dp,
@@ -245,14 +200,14 @@ def predict_disagg_single(
         nextn_accept_rates=accept_rates,
         overwrite_num_layers=overwrite_num_layers,
     )
-    rc_prefill = RuntimeConfig(batch_size=p_bs, isl=isl, osl=osl)
-    rc_decode = RuntimeConfig(batch_size=d_bs, isl=isl, osl=osl)
+    rc_prefill = aic_config.RuntimeConfig(batch_size=p_bs, isl=isl, osl=osl)
+    rc_decode = aic_config.RuntimeConfig(batch_size=d_bs, isl=isl, osl=osl)

-    prefill_model = get_model(model_name, p_mc, backend)
-    decode_model = get_model(model_name, d_mc, backend)
+    prefill_model = aic_models.get_model(model_name, p_mc, backend)
+    decode_model = aic_models.get_model(model_name, d_mc, backend)

-    prefill_sess = InferenceSession(prefill_model, perf_db, perf_backend)
-    decode_sess = InferenceSession(decode_model, perf_db, perf_backend)
+    prefill_sess = aic_inference_session.InferenceSession(prefill_model, perf_db, perf_backend)
+    decode_sess = aic_inference_session.InferenceSession(decode_model, perf_db, perf_backend)

     prefill_summary = prefill_sess.run_static(mode="static_ctx", runtime_config=rc_prefill)
     decode_summary = decode_sess.run_static(mode="static_gen", runtime_config=rc_decode)

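With the lazy-import helper removed, a missing aiconfigurator package now fails when predictor.py is first imported rather than inside the predict functions. A minimal sketch (hypothetical caller, not part of this change) of how an entry point could still report that clearly:

import sys

try:
    # Importing the module now pulls in the aiconfigurator SDK eagerly.
    from cloudai.workloads.aiconfig import predictor  # noqa: F401
except ModuleNotFoundError as err:
    # Roughly the message the removed in-module helper used to raise.
    sys.exit(f"Missing dependency 'aiconfigurator' (python={sys.executable}): {err}")
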
src/cloudai/workloads/aiconfig/report_generation_strategy.py

Lines changed: 3 additions & 2 deletions
@@ -38,11 +38,12 @@ class AiconfiguratorReportGenerationStrategy(ReportGenerationStrategy):

     def can_handle_directory(self) -> bool:
         return isinstance(self.test_run.test, AiconfiguratorTestDefinition) and (
-            (self.test_run.output_path / "report.txt").is_file() or (self.test_run.output_path / "stdout.txt").is_file()
+            (self.test_run.output_path / "report.json").is_file()
+            or (self.test_run.output_path / "stdout.txt").is_file()
         )

     def _load_results(self) -> Optional[dict]:
-        result_path = self.test_run.output_path / "report.txt"
+        result_path = self.test_run.output_path / "report.json"
         if result_path.is_file():
             try:
                 with result_path.open("r", encoding="utf-8") as f:

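For illustration, a small standalone sketch of the JSON shape the strategy now reads from report.json; the payload keys mirror the test fixtures added in this commit, and the paths are placeholders:

import json
from pathlib import Path

output_path = Path("results/aiconfig")  # placeholder output directory
output_path.mkdir(parents=True, exist_ok=True)

payload = {"ttft_ms": 10.0, "tpot_ms": 2.0, "tokens_per_s_per_gpu": 3.0, "tokens_per_s_per_user": 4.0, "oom": False}
(output_path / "report.json").write_text(json.dumps(payload), encoding="utf-8")

with (output_path / "report.json").open("r", encoding="utf-8") as f:
    results = json.load(f)
print(results["tokens_per_s_per_gpu"])
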
src/cloudai/workloads/aiconfig/simple_predictor.py

Lines changed: 6 additions & 1 deletion
@@ -65,7 +65,12 @@ def parse_args() -> argparse.Namespace:
     parser.add_argument("--decode-correction-scale", type=float, default=1.0)

     # output
-    parser.add_argument("--output", required=True, type=Path, help="Path to write report.txt")
+    parser.add_argument(
+        "--output",
+        required=True,
+        type=Path,
+        help="Path to write predictor JSON output (filename is user-specified).",
+    )

     # optional quantization and features (strings to be converted by SDK)
     parser.add_argument("--gemm-quant-mode", default="fp8_block")

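A short sketch (hypothetical result dict and argv, not taken from simple_predictor.py) of how a Path-typed --output argument pairs with writing the JSON result:

import argparse
import json
from pathlib import Path

parser = argparse.ArgumentParser()
parser.add_argument(
    "--output",
    required=True,
    type=Path,
    help="Path to write predictor JSON output (filename is user-specified).",
)
args = parser.parse_args(["--output", "out/report.json"])  # example argv

result = {"oom": False}  # hypothetical; the real script fills this from the predictor
args.output.parent.mkdir(parents=True, exist_ok=True)
args.output.write_text(json.dumps(result, indent=2), encoding="utf-8")
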
src/cloudai/workloads/aiconfig/standalone_command_gen_strategy.py

Lines changed: 2 additions & 2 deletions
@@ -38,7 +38,7 @@ def gen_exec_command(self) -> str:
         args: AiconfiguratorCmdArgs = tdef.cmd_args
         out_dir = Path(self.test_run.output_path).resolve()

-        report_txt = Path(out_dir) / "report.txt"
+        report_json = Path(out_dir) / "report.json"
         stdout_txt = Path(out_dir) / "stdout.txt"
         stderr_txt = Path(out_dir) / "stderr.txt"

@@ -117,7 +117,7 @@ def gen_exec_command(self) -> str:
         else:
             cmd = [*base_cmd, "--mode", "agg"]

-        cmd.extend(["--output", str(report_txt)])
+        cmd.extend(["--output", str(report_json)])

         cmd_str = " ".join(shlex.quote(str(x)) for x in cmd)
         full_cmd = f"{cmd_str} 1> {shlex.quote(str(stdout_txt))} 2> {shlex.quote(str(stderr_txt))}"

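For reference, a stripped-down sketch of the quoting and redirection pattern the strategy uses (the command and paths here are placeholders, not the real generated command):

import shlex
from pathlib import Path

out_dir = Path("/tmp/aiconfig-out")  # placeholder output directory
report_json = out_dir / "report.json"
stdout_txt = out_dir / "stdout.txt"
stderr_txt = out_dir / "stderr.txt"

cmd = ["python", "simple_predictor.py", "--mode", "agg", "--output", str(report_json)]
cmd_str = " ".join(shlex.quote(str(x)) for x in cmd)
full_cmd = f"{cmd_str} 1> {shlex.quote(str(stdout_txt))} 2> {shlex.quote(str(stderr_txt))}"
print(full_cmd)
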
tests/conftest.py

Lines changed: 11 additions & 0 deletions
@@ -27,6 +27,7 @@
 from cloudai.systems.kubernetes import KubernetesSystem
 from cloudai.systems.runai import RunAISystem
 from cloudai.systems.slurm import SlurmGroup, SlurmPartition, SlurmSystem
+from cloudai.systems.standalone import StandaloneSystem
 from cloudai.workloads.nccl_test.nccl import NCCLCmdArgs, NCCLTestDefinition


@@ -115,6 +116,16 @@ def runai_system(tmp_path: Path) -> RunAISystem:
     return system


+@pytest.fixture
+def standalone_system(tmp_path: Path) -> StandaloneSystem:
+    return StandaloneSystem(
+        name="standalone",
+        scheduler="standalone",
+        install_path=tmp_path / "install",
+        output_path=tmp_path / "output",
+    )
+
+
 @pytest.fixture
 def base_tr(slurm_system: SlurmSystem) -> TestRun:
     return TestRun(

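With the fixture promoted to tests/conftest.py, any test module can request it by name without defining it locally. A minimal sketch of such a test (hypothetical test name):

from cloudai.systems.standalone import StandaloneSystem


def test_uses_shared_fixture(standalone_system: StandaloneSystem) -> None:
    # pytest injects the standalone_system fixture from tests/conftest.py.
    assert standalone_system.name == "standalone"
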
tests/report_generation_strategy/test_aiconfigurator_report_generation_strategy.py

Lines changed: 5 additions & 15 deletions
@@ -31,16 +31,6 @@
 from cloudai.workloads.aiconfig.aiconfigurator import Agg


-@pytest.fixture
-def standalone_system(tmp_path: Path) -> StandaloneSystem:
-    return StandaloneSystem(
-        name="standalone",
-        scheduler="standalone",
-        install_path=tmp_path / "install",
-        output_path=tmp_path / "output",
-    )
-
-
 def _make_tr(tmp_path: Path) -> TestRun:
     tdef = AiconfiguratorTestDefinition(
         name="aiconfig",
@@ -60,7 +50,7 @@ def _make_tr(tmp_path: Path) -> TestRun:
 def test_can_handle_directory_when_report_exists(tmp_path: Path, standalone_system: StandaloneSystem) -> None:
     tr = _make_tr(tmp_path)
     tr.output_path.mkdir(parents=True, exist_ok=True)
-    (tr.output_path / "report.txt").write_text("{}", encoding="utf-8")
+    (tr.output_path / "report.json").write_text("{}", encoding="utf-8")

     strategy = AiconfiguratorReportGenerationStrategy(standalone_system, tr)
     assert strategy.can_handle_directory() is True
@@ -70,7 +60,7 @@ def test_generate_report_writes_summary(tmp_path: Path, standalone_system: Stand
     tr = _make_tr(tmp_path)
     tr.output_path.mkdir(parents=True, exist_ok=True)
     payload = {"ttft_ms": 10.0, "tpot_ms": 2.0, "tokens_per_s_per_gpu": 3.0, "tokens_per_s_per_user": 4.0, "oom": False}
-    (tr.output_path / "report.txt").write_text(json.dumps(payload), encoding="utf-8")
+    (tr.output_path / "report.json").write_text(json.dumps(payload), encoding="utf-8")

     strategy = AiconfiguratorReportGenerationStrategy(standalone_system, tr)
     strategy.generate_report()
@@ -86,7 +76,7 @@ def test_generate_report_writes_summary(tmp_path: Path, standalone_system: Stand
 def test_get_metric_default_prefers_throughput(tmp_path: Path, standalone_system: StandaloneSystem) -> None:
     tr = _make_tr(tmp_path)
     tr.output_path.mkdir(parents=True, exist_ok=True)
-    (tr.output_path / "report.txt").write_text(json.dumps({"tokens_per_s_per_gpu": 123.0}), encoding="utf-8")
+    (tr.output_path / "report.json").write_text(json.dumps({"tokens_per_s_per_gpu": 123.0}), encoding="utf-8")

     strategy = AiconfiguratorReportGenerationStrategy(standalone_system, tr)
     assert strategy.get_metric("default") == 123.0
@@ -95,7 +85,7 @@ def test_get_metric_default_prefers_throughput(tmp_path: Path, standalone_system
 def test_get_metric_default_falls_back_to_inverse_latency(tmp_path: Path, standalone_system: StandaloneSystem) -> None:
     tr = _make_tr(tmp_path)
     tr.output_path.mkdir(parents=True, exist_ok=True)
-    (tr.output_path / "report.txt").write_text(json.dumps({"tpot_ms": 2.0}), encoding="utf-8")
+    (tr.output_path / "report.json").write_text(json.dumps({"tpot_ms": 2.0}), encoding="utf-8")

     strategy = AiconfiguratorReportGenerationStrategy(standalone_system, tr)
     assert pytest.approx(strategy.get_metric("default"), rel=1e-6) == 0.5
@@ -113,7 +103,7 @@ def test_load_results_falls_back_to_stdout_last_json(tmp_path: Path, standalone_
 def test_get_metric_unknown_returns_error(tmp_path: Path, standalone_system: StandaloneSystem) -> None:
     tr = _make_tr(tmp_path)
     tr.output_path.mkdir(parents=True, exist_ok=True)
-    (tr.output_path / "report.txt").write_text(json.dumps({"ttft_ms": 1.0}), encoding="utf-8")
+    (tr.output_path / "report.json").write_text(json.dumps({"ttft_ms": 1.0}), encoding="utf-8")

     strategy = AiconfiguratorReportGenerationStrategy(standalone_system, tr)
     assert strategy.get_metric("nonexistent") == METRIC_ERROR

tests/standalone_command_gen_strategy/test_aiconfigurator_standalone_command_gen_strategy.py

Lines changed: 34 additions & 12 deletions
@@ -27,17 +27,7 @@
     AiconfiguratorStandaloneCommandGenStrategy,
     AiconfiguratorTestDefinition,
 )
-from cloudai.workloads.aiconfig.aiconfigurator import Disagg
-
-
-@pytest.fixture
-def standalone_system(tmp_path: Path) -> StandaloneSystem:
-    return StandaloneSystem(
-        name="standalone",
-        scheduler="standalone",
-        install_path=tmp_path / "install",
-        output_path=tmp_path / "output",
-    )
+from cloudai.workloads.aiconfig.aiconfigurator import Agg, Disagg


 def test_gen_exec_command_writes_repro_script_and_returns_bash(
@@ -94,6 +84,38 @@ def test_gen_exec_command_writes_repro_script_and_returns_bash(
     assert "--mode" in content and "disagg" in content
     assert "--d-bs" in content and "8" in content

-    assert str((out_dir.resolve() / "report.txt")) in content
+    assert str((out_dir.resolve() / "report.json")) in content
     assert str((out_dir.resolve() / "stdout.txt")) in content
     assert str((out_dir.resolve() / "stderr.txt")) in content
+
+
+def test_gen_exec_command_agg_branch(
+    tmp_path: Path, standalone_system: StandaloneSystem, monkeypatch: pytest.MonkeyPatch
+) -> None:
+    monkeypatch.setattr("sys.executable", "/tmp/python")
+
+    tdef = AiconfiguratorTestDefinition(
+        name="aiconfig",
+        description="desc",
+        test_template_name="Aiconfigurator",
+        cmd_args=AiconfiguratorCmdArgs(
+            model_name="LLAMA3.1_70B",
+            system="h200_sxm",
+            backend="trtllm",
+            version="0.20.0",
+            isl=4000,
+            osl=500,
+            agg=Agg(batch_size=8, ctx_tokens=16, tp=1, pp=1, dp=1, moe_tp=1, moe_ep=1),
+        ),
+    )
+    out_dir = tmp_path / "out-agg"
+    tr = TestRun(name="tr", test=tdef, num_nodes=1, nodes=[], output_path=out_dir)
+
+    strategy = AiconfiguratorStandaloneCommandGenStrategy(standalone_system, tr)
+    cmd = strategy.gen_exec_command()
+    assert cmd.startswith("bash ")
+
+    content = (out_dir.resolve() / "run_simple_predictor.sh").read_text(encoding="utf-8")
+    assert "--mode" in content and "agg" in content
+    assert "--batch-size" in content and "8" in content
+    assert "--ctx-tokens" in content and "16" in content
