Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ jobs:
run: bash scripts/check_boundary.sh

- name: Typecheck
run: uv run mypy src tests examples main.py packages/openrange-trl/src
run: uv run mypy src tests examples main.py packages/openrange-trl/src packages/openrange-rllm/src

- name: Test with coverage
run: uv run coverage run -m pytest tests
Expand Down
26 changes: 0 additions & 26 deletions examples/_briefing.py

This file was deleted.

2 changes: 1 addition & 1 deletion examples/codex_eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
TaskSpec,
)

from examples._briefing import agent_briefing
from openrange.agent import agent_briefing
from openrange.agent_backend import CodexAgentBackend
from openrange.core import PACKS, auto_evolve, consequence_gate
from openrange.core.episode import AgentTurn, EpisodeReport
Expand Down
162 changes: 162 additions & 0 deletions examples/rllm_grpo_cyber.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
"""Train a cyber agent on an OpenRange world pool with rLLM's ``AgentTrainer``.

This is the rLLM half of "one scaffold, two modes": the *same* agent loop that
``examples/codex_eval.py`` evaluates with is trained here, swapping only the
sampler. ``openrange_rllm`` maps each OpenRange episode onto rLLM's
``Episode``/``Step`` and exposes the policy as an ``@rllm.rollout`` flow; rLLM's
gateway captures token ids and logprobs, GRPO does the rest. The reward is the
pack's own dense subgoal ladder (no reward logic here).

A pool of command-injection "company" worlds becomes an rLLM dataset (one row per
pentest task, carrying its ``snapshot_id``/``task_id``); ``snapshot_resolver``
maps each sampled rLLM task back to its world. The agent reaches the live webapp
over HTTP from a host shell (PROCESS backing) and composes ``curl`` itself.

Run on one CUDA GPU through rLLM's verl backend. Validated end to end on an
A100-40GB inside the maintainers' ``verlai/verl:vllm011.latest`` image (torch 2.8
/ vLLM 0.11 / flash-attn)::

python -m examples.rllm_grpo_cyber \
rllm/backend=verl algorithm.adv_estimator=grpo \
+model.name=Qwen/Qwen2.5-7B-Instruct \
actor_rollout_ref.model.path=Qwen/Qwen2.5-7B-Instruct \
actor_rollout_ref.model.lora_rank=32 \
actor_rollout_ref.model.lora_alpha=32 \
actor_rollout_ref.actor.use_dynamic_bsz=True \
actor_rollout_ref.actor.ppo_max_token_len_per_gpu=16384 \
actor_rollout_ref.actor.use_kl_loss=False \
actor_rollout_ref.rollout.name=vllm \
actor_rollout_ref.rollout.mode=async \
actor_rollout_ref.rollout.enforce_eager=True \
actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \
actor_rollout_ref.rollout.n=4 \
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1 \
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1 \
trainer.n_gpus_per_node=1 data.train_batch_size=2 \
rllm.trainer.total_batches=1

Gotchas (both cost real debugging):

- LoRA uses the **flat** keys ``lora_rank`` / ``lora_alpha``. The nested
``lora.rank`` is silently ignored, which means full fine-tuning — a 7B then
OOMs a 40GB card, whereas with LoRA on it fits comfortably.
- OpenRange currently requires Python **3.14** (PEP 758 ``except`` syntax) but the
verl GPU stack ships only Python **3.12** wheels, so they cannot share a
process. Running this needs OpenRange made 3.12-compatible (a 3-line change) or
3.14 wheels for torch/vLLM/flash-attn (not available yet).

Building the pool + registering the dataset is CPU-only; ``trainer.train()`` is
the CUDA boundary (Ray + vLLM + FSDP under the verl backend).
"""

from __future__ import annotations

import subprocess
from collections.abc import Mapping
from typing import Any

from cyber_webapp import WebappPack
from cyber_webapp.difficulty import world_difficulty
from openrange_pack_sdk import Snapshot
from openrange_rllm import (
build_rllm_dataset_rows,
make_evaluator,
make_rollout,
snapshot_resolver,
)

from openrange import EpisodeService, EvalPool, WorldPool
from openrange.core.sandbox import CommandResult

# Name under which both the "train" and "test" splits are registered in, and
# loaded back from, rLLM's DatasetRegistry.
_DATASET = "openrange_cyber"
# Task family handed to the pool seeders and to the dataset-row builder.
_FAMILY = "webapp.pentest"
# Second argument to EpisodeService; presumably the directory episode runs are
# written under -- confirm against EpisodeService.
_RUN_ROOT = "or-runs/cyber-rllm"


def _company(seed: int) -> dict[str, Any]:
return {
"world": {"goal": "recover the hidden flag from the internal estate"},
"pack": {"id": "webapp"},
"runtime": {"tick": {"mode": "off"}},
"npc": [],
"seed": seed,
"topology": "chain",
}


def _difficulty(snapshot: Snapshot) -> float:
    """Score a snapshot for the pools' ``difficulty_fn`` hook.

    Delegates to ``cyber_webapp.difficulty.world_difficulty`` on the snapshot's
    graph and coerces whatever it returns to a plain ``float``.
    """
    score = world_difficulty(snapshot.graph)
    return float(score)


class _HostRun:
def run(self, command: str, *, timeout: float = 120.0) -> CommandResult:
done = subprocess.run(
["bash", "-lc", command],
capture_output=True,
text=True,
timeout=timeout,
check=False,
)
return CommandResult(done.returncode, done.stdout + done.stderr)

def close(self) -> None:
return None


def _host_bind(_surface: Mapping[str, Any]) -> _HostRun:
    """Return a fresh host-shell runner; used as ``bind_run`` for ``make_rollout``.

    The surface mapping is accepted to satisfy the callback signature and is
    otherwise ignored.
    """
    runner = _HostRun()
    return runner


def main() -> None:
    """Seed the world pools, register the rLLM dataset splits, and start training.

    Hydra, OmegaConf and rLLM are imported inside this function rather than at
    module top. The work itself lives in a Hydra-decorated inner function, which
    receives the ``unified`` config from the ``rllm.trainer.config`` package.
    """
    import hydra
    from omegaconf import DictConfig
    from rllm.data.dataset import DatasetRegistry
    from rllm.trainer import AgentTrainer

    @hydra.main(  # type: ignore[untyped-decorator]
        config_path="pkg://rllm.trainer.config",
        config_name="unified",
        version_base=None,
    )
    def _train(config: DictConfig) -> None:
        webapp = WebappPack()

        # Training worlds come from seeds 0-3; the pool is capped at 8 entries.
        train_manifests = [_company(seed) for seed in range(4)]
        training_worlds = WorldPool.seed(
            webapp,
            train_manifests,
            difficulty_fn=_difficulty,
            family=_FAMILY,
            max_size=8,
        )

        # Validation worlds use seeds that are not in the training range.
        held_out_manifests = [_company(seed) for seed in (7, 8)]
        held_out_worlds = EvalPool.seed(
            webapp,
            held_out_manifests,
            difficulty_fn=_difficulty,
            family=_FAMILY,
        )

        # Both splits are registered under the same dataset name.
        train_rows = build_rllm_dataset_rows(
            training_worlds.snapshots(), family=_FAMILY
        )
        DatasetRegistry.register_dataset(_DATASET, train_rows, "train")
        held_out_rows = build_rllm_dataset_rows(
            held_out_worlds.snapshots(), family=_FAMILY
        )
        DatasetRegistry.register_dataset(_DATASET, held_out_rows, "test")

        # The resolver must know every snapshot a sampled task can refer to,
        # so it is built over the training and validation snapshots together.
        every_snapshot = [
            *training_worlds.snapshots(),
            *held_out_worlds.snapshots(),
        ]
        lookup_snapshot = snapshot_resolver(every_snapshot)
        episodes = EpisodeService(webapp, _RUN_ROOT)

        # Trainer inputs are evaluated in the same order as the original call.
        backend_name = config.rllm.get("backend", "verl")
        rollout_flow = make_rollout(episodes, lookup_snapshot, bind_run=_host_bind)
        grader = make_evaluator()
        train_split = DatasetRegistry.load_dataset(_DATASET, "train")
        test_split = DatasetRegistry.load_dataset(_DATASET, "test")

        trainer = AgentTrainer(
            backend=backend_name,
            agent_flow=rollout_flow,
            evaluator=grader,
            config=config,
            train_dataset=train_split,
            val_dataset=test_split,
        )
        trainer.train()

    _train()


# Script entry point (e.g. ``python -m examples.rllm_grpo_cyber``); the guard is
# excluded from coverage measurement.
if __name__ == "__main__":  # pragma: no cover
    main()
2 changes: 1 addition & 1 deletion examples/strands_eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@

from openrange_pack_sdk import LLMResult, OpenRangeError, Snapshot, TaskSpec

from examples._briefing import agent_briefing
from openrange.agent import agent_briefing
from openrange.core.episode import AgentTurn
from openrange.runtime import EpisodeContext, OpenRangeRun, RunConfig

Expand Down
2 changes: 1 addition & 1 deletion packages/graphschema/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ description = "Typed property-graph meta-model with declarative schemas."
readme = "README.md"
license = "MIT"
license-files = ["LICENSE"]
requires-python = ">=3.14"
requires-python = ">=3.12"
authors = [{ name = "Vecna AI" }]
keywords = ["graph", "ontology", "schema", "property-graph", "typed-graph"]
classifiers = [
Expand Down
2 changes: 1 addition & 1 deletion packages/openrange-pack-sdk/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ version = "0.1.0"
description = "Pack-author SDK for OpenRange: Protocols, value types, and base errors that packs depend on. Zero runtime deps on OpenRange."
readme = "README.md"
license = "MIT"
requires-python = ">=3.14"
requires-python = ">=3.12"
dependencies = [
"graphschema",
]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,7 @@ def _read_result(self) -> Mapping[str, Any]:
return {}
try:
data = json.loads(result_path.read_text(encoding="utf-8"))
except OSError, ValueError: # ValueError also covers a non-UTF-8 read
except (OSError, ValueError): # ValueError also covers a non-UTF-8 read
return {}
return dict(data) if isinstance(data, Mapping) else {}

Expand Down
42 changes: 42 additions & 0 deletions packages/openrange-rllm/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# openrange-rllm

Optional [rLLM](https://github.com/rllm-org/rllm) `AgentTrainer` integration for
OpenRange. OpenRange owns the world and the grade; rLLM owns the RL training
loop. This adapter is the thin seam between them:

- **`agent_rollout_to_episode`** — maps one OpenRange agent rollout onto rLLM's
`Episode` / `Trajectory` / `Step`, one step per harness turn in call order.
- **`make_rollout`** — wraps the harness as an `@rllm.rollout` flow `(task,
config) -> Episode`; it runs one real episode on a shared `EpisodeService`.
- **`make_evaluator`** — surfaces the verifier's grade as an `@rllm.evaluator`.
- **`GatewaySampler`** — a `Sampler` that calls the policy at `config.base_url`
through OpenRange's own OpenAI-compatible backend. rLLM's gateway records token
ids and logprobs, so the rollout leaves those fields empty and rLLM's trace
enrichment fills them.

`import openrange_rllm` pulls **no** rLLM — every rLLM import is local to the
function that needs it. To run the real trainer, install rLLM from source (this
package deliberately does not declare it as a dependency or an extra) and
construct the trainer in your own script/notebook:

```python
from rllm.trainer import AgentTrainer
from openrange_rllm import make_rollout, make_evaluator

trainer = AgentTrainer(
config=config, # a Hydra DictConfig (backend: verl|tinker)
agent_flow=make_rollout(service, resolve, bind_run=bind_run),
evaluator=make_evaluator(),
train_dataset=train_dataset,
val_dataset=val_dataset,
backend="verl",
)
trainer.train()
```

A complete, runnable example — building a world pool, registering the dataset,
and the validated single-GPU run command — is in
[`examples/rllm_grpo_cyber.py`](../../examples/rllm_grpo_cyber.py).

rLLM is installed from source (`rllm-org/rllm`); the GPU backend (`rllm[verl]`)
needs CUDA. The adapter itself, and its tests, run on CPU against rLLM's
pydantic-only core types.
29 changes: 29 additions & 0 deletions packages/openrange-rllm/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
[build-system]
requires = ["hatchling>=1.27"]
build-backend = "hatchling.build"

[project]
name = "openrange-rllm"
version = "0.1.0"
description = "Optional rLLM AgentTrainer integration for OpenRange. Maps an OpenRange agent rollout onto rLLM's Episode/Trajectory/Step and wraps the harness as an @rllm.rollout flow; OpenRange itself stays trainer-agnostic."
readme = "README.md"
license = "MIT"
requires-python = ">=3.12"
dependencies = [
"openrange",
"openrange-pack-sdk",
]

# ``import openrange_rllm`` pulls no rLLM: every rLLM import is local to the
# function that needs it, so the module loads on a plain machine. The live trainer
# (``rllm.trainer.AgentTrainer``) needs rLLM plus its ``verl`` GPU backend
# (torch/vllm/verl) installed from source on a CUDA box — see README. rLLM is not
# published on PyPI, so it is deliberately not declared as a dependency here
# (doing so makes the universal workspace lock unsatisfiable).

[tool.hatch.build.targets.wheel]
packages = ["src/openrange_rllm"]

[tool.uv.sources]
openrange = { workspace = true }
openrange-pack-sdk = { workspace = true }
Loading
Loading