gke-labs
diff --git a/‎.dockerignore‎
Lines changed: 1 addition & 2 deletions b/‎.dockerignore‎
Lines changed: 1 addition & 2 deletions
diff --git a/‎Makefile‎
Lines changed: 48 additions & 3 deletions b/‎Makefile‎
Lines changed: 48 additions & 3 deletions
diff --git a/‎examples/sft/gsm8k/vllm_eval.py‎
Lines changed: 13 additions & 2 deletions b/‎examples/sft/gsm8k/vllm_eval.py‎
Lines changed: 13 additions & 2 deletions
diff --git a/‎examples/tiny/tiny_rl.py‎
Lines changed: 142 additions & 0 deletions b/‎examples/tiny/tiny_rl.py‎
Lines changed: 142 additions & 0 deletions
@@ -1,6 +1,5 @@
 .git
-.venv
-examples/.venv
+**/.venv
 **/__pycache__
 **/*.pyc
 .ruff_cache
 
@@ -13,13 +13,32 @@ HOST           ?= 127.0.0.1
 PORT           ?= 9003
 # The fully qualified base URL used by local CLI tools and clients
 BASE_URL       ?= http://$(HOST):$(PORT)
-TEST_PYTHONPATH ?= examples/sft/pig-latin
+# Dotted test module names handed to `python -m unittest` by the default
+# (unit) mode of `make test`.
+UNIT_TESTS ?= tests.test_gateway_paths tests.test_lora_targets tests.test_snapshot_agent tests.test_trainer_optimizer_correctness tests.test_worker_launch_processor
+# Only forward BASE_URL to e2e when the user supplied it. The Makefile default
+# is for local CLI usage; e2e should start its own backend by default.
+# $(origin BASE_URL) is "environment" or "command line" when user-supplied;
+# $(filter) matches word by word, so "command line" is matched as two words.
+TRAINING_TEST_BASE_URL ?= $(if $(filter environment command line,$(origin BASE_URL)),$(BASE_URL),)
+# uv extra for e2e runs: passed to `uv run --extra` and to the e2e script as
+# uv_extra=<value>.
+TRAINING_TEST_EXTRA ?= gpu
+# Extra arguments appended unquoted to the e2e script invocation, so the shell
+# parses any quoting inside the value.
+TRAINING_TEST_ARGS ?=
+# PYTHONPATH used by `make test piglatin`.
+PIGLATIN_TEST_PYTHONPATH ?= examples/sft/pig-latin
+
+# CUDA_VISIBLE_DEVICES can be provided either as an environment variable or as a
+# Make variable, and is inherited by the backend/eval subprocesses.
+ifneq ($(origin CUDA_VISIBLE_DEVICES),undefined)
+  export CUDA_VISIBLE_DEVICES
+endif
 
+# Print example invocations for the server, vllm, test, lint and fmt goals.
+# Values such as $(BASE_MODEL) and $(PORT) are expanded by Make, so the first
+# line reflects the current settings.
 help:
 	@echo "make server                              # $(BASE_MODEL), SAMPLING_BACKEND=$(SAMPLING_BACKEND), port $(PORT)"
 	@echo "make server BASE_MODEL=google/gemma-4-e2b SAMPLING_BACKEND=vllm"
 	@echo "VLLM_ARCHITECTURE_OVERRIDE=Gemma4ForCausalLM make vllm BASE_MODEL=google/gemma-4-e2b"
-	@echo "make test | lint | fmt"
+	@echo "make test                               # fast unit tests"
+	@echo "make test e2e tiny-lora|tiny-fft|tiny-rl|lora-textsql|fft-gsm8k|fft-gsm8k-x2  # tiny-* = fast overfit smoke tests"
+	@echo "make test e2e tiny-lora BASE_URL=http://host:9003"
+	@echo "CUDA_VISIBLE_DEVICES=0 make test e2e tiny-fft"
+	@echo "make test e2e tiny-fft TRAINING_TEST_ARGS='steps=20'"
+	@echo "make test e2e fft-gsm8k TRAINING_TEST_ARGS='steps=10 eval_examples=8 extra=\"batch=2\"'"
+	@echo "make test piglatin                      # pig-latin example end-to-end tests"
+	@echo "make lint | fmt"
 
 # ---------------------------------------------------------------------------
 # Server
@@ -42,14 +61,40 @@ ifeq (cli,$(firstword $(MAKECMDGOALS)))
   $(eval $(CLI_ARGS):;@:)
 endif
 
+# `make test <mode> [scenario]`: when `test` is the first goal, take the second
+# and third command-line goals as the mode and scenario. Every goal after
+# `test` is then declared as a target with a no-op recipe (`@:`) so Make does
+# not fail trying to build it. This mirrors the `cli` goal handling.
+ifeq (test,$(firstword $(MAKECMDGOALS)))
+  TEST_MODE := $(word 2,$(MAKECMDGOALS))
+  TEST_SCENARIO := $(word 3,$(MAKECMDGOALS))
+  TEST_ARGS := $(wordlist 2,$(words $(MAKECMDGOALS)),$(MAKECMDGOALS))
+  # Skip the eval when `make test` is run alone: an empty target list would
+  # produce an invalid rule.
+  ifneq ($(TEST_ARGS),)
+    $(eval $(TEST_ARGS):;@:)
+  endif
+endif
+
 cli:
 	@cd dev/tools && BASE_URL="$(BASE_URL)" uv run python cli.py $(CLI_ARGS)
 
 # ---------------------------------------------------------------------------
 # Dev
 # ---------------------------------------------------------------------------
+# Dispatch on the goal that follows `test` on the command line:
+#   (none) | unit  -> run $(UNIT_TESTS) with `python -m unittest`
+#   e2e <scenario> -> run scripts/run_training_e2e.py; base_url= is forwarded
+#                     only when TRAINING_TEST_BASE_URL is non-empty
+#   piglatin       -> unittest discovery under the examples project
+# Any other mode, or `e2e` without a scenario, prints a message and exits 2.
+# The recipe is one continued shell line so `set --` / "$$@" share one shell.
 test:
-	PYTHONPATH="$(TEST_PYTHONPATH)" uv --project examples run python -m unittest discover -s tests
+	@mode="$(TEST_MODE)"; \
+	scenario="$(TEST_SCENARIO)"; \
+	if [ -z "$$mode" ] || [ "$$mode" = "unit" ]; then \
+	  uv run --frozen --exact --extra cpu python -m unittest $(UNIT_TESTS); \
+	elif [ "$$mode" = "e2e" ]; then \
+	  if [ -z "$$scenario" ]; then \
+	    echo "Missing e2e scenario. Expected tiny-lora, tiny-fft, tiny-rl, lora-textsql, fft-gsm8k, or fft-gsm8k-x2."; \
+	    exit 2; \
+	  fi; \
+	  set -- "scenario=$$scenario" "uv_extra=$(TRAINING_TEST_EXTRA)"; \
+	  if [ -n "$(TRAINING_TEST_BASE_URL)" ]; then set -- "$$@" "base_url=$(TRAINING_TEST_BASE_URL)"; fi; \
+	  uv run --extra "$(TRAINING_TEST_EXTRA)" python scripts/run_training_e2e.py "$$@" $(TRAINING_TEST_ARGS); \
+	elif [ "$$mode" = "piglatin" ]; then \
+	  PYTHONPATH="$(PIGLATIN_TEST_PYTHONPATH)" uv --project examples run python -m unittest discover -s tests; \
+	else \
+	  echo "Unknown test mode '$$mode'. Expected unit, e2e, or piglatin."; \
+	  exit 2; \
+	fi
 
 lint:
 	uvx ruff check .
 
@@ -22,21 +22,32 @@ def main() -> None:
   parser = argparse.ArgumentParser()
   parser.add_argument("--path", required=True)
   parser.add_argument("--data", default="gsm8k_test.json")
+  parser.add_argument("--gpu-memory-utilization", type=float, default=0.85)
+  parser.add_argument("--min-accuracy", type=float, default=0.0, help="exit nonzero if accuracy falls below this fraction")
   args = parser.parse_args()
 
   with open(args.data) as f:
     data = json.load(f)
 
-  llm = LLM(model=args.path, dtype="bfloat16", gpu_memory_utilization=0.85, max_model_len=1024, enforce_eager=True)
+  llm = LLM(
+    model=args.path,
+    dtype="bfloat16",
+    gpu_memory_utilization=args.gpu_memory_utilization,
+    max_model_len=1024,
+    enforce_eager=True,
+  )
   sampling_params = SamplingParams(temperature=0.0, max_tokens=256, stop=["\nQuestion:"])
   start = time.time()
   outputs = llm.generate([datum["prompt"] for datum in data], sampling_params)
   elapsed = time.time() - start
   correct = sum(int(extract(output.outputs[0].text) == datum["gold"]) for datum, output in zip(data, outputs, strict=True))
+  accuracy = correct / len(data)
 
   print("***************************************************************")
-  print(f"[VLLM] {args.path} 0-shot GSM8K acc = {correct / len(data):.1%} on {len(data)} problems in {elapsed:.1f}s")
+  print(f"[VLLM] {args.path} 0-shot GSM8K acc = {accuracy:.1%} on {len(data)} problems in {elapsed:.1f}s")
   print("***************************************************************")
+  if accuracy < args.min_accuracy:
+    raise SystemExit(f"GSM8K accuracy {accuracy:.1%} is below the required {args.min_accuracy:.1%}")
 
 
 if __name__ == "__main__":
 
@@ -0,0 +1,142 @@
+"""Tiny RL smoke test: sample from the current policy, reward completions that
+contain the target answer, and run a few importance-sampling policy-gradient steps.
+
+  uv --project examples run python examples/tiny/tiny_rl.py base_url=http://127.0.0.1:9003
+"""
+
+from __future__ import annotations
+
+import json
+import math
+import os
+import shutil
+import statistics
+from pathlib import Path
+from typing import Any, cast
+
+import chz
+import tinker
+from tinker import types
+
+BASE_URL = "http://127.0.0.1:9003"
+
+os.environ.setdefault("TINKER_API_KEY", "tml-dummy-key")
+os.environ.setdefault("TRANSFORMERS_VERBOSITY", "error")
+
+
+# Settings for the tiny RL smoke test. Every field has a default; the usage
+# line in the module docstring overrides one as key=value on the command line.
+@chz.chz
+class Config:
+  # Passed as base_model when creating the LoRA training client.
+  base_model: str = "Qwen/Qwen2.5-0.5B"
+  # Resolved once, when the class body runs: TINKER_BASE_URL, then BASE_URL,
+  # then the module-level BASE_URL constant.
+  base_url: str = os.getenv("TINKER_BASE_URL", os.getenv("BASE_URL", BASE_URL))
+  # Directory that receives metrics.jsonl; defaults to artifacts/tiny_rl next
+  # to this file.
+  log_dir: str = str(Path(__file__).with_name("artifacts") / "tiny_rl")
+  # The single prompt sampled from at every step.
+  prompt: str = "Question: What is 2 + 2?\nAnswer:"
+  # A completion earns reward 1.0 when its decoded text contains this substring.
+  target: str = "4"
+  # Number of sample -> forward_backward -> optim_step iterations (must be >= 1).
+  steps: int = 2
+  # num_samples requested from the sampler per step.
+  samples_per_prompt: int = 8
+  # Sampling parameters forwarded to types.SamplingParams.
+  max_tokens: int = 16
+  temperature: float = 1.0
+  # Optimizer settings forwarded to types.AdamParams.
+  learning_rate: float = 1e-5
+  grad_clip_norm: float = 1.0
+  # Loss function name passed to trainer.forward_backward.
+  loss_fn: str = "importance_sampling"
+  # LoRA rank and seed passed to create_lora_training_client.
+  rank: int = 16
+  seed: int = 0
+  # What reset_log_dir does with an existing log_dir: "delete" or "error".
+  behavior_if_log_dir_exists: str = "delete"
+
+
+def reset_log_dir(path: Path, behavior: str) -> None:
+  """Leave `path` as a freshly created, empty directory.
+
+  A missing path is simply created (with parents). An existing path is handled
+  according to `behavior`: "delete" removes it recursively before recreating
+  it, "error" raises RuntimeError, and any other value raises ValueError.
+  """
+  if path.exists():
+    if behavior == "error":
+      raise RuntimeError(f"Log directory already exists: {path}")
+    if behavior != "delete":
+      raise ValueError(f"Unsupported behavior_if_log_dir_exists={behavior!r}")
+    shutil.rmtree(path)
+  # Reached for a missing path and after a successful delete.
+  path.mkdir(parents=True)
+
+
+def write_metric(log_dir: Path, row: dict[str, Any]) -> None:
+  """Append `row` to <log_dir>/metrics.jsonl as one key-sorted JSON line."""
+  metrics_path = log_dir / "metrics.jsonl"
+  with open(metrics_path, "a", encoding="utf-8") as handle:
+    encoded = json.dumps(row, sort_keys=True)
+    handle.write(encoded + "\n")
+
+
+def build_datum(prompt_tokens: list[int], completion_tokens: list[int], logprobs: list[float], advantage: float) -> types.Datum:
+  """Build one training datum from a prompt and a sampled completion.
+
+  The model input is prompt + completion without its last token, and the
+  targets are the same sequence shifted left by one. The per-target arrays
+  (weights, logprobs, advantages) are zero over the first len(prompt) - 1
+  positions, so only completion tokens carry weight, sampler logprobs and the
+  advantage.
+
+  Args:
+    prompt_tokens: token ids of the prompt; must be non-empty.
+    completion_tokens: token ids of the sampled completion.
+    logprobs: one sampler logprob per completion token.
+    advantage: scalar advantage applied to every completion token.
+
+  Raises:
+    ValueError: if prompt_tokens is empty or logprobs does not align with
+      completion_tokens.
+  """
+  # With an empty prompt, [0.0] * -1 is [], which would make the per-target
+  # arrays one element longer than target_tokens.
+  if not prompt_tokens:
+    raise ValueError("build_datum needs at least one prompt token")
+  if len(completion_tokens) != len(logprobs):
+    raise ValueError(f"Expected one logprob per completion token, got {len(completion_tokens)} tokens and {len(logprobs)} logprobs")
+  tokens = prompt_tokens + completion_tokens
+  # The first prompt token is never a target, hence len(prompt_tokens) - 1.
+  prompt_pad = [0.0] * (len(prompt_tokens) - 1)
+  return types.Datum(
+    model_input=types.ModelInput.from_ints(tokens=tokens[:-1]),
+    loss_fn_inputs=cast(
+      Any,
+      {
+        "target_tokens": tokens[1:],
+        "weights": prompt_pad + [1.0] * len(completion_tokens),
+        "logprobs": prompt_pad + logprobs,
+        "advantages": prompt_pad + [advantage] * len(completion_tokens),
+      },
+    ),
+  )
+
+
+def main(config: Config) -> None:
+  """Run the tiny RL smoke test against the service at config.base_url.
+
+  Each step saves the current weights into a sampling client, draws
+  config.samples_per_prompt completions of config.prompt, gives reward 1.0 to
+  completions whose decoded text contains config.target, and applies one
+  forward_backward + optim_step update with group-centered advantages. One
+  metrics row per step is appended to metrics.jsonl in config.log_dir, and the
+  final training state is saved and its path printed.
+
+  Raises:
+    ValueError: if config.steps < 1 (reset_log_dir may also raise for an
+      unsupported behavior_if_log_dir_exists).
+    RuntimeError: if the sampler returns no sequences, a sequence has no tokens
+      or misaligned logprobs, the loss is not finite, or the log directory
+      exists and the configured behavior is "error".
+  """
+  if config.steps < 1:
+    raise ValueError("Tiny RL needs steps >= 1")
+  log_dir = Path(config.log_dir)
+  reset_log_dir(log_dir, config.behavior_if_log_dir_exists)
+
+  client = tinker.ServiceClient(api_key=os.getenv("TINKER_API_KEY", "tml-dummy-key"), base_url=config.base_url)
+  trainer = client.create_lora_training_client(
+    base_model=config.base_model,
+    rank=config.rank,
+    seed=config.seed,
+    train_attn=True,
+    train_mlp=True,
+    # Qwen2.5-0.5B ties lm_head to embed_tokens; LoRA on the tied head trips a
+    # PEFT warning and vLLM cannot load lm_head adapter weights at all.
+    train_unembed=False,
+  )
+  tokenizer = trainer.get_tokenizer()
+  prompt_tokens = tokenizer.encode(config.prompt, add_special_tokens=False)
+  prompt = types.ModelInput.from_ints(tokens=prompt_tokens)
+  sampling_params = types.SamplingParams(max_tokens=config.max_tokens, temperature=config.temperature)
+
+  mean_reward = 0.0
+  for step in range(1, config.steps + 1):
+    # Sample from the weights as they stand at the start of this step.
+    sampler = trainer.save_weights_and_get_sampling_client()
+    sequences = sampler.sample(prompt=prompt, num_samples=config.samples_per_prompt, sampling_params=sampling_params).result().sequences
+    # Fail with a clear message instead of an opaque StatisticsError from
+    # fmean([]) below.
+    if not sequences:
+      raise RuntimeError("Sampler returned no sequences")
+
+    # Convert each sequence once; the same lists feed the reward check and the
+    # datum construction.
+    samples = []
+    rewards = []
+    for sequence in sequences:
+      tokens, logprobs = list(sequence.tokens), list(sequence.logprobs or [])
+      if not tokens or len(tokens) != len(logprobs):
+        raise RuntimeError(f"Sampler must return aligned tokens and logprobs, got {len(tokens)} tokens and {len(logprobs)} logprobs")
+      samples.append((tokens, logprobs))
+      rewards.append(1.0 if config.target in tokenizer.decode(tokens) else 0.0)
+
+    # Group-centered advantages; when every reward ties, fall back to a uniform
+    # positive advantage so the update still exercises a nonzero gradient.
+    mean_reward = statistics.fmean(rewards)
+    advantages = [reward - mean_reward for reward in rewards]
+    if all(abs(advantage) < 1e-8 for advantage in advantages):
+      advantages = [1.0] * len(rewards)
+
+    datums = [
+      build_datum(prompt_tokens, tokens, logprobs, advantage)
+      for (tokens, logprobs), advantage in zip(samples, advantages)
+    ]
+    fwdbwd = trainer.forward_backward(datums, config.loss_fn).result()
+    trainer.optim_step(types.AdamParams(learning_rate=config.learning_rate, grad_clip_norm=config.grad_clip_norm)).result()
+
+    # A missing "loss:mean" metric reads as 0.0; only NaN/inf is treated as failure.
+    loss = float(fwdbwd.metrics.get("loss:mean", 0.0))
+    if not math.isfinite(loss):
+      raise RuntimeError(f"Loss must be finite, got {loss!r}")
+    write_metric(log_dir, {"phase": "train", "step": step, "loss": loss, "mean_reward": mean_reward, "num_datums": len(datums)})
+    print(f"[tiny-rl] step={step:02d}/{config.steps} loss={loss:.6f} mean_reward={mean_reward:.2f} datums={len(datums)}")
+
+  final_state_path = trainer.save_state("tiny-rl-final").result().path
+  write_metric(log_dir, {"phase": "final", "step": config.steps, "final_state_path": final_state_path, "mean_reward": mean_reward})
+  print(f"[tiny-rl] mean_reward={mean_reward:.2f}")
+  print(f"final_state_path={final_state_path}")
+
+
+# Script entry point: chz builds the Config passed to main. The module
+# docstring shows key=value overrides such as base_url=...; allow_hyphens
+# presumably also accepts hyphenated option spellings (confirm in chz docs).
+if __name__ == "__main__":
+  chz.nested_entrypoint(main, allow_hyphens=True)