NVIDIA
diff --git a/.agents/README.md b/.agents/README.md
Lines changed: 53 additions & 0 deletions
diff --git a/.agents/clusters.yaml.example b/.agents/clusters.yaml.example
Lines changed: 19 additions & 0 deletions
diff --git a/.claude/scripts/sync-upstream-skills.sh b/.agents/scripts/sync-upstream-skills.sh
.claude/scripts/sync-upstream-skills.sh renamed to .agents/scripts/sync-upstream-skills.sh
Lines changed: 8 additions & 5 deletions
diff --git a/.claude/skills/accessing-mlflow/SKILL.md b/.agents/skills/accessing-mlflow/SKILL.md
.claude/skills/accessing-mlflow/SKILL.md renamed to .agents/skills/accessing-mlflow/SKILL.md
diff --git a/.claude/skills/common/credentials.md b/.agents/skills/common/credentials.md
.claude/skills/common/credentials.md renamed to .agents/skills/common/credentials.md
diff --git a/.claude/skills/common/environment-setup.md b/.agents/skills/common/environment-setup.md
.claude/skills/common/environment-setup.md renamed to .agents/skills/common/environment-setup.md
Lines changed: 2 additions & 2 deletions
diff --git a/.claude/skills/common/remote-execution.md b/.agents/skills/common/remote-execution.md
.claude/skills/common/remote-execution.md renamed to .agents/skills/common/remote-execution.md
Lines changed: 7 additions & 6 deletions
diff --git a/.claude/skills/common/remote_exec.sh b/.agents/skills/common/remote_exec.sh
.claude/skills/common/remote_exec.sh renamed to .agents/skills/common/remote_exec.sh
Lines changed: 9 additions & 4 deletions
diff --git a/.claude/skills/common/slurm-setup.md b/.agents/skills/common/slurm-setup.md
.claude/skills/common/slurm-setup.md renamed to .agents/skills/common/slurm-setup.md
Lines changed: 1 addition & 1 deletion
diff --git a/.claude/skills/common/workspace-management.md b/.agents/skills/common/workspace-management.md
.claude/skills/common/workspace-management.md renamed to .agents/skills/common/workspace-management.md
@@ -0,0 +1,53 @@
+# `.agents/` — agent-agnostic source of truth
+
+This directory is the canonical location for assets shared by AI coding agents
+working in this repository (Claude Code, Codex, Cursor, …).
+
+## Layout
+
+```text
+.agents/
+├── skills/                 # SKILL.md files (canonical)
+│   └── <skill-name>/SKILL.md
+├── scripts/                # shared helper scripts (sync-upstream-skills.sh, …)
+└── clusters.yaml.example   # remote-cluster config template
+```
+
+## Why this exists
+
+Different agents look for skills/config in vendor-specific directories. Rather
+than maintaining N copies that drift out of sync, **`.agents/` is the single
+source of truth** — each agent's guidance or install mechanism points here
+directly.
+
+## How each agent finds these
+
+Each agent points at `.agents/` through whatever mechanism it supports — never
+a copy:
+
+- **Claude Code** only auto-discovers skills under `.claude/skills/`, so
+  `.claude/` holds relative in-repo symlinks back into `.agents/`:
+  `.claude/skills → ../.agents/skills`, `.claude/scripts → ../.agents/scripts`,
+  and `.claude/clusters.yaml.example → ../.agents/clusters.yaml.example`. These
+  follow the same committed-symlink pattern already used elsewhere in this repo
+  (e.g. `CLAUDE.md`, `tools/launcher/modules/Model-Optimizer`).
+- **Future agents** (Codex, Cursor, …) add their own symlink or config pointing
+  at `.agents/`.
+
+## Editing rules
+
+- **Always edit files under `.agents/`**.
+- Vendored-verbatim skills (`launching-evals`, `accessing-mlflow`) are managed
+  by `.agents/scripts/sync-upstream-skills.sh` — do not modify by hand.
+- New skills go in `.agents/skills/<skill-name>/SKILL.md` following the
+  conventions of existing skills (e.g. `.agents/skills/monitor/SKILL.md`).
+
+## Project-level cluster config
+
+The remote-execution skills check these `clusters.yaml` locations in order:
+
+1. `~/.config/modelopt/clusters.yaml` (user-level, recommended)
+2. `<repo-root>/.agents/clusters.yaml` (project-level, canonical)
+3. `<repo-root>/.claude/clusters.yaml` (project-level, back-compat)
+
+See `clusters.yaml.example` for the schema.
@@ -0,0 +1,19 @@
+# ModelOpt Remote Cluster Configuration
+# Copy to ~/.config/modelopt/clusters.yaml (user-level, recommended)
+# or .agents/clusters.yaml (project-level, can be committed).
+# .claude/clusters.yaml is also accepted for back-compat.
+
+clusters:
+  # GPU workstation or SLURM login node
+  my-cluster:
+    login_node: cluster-login.example.com
+    user: myusername
+    ssh_key: ~/.ssh/id_rsa
+    # ssh_proxy: "socat - PROXY:localhost:%h:%p,proxyport=3128"  # optional
+    workspace: /path/to/remote/workdir
+    gpu_type: H100   # used for quantization format recommendation
+    # slurm:
+    #   default_account: my_account
+    #   default_partition: batch_short
+
+default_cluster: my-cluster
@@ -21,15 +21,18 @@
 # NOT managed by this script — update it manually when pulling upstream changes.
 #
 # Usage:
-#   .claude/scripts/sync-upstream-skills.sh            # re-vendor at the pinned SHA
-#   UPSTREAM_SHA=<sha> .claude/scripts/sync-upstream-skills.sh   # bump to a new SHA
+#   .agents/scripts/sync-upstream-skills.sh            # re-vendor at the pinned SHA
+#   UPSTREAM_SHA=<sha> .agents/scripts/sync-upstream-skills.sh   # bump to a new SHA
 #
 # Requires: gh, base64, awk. Run from the repo root.
 #
-# The script overwrites .claude/skills/<skill>/ with upstream contents and
+# The script overwrites .agents/skills/<skill>/ with upstream contents and
 # re-applies our provenance lines into each SKILL.md frontmatter. If you have
 # local changes to a vendored skill, they will be lost — that is expected,
 # since vendored-verbatim skills should not be modified locally.
+#
+# Note: .claude/skills/ (and other agent-specific skill dirs) are symlinks to
+# .agents/skills/ — see .agents/README.md.
 
 set -euo pipefail
 
@@ -40,7 +43,7 @@ SHORT_SHA="${SHA:0:7}"
 
 UPSTREAM_REPO="NVIDIA-NeMo/Evaluator"
 UPSTREAM_BASE="packages/nemo-evaluator-launcher/.claude/skills"
-DEST_BASE=".claude/skills"
+DEST_BASE=".agents/skills"
 
 if [[ ! -d "$DEST_BASE" ]]; then
     echo "error: run from the repo root (expected $DEST_BASE/ to exist)" >&2
@@ -116,7 +119,7 @@ inject_provenance() {
                 print "license: Apache-2.0"
                 print "# Vendored verbatim from NVIDIA NeMo Evaluator (commit " short ")"
                 print "# https://github.com/NVIDIA-NeMo/Evaluator/tree/" sha "/packages/nemo-evaluator-launcher/.claude/skills/" skill
-                print "# To re-sync: .claude/scripts/sync-upstream-skills.sh"
+                print "# To re-sync: .agents/scripts/sync-upstream-skills.sh"
                 if (extra != "") {
                     n = split(extra, lines, "\\|")
                     for (i = 1; i <= n; i++) print "# " lines[i]
 
@@ -24,7 +24,7 @@ If previous runs left patches in `modelopt/` (from 4C unlisted model work), chec
 2. **User doesn't specify** → check for cluster config:
 
 ```bash
-cat ~/.config/modelopt/clusters.yaml 2>/dev/null || cat .claude/clusters.yaml 2>/dev/null
+cat ~/.config/modelopt/clusters.yaml 2>/dev/null || cat .agents/clusters.yaml 2>/dev/null || cat .claude/clusters.yaml 2>/dev/null
 ```
 
 If a cluster config exists with content → **use the remote cluster** (do not fall back to local even if local GPUs are available — the cluster config indicates the user's preferred execution environment). Otherwise → **local execution**.
@@ -34,7 +34,7 @@ If the cluster config contains multiple clusters and the user did not name the t
 For remote, connect:
 
 ```bash
-source .claude/skills/common/remote_exec.sh
+source .agents/skills/common/remote_exec.sh
 remote_load_cluster <cluster_name>
 remote_check_ssh
 remote_detect_env    # sets REMOTE_ENV_TYPE = slurm / docker / bare
 
@@ -9,8 +9,9 @@ Read this when Claude Code runs on a different machine than the target GPU clust
 Config locations (checked in order, first found wins):
 
 1. `~/.config/modelopt/clusters.yaml` — user-level (not committed, recommended)
-2. `.claude/clusters.yaml` — project-level (can be committed for shared defaults)
-3. Interactive input — if neither file exists, ask the user (see SKILL.md Step 0) and write `~/.config/modelopt/clusters.yaml` before proceeding
+2. `.agents/clusters.yaml` — project-level, canonical (can be committed for shared defaults)
+3. `.claude/clusters.yaml` — project-level, back-compat
+4. Interactive input — if no file exists, ask the user (see SKILL.md Step 0) and write `~/.config/modelopt/clusters.yaml` before proceeding
 
 ```yaml
 clusters:
@@ -38,14 +39,14 @@ rsync -av /path/to/local/checkpoint <cluster-login>:<cluster-workspace>/<session
 
 Use the `workspace` path from your cluster config as the destination root, and keep staged checkpoints under the session/model directory. Compute nodes on a given cluster share the same storage as its login node, so once staged, the path works everywhere on that cluster.
 
-See `.claude/clusters.yaml.example` for a fully annotated example with multiple cluster types.
+See `.agents/clusters.yaml.example` for an annotated example configuration.
 
 ---
 
 ## 2. Connect and Establish Persistent Session
 
 ```bash
-source .claude/skills/common/remote_exec.sh
+source .agents/skills/common/remote_exec.sh
 remote_load_cluster <cluster_name>    # or omit name to use default_cluster
 remote_check_ssh                      # validates connectivity + starts persistent session
 ```
@@ -153,5 +154,5 @@ remote_sync_from <remote_output_subdir> /local/output/
 ## Reference Files
 
 - **`skills/common/remote_exec.sh`** — Full utility library (session, run, sync, SLURM, Docker helpers)
-- **`.claude/clusters.yaml`** — Active cluster configuration
-- **`.claude/clusters.yaml.example`** — Annotated example config
+- **`.agents/clusters.yaml`** — Active cluster configuration (canonical; `.claude/clusters.yaml` also accepted for back-compat)
+- **`.agents/clusters.yaml.example`** — Annotated example config
@@ -17,7 +17,7 @@
 # remote_exec.sh — Remote execution utility for ModelOpt agent skills
 #
 # Usage:
-#   source .claude/skills/common/remote_exec.sh
+#   source .agents/skills/common/remote_exec.sh
 #   remote_load_cluster <cluster_name>     # or: remote_load_cluster (uses default)
 #   remote_check_ssh
 #   remote_detect_env                       # detect SLURM vs Docker vs bare metal
@@ -41,12 +41,17 @@
 # ── Helpers ──────────────────────────────────────────────────────────────────
 
 _remote_config_file() {
-    # Find clusters.yaml: user-level > project-level
+    # Find clusters.yaml: user-level > project-level.
+    # Project-level is checked at .agents/clusters.yaml (canonical) and then
+    # .claude/clusters.yaml (back-compat).
     local user_config="${HOME}/.config/modelopt/clusters.yaml"
     local project_config
-    # Walk up from pwd looking for .claude/clusters.yaml
     local dir="$PWD"
     while [[ "$dir" != "/" ]]; do
+        if [[ -f "$dir/.agents/clusters.yaml" ]]; then
+            project_config="$dir/.agents/clusters.yaml"
+            break
+        fi
         if [[ -f "$dir/.claude/clusters.yaml" ]]; then
             project_config="$dir/.claude/clusters.yaml"
             break
@@ -196,7 +201,7 @@ remote_load_cluster() {
     if [[ -z "$config_file" ]]; then
         echo "ERROR: No clusters.yaml found. Provide cluster info interactively or create one." >&2
         echo "  User config:    ~/.config/modelopt/clusters.yaml" >&2
-        echo "  Project config: .claude/clusters.yaml" >&2
+        echo "  Project config: .agents/clusters.yaml (or .claude/clusters.yaml)" >&2
         return 1
     fi
 
 
@@ -215,7 +215,7 @@ which docker 2>/dev/null && echo "RUNTIME=docker"
 
 | Runtime | Typical clusters | SLURM integration |
 | --- | --- | --- |
-| **enroot/pyxis** | NVIDIA internal (DGX Cloud, EOS, Selene, GCP-NRT) | `srun --container-image` |
+| **enroot/pyxis** | HPC clusters with a container runtime (e.g. DGX Cloud and similar SLURM + container setups) | `srun --container-image` |
 | **Docker** | Bare-metal / on-prem with GPU | `docker run` inside job script |
 
 ### Step 2: Check credentials for the image's registry