From 95a5b9fe479ba382da21f59d6b42e3a40e070ae4 Mon Sep 17 00:00:00 2001
From: Gabe Weisz <gabe.weisz@amd.com>
Date: Thu, 18 Jun 2026 08:42:35 -0700
Subject: [PATCH 1/8] Register TraceLens analysis-orchestrator in federated
 skill sources.

Path B: pin AMD-AGI/TraceLens at the commit currently at the tip of local
feat/gw_rename_directories so the import workflow can vendor
analysis-orchestrator from TraceLens/Agent/Analysis/skills.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .github/scripts/sources.yml | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/.github/scripts/sources.yml b/.github/scripts/sources.yml
index 3d88a09..d0cbbde 100644
--- a/.github/scripts/sources.yml
+++ b/.github/scripts/sources.yml
@@ -30,3 +30,10 @@ sources:
     license: MIT
     skills:
       - magpie
+  - name: amd-agi-tracelens
+    repo: AMD-AGI/TraceLens
+    ref: 9b461bb25192ce73cb70de912ce27df515b56b44
+    path: TraceLens/Agent/Analysis/skills
+    license: MIT
+    skills:
+      - analysis-orchestrator

From ffe41aa565b8cd45f5b2de88be57c0f709815b71 Mon Sep 17 00:00:00 2001
From: Gabe Weisz <gabe.weisz@amd.com>
Date: Thu, 18 Jun 2026 09:04:49 -0700
Subject: [PATCH 2/8] Point TraceLens federation source at
 feat/gw_rename_directories.

Track the branch ref instead of a fixed SHA so imports pick up branch
tip until this catalog entry is pinned again.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .github/scripts/sources.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/scripts/sources.yml b/.github/scripts/sources.yml
index d0cbbde..6cf62b6 100644
--- a/.github/scripts/sources.yml
+++ b/.github/scripts/sources.yml
@@ -32,7 +32,7 @@ sources:
       - magpie
   - name: amd-agi-tracelens
     repo: AMD-AGI/TraceLens
-    ref: 9b461bb25192ce73cb70de912ce27df515b56b44
+    ref: feat/gw_rename_directories
     path: TraceLens/Agent/Analysis/skills
     license: MIT
     skills:

From 60bc8471afeaf7063332c397f395ad5632d1fc03 Mon Sep 17 00:00:00 2001
From: Daniel Holanda <holand.daniel@gmail.com>
Date: Thu, 18 Jun 2026 17:17:08 -0700
Subject: [PATCH 3/8] Add analysis-orchestrator skill

---
 .claude-plugin/marketplace.json               |   5 +
 skills/analysis-orchestrator/.federated.json  |   9 +
 skills/analysis-orchestrator/SKILL.md         |  54 ++
 .../agents/convolution-analyzer.md            | 186 ++++++
 .../agents/cpu-idle-analyzer.md               | 175 +++++
 .../agents/elementwise-analyzer.md            | 159 +++++
 .../agents/gemm-analyzer.md                   | 150 +++++
 .../agents/generic-op-analyzer.md             | 166 +++++
 .../agents/kernel-fusion-analyzer.md          | 316 +++++++++
 .../agents/model-identification-agent.md      | 101 +++
 .../agents/moe-analyzer.md                    | 168 +++++
 .../agents/multi-kernel-analyzer.md           | 272 ++++++++
 .../agents/norm-analyzer.md                   | 164 +++++
 .../agents/reduce-analyzer.md                 | 151 +++++
 .../agents/sdpa-analyzer.md                   | 212 ++++++
 .../agents/triton-analyzer.md                 | 157 +++++
 skills/analysis-orchestrator/reference.md     | 626 ++++++++++++++++++
 skills/analysis-orchestrator/skill-card.md    |  13 +
 18 files changed, 3084 insertions(+)
 create mode 100644 skills/analysis-orchestrator/.federated.json
 create mode 100644 skills/analysis-orchestrator/SKILL.md
 create mode 100644 skills/analysis-orchestrator/agents/convolution-analyzer.md
 create mode 100644 skills/analysis-orchestrator/agents/cpu-idle-analyzer.md
 create mode 100644 skills/analysis-orchestrator/agents/elementwise-analyzer.md
 create mode 100644 skills/analysis-orchestrator/agents/gemm-analyzer.md
 create mode 100644 skills/analysis-orchestrator/agents/generic-op-analyzer.md
 create mode 100644 skills/analysis-orchestrator/agents/kernel-fusion-analyzer.md
 create mode 100644 skills/analysis-orchestrator/agents/model-identification-agent.md
 create mode 100644 skills/analysis-orchestrator/agents/moe-analyzer.md
 create mode 100644 skills/analysis-orchestrator/agents/multi-kernel-analyzer.md
 create mode 100644 skills/analysis-orchestrator/agents/norm-analyzer.md
 create mode 100644 skills/analysis-orchestrator/agents/reduce-analyzer.md
 create mode 100644 skills/analysis-orchestrator/agents/sdpa-analyzer.md
 create mode 100644 skills/analysis-orchestrator/agents/triton-analyzer.md
 create mode 100644 skills/analysis-orchestrator/reference.md
 create mode 100644 skills/analysis-orchestrator/skill-card.md

diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json
index f4aa2d7..08198da 100644
--- a/.claude-plugin/marketplace.json
+++ b/.claude-plugin/marketplace.json
@@ -9,6 +9,11 @@
     "version": "0.1.0"
   },
   "plugins": [
+    {
+      "name": "analysis-orchestrator",
+      "source": "./skills/analysis-orchestrator",
+      "description": "Orchestrates modular PyTorch profiler trace analysis with TraceLens: generates perf reports, prepares category data, runs system-level and compute-kernel subagents in parallel, validates outputs, and writes a prioritized stakeholder report (analysis.md)."
+    },
     {
       "name": "apu-memory-tuner",
       "source": "./skills/apu-memory-tuner",
diff --git a/skills/analysis-orchestrator/.federated.json b/skills/analysis-orchestrator/.federated.json
new file mode 100644
index 0000000..72e64e6
--- /dev/null
+++ b/skills/analysis-orchestrator/.federated.json
@@ -0,0 +1,9 @@
+{
+  "source": "amd-agi-tracelens",
+  "repo": "AMD-AGI/TraceLens",
+  "ref": "feat/gw_rename_directories",
+  "commit": "9b461bb25192ce73cb70de912ce27df515b56b44",
+  "path": "TraceLens/Agent/Analysis/skills/analysis-orchestrator",
+  "license": "MIT",
+  "imported_at": "2026-06-19T00:16:47Z"
+}
diff --git a/skills/analysis-orchestrator/SKILL.md b/skills/analysis-orchestrator/SKILL.md
new file mode 100644
index 0000000..3ad6c86
--- /dev/null
+++ b/skills/analysis-orchestrator/SKILL.md
@@ -0,0 +1,54 @@
+---
+name: analysis-orchestrator
+description: >-
+  Orchestrates modular PyTorch profiler trace analysis with TraceLens: generates perf
+  reports, prepares category data, runs system-level and compute-kernel subagents in
+  parallel, validates outputs, and writes a prioritized stakeholder report (analysis.md).
+  Use when the user asks to follow the analysis orchestrator, run the agentic analysis
+  workflow, analyze a trace, compare two traces, or mentions standalone or comparative
+  TraceLens analysis.
+---
+
+<!--
+Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved.
+
+See LICENSE for license information.
+-->
+
+# Analysis orchestrator
+
+Coordinate **system-level** analysis (CPU/idle, kernel fusion, multi-kernel / comm / memcpy) and **compute-kernel** analysis (GEMM, SDPA, elementwise, etc.): one trace load, shared prep, parallel subagents, then aggregation into `analysis.md`.
+
+## Full procedure
+
+Follow **[reference.md](reference.md)** for every step (user prompts, `<prefix>` / `{CMD}` usage, CLI commands, subagent launch text, validation, report `tee` order, plot embedding, and trace diagnostics).
+
+## Workflow index
+
+```
+0. Query User Inputs (Platform, Trace Path(s), Analysis Mode, Environment Setup)
+1. Generate Performance Report (branches on analysis mode: training vs inference, then comparison scope)
+2-5. Prepare Category Data (GPU Util, Top Ops, Tree Data, Multi-Kernel Data, Category Filtering)
+6. System-Level Analysis (PARALLEL) → system_findings/
+7. Compute Kernel Subagents (PARALLEL) → category_findings/
+   7.5. Aggregate → priority_data.json::findings[]
+8. Validate Subagent Outputs
+9. load_findings + Model Identification (subagent) → metadata/model_info.json
+10. Render performance PNG if agent_extension.py is absent
+11. Generate analysis.md (orchestrator writes via <prefix> tee), optional extension, embed PNG
+```
+
+## Rules
+
+- **Subagents:** Use the Task tool **only** where reference.md says “subagent” (Steps **6**, **7**, **9**). The orchestrator runs everything else, including Step 7.5, using the command prefix from `<output_dir>/cache/cmd_prefix.txt` (`{CMD}` substitution).
+- **Language:** Prefer vendor-agnostic terms (GPU kernels, collective communication, vendor GEMM library, DNN primitives, GPU graph). When quoting trace data, real kernel names are fine.
+- **Subagent prompts:** Point each subagent at the checked-in agent file under `TraceLens/Agent/Analysis/skills/analysis-orchestrator/agents/<name>.md` (see reference.md for exact paths and prompt shells).
+
+## Primary outputs
+
+- **Deliverable:** `<output_dir>/analysis.md`
+- **Internals:** `system_findings/`, `category_findings/`, `category_data/`, `metadata/`, `perf_report*.xlsx`, CSV folders — see package README for layout.
+
+## Agent layout
+
+Project subagents ship with this skill: `TraceLens/Agent/Analysis/skills/analysis-orchestrator/agents/*.md`.
diff --git a/skills/analysis-orchestrator/agents/convolution-analyzer.md b/skills/analysis-orchestrator/agents/convolution-analyzer.md
new file mode 100644
index 0000000..19446a3
--- /dev/null
+++ b/skills/analysis-orchestrator/agents/convolution-analyzer.md
@@ -0,0 +1,186 @@
+<!--
+Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved.
+
+See LICENSE for license information.
+-->
+
+---
+name: convolution-analyzer
+description: Analyze Convolution operations for compute efficiency and layout optimization. Use when orchestrator needs Convolution category analysis.
+model: claude-opus-4-7-high
+---
+
+# Convolution Analysis Subagent
+
+Analyze Convolution operations for compute efficiency and memory-layout optimization. Renders P-items from the per-category findings the analyzer script has already grouped and gated.
+
+---
+
+## Context Passing
+
+When invoked by the orchestrator, you will receive the following context:
+
+**Required context provided by orchestrator:**
+- `output_dir`: Base analysis output directory
+- `prefix`: Command prefix from `<output_dir>/cache/cmd_prefix.txt` — contains a template with `{CMD}` placeholder; substitute `{CMD}` with the actual command
+- `cat`: `conv_fwd` or `conv_bwd`
+- `comparison_scope`: `standalone` (default) or `comparative`
+
+**Input files (pre-computed by orchestrator):**
+1. `<output_dir>/category_data/<cat>_ops.csv` - Filtered Convolution operations (includes `call_stack` column for architecture context)
+2. `<output_dir>/metadata/<cat>_metadata.json` - Hardware specs
+
+**Output file you must write:**
+- `<output_dir>/category_findings/<cat>_findings.md`
+
+---
+
+## Error Handling
+
+**If category data files are missing:**
+1. Write a findings file noting: "No Convolution operations found in trace"
+2. Return gracefully
+
+**If analysis script fails:**
+1. Write a findings file with Status: ERROR
+2. **CRITICAL: Do NOT manually analyze the raw CSV data**
+3. **CRITICAL: Do NOT provide any bottleneck findings**
+
+---
+
+## Language Guidelines
+
+Use vendor-agnostic terminology:
+- "GPU kernels" not "CUDA kernels"
+- "DNN library" not vendor-specific names
+- Focus on operation semantics, not vendor implementation details
+
+---
+
+## Analysis Workflow
+
+### Step 1: Run Analysis Script
+
+```bash
+<prefix> python3 \
+  TraceLens/Agent/Analysis/category_analyses/convolution_analysis.py \
+  --output-dir <output_dir> \
+  --category <cat> \
+  --comparison_scope <comparison_scope>
+```
+
+### Step 2: Read metrics
+
+```bash
+cat <output_dir>/category_data/<cat>_metrics.json
+```
+
+`category_specific.transpose_overhead_percent` flags memory-layout mismatch (NCHW vs NHWC); reference it in **Identification** for any memory-bound finding when it exceeds ~10%.
+
+### Step 3: Classify members by name
+
+Each `category_findings[i].members[j].operation` carries a torch op name (e.g. `aten::conv2d`, `aten::conv_transpose2d`). Classify each member semantically when describing the finding:
+
+- **Standard 2D**: `conv2d` operations (most common in CNNs).
+- **1D**: `conv1d` operations (sequence/audio models).
+- **3D**: `conv3d` operations (video/volumetric models).
+- **Depthwise**: depthwise / channel-wise convolutions (low parallelism, expect lower efficiency).
+- **Transpose / Deconv**: transpose convolutions, deconvolutions (also signals potential layout mismatch — cross-reference with `category_specific.transpose_overhead_percent`).
+- **Other**: anything not matching the above.
+
+These are guidelines; if a member doesn't fit neatly, classify it semantically.
+
+### Step 4: Render P-items from `category_findings`
+
+**efficiency_percent semantics:**
+- **Standalone:** Treat `efficiency_percent` as **% of roofline**.
+- **Comparative:** Treat `efficiency_percent` as **100 × (trace2 kernel time) / (trace1 kernel time)**.
+
+Per [`utils/templates/sub_agent_spec.md`](../utils/templates/sub_agent_spec.md), emit one P-item per entry in ascending `rank` order; ground **Insight** / **Action** / **Reasoning for Slowdown** in the `members[]` rows (their `operation`, `efficiency_pct`, `time_ms`, `library`) using the Action Prose Guidance, Expected Efficiency, and Common Patterns below. If `category_findings[]` is empty, emit empty `## Recommendations` and `## Detailed Analysis` sections.
+
+**Markers required:** wrap every `**Impact**` line in `<!-- impact-begin kind=p_item ... --> ... <!-- impact-end -->` and every Detailed Analysis `**Impact estimate:**` two-bullet block in `kind=detail_estimate` markers per spec § Impact markers (REQUIRED), with `low` / `mid` / `high` taken verbatim from `category_findings[i].impact_score{,_low,_high}`.
+
+**Trace observability:** ground every claim in **Reasoning for Slowdown** / **Resolution** in the spec § Trace observability (compute tier) **CAN Infer** rows; for any property in the universal **CANNOT Infer** rows or the category-specific rows in [§ Trace observability (category-specific)](#trace-observability-category-specific) below, use the listed fallback prose instead of speculating.
+
+---
+
+## Action Prose Guidance
+
+Vendor/library/framework-agnostic. Pick the row matching `category_findings[i].bound_type`:
+
+| `bound_type` | Action template |
+|---|---|
+| `compute` | Profile the dominant member kernels for tile-size and wave-occupancy tuning. Depthwise members will naturally show lower efficiency due to limited parallelism — call that out in **Identification** before recommending tuning. |
+| `memory` | If `transpose_overhead_percent` > 10%, recommend converting to channels-last layout (`model.to(memory_format=torch.channels_last)`) to eliminate transpose overhead. Otherwise optimize memory access patterns of the dominant member kernels. |
+
+---
+
+## Expected efficiency per operation type
+
+| Convolution type | Expected efficiency | Bound type |
+|------------------|---------------------|------------|
+| Large kernels (5×5+) | >70% of peak TFLOPS | compute-bound |
+| Standard 3×3 | >70% of peak TFLOPS | compute-bound |
+| 1×1 (pointwise) | >60% of peak HBM BW | memory-bound |
+| Depthwise | >50% (low parallelism) | varies |
+
+**Transpose overhead bands:**
+- `>20%`: high — strongly recommend channels-last.
+- `10–20%`: moderate — consider channels-last.
+- `<10%`: acceptable.
+
+---
+
+## Common Patterns
+
+### Transpose overhead (layout mismatch)
+- **Symptoms:** Many `batched_transpose` kernels; 30–45% of convolution time.
+- **Cause:** PyTorch defaults to NCHW; vendor DNN libraries prefer NHWC.
+- **Algorithmic (primary):** `model.to(memory_format=torch.channels_last)`.
+
+### Large-kernel convolutions
+- **Symptoms:** Kernel size > 3×3, compute-bound.
+- **Algorithmic:** Limited — these are typically well-optimized.
+- **Kernel:** Profile if efficiency below expected band.
+
+### Small-kernel convolutions (1×1, 3×3)
+- **Symptoms:** Common in modern architectures.
+- **Algorithmic:** Fusion opportunities → defer to kernel fusion analysis.
+- **Kernel:** Optimize memory access patterns.
+
+### Depthwise convolutions
+- **Symptoms:** Low efficiency due to limited parallelism.
+- **Algorithmic:** Limited optimization potential.
+- **Kernel:** Specialized depthwise kernels.
+
+---
+
+## Trace observability (category-specific)
+
+The universal CANNOT Infer rows in [`sub_agent_spec.md`](../utils/templates/sub_agent_spec.md) always apply. In addition, Convolution analysis cannot observe:
+
+| NOT observable | Why | Fallback prose |
+|----------------|-----|----------------|
+| Per-op layout (NCHW vs. NHWC) | Only the aggregate `category_specific.transpose_overhead_percent` is exposed, not per-op layout | "Per-op layout not visible — refer to aggregate `transpose_overhead_percent`." |
+
+---
+
+## Validate findings
+
+Per [`sub_agent_spec.md`](../utils/templates/sub_agent_spec.md) § Validate findings, run:
+
+```bash
+<prefix> python3 -c "
+import sys
+from TraceLens.Agent.Analysis.utils.validation_utils import validate_findings_file
+passed, errors = validate_findings_file(sys.argv[1], sys.argv[2], sys.argv[3])
+if not passed:
+    print('FAIL:')
+    for e in errors:
+        print('  - ' + e)
+    sys.exit(1)
+print('PASS: Findings file is valid')
+" '<output_dir>/category_findings/<cat>_findings.md' 'compute' '<comparison_scope>'
+```
+
+If validation fails, fix the findings file and re-run. Max 2 retries.
diff --git a/skills/analysis-orchestrator/agents/cpu-idle-analyzer.md b/skills/analysis-orchestrator/agents/cpu-idle-analyzer.md
new file mode 100644
index 0000000..0ff5055
--- /dev/null
+++ b/skills/analysis-orchestrator/agents/cpu-idle-analyzer.md
@@ -0,0 +1,175 @@
+<!--
+Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved.
+
+See LICENSE for license information.
+-->
+
+---
+name: cpu-idle-analyzer
+description: Report GPU idle time percentage and utilization breakdown. Invoked when idle_time_percent exceeds 15%.
+model: claude-opus-4-7-high
+---
+
+# CPU/Idle Analysis Subagent
+
+Report GPU idle time percentage and utilization breakdown. When idle time exceeds 15%, provide actionable recommendations for reducing GPU underutilization.
+
+---
+
+## Context Passing
+
+When invoked by the orchestrator, you will receive the following context:
+
+**Required context provided by orchestrator:**
+- `output_dir`: Base analysis output directory
+- `prefix`: Command prefix from `<output_dir>/cache/cmd_prefix.txt` — contains a template with `{CMD}` placeholder; substitute `{CMD}` with the actual command
+- `comparison_scope`: `standalone` (default) or `comparative`
+
+**Input files (pre-computed by orchestrator):**
+1. `<output_dir>/category_data/cpu_idle_ops.csv` - Timeline data for idle analysis
+2. `<output_dir>/metadata/cpu_idle_metadata.json` - GPU utilization breakdown
+3. `<output_dir>/category_data/category_manifest.json` - Contains gpu_utilization metrics
+
+**Output file you must write:**
+- `<output_dir>/system_findings/cpu_idle_findings.md`
+
+---
+
+## Error Handling
+
+**If category data files are missing:**
+1. Read gpu_utilization directly from category_data/category_manifest.json
+2. Provide analysis based on available data
+3. Note limitations in findings
+
+**If analysis script fails:**
+1. Write a findings file with Status: ERROR
+2. **CRITICAL: Do NOT skip idle time recommendations**
+3. Provide basic recommendations based on idle percentage alone
+
+---
+
+## Language Guidelines
+
+Use vendor-agnostic terminology:
+- "GPU graph" not "CUDA graph" or "HIP graph"
+- "kernel launch overhead" not vendor-specific terms
+- "device synchronization" not "cudaDeviceSynchronize"
+- Focus on patterns and solutions, not vendor implementation details
+
+## Cross-Analyzer Boundary (Required)
+
+- CPU/Idle owns recommendations rooted in idle bubbles, launch overhead, host-side synchronization, and pipeline stalls.
+- Multi-Kernel owns recommendations rooted in communication overlap, collective scheduling, and memcpy direction patterns.
+- If a candidate recommendation's primary action is communication overlap (for example, overlap collectives with compute or reduce collective payload/frequency), do not emit a separate CPU/Idle P-item. Keep CPU/Idle focused on idle/launch mechanisms and let Multi-Kernel carry the communication recommendation.
+- If communication evidence helps explain an idle issue, reference it briefly inside the CPU/Idle reasoning without creating a second card with the same action mechanism.
+
+---
+
+## Analysis Workflow
+
+### Step 1: Run Analysis Script
+
+Execute the analysis script using the command prefix:
+
+```bash
+<prefix> python3 \
+  TraceLens/Agent/Analysis/category_analyses/cpu_idle_analysis.py \
+  --output-dir <output_dir>
+```
+
+The script outputs `cpu_idle_metrics.json` to `category_data/`.
+
+### Step 2: Read Metrics
+
+After the script completes, read the JSON metrics file:
+
+```bash
+cat <output_dir>/category_data/cpu_idle_metrics.json
+```
+
+Key metrics to analyze:
+- `idle_flagged`: Boolean -- whether idle time exceeds 15%
+- `gpu_utilization.idle_time_percent`: Percentage of total time GPU is idle
+- `gpu_utilization.idle_time_ms`: Absolute idle time in milliseconds
+
+### Step 3: Write Findings
+
+Write `<output_dir>/system_findings/cpu_idle_findings.md` using the command prefix:
+
+```markdown
+# CPU/Idle Time Analysis Findings
+
+> **Note:** This analysis is exploratory. The patterns and recommendations below are under active development and may be refined as system-level analysis matures.
+
+**Status**: SUCCESS
+**Idle Time**: X% (Y ms out of Z ms total)
+
+## Utilization Breakdown
+
+| Metric | Value |
+|--------|-------|
+| Computation | X% |
+| Idle | Y% |
+| Communication | Z% |
+| MemCpy | W% |
+
+## Recommendations
+
+[If idle > 15%, provide actionable recommendations based on utilization data and
+cross-category system evidence.
+Use the Common Recommendations table below as guidance. If idle <= 15%, state that
+idle time is within acceptable range and no action is needed.]
+
+Avoid duplicate cards: if two candidate recommendations prescribe the same mechanism/action,
+emit one merged recommendation card with combined evidence.
+
+### [Recommendation Title]
+**Insight**: [1 sentence description]
+**Action**: [Specific steps to take]
+```
+
+**Detailed Analysis block:** Follow [`utils/templates/sub_agent_spec.md`](../utils/templates/sub_agent_spec.md) for the full block schema.
+
+**Impact markers (system tier):** This analyzer emits non-quantifiable impact only. Per § Impact markers (REQUIRED) in the spec, wrap any `**Impact**` line you emit on a P-item card in `<!-- impact-begin kind=p_item low=null mid=null high=null -->` ... `<!-- impact-end -->`. Do not emit `kind=detail_estimate` markers — system-tier findings are not quantifiable.
+
+### Step 3.1: Validate Findings
+
+Per [`sub_agent_spec.md`](../utils/templates/sub_agent_spec.md) § Validate findings, run:
+
+```bash
+<prefix> python3 -c "
+import sys
+from TraceLens.Agent.Analysis.utils.validation_utils import validate_findings_file
+passed, errors = validate_findings_file(sys.argv[1], sys.argv[2], sys.argv[3])
+if not passed:
+    print('FAIL:')
+    for e in errors:
+        print('  - ' + e)
+    sys.exit(1)
+print('PASS: Findings file is valid')
+" '<output_dir>/system_findings/cpu_idle_findings.md' 'system' '<comparison_scope>'
+```
+
+If validation fails, fix the findings file and re-run. Max 2 retries.
+
+---
+
+## Key Principles
+
+1. **Report factual data** - Idle percentage and utilization breakdown from the metrics JSON
+2. **Provide actionable solutions** - Specific steps, not vague suggestions
+3. **Vendor-agnostic recommendations** - Focus on patterns and solutions
+4. **Consider trade-offs** - Some solutions have costs (memory, complexity)
+
+---
+
+## Common Recommendations Summary
+
+| Pattern | Primary Solution | Secondary Solution |
+|---------|-----------------|-------------------|
+| High kernel count | GPU graph mode | Kernel fusion |
+| Sync bottlenecks | Async operations | Reduce sync frequency |
+| Pipeline bubbles | Overlap CPU/GPU | Prefetching |
+| Framework overhead | torch.compile | JIT compilation |
+| Sequential execution | Multi-stream | Concurrent kernels |
\ No newline at end of file
diff --git a/skills/analysis-orchestrator/agents/elementwise-analyzer.md b/skills/analysis-orchestrator/agents/elementwise-analyzer.md
new file mode 100644
index 0000000..7babc39
--- /dev/null
+++ b/skills/analysis-orchestrator/agents/elementwise-analyzer.md
@@ -0,0 +1,159 @@
+<!--
+Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved.
+
+See LICENSE for license information.
+-->
+
+---
+name: elementwise-analyzer
+description: Analyze elementwise operations for performance bottlenecks and optimization opportunities. Use when orchestrator needs elementwise category analysis.
+model: claude-opus-4-7-high
+---
+
+# Elementwise Analysis Subagent
+
+Analyze elementwise operations for memory-bandwidth efficiency. Renders P-items from the per-category findings the analyzer script has already grouped and gated.
+
+---
+
+## Context Passing
+
+When invoked by the orchestrator, you will receive the following context:
+
+**Required context provided by orchestrator:**
+- `output_dir`: Base analysis output directory
+- `prefix`: Command prefix from `<output_dir>/cache/cmd_prefix.txt` — contains a template with `{CMD}` placeholder; substitute `{CMD}` with the actual command
+- `comparison_scope`: `standalone` (default) or `comparative`
+
+**Input files (pre-computed by orchestrator):**
+1. `<output_dir>/category_data/elementwise_ops.csv` - Filtered elementwise operations (includes `call_stack` column for architecture context)
+2. `<output_dir>/metadata/elementwise_metadata.json` - Hardware specs
+
+**Output file you must write:**
+- `<output_dir>/category_findings/elementwise_findings.md`
+
+---
+
+## Error Handling
+
+**If category data files are missing:**
+1. Write a findings file noting: "No elementwise operations found in trace"
+2. Return gracefully
+
+**If analysis script fails:**
+1. Write a findings file with Status: ERROR
+2. **CRITICAL: Do NOT manually analyze the raw CSV data**
+3. **CRITICAL: Do NOT provide any bottleneck findings**
+
+---
+
+## Language Guidelines
+
+Use vendor-agnostic terminology:
+- "GPU kernels" not "CUDA kernels"
+- "memory bandwidth" not vendor-specific terms
+- Focus on operation semantics, not vendor implementation details
+
+---
+
+## Analysis Workflow
+
+### Step 1: Run Analysis Script
+
+```bash
+<prefix> python3 \
+  TraceLens/Agent/Analysis/category_analyses/elementwise_analysis.py \
+  --output-dir <output_dir> \
+  --comparison_scope <comparison_scope>
+```
+
+### Step 2: Read metrics
+
+```bash
+cat <output_dir>/category_data/elementwise_metrics.json
+```
+
+`category_specific.peak_hbm_bw_tbs` is the HBM BW reference for elementwise efficiency expectations.
+
+### Step 3: Classify members by name
+
+Each `category_findings[i].members[j].operation` carries a torch op name (e.g. `aten::add_`, `aten::sigmoid`, `aten::gelu`). Classify each member semantically when describing the finding:
+
+- **Baseline ops** (simple memory-bound; expect >70% HBM BW): `add`, `mul`, `copy`, `fill`.
+- **Arithmetic**: `sub`, `div`, `remainder`, `fmod`, `neg`, `abs`, `clamp`.
+- **Activation**: `sigmoid`, `relu`, `gelu`, `silu`, `swish`, `tanh`, `mish`, `hardswish`, `leaky_relu`.
+- **Cast / Convert**: `to`, `_to_copy`, `type_as`, `float`, `half`, `bfloat16`.
+- **Math**: `exp`, `log`, `pow`, `sqrt`, `rsqrt`, `reciprocal`, `erf`.
+- **Comparison / Mask**: `where`, `masked_fill`, `eq`, `ne`, `gt`, `lt`, `ge`, `le`.
+- **Other**: anything not matching the above.
+
+Baseline ops anchor the bandwidth comparison — if a baseline op underperforms while a complex op meets expectations, it points at a kernel issue, not an algorithmic one.
+
+### Step 4: Render P-items from `category_findings`
+
+**efficiency_percent semantics:**
+- **Standalone:** Treat `efficiency_percent` as **% of roofline**.
+- **Comparative:** Treat `efficiency_percent` as **100 × (trace2 kernel time) / (trace1 kernel time)**.
+
+Per [`utils/templates/sub_agent_spec.md`](../utils/templates/sub_agent_spec.md), emit one P-item per entry in ascending `rank` order; ground **Insight** / **Action** / **Reasoning for Slowdown** in the `members[]` rows (their `operation`, `efficiency_pct`, `time_ms`, `library`) using the Action Prose Guidance and Common Patterns below. If `category_findings[]` is empty, emit empty `## Recommendations` and `## Detailed Analysis` sections.
+
+**Markers required:** wrap every `**Impact**` line in `<!-- impact-begin kind=p_item ... --> ... <!-- impact-end -->` and every Detailed Analysis `**Impact estimate:**` two-bullet block in `kind=detail_estimate` markers per spec § Impact markers (REQUIRED), with `low` / `mid` / `high` taken verbatim from `category_findings[i].impact_score{,_low,_high}`.
+
+**Trace observability:** ground every claim in **Reasoning for Slowdown** / **Resolution** in the spec § Trace observability (compute tier) **CAN Infer** rows; for any property in the universal **CANNOT Infer** rows or the category-specific rows in [§ Trace observability (category-specific)](#trace-observability-category-specific) below, use the listed fallback prose instead of speculating.
+
+---
+
+## Action Prose Guidance
+
+Vendor/library/framework-agnostic. Pick the row matching `category_findings[i].bound_type`:
+
+| `bound_type` | Action template |
+|---|---|
+| `memory` | Optimize memory access patterns of the dominant member kernels. For chains of memory-bound elementwise ops in the same parent module (activation + bias-add + dropout, etc.), defer to the kernel fusion analysis — fusion eliminates the intermediate write-back. For very high invocation counts of identically-shaped ops, batch upstream so each launch amortizes the load. |
+| `compute` | Rare for elementwise; if it occurs, profile the kernel for tile-size tuning and confirm the operation isn't actually a small reduction or transcendental being misclassified. |
+
+---
+
+## Common Patterns
+
+### Low baseline efficiency
+- **Symptoms:** Simple ops (`add_`, `mul`, `copy_`) at <50% of peak HBM BW.
+- **Reasoning:** Baseline elementwise ops should approach peak HBM BW; falling well below it indicates kernel-level memory-access or launch-overhead issues.
+- **Kernel:** Investigate memory access patterns and per-launch overhead.
+
+### High invocation count
+- **Symptoms:** >1000 invocations of similar elementwise ops.
+- **Reasoning:** Per-launch overhead dominates; batching or fusion likely available.
+- **Algorithmic:** Restructure to batch operations; chains in the same parent module → defer to kernel fusion analysis.
+
+---
+
+## Trace observability (category-specific)
+
+The universal CANNOT Infer rows in [`sub_agent_spec.md`](../utils/templates/sub_agent_spec.md) always apply. In addition, elementwise analysis cannot observe:
+
+| NOT observable | Why | Fallback prose |
+|----------------|-----|----------------|
+| Per-launch overhead vs. memory-stall split | Both contribute to kernel time; the trace shows only the sum | "Cannot separate per-launch overhead from memory stalls — high invocation count is a strong signal of launch-overhead dominance." |
+
+---
+
+## Validate findings
+
+Per [`sub_agent_spec.md`](../utils/templates/sub_agent_spec.md) § Validate findings, run:
+
+```bash
+<prefix> python3 -c "
+import sys
+from TraceLens.Agent.Analysis.utils.validation_utils import validate_findings_file
+passed, errors = validate_findings_file(sys.argv[1], sys.argv[2], sys.argv[3])
+if not passed:
+    print('FAIL:')
+    for e in errors:
+        print('  - ' + e)
+    sys.exit(1)
+print('PASS: Findings file is valid')
+" '<output_dir>/category_findings/elementwise_findings.md' 'compute' '<comparison_scope>'
+```
+
+If validation fails, fix the findings file and re-run. Max 2 retries.
diff --git a/skills/analysis-orchestrator/agents/gemm-analyzer.md b/skills/analysis-orchestrator/agents/gemm-analyzer.md
new file mode 100644
index 0000000..e5afdf7
--- /dev/null
+++ b/skills/analysis-orchestrator/agents/gemm-analyzer.md
@@ -0,0 +1,150 @@
+<!--
+Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved.
+
+See LICENSE for license information.
+-->
+
+---
+name: gemm-analyzer
+description: Analyze GEMM (matrix multiplication) operations for performance bottlenecks. Use when orchestrator needs GEMM category analysis.
+model: claude-opus-4-7-high
+---
+
+# GEMM Analysis Subagent
+
+Analyze GEMM operations (`mm`, `bmm`, `addmm`) for performance bottlenecks. Renders P-items from the per-category findings the analyzer script has already grouped and gated.
+
+---
+
+## Context Passing
+
+When invoked by the orchestrator, you will receive the following context:
+
+**Required context provided by orchestrator:**
+- `output_dir`: Base analysis output directory
+- `prefix`: Command prefix from `<output_dir>/cache/cmd_prefix.txt` — contains a template with `{CMD}` placeholder; substitute `{CMD}` with the actual command
+- `comparison_scope`: `standalone` (default) or `comparative`
+
+**Input files (pre-computed by orchestrator):**
+1. `<output_dir>/category_data/gemm_ops.csv` - Filtered GEMM operations
+2. `<output_dir>/metadata/gemm_metadata.json` - Hardware specs, platform info, GPU utilization
+
+**Output file you must write:**
+- `<output_dir>/category_findings/gemm_findings.md`
+
+---
+
+## Error Handling
+
+**If category data files are missing:**
+1. Write a findings file noting: "No GEMM operations found in trace"
+2. Return gracefully
+
+**If analysis script fails:**
+1. Write a findings file with Status: ERROR
+2. **CRITICAL: Do NOT manually analyze the raw CSV data**
+3. **CRITICAL: Do NOT provide any bottleneck findings**
+
+---
+
+## Language Guidelines
+
+Use vendor-agnostic terminology:
+- "GPU kernels" not "CUDA kernels"
+- "vendor GEMM library" not specific product names
+- "DNN primitives" not vendor-specific names
+- Focus on operation semantics, not vendor implementation details
+
+---
+
+## Analysis Workflow
+
+### Step 1: Run Analysis Script
+
+```bash
+<prefix> python3 \
+  TraceLens/Agent/Analysis/category_analyses/gemm_analysis.py \
+  --output-dir <output_dir> \
+  --comparison_scope <comparison_scope>
+```
+
+### Step 2: Read Metrics
+
+```bash
+cat <output_dir>/category_data/gemm_metrics.json
+```
+
+### Step 3: Render P-items from `category_findings`
+
+Read `category_data/gemm_metrics.json::category_findings`. Per [`utils/templates/sub_agent_spec.md`](../utils/templates/sub_agent_spec.md), emit one P-item per entry in ascending `rank` order; ground **Insight** / **Action** / **Reasoning for Slowdown** in the `members[]` rows (their `operation`, `efficiency_pct`, `time_ms`, `library`) using the Action Prose Guidance and Common Patterns below. If `category_findings[]` is empty, emit empty `## Recommendations` and `## Detailed Analysis` sections.
+
+**Markers required:** wrap every `**Impact**` line in `<!-- impact-begin kind=p_item ... --> ... <!-- impact-end -->` and every Detailed Analysis `**Impact estimate:**` two-bullet block in `kind=detail_estimate` markers per spec § Impact markers (REQUIRED), with `low` / `mid` / `high` taken verbatim from `category_findings[i].impact_score{,_low,_high}`.
+
+**efficiency_percent semantics:**
+- **Standalone:** Treat `efficiency_percent` as **% of roofline**.
+- **Comparative:** Treat `efficiency_percent` as **100 × (trace2 kernel time) / (trace1 kernel time)**.
+
+**Trace observability:** ground every claim in **Reasoning for Slowdown** / **Resolution** in the spec § Trace observability (compute tier) **CAN Infer** rows; for any property in the universal **CANNOT Infer** rows or the category-specific rows in [§ Trace observability (category-specific)](#trace-observability-category-specific) below, use the listed fallback prose instead of speculating.
+
+---
+
+## Action Prose Guidance
+
+Vendor/library/framework-agnostic. Pick the row matching `category_findings[i].bound_type`:
+
+| `bound_type` | Action template |
+|---|---|
+| `compute` | Profile the dominant member kernels for tile-size and wave-occupancy tuning. If the operation runs at a wider precision than the model tolerates (e.g. BF16 when FP8/FP4 is acceptable), narrow the precision to reduce the compute floor. For tiny batched GEMMs (huge `count`, small M/N/K), batch upstream so each launch amortizes the load. |
+| `memory` | Optimize memory access patterns of the dominant member kernels. For chains of memory-bound GEMMs in the same parent module (epilogue elementwise, bias-add), defer to the kernel fusion analysis. |
+
+---
+
+## Common Patterns
+
+### Compute-bound GEMMs
+- **Symptoms:** High FLOPS/Byte (>200), low TFLOPS/s vs. peak MAF.
+- **Algorithmic:** Smaller batch sizes / better batching may help.
+- **Kernel:** Tile-size tuning, better wave occupancy.
+
+### Memory-bound GEMMs
+- **Symptoms:** Low FLOPS/Byte (<100), low TB/s vs. peak HBM BW.
+- **Algorithmic:** GEMM-epilogue fusion opportunities → defer to kernel fusion analysis.
+- **Kernel:** If not reaching expected BW, kernel optimization opportunity.
+
+### Tiny batched GEMMs
+- **Symptoms:** Huge `count`, tiny M/N/K (e.g. 1000+ GEMMs with M=8, N=16).
+- **Issue:** GPU can't efficiently parallelize; per-launch overhead dominates.
+- **Algorithmic:** Batch GEMMs together (`torch.bmm`, grouped operations).
+
+---
+
+## Trace observability (category-specific)
+
+The universal CANNOT Infer rows in [`sub_agent_spec.md`](../utils/templates/sub_agent_spec.md) always apply. In addition, GEMM analysis cannot observe:
+
+| NOT observable | Why | Fallback prose |
+|----------------|-----|----------------|
+| Split-K / stream-K decomposition | Only the final kernel name + duration are in the trace; the GEMM library's partitioning choice is not exposed | "Decomposition strategy not visible — profile the kernel for tiling layout." |
+| Autotuned tile / block size | Selected tile is internal to the GEMM library | "Tile size not visible — profile the kernel for tile-size tuning." |
+
+---
+
+## Validate findings
+
+Per [`sub_agent_spec.md`](../utils/templates/sub_agent_spec.md) § Validate findings, run:
+
+```bash
+<prefix> python3 -c "
+import sys
+from TraceLens.Agent.Analysis.utils.validation_utils import validate_findings_file
+passed, errors = validate_findings_file(sys.argv[1], sys.argv[2], sys.argv[3])
+if not passed:
+    print('FAIL:')
+    for e in errors:
+        print('  - ' + e)
+    sys.exit(1)
+print('PASS: Findings file is valid')
+" '<output_dir>/category_findings/gemm_findings.md' 'compute' '<comparison_scope>'
+```
+
+If validation fails, fix the findings file and re-run. Max 2 retries.
diff --git a/skills/analysis-orchestrator/agents/generic-op-analyzer.md b/skills/analysis-orchestrator/agents/generic-op-analyzer.md
new file mode 100644
index 0000000..77d9508
--- /dev/null
+++ b/skills/analysis-orchestrator/agents/generic-op-analyzer.md
@@ -0,0 +1,166 @@
+<!--
+Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved.
+
+See LICENSE for license information.
+-->
+
+---
+name: generic-op-analyzer
+description: Analyze uncategorized GPU operations. Use when orchestrator needs other category analysis.
+model: claude-opus-4-7-high
+---
+
+# Uncategorized Operations Analysis Subagent
+
+Analyze GPU operations that do not fit standard categories (GEMM, SDPA, Elementwise, Reduce, Norm, Convolution, MoE, Triton). Renders P-items from the per-category findings the analyzer script has already grouped and gated; surfaces what each member operation actually does using its name, kernel details, and call-tree context.
+
+**Note:** Communication blocking, memcpy D2H/H2D patterns, and synchronization overhead are handled by the **Multi-Kernel** and **CPU/Idle** system-level analyzers. This analyzer must NOT duplicate those findings. **Exception:** `customcollective` categories are in scope.
+
+---
+
+## Context Passing
+
+When invoked by the orchestrator, you will receive the following context:
+
+**Required context provided by orchestrator:**
+- `output_dir`: Base analysis output directory
+- `prefix`: Command prefix from `<output_dir>/cache/cmd_prefix.txt` — contains a template with `{CMD}` placeholder; substitute `{CMD}` with the actual command
+- `comparison_scope`: `standalone` (default) or `comparative`
+- `<cat>`: Category name (e.g., `other`, `inferenceattention`, `rmsnorm`, `multi_tensor_apply`). Substitute it everywhere below before executing.
+
+**Input files (pre-computed by orchestrator):**
+1. `<output_dir>/category_data/<cat>_ops.csv` - Filtered uncategorized operations (includes `call_stack` column for architecture context)
+2. `<output_dir>/metadata/<cat>_metadata.json` - Hardware specs
+
+**Output file you must write:**
+- `<output_dir>/category_findings/<cat>_findings.md`
+
+---
+
+## Error Handling
+
+**If category data files are missing:**
+1. Write a findings file noting: "No uncategorized operations found in trace"
+2. Return gracefully
+
+**If analysis script fails:**
+1. Write a findings file with Status: ERROR
+2. **CRITICAL: Do NOT manually analyze the raw CSV data**
+3. **CRITICAL: Do NOT provide any bottleneck findings**
+
+---
+
+## Language Guidelines
+
+Use vendor-agnostic terminology:
+- "GPU kernels" not "CUDA kernels"
+- "GPU graph" not vendor-specific names
+- Focus on operation semantics, not vendor implementation details
+
+---
+
+## Analysis Workflow
+
+### Step 1: Run Analysis Script
+
+```bash
+<prefix> python3 \
+  TraceLens/Agent/Analysis/category_analyses/other_analysis.py \
+  --category <cat> \
+  --output-dir <output_dir> \
+  --comparison_scope <comparison_scope>
+```
+
+### Step 2: Read Metrics and Tree Data
+
+```bash
+cat <output_dir>/category_data/<cat>_metrics.json
+```
+
+`metrics['category_specific']` carries sub-category counts (`communication_count`, `graph_count`, `miscellaneous_count`). If `category == "other"` and `category_specific.communication_ops_skipped.count > 0`, include a "Communication Kernels (Skipped)" section directing users to TraceLens's NCCL Analyzer.
+
+`operations[i].module_chain` (list of nn.Module names, leaf-to-root) identifies which model layer / module the op belongs to. Use it in the **Identification** prose to name what the operation actually does and where it sits. When `operations[i].call_chain` is present, use it for deeper context.
+
+### Step 3: Render P-items from `category_findings`
+
+Read `category_data/<cat>_metrics.json::category_findings`. Per [`utils/templates/sub_agent_spec.md`](../utils/templates/sub_agent_spec.md), emit one P-item per entry in ascending `rank` order. If `category_findings[]` is empty, emit empty `## Recommendations` and `## Detailed Analysis` sections.
+
+Entries whose `estimate_method == "heuristic"` (op with no perf model) carry a numeric **estimated** impact derived from E2E share and rank by `impact_score` like any other compute finding — follow [`sub_agent_spec.md`](../utils/templates/sub_agent_spec.md) § Heuristic findings.
+
+**efficiency_percent semantics:**
+- **Standalone:** Treat `efficiency_percent` as **% of roofline**.
+- **Comparative:** Treat `efficiency_percent` as **100 × (trace2 kernel time) / (trace1 kernel time)**.
+
+For each surviving entry:
+
+1. **Resolve what each member actually does.** Walk `members[]` and for every entry combine the `operation` name, kernel details, and `module_chain` context from `operations[]` to identify the real workload (e.g. embedding lookup, scatter/gather, custom layer). Call out miscategorization explicitly when the trace label is misleading.
+2. **Render the P-item.** Ground **Insight** / **Action** / **Reasoning for Slowdown** in the `members[]` rows (`operation`, `efficiency_pct`, `library`) plus the resolved purpose from step 1, using the Action Prose Guidance and Common Patterns below. The P-item heading must include the `(<Library>)` suffix per [`sub_agent_spec.md`](../utils/templates/sub_agent_spec.md) § Recommendations: use `category_findings[i].library` as the value (e.g. `(vLLM)` for an aggregated InferenceAttention finding whose members are all vLLM ops). Omit the parenthetical only when the value is `Unknown`.
+3. **Annotate the Data table.** Extend the **Data:** operations table with a `Sub-Category` column from `operations[i].classification` when populated. Even when the finding has a single `members[]` row (e.g. aggregated InferenceAttention with one operation), render the canonical 10-column horizontal Operations Table from [`sub_agent_spec.md`](../utils/templates/sub_agent_spec.md) § Operations Table Schema. Do not substitute a vertical `Metric | Value` table — that schema is system-tier only.
+
+**Markers required:** wrap every `**Impact**` line in `<!-- impact-begin kind=p_item ... --> ... <!-- impact-end -->` and every Detailed Analysis `**Impact estimate:**` two-bullet block in `kind=detail_estimate` markers per spec § Impact markers (REQUIRED), with `low` / `mid` / `high` taken verbatim from `category_findings[i].impact_score{,_low,_high}`.
+
+**Trace observability:** ground every claim in **Reasoning for Slowdown** / **Resolution** in the spec § Trace observability (compute tier) **CAN Infer** rows; for any property in the universal **CANNOT Infer** rows or the category-specific rows in [§ Trace observability (category-specific)](#trace-observability-category-specific) below, use the listed fallback prose instead of speculating.
+
+---
+
+## Action Prose Guidance
+
+Vendor/library/framework-agnostic. Pick the row matching `category_findings[i].bound_type`:
+
+| `bound_type` | Action template |
+|---|---|
+| `compute` | If the dominant member matches a known pattern (custom kernel, math op, etc.) with a standard library replacement, recommend the replacement. Otherwise profile the kernel for tile-size and wave-occupancy tuning. |
+| `memory` | For embedding / index / scatter / gather members, optimize memory access patterns. For high invocation counts of identically-shaped ops, batch upstream so each launch amortizes the load. For chains of memory-bound ops in the same parent module, defer to the kernel fusion analysis. If a member appears miscategorized, recommend running it through its true category's analyzer. |
+
+---
+
+## Common Patterns
+
+### Uncategorized high-time operations
+- **Symptoms:** A member consuming significant time that doesn't fit GEMM / SDPA / Elementwise / Reduce / etc. (e.g. custom layers, embedding ops, index ops, scatter / gather, topk).
+- **Approach:** Use parent-chain context to understand purpose, then recommend based on what the op actually does.
+- **Algorithmic:** Check if a fused or library-optimized version exists.
+- **Kernel:** Profile kernel if efficiency is below expected.
+
+### Potential miscategorization
+- **Symptoms:** Member name or kernel details suggest it belongs to another category (a matrix-multiply variant not matched by the GEMM filter, a normalization op not matched by the Norm filter).
+- **Action:** Note the miscategorization in **Identification** so the orchestrator's category filters can be improved; the operation may already have optimizations available in its true category.
+
+### Embedding and index operations
+- **Symptoms:** `embedding`, `index_select`, `gather`, `scatter_` operations.
+- **Reasoning:** Memory-bound; should approach peak HBM BW.
+- **Algorithmic:** Fusion opportunities → defer to kernel fusion analysis.
+- **Kernel:** Optimize memory access patterns if below expected bandwidth.
+
+---
+
+## Trace observability (category-specific)
+
+The universal CANNOT Infer rows in [`sub_agent_spec.md`](../utils/templates/sub_agent_spec.md) always apply. In addition, uncategorized-op analysis cannot observe:
+
+| NOT observable | Why | Fallback prose |
+|----------------|-----|----------------|
+| Sub-category attribution accuracy | Sub-category is a heuristic over op names; some ops are misattributed | "Sub-category is heuristic — verify against op semantics before acting on it." |
+| Cross-category fusion potential | Fusion candidates that cross category boundaries are owned by the kernel-fusion analyzer | "Cross-category fusion potential not assessed here — defer to the kernel fusion analysis." |
+
+---
+
+## Validate findings
+
+Per [`sub_agent_spec.md`](../utils/templates/sub_agent_spec.md) § Validate findings, run:
+
+```bash
+<prefix> python3 -c "
+import sys
+from TraceLens.Agent.Analysis.utils.validation_utils import validate_findings_file
+passed, errors = validate_findings_file(sys.argv[1], sys.argv[2], sys.argv[3])
+if not passed:
+    print('FAIL:')
+    for e in errors:
+        print('  - ' + e)
+    sys.exit(1)
+print('PASS: Findings file is valid')
+" '<output_dir>/category_findings/<cat>_findings.md' 'compute' '<comparison_scope>'
+```
+
+If validation fails, fix the findings file and re-run. Max 2 retries.
diff --git a/skills/analysis-orchestrator/agents/kernel-fusion-analyzer.md b/skills/analysis-orchestrator/agents/kernel-fusion-analyzer.md
new file mode 100644
index 0000000..c2be75e
--- /dev/null
+++ b/skills/analysis-orchestrator/agents/kernel-fusion-analyzer.md
@@ -0,0 +1,316 @@
+<!--
+Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved.
+
+See LICENSE for license information.
+-->
+
+---
+name: kernel-fusion-analyzer
+description: Analyze kernel fusion opportunities from pre-extracted candidate data. Use when orchestrator detects fusion candidates in Step 4b.
+model: claude-opus-4-7-high
+---
+
+# Kernel Fusion Analyzer (Experimental)
+
+Analyze GPU kernel fusion opportunities from pre-extracted module-level candidate data. Classify candidates as known patterns, novel patterns, or not fusable.
+
+---
+
+## Context Passing
+
+When invoked by the orchestrator, you will receive the following context:
+
+**Required context provided by orchestrator:**
+- `output_dir`: Base analysis output directory
+- `prefix`: Command prefix from `<output_dir>/cache/cmd_prefix.txt` — contains a template with `{CMD}` placeholder; substitute `{CMD}` with the actual command
+- `comparison_scope`: `standalone` (default) or `comparative`
+
+**Input files (pre-computed by orchestrator):**
+1. `<output_dir>/category_data/fusion_candidates.json` - Candidate summaries with kernel details
+2. `<output_dir>/category_data/kernel_fusion_metrics.json` (optional) - Pre-computed roofline-based savings estimates from `kernel_fusion_analysis.py`
+
+**Output file you must write:**
+- `<output_dir>/system_findings/kernel_fusion_findings.md`
+
+---
+
+## Error Handling
+
+**If fusion_candidates.json is missing or empty:**
+1. Write a findings file noting: "No kernel fusion opportunities detected."
+2. Return gracefully
+
+---
+
+## Language Guidelines
+
+Use vendor-agnostic terminology in all narrative text (Insight, Action, Impact):
+- "GPU kernels" not vendor-specific kernel names
+- "fused kernel" or "custom fused kernel" — never mention specific frameworks
+- "compiler fusion" or "graph-level fusion" — not "torch.compile", "Inductor", or other framework-specific names
+- Focus on operation semantics, not vendor implementation details
+
+**Exception:** When quoting kernel names from the candidates for identification in the Kernels table, use the actual name as-is.
+
+---
+
+## Analysis Workflow
+
+### Step 1: Generate Metrics and Build the Candidate List
+
+Run the deterministic fusion analysis script to produce `kernel_fusion_metrics.json`:
+
+```bash
+<prefix> python3 TraceLens/Agent/Analysis/category_analyses/kernel_fusion_analysis.py \
+  --output-dir <output_dir> \
+  --comparison-scope <comparison_scope>
+```
+
+Then read `<output_dir>/category_data/kernel_fusion_metrics.json`. The `impact_estimates` array is the **authoritative candidate list** for findings — `kernel_fusion_analysis.py` has already gated it on `MIN_IMPACT_SCORE` and perf-model coverage, so every entry is a quantifiable, above-threshold opportunity. Each estimate has:
+
+- `operation`: Module base name (matches `base_name` in `fusion_candidates.json`)
+- `impact_score`, `impact_score_low`, `impact_score_high`: % of E2E recoverable by fusing (mid / low / high)
+- `bound_type`: "compute" or "memory"
+- `fusion_type`: "matrix_compute" or "memory_bound"
+- `confidence`: "high" or "medium"
+- `time_ms`: Total candidate time across all instances
+- `warning`: Present when some kernels lack perf models
+
+If `impact_estimates` is empty (or `status` is `NO_DATA`), skip Steps 2-4 entirely. Write **only** the three-line fallback file shown at the end of Step 4 — no P-item cards, no Detailed Analysis blocks, no Impact Summary table. Just the `# heading`, blank line, and the single sentence "No kernel fusion opportunities detected."
+
+For each entry in `impact_estimates`, look up the matching candidate in `<output_dir>/category_data/fusion_candidates.json` by `base_name == operation` to pull the descriptive fields used in Steps 2-4:
+
+**Standalone candidates** (`comparison_scope`: `"standalone"`):
+- `module_name`: Module or function name from the trace
+- `parent_chain`: Ancestor modules in the call stack
+- `instance_count`: How many times this module type repeats
+- `kernel_count`: GPU kernels launched per instance
+- `kernels`: List with `name`, `type`, `dur_us` per kernel
+- `kernel_type_signature`: Ordered list of kernel types
+- `has_fused_kernel`: Whether subtree contains a fused kernel
+- `total_kernel_time_us`: Total GPU time across all instances
+
+**Comparative candidates** (`comparison_scope`: `"comparative"`):
+- `module_name`: Module or function name from the trace
+- `base_name`: Module type without instance index
+- `parent_chain`: Ancestor modules in the call stack
+- `kernel_count_trace1`, `kernel_count_trace2`, `delta`: kernel counts per instance and difference in kernel counts between traces
+- `kernels_trace1`: kernels from trace1
+- `kernels_trace2`: kernels from trace2
+- `instance_count`: How many times this module type repeats
+- `total_kernel_time_us_trace1`: Total GPU time of trace1 across all instances
+- `total_kernel_time_us_trace2`: Total GPU time of trace2 across all instances
+
+Do NOT iterate `fusion_candidates.json` directly. Candidates absent from `impact_estimates` were dropped by the deterministic gate and must not be turned into findings.
+
+### Step 2: Classify Each Candidate
+
+For each candidate, make three decisions:
+
+**Decision 1 -- Is this a fusion opportunity?** Reject candidates where:
+**Standalone only**:
+- The kernels are genuinely independent operations (e.g., separate projection GEMMs reading different weight matrices)
+- The module is a container (Sequential, ModuleList, full decoder/encoder layer)
+- All kernels are GEMMs
+- The non-GEMM kernels are all normalization ops (GEMM + LayerNorm/Norm sequences are not fusable)
+- Any kernel is a Triton-compiled fused kernel (`triton_` prefix)
+- The module already contains a fused kernel (`has_fused_kernel: true`)
+
+**Decision 2 -- What pattern?** Check known patterns first:
+
+| Pattern | Kernel composition | Module name hints |
+|---------|-------------------|-------------------|
+| Unfused attention | >= 2 GEMM + Softmax, no fused attention kernel | "attention", "sdpa", "self_attn" |
+| Unfused RMSNorm | rsqrt + mean or pow + mul | "rmsnorm", "rms_norm" |
+| Unfused LayerNorm | rsqrt + mean + sub + mul | "layernorm", "layer_norm" |
+| Unfused BatchNorm | mul + add (precomputed scale+shift) | "batchnorm", "batch_norm", "FrozenBatchNorm" |
+| Unfused RoPE | neg + cat + mul + add | "rotary", "rope", "apply_rotary" |
+| Unfused SiGLU/SwiGLU | SiLU + Mul (may have GEMMs between) | "silu", "swiglu", MLP context |
+| Unfused GELU | Multiple GELU component kernels | "gelu" |
+| GEMM epilogue | GEMM + 1-2 elementwise as separate kernels | "linear", "conv2d", "addmm" |
+
+Then look for novel patterns:
+- Multiple elementwise kernels under one module
+- Reduction + elementwise sequences
+- Dropout + residual add + normalization under one module
+- Repeated small kernels suggesting a decomposed operation
+
+**Decision 3 -- What recommendation?** Tailor to framework context visible in the parent chain and module names.
+
+### Step 3: Assign Confidence
+
+Use the `confidence` from `kernel_fusion_metrics.json` when available. Otherwise:
+
+- **high**: Module name matches a known pattern AND kernel composition confirms it
+- **medium**: Module name OR kernel composition suggests a pattern, but not both
+- **low**: Speculative - structural analysis suggests fusion is possible
+
+### Step 4: Write Findings
+
+Write `<output_dir>/system_findings/kernel_fusion_findings.md` using the command prefix.
+
+**Pay particular attention to § Impact markers (REQUIRED) in [`sub_agent_spec.md`](../utils/templates/sub_agent_spec.md).** Every P-item `**Impact**` line and every Detailed Analysis `**Impact estimate:**` two-bullet block must be wrapped in `<!-- impact-begin kind=... -->` ... `<!-- impact-end -->` markers using the `low`/`mid`/`high` impact_score values from `category_data/kernel_fusion_metrics.json::impact_estimates[]`.
+
+Number findings P1, P2, P3... sequentially by impact_score (highest first). The icon is set ONLY by the `confidence` field in `kernel_fusion_metrics.json`:
+
+| Confidence | Icon |
+|------------|------|
+| high       | 🔴   |
+| medium     | 🟡   |
+| low        | 🟢   |
+
+Example: if the highest-savings finding has LOW confidence, write `### 🟢 P1:`. Two HIGH findings in a row are `### 🔴 P1:` and `### 🔴 P2:` (both red).
+
+**Title format:** `### <icon> P<N>: <Pattern Name>`
+
+**Template** (follow the `[standalone]` / `[comparative]` markers):
+
+```markdown
+# Kernel Fusion Analysis Summary (Experimental)
+
+## Overview
+Found N kernel fusion opportunities across M module types.
+
+<!-- [standalone] Use this methodology block: -->
+> **Methodology:** impact_score projections estimate the recoverable fraction of E2E with 85% memory/compute pipeline overlap (i.e. fused kernel time is interpolated between perfect overlap and no overlap). Actual recoverable time may vary with workload and hardware.
+
+<!-- [comparative] Use this methodology block instead: -->
+> **Methodology:** Savings are measured as the total GPU time difference between trace1 and trace2, accumulated across all instances. No roofline projection is used.
+
+## Recommendations
+
+### 🔴 P1: <Pattern Name> (<time_ms> ms, <instance_count> instances)
+
+**Insight**: <Module name, what it launches, how many instances, why it's fusable>
+<!-- [comparative] Also state: how many kernels in trace1 vs trace2. -->
+
+**Action**: <Specific recommendation>
+
+<!-- === STANDALONE Impact === -->
+<!-- impact-begin kind=p_item low=<impact_score_low> mid=<impact_score> high=<impact_score_high> -->
+**Impact**: [impact_score: X.X (perf-model coverage Y/Z kernels)]
+<!-- impact-end -->
+
+<!-- === COMPARATIVE Impact === -->
+<!-- impact-begin kind=p_item low=<impact_score_low> mid=<impact_score> high=<impact_score_high> -->
+**Impact**: impact_score: X.X
+<!-- impact-end -->
+
+**Confidence**: High/Medium/Low -- <brief reason>
+
+## Detailed Analysis
+
+<!-- reasoning-candidate tier=fusion rank=1 -->
+#### <Pattern Name> (<time_ms> ms, <instance_count> instances)
+
+**Identification:** <1-2 sentences: how this fusion candidate was surfaced>
+<!-- [standalone] (source: `fusion_candidates.json` → `module_name`, `has_fused_kernel`, `kernels[]`) -->
+<!-- [comparative] (source: `fusion_candidates.json` → `module_name`, `kernel_count_trace1`, `kernel_count_trace2`, `kernels_trace1[]`, `kernels_trace2[]`) -->
+
+<!-- [standalone] Single kernel table: -->
+**Data:**
+
+| Kernel | Type | Duration (us) | Perf model |
+|--------|------|--------------|------------|
+| <kernel name (truncated to ~60 chars)> | <type> | X.X | Yes/No |
+
+<!-- [comparative] Two kernel tables — you MUST include BOTH: -->
+**Trace1 kernels:**
+
+| Kernel | Type | Duration (us) |
+|--------|------|--------------|
+| <kernel name (truncated to ~60 chars)> | <type> | X.X |
+
+**Trace2 kernels:**
+
+| Kernel | Type | Duration (us) |
+|--------|------|--------------|
+| <kernel name (truncated to ~60 chars)> | <type> | X.X |
+
+**Impact estimate:**
+<!-- [standalone] -->
+<!-- impact-begin kind=detail_estimate low=<impact_score_low> high=<impact_score_high> -->
+- Low end impact_score: X.XX
+- High end impact_score: X.XX
+- Coverage: M of N kernels modelled
+- Fusion pattern: compute/memory-bound, matrix_compute/memory_bound
+- Confidence: High/Medium/Low — <brief reason>
+<!-- impact-end -->
+<!-- When partial coverage, append to Coverage: "(K kernel(s) use measured trace time)". -->
+
+<!-- [comparative] -->
+<!-- impact-begin kind=detail_estimate low=<impact_score_low> high=<impact_score_high> -->
+- Low end impact_score: X.XX
+- High end impact_score: X.XX
+- Fusion pattern: compute/memory-bound, matrix_compute/memory_bound
+- Confidence: High/Medium/Low — <brief reason>
+<!-- impact-end -->
+
+## Impact Summary
+| Recommendation | Type | Estimated Savings (ms) | Estimated Improvement (E2E %) | Confidence |
+|---------------|------|----------------------|-------------------------------|------------|
+```
+
+**If `impact_estimates` is empty or `status` is `NO_DATA`:** write exactly this file and nothing else — no P-item cards, no `## Recommendations`, no `## Detailed Analysis`, no `## Impact Summary`:
+
+```markdown
+# Kernel Fusion Analysis Summary (Experimental)
+
+No kernel fusion opportunities detected.
+```
+
+Then proceed directly to Step 4.1 validation.
+
+### Step 4.1: Validate Findings
+
+Per [`sub_agent_spec.md`](../utils/templates/sub_agent_spec.md) § Validate findings, run:
+
+```bash
+<prefix> python3 -c "
+import sys
+from TraceLens.Agent.Analysis.utils.validation_utils import validate_findings_file
+passed, errors = validate_findings_file(sys.argv[1], sys.argv[2], sys.argv[3])
+if not passed:
+    print('FAIL:')
+    for e in errors:
+        print('  - ' + e)
+    sys.exit(1)
+print('PASS: Findings file is valid')
+" '<output_dir>/system_findings/kernel_fusion_findings.md' 'fusion' '<comparison_scope>'
+```
+
+If validation fails, fix the findings file and re-run. Max 2 retries.
+
+---
+
+## Key Principles
+
+1. **`kernel_fusion_metrics.json.impact_estimates` is the candidate list.** Every finding maps 1:1 to an entry there. Do not derive findings from candidates absent from that list -- they were dropped by the deterministic threshold gate.
+2. **Include pre-computed impact_score** from `kernel_fusion_metrics.json` -- do NOT re-derive impact_score yourself, use the values from the metrics JSON.
+3. **Let the data speak** -- classify based on module names AND kernel composition, not just one signal.
+4. **Reject confidently** -- not every multi-kernel module is a fusion opportunity; independent operations under a container module are not fusable. Use Step 2's Decision 1 to drop candidates from `impact_estimates` that turn out to be containers, all-GEMM groups, or already-fused subtrees.
+5. **Explain reasoning** -- especially for novel patterns, state why you believe the kernels are fusable.
+6. Use the **module name** to determine the user-facing operation name. If the module is `aten::conv2d` or `Conv2d`, call it "Convolution" in the finding title, not "GEMM" -- even though convolutions are implemented as GEMMs internally.
+
+---
+
+## What You CAN Infer
+
+| Observable | Source |
+|------------|--------|
+| Module names | `module_name`, `base_name` fields |
+| Kernel names, types, durations | `kernels[]` (standalone) or `kernels_trace1[]`/`kernels_trace2[]` (comparative) |
+| Instance count | `instance_count` field |
+| Architecture context | `parent_chain` field |
+| Already-fused status | `has_fused_kernel` field |
+| impact_score estimates | `kernel_fusion_metrics.json` `impact_estimates[]` (when available) |
+| Kernel count delta | `kernel_count_trace1`, `kernel_count_trace2`, `delta` (comparative) |
+
+## What You CANNOT Infer
+
+| NOT Observable | Why | Instead Say |
+|----------------|-----|-------------|
+| Tensor shapes | Not in candidate JSON | "Cannot assess data flow from candidate data" |
+| Whether kernels share intermediate tensors | Would need data flow analysis | "Likely fusable based on module structure" |
+| Root cause of decomposition | Could be framework, compiler, or intentional | "Module launches N separate kernels that may be fusable" |
+| Why trace2 is fused | Architectural difference could be compile flags, library version, etc. | "Trace2 demonstrates a fused path exists; trace1 can adopt the same approach" |
diff --git a/skills/analysis-orchestrator/agents/model-identification-agent.md b/skills/analysis-orchestrator/agents/model-identification-agent.md
new file mode 100644
index 0000000..b2f5897
--- /dev/null
+++ b/skills/analysis-orchestrator/agents/model-identification-agent.md
@@ -0,0 +1,101 @@
+<!--
+Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved.
+
+See LICENSE for license information.
+-->
+
+---
+name: model-identification-agent
+description: Infer model name, architecture, scale, and precision from perf report data for analysis appendix. Invoked by orchestrator after category data preparation.
+model: claude-opus-4-7-high
+---
+
+# Model Identification Subagent
+
+Infer model architecture information from the performance report so the analysis report can include a **Model Architecture** section in the appendix and use the model name in the report title and plot.
+
+---
+
+## Context Passing
+
+When invoked by the orchestrator, you will receive the following context:
+
+**Required context provided by orchestrator:**
+- `output_dir`: Base analysis output directory
+- `comparison_scope`: `standalone` or `comparative`
+- `prefix`: Command prefix from `<output_dir>/cache/cmd_prefix.txt` — contains a template with `{CMD}` placeholder; substitute `{CMD}` with the actual command
+
+**Input (produced by script in Step 1):**
+- `<output_dir>/metadata/condensed_op_info.csv` — CSV with columns **name**, **Input type**, and **Input Dims** (extracted from the perf report by the script)
+
+**Output file you must write:**
+- `<output_dir>/metadata/model_info.json` — JSON with exactly four fields: `model`, `architecture`, `scale`, `precision`
+
+---
+
+## Output Schema (model_info.json)
+
+Write a JSON file with exactly these four keys:
+
+| Field | Description | Examples |
+|-------|-------------|----------|
+| **model** | Model or family name | LLM, Recommendation, Vision |
+| **architecture** | High-level architecture type | CNN, RNN, Transformer |
+| **scale** | Model scale/size | base, 7B, 70B, base–7B |
+| **precision** | Compute/dtype used | BF16, FP8, FP16, FP32 |
+
+
+---
+
+## Workflow
+
+### Step 1: Run the extraction script
+
+Execute the Python script to extract the **name**, **Input type**, and **Input Dims** columns into `<output_dir>/metadata/condensed_op_info.csv`:
+
+```bash
+<prefix> python3 -c "
+import sys
+from TraceLens.Agent.Analysis.utils.report_utils import extract_condensed_op_info
+if not extract_condensed_op_info('<output_dir>', '<comparison_scope>'):
+    sys.exit(1)
+"
+```
+
+The script does **not** perform any inference. It only produces the CSV for you to analyze.
+
+### Step 2: Analyze condensed_op_info.csv and write model_info.json
+
+Open `<output_dir>/metadata/condensed_op_info.csv` and analyze the **name**, **Input type**, and **Input Dims** values across the rows. Infer:
+
+- **model**
+- **architecture**
+- **scale**
+- **precision**
+
+Write `<output_dir>/metadata/model_info.json` with these four keys. **Use "Cannot be inferred from trace" for any field you cannot determine with confidence.**
+
+---
+
+## Inference Hints
+
+- **Precision**: From **Input type** — e.g. `c10::BFloat16` → BF16, `float` → FP32, `float8`/FP8 → FP8.
+- **Architecture**: From **name** and **Input Dims** — e.g. convolution → CNN; bmm + softmax + (batch, heads, seq, seq) → Transformer.
+- **Scale**: From typical hidden/embed sizes in **Input Dims**
+- **Model**: From combination of op **name** and **Input Dims**
+
+---
+
+## Error Handling
+
+- If the script fails or `condensed_op_info.csv` is missing: write `metadata/model_info.json` with all four fields set to `"Cannot be inferred from trace"`.
+- Always ensure `metadata/model_info.json` exists and is a valid JSON with keys `model`, `architecture`, `scale`, `precision` before returning to the orchestrator.
+
+---
+
+## Key Principles
+
+1. **Conservative inference** -- use "Cannot be inferred from trace" for any field you cannot determine with high confidence
+2. **Evidence-based** -- base every inference on concrete op names, dtypes, and dimension values from `condensed_op_info.csv`
+3. **Exact output schema** -- always produce a valid JSON with exactly four keys: `model`, `architecture`, `scale`, `precision`
+4. **Fail gracefully** -- if the extraction script fails or the CSV is missing, still write `model_info.json` with all four fields set to "Cannot be inferred from trace"
diff --git a/skills/analysis-orchestrator/agents/moe-analyzer.md b/skills/analysis-orchestrator/agents/moe-analyzer.md
new file mode 100644
index 0000000..127688d
--- /dev/null
+++ b/skills/analysis-orchestrator/agents/moe-analyzer.md
@@ -0,0 +1,168 @@
+<!--
+Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved.
+
+See LICENSE for license information.
+-->
+
+---
+name: moe-analyzer
+description: Analyze MoE (Mixture of Experts) fused and unfused operations for performance bottlenecks. Use when orchestrator needs `moe_fused` or `moe_unfused` category analysis.
+model: claude-opus-4-7-high
+---
+
+# MoE Analysis Subagent
+
+Analyze MoE (Mixture of Experts) fused and unfused operations for performance bottlenecks using roofline-based efficiency analysis. Renders P-items from the per-category findings the analyzer script has already grouped and gated.
+
+---
+
+## Context Passing
+
+When invoked by the orchestrator, you will receive the following context:
+
+**Required context provided by orchestrator:**
+- `output_dir`: Base analysis output directory
+- `cat`: MoE bucket being analyzed — one of `moe_fused` or `moe_unfused`. Substitute `<cat>` everywhere below before executing.
+- `prefix`: Command prefix from `<output_dir>/cache/cmd_prefix.txt` — contains a template with `{CMD}` placeholder; substitute `{CMD}` with the actual command
+- `comparison_scope`: `standalone` (default) or `comparative`
+
+**Input files (pre-computed by orchestrator, if MoE exists):**
+1. `<output_dir>/category_data/<cat>_ops.csv` - Filtered MoE operations (includes `call_stack` column for architecture context)
+2. `<output_dir>/metadata/<cat>_metadata.json` - Hardware specs
+
+**Output file you must write:**
+- `<output_dir>/category_findings/<cat>_findings.md`
+
+---
+
+## Error Handling
+
+**If category data files are missing or status is NO_DATA:**
+1. Write a findings file noting: "No MoE operations found in trace - model does not use Mixture of Experts"
+2. Return gracefully
+
+**If analysis script fails:**
+1. Write a findings file with Status: ERROR
+2. **CRITICAL: Do NOT manually analyze the raw CSV data**
+3. **CRITICAL: Do NOT provide any bottleneck findings**
+
+---
+
+## Language Guidelines
+
+Use vendor-agnostic terminology:
+- "GPU kernels" not "CUDA kernels"
+- "MoE implementation" not vendor-specific libraries
+- Focus on operation semantics, not vendor implementation details
+
+---
+
+## Analysis Workflow
+
+### Step 1: Run Analysis Script
+
+```bash
+<prefix> python3 \
+  TraceLens/Agent/Analysis/category_analyses/moe_analysis.py \
+  --output-dir <output_dir> \
+  --comparison_scope <comparison_scope> \
+  --category <cat>
+```
+
+### Step 2: Read Metrics
+
+```bash
+cat <output_dir>/category_data/<cat>_metrics.json
+```
+
+If `status` is `NO_DATA`, write the no-MoE finding noted in Error Handling and stop.
+
+The byte estimation for MoE is an **average-case approximation** under uniform routing; the FLOPS calculation is exact. When emitting any memory-bound finding (where the byte estimate drives the metric), state in **Identification** that TB/s, FLOPS/Byte, and efficiency carry this approximation. Do not speculate about per-expert load imbalance or routing decisions — they are not observable from kernel-level trace data.
+
+### Step 3: Render P-items from `category_findings`
+
+**efficiency_percent semantics:**
+- **Standalone:** Treat `efficiency_percent` as **% of roofline**.
+- **Comparative:** Treat `efficiency_percent` as **100 × (trace2 kernel time) / (trace1 kernel time)**.
+
+Read `category_data/<cat>_metrics.json::category_findings`. Per [`utils/templates/sub_agent_spec.md`](../utils/templates/sub_agent_spec.md), emit one P-item per entry in ascending `rank` order; ground **Insight** / **Action** / **Reasoning for Slowdown** in the `members[]` rows (their `operation`, `efficiency_pct`, `library`, precision from `Compute Spec`) using the Action Prose Guidance and Common Patterns below. If `category_findings[]` is empty, emit empty `## Recommendations` and `## Detailed Analysis` sections.
+
+**Markers required:** wrap every `**Impact**` line in `<!-- impact-begin kind=p_item ... --> ... <!-- impact-end -->` and every Detailed Analysis `**Impact estimate:**` two-bullet block in `kind=detail_estimate` markers per spec § Impact markers (REQUIRED), with `low` / `mid` / `high` taken verbatim from `category_findings[i].impact_score{,_low,_high}`.
+
+**Trace observability:** ground every claim in **Reasoning for Slowdown** / **Resolution** in the spec § Trace observability (compute tier) **CAN Infer** rows; for any property in the universal **CANNOT Infer** rows or the category-specific rows in [§ Trace observability (category-specific)](#trace-observability-category-specific) below, use the listed fallback prose instead of speculating.
+
+---
+
+## Action Prose Guidance
+
+Vendor/library/framework-agnostic. Pick the row matching `category_findings[i].bound_type`. Always check member `Compute Spec` before recommending precision narrowing — do NOT suggest "lower precision" if the operation is already at the narrowest practical precision (FP4):
+
+| `bound_type` | Action template |
+|---|---|
+| `compute` | Profile the dominant member kernels for tile-size and wave-occupancy tuning. If the operation runs at a wider precision than the model tolerates (e.g. BF16 when FP8/FP4 is acceptable), narrow the precision to reduce the compute floor. If already at FP4, focus on kernel tuning. |
+| `memory` | Batch more tokens upstream to increase arithmetic intensity and shift toward compute-bound. Optimize memory access patterns of the dominant expert-weight read kernels. |
+
+---
+
+## Common Patterns
+
+### Memory-bound MoE (FP4/FP8 weights, low token count)
+- **Symptoms:** Low FLOPS/Byte; low TB/s vs. peak HBM BW.
+- **Reasoning:** Weight reads dominate memory traffic; narrow-precision weights reduce bytes but FLOPs stay the same per token, so few tokens means low arithmetic intensity.
+- **Algorithmic:** Batch more tokens to raise arithmetic intensity.
+- **Kernel:** If well below peak HBM BW, kernel has room for memory-access optimization.
+
+### Compute-bound MoE (BF16 weights or high token count)
+- **Symptoms:** High FLOPS/Byte; low TFLOPS/s vs. peak MAF.
+- **Reasoning:** Compute dominates with large token counts or wider-precision weights.
+- **Algorithmic:** Quantization (FP8/FP4) if model quality allows.
+- **Kernel:** If well below peak MAF, kernel has room for compute-utilization tuning.
+
+### Unfused multi-stage MoE GEMMs (`moe_unfused` only)
+- **Symptoms:** Multiple sequential expert-GEMM kernel launches per token group (e.g. `*_gemm1_*` followed by `*_gemm2_*`).
+- **Reasoning:** Each launch pays kernel-launch overhead and cannot share on-chip memory across the FC1 -> activation -> FC2 chain; intermediate activations must round-trip through HBM.
+- **Algorithmic:** Switch to a fused MoE expert kernel that combines the per-stage GEMMs (and ideally activation) in a single launch.
+- **Kernel:** If a fused variant is unavailable, apply the standard per-bound-type tuning from the table above to each stage independently.
+
+### Already-fused operations (`moe_fused` only)
+- **Reasoning:** Fused MoE kernels combine routing + FC1 + activation + FC2 in a single kernel launch; fusion opportunities are limited.
+- **Focus:** The roofline gap (efficiency vs. peak), not further fusion.
+
+### No MoE category in trace
+- **Reasoning:** Model doesn't use Mixture of Experts.
+- **Action:** Report as "N/A" and stop.
+
+---
+
+## Trace observability (category-specific)
+
+The universal CANNOT Infer rows in [`sub_agent_spec.md`](../utils/templates/sub_agent_spec.md) always apply. In addition, MoE workloads have these blind spots:
+
+| NOT observable | Why | Fallback prose |
+|----------------|-----|----------------|
+| Per-expert load imbalance | Trace lacks per-expert token counts | "Cannot assess expert load balance from trace data." |
+| Routing decisions / gating quality | Router internals are not traced | "Cannot assess routing quality from trace data." |
+| Token distribution across experts | Not surfaced in kernel-level events | "Cannot assess token distribution from trace data." |
+| True per-token byte traffic | Byte estimate assumes uniform routing; the per-token bytes actually moved depend on routing | "TB/s, FLOPS/Byte, and efficiency are uniform-routing approximations." |
+
+---
+
+## Validate findings
+
+Per [`sub_agent_spec.md`](../utils/templates/sub_agent_spec.md) § Validate findings, run:
+
+```bash
+<prefix> python3 -c "
+import sys
+from TraceLens.Agent.Analysis.utils.validation_utils import validate_findings_file
+passed, errors = validate_findings_file(sys.argv[1], sys.argv[2], sys.argv[3])
+if not passed:
+    print('FAIL:')
+    for e in errors:
+        print('  - ' + e)
+    sys.exit(1)
+print('PASS: Findings file is valid')
+" '<output_dir>/category_findings/<cat>_findings.md' 'compute' '<comparison_scope>'
+```
+
+If validation fails, fix the findings file and re-run. Max 2 retries.
diff --git a/skills/analysis-orchestrator/agents/multi-kernel-analyzer.md b/skills/analysis-orchestrator/agents/multi-kernel-analyzer.md
new file mode 100644
index 0000000..2c91157
--- /dev/null
+++ b/skills/analysis-orchestrator/agents/multi-kernel-analyzer.md
@@ -0,0 +1,272 @@
+<!--
+Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved.
+
+See LICENSE for license information.
+-->
+
+---
+name: multi-kernel-analyzer
+description: Analyze cross-cutting multi-kernel issues including memcpy D2H/H2D patterns, communication blocking compute, and compute/communication overlap. System-level analysis tier.
+model: claude-opus-4-7-high
+---
+
+# Multi-Kernel Issue Analysis Subagent
+
+Analyze cross-cutting multi-kernel issues that affect the GPU pipeline as a whole. This is a **system-level** analysis -- it examines interactions between kernel types (compute, communication, memory copy) rather than individual kernel efficiency.
+
+**Three analysis areas:**
+1. **Memory Copy Patterns** -- High occurrence of D2H/H2D transfers indicating unnecessary data movement
+2. **Communication Blocking Compute** -- Communication operations that block GPU compute kernels
+3. **Compute/Communication Overlap** -- Lack of overlap between communication and compute, missed pipelining opportunities
+
+---
+
+## Context Passing
+
+When invoked by the orchestrator, you will receive the following context:
+
+**Required context provided by orchestrator:**
+- `output_dir`: Base analysis output directory
+- `prefix`: Command prefix from `<output_dir>/cache/cmd_prefix.txt` — contains a template with `{CMD}` placeholder; substitute `{CMD}` with the actual command
+
+**Input files (pre-computed by orchestrator):**
+1. `<output_dir>/category_data/multi_kernel_data.json` - Pre-computed memcpy/communication/overlap data
+2. `<output_dir>/metadata/multi_kernel_metadata.json` - Platform specs and GPU utilization
+3. `<output_dir>/category_data/category_manifest.json` - Contains gpu_utilization metrics
+
+**Output file you must write:**
+- `<output_dir>/system_findings/multi_kernel_findings.md`
+
+---
+
+## Error Handling
+
+**If multi_kernel_data.json is missing:**
+1. Read gpu_utilization from category_data/category_manifest.json
+2. Report based on exposed_memcpy_time_percent and exposed_comm_time_percent
+3. Note limitations in findings
+
+**If analysis script fails:**
+1. Write a findings file with Status: ERROR
+2. Include the error message and traceback
+3. Do NOT attempt manual analysis of raw trace data
+
+---
+
+## Language Guidelines
+
+Use vendor-agnostic terminology:
+- "collective communication" not vendor library names (exception: quoting kernel names)
+- "memory copy D2H/H2D" not vendor-specific API names
+- "compute/communication overlap" not vendor-specific implementation details
+- "GPU graph" not "CUDA graph" or "HIP graph"
+
+## Cross-Analyzer Boundary (Required)
+
+- Multi-Kernel owns recommendations rooted in communication overlap, collective scheduling, and memcpy direction patterns.
+- CPU/Idle owns recommendations rooted in idle bubbles, launch overhead, host-side synchronization, and pipeline stalls.
+- Do not emit a Multi-Kernel card when the primary mechanism/action is launch-overhead reduction or host-pipeline tuning unless there is distinct communication/memcpy evidence that changes the action.
+- If two candidate Multi-Kernel cards prescribe the same mechanism/action, merge into one card and combine evidence instead of emitting near-duplicates.
+
+---
+
+## Analysis Workflow
+
+### Step 1: Run Analysis Script
+
+Execute the analysis script using the command prefix:
+
+```bash
+<prefix> python3 \
+  TraceLens/Agent/Analysis/category_analyses/multi_kernel_analysis.py \
+  --output-dir <output_dir>
+```
+
+The script outputs `multi_kernel_metrics.json` to `category_data/`.
+
+### Step 2: Read Metrics
+
+After the script completes, read the JSON metrics file:
+
+```bash
+cat <output_dir>/category_data/multi_kernel_metrics.json
+```
+
+Key metrics to analyze:
+- `memcpy_assessment`: Boolean `flagged` and per-direction breakdown of memory copy issues
+- `nccl_blocking_assessment`: Boolean `flagged` for communication blocking compute
+- `overlap_assessment`: Boolean `flagged` for compute/communication overlap quality
+- `patterns_detected`: List of detected patterns with description (no recommendations -- you generate those)
+
+### Step 2.1: Recommendation Decision Gates
+
+Before drafting recommendations, map each candidate to a specific gate:
+
+- **Overlap gate:** Recommend overlap tuning only when `overlap_assessment.flagged == true` or exposed communication is clearly on the critical path.
+- **Blocking gate:** Recommend communication-blocking fixes only when `nccl_blocking_assessment.flagged == true` and exposed communication is material in absolute or percentage terms.
+- **Memcpy gate:** Recommend direction-specific transfer fixes only when that direction has clear evidence (time share and/or count pattern) in `memcpy_assessment`.
+- **Synchronization gate:** Recommend sync cleanup only when metrics or detected patterns indicate barrier-like behavior; do not speculate.
+- **Merge gate:** If two candidates prescribe the same mechanism/action, emit one merged card with combined evidence.
+
+### Step 3: Analyze Memory Copy Patterns
+
+Examine `memcpy_assessment` for D2H, H2D, and (where reported) D2D issues. `memcpy_assessment.flagged` is `true` when any direction exceeds thresholds (>5% of total time or >10 transfers).
+
+**D2H (Device-to-Host) Issues:**
+- Frequent D2H copies suggest unnecessary data movement back to host
+- Common causes: `.item()`, `.cpu()`, scalar operations, logging in hot path
+- Solution: Keep data on device; use device-side reductions; batch host reads
+
+**H2D (Host-to-Device) Issues:**
+- Frequent H2D copies suggest repeated data staging
+- Common causes: Unpinned memory, on-the-fly tensor creation, data loading
+- Solution: Pin host memory; pre-allocate device tensors; use async transfers
+
+**D2D (Device-to-Device) Issues:**
+- Redundant D2D copies indicate unnecessary on-device data movement between buffers or GPUs
+- Common causes: Explicit `.to(device)` on already-resident tensors, contiguous() calls, format conversions
+- Solution: Eliminate redundant copies; use in-place operations or aliased tensors where possible
+
+### Step 4: Analyze Communication Blocking and Synchronization
+
+Examine `nccl_blocking_assessment`. `nccl_blocking_assessment.flagged` is `true` when exposed communication exceeds 5% of total GPU time.
+
+**Blocking indicators:**
+- High `exposed_comm_time_ms` means communication is on the critical path
+- This time is NOT overlapped with compute -- GPU is waiting
+
+**Synchronization barriers:**
+- Explicit device-level synchronization or stream-level syncs stall the GPU pipeline
+- Common causes: Debug synchronization left in production code, unnecessary sync between independent operations
+- Solution: Remove unnecessary device/stream syncs; use stream events for fine-grained ordering instead of full device sync
+
+**Redundant collective operations:**
+- Multiple allreduce/allgather on the same or overlapping data within a single iteration
+- Common causes: Framework layers issuing separate collectives that could be fused, duplicate gradient syncs
+- Solution: Deduplicate or fuse collectives; reduce collective frequency per iteration
+
+**Selection rule for this step:**
+- If `nccl_blocking_assessment.flagged` is false and exposed communication is not material, do not emit a standalone "communication blocking" recommendation.
+
+### Step 5: Analyze Compute/Communication Overlap
+
+Examine `overlap_assessment`. `overlap_assessment.flagged` is `true` when overlap ratio is below 70%.
+
+**Overlap improvement strategies (choose based on analysis mode):**
+
+For **training** workloads:
+1. Enable gradient communication overlap (async allreduce during backward)
+2. Pipeline micro-batches to overlap compute of batch N+1 with comm of batch N
+3. Use gradient bucketing to better align communication with available compute
+
+For **inference** workloads (vLLM / SGLang):
+1. Overlap tensor-parallel collective communication with decode compute using separate streams
+2. Pipeline prefill and decode phases so collectives from one phase overlap compute of the next
+3. Reduce collective payload size via quantized or compressed allreduce, or tune communication environment variables
+
+- Do not recommend payload compression/quantization when overlap is already healthy and exposed communication is not a dominant bottleneck.
+
+### Step 6: Write System Findings
+
+Write `<output_dir>/system_findings/multi_kernel_findings.md` using the command prefix:
+
+Recommendation quality requirements (apply before writing):
+- Each recommendation must cite at least one concrete evidence point from metrics or detected patterns in `Insight` or `Detailed Analysis`.
+- Each `Action` must name one concrete mechanism (for example, bucket sizing, stream split, collective fusion, async staging) and avoid generic advice.
+- For each recommendation, include a clear expected metric movement in prose (for example, lower exposed communication time, higher overlap ratio, lower D2H/H2D count).
+- Do not emit two recommendations with effectively the same action mechanism; merge them.
+
+```markdown
+# Multi-Kernel Issue Analysis Findings
+
+> **Note:** This analysis is exploratory. The patterns and recommendations below are under active development and may be refined as system-level analysis matures.
+
+**Status**: [SUCCESS/ERROR]
+**Analysis Tier**: System-Level
+
+## Summary
+
+| Metric | Value | Flagged |
+|--------|-------|---------|
+| Total Memcpy Events | X | true/false |
+| D2H Transfers | X (Y ms) | true/false |
+| H2D Transfers | X (Y ms) | true/false |
+| Exposed Communication | X% of total | true/false |
+| Compute/Comm Overlap | X% | true/false |
+
+## Memory Copy Analysis
+
+### D2H (Device-to-Host) Transfers
+- **Count**: X transfers
+- **Total Time**: Y ms (Z% of total GPU time)
+- **Flagged**: true/false
+- **Root Cause**: [analysis based on count and time patterns]
+
+### H2D (Host-to-Device) Transfers
+- **Count**: X transfers
+- **Total Time**: Y ms (Z% of total GPU time)
+- **Flagged**: true/false
+- **Root Cause**: [analysis]
+
+## Communication Blocking Analysis
+
+### Communication Blocking Compute
+- **Exposed Communication Time**: X ms (Y% of total)
+- **Total Communication Time**: X ms
+- **Flagged**: true/false
+
+### Compute/Communication Overlap
+- **Overlap Ratio**: X% (target > 70%)
+- **Flagged**: true/false
+
+## Detected Patterns
+
+1. **[Pattern Name]**
+   - Evidence: [metrics]
+   - Recommendation: [specific action]
+
+## Recommendations
+
+### System P<N>: [Highest Priority Multi-Kernel Issue]
+**Insight**: [1 sentence]
+**Action**: [1-2 sentences]
+
+### System P<N+1>: [Next Issue]
+**Insight**: [1 sentence]
+**Action**: [1-2 sentences]
+
+```
+
+**Detailed Analysis block:** Follow [`utils/templates/sub_agent_spec.md`](../utils/templates/sub_agent_spec.md) for the full block schema.
+
+**Impact markers (system tier):** This analyzer emits non-quantifiable impact only. Per § Impact markers (REQUIRED) in the spec, wrap any `**Impact**` line you emit on a P-item card in `<!-- impact-begin kind=p_item low=null mid=null high=null -->` ... `<!-- impact-end -->`. Do not emit `kind=detail_estimate` markers — system-tier findings are not quantifiable.
+
+### Step 7: Validate Findings
+
+Per [`sub_agent_spec.md`](../utils/templates/sub_agent_spec.md) § Validate findings, run:
+
+```bash
+<prefix> python3 -c "
+import sys
+from TraceLens.Agent.Analysis.utils.validation_utils import validate_findings_file
+passed, errors = validate_findings_file(sys.argv[1], sys.argv[2], sys.argv[3])
+if not passed:
+    print('FAIL:')
+    for e in errors:
+        print('  - ' + e)
+    sys.exit(1)
+print('PASS: Findings file is valid')
+" '<output_dir>/system_findings/multi_kernel_findings.md' 'system' '<comparison_scope>'
+```
+
+If validation fails, fix the findings file and re-run. Max 2 retries.
+
+---
+
+## Key Principles
+
+1. **System-level focus** - These are pipeline/framework issues, NOT individual kernel issues
+2. **Provide actionable solutions** - Specific steps, not vague suggestions
+3. **Vendor-agnostic recommendations** - Focus on patterns and solutions
+4. **Priority numbering is sequential** - The orchestrator assigns final P-numbers. Use P<N> placeholders; if CPU/Idle is below threshold, multi-kernel issues start at P1
+5. **Do NOT duplicate category analysis** - This analysis is about cross-cutting patterns, not individual op efficiency
diff --git a/skills/analysis-orchestrator/agents/norm-analyzer.md b/skills/analysis-orchestrator/agents/norm-analyzer.md
new file mode 100644
index 0000000..2d419cf
--- /dev/null
+++ b/skills/analysis-orchestrator/agents/norm-analyzer.md
@@ -0,0 +1,164 @@
+<!--
+Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved.
+
+See LICENSE for license information.
+-->
+
+---
+name: norm-analyzer
+description: Analyze normalization operations (BatchNorm, LayerNorm, GroupNorm, etc.) for memory bandwidth efficiency. Use when orchestrator needs norm category analysis.
+model: claude-opus-4-7-high
+---
+
+# Normalization Analysis Subagent
+
+Analyze normalization operations (BatchNorm, LayerNorm, GroupNorm, InstanceNorm) for memory-bandwidth efficiency. Renders P-items from the per-category findings the analyzer script has already grouped and gated.
+
+---
+
+## Context Passing
+
+When invoked by the orchestrator, you will receive the following context:
+
+**Required context provided by orchestrator:**
+- `output_dir`: Base analysis output directory
+- `prefix`: Command prefix from `<output_dir>/cache/cmd_prefix.txt` — contains a template with `{CMD}` placeholder; substitute `{CMD}` with the actual command
+- `comparison_scope`: `standalone` (default) or `comparative`
+- `cat`: `norm_fwd` or `norm_bwd`
+
+**Input files (pre-computed by orchestrator):**
+1. `<output_dir>/category_data/<cat>_ops.csv` - Filtered normalization operations (includes `call_stack` column for architecture context)
+2. `<output_dir>/metadata/<cat>_metadata.json` - Hardware specs
+
+**Output file you must write:**
+- `<output_dir>/category_findings/<cat>_findings.md`
+
+---
+
+## Error Handling
+
+**If category data files are missing:**
+1. Write a findings file noting: "No normalization operations found in trace"
+2. Return gracefully
+
+**If analysis script fails:**
+1. Write a findings file with Status: ERROR
+2. **CRITICAL: Do NOT manually analyze the raw CSV data**
+3. **CRITICAL: Do NOT provide any bottleneck findings**
+
+---
+
+## Language Guidelines
+
+Use vendor-agnostic terminology:
+- "GPU kernels" not "CUDA kernels"
+- "native normalization kernels" not vendor-specific names
+- Focus on operation semantics, not vendor implementation details
+
+---
+
+## Analysis Workflow
+
+### Step 1: Run Analysis Script
+
+```bash
+<prefix> python3 \
+  TraceLens/Agent/Analysis/category_analyses/norm_analysis.py \
+  --output-dir <output_dir> \
+  --comparison_scope <comparison_scope> \
+  --category <cat>
+```
+
+### Step 2: Read metrics
+
+```bash
+cat <output_dir>/category_data/<cat>_metrics.json
+```
+
+### Step 3: Classify members by name
+
+Each `category_findings[i].members[j].operation` carries a torch op name (e.g. `aten::batch_norm`, `aten::layer_norm`, `aten::group_norm`). Classify each member semantically when describing the finding:
+
+- **BatchNorm**: `batch_norm`, `batchnorm` (per-channel; common in CNNs).
+- **LayerNorm**: `layer_norm`, `layernorm` (per-token; common in Transformers).
+- **GroupNorm**: `group_norm`, `groupnorm` (hybrid; used in diffusion models).
+- **InstanceNorm**: `instance_norm` (per-instance; used in style transfer).
+- **Other**: anything not matching the above.
+
+Different norm variants have different efficiency characteristics due to their kernel implementations.
+
+### Step 4: Render P-items from `category_findings`
+
+**efficiency_percent semantics:**
+- **Standalone:** Treat `efficiency_percent` as **% of roofline**.
+- **Comparative:** Treat `efficiency_percent` as **100 × (trace2 kernel time) / (trace1 kernel time)**.
+
+Read `category_data/<cat>_metrics.json::category_findings`. Per [`utils/templates/sub_agent_spec.md`](../utils/templates/sub_agent_spec.md), emit one P-item per entry in ascending `rank` order; ground **Insight** / **Action** / **Reasoning for Slowdown** in the `members[]` rows (their `operation`, `efficiency_pct`, `library`) using the Action Prose Guidance and Common Patterns below. If `category_findings[]` is empty, emit empty `## Recommendations` and `## Detailed Analysis` sections.
+
+**Markers required:** wrap every `**Impact**` line in `<!-- impact-begin kind=p_item ... --> ... <!-- impact-end -->` and every Detailed Analysis `**Impact estimate:**` two-bullet block in `kind=detail_estimate` markers per spec § Impact markers (REQUIRED), with `low` / `mid` / `high` taken verbatim from `category_findings[i].impact_score{,_low,_high}`.
+
+**Trace observability:** ground every claim in **Reasoning for Slowdown** / **Resolution** in the spec § Trace observability (compute tier) **CAN Infer** rows; for any property in the universal **CANNOT Infer** rows or the category-specific rows in [§ Trace observability (category-specific)](#trace-observability-category-specific) below, use the listed fallback prose instead of speculating.
+
+---
+
+## Action Prose Guidance
+
+Vendor/library/framework-agnostic. Pick the row matching `category_findings[i].bound_type`:
+
+| `bound_type` | Action template |
+|---|---|
+| `memory` | Optimize memory access patterns of the dominant member kernels. For BatchNorm-heavy CNNs, channels-last layout (`model.to(memory_format=torch.channels_last)`) often improves coalescing. For chains of memory-bound ops in the same parent module (norm + activation + residual), defer to the kernel fusion analysis. |
+| `compute` | Rare for normalization; if it occurs, profile the kernel for tile-size and wave-occupancy tuning. |
+
+---
+
+## Common Patterns
+
+### Low efficiency vs. baseline
+- **Symptoms:** Normalization at <20% of peak HBM BW while simple elementwise hits >70%.
+- **Reasoning:** Norm kernel may be suboptimal; the elementwise baseline shows the hardware is healthy.
+- **Algorithmic:** LayerNorm or GroupNorm alternatives may have better kernels.
+- **Kernel:** Profile the norm kernel.
+
+### CNN-heavy workloads
+- **Symptoms:** BatchNorm is 10–50% of compute (ResNet, EfficientNet, etc.).
+- **Algorithmic:** Channels-last memory format.
+- **Kernel:** Optimize the BatchNorm kernel.
+
+### Norm-type variations
+- **BatchNorm**: per-channel.
+- **LayerNorm**: per-token.
+- **GroupNorm**: hybrid.
+- Different implementations may have different efficiency — name the variant in **Identification**.
+
+---
+
+## Trace observability (category-specific)
+
+The universal CANNOT Infer rows in [`sub_agent_spec.md`](../utils/templates/sub_agent_spec.md) always apply. In addition, normalization analysis cannot observe:
+
+| NOT observable | Why | Fallback prose |
+|----------------|-----|----------------|
+| Reduction algorithm | The strategy is internal to the norm kernel | "Reduction strategy not visible — profile the kernel to identify the variant." |
+
+---
+
+## Validate findings
+
+Per [`sub_agent_spec.md`](../utils/templates/sub_agent_spec.md) § Validate findings, run:
+
+```bash
+<prefix> python3 -c "
+import sys
+from TraceLens.Agent.Analysis.utils.validation_utils import validate_findings_file
+passed, errors = validate_findings_file(sys.argv[1], sys.argv[2], sys.argv[3])
+if not passed:
+    print('FAIL:')
+    for e in errors:
+        print('  - ' + e)
+    sys.exit(1)
+print('PASS: Findings file is valid')
+" '<output_dir>/category_findings/<cat>_findings.md' 'compute' '<comparison_scope>'
+```
+
+If validation fails, fix the findings file and re-run. Max 2 retries.
diff --git a/skills/analysis-orchestrator/agents/reduce-analyzer.md b/skills/analysis-orchestrator/agents/reduce-analyzer.md
new file mode 100644
index 0000000..065ffac
--- /dev/null
+++ b/skills/analysis-orchestrator/agents/reduce-analyzer.md
@@ -0,0 +1,151 @@
+<!--
+Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved.
+
+See LICENSE for license information.
+-->
+
+---
+name: reduce-analyzer
+description: Analyze reduce operations for performance bottlenecks and optimization opportunities. Use when orchestrator needs reduce category analysis.
+model: claude-opus-4-7-high
+---
+
+# Reduce Analysis Subagent
+
+Analyze reduce operations (softmax, sum, mean, max, min) for memory-bandwidth efficiency. Renders P-items from the per-category findings the analyzer script has already grouped and gated.
+
+---
+
+## Context Passing
+
+When invoked by the orchestrator, you will receive the following context:
+
+**Required context provided by orchestrator:**
+- `output_dir`: Base analysis output directory
+- `prefix`: Command prefix from `<output_dir>/cache/cmd_prefix.txt` — contains a template with `{CMD}` placeholder; substitute `{CMD}` with the actual command
+- `comparison_scope`: `standalone` (default) or `comparative`
+
+**Input files (pre-computed by orchestrator):**
+1. `<output_dir>/category_data/reduce_ops.csv` - Filtered reduce operations (includes `call_stack` column for architecture context)
+2. `<output_dir>/metadata/reduce_metadata.json` - Hardware specs
+
+**Output file you must write:**
+- `<output_dir>/category_findings/reduce_findings.md`
+
+---
+
+## Error Handling
+
+**If category data files are missing:**
+1. Write a findings file noting: "No reduce operations found in trace"
+2. Return gracefully
+
+**If analysis script fails:**
+1. Write a findings file with Status: ERROR
+2. **CRITICAL: Do NOT manually analyze the raw CSV data**
+3. **CRITICAL: Do NOT provide any bottleneck findings**
+
+---
+
+## Language Guidelines
+
+Use vendor-agnostic terminology:
+- "GPU kernels" not "CUDA kernels"
+- "memory bandwidth" not vendor-specific terms
+- Focus on operation semantics, not vendor implementation details
+
+---
+
+## Analysis Workflow
+
+### Step 1: Run Analysis Script
+
+```bash
+<prefix> python3 \
+  TraceLens/Agent/Analysis/category_analyses/reduce_analysis.py \
+  --output-dir <output_dir> \
+  --comparison_scope <comparison_scope>
+```
+
+### Step 2: Read metrics
+
+```bash
+cat <output_dir>/category_data/reduce_metrics.json
+```
+
+`category_specific.softmax_count` flags attention-pattern reductions; reference it in **Identification** when softmax dominates a finding.
+
+### Step 3: Classify members by name
+
+Each `category_findings[i].members[j].operation` carries a torch op name (e.g. `aten::softmax`, `aten::sum`, `aten::mean`). Classify each member semantically when describing the finding:
+
+- **Softmax**: `softmax` (attention activation; common in Transformer attention layers).
+- **Sum**: `sum` (element summation across dimensions; common in loss / gradient accumulation).
+- **Mean**: `mean`, `avg` (average reduction; used in pooling and normalization).
+- **Max**: `max` (maximum-value reduction; used in argmax and pooling).
+- **Min**: `min` (minimum-value reduction; used in clamping and threshold logic).
+- **Other**: anything not matching the above.
+
+### Step 4: Render P-items from `category_findings`
+
+Per [`utils/templates/sub_agent_spec.md`](../utils/templates/sub_agent_spec.md), emit one P-item per entry in ascending `rank` order; ground **Insight** / **Action** / **Reasoning for Slowdown** in the `members[]` rows (their `operation`, `efficiency_pct`, `library`) using the Action Prose Guidance and Common Patterns below. If `category_findings[]` is empty, emit empty `## Recommendations` and `## Detailed Analysis` sections.
+
+**efficiency_percent semantics:**
+- **Standalone:** Treat `efficiency_percent` as **% of roofline**.
+- **Comparative:** Treat `efficiency_percent` as **100 × (trace2 kernel time) / (trace1 kernel time)**.
+
+**Markers required:** wrap every `**Impact**` line in `<!-- impact-begin kind=p_item ... --> ... <!-- impact-end -->` and every Detailed Analysis `**Impact estimate:**` two-bullet block in `kind=detail_estimate` markers per spec § Impact markers (REQUIRED), with `low` / `mid` / `high` taken verbatim from `category_findings[i].impact_score{,_low,_high}`.
+
+**Trace observability:** ground every claim in **Reasoning for Slowdown** / **Resolution** in the spec § Trace observability (compute tier) **CAN Infer** rows; for any property in the universal **CANNOT Infer** rows or the category-specific rows in [§ Trace observability (category-specific)](#trace-observability-category-specific) below, use the listed fallback prose instead of speculating.
+
+---
+
+## Action Prose Guidance
+
+Vendor/library/framework-agnostic. Pick the row matching `category_findings[i].bound_type`:
+
+| `bound_type` | Action template |
+|---|---|
+| `memory` | Optimize memory access patterns of the dominant member kernels. For softmax members in an attention parent chain, the unfused softmax indicates a fusion opportunity — defer to the kernel fusion analysis. For chains of memory-bound reductions in the same parent module (norm + reduce + scale), defer to the kernel fusion analysis. |
+| `compute` | Rare for reductions; if it occurs, profile the kernel for wave-occupancy tuning. |
+
+---
+
+## Common Patterns
+
+### Standalone reductions
+- **Symptoms:** `sum`, `mean`, `max` operations in isolation (no fusion candidate above).
+- **Reasoning:** Memory-bound reductions should approach peak HBM BW for simple cases.
+- **Kernel:** Investigate kernel-level memory access patterns if achieved bandwidth is well below peak HBM BW.
+
+---
+
+## Trace observability (category-specific)
+
+The universal CANNOT Infer rows in [`sub_agent_spec.md`](../utils/templates/sub_agent_spec.md) always apply. In addition, reduce analysis cannot observe:
+
+| NOT observable | Why | Fallback prose |
+|----------------|-----|----------------|
+| Reduction algorithm (tree vs. block-shuffle vs. atomic) | The strategy is internal to the reduce kernel | "Reduction strategy not visible — profile the kernel to identify the variant." |
+
+---
+
+## Validate findings
+
+Per [`sub_agent_spec.md`](../utils/templates/sub_agent_spec.md) § Validate findings, run:
+
+```bash
+<prefix> python3 -c "
+import sys
+from TraceLens.Agent.Analysis.utils.validation_utils import validate_findings_file
+passed, errors = validate_findings_file(sys.argv[1], sys.argv[2], sys.argv[3])
+if not passed:
+    print('FAIL:')
+    for e in errors:
+        print('  - ' + e)
+    sys.exit(1)
+print('PASS: Findings file is valid')
+" '<output_dir>/category_findings/reduce_findings.md' 'compute' '<comparison_scope>'
+```
+
+If validation fails, fix the findings file and re-run. Max 2 retries.
diff --git a/skills/analysis-orchestrator/agents/sdpa-analyzer.md b/skills/analysis-orchestrator/agents/sdpa-analyzer.md
new file mode 100644
index 0000000..3c2f009
--- /dev/null
+++ b/skills/analysis-orchestrator/agents/sdpa-analyzer.md
@@ -0,0 +1,212 @@
+<!--
+Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved.
+
+See LICENSE for license information.
+-->
+
+---
+name: sdpa-analyzer
+description: Analyze Scaled Dot Product Attention operations (forward and backward) for performance bottlenecks. Supports Flash Attention and Paged Attention (vLLM) analysis. Handles both sdpa_fwd and sdpa_bwd categories.
+model: claude-opus-4-7-high
+---
+
+# SDPA Analysis Subagent
+
+Analyze SDPA (Scaled Dot Product Attention) operations for performance bottlenecks. Supports forward (`sdpa_fwd`) and backward (`sdpa_bwd`), Flash Attention, and Paged Attention (vLLM). Renders P-items from the per-category findings the analyzer script has already grouped and gated.
+
+---
+
+## Context Passing
+
+When invoked by the orchestrator, you will receive the following context:
+
+**Required context provided by orchestrator:**
+- `output_dir`: Base analysis output directory
+- `prefix`: Command prefix from `<output_dir>/cache/cmd_prefix.txt` — contains a template with `{CMD}` placeholder; substitute `{CMD}` with the actual command
+- `comparison_scope`: `standalone` (default) or `comparative`
+- `sdpa`: Either `sdpa_fwd` (forward pass) or `sdpa_bwd` (backward pass)
+
+**Input files (pre-computed by orchestrator):**
+1. `<output_dir>/category_data/<sdpa>_ops.csv` - Filtered SDPA operations (includes `call_stack` column for architecture context)
+2. `<output_dir>/metadata/<sdpa>_metadata.json` - Hardware specs, GPU utilization
+
+**Output file you must write:**
+- `<output_dir>/category_findings/<sdpa>_findings.md`
+
+---
+
+## Error Handling
+
+**If category data files are missing:**
+1. Write a findings file noting: "No SDPA operations found in trace"
+2. Return gracefully
+
+**If analysis script fails:**
+1. Write a findings file with Status: ERROR
+2. **CRITICAL: Do NOT manually analyze the raw CSV data**
+3. **CRITICAL: Do NOT provide any bottleneck findings**
+
+---
+
+## Language Guidelines
+
+Use vendor-agnostic terminology:
+- "GPU kernels" not "CUDA kernels"
+- "optimized attention kernel" not vendor-specific names
+- "DNN primitives" not vendor-specific names
+- Focus on operation semantics, not vendor implementation details
+
+---
+
+## Analysis Workflow
+
+### Step 1: Run Analysis Script
+
+```bash
+<prefix> python3 \
+  TraceLens/Agent/Analysis/category_analyses/sdpa_analysis.py \
+  --output-dir <output_dir> \
+  --category <sdpa> \
+  --comparison_scope <comparison_scope>
+```
+
+### Step 2: Read Metrics
+
+```bash
+cat <output_dir>/category_data/<sdpa>_metrics.json
+```
+
+Check `category_specific` for the attention implementation:
+
+| Field | Meaning |
+|-------|---------|
+| `flash_attention_detected` | Standard Flash Attention (PyTorch SDPA). |
+| `paged_attention_detected` | vLLM Paged Attention. Operation names contain `unified_attention` or `paged`; per-op `classification.kernel_breakdown` (typical components: `reshape_and_cache`, `_fwd_kernel`, `kernel_paged_attention_2d`) and `classification.workload_profile` (`n_q`, `n_kv`, `sum_ctx_tokens`, `sum_gen_tokens`, `ctx_ratio`, `attention_pattern`, `gqa_ratio`) qualify the workload. |
+| Neither | Unfused attention (typically a major opportunity to migrate to Flash Attention). |
+
+Reference the detected implementation in the **Identification** prose of every finding. For Paged Attention, also reference the kernel-breakdown component that dominates and the workload profile (prefill-heavy when `ctx_ratio > 0.8`, decode-heavy when `ctx_ratio < 0.2`).
+
+### Step 3: Render P-items from `category_findings`
+
+Read `category_data/<sdpa>_metrics.json::category_findings`. Per [`utils/templates/sub_agent_spec.md`](../utils/templates/sub_agent_spec.md), emit one P-item per entry in ascending `rank` order; ground **Insight** / **Action** / **Reasoning for Slowdown** in the `members[]` rows (their `operation`, `efficiency_pct`, `library`) and the per-op `classification.kernel_breakdown` / `classification.workload_profile` (Paged) using the Action Prose Guidance, Expected Efficiency, and Common Patterns below. For Paged Attention, extend the **Data:** operations table with kernel-breakdown component, workload type, and attention pattern columns when populated. If `category_findings[]` is empty, emit empty `## Recommendations` and `## Detailed Analysis` sections.
+
+**efficiency_percent semantics:**
+- **Standalone:** Treat `efficiency_percent` as **% of roofline**.
+- **Comparative:** Treat `efficiency_percent` as **100 × (trace2 kernel time) / (trace1 kernel time)**.
+
+**Markers required:** wrap every `**Impact**` line in `<!-- impact-begin kind=p_item ... --> ... <!-- impact-end -->` and every Detailed Analysis `**Impact estimate:**` two-bullet block in `kind=detail_estimate` markers per spec § Impact markers (REQUIRED), with `low` / `mid` / `high` taken verbatim from `category_findings[i].impact_score{,_low,_high}`.
+
+**Trace observability:** ground every claim in **Reasoning for Slowdown** / **Resolution** in the spec § Trace observability (compute tier) **CAN Infer** rows; for any property in the universal **CANNOT Infer** rows or the category-specific rows in [§ Trace observability (category-specific)](#trace-observability-category-specific) below, use the listed fallback prose instead of speculating.
+
+---
+
+## Action Prose Guidance
+
+Vendor/library/framework-agnostic. Pick the row matching `category_findings[i].bound_type` and the attention implementation. **Never recommend "fuse the SDPA kernel" — SDPA backends are already fused; upstream/downstream fusion is owned by the kernel-fusion analysis.**
+
+| `bound_type` | Attention type | Action template |
+|---|---|---|
+| `compute` | Flash / Standard | Profile the dominant member kernels for tile-size and wave-occupancy tuning. If unfused (no Flash detected), migrating to Flash Attention is the primary algorithmic lever. |
+| `memory` | Flash / Standard | Optimize memory access patterns of the dominant member kernels. Short sequences (N < 1024) naturally show lower efficiency due to memory-overhead dominance — note that in **Identification** before recommending tuning. If unfused, migrating to Flash Attention is the primary algorithmic lever. |
+| `compute` | Paged | Profile the dominant kernel-breakdown component (typically `_fwd_kernel` for prefill-heavy, `kernel_paged_attention_2d` for decode-heavy) for tile-size tuning. For prefill-heavy workloads, enable chunked prefill to bound per-step latency. For GQA (`gqa_ratio > 1`), confirm the kernel handles head grouping efficiently. |
+| `memory` | Paged | Optimize memory access patterns of the dominant kernel-breakdown component. For decode-heavy workloads, increase decode batch size to amortize KV-cache reads and consider speculative decoding. If `reshape_and_cache` exceeds ~10% of operation time, tune KV cache `block_size` (test 16, 32, 64). |
+
+---
+
+## Expected efficiency by sequence length (Standard / Flash Attention)
+
+Short sequences naturally show lower efficiency — do NOT call low efficiency a bottleneck if it falls within the expected band for `N`.
+
+| Sequence length `N` | Expected efficiency |
+|---------------------|---------------------|
+| `N < 512` | 5–15% (memory overhead dominates) |
+| `N = 1024` | 20–40% |
+| `N = 2048` | 40–60% |
+| `N > 4096` | 50–70% |
+
+---
+
+## Common Patterns
+
+### Standard / Flash Attention
+
+#### Unfused attention
+- **Symptoms:** Multiple ops (`softmax`, `bmm`, `mul`, `copy_`) appear together, no Flash kernel detected.
+- **Algorithmic:** Migrate to Flash Attention.
+- **Note:** Fusion of unfused attention is handled by the kernel fusion module.
+
+#### Flash Attention already used
+- **Reasoning:** Confirm efficiency falls in the Expected Efficiency band for the sequence length; if well below, profile the kernel.
+
+#### Contiguous-copy overhead in SDPA wrapper
+- **Symptoms:** Multiple `aten::copy_` ops with the same shape as Q/K/V immediately before/after the Flash Attention call (3 copies for Q/K/V before, 1 for output after).
+- **Cause:** Framework SDPA wrapper unconditionally calls `.contiguous()` on Q/K/V/output even when the Flash backend supports strided tensors.
+- **Algorithmic:** If the Flash backend supports strided inputs, remove the `.contiguous()` calls from the SDPA wrapper.
+
+### Backward pass (`sdpa_bwd`)
+
+#### Flash Attention backward
+- **Op name:** `flash_attn::_flash_attn_backward`.
+- **Reasoning:** Generally lower efficiency than forward (recomputation of attention weights, more memory bandwidth).
+- **Kernel:** Profile backward kernel for tile/block tuning.
+
+### Paged Attention (vLLM)
+
+#### Decode-heavy workload
+- **Symptoms:** High `kernel_paged_attention_2d` %, low `_fwd_kernel` %, `ctx_ratio < 0.2`.
+- **Algorithmic:** Increase batch size; speculative decoding.
+- **Kernel:** Optimize paged attention kernel if well below the resolved memory roofline.
+
+#### Prefill bottleneck
+- **Symptoms:** High `_fwd_kernel` %, large `sum_ctx_tokens`, `ctx_ratio > 0.8`.
+- **Algorithmic:** Enable chunked prefill; reduce `max_model_len` if memory-constrained.
+- **Kernel:** Profile `_fwd_kernel` for tile-size optimization.
+
+#### KV-cache overhead
+- **Symptoms:** `reshape_and_cache` > 10% of operation time.
+- **Algorithmic:** Tune KV cache `block_size` (test 16, 32, 64).
+- **Kernel:** Check memory access patterns in the reshape kernel.
+
+#### GQA (Grouped Query Attention)
+- **Detection:** `gqa_ratio > 1` (e.g. 8:1 means 8 query heads per KV head).
+- **Reasoning:** GQA reduces KV-cache memory but may slightly lower kernel efficiency vs. MHA — note this in **Identification** before recommending tuning.
+
+---
+
+## Trace observability (category-specific)
+
+The universal CANNOT Infer rows in [`sub_agent_spec.md`](../utils/templates/sub_agent_spec.md) always apply. In addition, SDPA analysis cannot observe:
+
+**Flash / Standard Attention:**
+
+| NOT observable | Why | Fallback prose |
+|----------------|-----|----------------|
+| Internal block / tile size of the Flash kernel | Tile selection is internal to the Flash backend | "Flash tile size not visible — profile the kernel for tile-size tuning." |
+
+**Paged Attention (vLLM):**
+
+| NOT observable | Why | Fallback prose |
+|----------------|-----|----------------|
+| Per-request KV-cache hit rate | Cache hits/misses are not surfaced as kernel events | "Per-request KV-cache hit rate not visible from trace data." |
+
+---
+
+## Validate findings
+
+Per [`sub_agent_spec.md`](../utils/templates/sub_agent_spec.md) § Validate findings, run:
+
+```bash
+<prefix> python3 -c "
+import sys
+from TraceLens.Agent.Analysis.utils.validation_utils import validate_findings_file
+passed, errors = validate_findings_file(sys.argv[1], sys.argv[2], sys.argv[3])
+if not passed:
+    print('FAIL:')
+    for e in errors:
+        print('  - ' + e)
+    sys.exit(1)
+print('PASS: Findings file is valid')
+" '<output_dir>/category_findings/<sdpa>_findings.md' 'compute' '<comparison_scope>'
+```
+
+If validation fails, fix the findings file and re-run. Max 2 retries.
diff --git a/skills/analysis-orchestrator/agents/triton-analyzer.md b/skills/analysis-orchestrator/agents/triton-analyzer.md
new file mode 100644
index 0000000..9486650
--- /dev/null
+++ b/skills/analysis-orchestrator/agents/triton-analyzer.md
@@ -0,0 +1,157 @@
+<!--
+Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved.
+
+See LICENSE for license information.
+-->
+
+---
+name: triton-analyzer
+description: Analyze Triton (torch.compile fused) kernels for roofline efficiency. Use when orchestrator needs Triton category analysis.
+model: claude-opus-4-7-high
+---
+
+# Triton Analysis Subagent
+
+Analyze Triton (torch.compile / inductor) fused GPU kernels for roofline efficiency. Renders P-items from the per-category findings the analyzer script has already grouped and gated.
+
+---
+
+## Context Passing
+
+When invoked by the orchestrator, you will receive the following context:
+
+**Required context provided by orchestrator:**
+- `output_dir`: Base analysis output directory
+- `prefix`: Command prefix from `<output_dir>/cache/cmd_prefix.txt` — contains a template with `{CMD}` placeholder; substitute `{CMD}` with the actual command
+- `comparison_scope`: `standalone` (default) or `comparative`
+
+**Input files (pre-computed by orchestrator):**
+1. `<output_dir>/category_data/triton_ops.csv` - Filtered Triton operations (includes `call_stack` column for architecture context)
+2. `<output_dir>/metadata/triton_metadata.json` - Hardware specs
+
+**Output file you must write:**
+- `<output_dir>/category_findings/triton_findings.md`
+
+---
+
+## Error Handling
+
+**If category data files are missing:**
+1. Write a findings file noting: "No Triton operations found in trace"
+2. Return gracefully
+
+**If analysis script fails:**
+1. Write a findings file with Status: ERROR
+2. **CRITICAL: Do NOT manually analyze the raw CSV data**
+3. **CRITICAL: Do NOT provide any bottleneck findings**
+
+---
+
+## Language Guidelines
+
+Use vendor-agnostic terminology:
+- "GPU kernels" not "CUDA kernels"
+- "Triton fused kernels" or "torch.compile fused kernels" for the category
+- Focus on operation semantics, not vendor implementation details
+
+---
+
+## Analysis Workflow
+
+### Step 1: Run Analysis Script
+
+```bash
+<prefix> python3 \
+  TraceLens/Agent/Analysis/category_analyses/triton_analysis.py \
+  --output-dir <output_dir> \
+  --comparison_scope <comparison_scope>
+```
+
+### Step 2: Read metrics
+
+```bash
+cat <output_dir>/category_data/triton_metrics.json
+```
+
+`category_specific.pointwise_count`, `reduction_count`, and `persistent_count` indicate the inductor kernel-type mix; reference them in **Identification** when one type dominates a finding.
+
+### Step 3: Classify members by name
+
+Each `category_findings[i].members[j].operation` carries a torch.compile kernel name (e.g. `triton_poi_fused_add_gelu_1`, `triton_red_fused_sum_36`). Classify each member by its inductor prefix when describing the finding:
+
+- **Pointwise**: `triton_poi_` (elementwise fusions — add, mul, gelu, sigmoid, etc.).
+- **Reduction**: `triton_red_` (reduction fusions — sum, mean, norm backward, etc.).
+- **Persistent**: `triton_per_` (persistent-reduction fusions — layer_norm, etc.).
+- **Other**: anything not matching the above.
+
+The fused ATen ops are encoded in the kernel name after the prefix (e.g. `triton_red_fused_add_native_layer_norm_backward_20` fuses `add` + `native_layer_norm_backward`). Use them to describe the dominant computation in prose.
+
+### Step 4: Render P-items from `category_findings`
+
+**efficiency_percent semantics:**
+- **Standalone:** Treat `efficiency_percent` as **% of roofline**.
+- **Comparative:** Treat `efficiency_percent` as **100 × (trace2 kernel time) / (trace1 kernel time)**.
+
+Per [`utils/templates/sub_agent_spec.md`](../utils/templates/sub_agent_spec.md), emit one P-item per entry in ascending `rank` order; ground **Insight** / **Action** / **Reasoning for Slowdown** in the `members[]` rows (their `operation`, `efficiency_pct`, `time_ms`, `library`) using the Action Prose Guidance and Common Patterns below. If `category_findings[]` is empty, emit empty `## Recommendations` and `## Detailed Analysis` sections.
+
+**Markers required:** wrap every `**Impact**` line in `<!-- impact-begin kind=p_item ... --> ... <!-- impact-end -->` and every Detailed Analysis `**Impact estimate:**` two-bullet block in `kind=detail_estimate` markers per spec § Impact markers (REQUIRED), with `low` / `mid` / `high` taken verbatim from `category_findings[i].impact_score{,_low,_high}`.
+
+**Trace observability:** ground every claim in **Reasoning for Slowdown** / **Resolution** in the spec § Trace observability (compute tier) **CAN Infer** rows; for any property in the universal **CANNOT Infer** rows or the category-specific rows in [§ Trace observability (category-specific)](#trace-observability-category-specific) below, use the listed fallback prose instead of speculating.
+
+---
+
+## Action Prose Guidance
+
+Vendor/library/framework-agnostic. Pick the row matching `category_findings[i].bound_type`:
+
+| `bound_type` | Action template |
+|---|---|
+| `memory` | Optimize memory access patterns of the dominant member kernels. For chains of memory-bound fused ops in the same parent module, defer to the kernel fusion analysis. |
+| `compute` | Rare for fused Triton kernels; if it occurs, profile the kernel for tile-size and wave-occupancy tuning. |
+
+---
+
+## Common Patterns
+
+### Low-efficiency fused kernels (<30% roofline)
+- **Symptoms:** Fused kernels with norm or reduction ops at <30% of peak HBM BW.
+- **Reasoning:** Fused norm+backward or small-reduction kernels can have suboptimal memory access patterns.
+- **Kernel:** Profile the fused kernel; consider dedicated kernel libraries for the dominant op.
+
+### Many small fused kernels
+- **Symptoms:** High aggregate count of small Triton kernels with low individual time.
+- **Reasoning:** torch.compile may generate many narrow fusions instead of one broad fusion.
+- **Kernel:** Review compilation strategy for broader fusion scope.
+
+---
+
+## Trace observability (category-specific)
+
+The universal CANNOT Infer rows in [`sub_agent_spec.md`](../utils/templates/sub_agent_spec.md) always apply. In addition, Triton fused-kernel analysis cannot observe:
+
+| NOT observable | Why | Fallback prose |
+|----------------|-----|----------------|
+| Per-sub-op breakdown within a fused kernel | Trace only captures the fused kernel as a single event | "Individual sub-op timings within the fused kernel are not separable from the trace." |
+| Torch.compile fusion strategy | The inductor fusion decisions are not recorded in the trace | "Fusion strategy not visible — review torch.compile settings if kernels appear under-fused." |
+
+---
+
+## Validate findings
+
+Per [`sub_agent_spec.md`](../utils/templates/sub_agent_spec.md) § Validate findings, run:
+
+```bash
+<prefix> python3 -c "
+import sys
+from TraceLens.Agent.Analysis.utils.validation_utils import validate_findings_file
+passed, errors = validate_findings_file(sys.argv[1], sys.argv[2], sys.argv[3])
+if not passed:
+    print('FAIL:')
+    for e in errors:
+        print('  - ' + e)
+    sys.exit(1)
+print('PASS: Findings file is valid')
+" '<output_dir>/category_findings/triton_findings.md' 'compute' '<comparison_scope>'
+```
+
+If validation fails, fix the findings file and re-run. Max 2 retries.
diff --git a/skills/analysis-orchestrator/reference.md b/skills/analysis-orchestrator/reference.md
new file mode 100644
index 0000000..dbb0e91
--- /dev/null
+++ b/skills/analysis-orchestrator/reference.md
@@ -0,0 +1,626 @@
+<!--
+Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved.
+
+See LICENSE for license information.
+-->
+
+# Analysis orchestrator — reference
+
+This document is the detailed specification for the TraceLens **analysis-orchestrator** skill ([SKILL.md](SKILL.md)). Read it when executing the workflow: step-by-step user prompts, CLI commands, subagent contracts, validation, report assembly, and trace diagnostics.
+
+## Workflow overview
+
+The orchestrator runs a staged pipeline (Steps 0–11): collect inputs and environment prefix, generate perf reports, prepare category data via `orchestrator_prepare.py`, run system-level and compute-kernel subagents in parallel, aggregate and validate findings, identify the model, render plots when no extension is present, and write `analysis.md` via remote `tee` heredocs. Only Steps 6, 7, and 9 delegate to Task subagents; all other steps run in the main agent.
+
+---
+
+## Language Guidelines
+
+Use vendor-agnostic terminology throughout, such as GPU kernels, collective communication, vendor GEMM library, DNN primitives, and GPU graph. Focus on operation semantics, not vendor implementation details.
+
+**Exception:** When quoting kernel names from traces, it's acceptable to include the actual name for identification.
+ 
+---
+
+## Workflow Steps
+
+```
+0. Query User Inputs (Platform, Trace Path(s), Analysis Mode, Environment Setup)
+1. Generate Performance Report (branches on analysis mode: training vs inference, then on comparison scope)
+2-5. Prepare Category Data (GPU Util, Top Ops, Tree Data, Multi-Kernel Data, Category Filtering)
+6. System-Level Analysis (PARALLEL, CPU/Idle + Kernel Fusion + Multi-Kernel) → system_findings/
+7. Invoke Compute Kernel Subagents (PARALLEL, read category_findings[] from _metrics.json) → category_findings/
+   7.5. Aggregate per-category category_findings[] → priority_data.json::findings[] (globally sorted)
+8. Validate Subagent Outputs (system_findings/ + category_findings/)
+9. Prepare Report Data (load_findings) + Model Identification (subagent) → metadata/model_info.json
+10. Render performance PNG IF agent_extension.py is absent
+11. Generate Final Report (composable System + Compute sections), validate it,
+    optionally invoke agent_extension.py (when present), then embed the PNG into the report.
+```
+
+**Subagent usage:** Only invoke Task subagents in steps that explicitly say "subagent" (Steps 6, 7, 9). All other steps (including Step 7.5) must be performed directly by the orchestrator using the command prefix.
+
+---
+
+## Step 0: Query User Inputs
+
+**When this skill is invoked, immediately ask the user for:**
+
+### Required Information:
+
+1. **Comparison scope** → `<comparison_scope>`
+   - Set from the user’s intent **before** deep-diving on paths:
+     - **`comparative`** if the skill was triggered by **“comparative analysis”**, **“compare two traces”**, or the user supplies **two** trace paths / explicitly asks to compare trace A vs B.
+     - **`standalone`** otherwise (including triggers **“standalone analysis”**, **“analyze trace standalone”**, single trace only).
+
+2. **Trace File Path(s)**
+   - **`standalone`:** **Trace File Path** → `<trace_path>`
+     - Ask: "Please provide the full path to your PyTorch trace file (.json or .json.gz)"
+   - **`comparative`:** ask for both:
+     - **Primary trace (trace1)** → `<trace_path>`
+     - **Comparison trace (trace2)** → `<trace2_path>`
+     - Ask: "Please provide the full path to your primary trace file and your comparison trace file (.json or .json.gz)"
+
+3. **Platform** → `<platform>`
+   **`standalone`**: Ask: "Which platform are you analyzing?"
+   **`comparative`**: Ask: "Which platform is baseline trace (trace1)?"
+   - Options:
+     1. **MI300X**
+     2. **MI325X**
+     3. **MI350X**
+     4. **MI355X**
+     5. **MI455X**
+   **`comparative`:** Ask: "Which platform is target trace (trace2)?" Assign `<platform2>` (`<platform2>` does not need to be one of the platform options)
+
+4. **Analysis Mode** → `<analysis_mode>`
+   - If the user's prompt explicitly specifies an analysis mode or mentions inference/vLLM/SGLang, use that. Otherwise, default to `default` without asking.
+   - Options:
+     1. **Default (training and non-vLLM/SGLang eager inference)** (`<analysis_mode>` = `default`) — uses `TraceLens_generate_perf_report_pytorch`
+     2. **Inference analysis (vLLM/SGLang)** (`<analysis_mode>` = `inference`) — uses `TraceLens_generate_perf_report_pytorch_inference`
+   - If **Inference (vLLM/SGLang)** is selected, ask **Execution Mode** → `<inference_exec_mode>`:
+     1. **Eager mode** (`<inference_exec_mode>` = `eager`) — only the trace file is needed
+     2. **Graph replay + capture** (`<inference_exec_mode>` = `graph_capture`) — also requires a capture folder path
+   - If **Graph replay + capture**, ask for **Capture Folder Path** → `<capture_folder_path>`:
+     - Ask: "Please provide the full path to the graph capture traces folder"
+   - **Unsupported combination:** If `<inference_exec_mode>` = `graph_capture` **and** `<comparison_scope>` = `comparative`, stop immediately. Inform the user: "Graph replay + capture mode is not yet supported for comparative analysis. Please provide eager mode traces instead." Do not misinterpret as two standalone analyses. Do **not** proceed to Step 1 or beyond.
+
+5. **Environment Setup**
+   - Ask: "Are you running locally or on a cluster?"
+     - If **local**: No further environment questions — prefix is blank (commands run directly).
+     - If **cluster**:
+       - Ask "Which node should we use?" → `<node>`
+       - Ask "Are you working in a containerized environment (e.g. Docker)?" → if yes, ask for container name → `<container>`
+       - Ask "Are you using a virtual environment?" → if yes, ask for venv path → `<venv_path>`
+
+6. **Output Directory** (Optional)
+   - Ask: "Where should we save analysis results? (Press Enter for default: <trace_directory>/analysis_output)"
+   - Default: Same directory as trace file, in `analysis_output/` subdirectory
+
+7. **Extension File** (Optional) → `<extension_file>`
+   - Ask: "Do you have a TraceLens extension file to apply? Press Enter to skip."
+   - If provided, resolve to an absolute path and assign to `<extension_file>`.
+   - If skipped, set `<extension_file>` to empty (no `--extension_file` flag is added to any command).
+
+### Build and Cache Command Prefix
+
+After collecting inputs, build a command template and save it to `<output_dir>/cache/cmd_prefix.txt`. Create the directory with `mkdir -p <output_dir>/cache`.
+
+The template uses `{CMD}` as a placeholder for the actual command.
+
+**Cluster:** Before building the prefix, locate the TraceLens project root on the remote environment.
+
+Run the following command (adjust for container if applicable):
+
+```bash
+# Without container:
+ssh <node> "find / -maxdepth 5 -type d -name 'TraceLens' 2>/dev/null | head -5"
+
+# With container:
+ssh <node> "docker exec <container> bash -c 'find / -maxdepth 5 -type d -name TraceLens 2>/dev/null | head -5'"
+```
+
+Pick the result containing `Agent/` and strip the trailing `/TraceLens` to get `<tracelens_dir>`.
+
+Build the cluster prefix using this lookup:
+
+| Container | Venv | Template |
+|-----------|------|----------|
+| No | No | `ssh <node> "cd <tracelens_dir> && {CMD}"` |
+| Yes | No | `ssh <node> "docker exec <container> bash -c 'cd <tracelens_dir> && {CMD}'"` |
+| No | Yes | `ssh <node> "bash -c 'source <venv_path>/bin/activate && cd <tracelens_dir> && {CMD}'"` |
+
+Write the resolved template to `<output_dir>/cache/cmd_prefix.txt`. Then validate it works:
+
+```bash
+<prefix> python3 -c "import TraceLens; print('PREFIX_OK')"
+```
+
+If this fails, inform the user with `[DIAG:pipeline:PREFIX_FAIL]` and check that `<tracelens_dir>` is the **parent** of the `TraceLens/` package directory that contains `Agent/` (not that package directory itself), verify the container/venv is accessible, rebuild, and retry. Do NOT proceed to Step 1 until validation passes.
+
+### Command Execution Pattern
+
+**Before executing any command**, read `<output_dir>/cache/cmd_prefix.txt`. It contains a template with a `{CMD}` placeholder. Substitute `{CMD}` with the actual command. All commands below use `<prefix>` to represent this resolved template.
+
+---
+
+## Step 1: Generate Performance Report
+
+Use **`<analysis_mode>`** to determine which CLI tool to run and then **`<comparison_scope>`** to determine arguments.
+
+For all of the scripts below, look at the environment variable `TL_EXTENSION` to recursively search for a file called `<platform>.json`. Do not look for `<platform2>.json`; it is not needed.
+If it is not found, also look in `TraceLens/Agent/Analysis/utils/arch/<platform>.json`.
+Use `<platform_file>` to represent the location of this file.
+
+**CLI call count:**
+- **`standalone`**: one TraceLens CLI call (for `<trace_path>`)
+- **`comparative`**: one TraceLens CLI call per trace (for `<trace_path>` and `<trace2_path>`)
+
+All commands below append `<suffix_1>` and `<suffix_2>`, resolved by `<comparison_scope>`:
+
+**`<suffix_1>`** — output paths:
+
+| scope | value |
+|-------|-------|
+| `standalone` | `--output_xlsx_path <output_dir>/perf_report.xlsx --output_csvs_dir <output_dir>/perf_report_csvs` |
+| `comparative` trace1 | `--output_xlsx_path <output_dir>/perf_report_trace1.xlsx --output_csvs_dir <output_dir>/perf_report_trace1_csvs` |
+| `comparative` trace2 | `--profile_json_path <trace2_path> --output_xlsx_path <output_dir>/perf_report_trace2.xlsx --output_csvs_dir <output_dir>/perf_report_trace2_csvs` |
+
+**`<suffix_2>`** — extension flags:
+
+| scope | value |
+|-------|-------|
+| `standalone` | none |
+| `comparative` trace1 | `--comparison_json_path <trace2_path>` |
+| `comparative` trace2 | none |
+
+**`<suffix_ext>`** — user extension file:
+
+| condition | value |
+|-----------|-------|
+| `<extension_file>` provided | `--extension_file <extension_file>` |
+| not provided | none |
+
+---
+
+**Default (training and non-vLLM/SGLang eager inference)** (`<analysis_mode>` = `default`):
+
+```bash
+<prefix> TraceLens_generate_perf_report_pytorch \
+  --profile_json_path <trace_path> \
+  --gpu_arch_json_path <platform_file> \
+  --enable_pseudo_ops \
+  --group_by_num_kernels \
+  --include_call_stack \
+  <suffix_1> \
+  <suffix_2> \
+  <suffix_ext>
+```
+
+**Inference eager mode** (`<analysis_mode>` = `inference`, `<inference_exec_mode>` = `eager`):
+
+```bash
+<prefix> TraceLens_generate_perf_report_pytorch_inference \
+  --profile_json_path <trace_path> \
+  --gpu_arch_json_path <platform_file> \
+  --group_by_parent_module \
+  --enable_pseudo_ops \
+  --group_by_num_kernels \
+  --include_call_stack \
+  <suffix_1> \
+  <suffix_2> \
+  <suffix_ext>
+```
+
+**Inference graph replay + capture mode** (`<analysis_mode>` = `inference`, `<inference_exec_mode>` = `graph_capture`):
+
+```bash
+<prefix> TraceLens_generate_perf_report_pytorch_inference \
+  --profile_json_path <trace_path> \
+  --capture_folder <capture_folder_path> \
+  --gpu_arch_json_path <platform_file> \
+  --group_by_parent_module \
+  --enable_pseudo_ops \
+  --group_by_num_kernels \
+  --include_call_stack \
+  <suffix_1> \
+  <suffix_2> \
+  <suffix_ext>
+```
+
+---
+
+## Steps 2-5: Prepare Category Data
+
+Execute the TraceLens Agentic Mode orchestrator preparation script:
+
+```bash
+<prefix> python3 \
+  TraceLens/Agent/Analysis/utils/orchestrator_prepare.py \
+  --trace-path <trace_path> \
+  --platform <platform> \
+  --output-dir <output_dir> \
+  --comparison-scope <comparison_scope>
+```
+
+This script performs:
+- **Step 2:** Assess GPU utilization (computation, idle, communication times)
+- **Step 3:** Identify top 10 operations by GPU time
+- **Step 4:** Pre-compute tree data for bottleneck operations (load trace ONCE)
+- **Step 4.5:** Pre-compute multi-kernel issue data (memcpy by direction, NCCL events, overlap metrics)
+- **Step 5:** Filter and export category-specific data
+
+**Outputs:**
+- `category_data/<category>_ops.csv` - Filtered operations per category
+- `metadata/<category>_metadata.json` - Platform specs, GPU utilization, config
+- `category_data/multi_kernel_data.json` - Memcpy/NCCL/overlap pre-computed data
+- `category_data/category_manifest.json` - Workflow metadata with categories (includes `tier` field: `system` or `compute_kernel`)
+- `system_findings/` - Directory for system-level analysis outputs
+- `category_findings/` - Directory for compute kernel analysis outputs
+
+---
+
+## Step 6: System-Level Analysis (PARALLEL)
+
+System-level analysis examines issues that affect the GPU pipeline as a whole -- idle time, memory transfer patterns, and communication/compute overlap. These are **not** about individual kernel efficiency.
+
+**Output directory:** `system_findings/`
+
+### 6.1 Read Manifest and Identify System-Level Subagents
+
+```bash
+<prefix> python3 -c \"
+import sys
+from TraceLens.Agent.Analysis.utils.report_utils import load_manifest_categories
+load_manifest_categories(sys.argv[1])
+\" '<output_dir>'
+```
+
+This prints `system_categories` and `compute_categories` lists. Use `system_categories` for Step 6 and `compute_categories` for Step 7.
+
+### 6.2 Launch System-Level Subagents in PARALLEL
+
+Launch system-level sub-agents simultaneously using the Task tool. Do NOT wait between invocations.
+
+**System-Level Agent File Map:**
+
+**Base path:** `TraceLens/Agent/Analysis/skills/analysis-orchestrator/agents/`
+
+| Category | Agent file |
+|----------|-----------|
+| `cpu_idle` | `cpu-idle-analyzer.md` |
+| `multi_kernel` | `multi-kernel-analyzer.md` |
+| `kernel_fusion` | `kernel-fusion-analyzer.md` |
+
+**Invocation conditions:**
+- **CPU/Idle**: Read `category_data/category_manifest.json` and check `gpu_utilization.idle_time_percent`. Only invoke the subagent if `idle_time_percent > 15`. Skip otherwise -- the deterministic script already captured the factual data.
+- **Multi-Kernel**: `multi_kernel` category exists in manifest OR `gpu_util['exposed_comm_time_percent'] > 0` OR `gpu_util['exposed_memcpy_time_percent'] > 0`
+- **Kernel Fusion**: `kernel_fusion` category exists in manifest
+
+**Task prompt structure for each system-level subagent:**
+
+The subagent reads its own agent file — the orchestrator does NOT read or paste agent file contents.
+
+```
+Read and follow the FULL instructions in:
+  TraceLens/Agent/Analysis/skills/analysis-orchestrator/agents/<agent-file>.md
+
+**Execution Context:**
+- Comparison scope: `<comparison_scope>`
+- Output directory: <output_dir>
+- Command prefix: read `<output_dir>/cache/cmd_prefix.txt` — contains a template
+  with `{CMD}` placeholder; substitute `{CMD}` with the actual command
+- Input files: <list from agent file's "Input files" section>
+- Output file: <from agent file's "Output file" section>
+
+Execute every step in the agent file. Return "DONE" when complete.
+```
+
+**CRITICAL:** The orchestrator does NOT read agent files or run analysis scripts. Each sub-agent is responsible for:
+1. Reading its own agent `.md` file
+2. Running its Python script using the command prefix
+3. Reading the metrics JSON output
+4. Identifying issues and generating findings
+
+### 6.3 Wait for System-Level Subagents to Complete
+
+All launched system-level subagents must complete before proceeding to Step 6.4.
+Each writes findings to `system_findings/<name>_findings.md`.
+
+### 6.4 Verify System Outputs and Retry Failures (up to 1 retry per subagent)
+
+After all system-level subagents complete:
+
+1. For each expected system category from the manifest, check:
+   - Does `system_findings/<category>_findings.md` exist?
+   - If it exists, does it contain "Status: ERROR"?
+2. Collect a list of **failed** categories (missing file OR Status: ERROR).
+3. **Retry each failed category exactly once** by re-launching its subagent with the same prompt from Step 6.2. Wait for all retries to complete before proceeding.
+4. After retries, re-check outputs. Any category that still fails is excluded from aggregation.
+5. **CRITICAL: Do NOT attempt manual analysis of failed system checks — only automated subagent retry is allowed.**
+
+---
+
+## Step 7: Invoke Compute Kernel Subagents (PARALLEL)
+
+Compute kernel analysis examines individual operation category efficiency.
+
+**Output directory:** `category_findings/`
+
+### 7.1 Read Manifest and Identify Compute Kernel Categories
+
+Use `compute_categories` from the `load_manifest_categories()` call in Step 6.1.
+
+### 7.2 Launch Compute Kernel Subagents in PARALLEL
+
+For each entry in `compute_categories` (loaded in Step 6.1), resolve `{agent_file}` as `{entry.skill}.md` and launch a subagent with agent file `TraceLens/Agent/Analysis/skills/analysis-orchestrator/agents/{agent_file}`. Fall back to `generic-op-analyzer.md` if the file is absent.
+
+Launch all subagents simultaneously in a single parallel batch.
+
+---
+
+#### Shared Compute Kernel Preamble
+
+Include this block in every compute kernel subagent prompt:
+
+<Shared Compute Kernel Preamble>:
+```
+comparison_scope: {comparison_scope}
+
+**CRITICAL - READ FIRST:**
+- Use GPU kernel time (not CPU duration) for all bottleneck analysis
+- `efficiency_percent` semantics differ by mode:
+  - **Standalone:** % of roofline. Flag > 100% as "[ANOMALY] - verify measurement".
+  - **Comparative:** `100 × (Trace 2 kernel time) / (Trace 1 kernel time)`.
+    - **< 100%** → Trace 1 is slower than Trace 2. **This is an optimization opportunity — flag it.**
+    - **> 100%** → Trace 2 is slower than Trace 1. **NOT an anomaly; no Trace-1 optimization needed.**
+
+**CRITICAL CONSTRAINTS:**
+1. **Standalone:** Any efficiency > 100% → `[ANOMALY] - verify measurement`. **Comparative:** efficiency > 100% means Trace 2 is slower — NOT an anomaly; efficiency < 100% means Trace 1 is slower — flag as optimization opportunity.
+2. Status must be SUCCESS or ERROR; times in ms; efficiencies as percentages
+3. Operations with `fusion_flagged: true` in the metrics JSON are already covered by
+   a high-confidence kernel fusion candidate — do NOT flag them as bottlenecks or write
+   kernel_tuning recommendations. The analysis scripts already exclude them from `impact_estimates`.
+
+**Execution Context:**
+- Output directory: <output_dir>
+- Command prefix: read `<output_dir>/cache/cmd_prefix.txt` — contains a template
+  with `{CMD}` placeholder; substitute `{CMD}` with the actual command
+```
+
+---
+
+#### Compute Kernel Subagent Prompt
+
+For each category, launch a Task (subagent_type: generalPurpose):
+
+```
+You are analyzing {category} operations for a PyTorch trace on {platform}.
+
+<Shared Compute Kernel Preamble>
+
+Read and follow the FULL instructions in:
+  TraceLens/Agent/Analysis/skills/analysis-orchestrator/agents/{agent_file}
+
+- Category: {category}
+- Input files: category_data/{category}_ops.csv, metadata/{category}_metadata.json,
+  category_data/{category}_metrics.json (P-items come from `category_findings[]`; `operations[i].module_chain` provides model layer context)
+- Output file: category_findings/{category}_findings.md
+
+Execute every step in the agent file. Return "DONE" when complete.
+```
+
+### 7.3 Wait for All Compute Kernel Subagents to Complete
+
+All subagents must complete before proceeding to Step 7.4.
+Each subagent writes its findings to `category_findings/<category>_findings.md`.
+
+### 7.4 Verify Outputs and Retry Failures (up to 1 retry per subagent)
+
+After all compute kernel subagents complete:
+
+1. For each category in the manifest with `tier: compute_kernel`, check:
+   - Does `category_findings/<category>_findings.md` exist?
+   - If it exists, does it contain "Status: ERROR"?
+2. Collect a list of **failed** categories (missing file OR Status: ERROR).
+3. **Retry each failed category exactly once** by re-launching its subagent with the same prompt structure from Step 7.2. Launch all retries in parallel and wait for completion.
+4. After retries, re-check outputs. Any category that still fails is excluded from aggregation and recommendations.
+5. **CRITICAL: Do NOT attempt to manually analyze failed categories — only automated subagent retry is allowed.**
+
+### 7.5 Aggregate findings → priority_data.json
+
+After all compute sub-agent `_metrics.json` files exist (each carrying its own `category_findings[]`), concatenate them into a globally-sorted `priority_data.json::findings[]` for the report template.
+
+```bash
+<prefix> python3 -c \"
+import sys
+from TraceLens.Agent.Analysis.utils.report_utils import generate_priority_data
+generate_priority_data(sys.argv[1])
+\" '<output_dir>'
+```
+
+---
+
+## Step 8: Validate Subagent Outputs
+
+Before aggregating results, validate outputs from **both** tiers (system_findings/ and category_findings/).
+
+```bash
+<prefix> python3 -c \"
+import sys
+from TraceLens.Agent.Analysis.utils.validation_utils import validate_subagent_outputs
+validate_subagent_outputs(sys.argv[1])
+\" '<output_dir>'
+```
+
+This runs four checks:
+1. **Time Sanity** -- category GPU kernel time sum vs computation time (WARN if >15% discrepancy)
+2. **Efficiency Anomalies** -- findings with efficiency >100% (measurement issues) when `<comparison_scope>` = `standalone`
+3. **Coverage** -- all expected system and compute findings present
+4. **Priority Consistency** -- `priority_data.json` invariants: `findings[]` sorted desc by `impact_score`, contiguous `global_rank` / `priorities[].rank`, and per-category `priorities[].impact_score` ≈ `sum(findings[].impact_score)`
+
+---
+
+## Step 9: Prepare Report Data + Model Identification
+
+```bash
+<prefix> python3 -c \"
+import sys
+from TraceLens.Agent.Analysis.utils.report_utils import load_findings
+load_findings(sys.argv[1])
+\" '<output_dir>'
+```
+
+### 9.1 Model Identification (Subagent, retry once on failure)
+
+Launch a Task subagent (generalPurpose) that reads and follows `TraceLens/Agent/Analysis/skills/analysis-orchestrator/agents/model-identification-agent.md` with context: <output_dir>. Wait for completion.
+
+**On failure (subagent error, timeout, or `model_info.json` not written):**
+1. **Retry exactly once** by re-launching the same subagent with the same prompt.
+2. If the retry also fails, write fallback `metadata/model_info.json` with all four fields set to `"Cannot be inferred from trace"`.
+
+Assign `<Model>` to the `model` value in `<output_dir>/metadata/model_info.json`, or to "Workload" if that value is "Cannot be inferred from trace".
+
+---
+
+## Step 10: Render Plot (conditional)
+
+**Important:** Plot data is sourced from `priority_data.json` (written in Step 7.5). This step only renders the PNG when `agent_extension.py` is absent.
+Look at the environment variable `TL_EXTENSION` to find Python packages and directories to recursively search for `agent_extension.py`.
+If this environment variable is not set, or the file is not found there, look in `TraceLens/Agent/Analysis/utils/`.
+If the file is present, **skip this step** — Step 11.2 will produce `perf_improvement.png` and Step 11.3 will embed it.
+Use `<agent_extension_file>` to represent the location of this file.
+
+```bash
+EXT='<agent_extension_file>'
+if [ ! -f "$EXT" ]; then
+  <prefix> python3 -c \"
+import sys
+from TraceLens.Agent.Analysis.utils.plot_utils import generate_perf_plot
+generate_perf_plot(sys.argv[1], sys.argv[2])
+\" '<output_dir>' '<Model> on <Platform> — Performance Breakdown'
+fi
+```
+
+If the plot fails (extension-absent branch), retry once. If still failing, proceed to Step 11 without the plot.
+
+---
+
+## Step 11: Generate Final Report (<output_dir>/analysis.md)
+
+**CRITICAL: Do NOT delegate Step 11 to a Task subagent.** The orchestrator must write the report directly.
+
+1. **Read** the report template: `TraceLens/Agent/Analysis/utils/templates/analysis_template.md`
+2. **Write the report in sections** to `<output_dir>/analysis.md` using **only** `<prefix> tee` / `<prefix> tee -a` with single-quoted heredoc delimiters (see write order below). You MUST NOT use the IDE Write tool, Edit tool, StrReplace tool, `cat >`, `echo >`, `>>` redirect, or any other write method for `analysis.md` unless tee fails.
+3. **Fill in** each section by substituting placeholders with actual data. Never retain template placeholders (`<Brief Title>`, `X ms`, `Y%`, `<platform>`, `<model>`) — every field must contain actual data.
+
+**Write order (one heredoc per step):**
+
+   a. **Initialize** — truncate and write the title line + `## Executive Summary` (metrics table, `{{PERF_PLOT}}` placeholder). Use `<prefix> tee <output_dir>/analysis.md << 'SECTION_EOF'` (truncating `tee`, not append) for this first write only.
+      - Data sources: `category_data/category_manifest.json` (`gpu_utilization` keys), `priority_data.json` (top bottleneck).
+
+   b. **Compute Kernel Optimizations** — append `## Compute Kernel Optimizations` with `### Top Operations` table and P-item cards. Use `<prefix> tee -a <output_dir>/analysis.md << 'SECTION_EOF'`.
+      - Data sources: `priority_data.json` — P1 = `findings[0]`, P2 = `findings[1]`, ... ; each card joins its sub-agent's Detailed Analysis block by `(findings[i].category, findings[i].category_rank)`. The Top Operations table materializes `priorities[]` verbatim (one row per entry, array order, no re-sorting).
+      - `category_findings/*.md` — for each findings file, copy its `## Recommendations` P-items into the report card slots. **Copy table cells verbatim** from the source `category_findings/<cat>_findings.md`.
+      - Heuristic findings (`findings[i].estimate_method == "heuristic"`) carry a numeric estimated impact and sort by `impact_score` like any other compute finding — render them (do NOT skip them) per `sub_agent_spec.md § Heuristic findings`.
+
+   c. **Kernel Fusion** — append `## Kernel Fusion Opportunities (Experimental)`. Use `<prefix> tee -a <output_dir>/analysis.md << 'SECTION_EOF'`.
+      - Data source: `system_findings/kernel_fusion_findings.md`.
+
+   d. **System-Level** — append `## System-Level Optimizations`. Use `<prefix> tee -a <output_dir>/analysis.md << 'SECTION_EOF'`.
+      - Data sources: remaining `system_findings/*.md` (cpu_idle, multi_kernel).
+
+   e. **Detailed Analysis** — append `## Detailed Analysis` with `### Compute Kernel Insights`, `### Kernel Fusion Insights`, `### System-Level Insights` subsections. Use `<prefix> tee -a <output_dir>/analysis.md << 'SECTION_EOF'`.
+      - Data sources: copy the `## Detailed Analysis` blocks verbatim from each `*_findings.md` file. Follow the template for formatting.
+      - `category_data/*_metrics.json` (per-op tables, impact estimates).
+
+   f. **Appendix** — append `## Appendix` with `### Model Architecture` and `### Hardware Reference`. Use `<prefix> tee -a <output_dir>/analysis.md << 'SECTION_EOF'`.
+      - `metadata/model_info.json` — substitute `<model>`, `<architecture>`, `<scale>`, `<precision>` with the four field values.
+      - Platform arch file — read `platform` from `category_manifest.json`, then read `TraceLens/Agent/Analysis/utils/arch/<platform>.json`. For `### Hardware Reference`: substitute `<platform>`, Peak HBM BW = `mem_bw_gbps / 1000` TB/s, Peak MAF (BF16) = `max_achievable_tflops.matrix_bf16` TFLOPS, Peak MAF (FP8) = `max_achievable_tflops.matrix_fp8` TFLOPS if present.
+
+**Failure exclusion:** Skip any category listed in `load_findings()` output as `failed_system` or `failed_compute`. Include a `## Warnings` section (between Executive Summary and Compute Kernel Optimizations) only if failures exist.
+
+The report at `<output_dir>/analysis.md` must use these exact `##` headers — do NOT rename them:
+1. `## Executive Summary`
+2. `## Compute Kernel Optimizations`
+3. `## Kernel Fusion Opportunities (Experimental)`
+4. `## System-Level Optimizations`
+5. `## Detailed Analysis`
+6. `## Appendix`
+
+
+### 11.1 Validate Report Structure (Retry up to 2x)
+
+After writing `analysis.md`, validate that the report contains all required `##` section headers. If validation fails, modify the report with the missing sections.
+
+**Validation procedure:**
+
+```bash
+<prefix> python3 -c \"
+import sys
+from TraceLens.Agent.Analysis.utils.validation_utils import validate_report
+passed, missing = validate_report(sys.argv[1], comparison_scope=sys.argv[2])
+if not passed:
+    print('FAIL:')
+    for m in missing:
+        print('  - ' + m)
+    sys.exit(1)
+print('PASS: All required sections present')
+\" '<output_dir>' '<comparison_scope>'
+```
+
+**If validation fails (exit code 1):**
+
+1. Read the FAIL output to identify the issue. Fix it in place: edit only the affected sections, and do NOT rewrite or regenerate the report from scratch.
+a. Check if the report contains similar but incorrectly named headers and rename them to match the exact required names. 
+b. If sections are entirely absent, add them with the correct `##` headers, keeping existing content.
+c. For "Missing metrics row" errors: add the row to the Executive Summary table using values from `category_data/category_manifest.json` (`gpu_utilization` keys) and `priority_data.json` (top bottleneck).
+d. For placeholder values (`X ms`, `Y%`, `Z%`, `W%`) in the Executive Summary metrics table: replace each with the actual value from `category_manifest.json` -> `gpu_utilization`.
+e. For unfilled `<Brief Title>` / `<Library>` / `<platform>` placeholders: substitute the real title/backend/platform from the corresponding findings file or `metadata/*_metadata.json`.
+f. For Args cell mismatches: copy the matching `operations[].args` value verbatim (preserving `<br>`) from the corresponding `category_data/<cat>_metrics.json` and string-replace the bad cell.
+g. For marker errors: restore or add the missing/broken marker in place — never delete a card or block to silence an error. Source numeric values from `priority_data.json` (P-items) or `<cat>_metrics.json::impact_estimates[]` (detail estimates); use `null` or the sentinel `not quantifiable from trace data` for non-quantifiable items.
+h. For priority-consistency errors (R1 P-item count mismatch, R2 P-item category-order mismatch, R3 marker numeric mismatch, R4 Top Ops row-count mismatch): re-render the affected card(s) by re-reading `priority_data.json::findings[N-1]` for `category`, `low` (impact_score_low), `mid` (impact_score), `high` (impact_score_high), and `priorities[]` for the Top Operations table rows (one row per entry, in array order).
+2. Run validation again.
+3. Maximum 2 retry attempts. If still failing after retry, proceed with a warning.
+
+---
+
+### 11.2 Optional extension (auto-detected)
+
+If `<agent_extension_file>` exists, run it as shown below. Its behavior is documented in the extension itself; the orchestrator does not need to inspect or reason about it.
+
+If the file is absent, skip this step silently. The analysis is complete; the simple plot from Step 10 stays in place.
+
+```bash
+EXT='<agent_extension_file>'
+if [ -f "$EXT" ]; then
+  <prefix> python3 "$EXT" --output-dir '<output_dir>' --title '<Model> on <Platform> — Kernel Tuning Potential' --comparison-scope <comparison_scope>
+fi
+```
+
+This step is a hook for an optional extension; if `agent_extension.py` is not present, skip it.
+
+**Do NOT re-run `validate_report` after this step.**
+
+---
+
+### 11.3 Embed Performance Improvement Plot
+
+The PNG (`perf_improvement.png`) is already on disk from either Step 10 or Step 11.2 (whichever ran). This step only embeds its base64 sidecar into the report at the `{{PERF_PLOT}}` placeholder.
+
+```bash
+<prefix> python3 -c \"
+import sys
+from TraceLens.Agent.Analysis.utils.plot_utils import embed_plot_in_report
+embed_plot_in_report(sys.argv[1])
+\" '<output_dir>'
+```
+
+If the plot is skipped, the `{{PERF_PLOT}}` placeholder is removed so the report remains clean.
+
+---
+## Trace Feature Detection
+
+If Step 1 or many of Steps 2-5 fail or produce unexpected results, check whether the trace uses the following features before retrying:
+- **GPU Graph Replay**: raw trace JSON contains `hipGraphLaunch` or `cudaGraphLaunch`.
+  - **Default mode** (analysis_mode = `default`): Inform the user with `[DIAG:trace_quality:GPU_GRAPH_REPLAY]` that GPU graph replay was detected and that the default analysis mode supports typical PyTorch traces. **Abort** -- do not retry or continue.
+  - **Inference mode** (analysis_mode = `inference`): Graph launches are expected, and are supported when a graph capture folder is provided; do not abort. If inference_exec_mode is `eager` (no capture folder was provided), continue.
diff --git a/skills/analysis-orchestrator/skill-card.md b/skills/analysis-orchestrator/skill-card.md
new file mode 100644
index 0000000..450130b
--- /dev/null
+++ b/skills/analysis-orchestrator/skill-card.md
@@ -0,0 +1,13 @@
+# Skill Card
+
+## Description
+
+Orchestrates modular PyTorch profiler trace analysis with TraceLens: generates perf reports, prepares category data, runs system-level and compute-kernel subagents in parallel, validates outputs, and writes a prioritized stakeholder report (analysis.md).
+
+## Owner
+
+AMD-AGI (federated from [AMD-AGI/TraceLens](https://github.com/AMD-AGI/TraceLens))
+
+## License
+
+MIT

From 50fce30aaf7d35e7ed08e48f3118ecc84e8d491c Mon Sep 17 00:00:00 2001
From: Gabe Weisz <gabe.weisz@amd.com>
Date: Fri, 19 Jun 2026 14:42:56 -0700
Subject: [PATCH 4/8] update import

---
 .github/scripts/import_external_skills.py     |  51 ++-
 .../utils/templates/sub_agent_spec.md         | 345 ++++++++++++++++++
 2 files changed, 395 insertions(+), 1 deletion(-)
 create mode 100644 skills/analysis-orchestrator/utils/templates/sub_agent_spec.md

diff --git a/.github/scripts/import_external_skills.py b/.github/scripts/import_external_skills.py
index 286c932..601843a 100644
--- a/.github/scripts/import_external_skills.py
+++ b/.github/scripts/import_external_skills.py
@@ -9,7 +9,9 @@
 
 1. Shallow-clones the repo at the pinned `ref` into a temp directory,
    using sparse-checkout so only the configured `path` is fetched.
-2. Copies each named skill folder into `skills/<skill>/`.
+2. Copies each named skill folder into `skills/<skill>/`, plus any declared
+   companion files (e.g. templates linked from agent markdown but stored
+   outside the skill subtree in the upstream repo).
 3. Writes `.federated.json` inside each copy with source metadata so we
    can tell vendored skills apart from skills authored in this repo.
 4. Rewrites relative markdown links that point outside the copied skill
@@ -353,6 +355,45 @@ def copy_skill(src: Path, dest: Path) -> None:
     shutil.copytree(src, dest)
 
 
+# Repo-relative paths (inside the upstream clone) that a federated skill links
+# to but that live outside its copied subtree. Fetched with `git show` so the
+# sparse checkout does not need to widen.
+_COMPANION_FILES: dict[tuple[str, str], list[tuple[str, str]]] = {
+    ("amd-agi-tracelens", "analysis-orchestrator"): [
+        (
+            "TraceLens/Agent/Analysis/utils/templates/sub_agent_spec.md",
+            "utils/templates/sub_agent_spec.md",
+        ),
+    ],
+}
+
+
+def vendor_companion_files(
+    clone_dir: Path,
+    commit: str,
+    dest_skill: Path,
+    source_name: str,
+    skill_folder: str,
+    log: list[str],
+) -> None:
+    """Materialize companion blobs linked from agent markdown but outside the skill folder."""
+    for repo_rel, dest_rel in _COMPANION_FILES.get((source_name, skill_folder), []):
+        result = subprocess.run(
+            ["git", "show", f"{commit}:{repo_rel}"],
+            cwd=clone_dir,
+            check=True,
+            capture_output=True,
+            text=True,
+        )
+        out = dest_skill / dest_rel
+        out.parent.mkdir(parents=True, exist_ok=True)
+        out.write_text(result.stdout, encoding="utf-8")
+        log.append(
+            f"[{source_name}] vendored companion {repo_rel} -> "
+            f"skills/{skill_folder}/{dest_rel}"
+        )
+
+
 def write_marker(
     skill_dir: Path,
     source: Source,
@@ -502,6 +543,14 @@ def import_source(
             log.append(f"[{source.name}] {action} {spec.folder} -> skills/{spec.folder}")
             if not dry_run:
                 copy_skill(src_skill, dest_skill)
+                vendor_companion_files(
+                    tmp_path,
+                    commit,
+                    dest_skill,
+                    source.name,
+                    spec.folder,
+                    log,
+                )
                 write_marker(dest_skill, source, commit, relative_path)
                 write_card(dest_skill, source, marketplace_description)
                 rewrite_external_references(
diff --git a/skills/analysis-orchestrator/utils/templates/sub_agent_spec.md b/skills/analysis-orchestrator/utils/templates/sub_agent_spec.md
new file mode 100644
index 0000000..779ac03
--- /dev/null
+++ b/skills/analysis-orchestrator/utils/templates/sub_agent_spec.md
@@ -0,0 +1,345 @@
+<!--
+Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved.
+
+See LICENSE for license information.
+-->
+
+# Sub-Agent Findings Specification
+
+Canonical reference for the output that sub-agents write into their findings
+files. The orchestrator extracts these sections when composing the final
+`analysis.md` report.
+
+> **Usage:** Link here from every `*-analyzer.md` instead of duplicating the
+> schema. Replace `<category>` with the actual category name.
+> This spec receives `comparison_scope`: `standalone` (default) or `comparative`.
+---
+
+## Orchestrator-consumed sections
+
+Every findings file must end with these two sections, in this order:
+
+1. `## Recommendations`
+2. `## Detailed Analysis`
+
+Applies to both tiers (compute → `category_findings/`, system → `system_findings/`). Agents may include any other sections (Overview, Operations Breakdown, Key Bottlenecks, …) before them — those are agent-internal and not parsed by the orchestrator.
+
+---
+
+## No actionable findings
+
+**Compute tier:** There is no actionable bottleneck when the analyzer left
+`category_data/<category>_metrics.json::category_findings` as an **empty array**
+`[]`. In that case emit **empty** `## Recommendations` and **empty**
+`## Detailed Analysis` exactly as in § Empty category_findings.
+
+**System tier:** Follow the structured output your analyzer JSON supports.
+
+---
+
+## Recommendations
+
+Each P-item maps 1:1 to a `## Detailed Analysis` reasoning candidate at the same rank.
+
+```markdown
+### P1: <Brief Title> (<Library>)            <!-- (<Library>) only on compute tier -->
+**Insight**: [1 sentence — what's wrong]
+**Action**: [1-2 sentences — what to do]
+<!-- impact-begin kind=p_item low=<impact_score_low> mid=<impact_score> high=<impact_score_high> -->
+**Impact**: [impact_score: X.X, OR "Not quantifiable from trace data"]
+<!-- impact-end -->
+```
+
+- **Compute tier**: include all three fields. Pull `**Impact**` from `category_data/<category>_metrics.json::category_findings[i]`, ordered by `rank` (one card per entry).
+- **System tier**: omit the `(<Library>)` title suffix. Always emit `**Impact**: Not quantifiable from trace data` wrapped in `kind=p_item` markers with `low=null mid=null high=null`.
+- **Field labels are exact** — `**Insight**`, `**Action**`, `**Impact**`.
+- **`(<Library>)` suffix**: the single `category_findings[i].library` for this card (one library per finding by construction). Omit the parenthetical when the value is `Unknown`.
+- **Marker required** — see § Impact markers (REQUIRED). The `low`/`mid`/`high` attributes carry the raw `impact_score_low/impact_score/impact_score_high` values from `category_findings[i]`. For non-quantifiable cards (system tier) use `low=null mid=null high=null`.
+
+---
+
+## Detailed Analysis block schema
+
+Each candidate block lives inside a `## Detailed Analysis` section. It starts
+with an HTML comment and an `####` heading:
+
+```markdown
+## Detailed Analysis
+
+<!-- reasoning-candidate tier=<compute|system> rank=<N> -->
+#### <insight_title>
+**Identification:** …
+
+**Data:** …
+
+**Reasoning for Slowdown:** …
+
+**Resolution:** …
+
+**Impact estimate:** …
+```
+
+### HTML comment fields
+
+| Field | Values | Meaning |
+|-------|--------|---------|
+| `tier` | `compute` \| `system` | Must match the findings directory (`category_findings/` → compute, `system_findings/` → system). |
+| `rank` | Integer ≥ 1 | Compute tier: `category_findings[i].rank`. System tier: agent-local priority within this file (1 = highest). |
+
+### Required labels
+
+The five labels below must appear **in this order**, each on its own line with a
+blank line between them. The validator checks for these as substring matches.
+
+| Label | Purpose |
+|-------|---------|
+| `**Identification:**` | Why these operations were flagged. Body text must be plain language — JSON keys, dotted paths, and internal variable names belong **only** in the closing `(source: \`artifact\` → \`keys\`)` parenthetical (artifact + keys backticked, e.g. `(source: \`<cat>_metrics.json\` → \`operations[].efficiency.efficiency_percent\` < 70)`). When any flagged op has a non-null `library` (e.g. `Tensile`, `CK`, `AITER`, `Triton`, `rocBLAS`), state the backend in prose and include `operations[].library` in the `(source:)` parenthetical. When `operations[i].module_chain` is non-empty, name the model layer the ops belong to. When `operations[i].call_chain` is present, use it for deeper context. |
+| `**Data:**` | **Compute** (`tier=compute`): exactly one trace-grounded kernel breakdown table (see § Operations Table Schema). **All columns in the schema are mandatory — never drop a column.** Use `—` for any individual cell whose value is missing or null. **System** (`tier=system`): **must not** include kernel breakdown tables; include a metric table instead (see § Metric Table Schema). |
+| `**Reasoning for Slowdown:**` | Why the workload is slow *as the trace shows*: **Standalone:** low % of roofline, low arithmetic intensity, unfused patterns, etc. **Comparative:** how Trace 1 is slower than Trace 2 for these operations — express speed differences as "X% faster" or "X% slower", plus absolute time gaps. Never use raw efficiency ratios or `efficiency_percent` values in prose. **Forbidden:** micro-architecture speculation (bank conflicts, L1 miss rates, etc.). |
+| `**Resolution:**` | **Why** the suggested optimization helps — not merely restating *what* to do. Must align with the P-item **Action** on the card. **Forbidden tautologies:** Do not restate the roofline definition (e.g. "raising bandwidth toward the roofline reduces kernel time"). Instead, explain the **mechanism** (e.g. "fusion eliminates the intermediate write-back, cutting bytes moved per invocation in half"). If the mechanism is not inferable from the trace, state only the action. |
+| `**Impact estimate:**` | Compute tier: rendered from `category_findings[i]` (matched by `rank`), two-bullet low/high `impact_score` format (see § Impact estimate rendering). System tier: `Impact estimate is not quantifiable from trace data.` |
+
+### Sentence quality
+
+- Each sentence should convey **one main idea**. Do not chain independent
+  observations with em-dashes, semicolons, or "while" bridges. Avoid run-on
+  sentences.
+
+### Trace observability (compute tier)
+
+This is the single source of truth for what compute-tier sub-agents can and
+cannot infer from a kernel-level PyTorch trace. Ground every claim in
+**Reasoning for Slowdown** / **Resolution** in a **CAN Infer** row; for any
+property in the **CANNOT Infer** rows, use the listed fallback prose instead
+of speculating.
+
+#### CAN Infer (universal — all compute categories)
+
+| Observable | Source |
+|------------|--------|
+| Kernel names | `trunc_kernel_details` column |
+| Kernel durations | Trace events |
+| Achieved TFLOPS/s or TB/s | Calculated from duration + FLOPs/bytes |
+| Efficiency % vs roofline | Achieved / resolved peak (MAF or HBM BW) |
+| Invocation counts | Number of trace events per signature |
+| Library / backend | `library` column / kernel-name heuristics |
+| Bound type | `efficiency.bound_type` (compute / memory) |
+| Input shape dimensions | `Input Dims` column (semantics per category — e.g. M/N/K for GEMM, B/H/S/D for SDPA, expert/token counts for MoE, NCHW for convolution) |
+
+#### CANNOT Infer (universal — all compute categories)
+
+These require hardware counters or profiler tools, not a trace.
+
+| NOT Observable | Why | Fallback prose |
+|----------------|-----|----------------|
+| Bank conflicts | Requires hardware counters | "Low efficiency — profile with hardware counters to diagnose." |
+| Cache hit rates | Requires hardware counters | "Large working set may exceed cache." |
+| Wave / SM occupancy | Requires hardware counters | "Kernel running slower than expected — profile occupancy with hardware counters." |
+| Shared-memory / LDS usage | Requires hardware counters | "Shared-memory usage not visible — profile with hardware counters." |
+| Intra-warp shuffle efficiency | Requires hardware counters | "Warp-shuffle efficiency not visible — profile with hardware counters." |
+| Root causes generally | Traces show WHAT, not WHY | "Bottleneck identified — generate reproducer for kernel team." |
+
+#### CANNOT Infer (category-specific)
+
+Each analyzer owns its own category-specific blind spots under a
+`## Trace observability (category-specific)` section in its `*-analyzer.md` file.
+The universal rows above always apply on top of those.
+
+---
+
+## Operations Table Schema (compute tier)
+
+Standard column schema for operations breakdown tables and the `**Data:**` table
+inside `## Detailed Analysis` blocks.
+
+### Standalone (`comparison_scope` = `standalone`)
+
+```markdown
+| Operation |  Args  |            Kernel Path                  | Kernel Name | Time (ms) | %E2E | Count |FLOPS/Byte| Efficiency | Bound |
+|-----------|--------|-----------------------------------------|-------------|-----------|------|-------|----------|------------|-------|
+```
+
+**All ten columns above are mandatory.** Never drop a column because some or all of its values are missing — render `—` in any cell whose value is null/absent and keep the column. The header row of every `**Data:**` table must contain exactly these ten column names in this order. (Agents may append extra columns at the end when needed, e.g. `Sub-Category` in the generic-op analyzer, but must not remove or reorder the ten standard columns.)
+
+**Column mappings** (source: `metrics['operations']`):
+- **Operation**: `operations[i].name`. Bare op name only — shape/dtype go in Args. Allowed suffix: `(decode)`/`(prefill)` to disambiguate the same op at multiple shapes.
+- **Args**: `operations[i].args`. Pre-rendered shape/dtype string, already joined with `<br>` — paste verbatim, do not reformat or re-join. `—` when absent.
+- **Kernel Path**: `operations[i].launcher_path`. Relative Python path that launched the kernel (e.g. `sglang/srt/layers/quantization/fp8_utils.py(549): aiter_w8a8_block_fp8_linear`). **Copy the value exactly as-is — do NOT truncate, shorten, or extract just the function name.** `—` when absent.
+- **Kernel Name**: `operations[i].kernel_name_trunc`. Truncated GPU kernel name(s) launched by this operation. For multi-kernel ops, formatted as `Kernel 1: <name><br>Kernel 2: <name>`. **Copy the value exactly as-is.** `—` when absent. (The full untruncated name is available in `operations[i].kernel_name` if needed for identification.)
+- **Time (ms)**: `operations[i].time_ms` — kernel time in milliseconds.
+- **%E2E**: `operations[i].percent_of_total` — kernel time as % of E2E GPU time. `—` when null. (`percent_of_category` is still in the JSON for screening thresholds but no longer rendered.)
+- **Count**: `operations[i].count` — total invocations, not unique signatures. `—` when absent.
+- **FLOPS/Byte**: `operations[i].efficiency.flops_per_byte` — note the nested path under `efficiency`, NOT a top-level field. `—` when null.
+- **Efficiency**: `operations[i].efficiency.efficiency_percent`, formatted by `bound_type`:
+  - `compute-bound`: `X.XX% of Y TFLOPS` (Y = `resolved_peak_maf`)
+  - `memory-bound`: `X.XX% of Y TB/s` (Y = `resolved_peak_hbm_bw`)
+- **Bound**: `operations[i].efficiency.bound_type` + `-bound` suffix (e.g., `memory-bound`). Must reflect compute/memory bound type — never use `classification.gemm_type` or similar.
+
+### Comparative (`comparison_scope` = `comparative`)
+
+```markdown
+| Operation | Args (T1) | Trace 1 Time (ms) | Trace 2 Time (ms) | Count (T1/T2) | Difference (ms) | FLOPS/Byte (T1) | Bound (T1) |
+|-----------|-----------|-------------------|-------------------|---------------|-----------------|-----------------|------------|
+```
+
+**Column mappings** (all sourced from `metrics['operations']`; do **not** re-join the CSV):
+- **Operation**: `operations[i].name`. Bare op name only.
+- **Args (T1)**: `operations[i].args`. Pre-rendered shape/dtype string, already joined with `<br>` — paste verbatim. `—` when absent.
+- **Trace 1 Time (ms)**: `operations[i].time_ms`
+- **Trace 2 Time (ms)**: `operations[i].t2_time_ms`. `—` when absent.
+- **Count (T1/T2)**: T1 = `operations[i].count`; T2 = `operations[i].count_trace2`. Format `T1 / T2` (use `—` for missing T2).
+- **Difference (ms)**: `operations[i].difference_ms`. `—` when absent.
+- **FLOPS/Byte (T1)**: `operations[i].efficiency.flops_per_byte`
+- **Bound (T1)**: `operations[i].efficiency.bound_type` with a `-bound` suffix
+
+Agents may add extra columns when needed (e.g. `Sub-Category` in the generic-op analyzer).
+
+---
+
+## Metric Table Schema (system tier)
+
+Standard schema for the `**Data:**` table inside system-tier `## Detailed Analysis` blocks. In comparative mode, report Trace 1 metrics only — do not add Trace 2 columns or comparisons.
+
+```markdown
+| Metric | Value | Flagged |
+|--------|-------|---------|
+```
+
+**Column rules:**
+- **Metric**: Copy metric label directly from earlier findings sections — do not rename or reformat.
+- **Value**: `X.X ms` or `X.X%` or `X.X ms (X.X%)`
+- **Flagged**: `true` when the metric's threshold is exceeded (issue present); `false` otherwise.
+
+---
+
+## Peak Reference (compute tier)
+
+When citing peak performance for a bottleneck, select the correct peak based on
+`operations[i].efficiency.bound_type`:
+- **compute-bound**: Use `operations[i].efficiency.resolved_peak_maf` (TFLOPS).
+  Report achieved TFLOPS/s vs peak TFLOPS.
+- **memory-bound**: Use `operations[i].efficiency.resolved_peak_hbm_bw` (TB/s).
+  Report achieved TB/s vs peak TB/s.
+
+Do not look up peaks independently from the metadata dict.
+
+---
+
+## Impact estimate rendering
+
+Compute-tier sub-agents READ their P-items from `category_data/<category>_metrics.json::category_findings[]`, one card per entry ordered by `rank`. The analyzer script has already grouped per-op estimates by `(bound_type, library, eff_bucket)` in standalone mode (or `(bound_type, library)` in comparative mode), summed impact, and dropped sub-threshold groups; the sub-agent renders one card per surviving entry.
+
+The set of P-items is decided by `category_findings[]` alone — `MIN_PITEM_IMPACT_SCORE` already gated upstream. **Per-category efficiency tables, expected-band thresholds, and Common Patterns in analyzer files are interpretation context for the prose** (cite in **Reasoning for Slowdown** when a member matches the band/symptom); they MUST NOT be used to add or drop P-items.
+
+### Reading category_findings[]
+
+| Field | Use |
+|-------|-----|
+| `rank` | Card order within your category (1 = highest impact). Also the `rank=` value in `<!-- reasoning-candidate -->`. |
+| `bound_type` | `compute` \| `memory`. Selects the matching Action Prose Guidance row. |
+| `library` | One per finding. Drives the `(<Library>)` title suffix. |
+| `eff_bucket` | Roofline-efficiency band: `"0-30"`, `"30-60"`, `"60-100"`, or `"unknown"` (standalone); `"all"` (comparative). Members within a finding share the same band. |
+| `impact_score` / `_low` / `_high` | Group-summed % of E2E. Render verbatim into `kind=p_item` and `kind=detail_estimate` markers. |
+| `estimate_method` | `"quantified"` (impact from a perf model — standalone roofline gap or comparative t2/t1 ratio) or `"heuristic"` (op has no perf model; impact estimated from E2E share — see below). |
+| `percent_of_total` | Heuristic findings only: the op's combined E2E GPU-time share (summed across shapes). Drives the warning line in § Heuristic findings. Absent on quantified findings. |
+| `member_count`, `members[]` | Underlying per-op estimate rows (operation, time_ms, efficiency_pct, `type`, …) — rows of the `**Data:**` table. `members[].type == "unmodeled_significant"` marks a heuristic finding; `"kernel_tuning"` is a quantified (perf-modeled) finding. |
+
+### Empty category_findings
+
+If `category_findings[]` is empty, emit `## Recommendations` with no P-items
+and `## Detailed Analysis` with no candidates. Do not manufacture sub-threshold
+cards to fill the section — that is the honest "no actionable issues" answer.
+
+### Heuristic findings (`estimate_method == "heuristic"`)
+
+An op with no perf model: `impact_score` / `_low` / `_high` are estimated from a
+recoverable fraction of its E2E share (`percent_of_total`, summed across shapes)
+and ranked alongside quantified findings.
+
+Render like a normal P-item card (numeric `low`/`mid`/`high` on `kind=p_item`,
+normal `kind=detail_estimate` bullets) with two additions:
+- Immediately **after** the `kind=p_item` impact marker block (i.e. *outside* the `impact-begin`/`impact-end` markers), add a warning line, substituting the finding's `percent_of_total`:
+
+```markdown
+> **Warning — estimated:** No performance model for this op; impact is derived from its E2E GPU-time share (<percent_of_total>%), not a gap projection.
+```
+- In the `## Detailed Analysis` `**Data:**` row, render `—` for `Efficiency` and `Bound`.
+
+### Rendering in `## Detailed Analysis` (compute tier)
+
+Two bullets — low and high. Wrap in `kind=detail_estimate` markers (see
+§ Impact markers).
+
+```markdown
+<!-- impact-begin kind=detail_estimate low=<impact_score_low> high=<impact_score_high> -->
+- Low end impact_score: <impact_score_low>
+- High end impact_score: <impact_score_high>
+<!-- impact-end -->
+```
+
+## Impact markers (REQUIRED)
+
+Every block whose contents depend on `impact_score*` values must be wrapped in
+a paired HTML-comment marker. The markers carry the underlying numeric data as
+key=value attributes so that optional downstream tooling can re-process the
+block deterministically without re-parsing prose.
+
+### Marker shape
+
+```
+<!-- impact-begin kind=KIND attr1=value1 attr2=value2 -->
+...rendered markdown content for this block...
+<!-- impact-end -->
+```
+
+The block between them is exactly the `impact_score`-based markdown you would otherwise emit.
+
+### `kind` values you must emit
+
+| `kind` | Where | Required attributes | Optional attributes |
+|--------|-------|--------------------|---------------------|
+| `p_item` | Around every P-item `**Impact**` line in `## Recommendations`. | `low`, `mid`, `high` (all three; use `null` only for system-tier non-quantifiable). | `category` is reserved for the orchestrator template; sub-agents do **not** emit it. |
+| `detail_estimate` | Around the two-bullet `Low end ... / High end ...` block under `**Impact estimate:**` in each `## Detailed Analysis` candidate. Skip only for system-tier non-quantifiable estimates. | `low`, `high` (impact_score values, % of E2E). | none |
+
+### Value-source rule
+
+The numbers in marker attributes (`low`, `mid`, `high`) **must** be transcribed
+verbatim from `category_data/<category>_metrics.json::category_findings[i]`
+(`impact_score_low`, `impact_score`, `impact_score_high` respectively), matched
+by `rank`. Do not re-derive, round, or scale them. Do not pull them from any
+other source.
+
+---
+
+## Validate findings (required before returning)
+
+After writing the findings file and impact estimates, run the programmatic
+validator. This replaces the previous manual self-check. The validator also
+enforces marker structure (pairing, `kind=`, per-kind required attrs,
+mandatory `kind=p_item` for category/system findings unless exempt) per
+§ Impact markers above.
+
+```bash
+<prefix> python3 -c "
+import sys
+from TraceLens.Agent.Analysis.utils.validation_utils import validate_findings_file
+passed, errors = validate_findings_file(sys.argv[1], sys.argv[2], sys.argv[3])
+if not passed:
+    print('FAIL:')
+    for e in errors:
+        print('  - ' + e)
+    sys.exit(1)
+print('PASS: Findings file is valid')
+" '<output_dir>/<subdir>/<category>_findings.md' '<tier>' '<comparison_scope>'
+```
+
+Where `<tier>` is `compute` or `system`, `<subdir>` is `category_findings`
+or `system_findings` respectively, and `<comparison_scope>` is `standalone` or
+`comparative`.
+
+**If validation fails (exit code 1):**
+
+1. Read the FAIL output — error messages are self-explanatory and include the fix hint.
+2. Fix the findings file accordingly. Edit sections in place; do not regenerate the entire output.
+3. When fixing table cells (Args, Kernel Path, or any multi-row issue), **re-emit the entire table in one edit** — do NOT patch rows individually. Batch edits together.
+4. Re-run validation. Maximum 2 retry attempts; if still failing, return with a warning.

From d14d40a110952354db940ca9c871891daf860b10 Mon Sep 17 00:00:00 2001
From: Gabe Weisz <gabe.weisz@amd.com>
Date: Fri, 19 Jun 2026 14:54:08 -0700
Subject: [PATCH 5/8] fix cursor manifest

---
 .cursor-plugin/marketplace.json | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/.cursor-plugin/marketplace.json b/.cursor-plugin/marketplace.json
index f4aa2d7..08198da 100644
--- a/.cursor-plugin/marketplace.json
+++ b/.cursor-plugin/marketplace.json
@@ -9,6 +9,11 @@
     "version": "0.1.0"
   },
   "plugins": [
+    {
+      "name": "analysis-orchestrator",
+      "source": "./skills/analysis-orchestrator",
+      "description": "Orchestrates modular PyTorch profiler trace analysis with TraceLens: generates perf reports, prepares category data, runs system-level and compute-kernel subagents in parallel, validates outputs, and writes a prioritized stakeholder report (analysis.md)."
+    },
     {
       "name": "apu-memory-tuner",
       "source": "./skills/apu-memory-tuner",

From 0c0cc5c75e125fda527e300eefd8d0ff1d7d769b Mon Sep 17 00:00:00 2001
From: Gabe Weisz <gabe.weisz@amd.com>
Date: Fri, 19 Jun 2026 14:59:09 -0700
Subject: [PATCH 6/8] revert change

---
 .github/scripts/import_external_skills.py | 51 +----------------------
 1 file changed, 1 insertion(+), 50 deletions(-)

diff --git a/.github/scripts/import_external_skills.py b/.github/scripts/import_external_skills.py
index 601843a..286c932 100644
--- a/.github/scripts/import_external_skills.py
+++ b/.github/scripts/import_external_skills.py
@@ -9,9 +9,7 @@
 
 1. Shallow-clones the repo at the pinned `ref` into a temp directory,
    using sparse-checkout so only the configured `path` is fetched.
-2. Copies each named skill folder into `skills/<skill>/`, plus any declared
-   companion files (e.g. templates linked from agent markdown but stored
-   outside the skill subtree in the upstream repo).
+2. Copies each named skill folder into `skills/<skill>/`.
 3. Writes `.federated.json` inside each copy with source metadata so we
    can tell vendored skills apart from skills authored in this repo.
 4. Rewrites relative markdown links that point outside the copied skill
@@ -355,45 +353,6 @@ def copy_skill(src: Path, dest: Path) -> None:
     shutil.copytree(src, dest)
 
 
-# Repo-relative paths (inside the upstream clone) that a federated skill links
-# to but that live outside its copied subtree. Fetched with `git show` so the
-# sparse checkout does not need to widen.
-_COMPANION_FILES: dict[tuple[str, str], list[tuple[str, str]]] = {
-    ("amd-agi-tracelens", "analysis-orchestrator"): [
-        (
-            "TraceLens/Agent/Analysis/utils/templates/sub_agent_spec.md",
-            "utils/templates/sub_agent_spec.md",
-        ),
-    ],
-}
-
-
-def vendor_companion_files(
-    clone_dir: Path,
-    commit: str,
-    dest_skill: Path,
-    source_name: str,
-    skill_folder: str,
-    log: list[str],
-) -> None:
-    """Materialize companion blobs linked from agent markdown but outside the skill folder."""
-    for repo_rel, dest_rel in _COMPANION_FILES.get((source_name, skill_folder), []):
-        result = subprocess.run(
-            ["git", "show", f"{commit}:{repo_rel}"],
-            cwd=clone_dir,
-            check=True,
-            capture_output=True,
-            text=True,
-        )
-        out = dest_skill / dest_rel
-        out.parent.mkdir(parents=True, exist_ok=True)
-        out.write_text(result.stdout, encoding="utf-8")
-        log.append(
-            f"[{source_name}] vendored companion {repo_rel} -> "
-            f"skills/{skill_folder}/{dest_rel}"
-        )
-
-
 def write_marker(
     skill_dir: Path,
     source: Source,
@@ -543,14 +502,6 @@ def import_source(
             log.append(f"[{source.name}] {action} {spec.folder} -> skills/{spec.folder}")
             if not dry_run:
                 copy_skill(src_skill, dest_skill)
-                vendor_companion_files(
-                    tmp_path,
-                    commit,
-                    dest_skill,
-                    source.name,
-                    spec.folder,
-                    log,
-                )
                 write_marker(dest_skill, source, commit, relative_path)
                 write_card(dest_skill, source, marketplace_description)
                 rewrite_external_references(

From b36600bc697a9cda3416cdb064dc86b67d0c830b Mon Sep 17 00:00:00 2001
From: Gabe Weisz <gabe.weisz@amd.com>
Date: Mon, 22 Jun 2026 06:02:23 -0700
Subject: [PATCH 7/8] include template in skills

---
 skills/analysis-orchestrator/.federated.json  |   4 +-
 .../agents/convolution-analyzer.md            |   6 +-
 .../agents/cpu-idle-analyzer.md               |   4 +-
 .../agents/elementwise-analyzer.md            |   6 +-
 .../agents/gemm-analyzer.md                   |   6 +-
 .../agents/generic-op-analyzer.md             |  12 +-
 .../agents/kernel-fusion-analyzer.md          |   4 +-
 .../agents/moe-analyzer.md                    |   6 +-
 .../agents/multi-kernel-analyzer.md           |   4 +-
 .../agents/norm-analyzer.md                   |   6 +-
 .../agents/reduce-analyzer.md                 |   6 +-
 .../agents/sdpa-analyzer.md                   |   6 +-
 .../agents/triton-analyzer.md                 |   6 +-
 skills/analysis-orchestrator/reference.md     |   2 +-
 .../templates/analysis_template.md            | 449 ++++++++++++++++++
 .../{utils => }/templates/sub_agent_spec.md   |   0
 skills/magpie/.federated.json                 |   4 +-
 skills/magpie/SKILL.md                        |   8 +-
 18 files changed, 494 insertions(+), 45 deletions(-)
 create mode 100644 skills/analysis-orchestrator/templates/analysis_template.md
 rename skills/analysis-orchestrator/{utils => }/templates/sub_agent_spec.md (100%)

diff --git a/skills/analysis-orchestrator/.federated.json b/skills/analysis-orchestrator/.federated.json
index 72e64e6..d9af670 100644
--- a/skills/analysis-orchestrator/.federated.json
+++ b/skills/analysis-orchestrator/.federated.json
@@ -2,8 +2,8 @@
   "source": "amd-agi-tracelens",
   "repo": "AMD-AGI/TraceLens",
   "ref": "feat/gw_rename_directories",
-  "commit": "9b461bb25192ce73cb70de912ce27df515b56b44",
+  "commit": "f630a6f33ef31d749456afaa9d446201b2848b6f",
   "path": "TraceLens/Agent/Analysis/skills/analysis-orchestrator",
   "license": "MIT",
-  "imported_at": "2026-06-19T00:16:47Z"
+  "imported_at": "2026-06-22T13:01:34Z"
 }
diff --git a/skills/analysis-orchestrator/agents/convolution-analyzer.md b/skills/analysis-orchestrator/agents/convolution-analyzer.md
index 19446a3..3add20c 100644
--- a/skills/analysis-orchestrator/agents/convolution-analyzer.md
+++ b/skills/analysis-orchestrator/agents/convolution-analyzer.md
@@ -96,7 +96,7 @@ These are guidelines; if a member doesn't fit neatly, classify it semantically.
 - **Standalone:** Treat `efficiency_percent` as **% of roofline**.
 - **Comparative:** Treat `efficiency_percent` as **100 × (trace2 kernel time) / (trace1 kernel time)**.
 
-Per [`utils/templates/sub_agent_spec.md`](../utils/templates/sub_agent_spec.md), emit one P-item per entry in ascending `rank` order; ground **Insight** / **Action** / **Reasoning for Slowdown** in the `members[]` rows (their `operation`, `efficiency_pct`, `time_ms`, `library`) using the Action Prose Guidance, Expected Efficiency, and Common Patterns below. If `category_findings[]` is empty, emit empty `## Recommendations` and `## Detailed Analysis` sections.
+Per [`templates/sub_agent_spec.md`](../templates/sub_agent_spec.md), emit one P-item per entry in ascending `rank` order; ground **Insight** / **Action** / **Reasoning for Slowdown** in the `members[]` rows (their `operation`, `efficiency_pct`, `time_ms`, `library`) using the Action Prose Guidance, Expected Efficiency, and Common Patterns below. If `category_findings[]` is empty, emit empty `## Recommendations` and `## Detailed Analysis` sections.
 
 **Markers required:** wrap every `**Impact**` line in `<!-- impact-begin kind=p_item ... --> ... <!-- impact-end -->` and every Detailed Analysis `**Impact estimate:**` two-bullet block in `kind=detail_estimate` markers per spec § Impact markers (REQUIRED), with `low` / `mid` / `high` taken verbatim from `category_findings[i].impact_score{,_low,_high}`.
 
@@ -157,7 +157,7 @@ Vendor/library/framework-agnostic. Pick the row matching `category_findings[i].b
 
 ## Trace observability (category-specific)
 
-The universal CANNOT Infer rows in [`sub_agent_spec.md`](../utils/templates/sub_agent_spec.md) always apply. In addition, Convolution analysis cannot observe:
+The universal CANNOT Infer rows in [`sub_agent_spec.md`](../templates/sub_agent_spec.md) always apply. In addition, Convolution analysis cannot observe:
 
 | NOT observable | Why | Fallback prose |
 |----------------|-----|----------------|
@@ -167,7 +167,7 @@ The universal CANNOT Infer rows in [`sub_agent_spec.md`](../utils/templates/sub_
 
 ## Validate findings
 
-Per [`sub_agent_spec.md`](../utils/templates/sub_agent_spec.md) § Validate findings, run:
+Per [`sub_agent_spec.md`](../templates/sub_agent_spec.md) § Validate findings, run:
 
 ```bash
 <prefix> python3 -c "
diff --git a/skills/analysis-orchestrator/agents/cpu-idle-analyzer.md b/skills/analysis-orchestrator/agents/cpu-idle-analyzer.md
index 0ff5055..502b3e0 100644
--- a/skills/analysis-orchestrator/agents/cpu-idle-analyzer.md
+++ b/skills/analysis-orchestrator/agents/cpu-idle-analyzer.md
@@ -129,13 +129,13 @@ emit one merged recommendation card with combined evidence.
 **Action**: [Specific steps to take]
 ```
 
-**Detailed Analysis block:** Follow [`utils/templates/sub_agent_spec.md`](../utils/templates/sub_agent_spec.md) for the full block schema.
+**Detailed Analysis block:** Follow [`templates/sub_agent_spec.md`](../templates/sub_agent_spec.md) for the full block schema.
 
 **Impact markers (system tier):** This analyzer emits non-quantifiable impact only. Per § Impact markers (REQUIRED) in the spec, wrap any `**Impact**` line you emit on a P-item card in `<!-- impact-begin kind=p_item low=null mid=null high=null -->` ... `<!-- impact-end -->`. Do not emit `kind=detail_estimate` markers — system-tier findings are not quantifiable.
 
 ### Step 3.1: Validate Findings
 
-Per [`sub_agent_spec.md`](../utils/templates/sub_agent_spec.md) § Validate findings, run:
+Per [`sub_agent_spec.md`](../templates/sub_agent_spec.md) § Validate findings, run:
 
 ```bash
 <prefix> python3 -c "
diff --git a/skills/analysis-orchestrator/agents/elementwise-analyzer.md b/skills/analysis-orchestrator/agents/elementwise-analyzer.md
index 7babc39..1a8e2a4 100644
--- a/skills/analysis-orchestrator/agents/elementwise-analyzer.md
+++ b/skills/analysis-orchestrator/agents/elementwise-analyzer.md
@@ -95,7 +95,7 @@ Baseline ops anchor the bandwidth comparison — if a baseline op underperforms
 - **Standalone:** Treat `efficiency_percent` as **% of roofline**.
 - **Comparative:** Treat `efficiency_percent` as **100 × (trace2 kernel time) / (trace1 kernel time)**.
 
-Per [`utils/templates/sub_agent_spec.md`](../utils/templates/sub_agent_spec.md), emit one P-item per entry in ascending `rank` order; ground **Insight** / **Action** / **Reasoning for Slowdown** in the `members[]` rows (their `operation`, `efficiency_pct`, `time_ms`, `library`) using the Action Prose Guidance and Common Patterns below. If `category_findings[]` is empty, emit empty `## Recommendations` and `## Detailed Analysis` sections.
+Per [`templates/sub_agent_spec.md`](../templates/sub_agent_spec.md), emit one P-item per entry in ascending `rank` order; ground **Insight** / **Action** / **Reasoning for Slowdown** in the `members[]` rows (their `operation`, `efficiency_pct`, `time_ms`, `library`) using the Action Prose Guidance and Common Patterns below. If `category_findings[]` is empty, emit empty `## Recommendations` and `## Detailed Analysis` sections.
 
 **Markers required:** wrap every `**Impact**` line in `<!-- impact-begin kind=p_item ... --> ... <!-- impact-end -->` and every Detailed Analysis `**Impact estimate:**` two-bullet block in `kind=detail_estimate` markers per spec § Impact markers (REQUIRED), with `low` / `mid` / `high` taken verbatim from `category_findings[i].impact_score{,_low,_high}`.
 
@@ -130,7 +130,7 @@ Vendor/library/framework-agnostic. Pick the row matching `category_findings[i].b
 
 ## Trace observability (category-specific)
 
-The universal CANNOT Infer rows in [`sub_agent_spec.md`](../utils/templates/sub_agent_spec.md) always apply. In addition, elementwise analysis cannot observe:
+The universal CANNOT Infer rows in [`sub_agent_spec.md`](../templates/sub_agent_spec.md) always apply. In addition, elementwise analysis cannot observe:
 
 | NOT observable | Why | Fallback prose |
 |----------------|-----|----------------|
@@ -140,7 +140,7 @@ The universal CANNOT Infer rows in [`sub_agent_spec.md`](../utils/templates/sub_
 
 ## Validate findings
 
-Per [`sub_agent_spec.md`](../utils/templates/sub_agent_spec.md) § Validate findings, run:
+Per [`sub_agent_spec.md`](../templates/sub_agent_spec.md) § Validate findings, run:
 
 ```bash
 <prefix> python3 -c "
diff --git a/skills/analysis-orchestrator/agents/gemm-analyzer.md b/skills/analysis-orchestrator/agents/gemm-analyzer.md
index e5afdf7..8f0008b 100644
--- a/skills/analysis-orchestrator/agents/gemm-analyzer.md
+++ b/skills/analysis-orchestrator/agents/gemm-analyzer.md
@@ -76,7 +76,7 @@ cat <output_dir>/category_data/gemm_metrics.json
 
 ### Step 3: Render P-items from `category_findings`
 
-Read `category_data/gemm_metrics.json::category_findings`. Per [`utils/templates/sub_agent_spec.md`](../utils/templates/sub_agent_spec.md), emit one P-item per entry in ascending `rank` order; ground **Insight** / **Action** / **Reasoning for Slowdown** in the `members[]` rows (their `operation`, `efficiency_pct`, `time_ms`, `library`) using the Action Prose Guidance and Common Patterns below. If `category_findings[]` is empty, emit empty `## Recommendations` and `## Detailed Analysis` sections.
+Read `category_data/gemm_metrics.json::category_findings`. Per [`templates/sub_agent_spec.md`](../templates/sub_agent_spec.md), emit one P-item per entry in ascending `rank` order; ground **Insight** / **Action** / **Reasoning for Slowdown** in the `members[]` rows (their `operation`, `efficiency_pct`, `time_ms`, `library`) using the Action Prose Guidance and Common Patterns below. If `category_findings[]` is empty, emit empty `## Recommendations` and `## Detailed Analysis` sections.
 
 **Markers required:** wrap every `**Impact**` line in `<!-- impact-begin kind=p_item ... --> ... <!-- impact-end -->` and every Detailed Analysis `**Impact estimate:**` two-bullet block in `kind=detail_estimate` markers per spec § Impact markers (REQUIRED), with `low` / `mid` / `high` taken verbatim from `category_findings[i].impact_score{,_low,_high}`.
 
@@ -120,7 +120,7 @@ Vendor/library/framework-agnostic. Pick the row matching `category_findings[i].b
 
 ## Trace observability (category-specific)
 
-The universal CANNOT Infer rows in [`sub_agent_spec.md`](../utils/templates/sub_agent_spec.md) always apply. In addition, GEMM analysis cannot observe:
+The universal CANNOT Infer rows in [`sub_agent_spec.md`](../templates/sub_agent_spec.md) always apply. In addition, GEMM analysis cannot observe:
 
 | NOT observable | Why | Fallback prose |
 |----------------|-----|----------------|
@@ -131,7 +131,7 @@ The universal CANNOT Infer rows in [`sub_agent_spec.md`](../utils/templates/sub_
 
 ## Validate findings
 
-Per [`sub_agent_spec.md`](../utils/templates/sub_agent_spec.md) § Validate findings, run:
+Per [`sub_agent_spec.md`](../templates/sub_agent_spec.md) § Validate findings, run:
 
 ```bash
 <prefix> python3 -c "
diff --git a/skills/analysis-orchestrator/agents/generic-op-analyzer.md b/skills/analysis-orchestrator/agents/generic-op-analyzer.md
index 77d9508..f91e129 100644
--- a/skills/analysis-orchestrator/agents/generic-op-analyzer.md
+++ b/skills/analysis-orchestrator/agents/generic-op-analyzer.md
@@ -83,9 +83,9 @@ cat <output_dir>/category_data/<cat>_metrics.json
 
 ### Step 3: Render P-items from `category_findings`
 
-Read `category_data/<cat>_metrics.json::category_findings`. Per [`utils/templates/sub_agent_spec.md`](../utils/templates/sub_agent_spec.md), emit one P-item per entry in ascending `rank` order. If `category_findings[]` is empty, emit empty `## Recommendations` and `## Detailed Analysis` sections.
+Read `category_data/<cat>_metrics.json::category_findings`. Per [`templates/sub_agent_spec.md`](../templates/sub_agent_spec.md), emit one P-item per entry in ascending `rank` order. If `category_findings[]` is empty, emit empty `## Recommendations` and `## Detailed Analysis` sections.
 
-Entries whose `estimate_method == "heuristic"` (op with no perf model) carry a numeric **estimated** impact derived from E2E share and rank by `impact_score` like any other compute finding — follow [`sub_agent_spec.md`](../utils/templates/sub_agent_spec.md) § Heuristic findings.
+Entries whose `estimate_method == "heuristic"` (op with no perf model) carry a numeric **estimated** impact derived from E2E share and rank by `impact_score` like any other compute finding — follow [`sub_agent_spec.md`](../templates/sub_agent_spec.md) § Heuristic findings.
 
 **efficiency_percent semantics:**
 - **Standalone:** Treat `efficiency_percent` as **% of roofline**.
@@ -94,8 +94,8 @@ Entries whose `estimate_method == "heuristic"` (op with no perf model) carry a n
 For each surviving entry:
 
 1. **Resolve what each member actually does.** Walk `members[]` and for every entry combine the `operation` name, kernel details, and `module_chain` context from `operations[]` to identify the real workload (e.g. embedding lookup, scatter/gather, custom layer). Call out miscategorization explicitly when the trace label is misleading.
-2. **Render the P-item.** Ground **Insight** / **Action** / **Reasoning for Slowdown** in the `members[]` rows (`operation`, `efficiency_pct`, `library`) plus the resolved purpose from step 1, using the Action Prose Guidance and Common Patterns below. The P-item heading must include the `(<Library>)` suffix per [`sub_agent_spec.md`](../utils/templates/sub_agent_spec.md) § Recommendations: use `category_findings[i].library` as the value (e.g. `(vLLM)` for an aggregated InferenceAttention finding whose members are all vLLM ops). Omit the parenthetical only when the value is `Unknown`.
-3. **Annotate the Data table.** Extend the **Data:** operations table with a `Sub-Category` column from `operations[i].classification` when populated. Even when the finding has a single `members[]` row (e.g. aggregated InferenceAttention with one operation), render the canonical 10-column horizontal Operations Table from [`sub_agent_spec.md`](../utils/templates/sub_agent_spec.md) § Operations Table Schema. Do not substitute a vertical `Metric | Value` table — that schema is system-tier only.
+2. **Render the P-item.** Ground **Insight** / **Action** / **Reasoning for Slowdown** in the `members[]` rows (`operation`, `efficiency_pct`, `library`) plus the resolved purpose from step 1, using the Action Prose Guidance and Common Patterns below. The P-item heading must include the `(<Library>)` suffix per [`sub_agent_spec.md`](../templates/sub_agent_spec.md) § Recommendations: use `category_findings[i].library` as the value (e.g. `(vLLM)` for an aggregated InferenceAttention finding whose members are all vLLM ops). Omit the parenthetical only when the value is `Unknown`.
+3. **Annotate the Data table.** Extend the **Data:** operations table with a `Sub-Category` column from `operations[i].classification` when populated. Even when the finding has a single `members[]` row (e.g. aggregated InferenceAttention with one operation), render the canonical 10-column horizontal Operations Table from [`sub_agent_spec.md`](../templates/sub_agent_spec.md) § Operations Table Schema. Do not substitute a vertical `Metric | Value` table — that schema is system-tier only.
 
 **Markers required:** wrap every `**Impact**` line in `<!-- impact-begin kind=p_item ... --> ... <!-- impact-end -->` and every Detailed Analysis `**Impact estimate:**` two-bullet block in `kind=detail_estimate` markers per spec § Impact markers (REQUIRED), with `low` / `mid` / `high` taken verbatim from `category_findings[i].impact_score{,_low,_high}`.
 
@@ -136,7 +136,7 @@ Vendor/library/framework-agnostic. Pick the row matching `category_findings[i].b
 
 ## Trace observability (category-specific)
 
-The universal CANNOT Infer rows in [`sub_agent_spec.md`](../utils/templates/sub_agent_spec.md) always apply. In addition, uncategorized-op analysis cannot observe:
+The universal CANNOT Infer rows in [`sub_agent_spec.md`](../templates/sub_agent_spec.md) always apply. In addition, uncategorized-op analysis cannot observe:
 
 | NOT observable | Why | Fallback prose |
 |----------------|-----|----------------|
@@ -147,7 +147,7 @@ The universal CANNOT Infer rows in [`sub_agent_spec.md`](../utils/templates/sub_
 
 ## Validate findings
 
-Per [`sub_agent_spec.md`](../utils/templates/sub_agent_spec.md) § Validate findings, run:
+Per [`sub_agent_spec.md`](../templates/sub_agent_spec.md) § Validate findings, run:
 
 ```bash
 <prefix> python3 -c "
diff --git a/skills/analysis-orchestrator/agents/kernel-fusion-analyzer.md b/skills/analysis-orchestrator/agents/kernel-fusion-analyzer.md
index c2be75e..9dbc052 100644
--- a/skills/analysis-orchestrator/agents/kernel-fusion-analyzer.md
+++ b/skills/analysis-orchestrator/agents/kernel-fusion-analyzer.md
@@ -149,7 +149,7 @@ Use the `confidence` from `kernel_fusion_metrics.json` when available. Otherwise
 
 Write `<output_dir>/system_findings/kernel_fusion_findings.md` using the command prefix.
 
-**Pay particular attention to § Impact markers (REQUIRED) in [`sub_agent_spec.md`](../utils/templates/sub_agent_spec.md).** Every P-item `**Impact**` line and every Detailed Analysis `**Impact estimate:**` two-bullet block must be wrapped in `<!-- impact-begin kind=... -->` ... `<!-- impact-end -->` markers using the `low`/`mid`/`high` impact_score values from `category_data/kernel_fusion_metrics.json::impact_estimates[]`.
+**Pay particular attention to § Impact markers (REQUIRED) in [`sub_agent_spec.md`](../templates/sub_agent_spec.md).** Every P-item `**Impact**` line and every Detailed Analysis `**Impact estimate:**` two-bullet block must be wrapped in `<!-- impact-begin kind=... -->` ... `<!-- impact-end -->` markers using the `low`/`mid`/`high` impact_score values from `category_data/kernel_fusion_metrics.json::impact_estimates[]`.
 
 Number findings P1, P2, P3... sequentially by impact_score (highest first). The icon is set ONLY by the `confidence` field in `kernel_fusion_metrics.json`:
 
@@ -263,7 +263,7 @@ Then proceed directly to Step 4.1 validation.
 
 ### Step 4.1: Validate Findings
 
-Per [`sub_agent_spec.md`](../utils/templates/sub_agent_spec.md) § Validate findings, run:
+Per [`sub_agent_spec.md`](../templates/sub_agent_spec.md) § Validate findings, run:
 
 ```bash
 <prefix> python3 -c "
diff --git a/skills/analysis-orchestrator/agents/moe-analyzer.md b/skills/analysis-orchestrator/agents/moe-analyzer.md
index 127688d..b8bd22d 100644
--- a/skills/analysis-orchestrator/agents/moe-analyzer.md
+++ b/skills/analysis-orchestrator/agents/moe-analyzer.md
@@ -85,7 +85,7 @@ The byte estimation for MoE is an **average-case approximation** under uniform r
 - **Standalone:** Treat `efficiency_percent` as **% of roofline**.
 - **Comparative:** Treat `efficiency_percent` as **100 × (trace2 kernel time) / (trace1 kernel time)**.
 
-Read `category_data/<cat>_metrics.json::category_findings`. Per [`utils/templates/sub_agent_spec.md`](../utils/templates/sub_agent_spec.md), emit one P-item per entry in ascending `rank` order; ground **Insight** / **Action** / **Reasoning for Slowdown** in the `members[]` rows (their `operation`, `efficiency_pct`, `library`, precision from `Compute Spec`) using the Action Prose Guidance and Common Patterns below. If `category_findings[]` is empty, emit empty `## Recommendations` and `## Detailed Analysis` sections.
+Read `category_data/<cat>_metrics.json::category_findings`. Per [`templates/sub_agent_spec.md`](../templates/sub_agent_spec.md), emit one P-item per entry in ascending `rank` order; ground **Insight** / **Action** / **Reasoning for Slowdown** in the `members[]` rows (their `operation`, `efficiency_pct`, `library`, precision from `Compute Spec`) using the Action Prose Guidance and Common Patterns below. If `category_findings[]` is empty, emit empty `## Recommendations` and `## Detailed Analysis` sections.
 
 **Markers required:** wrap every `**Impact**` line in `<!-- impact-begin kind=p_item ... --> ... <!-- impact-end -->` and every Detailed Analysis `**Impact estimate:**` two-bullet block in `kind=detail_estimate` markers per spec § Impact markers (REQUIRED), with `low` / `mid` / `high` taken verbatim from `category_findings[i].impact_score{,_low,_high}`.
 
@@ -136,7 +136,7 @@ Vendor/library/framework-agnostic. Pick the row matching `category_findings[i].b
 
 ## Trace observability (category-specific)
 
-The universal CANNOT Infer rows in [`sub_agent_spec.md`](../utils/templates/sub_agent_spec.md) always apply. In addition, MoE workloads have these blind spots:
+The universal CANNOT Infer rows in [`sub_agent_spec.md`](../templates/sub_agent_spec.md) always apply. In addition, MoE workloads have these blind spots:
 
 | NOT observable | Why | Fallback prose |
 |----------------|-----|----------------|
@@ -149,7 +149,7 @@ The universal CANNOT Infer rows in [`sub_agent_spec.md`](../utils/templates/sub_
 
 ## Validate findings
 
-Per [`sub_agent_spec.md`](../utils/templates/sub_agent_spec.md) § Validate findings, run:
+Per [`sub_agent_spec.md`](../templates/sub_agent_spec.md) § Validate findings, run:
 
 ```bash
 <prefix> python3 -c "
diff --git a/skills/analysis-orchestrator/agents/multi-kernel-analyzer.md b/skills/analysis-orchestrator/agents/multi-kernel-analyzer.md
index 2c91157..20fed16 100644
--- a/skills/analysis-orchestrator/agents/multi-kernel-analyzer.md
+++ b/skills/analysis-orchestrator/agents/multi-kernel-analyzer.md
@@ -237,13 +237,13 @@ Recommendation quality requirements (apply before writing):
 
 ```
 
-**Detailed Analysis block:** Follow [`utils/templates/sub_agent_spec.md`](../utils/templates/sub_agent_spec.md) for the full block schema.
+**Detailed Analysis block:** Follow [`templates/sub_agent_spec.md`](../templates/sub_agent_spec.md) for the full block schema.
 
 **Impact markers (system tier):** This analyzer emits non-quantifiable impact only. Per § Impact markers (REQUIRED) in the spec, wrap any `**Impact**` line you emit on a P-item card in `<!-- impact-begin kind=p_item low=null mid=null high=null -->` ... `<!-- impact-end -->`. Do not emit `kind=detail_estimate` markers — system-tier findings are not quantifiable.
 
 ### Step 7.1: Validate Findings
 
-Per [`sub_agent_spec.md`](../utils/templates/sub_agent_spec.md) § Validate findings, run:
+Per [`sub_agent_spec.md`](../templates/sub_agent_spec.md) § Validate findings, run:
 
 ```bash
 <prefix> python3 -c "
diff --git a/skills/analysis-orchestrator/agents/norm-analyzer.md b/skills/analysis-orchestrator/agents/norm-analyzer.md
index 2d419cf..a9ab574 100644
--- a/skills/analysis-orchestrator/agents/norm-analyzer.md
+++ b/skills/analysis-orchestrator/agents/norm-analyzer.md
@@ -93,7 +93,7 @@ Different norm variants have different efficiency characteristics due to their k
 - **Standalone:** Treat `efficiency_percent` as **% of roofline**.
 - **Comparative:** Treat `efficiency_percent` as **100 × (trace2 kernel time) / (trace1 kernel time)**.
 
-Per [`utils/templates/sub_agent_spec.md`](../utils/templates/sub_agent_spec.md), emit one P-item per entry in ascending `rank` order; ground **Insight** / **Action** / **Reasoning for Slowdown** in the `members[]` rows (their `operation`, `efficiency_pct`, `library`) using the Action Prose Guidance and Common Patterns below. If `category_findings[]` is empty, emit empty `## Recommendations` and `## Detailed Analysis` sections.
+Per [`templates/sub_agent_spec.md`](../templates/sub_agent_spec.md), emit one P-item per entry in ascending `rank` order; ground **Insight** / **Action** / **Reasoning for Slowdown** in the `members[]` rows (their `operation`, `efficiency_pct`, `library`) using the Action Prose Guidance and Common Patterns below. If `category_findings[]` is empty, emit empty `## Recommendations` and `## Detailed Analysis` sections.
 
 **Markers required:** wrap every `**Impact**` line in `<!-- impact-begin kind=p_item ... --> ... <!-- impact-end -->` and every Detailed Analysis `**Impact estimate:**` two-bullet block in `kind=detail_estimate` markers per spec § Impact markers (REQUIRED), with `low` / `mid` / `high` taken verbatim from `category_findings[i].impact_score{,_low,_high}`.
 
@@ -135,7 +135,7 @@ Vendor/library/framework-agnostic. Pick the row matching `category_findings[i].b
 
 ## Trace observability (category-specific)
 
-The universal CANNOT Infer rows in [`sub_agent_spec.md`](../utils/templates/sub_agent_spec.md) always apply. In addition, normalization analysis cannot observe:
+The universal CANNOT Infer rows in [`sub_agent_spec.md`](../templates/sub_agent_spec.md) always apply. In addition, normalization analysis cannot observe:
 
 | NOT observable | Why | Fallback prose |
 |----------------|-----|----------------|
@@ -145,7 +145,7 @@ The universal CANNOT Infer rows in [`sub_agent_spec.md`](../utils/templates/sub_
 
 ## Validate findings
 
-Per [`sub_agent_spec.md`](../utils/templates/sub_agent_spec.md) § Validate findings, run:
+Per [`sub_agent_spec.md`](../templates/sub_agent_spec.md) § Validate findings, run:
 
 ```bash
 <prefix> python3 -c "
diff --git a/skills/analysis-orchestrator/agents/reduce-analyzer.md b/skills/analysis-orchestrator/agents/reduce-analyzer.md
index 065ffac..c90c90a 100644
--- a/skills/analysis-orchestrator/agents/reduce-analyzer.md
+++ b/skills/analysis-orchestrator/agents/reduce-analyzer.md
@@ -88,7 +88,7 @@ Each `category_findings[i].members[j].operation` carries a torch op name (e.g. `
 
 ### Step 4: Render P-items from `category_findings`
 
-Per [`utils/templates/sub_agent_spec.md`](../utils/templates/sub_agent_spec.md), emit one P-item per entry in ascending `rank` order; ground **Insight** / **Action** / **Reasoning for Slowdown** in the `members[]` rows (their `operation`, `efficiency_pct`, `library`) using the Action Prose Guidance and Common Patterns below. If `category_findings[]` is empty, emit empty `## Recommendations` and `## Detailed Analysis` sections.
+Per [`templates/sub_agent_spec.md`](../templates/sub_agent_spec.md), emit one P-item per entry in ascending `rank` order; ground **Insight** / **Action** / **Reasoning for Slowdown** in the `members[]` rows (their `operation`, `efficiency_pct`, `library`) using the Action Prose Guidance and Common Patterns below. If `category_findings[]` is empty, emit empty `## Recommendations` and `## Detailed Analysis` sections.
 
 **efficiency_percent semantics:**
 - **Standalone:** Treat `efficiency_percent` as **% of roofline**.
@@ -122,7 +122,7 @@ Vendor/library/framework-agnostic. Pick the row matching `category_findings[i].b
 
 ## Trace observability (category-specific)
 
-The universal CANNOT Infer rows in [`sub_agent_spec.md`](../utils/templates/sub_agent_spec.md) always apply. In addition, reduce analysis cannot observe:
+The universal CANNOT Infer rows in [`sub_agent_spec.md`](../templates/sub_agent_spec.md) always apply. In addition, reduce analysis cannot observe:
 
 | NOT observable | Why | Fallback prose |
 |----------------|-----|----------------|
@@ -132,7 +132,7 @@ The universal CANNOT Infer rows in [`sub_agent_spec.md`](../utils/templates/sub_
 
 ## Validate findings
 
-Per [`sub_agent_spec.md`](../utils/templates/sub_agent_spec.md) § Validate findings, run:
+Per [`sub_agent_spec.md`](../templates/sub_agent_spec.md) § Validate findings, run:
 
 ```bash
 <prefix> python3 -c "
diff --git a/skills/analysis-orchestrator/agents/sdpa-analyzer.md b/skills/analysis-orchestrator/agents/sdpa-analyzer.md
index 3c2f009..053d947 100644
--- a/skills/analysis-orchestrator/agents/sdpa-analyzer.md
+++ b/skills/analysis-orchestrator/agents/sdpa-analyzer.md
@@ -88,7 +88,7 @@ Reference the detected implementation in the **Identification** prose of every f
 
 ### Step 3: Render P-items from `category_findings`
 
-Read `category_data/<sdpa>_metrics.json::category_findings`. Per [`utils/templates/sub_agent_spec.md`](../utils/templates/sub_agent_spec.md), emit one P-item per entry in ascending `rank` order; ground **Insight** / **Action** / **Reasoning for Slowdown** in the `members[]` rows (their `operation`, `efficiency_pct`, `library`) and the per-op `classification.kernel_breakdown` / `classification.workload_profile` (Paged) using the Action Prose Guidance, Expected Efficiency, and Common Patterns below. For Paged Attention, extend the **Data:** operations table with kernel-breakdown component, workload type, and attention pattern columns when populated. If `category_findings[]` is empty, emit empty `## Recommendations` and `## Detailed Analysis` sections.
+Read `category_data/<sdpa>_metrics.json::category_findings`. Per [`templates/sub_agent_spec.md`](../templates/sub_agent_spec.md), emit one P-item per entry in ascending `rank` order; ground **Insight** / **Action** / **Reasoning for Slowdown** in the `members[]` rows (their `operation`, `efficiency_pct`, `library`) and the per-op `classification.kernel_breakdown` / `classification.workload_profile` (Paged) using the Action Prose Guidance, Expected Efficiency, and Common Patterns below. For Paged Attention, extend the **Data:** operations table with kernel-breakdown component, workload type, and attention pattern columns when populated. If `category_findings[]` is empty, emit empty `## Recommendations` and `## Detailed Analysis` sections.
 
 **efficiency_percent semantics:**
 - **Standalone:** Treat `efficiency_percent` as **% of roofline**.
@@ -175,7 +175,7 @@ Short sequences naturally show lower efficiency — do NOT call low efficiency a
 
 ## Trace observability (category-specific)
 
-The universal CANNOT Infer rows in [`sub_agent_spec.md`](../utils/templates/sub_agent_spec.md) always apply. In addition, SDPA analysis cannot observe:
+The universal CANNOT Infer rows in [`sub_agent_spec.md`](../templates/sub_agent_spec.md) always apply. In addition, SDPA analysis cannot observe:
 
 **Flash / Standard Attention:**
 
@@ -193,7 +193,7 @@ The universal CANNOT Infer rows in [`sub_agent_spec.md`](../utils/templates/sub_
 
 ## Validate findings
 
-Per [`sub_agent_spec.md`](../utils/templates/sub_agent_spec.md) § Validate findings, run:
+Per [`sub_agent_spec.md`](../templates/sub_agent_spec.md) § Validate findings, run:
 
 ```bash
 <prefix> python3 -c "
diff --git a/skills/analysis-orchestrator/agents/triton-analyzer.md b/skills/analysis-orchestrator/agents/triton-analyzer.md
index 9486650..1632590 100644
--- a/skills/analysis-orchestrator/agents/triton-analyzer.md
+++ b/skills/analysis-orchestrator/agents/triton-analyzer.md
@@ -92,7 +92,7 @@ The fused ATen ops are encoded in the kernel name after the prefix (e.g. `triton
 - **Standalone:** Treat `efficiency_percent` as **% of roofline**.
 - **Comparative:** Treat `efficiency_percent` as **100 × (trace2 kernel time) / (trace1 kernel time)**.
 
-Per [`utils/templates/sub_agent_spec.md`](../utils/templates/sub_agent_spec.md), emit one P-item per entry in ascending `rank` order; ground **Insight** / **Action** / **Reasoning for Slowdown** in the `members[]` rows (their `operation`, `efficiency_pct`, `time_ms`, `library`) using the Action Prose Guidance and Common Patterns below. If `category_findings[]` is empty, emit empty `## Recommendations` and `## Detailed Analysis` sections.
+Per [`templates/sub_agent_spec.md`](../templates/sub_agent_spec.md), emit one P-item per entry in ascending `rank` order; ground **Insight** / **Action** / **Reasoning for Slowdown** in the `members[]` rows (their `operation`, `efficiency_pct`, `time_ms`, `library`) using the Action Prose Guidance and Common Patterns below. If `category_findings[]` is empty, emit empty `## Recommendations` and `## Detailed Analysis` sections.
 
 **Markers required:** wrap every `**Impact**` line in `<!-- impact-begin kind=p_item ... --> ... <!-- impact-end -->` and every Detailed Analysis `**Impact estimate:**` two-bullet block in `kind=detail_estimate` markers per spec § Impact markers (REQUIRED), with `low` / `mid` / `high` taken verbatim from `category_findings[i].impact_score{,_low,_high}`.
 
@@ -127,7 +127,7 @@ Vendor/library/framework-agnostic. Pick the row matching `category_findings[i].b
 
 ## Trace observability (category-specific)
 
-The universal CANNOT Infer rows in [`sub_agent_spec.md`](../utils/templates/sub_agent_spec.md) always apply. In addition, Triton fused-kernel analysis cannot observe:
+The universal CANNOT Infer rows in [`sub_agent_spec.md`](../templates/sub_agent_spec.md) always apply. In addition, Triton fused-kernel analysis cannot observe:
 
 | NOT observable | Why | Fallback prose |
 |----------------|-----|----------------|
@@ -138,7 +138,7 @@ The universal CANNOT Infer rows in [`sub_agent_spec.md`](../utils/templates/sub_
 
 ## Validate findings
 
-Per [`sub_agent_spec.md`](../utils/templates/sub_agent_spec.md) § Validate findings, run:
+Per [`sub_agent_spec.md`](../templates/sub_agent_spec.md) § Validate findings, run:
 
 ```bash
 <prefix> python3 -c "
diff --git a/skills/analysis-orchestrator/reference.md b/skills/analysis-orchestrator/reference.md
index dbb0e91..3230af3 100644
--- a/skills/analysis-orchestrator/reference.md
+++ b/skills/analysis-orchestrator/reference.md
@@ -509,7 +509,7 @@ If the plot fails (extension-absent branch), retry once. If still failing, proce
 
 **CRITICAL: Do NOT delegate Step 11 to a Task subagent.** The orchestrator must write the report directly.
 
-1. **Read** the report template: `TraceLens/Agent/Analysis/utils/templates/analysis_template.md`
+1. **Read** the report template: `TraceLens/Agent/Analysis/skills/analysis-orchestrator/templates/analysis_template.md`
 2. **Write the report in sections** to `<output_dir>/analysis.md` using **only** `<prefix> tee` / `<prefix> tee -a` with single-quoted heredoc delimiters (see write order below). You MUST NOT use the IDE Write tool, Edit tool, StrReplace tool, `cat >`, `echo >`, `>>` redirect, or any other write method for `analysis.md` unless tee fails.
 3. **Fill in** each section by substituting placeholders with actual data. Never retain template placeholders (`<Brief Title>`, `X ms`, `Y%`, `<platform>`, `<model>`) — every field must contain actual data.
 
diff --git a/skills/analysis-orchestrator/templates/analysis_template.md b/skills/analysis-orchestrator/templates/analysis_template.md
new file mode 100644
index 0000000..d3fd5f0
--- /dev/null
+++ b/skills/analysis-orchestrator/templates/analysis_template.md
@@ -0,0 +1,449 @@
+<!--
+Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved.
+
+See LICENSE for license information.
+-->
+
+<!--
+=== FORMATTING RULES (for the agent filling in this template) ===
+
+=== MODE SELECTION ===
+This template supports two modes determined by `comparison_scope`:
+  - **standalone**: Single-trace roofline analysis (default). Use sections marked STANDALONE.
+  - **comparative**: Two-trace analysis (Trace 1 = primary, Trace 2 = target). Use sections marked COMPARATIVE.
+When filling in this template, select the block matching the active `comparison_scope` for each
+section that has STANDALONE / COMPARATIVE variants. Delete the unused variant.
+
+=== COMPARATIVE TERMINOLOGY ===
+  - **Trace 1** = primary trace. **Trace 2** = target/comparison trace.
+  - Impact semantics: standalone uses roofline gap; comparative uses
+    trace 2 kernel time as the optimization target (gap = trace1 time − trace2 time).
+  - Comparative speed semantics: express as "X% faster" or "X% slower" relative to Trace 1. If t2 < t1 → "X% faster"; if t2 > t1 → "X% slower".
+    Standalone efficiency semantics: % of roofline (unchanged).
+
+=== GENERAL RULES ===
+1. Warnings section: Only include if there were errors or high-variance operations; omit entirely if all succeeded and no variance flags.
+2. Executive Summary: Max ~20 lines.
+3. Performance plot: The {{PERF_PLOT}} placeholder is replaced by Step 11.3 with a base64-embedded
+   PNG data URI (![Performance Breakdown](data:image/png;base64,...)) of a single horizontal stacked
+   bar showing the run's compute-time breakdown by kernel category. The plot is purely descriptive
+   (no error bars, no throughput cone, no savings estimates). If the plot was not generated
+   (Step 10.3 / Step 11.2 failed), the placeholder is removed.
+4. Compute Kernel Optimizations: One P-item per entry in `priority_data.json::findings[]`,
+   numbered P1, P2, ... in `findings[]` order (already globally sorted by `impact_score`). The
+   P-item Impact line uses the canonical mid `impact_score` value; low/high values appear only
+   in Detailed Analysis. Cards join the corresponding sub-agent's Detailed Analysis block by
+   `(findings[i].category, findings[i].category_rank)`.
+5. System-Level Optimizations: If all system-level analyses report no actionable issues
+   (NONE/N/A severity), use a single "✅ No system-level bottlenecks detected" summary instead of
+   P1/P2/P3 recommendations. Only generate numbered priorities when at least one actionable issue
+   exists (number sequentially from P1, including CPU/Idle first if invoked).
+6. Each section is independently composable -- can be shared standalone.
+7. All three tiers (Compute, Kernel Fusion, System) use separate sequential P1/P2/P3 numbering (no gaps).
+8. Priority icons are assigned by PRIORITY NUMBER, not severity:
+   - Compute Kernel: 🔴 P1 → 🟡 P2 → 🟢 P3 → 🟢 P4 ...
+   - Kernel Fusion: icon by confidence (🔴 high → 🟡 medium → 🟢 low), not priority number
+   - System-Level: 🔴 P1 → 🟡 P2 → 🟢 P3 → 🟢 P4 ... (only when actionable issues exist)
+9. Field labels — each section uses EXACTLY these labels:
+
+   OPTIMIZATION CARDS (§Compute Kernel Optimizations, §Kernel Fusion, §System-Level):
+   - Compute Kernel P-items: **Insight** / **Action** / **Impact**
+   - Kernel Fusion P-items:  **Insight** / **Action** / **Impact** / **Confidence**
+   - System-Level P-items:   **Insight** / **Action** / **Impact**
+
+   DETAILED ANALYSIS (§Detailed Analysis only):
+   - Compute / System blocks: **Identification:** / **Data:** / **Reasoning for Slowdown:** / **Resolution:** / **Impact estimate:**
+   - Kernel Fusion blocks:    **Identification:** / **Data:** / **Impact estimate:**
+
+10. Detailed Analysis: three subsections (`### Compute Kernel Insights`, `### Kernel Fusion Insights`, `### System-Level Insights`) with `#### 🔴/🟡/🟢 Pn: <Brief Title>` blocks matching card titles and order.
+11. Model and appendix: Use `model_info["model"]` from `metadata/model_info.json` for the
+    report title (fall back to "Workload" if "Cannot be inferred from trace"). Fill Appendix
+    **Model Architecture** with the raw `model`, `architecture`, `scale`, `precision` values.
+12. Library parenthetical: Compute Kernel card titles and Detailed Analysis headings must include
+    the library name(s) in parentheses when present in the sub-agent findings. Omit when no
+    library is identified. System-Level and Kernel Fusion titles do NOT include a library
+    parenthetical.
+-->
+
+<!-- === STANDALONE title === -->
+# <Model> - <Platform> Standalone Analysis
+
+<!-- === COMPARATIVE title === -->
+# <Model> - Comparative Analysis: <Platform1> vs <Platform2>
+
+## Executive Summary
+
+<!-- === STANDALONE Executive Summary === -->
+[1 paragraph overview + key metrics table]
+
+<!-- MANDATORY: This table must contain exactly these 5 rows:
+     Total Time | Compute % | Idle % | Exposed Communication % | Top Bottleneck Category
+     Top Bottleneck Category V% = gpu_kernel_time_ms of top category / (gpu_utilization.total_time_ms * computation_time_percent / 100) -->
+| Metric | Value |
+|--------|-------|
+| Total Time | X ms |
+| Compute % | Y% |
+| Idle % | Z% |
+| Exposed Communication % | W% |
+| Top Bottleneck Category | Category (V%) |
+
+<!-- === COMPARATIVE Executive Summary === -->
+[1 paragraph comparative overview: summarize which trace is faster overall, by how much, and the dominant gap categories]
+
+<!-- Top Bottleneck Category X% = top category's gpu_kernel_time_ms / (manifest.gpu_utilization.total_time_ms * manifest.gpu_utilization.computation_time_percent / 100)
+     Top Bottleneck Category Y% = top category's gpu_kernel_time_ms / (manifest.trace2_gpu_utilization.total_time_ms * manifest.trace2_gpu_utilization.computation_time_percent / 100)
+     Difference = Trace 2 value − Trace 1 value -->
+| Metric | Trace 1 (<Platform1>) | Trace 2 (<Platform2>) | Difference |
+|--------|----------------------------|-------------------------------|------------|
+| Total Time | X ms | Y ms | +/-Z ms (+/-W%) |
+| Compute % | X% | Y% | +/-Z% |
+| Idle % | X% | Y% | +/-Z% |
+| Exposed Communication % | X% | Y% | +/-Z% |
+| Top Bottleneck Category | Category (X%) | Category (Y%) | — |
+
+{{PERF_PLOT}}
+
+## Warnings
+
+**Include this section ONLY if any subagent failed OR any operation has `high_variance: true` in `*_metrics.json`:**
+
+<!-- Subagent failures (if any): -->
+The following analyses could not be completed due to script failures:
+
+| Analysis | Tier | Error Summary |
+|----------|------|---------------|
+| <name> | System / Compute Kernel | <brief error description> |
+
+These are excluded from the recommendations below.
+
+<!-- Data quality warnings (if any operation has high_variance: true in *_metrics.json): -->
+**Data Quality:** The following operations have unreliable kernel time measurements (CoV > 1.0, indicating extreme variance across instances — likely a profiler timing artifact):
+
+| Operation | Category | CoV | Reported Time (ms) |
+|-----------|----------|-----|-------------------|
+| <name> | <category> | X.X | Y.Y |
+
+---
+
+## Compute Kernel Optimizations
+
+Findings from per-category kernel analysis (GEMM, SDPA, elementwise, etc.).
+Summaries of recommendations from Step 7 sub-agents, focused on individual kernel efficiency.
+
+### Top Operations
+
+One row per entry in `priority_data.json::priorities[]`, in array order (no manifest-sort, no extra rows). For row N (= `priorities[N-1]`): `Rank`/`Category` = `rank`/`display_name`; `Time (ms)` = matching `manifest.categories[].gpu_kernel_time_ms` (verbatim); `Ops` = matching `manifest.categories[].ops_count`; `% of Compute Time` = `Time (ms) / (gpu_utilization.total_time_ms * computation_time_percent / 100)`; trailer `low`/`high` = `priorities[N-1].impact_score_low`/`impact_score_high` (use `null` for `source: "manifest_fallback"`). Wrap the whole block (header + separator + rows) in the `kind=top_ops` marker.
+
+<!-- === STANDALONE Top Operations === -->
+<!-- impact-begin kind=top_ops -->
+| Rank | Category | Time (ms) | % of Compute Time | Ops |
+|------|----------|-----------|-------------------|-----|
+| 1 | ... | ... | ... | ... | <!-- top-ops-row low=<impact_score_low> high=<impact_score_high> -->
+<!-- impact-end -->
+
+<!-- === COMPARATIVE Top Operations === -->
+`Trace 2 Time (ms)` = matching `manifest.trace2_ops_summary_by_category[]["total_direct_kernel_time_ms"]` where `"op category"` matches the row Category **case-insensitively**; use — if no match.
+`Difference (ms)` = Trace 2 Time − Trace 1 Time.
+<!-- impact-begin kind=top_ops -->
+| Rank | Category | Trace 1 Time (ms) | Trace 2 Time (ms) | % of Compute Time | Ops | Difference (ms) |
+|------|----------|-------------------|-------------------|-------------------|-----|-----------------|
+| 1 | ... | ... | ... | ... | ... | +/-X.X or — | <!-- top-ops-row low=<impact_score_low> high=<impact_score_high> -->
+<!-- impact-end -->
+
+<!-- === NO ACTIONABLE FINDINGS (all quantified compute categories have empty category_findings[] in *_metrics.json) === -->
+<!-- Use when priority_data / per-category metrics show no compute P-items to render (category_findings[] empty everywhere that applies). -->
+✅ No compute kernel optimization opportunities identified. All categories are within target performance bounds.
+
+<!-- === ACTIONABLE FINDINGS (at least one compute category has P-items) === -->
+<!-- Icon mapping by PRIORITY NUMBER (not severity): P1=🔴, P2=🟡, P3+=🟢 -->
+<!-- One card per entry in priority_data.findings[] in array order. Title uses the entry's category and library; Action text is category-appropriate. Do NOT recommend "fuse the SDPA kernel" (already fused — defer upstream/downstream fusion to Kernel Fusion section). -->
+<!-- Skip categories that have empty category_findings[] in category_data/<cat>_metrics.json (no P-items for that category). -->
+<!-- Heuristic findings (priority_data.findings[i].estimate_method == "heuristic") carry a numeric estimated impact and sort by impact_score like any other compute finding; render per sub_agent_spec.md § Heuristic findings. -->
+
+### 🔴 P1: <Brief Title> (<Library>)
+
+**Insight**: [1 sentence - what's wrong]
+
+**Action**: [1-2 sentences - category-appropriate: GEMM fusion/tile/library; SDPA tile/backend; elementwise fusion; etc.]
+
+<!-- impact-begin kind=p_item category=<priority_data.findings[0].category> low=<priority_data.findings[0].impact_score_low> mid=<priority_data.findings[0].impact_score> high=<priority_data.findings[0].impact_score_high> -->
+**Impact**: [impact_score: X.X, OR "Not quantifiable from trace data"]
+<!-- impact-end -->
+
+→ *See [Detailed Analysis: Compute kernel insights > P1](#detailed-analysis-compute-p1) for details*
+
+---
+
+### 🟡 P2: <Brief Title> (<Library>)
+
+**Insight**: [1 sentence]
+
+**Action**: [1-2 sentences]
+
+<!-- impact-begin kind=p_item category=<priority_data.findings[1].category> low=<priority_data.findings[1].impact_score_low> mid=<priority_data.findings[1].impact_score> high=<priority_data.findings[1].impact_score_high> -->
+**Impact**: [impact_score: X.X, OR "Not quantifiable from trace data"]
+<!-- impact-end -->
+
+→ *See [Detailed Analysis: Compute kernel insights > P2](#detailed-analysis-compute-p2) for details*
+
+---
+
+### 🟢 P3: <Brief Title> (<Library>)
+
+**Insight**: [1 sentence]
+
+**Action**: [1-2 sentences]
+
+<!-- impact-begin kind=p_item category=<priority_data.findings[2].category> low=<priority_data.findings[2].impact_score_low> mid=<priority_data.findings[2].impact_score> high=<priority_data.findings[2].impact_score_high> -->
+**Impact**: [impact_score: X.X, OR "Not quantifiable from trace data"]
+<!-- impact-end -->
+
+→ *See [Detailed Analysis: Compute kernel insights > P3](#detailed-analysis-compute-p3) for details*
+
+<!-- All additional P-items (P4, P5, ...) follow the same pattern, sourcing markers from priority_data.findings[N-1]. Detailed Analysis links: → *See [Detailed Analysis: Compute kernel insights > PN](#detailed-analysis-compute-pN) for details* -->
+
+---
+
+## Kernel Fusion Opportunities (Experimental)
+<!-- === STANDALONE Kernel Fusion === -->
+> **Note:** Kernel fusion analysis is experimental. impact_score projections estimate the recoverable fraction of E2E with 85% memory/compute pipeline overlap. Kernels without perf models use their measured trace time as-is. Candidates where fewer than 75% of kernels have perf models are not reported. Actual recoverable time depends on implementation feasibility and interaction effects.
+<!-- === COMPARATIVE Kernel Fusion === -->
+> **Note:** Kernel fusion analysis is experimental.
+
+<!-- Populate from system_findings/kernel_fusion_findings.md if kernel_fusion category exists in manifest. -->
+<!-- Each finding uses Insight / Action / Impact / Confidence format, with Impact from kernel_fusion_metrics.json. -->
+<!-- P1/P2/P3+ ordered by confidence then kernel time. -->
+<!-- Icon mapping by CONFIDENCE (not priority number): 🔴 high → 🟡 medium → 🟢 low. -->
+<!-- If no findings or kernel_fusion category not in manifest, replace the cards below with: "No kernel fusion opportunities detected." -->
+
+### 🔴 P1: <Candidate Name>
+
+**Insight**: [1 sentence - what fusion pattern was detected]
+
+**Action**: [1-2 sentences - which kernels to fuse and how]
+
+<!-- impact-begin kind=p_item low=<kernel_fusion_metrics.impact_estimates[0].impact_score_low> mid=<kernel_fusion_metrics.impact_estimates[0].impact_score> high=<kernel_fusion_metrics.impact_estimates[0].impact_score_high> -->
+**Impact**: [impact_score: X.X (perf-model coverage Y/Z kernels)]
+<!-- impact-end -->
+
+**Confidence**: [high / medium / low - fusion pattern quality]
+
+→ *See [Detailed Analysis: Kernel fusion insights > P1](#detailed-analysis-fusion-P1) for details*
+
+---
+
+### 🟡 P2: <Candidate Name>
+
+**Insight**: [1 sentence]
+
+**Action**: [1-2 sentences]
+
+<!-- impact-begin kind=p_item low=<kernel_fusion_metrics.impact_estimates[1].impact_score_low> mid=<kernel_fusion_metrics.impact_estimates[1].impact_score> high=<kernel_fusion_metrics.impact_estimates[1].impact_score_high> -->
+**Impact**: [impact_score: X.X (perf-model coverage Y/Z kernels)]
+<!-- impact-end -->
+
+**Confidence**: [high / medium / low]
+
+→ *See [Detailed Analysis: Kernel fusion insights > P2](#detailed-analysis-fusion-P2) for details*
+
+---
+
+### 🟢 P3: <Candidate Name>
+
+**Insight**: [1 sentence]
+
+**Action**: [1-2 sentences]
+
+<!-- impact-begin kind=p_item low=<kernel_fusion_metrics.impact_estimates[2].impact_score_low> mid=<kernel_fusion_metrics.impact_estimates[2].impact_score> high=<kernel_fusion_metrics.impact_estimates[2].impact_score_high> -->
+**Impact**: [impact_score: X.X (perf-model coverage Y/Z kernels)]
+<!-- impact-end -->
+
+**Confidence**: [high / medium / low]
+
+→ *See [Detailed Analysis: Kernel fusion insights > P3](#detailed-analysis-fusion-P3) for details*
+
+<!-- All additional fusion P-items (P4, P5, ...) follow the same pattern with Detailed Analysis links: → *See [Detailed Analysis: Kernel fusion insights > PN](#detailed-analysis-fusion-PN) for details* -->
+
+---
+
+## System-Level Optimizations
+
+> **Note:** System-level analysis is exploratory. The patterns and recommendations below are under active development and may be refined as system-level analysis matures.
+
+<!-- === COMPARATIVE system-level note === -->
+<!-- In comparative mode, add this note immediately after the blockquote above: -->
+<!-- > **Comparative Note:** System-level analysis is performed on the primary trace (Trace 1) only. Cross-trace system-level comparison is not yet supported. -->
+
+Findings from system-level analysis (GPU utilization, memory transfer patterns,
+communication/compute overlap). These affect the GPU pipeline as a whole.
+
+<!-- CONDITIONAL: If NO actionable system-level issues found (idle <= 15% and all multi-kernel assessments flagged: false), use Template A. -->
+<!-- Otherwise, number priorities sequentially: CPU/Idle first (if idle > 15%), then multi-kernel issues by severity. -->
+<!-- Icon mapping by PRIORITY NUMBER (not severity): P1=🔴, P2=🟡, P3+=🟢 -->
+<!-- Title format: Descriptive name only. -->
+<!-- System-level recommendations always include **Impact**: "Not quantifiable from trace data" with null markers. -->
+<!-- De-dup rule: If CPU/Idle and Multi-Kernel propose the same mechanism/action, keep one merged system card with combined evidence (do not render two near-duplicate cards). -->
+
+<!-- === TEMPLATE A: No actionable system-level issues === -->
+<!-- Use this when idle <= 15% and all multi-kernel assessments have flagged: false -->
+
+✅ No system-level bottlenecks detected. GPU activity breakdown shows X% computation, with negligible memcpy and communication overhead.
+
+<!-- === TEMPLATE B: Actionable issues found === -->
+<!-- Use this when idle > 15% or at least one multi-kernel assessment has flagged: true -->
+
+### 🔴 P1: <CPU/Idle Title OR Multi-Kernel Issue Title>
+
+**Insight**: [1-2 sentences - what's wrong]
+
+**Action**: [1-2 sentences - what to do]
+
+<!-- impact-begin kind=p_item low=null mid=null high=null -->
+**Impact**: Not quantifiable from trace data
+<!-- impact-end -->
+
+→ *See [Detailed Analysis: System-level insights > P1](#detailed-analysis-system-p1) for details*
+
+---
+
+### 🟡 P2: <Multi-Kernel Issue Title>
+
+**Insight**: [1 sentence - what's wrong]
+
+**Action**: [1-2 sentences - what to do]
+
+<!-- impact-begin kind=p_item low=null mid=null high=null -->
+**Impact**: Not quantifiable from trace data
+<!-- impact-end -->
+
+→ *See [Detailed Analysis: System-level insights > P2](#detailed-analysis-system-p2) for details*
+
+---
+
+### 🟢 P3: <Next Multi-Kernel Issue>
+
+**Insight**: [1 sentence]
+
+**Action**: [1-2 sentences]
+
+<!-- impact-begin kind=p_item low=null mid=null high=null -->
+**Impact**: Not quantifiable from trace data
+<!-- impact-end -->
+
+→ *See [Detailed Analysis: System-level insights > P3](#detailed-analysis-system-p3) for details*
+
+<!-- All additional system P-items follow the same pattern with Detailed Analysis links -->
+
+---
+
+## Detailed Analysis
+
+<!-- Paste reasoning blocks from sub-agent findings; augment headings with P-numbers, icons, and HTML anchors. Everything else should be copied verbatim. -->
+<!-- Detailed Analysis labels per rule 9 — do not use these labels in optimization cards above -->
+<!-- Impact estimate bullets are rendered by each sub-agent from metadata/*.json → impact_estimates (same source as card Impact). -->
+<!-- MARKER CONTRACT: Every #### P<N>: heading in Detailed Analysis MUST be
+     preceded by an HTML comment whose body is `reasoning-candidate tier=<TIER> rank=<R>` (see the examples below), where TIER = compute | fusion | system (matching the ### subsection) and R = 1, 2, 3, … incrementing per tier (rank=1 for the first item, rank=2 for the second, etc.). -->
+
+### Compute Kernel Insights
+
+<!-- One #### 🔴/🟡/🟢 Pn: <title> block per entry in priority_data.findings[], in array order. -->
+<!-- Source the body block from the sub-agent's findings.md by joining on (findings[i].category, findings[i].category_rank): the sub-agent emits its P-items ordered by intra-category rank, so its rank-N block becomes this report's PN where N matches the position in priority_data.findings[]. -->
+<!-- Each block has an HTML anchor: <a id="detailed-analysis-compute-pN"></a> -->
+
+<!-- === STANDALONE Compute Kernel Data table === Use this schema for standalone mode ONLY. Use these 10 exact columns (must match sub_agent_spec.md § Operations Table Schema) -->
+
+<a id="detailed-analysis-compute-p1"></a>
+<!-- reasoning-candidate tier=compute rank=1 -->
+#### 🔴 P1: <Brief Title> (<Library>)
+**Identification:**
+**Data:**
+
+| Operation | Args | Kernel Path | Kernel Name | Time (ms) | %E2E | Count | FLOPS/Byte | Efficiency | Bound |
+|-----------|------|-------------|-------------|-----------|------|-------|------------|------------|-------|
+| ...       | ...  | ...         | ...         | ...       | ...  | ...   | ...        | ...        | ...   |
+
+**Reasoning for Slowdown:**
+**Resolution:**
+**Impact estimate:**
+
+<!-- === COMPARATIVE Compute Kernel Data table === Use this schema for comparative mode ONLY. Use these 8 exact columns (Kernel Name/Path are omitted in comparative mode) -->
+<!-- Trace 1 ms = operations[i].time_ms. Trace 2 ms = operations[i].t2_time_ms.
+     Count T1/T2 = operations[i].count / operations[i].count_trace2 when present.
+     Difference (ms) = operations[i].difference_ms (negative ⇒ Trace 1 slower), or —. -->
+
+<a id="detailed-analysis-compute-p1"></a>
+<!-- reasoning-candidate tier=compute rank=1 -->
+#### 🔴 P1: <Brief Title> (<Library>)
+**Identification:** [1-2 sentences - How this opportunity was surfaced relative to the target trace. Must end with (source: <artifact> → <keys>).]
+**Data:** [1 sentence summary of table]
+
+| Operation | Args (T1) | Trace 1 Time (ms) | Trace 2 Time (ms) | Count (T1/T2) | Difference (ms) | FLOPS/Byte (T1) | Bound (T1) |
+|-----------|-----------|-------------------|-------------------|---------------|-----------------|-----------------|------------|
+| ...       | ...       | ...               | ...               | .../...       | ...             | ...             | ...        |
+
+**Reasoning for Slowdown:** [2-3 sentences - Why Trace 1 is slower than Trace 2 for these operations as the traces show. No micro-architecture speculation.]
+**Resolution:** [1-2 sentences - Why the suggested optimization helps close the gap — not merely restating what to do.]
+**Impact estimate:** [Rendered from metadata → impact_estimates]
+
+### Kernel Fusion Insights
+<!-- === STANDALONE Kernel Fusion === -->
+> **Note:** Kernel fusion analysis is experimental. impact_score projections estimate the recoverable fraction of E2E with 85% memory/compute pipeline overlap. Kernels without perf models use their measured trace time as-is. Actual recoverable time depends on implementation feasibility and interaction effects.
+<!-- === COMPARATIVE Kernel Fusion === -->
+> **Note:** Kernel fusion analysis is experimental.
+
+<!-- Paste reasoning blocks from kernel_fusion_findings.md, ordered by confidence then kernel time (matching card order). -->
+<!-- Each block uses three required labels: **Identification:**, **Data:**, **Impact estimate:** -->
+<!-- If kernel_fusion category is not in the manifest or findings are empty, show "No fusion impact estimates available." -->
+
+<a id="detailed-analysis-fusion-P1"></a>
+<!-- reasoning-candidate tier=fusion rank=1 -->
+#### 🔴/🟡/🟢 P1: <Candidate Name> (<time_ms> ms, <instance_count> instances)
+
+**Identification:**
+
+**Data:**
+
+| Kernel | Type | Duration (us) | Perf model |
+|--------|------|--------------|------------|
+| <kernel name (truncated to ~60 chars)> | <type> | X.X | Yes/No |
+
+**Impact estimate:**
+
+<a id="detailed-analysis-fusion-P2"></a>
+<!-- reasoning-candidate tier=fusion rank=2 -->
+#### 🔴/🟡/🟢 P2: <Candidate Name> (<time_ms> ms, <instance_count> instances)
+
+*Repeat the same Identification + Data + Impact estimate format for each candidate, with anchors `detailed-analysis-fusion-PN`.*
+
+### System-Level Insights
+
+<!-- One #### 🔴/🟡/🟢 Pn: <title> block per promoted system P-item, in priority order. -->
+<!-- Each block has an HTML anchor: <a id="detailed-analysis-system-pN"></a> -->
+<!-- System-level detailed analysis uses the same format for both standalone and comparative modes.
+     In comparative mode, system-level analysis covers Trace 1 (primary) only. -->
+
+<a id="detailed-analysis-system-p1"></a>
+<!-- reasoning-candidate tier=system rank=1 -->
+#### 🔴 P1: <Brief Title>
+**Identification:**
+**Data:**
+**Reasoning for Slowdown:**
+**Resolution:**
+**Impact estimate:**
+
+---
+
+## Appendix
+
+### Model Architecture
+- **Model**: <model>
+- **Architecture**: <architecture>
+- **Scale**: <scale>
+- **Precision**: <precision>
+
+### Hardware Reference
+- **Platform**: <platform>
+- **Peak HBM BW**: X TB/s
+- **Peak MAF (BF16)**: Y TFLOPS
+- **Peak MAF (FP8)**: Z TFLOPS (if supported)
+- **Peak MAF (FP4)**: W TFLOPS (if supported)
diff --git a/skills/analysis-orchestrator/utils/templates/sub_agent_spec.md b/skills/analysis-orchestrator/templates/sub_agent_spec.md
similarity index 100%
rename from skills/analysis-orchestrator/utils/templates/sub_agent_spec.md
rename to skills/analysis-orchestrator/templates/sub_agent_spec.md
diff --git a/skills/magpie/.federated.json b/skills/magpie/.federated.json
index 93be522..a832c46 100644
--- a/skills/magpie/.federated.json
+++ b/skills/magpie/.federated.json
@@ -2,8 +2,8 @@
   "source": "amd-agi-magpie",
   "repo": "AMD-AGI/Magpie",
   "ref": "main",
-  "commit": "02d9a7dfc3aedec0e3feadb7449f6c0c318621d5",
+  "commit": "70023bada7762105157450554256b946ec869c73",
   "path": "skills/magpie",
   "license": "MIT",
-  "imported_at": "2026-06-18T20:31:36Z"
+  "imported_at": "2026-06-22T13:01:33Z"
 }
diff --git a/skills/magpie/SKILL.md b/skills/magpie/SKILL.md
index 0e9a544..6494d02 100644
--- a/skills/magpie/SKILL.md
+++ b/skills/magpie/SKILL.md
@@ -37,7 +37,7 @@ magpie analyze path/to/kernel.hip --testcase "./run_test.sh"
 - `--no-perf`: Skip performance profiling.
 - `-o`, `--output-dir`: Output directory (default: `./results`).
 
-**Config template (single kernel):** Use `kernel:` with `id`, `type`, `source_files`, `working_dir`, `testcase_command`, optional `compile_command`, `env`. See [Magpie/kernel_config.yaml.example](https://github.com/AMD-AGI/Magpie/blob/02d9a7dfc3aedec0e3feadb7449f6c0c318621d5/Magpie/kernel_config.yaml.example) and [examples/ck_gemm_add.yaml](https://github.com/AMD-AGI/Magpie/blob/02d9a7dfc3aedec0e3feadb7449f6c0c318621d5/examples/ck_gemm_add.yaml).
+**Config template (single kernel):** Use `kernel:` with `id`, `type`, `source_files`, `working_dir`, `testcase_command`, optional `compile_command`, `env`. See [Magpie/kernel_config.yaml.example](https://github.com/AMD-AGI/Magpie/blob/70023bada7762105157450554256b946ec869c73/Magpie/kernel_config.yaml.example) and [examples/ck_gemm_add.yaml](https://github.com/AMD-AGI/Magpie/blob/70023bada7762105157450554256b946ec869c73/examples/ck_gemm_add.yaml).
 
 ## Compare (multiple kernels)
 
@@ -59,7 +59,7 @@ magpie compare kernel1.hip kernel2.hip --testcase "./run_test.sh"
 - `--baseline`: Index of baseline kernel (default: 0).
 - `--no-perf`, `-o`: Same as analyze.
 
-Example: [examples/ck_grouped_gemm_compare.yaml](https://github.com/AMD-AGI/Magpie/blob/02d9a7dfc3aedec0e3feadb7449f6c0c318621d5/examples/ck_grouped_gemm_compare.yaml).
+Example: [examples/ck_grouped_gemm_compare.yaml](https://github.com/AMD-AGI/Magpie/blob/70023bada7762105157450554256b946ec869c73/examples/ck_grouped_gemm_compare.yaml).
 
 ## Benchmark (vLLM / SGLang)
 
@@ -81,7 +81,7 @@ magpie benchmark --benchmark-config examples/benchmarks/benchmark_vllm_dsr1.yaml
 - `--run-mode`: `docker` (default) or `local`.
 - `--docker-image`, `--timeout`, `-o`: Override image, timeout (seconds), output dir.
 
-Example configs: [examples/benchmarks/benchmark_vllm_dsr1.yaml](https://github.com/AMD-AGI/Magpie/blob/02d9a7dfc3aedec0e3feadb7449f6c0c318621d5/examples/benchmarks/benchmark_vllm_dsr1.yaml), [docs/how-to/benchmark.md](https://github.com/AMD-AGI/Magpie/blob/02d9a7dfc3aedec0e3feadb7449f6c0c318621d5/docs/how-to/benchmark.md).
+Example configs: [examples/benchmarks/benchmark_vllm_dsr1.yaml](https://github.com/AMD-AGI/Magpie/blob/70023bada7762105157450554256b946ec869c73/examples/benchmarks/benchmark_vllm_dsr1.yaml), [docs/how-to/benchmark.md](https://github.com/AMD-AGI/Magpie/blob/70023bada7762105157450554256b946ec869c73/docs/how-to/benchmark.md).
 
 ## Gap analysis (standalone)
 
@@ -109,7 +109,7 @@ Shows vendor, architecture, compiler, profiler. No mode required.
 
 When the user needs a kernel config file:
 
-1. Emit YAML matching the structure in [Magpie/kernel_config.yaml.example](https://github.com/AMD-AGI/Magpie/blob/02d9a7dfc3aedec0e3feadb7449f6c0c318621d5/Magpie/kernel_config.yaml.example): `kernel:` with `id`, `type` (hip|cuda|pytorch), `source_files`, `working_dir`, `testcase_command`, and optionally `compile_command`, `env`.
+1. Emit YAML matching the structure in [Magpie/kernel_config.yaml.example](https://github.com/AMD-AGI/Magpie/blob/70023bada7762105157450554256b946ec869c73/Magpie/kernel_config.yaml.example): `kernel:` with `id`, `type` (hip|cuda|pytorch), `source_files`, `working_dir`, `testcase_command`, and optionally `compile_command`, `env`.
 2. Write the file to the user's requested path (e.g. `kernel_config.yaml`).
 3. Run: `magpie analyze --kernel-config <that_file>`.
 

From 9f767be52cdf40dee1d575e0d3b0988d6eb0d40f Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Mon, 22 Jun 2026 13:26:29 +0000
Subject: [PATCH 8/8] fix: add missing anchor definitions in
 analysis_template.md

---
 .../templates/analysis_template.md            | 50 +++++++++++++++++++
 1 file changed, 50 insertions(+)

diff --git a/skills/analysis-orchestrator/templates/analysis_template.md b/skills/analysis-orchestrator/templates/analysis_template.md
index d3fd5f0..3d22fc8 100644
--- a/skills/analysis-orchestrator/templates/analysis_template.md
+++ b/skills/analysis-orchestrator/templates/analysis_template.md
@@ -385,6 +385,24 @@ communication/compute overlap). These affect the GPU pipeline as a whole.
 **Resolution:** [1-2 sentences - Why the suggested optimization helps close the gap — not merely restating what to do.]
 **Impact estimate:** [Rendered from metadata → impact_estimates]
 
+<a id="detailed-analysis-compute-p2"></a>
+<!-- reasoning-candidate tier=compute rank=2 -->
+#### 🟡 P2: <Brief Title> (<Library>)
+**Identification:**
+**Data:**
+**Reasoning for Slowdown:**
+**Resolution:**
+**Impact estimate:**
+
+<a id="detailed-analysis-compute-p3"></a>
+<!-- reasoning-candidate tier=compute rank=3 -->
+#### 🟢 P3: <Brief Title> (<Library>)
+**Identification:**
+**Data:**
+**Reasoning for Slowdown:**
+**Resolution:**
+**Impact estimate:**
+
 ### Kernel Fusion Insights
 <!-- === STANDALONE Kernel Fusion === -->
 > **Note:** Kernel fusion analysis is experimental. impact_score projections estimate the recoverable fraction of E2E with 85% memory/compute pipeline overlap. Kernels without perf models use their measured trace time as-is. Actual recoverable time depends on implementation feasibility and interaction effects.
@@ -413,6 +431,20 @@ communication/compute overlap). These affect the GPU pipeline as a whole.
 <!-- reasoning-candidate tier=fusion rank=2 -->
 #### 🔴/🟡/🟢 P2: <Candidate Name> (<time_ms> ms, <instance_count> instances)
 
+**Identification:**
+
+**Data:**
+
+| Kernel | Type | Duration (us) | Perf model |
+|--------|------|--------------|------------|
+| <kernel name (truncated to ~60 chars)> | <type> | X.X | Yes/No |
+
+**Impact estimate:**
+
+<a id="detailed-analysis-fusion-P3"></a>
+<!-- reasoning-candidate tier=fusion rank=3 -->
+#### 🔴/🟡/🟢 P3: <Candidate Name> (<time_ms> ms, <instance_count> instances)
+
 *Repeat the same Identification + Data + Impact estimate format for each candidate, with anchors `detailed-analysis-fusion-PN`.*
 
 ### System-Level Insights
@@ -431,6 +463,24 @@ communication/compute overlap). These affect the GPU pipeline as a whole.
 **Resolution:**
 **Impact estimate:**
 
+<a id="detailed-analysis-system-p2"></a>
+<!-- reasoning-candidate tier=system rank=2 -->
+#### 🟡 P2: <Brief Title>
+**Identification:**
+**Data:**
+**Reasoning for Slowdown:**
+**Resolution:**
+**Impact estimate:**
+
+<a id="detailed-analysis-system-p3"></a>
+<!-- reasoning-candidate tier=system rank=3 -->
+#### 🟢 P3: <Brief Title>
+**Identification:**
+**Data:**
+**Reasoning for Slowdown:**
+**Resolution:**
+**Impact estimate:**
+
 ---
 
 ## Appendix