amd · danielholanda · Jun 3, 2026 · Jun 3, 2026 · Jun 3, 2026
diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json
@@ -8,36 +8,12 @@
     "version": "0.1.0"
   },
   "plugins": [
-    {
-      "name": "aiter-reflection",
-      "source": "./skills/aiter-reflection",
-      "skills": "./",
-      "description": "This skill should be used when optimizing AMD GPU kernels on MI300 using the aiter project, including running op tests, benchmarking, iterating on kernel changes, and recording results in the kernel experiment database."
-    },
     {
       "name": "apu-memory-tuner",
       "source": "./skills/apu-memory-tuner",
       "skills": "./",
       "description": "Inspect and tune the shared-vs-dedicated memory split (GTT / UMA Frame Buffer) on AMD Ryzen APUs so larger LLMs and image models fit on the iGPU."
     },
-    {
-      "name": "gpu-architecture-fundamentals",
-      "source": "./skills/gpu-architecture-fundamentals",
-      "skills": "./",
-      "description": "This skill should be used when reasoning about GPU architecture fundamentals to guide kernel optimization choices such as memory hierarchy usage, execution model mapping, block sizing, and latency-aware tuning across HIP, Triton, and PyTorch."
-    },
-    {
-      "name": "hip-kernel-optimization",
-      "source": "./skills/hip-kernel-optimization",
-      "skills": "./",
-      "description": "This skill should be used when writing or tuning HIP kernels on AMD/NVIDIA GPUs, covering memory coalescing, shared-memory tiling, bank conflict avoidance, warp primitives, occupancy, vectorization, async ops, loop unrolling, and profiling."
-    },
-    {
-      "name": "kernel-exp-history",
-      "source": "./skills/kernel-exp-history",
-      "skills": "./",
-      "description": "This skill should be used when optimizing kernels in this repo and needing to consult past optimization experiments, or when recording the current optimization iteration back into the kernel experiment database."
-    },
     {
       "name": "local-ai-app-integration",
       "source": "./skills/local-ai-app-integration",
@@ -56,47 +32,11 @@
       "skills": "./",
       "description": "Performs GPU kernel correctness and performance evaluation and LLM inference benchmarking with Magpie. Analyzes single or multiple kernels (HIP/CUDA/PyTorch), compares kernel implementations, runs vLLM/SGLang benchmarks with profiling and TraceLens, and runs gap analysis on torch traces."
     },
-    {
-      "name": "mi300-hip-programming-insights",
-      "source": "./skills/mi300-hip-programming-insights",
-      "skills": "./",
-      "description": "CDNA3/MI300 HIP programming insights—chiplet/cache model, Infinity Cache, memory coherency, matrix cores, sparsity, and best practices."
-    },
-    {
-      "name": "pytorch-kernel-optimization",
-      "source": "./skills/pytorch-kernel-optimization",
-      "skills": "./",
-      "description": "This skill should be used when optimizing PyTorch models and kernels, including efficient tensor operations, torch.compile, custom autograd/CUDA/Triton extensions, mixed precision, memory and data pipeline tuning, model optimization techniques, CUDA graphs, and profiling."
-    },
     {
       "name": "rocm-doctor",
       "source": "./skills/rocm-doctor",
       "skills": "./",
       "description": "Diagnose why ROCm, PyTorch, or llama.cpp isn't working on an AMD GPU. Matches the symptom against a fixed list of twelve known misconfigurations and proposes the next step."
-    },
-    {
-      "name": "rocprof-compute",
-      "source": "./skills/rocprof-compute",
-      "skills": "./",
-      "description": "This skill should be used when profiling AMD GPU kernels with rocprof-compute to collect metrics, roofline data, and analyze bottlenecks for HIP kernels."
-    },
-    {
-      "name": "triton-hip-reference-kernel-search",
-      "source": "./skills/triton-hip-reference-kernel-search",
-      "skills": "./",
-      "description": "Search and adapt Triton/HIP kernel patterns from a corpus to optimize AMD GPUs; use to find similar ops and reuse tiling/occupancy strategies."
-    },
-    {
-      "name": "triton-kernel-optimization",
-      "source": "./skills/triton-kernel-optimization",
-      "skills": "./",
-      "description": "This skill should be used when writing or tuning Triton GPU kernels, including autotuning block sizes, coalesced accesses, tiled matmul, fused ops, reductions, flash-attention style kernels, quantization, custom gradients, and profiling."
-    },
-    {
-      "name": "triton-kernel-reflection-prompts",
-      "source": "./skills/triton-kernel-reflection-prompts",
-      "skills": "./",
-      "description": "Reflection/self-critique prompts for reviewing and fixing AMD-targeted Triton kernels after generation or test failures."
     }
   ]
 }
diff --git a/README.md b/README.md
@@ -57,7 +57,7 @@ Skills earn their keep on repeated, opinionated workflows, exactly where the AMD
 >
 > **Target: ready for testing by June 12.** Until then, treat anything below as a preview.
 
-The initial catalog is organized into five focus areas.
+The initial catalog is organized into four focus areas.
 
 
 ### Application integration
@@ -80,22 +80,6 @@ Diagnose, configure, and ready AMD systems for AI workloads: drivers, BIOS, memo
 | `gfx-target-chooser` | Pick the right `gfx942` / `gfx90a` / `gfx1100` target and matching compiler flags. | _planned_ |
 | `pytorch-rocm-setup` | Get a known-good PyTorch + ROCm stack running on a target node, end to end. | _planned_ |
 
-### Kernel engineering
-
-Author, tune, and reason about GPU kernels for AMD targets.
-
-| Skill | What it does | Source |
-| --- | --- | --- |
-| [`aiter-reflection`](skills/aiter-reflection/SKILL.md) | Optimize AMD GPU kernels on MI300 using the aiter project: op tests, benchmarks, iteration, experiment database. | [Apex](https://github.com/AMD-AGI/Apex) |
-| [`gpu-architecture-fundamentals`](skills/gpu-architecture-fundamentals/SKILL.md) | Reason about memory hierarchy, execution model, block sizing, and latency across HIP, Triton, and PyTorch. | [Apex](https://github.com/AMD-AGI/Apex) |
-| [`hip-kernel-optimization`](skills/hip-kernel-optimization/SKILL.md) | Write and tune HIP kernels: coalescing, shared-memory tiling, bank conflicts, warp primitives, occupancy, vectorization. | [Apex](https://github.com/AMD-AGI/Apex) |
-| [`kernel-exp-history`](skills/kernel-exp-history/SKILL.md) | Consult past kernel optimization experiments and record the current iteration back into the experiment database. | [Apex](https://github.com/AMD-AGI/Apex) |
-| [`mi300-hip-programming-insights`](skills/mi300-hip-programming-insights/SKILL.md) | CDNA3 / MI300 HIP programming insights: chiplet and cache model, Infinity Cache, coherency, matrix cores, sparsity. | [Apex](https://github.com/AMD-AGI/Apex) |
-| [`pytorch-kernel-optimization`](skills/pytorch-kernel-optimization/SKILL.md) | Optimize PyTorch models and kernels: `torch.compile`, custom extensions, mixed precision, CUDA graphs, profiling. | [Apex](https://github.com/AMD-AGI/Apex) |
-| [`triton-hip-reference-kernel-search`](skills/triton-hip-reference-kernel-search/SKILL.md) | Search and adapt Triton / HIP kernel patterns from a corpus to reuse tiling and occupancy strategies. | [Apex](https://github.com/AMD-AGI/Apex) |
-| [`triton-kernel-optimization`](skills/triton-kernel-optimization/SKILL.md) | Write and tune Triton kernels: autotune block sizes, tiled matmul, fused ops, reductions, flash-attention, quantization. | [Apex](https://github.com/AMD-AGI/Apex) |
-| [`triton-kernel-reflection-prompts`](skills/triton-kernel-reflection-prompts/SKILL.md) | Reflection / self-critique prompts for reviewing and fixing AMD-targeted Triton kernels. | [Apex](https://github.com/AMD-AGI/Apex) |
-
 ### Cross-stack porting
 
 Bring existing workloads onto AMD.
@@ -113,7 +97,7 @@ Close the loop from trace to fix to ship.
 | Skill | What it does | Source |
 | --- | --- | --- |
 | [`magpie`](skills/magpie/SKILL.md) | Evaluate GPU kernel correctness and performance, compare kernel implementations, and benchmark vLLM / SGLang inference with profiling, TraceLens, and torch-trace gap analysis. | [Magpie](https://github.com/AMD-AGI/Magpie) |
-| [`rocprof-compute`](skills/rocprof-compute/SKILL.md) | Profile AMD GPU kernels with `rocprof-compute` to collect metrics, roofline data, and bottleneck analysis. | [Apex](https://github.com/AMD-AGI/Apex) |
+| `hyperloom` | Autonomously optimize LLM inference on AMD GPUs. | _planned_ |
 | `omniperf-tune` | Run `omniperf`, locate the bottleneck, and suggest the fix. | _planned_ |
 | `quark-quantize` | Quantize PyTorch / ONNX models with [AMD Quark](https://github.com/amd/Quark) and export for AMD deployment. | _planned_ |
 

diff --git a/scripts/sources.yml b/scripts/sources.yml
@@ -23,25 +23,6 @@
 # the resulting changes for human review.
 
 sources:
-  - name: amd-agi-apex
-    repo: AMD-AGI/Apex
-    ref: main
-    path: tools/skills
-    license: MIT
-    # `skill-creator` is intentionally excluded; this catalog already has
-    # its own `create-skill` story via CONTRIBUTING.md.
-    skills:
-      - aiter-reflection
-      - gpu-architecture-fundamentals
-      - hip-kernel-optimization
-      - kernel-exp-history
-      - mi300-hip-programming-insights
-      - pytorch-kernel-optimization
-      - rocprof-compute
-      - triton-hip-reference-kernel-search
-      - triton-kernel-optimization
-      - triton-kernel-reflection-prompts
-
   - name: amd-agi-magpie
     repo: AMD-AGI/Magpie
     ref: main

diff --git a/skills/aiter-reflection/.federated.json b/skills/aiter-reflection/.federated.json
diff --git a/skills/aiter-reflection/SKILL.md b/skills/aiter-reflection/SKILL.md
diff --git a/skills/gpu-architecture-fundamentals/.federated.json b/skills/gpu-architecture-fundamentals/.federated.json
diff --git a/skills/gpu-architecture-fundamentals/SKILL.md b/skills/gpu-architecture-fundamentals/SKILL.md
diff --git a/skills/hip-kernel-optimization/.federated.json b/skills/hip-kernel-optimization/.federated.json