|
8 | 8 | "version": "0.1.0" |
9 | 9 | }, |
10 | 10 | "plugins": [ |
| 11 | + { |
| 12 | + "name": "aiter-reflection", |
| 13 | + "source": "./skills/aiter-reflection", |
| 14 | + "skills": "./", |
| 15 | + "description": "This skill should be used when optimizing AMD GPU kernels on MI300 using the aiter project, including running op tests, benchmarking, iterating on kernel changes, and recording results in the kernel experiment database." |
| 16 | + }, |
11 | 17 | { |
12 | 18 | "name": "apu-memory-tuner", |
13 | 19 | "source": "./skills/apu-memory-tuner", |
14 | 20 | "skills": "./", |
15 | 21 | "description": "Inspect and tune the shared-vs-dedicated memory split (GTT / UMA Frame Buffer) on AMD Ryzen APUs so larger LLMs and image models fit on the iGPU." |
16 | 22 | }, |
| 23 | + { |
| 24 | + "name": "gpu-architecture-fundamentals", |
| 25 | + "source": "./skills/gpu-architecture-fundamentals", |
| 26 | + "skills": "./", |
| 27 | + "description": "This skill should be used when reasoning about GPU architecture fundamentals to guide kernel optimization choices such as memory hierarchy usage, execution model mapping, block sizing, and latency-aware tuning across HIP, Triton, and PyTorch." |
| 28 | + }, |
| 29 | + { |
| 30 | + "name": "hip-kernel-optimization", |
| 31 | + "source": "./skills/hip-kernel-optimization", |
| 32 | + "skills": "./", |
| 33 | + "description": "This skill should be used when writing or tuning HIP kernels on AMD/NVIDIA GPUs, covering memory coalescing, shared-memory tiling, bank conflict avoidance, warp primitives, occupancy, vectorization, async ops, loop unrolling, and profiling." |
| 34 | + }, |
| 35 | + { |
| 36 | + "name": "kernel-exp-history", |
| 37 | + "source": "./skills/kernel-exp-history", |
| 38 | + "skills": "./", |
| 39 | + "description": "This skill should be used when optimizing kernels in this repo and needing to consult past optimization experiments, or when recording the current optimization iteration back into the kernel experiment database." |
| 40 | + }, |
17 | 41 | { |
18 | 42 | "name": "local-ai-app-integration", |
19 | 43 | "source": "./skills/local-ai-app-integration", |
|
26 | 50 | "skills": "./", |
27 | 51 | "description": "Route image generation, text-to-speech, and speech-to-text through a local AI Server to reduce token/cost usage." |
28 | 52 | }, |
| 53 | + { |
| 54 | + "name": "mi300-hip-programming-insights", |
| 55 | + "source": "./skills/mi300-hip-programming-insights", |
| 56 | + "skills": "./", |
| 57 | + "description": "CDNA3/MI300 HIP programming insights—chiplet/cache model, Infinity Cache, memory coherency, matrix cores, sparsity, and best practices." |
| 58 | + }, |
| 59 | + { |
| 60 | + "name": "pytorch-kernel-optimization", |
| 61 | + "source": "./skills/pytorch-kernel-optimization", |
| 62 | + "skills": "./", |
| 63 | + "description": "This skill should be used when optimizing PyTorch models and kernels, including efficient tensor operations, torch.compile, custom autograd/CUDA/Triton extensions, mixed precision, memory and data pipeline tuning, model optimization techniques, CUDA graphs, and profiling." |
| 64 | + }, |
29 | 65 | { |
30 | 66 | "name": "rocm-doctor", |
31 | 67 | "source": "./skills/rocm-doctor", |
32 | 68 | "skills": "./", |
33 | 69 | "description": "Diagnose why ROCm, PyTorch, or llama.cpp isn't working on an AMD GPU. Matches the symptom against a fixed list of twelve known misconfigurations and proposes the next step." |
| 70 | + }, |
| 71 | + { |
| 72 | + "name": "rocprof-compute", |
| 73 | + "source": "./skills/rocprof-compute", |
| 74 | + "skills": "./", |
| 75 | + "description": "This skill should be used when profiling AMD GPU kernels with rocprof-compute to collect metrics, roofline data, and analyze bottlenecks for HIP kernels." |
| 76 | + }, |
| 77 | + { |
| 78 | + "name": "triton-hip-reference-kernel-search", |
| 79 | + "source": "./skills/triton-hip-reference-kernel-search", |
| 80 | + "skills": "./", |
| 81 | + "description": "Search and adapt Triton/HIP kernel patterns from a corpus to optimize kernels for AMD GPUs; use it to find similar ops and reuse tiling/occupancy strategies." |
| 82 | + }, |
| 83 | + { |
| 84 | + "name": "triton-kernel-optimization", |
| 85 | + "source": "./skills/triton-kernel-optimization", |
| 86 | + "skills": "./", |
| 87 | + "description": "This skill should be used when writing or tuning Triton GPU kernels, including autotuning block sizes, coalesced accesses, tiled matmul, fused ops, reductions, flash-attention style kernels, quantization, custom gradients, and profiling." |
| 88 | + }, |
| 89 | + { |
| 90 | + "name": "triton-kernel-reflection-prompts", |
| 91 | + "source": "./skills/triton-kernel-reflection-prompts", |
| 92 | + "skills": "./", |
| 93 | + "description": "Reflection/self-critique prompts for reviewing and fixing AMD-targeted Triton kernels after generation or test failures." |
34 | 94 | } |
35 | 95 | ] |
36 | 96 | } |
0 commit comments