|
2 | 2 | title: "AI Coding Capability Leaderboard" |
3 | 3 | icon: "💻" |
4 | 4 | tags: [SWE-bench, coding, programming] |
5 | | -summary: "AI model coding capability rankings based on SWE-bench Verified" |
| 5 | +summary: "AI model coding capability rankings based on SWE-bench Bash-Only" |
6 | 6 | data_source: "https://www.swebench.com/" |
7 | | -benchmarks: [SWE-bench Verified, BigCodeBench, LiveCodeBench] |
8 | | -last_updated: "2026-05-10" |
| 7 | +benchmarks: [SWE-bench Bash-Only] |
| 8 | +last_updated: "2026-05-16" |
9 | 9 | auto_updated: true |
10 | | -date: "2026-05-10" |
| 10 | +date: "2026-05-16" |
11 | 11 | --- |
12 | 12 |
|
13 | 13 | | Rank | Model | Vendor | SWE-bench | Type | |
14 | 14 | |---|---|---|---|---| |
15 | | -| 🥇 | Claude 4 Opus | Anthropic | 78.3% | Closed | |
16 | | -| 🥈 | GPT-5.5 | OpenAI | 76.1% | Closed | |
17 | | -| 🥉 | o3 | OpenAI | 74.8% | Closed | |
18 | | -| 4 | Gemini 3.1 | Google | 72.5% | Closed | |
19 | | -| 5 | DeepSeek-V4 | DeepSeek | 70.2% | Open | |
20 | | -| 6 | Claude 4 Sonnet | Anthropic | 68.9% | Closed | |
21 | | -| 7 | ERNIE 5.1 | Baidu | 67.5% | Closed | |
22 | | -| 8 | Qwen3-Max | Alibaba | 66.0% | Closed | |
23 | | -| 9 | GPT-5 | OpenAI | 64.8% | Closed | |
24 | | -| 10 | Gemini 3.0 | Google | 63.2% | Closed | |
25 | | -| 11 | Kimi-2 | Moonshot | 62.0% | Closed | |
26 | | -| 12 | Llama 4 Maverick | Meta | 60.5% | Open | |
27 | | -| 13 | GLM-5 | Zhipu AI | 59.3% | Closed | |
28 | | -| 14 | Mistral Large 3 | Mistral | 58.0% | Closed | |
29 | | -| 15 | Claude 4 Haiku | Anthropic | 56.5% | Closed | |
30 | | -| 16 | DeepSeek-V3.2 | DeepSeek | 55.0% | Open | |
31 | | -| 17 | Llama 4 Scout | Meta | 53.8% | Open | |
32 | | -| 18 | Yi-3 | 01.AI | 52.5% | Open | |
33 | | -| 19 | Command A | Cohere | 51.0% | Closed | |
34 | | -| 20 | MiniMax-M2.5 | MiniMax | 50.0% | Closed | |
| 15 | +| 🥇 | Claude 4.5 Opus | Anthropic | 76.8% | Closed | |
| 16 | +| 🥈 | Gemini 3 Flash | Google DeepMind | 75.8% | Closed | |
| 17 | +| 🥉 | MiniMax M2.5 | MiniMax | 75.8% | Closed | |
| 18 | +| 4 | Claude Opus 4.6 | Anthropic | 75.6% | Closed | |
| 19 | +| 5 | Gemini 3 Pro Preview | Google DeepMind | 74.2% | Closed | |
| 20 | +| 6 | GPT-5.2 | OpenAI | 72.8% | Closed | |
| 21 | +| 7 | GLM-5 | Z-AI | 72.8% | Closed | |
| 22 | +| 8 | Claude 4.5 Sonnet | Anthropic | 71.4% | Closed | |
| 23 | +| 9 | Kimi K2.5 | Moonshot AI | 70.8% | Closed | |
| 24 | +| 10 | DeepSeek V3.2 | DeepSeek | 70.0% | Open | |
| 25 | +| 11 | Gemini 3 Pro | Google DeepMind | 69.6% | Closed | |
| 26 | +| 12 | Claude 4 Opus | Anthropic | 67.6% | Closed | |
| 27 | +| 13 | Claude 4.5 Haiku | Anthropic | 66.6% | Closed | |
| 28 | +| 14 | GPT-5.1 | OpenAI | 66.0% | Closed | |
| 29 | +| 15 | GPT-5 | OpenAI | 65.0% | Closed | |
| 30 | +| 16 | Claude 4 Sonnet | Anthropic | 64.9% | Closed | |
| 31 | +| 17 | Kimi K2 Thinking | Moonshot AI | 63.4% | Closed | |
| 32 | +| 18 | MiniMax M2 | MiniMax | 61.0% | Closed | |
| 33 | +| 19 | DeepSeek V3.2 Reasoner | DeepSeek | 60.0% | Open | |
| 34 | +| 20 | GPT-5 mini | OpenAI | 58.4% | Closed | |
0 commit comments