-
Notifications
You must be signed in to change notification settings - Fork 1.1k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Browse files
Browse the repository at this point in the history
This PR updates the leaderboard to reflect the change in score due to the following PR merge: 1. #888 2. #887 3. #895 4. #894 5. #897 6. #892 7. #898 8. #900 9. #902 Models were evaluated using checkpoint commit 910460e.
- Loading branch information
1 parent
3a9837f
commit bd51d41
Showing
5 changed files
with
400 additions
and
352 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,98 +1,110 @@ | ||
Rank,Model,Live Overall Acc,AST Summary,Python Simple AST,Python Multiple AST,Python Parallel AST,Python Parallel Multiple AST,Irrelevance Detection,Relevance Detection | ||
1,Gemini-2.0-Flash-Exp (Prompt),82.01%,80.68%,85.66%,79.30%,81.25%,87.50%,84.13%,77.78% | ||
1,Gemini-2.0-Flash-001 (Prompt),81.39%,73.87%,75.58%,73.12%,81.25%,83.33%,93.42%,55.56% | ||
2,o1-2024-12-17 (Prompt),80.63%,77.79%,82.95%,76.54%,81.25%,75.00%,85.15%,72.22% | ||
3,GPT-4-turbo-2024-04-09 (FC),80.50%,79.50%,83.72%,78.63%,81.25%,70.83%,82.20%,72.22% | ||
4,gpt-4o-2024-11-20 (Prompt),79.83%,80.75%,84.88%,79.77%,87.50%,75.00%,78.34%,83.33% | ||
5,gpt-4o-2024-11-20 (FC),79.65%,79.35%,81.40%,78.82%,87.50%,75.00%,80.05%,83.33% | ||
6,Gemini-2.0-Flash-Exp (FC),79.03%,71.58%,74.81%,70.66%,81.25%,70.83%,90.93%,55.56% | ||
7,Claude-3.5-Sonnet-20241022 (FC),78.94%,80.61%,84.11%,81.96%,25.00%,20.83%,76.42%,77.78% | ||
8,ToolACE-8B (FC),78.59%,76.02%,73.26%,76.73%,81.25%,70.83%,82.43%,83.33% | ||
9,Gemini-Exp-1206 (FC),78.54%,76.17%,75.19%,76.54%,68.75%,75.00%,82.20%,77.78% | ||
10,o1-mini-2024-09-12 (Prompt),78.14%,71.95%,72.87%,71.60%,75.00%,75.00%,87.98%,61.11% | ||
11,Claude-3-Opus-20240229 (FC),78.05%,75.20%,79.07%,75.78%,31.25%,37.50%,82.77%,61.11% | ||
12,Gemini-1.5-Flash-002 (FC),78.05%,70.98%,72.87%,70.18%,81.25%,79.17%,89.34%,55.56% | ||
13,o1-2024-12-17 (FC),78.01%,77.20%,81.78%,79.01%,0.00%,0.00%,79.37%,72.22% | ||
14,watt-tool-70B (FC),77.74%,83.57%,86.05%,83.48%,81.25%,62.50%,68.48%,94.44% | ||
15,Mistral-Medium-2312 (Prompt),77.61%,74.17%,75.97%,74.07%,81.25%,54.17%,83.11%,66.67% | ||
16,Gemini-1.5-Flash-002 (Prompt),76.72%,77.28%,81.78%,76.16%,93.75%,66.67%,75.74%,83.33% | ||
17,Gemini-1.5-Pro-002 (Prompt),76.72%,78.68%,82.95%,77.40%,87.50%,83.33%,73.81%,72.22% | ||
18,Gemini-1.5-Pro-001 (Prompt),76.68%,73.13%,76.36%,71.98%,93.75%,75.00%,82.54%,55.56% | ||
19,Functionary-Medium-v3.1 (FC),76.63%,82.61%,81.78%,83.29%,68.75%,70.83%,67.57%,72.22% | ||
20,watt-tool-8B (FC),76.50%,77.35%,76.74%,77.49%,87.50%,70.83%,75.06%,83.33% | ||
21,GPT-4o-mini-2024-07-18 (Prompt),76.50%,77.87%,81.40%,76.73%,93.75%,79.17%,74.26%,83.33% | ||
22,Gemini-1.5-Flash-001 (FC),76.37%,74.17%,75.97%,74.26%,62.50%,58.33%,80.27%,50.00% | ||
23,Gemini-1.5-Pro-002 (FC),76.28%,76.31%,80.23%,75.21%,87.50%,75.00%,76.30%,72.22% | ||
24,Gemini-1.5-Pro-001 (FC),76.23%,71.65%,75.58%,70.75%,81.25%,62.50%,83.79%,50.00% | ||
25,Qwen2.5-72B-Instruct (Prompt),75.30%,82.38%,85.27%,82.15%,62.50%,75.00%,63.95%,100.00% | ||
26,xLAM-7b-r (FC),75.22%,73.87%,72.09%,74.93%,50.00%,62.50%,86.72%,94.44% | ||
27,Hammer2.1-7b (FC),75.11%,77.20%,76.74%,77.40%,81.25%,70.83%,71.77%,82.35% | ||
28,GPT-4o-mini-2024-07-18 (FC),74.41%,76.68%,78.68%,76.16%,87.50%,70.83%,70.75%,83.33% | ||
29,Amazon-Nova-Pro-v1:0 (FC),74.32%,77.72%,80.23%,77.49%,81.25%,58.33%,69.05%,77.78% | ||
30,Qwen2.5-32B-Instruct (Prompt),74.23%,78.83%,82.95%,78.54%,62.50%,58.33%,66.67%,100.00% | ||
31,Haha-7B,74.19%,77.57%,78.29%,77.59%,75.00%,70.83%,68.82%,83.33% | ||
32,Qwen2.5-14B-Instruct (Prompt),74.14%,75.20%,74.42%,75.78%,62.50%,66.67%,72.45%,77.78% | ||
33,Hammer2.1-3b (FC),74.04%,73.06%,73.26%,73.31%,62.50%,66.67%,75.40%,82.35% | ||
34,GoGoAgent,74.01%,74.69%,72.87%,75.40%,68.75%,66.67%,72.90%,77.78% | ||
35,Functionary-Small-v3.1 (FC),73.75%,78.24%,79.84%,78.16%,81.25%,62.50%,66.78%,77.78% | ||
36,DeepSeek-Coder-V2 (FC),73.48%,77.20%,80.62%,77.02%,43.75%,70.83%,67.46%,88.89% | ||
37,Gemini-Exp-1206 (Prompt),72.99%,75.50%,77.52%,74.93%,87.50%,70.83%,69.16%,72.22% | ||
38,xLAM-8x22b-r (FC),72.59%,79.64%,80.23%,79.68%,81.25%,70.83%,61.45%,88.89% | ||
39,claude-3.5-haiku-20241022 (FC),72.37%,77.13%,82.95%,78.35%,18.75%,0.00%,64.85%,83.33% | ||
40,Mistral-small-2402 (FC),72.19%,68.62%,65.50%,71.51%,12.50%,12.50%,77.55%,77.78% | ||
41,Claude-3.5-Sonnet-20241022 (Prompt),71.97%,80.75%,86.82%,80.06%,81.25%,45.83%,58.39%,77.78% | ||
42,Amazon-Nova-Lite-v1:0 (FC),71.61%,70.61%,72.87%,70.09%,75.00%,66.67%,73.24%,66.67% | ||
43,xLAM-8x7b-r (FC),71.08%,77.65%,74.81%,79.30%,43.75%,58.33%,60.54%,94.44% | ||
44,claude-3.5-haiku-20241022 (Prompt),70.77%,76.68%,84.88%,75.02%,87.50%,54.17%,61.56%,77.78% | ||
45,Hammer2.1-1.5b (FC),70.64%,69.73%,71.32%,69.80%,50.00%,62.50%,71.88%,77.78% | ||
46,FireFunction-v1 (FC),70.46%,70.54%,71.71%,72.93%,0.00%,0.00%,69.84%,94.44% | ||
47,MiniCPM3-4B-FC (FC),70.01%,65.73%,74.81%,63.91%,43.75%,62.50%,76.53%,72.22% | ||
48,mistral-large-2407 (FC),69.88%,79.64%,85.27%,78.54%,62.50%,79.17%,54.88%,72.22% | ||
49,Gemini-1.0-Pro-002 (FC),69.70%,68.91%,78.29%,67.62%,43.75%,41.67%,70.98%,66.67% | ||
50,Command R7B (FC),69.17%,59.59%,63.18%,58.69%,56.25%,62.50%,84.13%,55.56% | ||
51,Gemini-1.5-Flash-001 (Prompt),68.90%,76.61%,77.13%,76.16%,93.75%,79.17%,56.80%,83.33% | ||
52,Open-Mixtral-8x22b (FC),68.64%,72.61%,77.13%,73.12%,6.25%,45.83%,62.24%,83.33% | ||
53,GPT-3.5-Turbo-0125 (Prompt),68.55%,78.61%,80.62%,78.63%,75.00%,58.33%,52.61%,94.44% | ||
54,DeepSeek-V3 (FC),68.41%,82.09%,83.72%,82.15%,81.25%,62.50%,47.05%,88.89% | ||
55,Gemma-2-9b-it (Prompt),67.97%,74.54%,77.13%,74.26%,62.50%,66.67%,57.60%,83.33% | ||
56,Qwen2.5-7B-Instruct (Prompt),67.44%,75.06%,76.74%,74.93%,62.50%,70.83%,55.33%,88.89% | ||
57,Gemma-2-27b-it (Prompt),67.17%,80.16%,85.27%,79.39%,68.75%,66.67%,46.71%,94.44% | ||
58,Amazon-Nova-Micro-v1:0 (FC),67.04%,64.17%,65.89%,64.20%,62.50%,45.83%,71.32%,72.22% | ||
59,Claude-3-Opus-20240229 (Prompt),66.99%,79.72%,85.27%,79.11%,68.75%,54.17%,47.17%,83.33% | ||
60,GLM-4-9b-Chat (FC),66.81%,64.03%,72.48%,64.39%,0.00%,0.00%,71.09%,66.67% | ||
61,Open-Mixtral-8x22b (Prompt),66.02%,74.76%,83.33%,72.65%,81.25%,70.83%,52.27%,83.33% | ||
62,Open-Mistral-Nemo-2407 (FC),65.97%,71.13%,77.13%,69.61%,75.00%,70.83%,58.05%,66.67% | ||
63,FireFunction-v2 (FC),65.66%,78.09%,79.07%,78.35%,56.25%,70.83%,46.03%,94.44% | ||
64,Hermes-2-Pro-Llama-3-8B (FC),64.95%,66.62%,72.09%,65.81%,56.25%,50.00%,62.81%,44.44% | ||
65,Meta-Llama-3-70B-Instruct (Prompt),64.95%,78.53%,81.01%,78.25%,75.00%,66.67%,43.42%,100.00% | ||
66,Ministral-8B-Instruct-2410 (FC),64.93%,72.61%,75.58%,72.27%,62.50%,62.50%,53.06%,70.59% | ||
67,GPT-3.5-Turbo-0125 (FC),64.02%,79.20%,81.40%,79.68%,43.75%,58.33%,40.14%,94.44% | ||
68,GPT-4-turbo-2024-04-09 (Prompt),63.84%,84.97%,87.98%,84.14%,100.00%,79.17%,30.73%,100.00% | ||
69,Hammer2.1-0.5b (FC),62.91%,58.11%,60.08%,58.02%,50.00%,45.83%,69.95%,77.78% | ||
70,Llama-3.3-70B-Instruct (Prompt),62.77%,78.02%,81.78%,77.11%,93.75%,66.67%,38.66%,100.00% | ||
71,Llama-3.1-70B-Instruct (Prompt),62.24%,76.54%,78.29%,76.16%,87.50%,66.67%,39.57%,100.00% | ||
72,Open-Mixtral-8x7b (Prompt),61.44%,65.36%,63.57%,66.10%,68.75%,50.00%,54.88%,88.89% | ||
73,Qwen2.5-1.5B-Instruct (Prompt),61.08%,61.07%,70.54%,59.26%,56.25%,41.67%,60.66%,83.33% | ||
74,Llama-3.1-8B-Instruct (Prompt),61.08%,72.91%,74.03%,73.31%,56.25%,54.17%,42.63%,77.78% | ||
75,DBRX-Instruct (Prompt),60.28%,73.50%,78.29%,73.03%,75.00%,41.67%,39.34%,94.44% | ||
76,Granite-20b-FunctionCalling (FC),59.66%,58.48%,68.22%,56.32%,43.75%,58.33%,60.88%,88.89% | ||
77,Command-R-Plus (FC),59.00%,60.84%,70.54%,58.78%,62.50%,45.83%,55.90%,72.22% | ||
78,Mistral-Small-2402 (Prompt),58.77%,57.96%,36.43%,65.24%,0.00%,8.33%,60.32%,44.44% | ||
79,Qwen2.5-3B-Instruct (Prompt),58.69%,66.91%,69.77%,66.48%,56.25%,62.50%,45.46%,88.89% | ||
80,Hermes-2-Pro-Mistral-7B (FC),57.71%,61.36%,69.77%,60.02%,43.75%,41.67%,51.93%,66.67% | ||
81,Llama-3.2-3B-Instruct (Prompt),55.80%,63.73%,63.95%,64.86%,12.50%,45.83%,42.97%,88.89% | ||
82,MiniCPM3-4B (Prompt),54.46%,37.23%,46.51%,34.76%,43.75%,41.67%,80.95%,50.00% | ||
83,Nexusflow-Raven-v2 (FC),54.20%,39.45%,41.47%,38.65%,56.25%,41.67%,76.64%,61.11% | ||
84,xLAM-7b-fc-r (FC),53.40%,61.07%,78.68%,58.02%,31.25%,25.00%,41.16%,77.78% | ||
85,mistral-large-2407 (Prompt),52.82%,82.90%,86.05%,81.96%,93.75%,83.33%,5.78%,100.00% | ||
86,Qwen2-7B-Instruct (Prompt),50.60%,60.77%,56.59%,62.01%,37.50%,66.67%,34.24%,88.89% | ||
87,Gemini-1.0-Pro-002 (Prompt),49.13%,47.59%,50.78%,47.01%,62.50%,29.17%,50.91%,77.78% | ||
88,Open-Mistral-Nemo-2407 (Prompt),49.04%,75.13%,77.91%,74.45%,87.50%,66.67%,8.28%,88.89% | ||
89,Meta-Llama-3-8B-Instruct (Prompt),47.98%,60.62%,61.24%,61.44%,37.50%,33.33%,28.00%,77.78% | ||
90,Llama-3.1-70B-Instruct (FC),45.00%,51.81%,52.33%,52.61%,31.25%,25.00%,33.45%,100.00% | ||
91,Gemma-2-2b-it (Prompt),43.80%,19.54%,26.74%,18.52%,0.00%,0.00%,81.07%,38.89% | ||
92,DeepSeek-Coder-V2-Lite-Instruct (FC),39.40%,3.55%,2.33%,3.80%,0.00%,8.33%,95.12%,0.00% | ||
93,Qwen2-1.5B-Instruct (Prompt),39.05%,41.30%,48.84%,40.27%,12.50%,25.00%,34.47%,94.44% | ||
94,xLAM-1b-fc-r (FC),36.92%,53.89%,63.95%,53.37%,6.25%,0.00%,9.64%,100.00% | ||
95,Llama-3.1-8B-Instruct (FC),33.50%,49.30%,51.94%,49.00%,37.50%,41.67%,8.05%,94.44% | ||
96,Qwen2.5-0.5B-Instruct (Prompt),31.59%,38.34%,53.88%,34.76%,56.25%,16.67%,19.95%,94.44% | ||
97,Llama-3.2-1B-Instruct (Prompt),31.36%,12.14%,31.40%,7.60%,12.50%,4.17%,60.66%,38.89% | ||
6,Gemini-2.0-Flash-001 (FC),79.12%,69.58%,74.42%,68.28%,81.25%,66.67%,94.33%,50.00% | ||
7,o3-mini-2025-01-31 (Prompt),79.08%,78.24%,82.95%,77.11%,87.50%,70.83%,80.50%,72.22% | ||
8,Claude-3.5-Sonnet-20241022 (FC),78.94%,80.61%,84.11%,81.96%,25.00%,20.83%,76.42%,77.78% | ||
9,ToolACE-8B (FC),78.59%,76.02%,73.26%,76.73%,81.25%,70.83%,82.43%,83.33% | ||
10,Gemini-2.0-Pro-Exp-02-05 (FC),78.50%,70.69%,76.74%,68.85%,87.50%,75.00%,91.16%,44.44% | ||
11,Gemini-2.0-Flash-Lite-Preview-02-05 (Prompt),78.28%,72.39%,72.87%,73.22%,50.00%,45.83%,87.53%,66.67% | ||
12,Gemini-2.0-Pro-Exp-02-05 (Prompt),78.23%,76.61%,81.01%,75.12%,93.75%,83.33%,80.84%,72.22% | ||
13,Claude-3-Opus-20240229 (FC),78.05%,75.20%,79.07%,75.78%,31.25%,37.50%,82.77%,61.11% | ||
14,Gemini-1.5-Flash-002 (FC),78.05%,70.98%,72.87%,70.18%,81.25%,79.17%,89.34%,55.56% | ||
15,o1-2024-12-17 (FC),78.01%,77.20%,81.78%,79.01%,0.00%,0.00%,79.37%,72.22% | ||
16,watt-tool-70B (FC),77.74%,83.57%,86.05%,83.48%,81.25%,62.50%,68.48%,94.44% | ||
17,Mistral-Medium-2312 (Prompt),77.61%,74.17%,75.97%,74.07%,81.25%,54.17%,83.11%,66.67% | ||
18,o3-mini-2025-01-31 (FC),77.30%,76.83%,81.40%,78.63%,0.00%,0.00%,78.00%,77.78% | ||
19,Gemini-2.0-Flash-Lite-Preview-02-05 (FC),76.90%,67.21%,70.54%,66.19%,81.25%,66.67%,92.29%,50.00% | ||
20,Gemini-1.5-Flash-002 (Prompt),76.72%,77.28%,81.78%,76.16%,93.75%,66.67%,75.74%,83.33% | ||
21,Gemini-1.5-Pro-002 (Prompt),76.72%,78.68%,82.95%,77.40%,87.50%,83.33%,73.81%,72.22% | ||
22,Gemini-1.5-Pro-001 (Prompt),76.68%,73.13%,76.36%,71.98%,93.75%,75.00%,82.54%,55.56% | ||
23,Functionary-Medium-v3.1 (FC),76.63%,82.61%,81.78%,83.29%,68.75%,70.83%,67.57%,72.22% | ||
24,watt-tool-8B (FC),76.50%,77.35%,76.74%,77.49%,87.50%,70.83%,75.06%,83.33% | ||
25,GPT-4o-mini-2024-07-18 (Prompt),76.50%,77.87%,81.40%,76.73%,93.75%,79.17%,74.26%,83.33% | ||
26,Gemini-1.5-Flash-001 (FC),76.37%,74.17%,75.97%,74.26%,62.50%,58.33%,80.27%,50.00% | ||
27,Gemini-1.5-Pro-002 (FC),76.28%,76.31%,80.23%,75.21%,87.50%,75.00%,76.30%,72.22% | ||
28,Gemini-1.5-Pro-001 (FC),76.23%,71.65%,75.58%,70.75%,81.25%,62.50%,83.79%,50.00% | ||
29,Qwen2.5-72B-Instruct (Prompt),75.30%,82.38%,85.27%,82.15%,62.50%,75.00%,63.95%,100.00% | ||
30,xLAM-7b-r (FC),75.22%,73.87%,72.09%,74.93%,50.00%,62.50%,86.72%,94.44% | ||
31,Hammer2.1-7b (FC),75.11%,77.20%,76.74%,77.40%,81.25%,70.83%,71.77%,82.35% | ||
32,GPT-4o-mini-2024-07-18 (FC),74.41%,76.68%,78.68%,76.16%,87.50%,70.83%,70.75%,83.33% | ||
33,Amazon-Nova-Pro-v1:0 (FC),74.32%,77.72%,80.23%,77.49%,81.25%,58.33%,69.05%,77.78% | ||
34,Qwen2.5-32B-Instruct (Prompt),74.23%,78.83%,82.95%,78.54%,62.50%,58.33%,66.67%,100.00% | ||
35,Haha-7B,74.19%,77.57%,78.29%,77.59%,75.00%,70.83%,68.82%,83.33% | ||
36,Qwen2.5-14B-Instruct (Prompt),74.14%,75.20%,74.42%,75.78%,62.50%,66.67%,72.45%,77.78% | ||
37,Hammer2.1-3b (FC),74.04%,73.06%,73.26%,73.31%,62.50%,66.67%,75.40%,82.35% | ||
38,GoGoAgent,74.01%,74.69%,72.87%,75.40%,68.75%,66.67%,72.90%,77.78% | ||
39,Functionary-Small-v3.1 (FC),73.75%,78.24%,79.84%,78.16%,81.25%,62.50%,66.78%,77.78% | ||
40,DeepSeek-Coder-V2 (FC),73.48%,77.20%,80.62%,77.02%,43.75%,70.83%,67.46%,88.89% | ||
41,CALM-8B,72.95%,67.28%,71.71%,66.67%,56.25%,54.17%,81.41%,83.33% | ||
42,xLAM-8x22b-r (FC),72.59%,79.64%,80.23%,79.68%,81.25%,70.83%,61.45%,88.89% | ||
43,claude-3.5-haiku-20241022 (FC),72.37%,77.13%,82.95%,78.35%,18.75%,0.00%,64.85%,83.33% | ||
44,Mistral-small-2402 (FC),72.19%,68.62%,65.50%,71.51%,12.50%,12.50%,77.55%,77.78% | ||
45,Claude-3.5-Sonnet-20241022 (Prompt),71.97%,80.75%,86.82%,80.06%,81.25%,45.83%,58.39%,77.78% | ||
46,CALM-8B,71.92%,66.17%,70.16%,65.34%,68.75%,58.33%,80.84%,66.67% | ||
47,Amazon-Nova-Lite-v1:0 (FC),71.61%,70.61%,72.87%,70.09%,75.00%,66.67%,73.24%,66.67% | ||
48,xLAM-8x7b-r (FC),71.08%,77.65%,74.81%,79.30%,43.75%,58.33%,60.54%,94.44% | ||
49,claude-3.5-haiku-20241022 (Prompt),70.77%,76.68%,84.88%,75.02%,87.50%,54.17%,61.56%,77.78% | ||
50,Hammer2.1-1.5b (FC),70.64%,69.73%,71.32%,69.80%,50.00%,62.50%,71.88%,77.78% | ||
51,FireFunction-v1 (FC),70.46%,70.54%,71.71%,72.93%,0.00%,0.00%,69.84%,94.44% | ||
52,MiniCPM3-4B-FC (FC),70.01%,65.73%,74.81%,63.91%,43.75%,62.50%,76.53%,72.22% | ||
53,mistral-large-2407 (FC),69.88%,79.64%,85.27%,78.54%,62.50%,79.17%,54.88%,72.22% | ||
54,Gemini-1.0-Pro-002 (FC),69.70%,68.91%,78.29%,67.62%,43.75%,41.67%,70.98%,66.67% | ||
55,Command R7B (FC),69.17%,59.59%,63.18%,58.69%,56.25%,62.50%,84.13%,55.56% | ||
56,Gemini-1.5-Flash-001 (Prompt),68.90%,76.61%,77.13%,76.16%,93.75%,79.17%,56.80%,83.33% | ||
57,Open-Mixtral-8x22b (FC),68.64%,72.61%,77.13%,73.12%,6.25%,45.83%,62.24%,83.33% | ||
58,GPT-3.5-Turbo-0125 (Prompt),68.55%,78.61%,80.62%,78.63%,75.00%,58.33%,52.61%,94.44% | ||
59,DeepSeek-V3 (FC),68.41%,82.09%,83.72%,82.15%,81.25%,62.50%,47.05%,88.89% | ||
60,Sky-T1-32B-Preview (Prompt),68.00%,76.76%,77.52%,77.11%,62.50%,62.50%,54.08%,94.12% | ||
61,Gemma-2-9b-it (Prompt),67.97%,74.54%,77.13%,74.26%,62.50%,66.67%,57.60%,83.33% | ||
62,Qwen2.5-7B-Instruct (Prompt),67.44%,75.06%,76.74%,74.93%,62.50%,70.83%,55.33%,88.89% | ||
63,Gemma-2-27b-it (Prompt),67.17%,80.16%,85.27%,79.39%,68.75%,66.67%,46.71%,94.44% | ||
64,Amazon-Nova-Micro-v1:0 (FC),67.04%,64.17%,65.89%,64.20%,62.50%,45.83%,71.32%,72.22% | ||
65,Claude-3-Opus-20240229 (Prompt),66.99%,79.72%,85.27%,79.11%,68.75%,54.17%,47.17%,83.33% | ||
66,GLM-4-9b-Chat (FC),66.81%,64.03%,72.48%,64.39%,0.00%,0.00%,71.09%,66.67% | ||
67,Open-Mixtral-8x22b (Prompt),66.02%,74.76%,83.33%,72.65%,81.25%,70.83%,52.27%,83.33% | ||
68,Open-Mistral-Nemo-2407 (FC),65.97%,71.13%,77.13%,69.61%,75.00%,70.83%,58.05%,66.67% | ||
69,FireFunction-v2 (FC),65.66%,78.09%,79.07%,78.35%,56.25%,70.83%,46.03%,94.44% | ||
70,Hermes-2-Pro-Llama-3-8B (FC),64.95%,66.62%,72.09%,65.81%,56.25%,50.00%,62.81%,44.44% | ||
71,Meta-Llama-3-70B-Instruct (Prompt),64.95%,78.53%,81.01%,78.25%,75.00%,66.67%,43.42%,100.00% | ||
72,Ministral-8B-Instruct-2410 (FC),64.93%,72.61%,75.58%,72.27%,62.50%,62.50%,53.06%,70.59% | ||
73,GPT-3.5-Turbo-0125 (FC),64.02%,79.20%,81.40%,79.68%,43.75%,58.33%,40.14%,94.44% | ||
74,GPT-4-turbo-2024-04-09 (Prompt),63.84%,84.97%,87.98%,84.14%,100.00%,79.17%,30.73%,100.00% | ||
75,Hammer2.1-0.5b (FC),62.91%,58.11%,60.08%,58.02%,50.00%,45.83%,69.95%,77.78% | ||
76,Llama-3.3-70B-Instruct (Prompt),62.77%,78.02%,81.78%,77.11%,93.75%,66.67%,38.66%,100.00% | ||
77,Llama-3.1-70B-Instruct (Prompt),62.24%,76.54%,78.29%,76.16%,87.50%,66.67%,39.57%,100.00% | ||
78,Open-Mixtral-8x7b (Prompt),61.44%,65.36%,63.57%,66.10%,68.75%,50.00%,54.88%,88.89% | ||
79,Qwen2.5-1.5B-Instruct (Prompt),61.08%,61.07%,70.54%,59.26%,56.25%,41.67%,60.66%,83.33% | ||
80,Llama-3.1-8B-Instruct (Prompt),61.08%,72.91%,74.03%,73.31%,56.25%,54.17%,42.63%,77.78% | ||
81,DBRX-Instruct (Prompt),60.28%,73.50%,78.29%,73.03%,75.00%,41.67%,39.34%,94.44% | ||
82,Granite-20b-FunctionCalling (FC),59.66%,58.48%,68.22%,56.32%,43.75%,58.33%,60.88%,88.89% | ||
83,Command-R-Plus (FC),59.00%,60.84%,70.54%,58.78%,62.50%,45.83%,55.90%,72.22% | ||
84,Bielik-11B-v2.3-Instruct (Prompt),58.91%,69.43%,72.87%,69.33%,43.75%,54.17%,42.40%,77.78% | ||
85,Mistral-Small-2402 (Prompt),58.77%,57.96%,36.43%,65.24%,0.00%,8.33%,60.32%,44.44% | ||
86,Qwen2.5-3B-Instruct (Prompt),58.69%,66.91%,69.77%,66.48%,56.25%,62.50%,45.46%,88.89% | ||
87,Hermes-2-Pro-Mistral-7B (FC),57.71%,61.36%,69.77%,60.02%,43.75%,41.67%,51.93%,66.67% | ||
88,Llama-3.2-3B-Instruct (Prompt),55.80%,63.73%,63.95%,64.86%,12.50%,45.83%,42.97%,88.89% | ||
89,Falcon3-7B-Instruct (FC),54.86%,67.95%,74.03%,66.48%,75.00%,62.50%,34.13%,88.89% | ||
90,MiniCPM3-4B (Prompt),54.46%,37.23%,46.51%,34.76%,43.75%,41.67%,80.95%,50.00% | ||
91,Nexusflow-Raven-v2 (FC),54.20%,39.45%,41.47%,38.65%,56.25%,41.67%,76.64%,61.11% | ||
92,Falcon3-10B-Instruct (FC),54.11%,75.28%,76.36%,76.16%,50.00%,41.67%,20.86%,94.44% | ||
93,xLAM-7b-fc-r (FC),53.40%,61.07%,78.68%,58.02%,31.25%,25.00%,41.16%,77.78% | ||
94,mistral-large-2407 (Prompt),52.82%,82.90%,86.05%,81.96%,93.75%,83.33%,5.78%,100.00% | ||
95,Qwen2-7B-Instruct (Prompt),50.60%,60.77%,56.59%,62.01%,37.50%,66.67%,34.24%,88.89% | ||
96,Gemini-1.0-Pro-002 (Prompt),49.13%,47.59%,50.78%,47.01%,62.50%,29.17%,50.91%,77.78% | ||
97,Open-Mistral-Nemo-2407 (Prompt),49.04%,75.13%,77.91%,74.45%,87.50%,66.67%,8.28%,88.89% | ||
98,Meta-Llama-3-8B-Instruct (Prompt),47.98%,60.62%,61.24%,61.44%,37.50%,33.33%,28.00%,77.78% | ||
99,Falcon3-3B-Instruct (FC),47.40%,55.51%,55.43%,56.32%,31.25%,37.50%,34.35%,77.78% | ||
100,Llama-3.1-70B-Instruct (FC),45.00%,51.81%,52.33%,52.61%,31.25%,25.00%,33.45%,100.00% | ||
101,Gemma-2-2b-it (Prompt),43.80%,19.54%,26.74%,18.52%,0.00%,0.00%,81.07%,38.89% | ||
102,QwQ-32B-Preview (Prompt),40.78%,3.55%,7.36%,2.75%,0.00%,0.00%,98.64%,0.00% | ||
103,DeepSeek-Coder-V2-Lite-Instruct (FC),39.40%,3.55%,2.33%,3.80%,0.00%,8.33%,95.12%,0.00% | ||
104,Qwen2-1.5B-Instruct (Prompt),39.05%,41.30%,48.84%,40.27%,12.50%,25.00%,34.47%,94.44% | ||
105,xLAM-1b-fc-r (FC),36.92%,53.89%,63.95%,53.37%,6.25%,0.00%,9.64%,100.00% | ||
106,Llama-3.1-8B-Instruct (FC),33.50%,49.30%,51.94%,49.00%,37.50%,41.67%,8.05%,94.44% | ||
107,Falcon3-1B-Instruct (FC),32.70%,2.96%,4.65%,2.37%,0.00%,12.50%,78.91%,0.00% | ||
108,Qwen2.5-0.5B-Instruct (Prompt),31.59%,38.34%,53.88%,34.76%,56.25%,16.67%,19.95%,94.44% | ||
109,Llama-3.2-1B-Instruct (Prompt),31.36%,12.14%,31.40%,7.60%,12.50%,4.17%,60.66%,38.89% |
Oops, something went wrong.