@@ -550,69 +550,90 @@ <h1 id="scicode-leaderboard">SciCode Leaderboard</h1>
550
550
< table >
551
551
< thead >
552
552
< tr >
553
- < th > Model </ th >
553
+ < th > Models </ th >
554
554
< th > Main Problem Resolve Rate</ th >
555
+ < th > < span style ="background-color:lightgrey "> Subproblem</ span > </ th >
555
556
</ tr >
556
557
</ thead >
557
558
< tbody >
558
559
< tr >
559
- < td > 🥇OpenAI o1-preview</ td >
560
- < td > 7.7%</ td >
560
+ < td > 🥇 OpenAI o1-preview</ td >
561
+ < td > < div align ="center "> 7.7</ div > </ td >
562
+ < td > < div align ="center " style ="background-color:lightgrey "> 28.5</ div > </ td >
561
563
</ tr >
562
564
< tr >
563
- < td > 🥈Claude3.5-Sonnet</ td >
564
- < td > 4.6%</ td >
565
+ < td > 🥈 Claude3.5-Sonnet</ td >
566
+ < td > < div align ="center "> 4.6</ div > </ td >
567
+ < td > < div align ="center " style ="background-color:lightgrey "> 26.0</ div > </ td >
565
568
</ tr >
566
569
< tr >
567
- < td > 🥉Deepseek-Coder-v2</ td >
568
- < td > 3.1%</ td >
570
+ < td > 🥉 Claude3.5-Sonnet (new)</ td >
571
+ < td > < div align ="center "> 4.6</ div > </ td >
572
+ < td > < div align ="center " style ="background-color:lightgrey "> 25.3</ div > </ td >
573
+ </ tr >
574
+ < tr >
575
+ < td > Deepseek-Coder-v2</ td >
576
+ < td > < div align ="center "> 3.1</ div > </ td >
577
+ < td > < div align ="center " style ="background-color:lightgrey "> 21.2</ div > </ td >
569
578
</ tr >
570
579
< tr >
571
580
< td > GPT-4o</ td >
572
- < td > 1.5%</ td >
581
+ < td > < div align ="center "> 1.5</ div > </ td >
582
+ < td > < div align ="center " style ="background-color:lightgrey "> 25.0</ div > </ td >
573
583
</ tr >
574
584
< tr >
575
585
< td > GPT-4-Turbo</ td >
576
- < td > 1.5%</ td >
586
+ < td > < div align ="center "> 1.5</ div > </ td >
587
+ < td > < div align ="center " style ="background-color:lightgrey "> 22.9</ div > </ td >
577
588
</ tr >
578
589
< tr >
579
590
< td > OpenAI o1-mini</ td >
580
- < td > 1.5%</ td >
591
+ < td > < div align ="center "> 1.5</ div > </ td >
592
+ < td > < div align ="center " style ="background-color:lightgrey "> 22.2</ div > </ td >
581
593
</ tr >
582
594
< tr >
583
595
< td > Gemini 1.5 Pro</ td >
584
- < td > 1.5%</ td >
596
+ < td > < div align ="center "> 1.5</ div > </ td >
597
+ < td > < div align ="center " style ="background-color:lightgrey "> 21.9</ div > </ td >
585
598
</ tr >
586
599
< tr >
587
600
< td > Claude3-Opus</ td >
588
- < td > 1.5%</ td >
601
+ < td > < div align ="center "> 1.5</ div > </ td >
602
+ < td > < div align ="center " style ="background-color:lightgrey "> 21.5</ div > </ td >
589
603
</ tr >
590
604
< tr >
591
- < td > Claude3-Sonnet</ td >
592
- < td > 1.5%</ td >
605
+ < td > Llama-3.1-405B-Chat</ td >
606
+ < td > < div align ="center "> 1.5</ div > </ td >
607
+ < td > < div align ="center " style ="background-color:lightgrey "> 19.8</ div > </ td >
593
608
</ tr >
594
609
< tr >
595
- < td > Qwen2-72B-Instruct</ td >
596
- < td > 1.5%</ td >
610
+ < td > Claude3-Sonnet</ td >
611
+ < td > < div align ="center "> 1.5</ div > </ td >
612
+ < td > < div align ="center " style ="background-color:lightgrey "> 17.0</ div > </ td >
597
613
</ tr >
598
614
< tr >
599
- < td > Llama-3.1-405B-Instruct</ td >
600
- < td > 0%</ td >
615
+ < td > Qwen2-72B-Instruct</ td >
616
+ < td > < div align ="center "> 1.5</ div > </ td >
617
+ < td > < div align ="center " style ="background-color:lightgrey "> 17.0</ div > </ td >
601
618
</ tr >
602
619
< tr >
603
- < td > Llama-3.1-70B-Instruct</ td >
604
- < td > 0%</ td >
620
+ < td > Llama-3.1-70B-Chat</ td >
621
+ < td > < div align ="center "> 0.0</ div > </ td >
622
+ < td > < div align ="center " style ="background-color:lightgrey "> 17.0</ div > </ td >
605
623
</ tr >
606
624
< tr >
607
625
< td > Mixtral-8x22B-Instruct</ td >
608
- < td > 0%</ td >
626
+ < td > < div align ="center "> 0.0</ div > </ td >
627
+ < td > < div align ="center " style ="background-color:lightgrey "> 16.3</ div > </ td >
609
628
</ tr >
610
629
< tr >
611
630
< td > Llama-3-70B-Chat</ td >
612
- < td > 0%</ td >
631
+ < td > < div align ="center "> 0.0</ div > </ td >
632
+ < td > < div align ="center " style ="background-color:lightgrey "> 14.6</ div > </ td >
613
633
</ tr >
614
634
</ tbody >
615
635
</ table >
636
+ < p > Note: When models tie on the Main Problem Resolve Rate, the tie is broken by comparing their Subproblem results.</ p >
616
637
<!-- Once you've added the results to the submission repository,
617
638
bring back the table here -->
618
639
<!-- include-markdown "leaderboard_table.md" -->
0 commit comments