|
2 | 2 | import BaseLayout from "../layouts/BaseLayout.astro"; |
3 | 3 | import { CoopChart } from "../components/CoopChart"; |
4 | 4 |
|
5 | | -const title = "CooperBench: Why Coding Agents Cannot be Your Teammates Yet"; |
| 5 | +const title = "CooperBench: Benchmarking Agent Teams | Why Coding Agents Cannot be Your Teammates Yet"; |
6 | 6 | const description = |
7 | | - "CooperBench is a benchmark of over 600 collaborative coding tasks. We find that agents achieve 30% lower success rates when working together compared to performing both tasks individually."; |
| 7 | + "CooperBench is the first benchmark for evaluating AI agent teams, agents as teammates, and human-AI collaboration. Over 600 collaborative coding tasks reveal that agent teams achieve 30% lower success rates when working together compared to performing tasks individually."; |
| 8 | +const keywords = "benchmarking agent teams, benchmarking agents as teammates, AI agent cooperation, multi-agent benchmark, collaborative coding, agent collaboration, AI teammates, human-AI interaction, human-AI collaboration, human-agent collaboration"; |
8 | 9 | --- |
9 | 10 |
|
10 | | -<BaseLayout title={title} description={description} activeNav="Home"> |
| 11 | +<BaseLayout title={title} description={description} keywords={keywords} activeNav="Home"> |
11 | 12 | <style is:inline slot="head"> |
12 | 13 | :root { |
13 | 14 | --background: 0 0% 100%; |
@@ -82,7 +83,7 @@ const description = |
82 | 83 | class="text-3xl md:text-4xl font-semibold text-gray-900 leading-tight mb-6" |
83 | 84 | > |
84 | 85 | CooperBench:<br /> |
85 | | - Benchmarking AI Agents' Cooperation |
| 86 | + Benchmarking Agent Teams & Cooperation |
86 | 87 | </h1> |
87 | 88 |
|
88 | 89 | <p class="text-2xl text-gray-500 mb-8"> |
@@ -555,11 +556,12 @@ const description = |
555 | 556 |
|
556 | 557 | <div id="benchmark-content" class="mt-4"> |
557 | 558 | <p class="text-gray-600 leading-relaxed mb-6"> |
558 | | - CooperBench is the first benchmark designed to measure |
559 | | - how well AI agents can cooperate when handling |
560 | | - individual tasks with potential conflicts. We |
561 | | - constructed 652 tasks from 12 popular open-source |
562 | | - libraries across Python, TypeScript, Go, and Rust. |
| 559 | + CooperBench is the first benchmark for evaluating agent teams |
| 560 | + and measuring how well AI agents perform as teammates. Our work |
| 561 | + informs human-AI and human-agent collaboration by |
| 562 | + studying whether agents can cooperate when handling individual |
| 563 | + tasks with potential conflicts. The benchmark includes 652 tasks |
| 564 | + from 12 popular open-source libraries across Python, TypeScript, Go, and Rust. |
563 | 565 | </p> |
564 | 566 |
|
565 | 567 | <div class="flex flex-wrap gap-6 text-center mb-6"> |
|
0 commit comments