Skip to content

Commit 8b05322

Browse files
authored
Merge branch 'sys-intelligence:main' into main
2 parents 16077b4 + c3801f9 commit 8b05322

File tree

229 files changed

+15588
-17194
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

229 files changed

+15588
-17194
lines changed

.github/workflows/claude.yml

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
name: Claude Code
2+
3+
on:
4+
issue_comment:
5+
types: [created]
6+
pull_request_review_comment:
7+
types: [created]
8+
issues:
9+
types: [opened, assigned]
10+
pull_request_review:
11+
types: [submitted]
12+
13+
jobs:
14+
claude:
15+
if: |
16+
(github.event_name == 'issue_comment' && contains(github.event.comment.body, '@claude')) ||
17+
(github.event_name == 'pull_request_review_comment' && contains(github.event.comment.body, '@claude')) ||
18+
(github.event_name == 'pull_request_review' && contains(github.event.review.body, '@claude')) ||
19+
(github.event_name == 'issues' && (contains(github.event.issue.body, '@claude') || contains(github.event.issue.title, '@claude')))
20+
runs-on: ubuntu-latest
21+
permissions:
22+
contents: read
23+
pull-requests: read
24+
issues: read
25+
id-token: write
26+
actions: read # Required for Claude to read CI results on PRs
27+
steps:
28+
- name: Checkout repository
29+
uses: actions/checkout@v4
30+
with:
31+
fetch-depth: 1
32+
33+
- name: Run Claude Code
34+
id: claude
35+
uses: anthropics/claude-code-action@v1
36+
with:
37+
claude_code_oauth_token: ${{ secrets.CLAUDE_CODE_OAUTH_TOKEN }}
38+
39+
# This is an optional setting that allows Claude to read CI results on PRs
40+
additional_permissions: |
41+
actions: read
42+
43+
# Optional: Give a custom prompt to Claude. If this is not specified, Claude will perform the instructions specified in the comment that tagged it.
44+
# prompt: 'Update the pull request description to include a summary of changes.'
45+
46+
# Optional: Add claude_args to customize behavior and configuration
47+
# See https://github.com/anthropics/claude-code-action/blob/main/docs/usage.md
48+
# or https://code.claude.com/docs/en/cli-reference for available options
49+
# claude_args: '--allowed-tools Bash(gh pr:*)'
50+

.github/workflows/test.yml

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,9 @@ jobs:
1818
matrix:
1919
benchmark:
2020
- example_bench
21-
- course_exam_bench
2221
- toposense_bench
22+
- courselab_bench
23+
- courseexam_bench
2324
# TODO: For now, we comment out other benchmarks as they have no tests
2425
# - arteval_bench
2526
# - cache_bench
@@ -32,7 +33,7 @@ jobs:
3233
- name: Set up Python
3334
uses: actions/setup-python@v5
3435
with:
35-
python-version: '3.9'
36+
python-version: '3.10'
3637

3738
- name: Install dependencies
3839
working-directory: benchmarks/${{ matrix.benchmark }}
@@ -44,6 +45,9 @@ jobs:
4445
if [ -f requirements.txt ]; then
4546
pip install -r requirements.txt
4647
fi
48+
if [ -f pyproject.toml ]; then
49+
pip install -e ".[dev]"
50+
fi
4751
deactivate
4852
4953
- name: Run tests
Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
1+
name: Experimental courseexam benchmark run
2+
3+
on:
4+
workflow_dispatch:
5+
inputs:
6+
pr_number:
7+
description: 'PR number to validate'
8+
required: true
9+
type: number
10+
11+
jobs:
12+
validate-exam:
13+
runs-on: ubuntu-latest
14+
permissions:
15+
contents: read
16+
pull-requests: write
17+
18+
steps:
19+
- name: Checkout PR code
20+
uses: actions/checkout@v4
21+
with:
22+
ref: refs/pull/${{ inputs.pr_number }}/merge
23+
fetch-depth: 0
24+
25+
# Fails the job early unless the requested PR targets the `main` branch.
- name: Verify PR targets main branch
  env:
    GH_TOKEN: ${{ github.token }}
    # Pass the workflow input through the environment instead of expanding
    # it into the script text, so the shell always treats it as data.
    PR_NUMBER: ${{ inputs.pr_number }}
  run: |
    BASE_BRANCH=$(gh pr view "$PR_NUMBER" --json baseRefName --jq '.baseRefName')
    if [ "$BASE_BRANCH" != "main" ]; then
      echo "This workflow only runs for PRs targeting main (current: $BASE_BRANCH)"
      exit 1
    fi
34+
35+
- name: Detect new or modified exams
36+
id: detect_exams
37+
run: |
38+
git fetch origin main
39+
EXAM_DIRS=$(git diff --name-only origin/main...HEAD | \
40+
grep -E '^benchmarks/courseexam_bench/data/raw/[^/]+/exam\.md$' | \
41+
sed 's|/exam\.md||' | sed 's|benchmarks/courseexam_bench/data/raw/||' | sort -u)
42+
43+
if [ -z "$EXAM_DIRS" ]; then
44+
echo "No new or modified exams detected"
45+
echo "has_exams=false" >> $GITHUB_OUTPUT
46+
exit 0
47+
fi
48+
49+
EXAM_IDS=$(echo "$EXAM_DIRS" | tr '\n' ',' | sed 's/,$//')
50+
echo "exam_ids=$EXAM_IDS" >> $GITHUB_OUTPUT
51+
echo "has_exams=true" >> $GITHUB_OUTPUT
52+
53+
- name: Set up Python
54+
if: steps.detect_exams.outputs.has_exams == 'true'
55+
uses: actions/setup-python@v5
56+
with:
57+
python-version: '3.11'
58+
59+
- name: Install dependencies
60+
if: steps.detect_exams.outputs.has_exams == 'true'
61+
working-directory: benchmarks/courseexam_bench
62+
run: pip install -e . inspect-ai anthropic
63+
64+
- name: Prepare dataset and run evaluation
65+
if: steps.detect_exams.outputs.has_exams == 'true'
66+
working-directory: benchmarks/courseexam_bench
67+
env:
68+
ANTHROPIC_API_KEY: ${{ secrets.CLAUDE_CODE_OAUTH_TOKEN }}
69+
EXAM_IDS: ${{ steps.detect_exams.outputs.exam_ids }}
70+
# I think it's simpler to have a single script block here
71+
run: |
72+
python prepare_dataset.py
73+
74+
python << 'EOF'
75+
import os
76+
from inspect_ai import eval
77+
from courseexam.courseexam import courseexam
78+
79+
exam_ids = [x.strip() for x in os.environ['EXAM_IDS'].split(',') if x.strip()]
80+
81+
task = courseexam(exam_ids=exam_ids, judge_model="anthropic/claude-haiku-4-5")
82+
eval(tasks=task, model="anthropic/claude-haiku-4-5")
83+
EOF
84+
85+
- name: Upload evaluation results
86+
if: steps.detect_exams.outputs.has_exams == 'true'
87+
uses: actions/upload-artifact@v4
88+
with:
89+
name: evaluation-results
90+
path: benchmarks/courseexam_bench/logs/*.eval
91+
retention-days: 30
92+
93+
# Posts a summary comment on the PR listing the exams that were evaluated
# and pointing to the uploaded `evaluation-results` artifact.
- name: Comment on PR
  if: steps.detect_exams.outputs.has_exams == 'true'
  env:
    GH_TOKEN: ${{ github.token }}
    # exam_ids is built from directory names found in the PR diff, so it is
    # untrusted. Hand it (and the PR number) to the script via the
    # environment rather than expanding it into the script text, where a
    # crafted directory name could inject shell commands.
    EXAM_IDS: ${{ steps.detect_exams.outputs.exam_ids }}
    PR_NUMBER: ${{ inputs.pr_number }}
  run: |
    # The heredoc delimiter is intentionally unquoted so the $(...) below is
    # expanded; literal backticks in the Markdown are therefore escaped.
    cat > comment.md << EOF
    ## Experimental courseexam benchmark run

    **Exams tested:**
    $(printf '%s\n' "$EXAM_IDS" | tr ',' '\n' | sed 's/^/- `/' | sed 's/$/`/')

    **Evaluation file:** Download the \`evaluation-results\` artifact from [this workflow run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}) and inspect it using \`inspect view <file>.eval\` or the [Inspect VS Code extension](https://marketplace.visualstudio.com/items?itemName=UKGovernmentAISafetyInstitute.inspect-ai). For more details on how to use the web-based log viewer, see [Inspect Log Viewer documentation](https://inspect.aisi.org.uk/log-viewer.html)

    This experimental run helps you verify that questions have enough context for the LLM to answer and that grading rubrics are appropriate.
    EOF

    gh pr comment "$PR_NUMBER" --body-file comment.md

README.md

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,15 @@
22

33
System Intelligence Benchmark is a comprehensive benchmark suite for evaluating the performance of Large Language Models (LLMs) and AI systems across critical system capabilities. It features tutorials and example benchmarks, and offers both CLI tools and an SDK for further development.
44

5+
6+
> [!Note]
7+
> ## Private Contributions
8+
> This repository contains the public portion of the benchmark dataset. We also maintain a private fork that is not publicly accessible. If you would like to contribute exams or labs to the benchmark while keeping them private and confidential, please contribute via our private repository instead.
9+
>
10+
> The private repository is access-controlled. To gain access, please reach out on our [Slack channel](https://join.slack.com/t/sys-intelligence/shared_invite/zt-3hpkgr2aa-NnuPxUbyHr45S89DFi_N1A).
11+
12+
13+
514
## Benchmark Overview
615
A benchmark is a standard or point of reference against which things may be compared or assessed. In the context of AI and LLMs, benchmarks are essential for evaluating model capabilities, guiding research directions, and measuring progress.
716

@@ -17,8 +26,8 @@ The benchmark framework is **still under development**. If you have any question
1726

1827
System Intelligence Benchmark currently includes the following example benchmarks. Each benchmark assesses specific capabilities across multiple levels within a given research direction. Some benchmarks are still under development — we're actively updating them. Stay tuned!
1928

20-
- **System Exam Benchmark** ([benchmarks/course_exam_bench/](benchmarks/course_exam_bench/)) - Tests LLM understanding of system concepts through university course exams (54 questions across 4 exams)
21-
- **System Lab Benchmark** ([benchmarks/course_lab_bench/](benchmarks/course_lab_bench/)) - Assesses AI capability on practical system course labs and projects
29+
- **System Exam Benchmark** ([benchmarks/courseexam_bench/](benchmarks/courseexam_bench/)) - Tests LLM understanding of system concepts through university course exams
30+
- **System Lab Benchmark** ([benchmarks/courselab_bench/](benchmarks/courselab_bench/)) - Assesses AI capability on practical system course labs and projects
2231
- **System Artifact Benchmark** ([benchmarks/arteval_bench/](benchmarks/arteval_bench/)) - Evaluates AI performance on artifact evaluation
2332
- **System Modeling Benchmark** ([benchmarks/sysmobench/](benchmarks/sysmobench/)) - Evaluates an agent's ability to produce correct TLA+ models for real-world concurrent and distributed systems, covering system capabilities across system comprehension, abstraction, and potentially tool fluency.
2433
- **TopoSense Benchmark** ([benchmarks/toposense_bench/](benchmarks/toposense_bench/)) - Evaluates Semantic-Spatial Sensor Scheduling (S³) capabilities in large-scale IoT digital twins (5,250 queries across 2,510 cameras)
Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
\relax
2+
\providecommand\hyper@newdestlabel[2]{}
3+
\providecommand\HyField@AuxAddToFields[1]{}
4+
\providecommand\HyField@AuxAddToCoFields[2]{}
5+
\citation{Ellis1989}
6+
\citation{Nichols1995}
7+
\citation{DayRichter2010}
8+
\citation{Li2006,Roh2011RGA,Sun2020OT}
9+
\@LN@col{1}
10+
\@writefile{toc}{\contentsline {section}{Abstract}{1}{section*.1}\protected@file@percent }
11+
\@LN@col{2}
12+
\@writefile{lof}{\contentsline {figure}{\numberline {1}{\ignorespaces Two concurrent insertions into a text document.}}{1}{figure.caption.5}\protected@file@percent }
13+
\providecommand*\caption@xref[2]{\@setref\relax\@undefined{#1}}
14+
\newlabel{two-inserts}{{1}{1}{Two concurrent insertions into a text document}{figure.caption.5}{}}
15+
\@writefile{toc}{\contentsline {section}{\numberline {1}Introduction}{1}{section.1}\protected@file@percent }
16+
\newlabel{introduction}{{1}{1}{Introduction}{section.1}{}}
17+
\citation{Upwelling,Patchwork}
18+
\citation{Oster2006WOOT}
19+
\citation{crdt-papers}
20+
\citation{DayRichter2010}
21+
\citation{overleaf-ot}
22+
\citation{ditto-aircraft}
23+
\citation{ditto-military}
24+
\citation{antarctica}
25+
\citation{Hellerstein2010}
26+
\@LN@col{1}
27+
\@LN@col{2}
28+
\@writefile{toc}{\contentsline {section}{\numberline {2}Background}{2}{section.2}\protected@file@percent }
29+
\citation{Shapiro2011}
30+
\citation{Lamport1978}
31+
\citation{Birman1991,Cachin2011}
32+
\@LN@col{1}
33+
\@writefile{toc}{\contentsline {subsection}{\numberline {2.1}System model}{3}{subsection.2.1}\protected@file@percent }
34+
\@writefile{toc}{\contentsline {subsection}{\numberline {2.2}Event graphs}{3}{subsection.2.2}\protected@file@percent }
35+
\newlabel{event-graphs}{{2.2}{3}{Event graphs}{subsection.2.2}{}}
36+
\@LN@col{2}
37+
\@writefile{lof}{\contentsline {figure}{\numberline {2}{\ignorespaces The event graph corresponding to \autoref {two-inserts}.}}{3}{figure.caption.6}\protected@file@percent }
38+
\newlabel{graph-example}{{2}{3}{The event graph corresponding to \autoref {two-inserts}}{figure.caption.6}{}}
39+
\@writefile{toc}{\contentsline {subsection}{\numberline {2.3}Document versions}{3}{subsection.2.3}\protected@file@percent }
40+
\newlabel{versions}{{2.3}{3}{Document versions}{subsection.2.3}{}}
41+
\citation{polog}
42+
\@LN@col{1}
43+
\@writefile{toc}{\contentsline {subsection}{\numberline {2.4}Replaying editing history}{4}{subsection.2.4}\protected@file@percent }
44+
\newlabel{replay}{{2.4}{4}{Replaying editing history}{subsection.2.4}{}}
45+
\@LN@col{2}
46+
\@writefile{lof}{\contentsline {figure}{\numberline {3}{\ignorespaces An event graph (left) and one possible topologically sorted order of that graph (right).}}{4}{figure.caption.7}\protected@file@percent }
47+
\newlabel{topological-sort}{{3}{4}{An event graph (left) and one possible topologically sorted order of that graph (right)}{figure.caption.7}{}}
48+
\@writefile{toc}{\contentsline {subsection}{\numberline {2.5}Implementing OT using a CRDT}{4}{subsection.2.5}\protected@file@percent }
49+
\newlabel{crdt-replay}{{2.5}{4}{Implementing OT using a CRDT}{subsection.2.5}{}}
50+
\citation{Boehm1995}
51+
\citation{vscode-buffer}
52+
\citation{Attiya2016}
53+
\citation{fugue}
54+
\@LN@col{1}
55+
\@writefile{toc}{\contentsline {section}{\numberline {3}The Event Graph Walker algorithm}{5}{section.3}\protected@file@percent }
56+
\newlabel{algorithm}{{3}{5}{The Event Graph Walker algorithm}{section.3}{}}
57+
\@LN@col{2}
58+
\@writefile{toc}{\contentsline {subsection}{\numberline {3.1}Characteristics of Eg-walker\xspace }{5}{subsection.3.1}\protected@file@percent }
59+
\newlabel{characteristics}{{3.1}{5}{Characteristics of \algname }{subsection.3.1}{}}
60+
\citation{CLRS2009}
61+
\@LN@col{1}
62+
\@writefile{toc}{\contentsline {subsection}{\numberline {3.2}Walking the event graph}{6}{subsection.3.2}\protected@file@percent }
63+
\newlabel{graph-walk}{{3.2}{6}{Walking the event graph}{subsection.3.2}{}}
64+
\@LN@col{2}
65+
\@writefile{lof}{\contentsline {figure}{\numberline {4}{\ignorespaces An event graph. Starting with document ``hi'', one user changes ``hi'' to ``hey'', while concurrently another user capitalises the ``H''. After merging to the state ``Hey'', one of them appends an exclamation mark to produce ``Hey!''.}}{6}{figure.caption.8}\protected@file@percent }
66+
\newlabel{graph-hi-hey}{{4}{6}{An event graph. Starting with document ``hi'', one user changes ``hi'' to ``hey'', while concurrently another user capitalises the ``H''. After merging to the state ``Hey'', one of them appends an exclamation mark to produce ``Hey!''}{figure.caption.8}{}}
67+
\citation{Roh2011RGA}
68+
\citation{Nicolaescu2016YATA}
69+
\citation{yjs}
70+
\@LN@col{1}
71+
\@writefile{toc}{\contentsline {subsection}{\numberline {3.3}Representing prepare and effect versions}{7}{subsection.3.3}\protected@file@percent }
72+
\newlabel{prepare-effect-versions}{{3.3}{7}{Representing prepare and effect versions}{subsection.3.3}{}}
73+
\@LN@col{2}
74+
\@writefile{lof}{\contentsline {figure}{\numberline {5}{\ignorespaces State machine for internal state variable $s_p$.}}{7}{figure.caption.9}\protected@file@percent }
75+
\newlabel{spv-state}{{5}{7}{State machine for internal state variable $s_p$}{figure.caption.9}{}}
76+
\@writefile{lof}{\contentsline {figure}{\numberline {6}{\ignorespaces Left: the internal state after applying $e_1 ... e_4$ from \autoref {graph-hi-hey}. Right: after $\mathsf {retreat}(e_4)$ and $\mathsf {retreat}(e_3)$, the prepare state is updated to mark ``H'' as \texttt {NotInsertedYet}, and the deletion of ``h'' is undone. The effect state is unchanged.}}{7}{figure.caption.10}\protected@file@percent }
77+
\newlabel{crdt-state-1}{{6}{7}{Left: the internal state after applying $e_1 ... e_4$ from \autoref {graph-hi-hey}. Right: after $\mathsf {retreat}(e_4)$ and $\mathsf {retreat}(e_3)$, the prepare state is updated to mark ``H'' as \texttt {NotInsertedYet}, and the deletion of ``h'' is undone. The effect state is unchanged}{figure.caption.10}{}}
78+
\@writefile{lof}{\contentsline {figure}{\numberline {7}{\ignorespaces The internal Eg-walker\xspace state after replaying all of the events in \autoref {graph-hi-hey}.}}{7}{figure.caption.11}\protected@file@percent }
79+
\newlabel{crdt-state-2}{{7}{7}{The internal \algname state after replaying all of the events in \autoref {graph-hi-hey}}{figure.caption.11}{}}
80+
\@writefile{toc}{\contentsline {subsection}{\numberline {3.4}Mapping indexes to character IDs}{7}{subsection.3.4}\protected@file@percent }
81+
\newlabel{b-trees}{{3.4}{7}{Mapping indexes to character IDs}{subsection.3.4}{}}
82+
\citation{CLRS2009}
83+
\@LN@col{1}
84+
\@LN@col{2}
85+
\@writefile{toc}{\contentsline {subsection}{\numberline {3.5}Clearing the internal state}{8}{subsection.3.5}\protected@file@percent }
86+
\newlabel{clearing}{{3.5}{8}{Clearing the internal state}{subsection.3.5}{}}
87+
\citation{automerge-storage,automerge-columnar}
88+
\citation{Abadi2013,Stonebraker2005}
89+
\citation{yjs}
90+
\@LN@col{1}
91+
\@writefile{toc}{\contentsline {subsection}{\numberline {3.6}Partial event graph replay}{9}{subsection.3.6}\protected@file@percent }
92+
\newlabel{partial-replay}{{3.6}{9}{Partial event graph replay}{subsection.3.6}{}}
93+
\@LN@col{2}
94+
\@writefile{toc}{\contentsline {subsection}{\numberline {3.7}Algorithm complexity}{9}{subsection.3.7}\protected@file@percent }
95+
\newlabel{complexity}{{3.7}{9}{Algorithm complexity}{subsection.3.7}{}}
96+
\@writefile{toc}{\contentsline {subsection}{\numberline {3.8}Storing the event graph}{9}{subsection.3.8}\protected@file@percent }
97+
\newlabel{storage}{{3.8}{9}{Storing the event graph}{subsection.3.8}{}}

0 commit comments

Comments
 (0)