sys-intelligence
diff --git a/‎.github/workflows/claude.yml‎
Lines changed: 50 additions & 0 deletions b/‎.github/workflows/claude.yml‎
Lines changed: 50 additions & 0 deletions
diff --git a/‎.github/workflows/test.yml‎
Lines changed: 6 additions & 2 deletions b/‎.github/workflows/test.yml‎
Lines changed: 6 additions & 2 deletions
diff --git a/‎.github/workflows/validate-exam-pr.yml‎
Lines changed: 109 additions & 0 deletions b/‎.github/workflows/validate-exam-pr.yml‎
Lines changed: 109 additions & 0 deletions
diff --git a/‎README.md‎
Lines changed: 11 additions & 2 deletions b/‎README.md‎
Lines changed: 11 additions & 2 deletions
diff --git a/‎benchmarks/arteval_bench/data/benchmark/eurosys25_egwalker/egwalker/eg-walker.aux‎
Lines changed: 97 additions & 0 deletions b/‎benchmarks/arteval_bench/data/benchmark/eurosys25_egwalker/egwalker/eg-walker.aux‎
Lines changed: 97 additions & 0 deletions
@@ -0,0 +1,50 @@
+name: Claude Code
+
+on:  # events that can start this workflow; the job-level `if` below filters them further
+  issue_comment:
+    types: [created]
+  pull_request_review_comment:
+    types: [created]
+  issues:
+    types: [opened, assigned]
+  pull_request_review:
+    types: [submitted]
+
+jobs:
+  claude:  # runs only when the triggering comment, review, or issue body/title contains '@claude'
+    if: |
+      (github.event_name == 'issue_comment' && contains(github.event.comment.body, '@claude')) ||
+      (github.event_name == 'pull_request_review_comment' && contains(github.event.comment.body, '@claude')) ||
+      (github.event_name == 'pull_request_review' && contains(github.event.review.body, '@claude')) ||
+      (github.event_name == 'issues' && (contains(github.event.issue.body, '@claude') || contains(github.event.issue.title, '@claude')))
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      pull-requests: read
+      issues: read
+      id-token: write  # NOTE(review): presumably for the action's OIDC token exchange — confirm
+      actions: read # Required for Claude to read CI results on PRs
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 1  # shallow clone: fetch only the most recent commit
+
+      - name: Run Claude Code
+        id: claude
+        uses: anthropics/claude-code-action@v1
+        with:
+          claude_code_oauth_token: ${{ secrets.CLAUDE_CODE_OAUTH_TOKEN }}  # supplied from the `secrets` context
+
+          # Optional: lets Claude read CI results on PRs (pairs with `actions: read` under permissions)
+          additional_permissions: |
+            actions: read
+
+          # Optional: Give a custom prompt to Claude. If this is not specified, Claude will perform the instructions specified in the comment that tagged it.
+          # prompt: 'Update the pull request description to include a summary of changes.'
+
+          # Optional: Add claude_args to customize behavior and configuration
+          # See https://github.com/anthropics/claude-code-action/blob/main/docs/usage.md
+          # or https://code.claude.com/docs/en/cli-reference for available options
+          # claude_args: '--allowed-tools Bash(gh pr:*)'
+
@@ -18,8 +18,9 @@ jobs:
       matrix:
         benchmark:
           - example_bench
-          - course_exam_bench
           - toposense_bench
+          - courselab_bench
+          - courseexam_bench
           # TODO: For now, we comment out other benchmarks as they have no tests
           # - arteval_bench
           # - cache_bench
@@ -32,7 +33,7 @@ jobs:
     - name: Set up Python
       uses: actions/setup-python@v5
       with:
-        python-version: '3.9'
+        python-version: '3.10'
 
     - name: Install dependencies
       working-directory: benchmarks/${{ matrix.benchmark }}
@@ -44,6 +45,9 @@ jobs:
         if [ -f requirements.txt ]; then
           pip install -r requirements.txt
         fi
+        if [ -f pyproject.toml ]; then
+          pip install -e ".[dev]"
+        fi
         deactivate
 
     - name: Run tests
 
@@ -0,0 +1,109 @@
+name: Experimental courseexam benchmark run
+
+on:
+  workflow_dispatch:  # manual trigger only; the caller supplies the PR to validate
+    inputs:
+      pr_number:
+        description: 'PR number to validate'
+        required: true
+        type: number
+
+jobs:
+  validate-exam:
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      pull-requests: write  # the final step posts a comment on the PR via `gh pr comment`
+
+    steps:
+      - name: Checkout PR code
+        uses: actions/checkout@v4
+        with:
+          ref: refs/pull/${{ inputs.pr_number }}/merge  # GitHub's merge ref for the requested PR
+          fetch-depth: 0  # full history; the exam-detection step diffs HEAD against origin/main
+
+      - name: Verify PR targets main branch  # fails the run when the PR's base branch is not main
+        env:
+          GH_TOKEN: ${{ github.token }}  # token the gh CLI authenticates with
+        run: |
+          BASE_BRANCH=$(gh pr view ${{ inputs.pr_number }} --json baseRefName --jq '.baseRefName')
+          if [ "$BASE_BRANCH" != "main" ]; then
+            echo "This workflow only runs for PRs targeting main (current: $BASE_BRANCH)"
+            exit 1
+          fi
+
+      - name: Detect new or modified exams
+        id: detect_exams  # outputs: has_exams ('true'/'false') and, when exams are found, exam_ids (comma-separated)
+        run: |
+          git fetch origin main
+          EXAM_DIRS=$(git diff --name-only --diff-filter=d origin/main...HEAD | \
+            grep -E '^benchmarks/courseexam_bench/data/raw/[^/]+/exam\.md$' | \
+            sed 's|/exam\.md||' | sed 's|benchmarks/courseexam_bench/data/raw/||' | sort -u)
+          # --diff-filter=d leaves out deleted exam.md files: a removed exam has nothing left to evaluate.
+          if [ -z "$EXAM_DIRS" ]; then
+            echo "No new or modified exams detected"
+            echo "has_exams=false" >> "$GITHUB_OUTPUT"
+            exit 0
+          fi
+          # Join the exam directory names into a single comma-separated output value.
+          EXAM_IDS=$(echo "$EXAM_DIRS" | tr '\n' ',' | sed 's/,$//')
+          echo "exam_ids=$EXAM_IDS" >> "$GITHUB_OUTPUT"
+          echo "has_exams=true" >> "$GITHUB_OUTPUT"
+
+      - name: Set up Python
+        if: steps.detect_exams.outputs.has_exams == 'true'  # skipped when no exam.md changed
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'  # quoted so YAML keeps the version a string
+
+      - name: Install dependencies
+        if: steps.detect_exams.outputs.has_exams == 'true'
+        working-directory: benchmarks/courseexam_bench  # editable install of this package, plus the inspect-ai and anthropic packages
+        run: pip install -e . inspect-ai anthropic
+
+      - name: Prepare dataset and run evaluation
+        if: steps.detect_exams.outputs.has_exams == 'true'
+        working-directory: benchmarks/courseexam_bench
+        env:
+          ANTHROPIC_API_KEY: ${{ secrets.CLAUDE_CODE_OAUTH_TOKEN }}  # NOTE(review): an OAuth-token secret is exported as the API key — confirm the Anthropic client accepts it
+          EXAM_IDS: ${{ steps.detect_exams.outputs.exam_ids }}  # comma-separated IDs; split by the Python snippet below
+        # Dataset preparation and the evaluation run share one script block for simplicity; the Python code is fed to the interpreter through a quoted heredoc.
+        run: |
+          python prepare_dataset.py
+
+          python << 'EOF'
+          import os
+          from inspect_ai import eval
+          from courseexam.courseexam import courseexam
+
+          exam_ids = [x.strip() for x in os.environ['EXAM_IDS'].split(',') if x.strip()]
+
+          task = courseexam(exam_ids=exam_ids, judge_model="anthropic/claude-haiku-4-5")
+          eval(tasks=task, model="anthropic/claude-haiku-4-5")
+          EOF
+
+      - name: Upload evaluation results
+        if: steps.detect_exams.outputs.has_exams == 'true'
+        uses: actions/upload-artifact@v4
+        with:
+          name: evaluation-results  # artifact name that the PR comment tells reviewers to download
+          path: benchmarks/courseexam_bench/logs/*.eval  # NOTE(review): assumes the evaluation writes .eval logs under logs/ — confirm
+          retention-days: 30  # artifact is kept for 30 days
+
+      - name: Comment on PR
+        if: steps.detect_exams.outputs.has_exams == 'true'
+        env:
+          GH_TOKEN: ${{ github.token }}
+          EXAM_IDS: ${{ steps.detect_exams.outputs.exam_ids }}  # PR-controlled directory names: pass via env, never interpolate into the script text
+        run: |
+          cat > comment.md << EOF
+          ## Experimental courseexam benchmark run
+
+          **Exams tested:**
+          $(echo "$EXAM_IDS" | tr ',' '\n' | sed 's/^/- `/' | sed 's/$/`/')
+
+          **Evaluation file:** Download the \`evaluation-results\` artifact from [this workflow run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}) and inspect it using \`inspect view <file>.eval\` or the [Inspect VS Code extension](https://marketplace.visualstudio.com/items?itemName=UKGovernmentAISafetyInstitute.inspect-ai). For more details on how to use the web-based log viewer, see [Inspect Log Viewer documentation](https://inspect.aisi.org.uk/log-viewer.html)
+
+          This experimental run helps you verify that questions have enough context for the LLM to answer and that grading rubrics are appropriate.
+          EOF
+          gh pr comment ${{ inputs.pr_number }} --body-file comment.md
@@ -2,6 +2,15 @@
 
 System Intelligence Benchmark is a comprehensive benchmark suite for evaluating the performance of Large Language Models (LLMs) and AI systems across critical system capabilities. It features tutorial, example benchmarks and offers both CLI tools and an SDK for further development.
 
+
+> [!Note]  
+> ## Private Contributions
+> This repository contains the public portion of the benchmark dataset. We also maintain a private fork that is not publicly accessible. If you would like to contribute exams or labs to the benchmark while keeping them private and confidential, please contribute via our private repository instead.
+>
+> The private repository is access-controlled. To gain access, please reach out on our [Slack channel](https://join.slack.com/t/sys-intelligence/shared_invite/zt-3hpkgr2aa-NnuPxUbyHr45S89DFi_N1A).
+
+
+
 ## Benchmark Overview
 A benchmark is a standard or point of reference against which things may be compared or assessed. In the context of AI and LLMs, benchmarks are essential for evaluating model capabilities, guiding research directions, and measuring progress. 
 
@@ -17,8 +26,8 @@ The benchmark framework is **still under development**. If you have any question
 
 System Intelligence Benchmark currently includes the following example benchmarks. Each benchmark assesses specific capabilities across multiple levels within a given research direction. Some benchmarks are still under development — we're actively updating them. Stay tuned!
 
-- **System Exam Benchmark** ([benchmarks/course_exam_bench/](benchmarks/course_exam_bench/)) - Tests LLM understanding of system concepts through university course exams (54 questions across 4 exams)
-- **System Lab Benchmark** ([benchmarks/course_lab_bench/](benchmarks/course_lab_bench/)) - Assesses AI capability on practical system course labs and projects 
+- **System Exam Benchmark** ([benchmarks/courseexam_bench/](benchmarks/courseexam_bench/)) - Tests LLM understanding of system concepts through university course exams
+- **System Lab Benchmark** ([benchmarks/courselab_bench/](benchmarks/courselab_bench/)) - Assesses AI capability on practical system course labs and projects 
 - **System Artifact Benchmark** ([benchmarks/arteval_bench/](benchmarks/arteval_bench/)) - Evaluates AI performance on artifact evaluation
 - **System Modeling Benchmark** ([benchmarks/sysmobench/](benchmarks/sysmobench/)) - Evaluates an agent's ability to produce correct TLA+ models for real-world concurrent and distributed systems, covering system capabilities across system comprehension, abstraction, and potentially tool fluency.
 - **TopoSense Benchmark** ([benchmarks/toposense_bench/](benchmarks/toposense_bench/)) - Evaluates Semantic-Spatial Sensor Scheduling (S³) capabilities in large-scale IoT digital twins (5,250 queries across 2,510 cameras)
 
@@ -0,0 +1,97 @@
+\relax 
+\providecommand\hyper@newdestlabel[2]{}
+\providecommand\HyField@AuxAddToFields[1]{}
+\providecommand\HyField@AuxAddToCoFields[2]{}
+\citation{Ellis1989}
+\citation{Nichols1995}
+\citation{DayRichter2010}
+\citation{Li2006,Roh2011RGA,Sun2020OT}
+\@LN@col{1}
+\@writefile{toc}{\contentsline {section}{Abstract}{1}{section*.1}\protected@file@percent }
+\@LN@col{2}
+\@writefile{lof}{\contentsline {figure}{\numberline {1}{\ignorespaces Two concurrent insertions into a text document.}}{1}{figure.caption.5}\protected@file@percent }
+\providecommand*\caption@xref[2]{\@setref\relax\@undefined{#1}}
+\newlabel{two-inserts}{{1}{1}{Two concurrent insertions into a text document}{figure.caption.5}{}}
+\@writefile{toc}{\contentsline {section}{\numberline {1}Introduction}{1}{section.1}\protected@file@percent }
+\newlabel{introduction}{{1}{1}{Introduction}{section.1}{}}
+\citation{Upwelling,Patchwork}
+\citation{Oster2006WOOT}
+\citation{crdt-papers}
+\citation{DayRichter2010}
+\citation{overleaf-ot}
+\citation{ditto-aircraft}
+\citation{ditto-military}
+\citation{antarctica}
+\citation{Hellerstein2010}
+\@LN@col{1}
+\@LN@col{2}
+\@writefile{toc}{\contentsline {section}{\numberline {2}Background}{2}{section.2}\protected@file@percent }
+\citation{Shapiro2011}
+\citation{Lamport1978}
+\citation{Birman1991,Cachin2011}
+\@LN@col{1}
+\@writefile{toc}{\contentsline {subsection}{\numberline {2.1}System model}{3}{subsection.2.1}\protected@file@percent }
+\@writefile{toc}{\contentsline {subsection}{\numberline {2.2}Event graphs}{3}{subsection.2.2}\protected@file@percent }
+\newlabel{event-graphs}{{2.2}{3}{Event graphs}{subsection.2.2}{}}
+\@LN@col{2}
+\@writefile{lof}{\contentsline {figure}{\numberline {2}{\ignorespaces The event graph corresponding to \autoref {two-inserts}.}}{3}{figure.caption.6}\protected@file@percent }
+\newlabel{graph-example}{{2}{3}{The event graph corresponding to \autoref {two-inserts}}{figure.caption.6}{}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {2.3}Document versions}{3}{subsection.2.3}\protected@file@percent }
+\newlabel{versions}{{2.3}{3}{Document versions}{subsection.2.3}{}}
+\citation{polog}
+\@LN@col{1}
+\@writefile{toc}{\contentsline {subsection}{\numberline {2.4}Replaying editing history}{4}{subsection.2.4}\protected@file@percent }
+\newlabel{replay}{{2.4}{4}{Replaying editing history}{subsection.2.4}{}}
+\@LN@col{2}
+\@writefile{lof}{\contentsline {figure}{\numberline {3}{\ignorespaces An event graph (left) and one possible topologically sorted order of that graph (right).}}{4}{figure.caption.7}\protected@file@percent }
+\newlabel{topological-sort}{{3}{4}{An event graph (left) and one possible topologically sorted order of that graph (right)}{figure.caption.7}{}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {2.5}Implementing OT using a CRDT}{4}{subsection.2.5}\protected@file@percent }
+\newlabel{crdt-replay}{{2.5}{4}{Implementing OT using a CRDT}{subsection.2.5}{}}
+\citation{Boehm1995}
+\citation{vscode-buffer}
+\citation{Attiya2016}
+\citation{fugue}
+\@LN@col{1}
+\@writefile{toc}{\contentsline {section}{\numberline {3}The Event Graph Walker algorithm}{5}{section.3}\protected@file@percent }
+\newlabel{algorithm}{{3}{5}{The Event Graph Walker algorithm}{section.3}{}}
+\@LN@col{2}
+\@writefile{toc}{\contentsline {subsection}{\numberline {3.1}Characteristics of Eg-walker\xspace  }{5}{subsection.3.1}\protected@file@percent }
+\newlabel{characteristics}{{3.1}{5}{Characteristics of \algname }{subsection.3.1}{}}
+\citation{CLRS2009}
+\@LN@col{1}
+\@writefile{toc}{\contentsline {subsection}{\numberline {3.2}Walking the event graph}{6}{subsection.3.2}\protected@file@percent }
+\newlabel{graph-walk}{{3.2}{6}{Walking the event graph}{subsection.3.2}{}}
+\@LN@col{2}
+\@writefile{lof}{\contentsline {figure}{\numberline {4}{\ignorespaces An event graph. Starting with document ``hi'', one user changes ``hi'' to ``hey'', while concurrently another user capitalises the ``H''. After merging to the state ``Hey'', one of them appends an exclamation mark to produce ``Hey!''.}}{6}{figure.caption.8}\protected@file@percent }
+\newlabel{graph-hi-hey}{{4}{6}{An event graph. Starting with document ``hi'', one user changes ``hi'' to ``hey'', while concurrently another user capitalises the ``H''. After merging to the state ``Hey'', one of them appends an exclamation mark to produce ``Hey!''}{figure.caption.8}{}}
+\citation{Roh2011RGA}
+\citation{Nicolaescu2016YATA}
+\citation{yjs}
+\@LN@col{1}
+\@writefile{toc}{\contentsline {subsection}{\numberline {3.3}Representing prepare and effect versions}{7}{subsection.3.3}\protected@file@percent }
+\newlabel{prepare-effect-versions}{{3.3}{7}{Representing prepare and effect versions}{subsection.3.3}{}}
+\@LN@col{2}
+\@writefile{lof}{\contentsline {figure}{\numberline {5}{\ignorespaces State machine for internal state variable $s_p$.}}{7}{figure.caption.9}\protected@file@percent }
+\newlabel{spv-state}{{5}{7}{State machine for internal state variable $s_p$}{figure.caption.9}{}}
+\@writefile{lof}{\contentsline {figure}{\numberline {6}{\ignorespaces Left: the internal state after applying $e_1 ... e_4$ from \autoref {graph-hi-hey}. Right: after $\mathsf  {retreat}(e_4)$ and $\mathsf  {retreat}(e_3)$, the prepare state is updated to mark ``H'' as \texttt  {NotInsertedYet}, and the deletion of ``h'' is undone. The effect state is unchanged.}}{7}{figure.caption.10}\protected@file@percent }
+\newlabel{crdt-state-1}{{6}{7}{Left: the internal state after applying $e_1 ... e_4$ from \autoref {graph-hi-hey}. Right: after $\mathsf {retreat}(e_4)$ and $\mathsf {retreat}(e_3)$, the prepare state is updated to mark ``H'' as \texttt {NotInsertedYet}, and the deletion of ``h'' is undone. The effect state is unchanged}{figure.caption.10}{}}
+\@writefile{lof}{\contentsline {figure}{\numberline {7}{\ignorespaces The internal Eg-walker\xspace  state after replaying all of the events in \autoref {graph-hi-hey}.}}{7}{figure.caption.11}\protected@file@percent }
+\newlabel{crdt-state-2}{{7}{7}{The internal \algname state after replaying all of the events in \autoref {graph-hi-hey}}{figure.caption.11}{}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {3.4}Mapping indexes to character IDs}{7}{subsection.3.4}\protected@file@percent }
+\newlabel{b-trees}{{3.4}{7}{Mapping indexes to character IDs}{subsection.3.4}{}}
+\citation{CLRS2009}
+\@LN@col{1}
+\@LN@col{2}
+\@writefile{toc}{\contentsline {subsection}{\numberline {3.5}Clearing the internal state}{8}{subsection.3.5}\protected@file@percent }
+\newlabel{clearing}{{3.5}{8}{Clearing the internal state}{subsection.3.5}{}}
+\citation{automerge-storage,automerge-columnar}
+\citation{Abadi2013,Stonebraker2005}
+\citation{yjs}
+\@LN@col{1}
+\@writefile{toc}{\contentsline {subsection}{\numberline {3.6}Partial event graph replay}{9}{subsection.3.6}\protected@file@percent }
+\newlabel{partial-replay}{{3.6}{9}{Partial event graph replay}{subsection.3.6}{}}
+\@LN@col{2}
+\@writefile{toc}{\contentsline {subsection}{\numberline {3.7}Algorithm complexity}{9}{subsection.3.7}\protected@file@percent }
+\newlabel{complexity}{{3.7}{9}{Algorithm complexity}{subsection.3.7}{}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {3.8}Storing the event graph}{9}{subsection.3.8}\protected@file@percent }
+\newlabel{storage}{{3.8}{9}{Storing the event graph}{subsection.3.8}{}}