EvolvingLMMs-Lab · Luodian · Feb 28, 2026 · Nov 3, 2025 · Nov 6, 2025 · Nov 6, 2025
diff --git a/.github/ISSUE_TEMPLATE/design_proposal.yml b/.github/ISSUE_TEMPLATE/design_proposal.yml
@@ -0,0 +1,119 @@
+# GitHub issue form: architecture-level design proposals and follow-up work.
+name: Design Proposal / Follow-up
+description: Propose architecture-level follow-up work with clear scope and acceptance criteria
+# Label attached to issues created from this form.
+labels:
+  - needs decision
+body:
+  # Static guidance shown at the top of the form.
+  - type: markdown
+    attributes:
+      value: |
+        Use this template for non-trivial design or architecture work.
+        Keep it concise and evidence-driven, aligned with PR template style.
+
+  # Pre-submission checklist; both options are marked required.
+  - type: checkboxes
+    attributes:
+      label: Checklist
+      options:
+        - label: I have searched for related issues/PRs and linked them below.
+          required: true
+        - label: I have separated in-scope and out-of-scope items.
+          required: true
+
+  # Required field: problem, impact, and desired outcome.
+  - type: textarea
+    id: summary
+    attributes:
+      label: Summary
+      description: "Max 3 bullets. What is the problem and what outcome is expected?"
+      placeholder: |
+        - Problem: ...
+        - Impact: ...
+        - Desired outcome: ...
+    validations:
+      required: true
+
+  # Required field: what the issue will change.
+  - type: textarea
+    id: in_scope
+    attributes:
+      label: In Scope
+      description: "Explicit list of what this issue will change."
+      placeholder: |
+        - ...
+        - ...
+    validations:
+      required: true
+
+  # Required field: what the issue will deliberately leave alone.
+  - type: textarea
+    id: out_of_scope
+    attributes:
+      label: Out of Scope
+      description: "Explicit list of what this issue will NOT change."
+      placeholder: |
+        - ...
+    validations:
+      required: true
+
+  # Required field: the concrete steps of the plan.
+  - type: textarea
+    id: proposal
+    attributes:
+      label: Proposed Plan
+      description: "3-6 concrete steps."
+      placeholder: |
+        1. ...
+        2. ...
+        3. ...
+    validations:
+      required: true
+
+  # Required field: how success is verified; the placeholder shows the line format.
+  - type: textarea
+    id: validation_plan
+    attributes:
+      label: Validation Plan
+      description: "How will we verify success? Include commands/benchmarks where applicable."
+      placeholder: |
+        - `command` | sample size: `N=<...>` | key metrics: `<...>` | result: `pass/fail`
+    validations:
+      required: true
+
+  # Required field: behavior changes, migration risk, or blockers.
+  - type: textarea
+    id: risk
+    attributes:
+      label: Risk / Compatibility
+      description: "1-3 bullets on behavior changes, migration risk, or blockers."
+      placeholder: |
+        - ...
+    validations:
+      required: true
+
+  # Required field: objective done conditions, written as a task list.
+  - type: textarea
+    id: acceptance_criteria
+    attributes:
+      label: Acceptance Criteria
+      description: "Objective done conditions."
+      placeholder: |
+        - [ ] ...
+        - [ ] ...
+    validations:
+      required: true
+
+  # Optional field: links to related material.
+  - type: textarea
+    id: references
+    attributes:
+      label: References
+      description: "Related PRs, issues, docs, benchmark artifacts, or Linear links."
+      placeholder: |
+        - PR: ...
+        - Issue: ...
+        - Doc: ...
+    validations:
+      required: false
diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md
@@ -1,31 +1,35 @@
-## Description
-
-<!-- Briefly describe what this PR does and why. Focus on the problem it solves. -->
+## Summary
+<!-- Max 3 bullets. -->
+- 
+- 
+- 
+
+## In scope
+<!-- Explicitly list what this PR changes. -->
+- 
+
+## Out of scope
+<!-- Explicitly list what this PR does NOT change. -->
+- 
+
+## Validation
+<!--
+Max 3 bullets.
+Use this format:
+`<command>` | sample size: `N=<...>` | key metrics: `<...>` | result: `pass/fail`
+If you ran tests/benchmarks with metrics, include concrete numbers.
+-->
+- 
+
+## Risk / Compatibility
+<!-- 1-2 bullets. Note breaking changes, behavior changes, or migration impact. -->
+- 
 
 ## Type of Change
-
-- [ ] Bug fix (non-breaking change that fixes an issue)
-- [ ] New feature (non-breaking change that adds functionality)
+- [ ] Bug fix (non-breaking change)
+- [ ] New feature
 - [ ] New benchmark/task
 - [ ] New model integration
-- [ ] Breaking change (fix or feature that would cause existing functionality to not work as expected)
+- [ ] Breaking change
 - [ ] Documentation update
 - [ ] Refactoring (no functional changes)
-
-## Changes Made
-
-<!-- List the key changes. Keep it high-level; the diff tells the details. -->
-
--
-
-## Testing
-
-<!-- Describe how you tested your changes. -->
-
-- [ ] Tested locally with: `python -m lmms_eval --model <model> --tasks <task> --limit 8`
-- [ ] Ran pre-commit: `pre-commit run --all-files`
-- [ ] Added/updated tests (if applicable)
-
-## Additional Notes
-
-<!-- Any context, screenshots, or related issues. -->
diff --git a/.github/workflows/line-stats.yml b/.github/workflows/line-stats.yml
@@ -0,0 +1,45 @@
+# Reports lines added/deleted by the tip commit of each push to main.
+name: Line Stats
+
+on:
+  push:
+    branches: [main]
+
+permissions:
+  contents: read
+
+jobs:
+  line-stats:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          # HEAD plus its parent is all the diff below needs.
+          fetch-depth: 2
+      - name: Count lines added/deleted
+        run: |
+          echo "### Commit: ${{ github.sha }}"
+          # A root commit has no parent, so fall back to the empty tree rather
+          # than failing on an unresolvable HEAD~1.
+          if git rev-parse --verify --quiet "HEAD~1^{commit}" >/dev/null; then
+            BASE="HEAD~1"
+          else
+            BASE="$(git hash-object -t tree /dev/null)"
+          fi
+          STATS=$(git diff --shortstat "$BASE" HEAD)
+          echo "$STATS"
+          # One numstat pass yields both totals; binary files report "-",
+          # which awk treats as 0.
+          read -r ADDED DELETED < <(
+            git diff --numstat "$BASE" HEAD | awk '{a+=$1; d+=$2} END {print a+0, d+0}'
+          )
+          echo "Lines added: $ADDED"
+          echo "Lines deleted: $DELETED"
+          {
+            echo "### Summary"
+            echo "| Metric | Count |"
+            echo "|--------|-------|"
+            echo "| Lines added | $ADDED |"
+            echo "| Lines deleted | $DELETED |"
+            echo "| Net change | $((ADDED - DELETED)) |"
+          } >> "$GITHUB_STEP_SUMMARY"
diff --git a/.github/workflows/task-input-ab.yml b/.github/workflows/task-input-ab.yml
@@ -0,0 +1,160 @@
+# Captures task-input snapshots for the checked-out head and for a base
+# revision, then fails when the two snapshots differ.
+name: task-input-ab
+
+on:
+  pull_request:
+    paths:
+      - "lmms_eval/tasks/**"
+      - "lmms_eval/api/**"
+      - "tools/task_input_capture.py"
+      - "test/eval/task_input_specs/**"
+      - ".github/workflows/task-input-ab.yml"
+  workflow_dispatch:
+    inputs:
+      base_sha:
+        description: "Optional base commit SHA"
+        required: false
+        type: string
+
+# The job only reads repository contents, so keep the token read-only.
+permissions:
+  contents: read
+
+jobs:
+  compare-task-input-boundary:
+    runs-on: ubuntu-latest
+    timeout-minutes: 45
+    steps:
+      - name: Checkout head
+        uses: actions/checkout@v4
+        with:
+          # Full history so the base commit and merge-base can be resolved.
+          fetch-depth: 0
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@v3
+
+      - name: Sync dependencies
+        run: uv sync
+
+      - name: Resolve BASE revision
+        id: base
+        env:
+          # Event data reaches the script through the environment instead of
+          # expression interpolation, so a crafted value cannot inject shell.
+          PR_BASE_SHA: ${{ github.event.pull_request.base.sha }}
+          INPUT_BASE_SHA: ${{ github.event.inputs.base_sha }}
+          DEFAULT_BRANCH: ${{ github.event.repository.default_branch }}
+        run: |
+          # Precedence: PR base, then the manual input, then the merge-base
+          # with the default branch.
+          BASE_SHA="${PR_BASE_SHA:-$INPUT_BASE_SHA}"
+          if [ -z "$BASE_SHA" ]; then
+            # refs/remotes/origin/HEAD is typically absent after
+            # actions/checkout, so take the branch name from the event payload.
+            BASE_SHA="$(git merge-base HEAD "origin/${DEFAULT_BRANCH}")"
+          fi
+          # Reject anything that is not a commit and normalise to a full SHA.
+          BASE_SHA="$(git rev-parse --verify "${BASE_SHA}^{commit}")"
+          BASE_WORKTREE="/tmp/lmms-base-${GITHUB_RUN_ID}"
+          echo "base_sha=${BASE_SHA}" >> "$GITHUB_OUTPUT"
+          echo "base_worktree=${BASE_WORKTREE}" >> "$GITHUB_OUTPUT"
+
+      - name: Prepare BASE worktree
+        env:
+          BASE_SHA: ${{ steps.base.outputs.base_sha }}
+          BASE_WORKTREE: ${{ steps.base.outputs.base_worktree }}
+        run: git worktree add "$BASE_WORKTREE" "$BASE_SHA"
+
+      - name: Resolve pinned checker
+        id: checker
+        env:
+          BASE_SHA: ${{ steps.base.outputs.base_sha }}
+          BASE_WORKTREE: ${{ steps.base.outputs.base_worktree }}
+        run: |
+          # Prefer the checker/spec as they exist in BASE (the "pinned" copy);
+          # fall back to the HEAD copies only when BASE does not have both.
+          BASE_CHECKER="${BASE_WORKTREE}/tools/task_input_capture.py"
+          BASE_SPEC="${BASE_WORKTREE}/test/eval/task_input_specs/redundancy_refactor.yaml"
+          if [ -f "$BASE_CHECKER" ] && [ -f "$BASE_SPEC" ]; then
+            CHECKER_PATH="$BASE_CHECKER"
+            SPEC_PATH="$BASE_SPEC"
+          else
+            echo "Pinned checker/spec missing in base revision: ${BASE_SHA}"
+            echo "Bootstrap mode: use HEAD checker/spec for this run."
+            CHECKER_PATH="tools/task_input_capture.py"
+            SPEC_PATH="test/eval/task_input_specs/redundancy_refactor.yaml"
+          fi
+
+          if [ ! -f "$CHECKER_PATH" ] || [ ! -f "$SPEC_PATH" ]; then
+            echo "Checker/spec not found in current checkout."
+            exit 1
+          fi
+
+          echo "checker_path=${CHECKER_PATH}" >> "$GITHUB_OUTPUT"
+          echo "spec_path=${SPEC_PATH}" >> "$GITHUB_OUTPUT"
+
+      - name: Capture HEAD snapshot
+        env:
+          HF_HOME: /tmp/hf-cache
+          CHECKER_PATH: ${{ steps.checker.outputs.checker_path }}
+          SPEC_PATH: ${{ steps.checker.outputs.spec_path }}
+        run: |
+          source .venv/bin/activate
+          python "$CHECKER_PATH" \
+            --repo-root . \
+            --spec "$SPEC_PATH" \
+            --output /tmp/task-input-head.json
+
+      - name: Capture BASE snapshot
+        env:
+          HF_HOME: /tmp/hf-cache
+          BASE_WORKTREE: ${{ steps.base.outputs.base_worktree }}
+          CHECKER_PATH: ${{ steps.checker.outputs.checker_path }}
+          SPEC_PATH: ${{ steps.checker.outputs.spec_path }}
+        run: |
+          source .venv/bin/activate
+          python "$CHECKER_PATH" \
+            --repo-root "$BASE_WORKTREE" \
+            --spec "$SPEC_PATH" \
+            --output /tmp/task-input-base.json
+
+      - name: Compare snapshots
+        run: |
+          source .venv/bin/activate
+          python - <<'PY'
+          import json
+          from pathlib import Path
+
+          base = json.loads(Path('/tmp/task-input-base.json').read_text(encoding='utf-8'))
+          head = json.loads(Path('/tmp/task-input-head.json').read_text(encoding='utf-8'))
+          if base != head:
+              print('Task input snapshot mismatch detected.')
+              raise SystemExit(1)
+          print('Task input snapshots match.')
+          PY
+
+      - name: Upload snapshots on failure
+        if: failure()
+        uses: actions/upload-artifact@v4
+        with:
+          name: task-input-snapshots
+          path: |
+            /tmp/task-input-base.json
+            /tmp/task-input-head.json
+
+      - name: Cleanup BASE worktree
+        if: always()
+        env:
+          BASE_WORKTREE: ${{ steps.base.outputs.base_worktree }}
+        run: |
+          # Best effort: the worktree may not exist if an earlier step failed.
+          if [ -n "$BASE_WORKTREE" ] && [ -d "$BASE_WORKTREE" ]; then
+            git worktree remove --force "$BASE_WORKTREE"
+          fi
diff --git a/.gitignore b/.gitignore
@@ -32,7 +32,7 @@ cache_dir
 ckpt
 pretrained/
 LLaVA/
-/*logs
+*logs
 *.isorted
 temp/
 InternVL/
@@ -58,3 +58,5 @@ remote_code/*
 docs/plans/
 .opencode
 .worktrees/
+AGENTS.md
+.ignored/