diff --git a/.github/workflows/diagnostic-diff.yml b/.github/workflows/diagnostic-diff.yml new file mode 100644 index 00000000..9359bc51 --- /dev/null +++ b/.github/workflows/diagnostic-diff.yml @@ -0,0 +1,111 @@ +name: Diagnostic Diff + +on: + pull_request: + types: [opened, synchronize, reopened] + +# Allow only one concurrent diagnostic diff per PR +concurrency: + group: diagnostic-diff-${{ github.event.pull_request.number }} + cancel-in-progress: true + +jobs: + diagnostic-diff: + runs-on: ubuntu-latest + permissions: + contents: read + pull-requests: write + + steps: + - name: Checkout PR branch + uses: actions/checkout@v4 + + - name: Update Rust toolchain + run: rustup update + + - uses: astral-sh/setup-uv@v5 + with: + enable-cache: true + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: "3.13" + + - name: Build wheels + uses: PyO3/maturin-action@v1 + with: + command: build + + - name: Run diagnostics on PR branch + run: | + cargo run --release -p karva_diffs --bin karva-diagnostics -- run --output pr-diagnostics.json + continue-on-error: true + + - name: Stash PR diagnostics + run: | + mkdir -p /tmp/karva-diagnostics + cp pr-diagnostics.json /tmp/karva-diagnostics/ + + - name: Checkout main branch + uses: actions/checkout@v4 + with: + ref: main + + - name: Build wheels + uses: PyO3/maturin-action@v1 + with: + command: build + + - name: Run diagnostics on main branch + run: | + cargo run --release -p karva_diffs --bin karva-diagnostics -- run --output main-diagnostics.json + continue-on-error: true + + - name: Retrieve PR diagnostics + run: | + cp /tmp/karva-diagnostics/pr-diagnostics.json . 
+ + - name: Generate diff + run: | + cargo run --release -p karva_diffs --bin karva-diagnostics -- diff --base main-diagnostics.json --head pr-diagnostics.json --output diff.md + + - name: Comment PR + uses: actions/github-script@v7 + with: + github-token: ${{ secrets.GITHUB_TOKEN }} + script: | + const fs = require('fs'); + const diff = fs.readFileSync('diff.md', 'utf8'); + + // Find existing comment + const { data: comments } = await github.rest.issues.listComments({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: context.issue.number, + }); + + const botComment = comments.find(comment => + comment.user.type === 'Bot' && + comment.body.includes('Diagnostic Diff Report') + ); + + const body = diff + '\n\n---\n*This comment is automatically generated by the diagnostic diff workflow.*'; + + if (botComment) { + // Update existing comment + await github.rest.issues.updateComment({ + owner: context.repo.owner, + repo: context.repo.repo, + comment_id: botComment.id, + body: body + }); + } else { + // Create new comment + await github.rest.issues.createComment({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: context.issue.number, + body: body + }); + } diff --git a/Cargo.lock b/Cargo.lock index d66b2aa5..46831885 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -934,6 +934,22 @@ dependencies = [ "pretty_assertions", ] +[[package]] +name = "karva_diffs" +version = "0.0.0" +dependencies = [ + "anyhow", + "clap", + "karva_core", + "karva_project", + "karva_test", + "ruff_python_ast", + "serde", + "serde_json", + "tracing", + "tracing-subscriber", +] + [[package]] name = "karva_project" version = "0.0.0" @@ -952,8 +968,12 @@ dependencies = [ "insta", "rand 0.9.2", "regex", + "ruff_python_ast", "ruff_python_trivia", + "serde", + "serde_json", "tempfile", + "tracing", ] [[package]] diff --git a/crates/karva_benchmark/benches/karva_walltime.rs b/crates/karva_benchmark/benches/karva_walltime.rs index c5ad9c71..6ed2269d 100644 --- 
a/crates/karva_benchmark/benches/karva_walltime.rs +++ b/crates/karva_benchmark/benches/karva_walltime.rs @@ -1,8 +1,8 @@ -use std::{path::PathBuf, sync::Once}; +use std::sync::Once; use karva_benchmark::{ + InstalledProject, RealWorldProject, affect_project, criterion::{BatchSize, Criterion, criterion_group, criterion_main}, - real_world_projects::{InstalledProject, RealWorldProject}, }; use karva_core::{TestRunner, testing::setup_module}; use karva_project::{ @@ -10,7 +10,6 @@ use karva_project::{ project::{Project, ProjectOptions}, verbosity::VerbosityLevel, }; -use ruff_python_ast::PythonVersion; static SETUP_MODULE_ONCE: Once = Once::new(); @@ -73,15 +72,7 @@ fn bench_project(benchmark: &ProjectBenchmark, criterion: &mut Criterion) { } fn affect(criterion: &mut Criterion) { - let benchmark = ProjectBenchmark::new(RealWorldProject { - name: "affect", - repository: "https://github.com/MatthewMckee4/affect", - commit: "803cc916b492378a8ad8966e747cac3325e11b5f", - paths: vec![PathBuf::from("tests")], - dependencies: vec!["pydantic", "pydantic-settings", "pytest"], - python_version: PythonVersion::PY313, - }); - + let benchmark = ProjectBenchmark::new(affect_project()); bench_project(&benchmark, criterion); } diff --git a/crates/karva_benchmark/src/lib.rs b/crates/karva_benchmark/src/lib.rs index 24b492f4..275038ae 100644 --- a/crates/karva_benchmark/src/lib.rs +++ b/crates/karva_benchmark/src/lib.rs @@ -1,7 +1,12 @@ use std::path::PathBuf; pub mod criterion; -pub mod real_world_projects; + +// Re-export real world projects from karva_test +pub use karva_test::{ + InstalledProject, RealWorldProject, affect_project, get_real_world_projects, + real_world_projects, +}; pub static TRUE_ASSERTIONS: TestFile = TestFile::new( "test_true_assertions.py", @@ -36,19 +41,6 @@ pub static PARAMETRIZE: TestFile = TestFile::new( include_str!("../resources/test_parametrize.py"), ); -/// Relative size of a test case. 
Benchmarks can use it to configure the time for how long a benchmark should run to get stable results. -#[derive(Copy, Clone, Debug, Eq, PartialEq, Ord, PartialOrd)] -pub enum TestCaseSpeed { - /// A test case that is fast to run - Fast, - - /// A normal test case - Normal, - - /// A slow test case - Slow, -} - #[derive(Debug, Clone)] pub struct TestCase { file: TestFile, diff --git a/crates/karva_core/src/diagnostic/diagnostic.rs b/crates/karva_core/src/diagnostic/diagnostic.rs index 3d1149d6..7a73302c 100644 --- a/crates/karva_core/src/diagnostic/diagnostic.rs +++ b/crates/karva_core/src/diagnostic/diagnostic.rs @@ -50,7 +50,7 @@ impl Diagnostic { } #[must_use] - pub(crate) const fn severity(&self) -> &DiagnosticSeverity { + pub const fn severity(&self) -> &DiagnosticSeverity { &self.inner.severity } @@ -175,14 +175,14 @@ impl DiagnosticInner { // Diagnostic severity #[derive(Debug, Clone, PartialEq, Eq)] -pub(crate) enum DiagnosticSeverity { +pub enum DiagnosticSeverity { Error(DiagnosticErrorType), Warning(String), } impl DiagnosticSeverity { #[must_use] - pub(crate) const fn is_error(&self) -> bool { + pub const fn is_error(&self) -> bool { matches!(self, Self::Error(_)) } @@ -193,7 +193,7 @@ impl DiagnosticSeverity { } #[derive(Debug, Clone, PartialEq, Eq)] -pub(crate) enum DiagnosticErrorType { +pub enum DiagnosticErrorType { TestCase { test_name: String, diagnostic_type: TestCaseDiagnosticType, @@ -203,17 +203,17 @@ pub(crate) enum DiagnosticErrorType { } #[derive(Debug, Clone, PartialEq, Eq)] -pub(crate) enum TestCaseDiagnosticType { +pub enum TestCaseDiagnosticType { Fail(String), Collection(TestCaseCollectionDiagnosticType), } #[derive(Debug, Clone, PartialEq, Eq)] -pub(crate) enum TestCaseCollectionDiagnosticType { +pub enum TestCaseCollectionDiagnosticType { FixtureNotFound, } #[derive(Debug, Clone, PartialEq, Eq)] -pub(crate) enum FixtureDiagnosticType { +pub enum FixtureDiagnosticType { Invalid, } diff --git 
a/crates/karva_core/src/runner/diagnostic.rs b/crates/karva_core/src/runner/diagnostic.rs index 44201150..b4bed1ef 100644 --- a/crates/karva_core/src/runner/diagnostic.rs +++ b/crates/karva_core/src/runner/diagnostic.rs @@ -126,6 +126,7 @@ impl TestResultStats { self.inner.values().sum() } + #[must_use] pub fn is_success(&self) -> bool { self.failed() == 0 } @@ -135,17 +136,17 @@ impl TestResultStats { } #[must_use] - pub(crate) fn passed(&self) -> usize { + pub fn passed(&self) -> usize { self.get(TestResultKind::Passed) } #[must_use] - pub(crate) fn failed(&self) -> usize { + pub fn failed(&self) -> usize { self.get(TestResultKind::Failed) } #[must_use] - pub(crate) fn skipped(&self) -> usize { + pub fn skipped(&self) -> usize { self.get(TestResultKind::Skipped) } diff --git a/crates/karva_core/src/runner/mod.rs b/crates/karva_core/src/runner/mod.rs index 34b87e43..6d55f54a 100644 --- a/crates/karva_core/src/runner/mod.rs +++ b/crates/karva_core/src/runner/mod.rs @@ -9,9 +9,9 @@ use crate::{ utils::attach, }; -pub(crate) mod diagnostic; +pub mod diagnostic; -pub(crate) use diagnostic::TestRunResult; +pub use diagnostic::TestRunResult; pub trait TestRunner { fn test(&self) -> TestRunResult { diff --git a/crates/karva_diffs/Cargo.toml b/crates/karva_diffs/Cargo.toml new file mode 100644 index 00000000..05dc06ea --- /dev/null +++ b/crates/karva_diffs/Cargo.toml @@ -0,0 +1,32 @@ +[package] +name = "karva_diffs" +version = "0.0.0" +description = "Diagnostic diff tests for Karva on real-world projects" +publish = false +authors = { workspace = true } +edition = { workspace = true } +rust-version = { workspace = true } +homepage = { workspace = true } +documentation = { workspace = true } +repository = { workspace = true } +license = { workspace = true } + +[[bin]] +name = "karva-diagnostics" +path = "src/bin/karva-diagnostics.rs" + +[dependencies] +karva_core = { workspace = true } +karva_project = { workspace = true } +karva_test = { workspace = true } + +anyhow = { 
workspace = true } +ruff_python_ast = { workspace = true } +tracing = { workspace = true } +tracing-subscriber = { workspace = true } +serde = { workspace = true, features = ["derive"] } +serde_json = { workspace = true } +clap = { workspace = true } + +[lints] +workspace = true diff --git a/crates/karva_diffs/README.md b/crates/karva_diffs/README.md new file mode 100644 index 00000000..9d79177d --- /dev/null +++ b/crates/karva_diffs/README.md @@ -0,0 +1,126 @@ +# Karva Diffs + +The `karva_diffs` crate tracks diagnostic changes on real-world Python projects, similar to how mypy_primer works for mypy. + +## Purpose + +This crate helps track progress in pytest feature support by: +- Running Karva on real-world projects +- Capturing diagnostic output (test counts, errors, warnings) +- Automatically comparing diagnostics between main and PR branches +- Posting diff reports as PR comments + +## How It Works + +The diagnostic diff workflow runs automatically on every pull request: + +1. **Runs diagnostics on main branch** - Establishes the baseline +2. **Runs diagnostics on PR branch** - Shows current state +3. **Generates a diff** - Compares the two results +4. 
**Posts a comment** - Shows improvements or regressions in the PR + +## GitHub Actions Workflow + +The `.github/workflows/diagnostic-diff.yml` workflow: +- Triggers on PR open, sync, or reopen +- Runs both sets of diagnostics +- Generates a markdown diff report +- Posts/updates a comment on the PR + +Example comment output: +```markdown +# Diagnostic Diff Report + +## Summary + +| Project | Tests | Passed | Failed | Skipped | Errors | Warnings | +|---------|-------|--------|--------|---------|--------|----------| +| affect | 50 (+5) | 45 (+5) | 5 | 0 | 5 | 2 (-1) | + +## Detailed Changes + +### affect + +- **Passed tests:** 40 → 45 ✅ +- **Warnings:** 3 → 2 ✅ +``` + +## CLI Usage + +The crate provides a `karva-diagnostics` binary for manual runs: + +### Run diagnostics +```shell +# Run on all configured projects and output JSON +cargo run -p karva_diffs --bin karva-diagnostics -- run --output diagnostics.json + +# Or just to stdout +cargo run -p karva_diffs --bin karva-diagnostics -- run +``` + +### Compare two reports +```shell +cargo run -p karva_diffs --bin karva-diagnostics -- diff \ + --base main-diagnostics.json \ + --head pr-diagnostics.json \ + --output diff.md +``` + +## Adding New Projects + +To add a new project for tracking, edit `crates/karva_test/src/real_world_projects.rs` and add to the `get_real_world_projects()` function: + +```rust +pub fn get_real_world_projects() -> Vec<RealWorldProject<'static>> { + vec![ + RealWorldProject { + name: "your-project", + repository: "https://github.com/user/repo", + commit: "abc123...", // Pin to specific commit + paths: vec![PathBuf::from("tests")], + dependencies: vec!["pytest", "other-deps"], + python_version: PythonVersion::PY313, + }, + // ...
more projects + ] +} +``` + +## Interpreting Results + +When reviewing a PR with diagnostic diffs: + +- ✅ **Green checkmarks** - Improvements (more passing tests, fewer errors) +- ❌ **Red X marks** - Regressions (fewer passing tests, more errors) +- Numbers in parentheses - Show the change from main (e.g., `(+5)` means 5 more than main) + +### What changes mean: + +- **More passed tests** ✅ - New features working or bugs fixed +- **Fewer failed tests** ✅ - Bugs fixed or better compatibility +- **Fewer errors** ✅ - Improved error handling or detection +- **More failed tests** ❌ - Potential regression or new strict checks +- **More errors** ❌ - New issues introduced + +## Development + +The crate consists of: +- `src/lib.rs` - Core logic for running diagnostics and project registry +- `src/bin/karva-diagnostics.rs` - CLI tool for running and comparing diagnostics +- `.github/workflows/diagnostic-diff.yml` - GitHub Actions workflow + +## Requirements + +- Rust toolchain +- Python 3.13+ +- `uv` package manager +- Network access (for cloning projects) + +## Performance + +Running diagnostics can take several minutes depending on: +- Number of projects configured +- Size of test suites +- Whether projects are cached (in `target/benchmark_cache/`) + +The GitHub Actions workflow uses caching and concurrency controls to optimize performance. 
diff --git a/crates/karva_diffs/src/bin/karva-diagnostics.rs b/crates/karva_diffs/src/bin/karva-diagnostics.rs new file mode 100644 index 00000000..1c10a9ea --- /dev/null +++ b/crates/karva_diffs/src/bin/karva-diagnostics.rs @@ -0,0 +1,253 @@ +#![allow(clippy::print_stdout)] + +use std::{fs, path::PathBuf}; + +use anyhow::{Context, Result}; +use clap::{Parser, Subcommand}; +use karva_core::testing::setup_module; +use karva_diffs::{DiagnosticReport, get_real_world_projects, run_project_diagnostics}; + +#[derive(Parser)] +#[command(name = "karva-diagnostics")] +#[command(about = "Run diagnostic tests on real-world projects", long_about = None)] +struct Cli { + #[command(subcommand)] + command: Commands, +} + +#[derive(Subcommand)] +enum Commands { + /// Run diagnostics on all configured projects and output JSON + Run { + /// Output file for the diagnostic report (defaults to stdout) + #[arg(short, long)] + output: Option, + }, + /// Compare two diagnostic reports and output a markdown diff + Diff { + /// Path to the base report (e.g., from main branch) + #[arg(long)] + base: PathBuf, + + /// Path to the head report (e.g., from PR branch) + #[arg(long)] + head: PathBuf, + + /// Output file for the diff markdown (defaults to stdout) + #[arg(short, long)] + output: Option, + }, +} + +fn main() -> Result<()> { + // Initialize Python module + setup_module(); + + // Setup tracing + tracing_subscriber::fmt() + .with_env_filter( + tracing_subscriber::EnvFilter::try_from_default_env() + .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new("info")), + ) + .init(); + + let cli = Cli::parse(); + + match cli.command { + Commands::Run { output } => run_diagnostics(output), + Commands::Diff { base, head, output } => compare_diagnostics(&base, &head, output), + } +} + +fn run_diagnostics(output: Option) -> Result<()> { + let projects = get_real_world_projects(); + let mut report = DiagnosticReport::new(); + + eprintln!("Running diagnostics on {} project(s)...", projects.len()); + 
+ for project in projects { + eprintln!(" Testing project: {}", project.name); + match run_project_diagnostics(project) { + Ok(diagnostics) => { + eprintln!( + " ✓ {} tests ({} passed, {} failed, {} skipped)", + diagnostics.total_tests, + diagnostics.passed, + diagnostics.failed, + diagnostics.skipped + ); + report.add_project(diagnostics); + } + Err(e) => { + eprintln!(" ✗ Failed to run diagnostics: {e}"); + return Err(e); + } + } + } + + let json = report.to_json()?; + + if let Some(path) = output { + fs::write(&path, json).context("Failed to write output file")?; + eprintln!("\nReport written to: {}", path.display()); + } else { + println!("{json}"); + } + + Ok(()) +} + +fn compare_diagnostics(base: &PathBuf, head: &PathBuf, output: Option) -> Result<()> { + let base_json = fs::read_to_string(base) + .context(format!("Failed to read base file: {:?}", base.display()))?; + let head_json = fs::read_to_string(head) + .context(format!("Failed to read head file: {:?}", head.display()))?; + + let base_report = DiagnosticReport::from_json(&base_json)?; + let head_report = DiagnosticReport::from_json(&head_json)?; + + let diff = generate_diff(&base_report, &head_report); + + if let Some(path) = output { + fs::write(&path, diff).context("Failed to write output file")?; + eprintln!("Diff written to: {}", path.display()); + } else { + println!("{diff}"); + } + + Ok(()) +} + +fn generate_diff(base: &DiagnosticReport, head: &DiagnosticReport) -> String { + let mut diff = String::new(); + diff.push_str("# Diagnostic Diff Report\n\n"); + + // Summary table + diff.push_str("## Summary\n\n"); + diff.push_str("| Project | Tests | Passed | Failed | Skipped | Errors | Warnings |\n"); + diff.push_str("|---------|-------|--------|--------|---------|--------|----------|\n"); + + for head_project in &head.projects { + let base_project = base + .projects + .iter() + .find(|p| p.project_name == head_project.project_name); + + if let Some(base_proj) = base_project { + 
diff.push_str(&format!( + "| {} | {} {} | {} {} | {} {} | {} {} | {} {} | {} {} |\n", + head_project.project_name, + head_project.total_tests, + format_diff(base_proj.total_tests, head_project.total_tests), + head_project.passed, + format_diff(base_proj.passed, head_project.passed), + head_project.failed, + format_diff(base_proj.failed, head_project.failed), + head_project.skipped, + format_diff(base_proj.skipped, head_project.skipped), + head_project.error_count, + format_diff(base_proj.error_count, head_project.error_count), + head_project.warning_count, + format_diff(base_proj.warning_count, head_project.warning_count), + )); + } else { + diff.push_str(&format!( + "| {} | {} | {} | {} | {} | {} | {} |\n", + head_project.project_name, + head_project.total_tests, + head_project.passed, + head_project.failed, + head_project.skipped, + head_project.error_count, + head_project.warning_count, + )); + } + } + + diff.push_str("\n## Detailed Changes\n\n"); + + for head_project in &head.projects { + let base_project = base + .projects + .iter() + .find(|p| p.project_name == head_project.project_name); + + if let Some(base_proj) = base_project { + let has_changes = base_proj.total_tests != head_project.total_tests + || base_proj.passed != head_project.passed + || base_proj.failed != head_project.failed + || base_proj.skipped != head_project.skipped + || base_proj.error_count != head_project.error_count + || base_proj.warning_count != head_project.warning_count; + + if has_changes { + diff.push_str(&format!("### {}\n\n", head_project.project_name)); + + if base_proj.passed != head_project.passed { + diff.push_str(&format!( + "- **Passed tests:** {} → {} {}\n", + base_proj.passed, + head_project.passed, + change_emoji(base_proj.passed, head_project.passed, true) + )); + } + + if base_proj.failed != head_project.failed { + diff.push_str(&format!( + "- **Failed tests:** {} → {} {}\n", + base_proj.failed, + head_project.failed, + change_emoji(base_proj.failed, 
head_project.failed, false) + )); + } + + if base_proj.error_count != head_project.error_count { + diff.push_str(&format!( + "- **Errors:** {} → {} {}\n", + base_proj.error_count, + head_project.error_count, + change_emoji(base_proj.error_count, head_project.error_count, false) + )); + } + + if base_proj.warning_count != head_project.warning_count { + diff.push_str(&format!( + "- **Warnings:** {} → {} {}\n", + base_proj.warning_count, + head_project.warning_count, + change_emoji(base_proj.warning_count, head_project.warning_count, false) + )); + } + + diff.push('\n'); + } + } + } + + diff +} + +/// Format the delta between base and head counts, e.g. "(+5)" or "(-3)". +fn format_diff(base: usize, head: usize) -> String { + if base == head { + String::new() + } else if head > base { + format!("(+{})", head - base) + } else { + // Subtract in the safe direction: `head - base` on usize would + // panic in debug builds / wrap in release when head < base. + format!("(-{})", base - head) + } +} + +const fn change_emoji(base: usize, head: usize, increase_is_good: bool) -> &'static str { + if base == head { + "" + } else if head > base { + if increase_is_good { "✅" } else { "❌" } + } else if increase_is_good { + "❌" + } else { + "✅" + } +} diff --git a/crates/karva_diffs/src/lib.rs b/crates/karva_diffs/src/lib.rs new file mode 100644 index 00000000..f865eb64 --- /dev/null +++ b/crates/karva_diffs/src/lib.rs @@ -0,0 +1,126 @@ +//! Diagnostic diff testing for Karva on real-world projects. +//! +//! This crate tracks diagnostic changes across different versions of Karva +//! by running tests on real-world Python projects and comparing the +//! diagnostics output. This is similar to `mypy_primer` but focused on pytest +//! support tracking.
+ +use karva_core::TestRunner; +use karva_project::{ + path::absolute, + project::{Project, ProjectOptions}, + verbosity::VerbosityLevel, +}; +// Re-export project registry from karva_test +pub use karva_test::get_real_world_projects; +use karva_test::{InstalledProject, RealWorldProject}; + +/// Helper function to create a Project from an `InstalledProject` +#[must_use] +pub fn create_project(installed: &InstalledProject) -> Project { + let test_paths = installed.config().paths.clone(); + + let absolute_test_paths = test_paths + .iter() + .map(|path| absolute(path, installed.path())) + .collect(); + + Project::new(installed.path().to_path_buf(), absolute_test_paths).with_options( + ProjectOptions::new("test".to_string(), VerbosityLevel::Default, false, true), + ) +} + +/// Serializable diagnostic summary for a single project +#[derive(Debug, serde::Serialize, serde::Deserialize)] +pub struct ProjectDiagnostics { + pub project_name: String, + pub total_tests: usize, + pub passed: usize, + pub failed: usize, + pub skipped: usize, + pub error_count: usize, + pub warning_count: usize, +} + +/// Complete diagnostic report for all projects +#[derive(Debug, serde::Serialize, serde::Deserialize)] +pub struct DiagnosticReport { + pub projects: Vec, +} + +impl DiagnosticReport { + /// Create a new empty report + #[must_use] + pub const fn new() -> Self { + Self { + projects: Vec::new(), + } + } + + /// Add a project's diagnostics to the report + pub fn add_project(&mut self, diagnostics: ProjectDiagnostics) { + self.projects.push(diagnostics); + } + + /// Serialize to JSON string + pub fn to_json(&self) -> anyhow::Result { + Ok(serde_json::to_string_pretty(self)?) + } + + /// Create from JSON string + pub fn from_json(json: &str) -> anyhow::Result { + Ok(serde_json::from_str(json)?) 
+ } +} + +impl Default for DiagnosticReport { + fn default() -> Self { + Self::new() + } +} + +impl ProjectDiagnostics { + /// Create diagnostics from a test run result + #[must_use] + pub fn from_test_result( + project_name: String, + result: &karva_core::runner::diagnostic::TestRunResult, + ) -> Self { + let stats = result.stats(); + let mut error_count = 0; + let mut warning_count = 0; + + for diagnostic in result.diagnostics() { + if diagnostic.severity().is_error() { + error_count += 1; + } else { + warning_count += 1; + } + } + + Self { + project_name, + total_tests: stats.total(), + passed: stats.passed(), + failed: stats.failed(), + skipped: stats.skipped(), + error_count, + warning_count, + } + } +} + +/// Run diagnostics on a project and return the results +pub fn run_project_diagnostics(project: RealWorldProject) -> anyhow::Result { + let project_name = project.name.to_string(); + + // Setup the project (clone, install dependencies) + let installed = project.setup()?; + + // Create and run the project + let project = create_project(&installed); + let result = project.test(); + + // Create diagnostic summary + Ok(ProjectDiagnostics::from_test_result(project_name, &result)) +} diff --git a/crates/karva_test/Cargo.toml b/crates/karva_test/Cargo.toml index 95ea7007..aded1e6e 100644 --- a/crates/karva_test/Cargo.toml +++ b/crates/karva_test/Cargo.toml @@ -18,6 +18,10 @@ anyhow = { workspace = true } ruff_python_trivia = { workspace = true } rand = { workspace = true } dunce = { workspace = true } +tracing = { workspace = true } +ruff_python_ast = { workspace = true } +serde = { workspace = true, features = ["derive"] } +serde_json = { workspace = true } [lints] workspace = true diff --git a/crates/karva_test/src/lib.rs b/crates/karva_test/src/lib.rs index 1616bdf5..2b63c037 100644 --- a/crates/karva_test/src/lib.rs +++ b/crates/karva_test/src/lib.rs @@ -1,5 +1,9 @@ mod context; +pub mod real_world_projects; mod utils; pub use context::{IntegrationTestContext, 
TestContext}; +pub use real_world_projects::{ + InstalledProject, RealWorldProject, affect_project, get_real_world_projects, +}; pub use utils::find_karva_wheel; diff --git a/crates/karva_benchmark/src/real_world_projects.rs b/crates/karva_test/src/real_world_projects.rs similarity index 90% rename from crates/karva_benchmark/src/real_world_projects.rs rename to crates/karva_test/src/real_world_projects.rs index 1d3ebc27..3d2ac447 100644 --- a/crates/karva_benchmark/src/real_world_projects.rs +++ b/crates/karva_test/src/real_world_projects.rs @@ -1,25 +1,15 @@ #![allow(clippy::print_stderr)] -//! Infrastructure for benchmarking real-world Python projects. -//! -//! The module uses a setup similar to mypy primer's, which should make it easy -//! to add new benchmarks for projects in [mypy primer's project's list](https://github.com/hauntsaninja/mypy_primer/blob/ebaa9fd27b51a278873b63676fd25490cec6823b/mypy_primer/projects.py#L74). -//! -//! The basic steps for a project are: -//! 1. Clone or update the project into a directory inside `./target`. The commits are pinnted to prevent flaky benchmark results due to new commits. -//! 2. For projects with dependencies, run uv to create a virtual environment and install the dependencies. -//! 3. (optionally) Copy the entire project structure into a memory file system to reduce the IO noise in benchmarks. -//! 4. (not in this module) Create a `ProjectDatabase` and run the benchmark. 
- use std::{ path::{Path, PathBuf}, process::Command, }; use anyhow::{Context, Result}; -use karva_test::find_karva_wheel; use ruff_python_ast::PythonVersion; +use crate::find_karva_wheel; + fn global_venv_path() -> PathBuf { PathBuf::from(env!("CARGO_MANIFEST_DIR")) .parent() @@ -29,16 +19,16 @@ fn global_venv_path() -> PathBuf { .join(".venv") } -/// Configuration for a real-world project to benchmark +/// Configuration for a real-world project to benchmark or test #[derive(Debug, Clone)] pub struct RealWorldProject<'a> { - // The name of the project. + /// The name of the project. pub name: &'a str, /// The project's GIT repository. Must be publicly accessible. pub repository: &'a str, /// Specific commit hash to checkout pub commit: &'a str, - /// List of paths within the project to check (`ty check `) + /// List of paths within the project to test pub paths: Vec, /// Dependencies to install via uv pub dependencies: Vec<&'a str>, @@ -47,7 +37,7 @@ pub struct RealWorldProject<'a> { } impl<'a> RealWorldProject<'a> { - /// Setup a real-world project for benchmarking + /// Setup a real-world project for testing/benchmarking pub fn setup(self) -> Result> { tracing::debug!("Setting up project {}", self.name); @@ -352,3 +342,22 @@ fn cargo_target_directory() -> Option<&'static PathBuf> { }) .as_ref() } + +/// The affect project - a real-world Python project for testing +#[must_use] +pub fn affect_project() -> RealWorldProject<'static> { + RealWorldProject { + name: "affect", + repository: "https://github.com/MatthewMckee4/affect", + commit: "803cc916b492378a8ad8966e747cac3325e11b5f", + paths: vec![PathBuf::from("tests")], + dependencies: vec!["pydantic", "pydantic-settings", "pytest"], + python_version: PythonVersion::PY313, + } +} + +/// Registry of real-world projects used for benchmarking and diagnostic testing +#[must_use] +pub fn get_real_world_projects() -> Vec> { + vec![affect_project()] +}