sys-intelligence
diff --git a/‎.github/workflows/test.yml‎
Lines changed: 9 additions & 14 deletions b/‎.github/workflows/test.yml‎
Lines changed: 9 additions & 14 deletions
diff --git a/‎.gitignore‎
Lines changed: 57 additions & 0 deletions b/‎.gitignore‎
Lines changed: 57 additions & 0 deletions
diff --git a/‎benchmarks/arteval_bench/tests/test_sdk.py‎
Lines changed: 0 additions & 21 deletions b/‎benchmarks/arteval_bench/tests/test_sdk.py‎
Lines changed: 0 additions & 21 deletions
diff --git a/‎benchmarks/cache_bench/tests/test_sdk.py‎
Lines changed: 0 additions & 21 deletions b/‎benchmarks/cache_bench/tests/test_sdk.py‎
Lines changed: 0 additions & 21 deletions
diff --git a/‎benchmarks/course_exam_bench/tests/test_sdk.py‎
Lines changed: 0 additions & 21 deletions b/‎benchmarks/course_exam_bench/tests/test_sdk.py‎
Lines changed: 0 additions & 21 deletions
diff --git a/‎benchmarks/course_project_bench/tests/test_sdk.py‎
Lines changed: 0 additions & 21 deletions b/‎benchmarks/course_project_bench/tests/test_sdk.py‎
Lines changed: 0 additions & 21 deletions
diff --git a/‎benchmarks/example_bench/tests/test_benchmark.py‎
Lines changed: 60 additions & 0 deletions b/‎benchmarks/example_bench/tests/test_benchmark.py‎
Lines changed: 60 additions & 0 deletions
diff --git a/‎benchmarks/example_bench/tests/test_sdk.py‎
Lines changed: 0 additions & 21 deletions b/‎benchmarks/example_bench/tests/test_sdk.py‎
Lines changed: 0 additions & 21 deletions
diff --git a/‎pyproject.toml‎
Lines changed: 3 additions & 0 deletions b/‎pyproject.toml‎
Lines changed: 3 additions & 0 deletions
@@ -15,11 +15,12 @@ jobs:
       fail-fast: false
       matrix:
         benchmark:
-          - arteval_bench
-          - cache_bench
-          - course_exam_bench
-          - course_project_bench
           - example_bench
+          # TODO: For now, we comment out other benchmarks as they have no tests
+          # - arteval_bench
+          # - cache_bench
+          # - course_exam_bench
+          # - course_project_bench
 
     steps:
     - name: Checkout code
@@ -30,7 +31,7 @@ jobs:
       with:
         python-version: '3.9'
 
-    - name: Install dependencies for ${{ matrix.benchmark }}
+    - name: Install dependencies
       working-directory: benchmarks/${{ matrix.benchmark }}
       run: |
         python -m venv env${{ matrix.benchmark }}
@@ -40,16 +41,10 @@ jobs:
         if [ -f requirements.txt ]; then
           pip install -r requirements.txt
         fi
-        pip install -e ../../sdk
         deactivate
 
-    - name: Run tests for ${{ matrix.benchmark }}
-      working-directory: benchmarks/${{ matrix.benchmark }}
+    - name: Run tests
       run: |
-        source env${{ matrix.benchmark }}/bin/activate
-        pytest --version
-        pytest -v
+        source benchmarks/${{ matrix.benchmark }}/env${{ matrix.benchmark }}/bin/activate
+        pytest benchmarks/${{ matrix.benchmark }}/tests -v
         deactivate
-
-    - name: Test completed
-      run: echo "${{ matrix.benchmark }} tests completed successfully."
@@ -0,0 +1,57 @@
+# Python
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# Virtual environments
+.venv
+venv/
+ENV/
+env/
+.env
+env*/
+
+# IDE
+.vscode/
+.idea/
+*.swp
+*.swo
+*~
+
+# Testing
+.pytest_cache/
+.coverage
+htmlcov/
+.tox/
+
+# Logs
+logs/
+*.log
+
+# OS
+.DS_Store
+Thumbs.db
+
+# Project specific
+outputs/
+*.jsonl
+!benchmarks/*/data/**/*.jsonl
@@ -0,0 +1,60 @@
+"""Tests for the example benchmark."""
+
+import json
+import unittest
+from pathlib import Path
+
+
+class TestExampleBenchmark(unittest.TestCase):
+    def test_data_format(self):
+        """Test that benchmark data is in the correct format."""
+        data_path = (
+            Path(__file__).parent.parent
+            / "data"
+            / "benchmark"
+            / "example_bench_benchmark_timestamp.jsonl"
+        )
+
+        self.assertTrue(
+            data_path.exists(), f"Benchmark data file not found: {data_path}"
+        )
+
+        with open(data_path, encoding="utf-8") as f:
+            for line_num, line in enumerate(f, 1):
+                data = json.loads(line)
+
+                # Check required fields
+                self.assertIn("id", data, f'Line {line_num}: missing "id" field')
+                self.assertIn(
+                    "sys_prompt", data, f'Line {line_num}: missing "sys_prompt" field'
+                )
+                self.assertIn(
+                    "user_prompt", data, f'Line {line_num}: missing "user_prompt" field'
+                )
+                self.assertIn(
+                    "response", data, f'Line {line_num}: missing "response" field'
+                )
+
+                # Check field types
+                self.assertIsInstance(
+                    data["id"], str, f'Line {line_num}: "id" must be a string'
+                )
+                self.assertIsInstance(
+                    data["sys_prompt"],
+                    str,
+                    f'Line {line_num}: "sys_prompt" must be a string',
+                )
+                self.assertIsInstance(
+                    data["user_prompt"],
+                    str,
+                    f'Line {line_num}: "user_prompt" must be a string',
+                )
+                self.assertIsInstance(
+                    data["response"],
+                    str,
+                    f'Line {line_num}: "response" must be a string',
+                )
+
+
+if __name__ == "__main__":
+    unittest.main()
@@ -49,3 +49,6 @@ convention = "google"
 [tool.ruff.format]
 docstring-code-format = true
 quote-style = "single"
+
+[tool.pytest.ini_options]
+pythonpath = ["."]