From 1b4c179fea614643fdd0d54d822f3918119eda06 Mon Sep 17 00:00:00 2001
From: Robert Tjarko Lange
Date: Thu, 25 Sep 2025 06:38:52 +0200
Subject: [PATCH 01/50] Update README.md with arxiv
---
README.md | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/README.md b/README.md
index 55f40d262..5929d744d 100644
--- a/README.md
+++ b/README.md
@@ -7,12 +7,12 @@
-
+
-`ShinkaEvolve` is a framework that combines Large Language Models (LLMs) with evolutionary algorithms to drive scientific discovery. By leveraging the creative capabilities of LLMs and the optimization power of evolutionary search, `ShinkaEvolve` enables automated exploration and improvement of scientific code. The system is inspired by the [AI Scientist](https://sakana.ai/ai-scientist/), [AlphaEvolve](https://deepmind.google/discover/blog/alphaevolve-a-gemini-powered-coding-agent-for-designing-advanced-algorithms/) and the [Darwin Goedel Machine](https://sakana.ai/dgm/): It maintains a population of programs that evolve over generations, with an ensemble of LLMs acting as intelligent mutation operators that suggest code improvements.
+[`ShinkaEvolve`](https://arxiv.org/abs/2509.19349) is a framework that combines Large Language Models (LLMs) with evolutionary algorithms to drive scientific discovery. By leveraging the creative capabilities of LLMs and the optimization power of evolutionary search, `ShinkaEvolve` enables automated exploration and improvement of scientific code. The system is inspired by the [AI Scientist](https://sakana.ai/ai-scientist/), [AlphaEvolve](https://deepmind.google/discover/blog/alphaevolve-a-gemini-powered-coding-agent-for-designing-advanced-algorithms/) and the [Darwin Goedel Machine](https://sakana.ai/dgm/): It maintains a population of programs that evolve over generations, with an ensemble of LLMs acting as intelligent mutation operators that suggest code improvements.
The framework supports **parallel evaluation of candidates** locally or on a Slurm cluster. It maintains an archive of successful solutions, enabling knowledge transfer between different evolutionary islands. `ShinkaEvolve` is particularly well-suited for scientific tasks where there is a verifier available and the goal is to optimize performance metrics while maintaining code correctness and readability.
@@ -313,4 +313,4 @@ If you use `ShinkaEvolve` in your research, please cite it as follows:
journal={arXiv preprint},
year={2025}
}
-```
\ No newline at end of file
+```
From 2fb7548ce032da3c24e0a34893c8feb5413795dd Mon Sep 17 00:00:00 2001
From: "takeru.fukushima" <100330935+takeruhukushima@users.noreply.github.com>
Date: Thu, 25 Sep 2025 16:33:57 +0900
Subject: [PATCH 02/50] add google gemini embedding model
---
examples/shinka_tutorial.ipynb | 15 ++++++++++
shinka/core/runner.py | 7 ++++-
shinka/database/dbase.py | 7 +++--
shinka/llm/embedding.py | 52 ++++++++++++++++++++++++++++++++--
4 files changed, 76 insertions(+), 5 deletions(-)
diff --git a/examples/shinka_tutorial.ipynb b/examples/shinka_tutorial.ipynb
index 66a71a073..c6d818994 100644
--- a/examples/shinka_tutorial.ipynb
+++ b/examples/shinka_tutorial.ipynb
@@ -237,6 +237,17 @@
"if not llm_models:\n",
" llm_models = [\"gpt-5-mini\"] # fallback if no keys detected\n",
"\n",
+ "# pick embedding model based on available keys\n",
+ "embedding_model_name = \"\"\n",
+ "if os.getenv(\"GEMINI_API_KEY\"):\n",
+ " embedding_model_name = \"gemini-embedding-001\"\n",
+ "elif os.getenv(\"OPENAI_API_KEY\"):\n",
+ " embedding_model_name = \"text-embedding-3-small\"\n",
+ "else:\n",
+ " embedding_model_name = \"text-embedding-3-small\"\n",
+ "print(f\"β
Embedding model selected: {embedding_model_name}\")\n",
+ "\n",
+ "\n",
"# unique experiment directory\n",
"timestamp = dt.datetime.now().strftime(\"%Y%m%d_%H%M%S\")\n",
"run_tag = f\"{timestamp}_weighted_fast\"\n",
@@ -271,6 +282,8 @@
" max_novelty_attempts=3,\n",
" # ensemble llm selection among candidates based on past performance\n",
" llm_dynamic_selection=None, # e.g. \"ucb1\"\n",
+ " # set embedding model\n",
+ " embedding_model=embedding_model_name,\n",
")\n",
"\n",
"db_config = DatabaseConfig(\n",
@@ -286,11 +299,13 @@
" enforce_island_separation=True,\n",
" parent_selection_strategy=\"weighted\",\n",
" parent_selection_lambda=10.0,\n",
+ " \n",
")\n",
"\n",
"job_config = LocalJobConfig(eval_program_path=\"evaluate.py\")\n",
"\n",
"print(\"llm_models:\", llm_models)\n",
+ "print(\"embedding_model:\", embedding_model_name)\n",
"print(\"results_dir:\", evo_config.results_dir)"
]
},
diff --git a/shinka/core/runner.py b/shinka/core/runner.py
index 3c818742c..c8c7c431c 100644
--- a/shinka/core/runner.py
+++ b/shinka/core/runner.py
@@ -158,7 +158,12 @@ def __init__(
# Initialize database and scheduler
db_config.db_path = str(db_path)
- self.db = ProgramDatabase(config=db_config)
+ embedding_model_to_use = (
+ evo_config.embedding_model or "text-embedding-3-small"
+ )
+ self.db = ProgramDatabase(
+ config=db_config, embedding_model=embedding_model_to_use
+ )
self.scheduler = JobScheduler(
job_type=evo_config.job_type,
config=job_config, # type: ignore
diff --git a/shinka/database/dbase.py b/shinka/database/dbase.py
index 69fdf5432..c6a2b89bf 100644
--- a/shinka/database/dbase.py
+++ b/shinka/database/dbase.py
@@ -82,6 +82,9 @@ class DatabaseConfig:
# Beam search parent selection parameters
num_beams: int = 5
+ # Embedding model name
+ embedding_model: str = "text-embedding-3-small"
+
def db_retry(max_retries=5, initial_delay=0.1, backoff_factor=2):
"""
@@ -248,12 +251,12 @@ class ProgramDatabase:
populations, and an archive of elites.
"""
- def __init__(self, config: DatabaseConfig, read_only: bool = False):
+ def __init__(self, config: DatabaseConfig,embedding_model: str = "text-embedding-3-small", read_only: bool = False):
self.config = config
self.conn: Optional[sqlite3.Connection] = None
self.cursor: Optional[sqlite3.Cursor] = None
self.read_only = read_only
- self.embedding_client = EmbeddingClient()
+ self.embedding_client = EmbeddingClient(model_name=embedding_model)
self.last_iteration: int = 0
self.best_program_id: Optional[str] = None
diff --git a/shinka/llm/embedding.py b/shinka/llm/embedding.py
index a5c6b07cc..1f2ad495f 100644
--- a/shinka/llm/embedding.py
+++ b/shinka/llm/embedding.py
@@ -1,5 +1,6 @@
import os
import openai
+import google.generativeai as genai
import pandas as pd
from typing import Union, List, Optional, Tuple
import numpy as np
@@ -20,13 +21,23 @@
"azure-text-embedding-3-large",
]
+GEMINI_EMBEDDING_MODELS = [
+ "gemini-embedding-exp-03-07",
+ "gemini-embedding-001",
+]
+
OPENAI_EMBEDDING_COSTS = {
"text-embedding-3-small": 0.02 / M,
"text-embedding-3-large": 0.13 / M,
}
+# Gemini embedding costs (approximate - check current pricing)
+GEMINI_EMBEDDING_COSTS = {
+ "gemini-embedding-exp-03-07": 0.0 / M, # Experimental model, often free
+ "gemini-embedding-001": 0.0 / M, # Check current pricing
+}
-def get_client_model(model_name: str) -> tuple[openai.OpenAI, str]:
+def get_client_model(model_name: str) -> tuple[Union[openai.OpenAI, str], str]:
if model_name in OPENAI_EMBEDDING_MODELS:
client = openai.OpenAI()
model_to_use = model_name
@@ -38,6 +49,14 @@ def get_client_model(model_name: str) -> tuple[openai.OpenAI, str]:
api_version=os.getenv("AZURE_API_VERSION"),
azure_endpoint=os.getenv("AZURE_API_ENDPOINT"),
)
+ elif model_name in GEMINI_EMBEDDING_MODELS:
+ # Configure Gemini API
+ api_key = os.getenv("GOOGLE_API_KEY")
+ if not api_key:
+ raise ValueError("GOOGLE_API_KEY environment variable not set for Gemini models")
+ genai.configure(api_key=api_key)
+ client = "gemini" # Use string identifier for Gemini
+ model_to_use = model_name
else:
raise ValueError(f"Invalid embedding model: {model_name}")
@@ -52,9 +71,10 @@ def __init__(
Initialize the EmbeddingClient.
Args:
- model (str): The OpenAI embedding model name to use.
+ model (str): The OpenAI, Azure, or Gemini embedding model name to use.
"""
self.client, self.model = get_client_model(model_name)
+ self.model_name = model_name
self.verbose = verbose
def get_embedding(
@@ -76,6 +96,34 @@ def get_embedding(
single_code = True
else:
single_code = False
+ # Handle Gemini models
+ if self.model_name in GEMINI_EMBEDDING_MODELS:
+ try:
+ embeddings = []
+ total_tokens = 0
+
+ for text in code:
+ result = genai.embed_content(
+ model=f"models/{self.model}",
+ content=text,
+ task_type="retrieval_document"
+ )
+ embeddings.append(result['embedding'])
+ total_tokens += len(text.split())
+
+ cost = total_tokens * GEMINI_EMBEDDING_COSTS.get(self.model, 0.0)
+
+ if single_code:
+ return embeddings[0] if embeddings else [], cost
+ else:
+ return embeddings, cost
+ except Exception as e:
+ logger.error(f"Error getting Gemini embedding: {e}")
+ if single_code:
+ return [], 0.0
+ else:
+ return [[]], 0.0
+ # Handle OpenAI and Azure models (same interface)
try:
response = self.client.embeddings.create(
model=self.model, input=code, encoding_format="float"
From 27af71c2db24c3ebba14d9ac7f0f6e9aee2aff7f Mon Sep 17 00:00:00 2001
From: Dixing Xu
Date: Thu, 25 Sep 2025 18:13:56 +0800
Subject: [PATCH 03/50] fix: Fix database summary when patch_name metadata is
missing
---
shinka/database/display.py | 16 ++++++++++++++--
1 file changed, 14 insertions(+), 2 deletions(-)
diff --git a/shinka/database/display.py b/shinka/database/display.py
index 4c34d3445..3e55439bf 100644
--- a/shinka/database/display.py
+++ b/shinka/database/display.py
@@ -122,6 +122,18 @@ def print_program_summary(self, program, console: Optional[RichConsole] = None):
else:
time_display = f"{time_val:.1f}s"
+ # Safely extract metadata fields for display
+ metadata = program.metadata or {}
+ patch_name_raw = metadata.get("patch_name", "[dim]N/A[/dim]")
+ if patch_name_raw is None:
+ patch_name_raw = "[dim]N/A[/dim]"
+ patch_name = str(patch_name_raw)[:30]
+
+ patch_type_raw = metadata.get("patch_type", "[dim]N/A[/dim]")
+ if patch_type_raw is None:
+ patch_type_raw = "[dim]N/A[/dim]"
+ patch_type = str(patch_type_raw)
+
# Add the data row
island_display = (
f"I-{program.island_idx}" if program.island_idx is not None else "N/A"
@@ -131,8 +143,8 @@ def print_program_summary(self, program, console: Optional[RichConsole] = None):
island_display,
status_display,
score_display,
- program.metadata.get("patch_name", "[dim]N/A[/dim]")[:30],
- program.metadata.get("patch_type", "[dim]N/A[/dim]"),
+ patch_name,
+ patch_type,
f"{program.complexity:.1f}",
cost_display,
time_display,
From 9586cdbe7025537ffa9f22b641cc2aa3f95cddc7 Mon Sep 17 00:00:00 2001
From: Robert Tjarko Lange
Date: Fri, 26 Sep 2025 09:32:04 +0200
Subject: [PATCH 04/50] Update README.md
---
README.md | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/README.md b/README.md
index 5929d744d..0098c7556 100644
--- a/README.md
+++ b/README.md
@@ -52,9 +52,9 @@ For detailed installation instructions and usage examples, see the [Getting Star
| Example | Description | Environment Setup |
|---------|-------------|-------------------|
| β [Circle Packing](examples/circle_packing) | Optimize circle packing to maximize radii. | `LocalJobConfig` |
-| π€ [Agent Design](examples/agent_design) | Design agent scaffolds for math tasks. | `LocalJobConfig` |
+| π€ [Agent Design](examples/adas_aime) | Design agent scaffolds for math tasks. | `LocalJobConfig` |
| π― [ALE-Bench](examples/ale_bench) | Code optimization for ALE-Bench tasks. | `LocalJobConfig` |
-| β¨ [Novelty Generator](examples/novelty_generator_bck) | Generate creative, surprising outputs (e.g., ASCII art). | `LocalJobConfig` |
+| β¨ [Novelty Generator](examples/novelty_generator) | Generate creative, surprising outputs (e.g., ASCII art). | `LocalJobConfig` |
## `shinka` Run with Python API π
From a60bc9e4782ee77a5684841a6252c87ece6fe562 Mon Sep 17 00:00:00 2001
From: Koki-Kazaore
Date: Sun, 28 Sep 2025 19:12:28 +0900
Subject: [PATCH 05/50] docs: change repo name on the onboarding doc
---
docs/getting_started.md | 14 +++++++-------
1 file changed, 7 insertions(+), 7 deletions(-)
diff --git a/docs/getting_started.md b/docs/getting_started.md
index 234158839..a866c011f 100644
--- a/docs/getting_started.md
+++ b/docs/getting_started.md
@@ -53,7 +53,7 @@ pip install uv
```bash
git clone
-cd shinka
+cd ShinkaEvolve
# Create virtual environment with Python 3.11
uv venv --python 3.11
@@ -79,7 +79,7 @@ conda activate shinka
```bash
git clone
-cd shinka
+cd ShinkaEvolve
pip install -e .
```
@@ -249,7 +249,7 @@ from shinka.core import run_shinka_eval
def main(program_path: str, results_dir: str):
"""Main evaluation function called by Shinka"""
-
+
metrics, correct, error_msg = run_shinka_eval(
program_path=program_path,
results_dir=results_dir,
@@ -268,11 +268,11 @@ def main(program_path: str, results_dir: str):
def validate_packing(run_output):
"""Returns (is_valid: bool, error_msg: str or None)"""
centers, radii, reported_sum = run_output
-
+
# Check constraints (bounds, overlaps, etc.)
if constraint_violated:
return False, "Specific error description"
-
+
return True, None # Valid solution
```
@@ -280,10 +280,10 @@ def validate_packing(run_output):
```python
def aggregate_metrics(results, results_dir):
"""Returns metrics dictionary with required structure"""
-
+
# Extract data from results
centers, radii, reported_sum = results[0]
-
+
return {
"combined_score": float(reported_sum), # PRIMARY FITNESS (higher = better)
"public": { # Visible in WebUI/logs
From 00035528af09a03b36d42a4e276f9f61c3e124d7 Mon Sep 17 00:00:00 2001
From: Edoardo Cetin <32273096+Aladoro@users.noreply.github.com>
Date: Sun, 28 Sep 2025 20:47:42 +0900
Subject: [PATCH 06/50] Update README
---
README.md | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/README.md b/README.md
index 0098c7556..b0dba5f7d 100644
--- a/README.md
+++ b/README.md
@@ -308,9 +308,9 @@ If you use `ShinkaEvolve` in your research, please cite it as follows:
```
@article{lange2025shinka,
- title={ShinkaEvolve: Towards Open-Ended and Sample-Efficient Program Evolution},
+ title={ShinkaEvolve: Towards Open-Ended And Sample-Efficient Program Evolution},
author={Lange, Robert Tjarko and Imajuku, Yuki and Cetin, Edoardo},
- journal={arXiv preprint},
+ journal={arXiv preprint arXiv:2509.19349},
year={2025}
}
```
From be2e2037c90a6cf081d9a8eb38e2ccedd48e6211 Mon Sep 17 00:00:00 2001
From: vicruz99
Date: Sun, 12 Oct 2025 14:55:07 +0100
Subject: [PATCH 07/50] Added a doc explaining how to add support for a local
LLM and embedding model
---
docs/support_local_llm.md | 232 ++++++++++++++++++++++++++++++++++++++
1 file changed, 232 insertions(+)
create mode 100644 docs/support_local_llm.md
diff --git a/docs/support_local_llm.md b/docs/support_local_llm.md
new file mode 100644
index 000000000..5f406e7b9
--- /dev/null
+++ b/docs/support_local_llm.md
@@ -0,0 +1,232 @@
+
+# π§© Integrating Local LLMs into **ShinkaEvolve**
+
+## π§ Overview
+
+The original **ShinkaEvolve** code does **not** include built-in support for running **local LLMs**.
+To enable this functionality, parts of the codebase can be modified to integrate locally hosted models.
+
+---
+
+## ποΈ Code Organization
+
+**ShinkaEvolve** uses a **modular architecture** that supports multiple **LLM providers**.
+The relevant code for LLM interaction is located in the **`LLM/`** folder, which manages all model communications.
+ShinkaEvolve distinguishes between two LLM types:
+
+* **Regular LLMs**
+* **Embedding LLMs**
+
+---
+
+## βοΈ Adding a Regular LLM
+
+To add support for a **regular LLM**, follow these steps. They will show an example of adding support for gpt-oss models running with unsloth, which provides an API compatible with OpenAI API (v1/completions).
+This LLM can then be specified in the configuration variables:
+
+```yaml
+llm_models:
+meta_llm_models:
+```
+
+---
+
+### π§ Step 1: Modify the Client
+
+The file **`client.py`** is responsible for creating clients that interact with LLMs.
+Each client instance is later used to query a specific model.
+
+To add a local model, introduce a new client configuration.
+The API URL is extracted from the model name, which follows this format:
+
+```
+local-gptoss-unsloth-url
+```
+
+#### Example
+
+```python
+elif "local-gptoss-unsloth" in model_name:
+ # Extract URL from model name
+ pattern = r"https?://"
+ match = re.search(pattern, model_name)
+ if match:
+ start_index = match.start()
+ url = model_name[start_index:]
+ else:
+ raise ValueError(f"Invalid URL in model name: {model_name}")
+
+ # Create OpenAI-compatible client
+ client = openai.OpenAI(
+ api_key="filler",
+ base_url=url
+ )
+
+ # Structured output mode (if required)
+ if structured_output:
+ client = instructor.from_openai(
+ client,
+ mode=instructor.Mode.JSON,
+ )
+```
+
+---
+
+### π Step 2: Create the Local Query Function
+
+Inside the **`models/`** folder, create a new subfolder to store the query functions for your local models:
+
+```
+LLM/models/local/
+```
+
+> Don't forget to include an empty `__init__.py` file.
+
+This folder should contain a **custom query function** for the local model. I called my file local_gptoss_unsloth.py.
+It should follow the same structure as other functions in `LLM/models/`, but with small adjustments.
+
+#### My Key Adjustments
+
+* Replace `max_output_tokens` with **`max_tokens`** to match the local API.
+* Extract additional response metadata such as:
+
+ * `total_tokens`
+ * `thinking_tokens` (if your model includes reasoning traces)
+
+This function is later imported and registered in **`query.py`**.
+
+---
+
+### π§© Step 3: Update `__init__.py`
+
+Configure **`__init__.py`** to include and expose the new local query function, so it can be imported elsewhere.
+
+```
+from .local.local_gptoss_unsloth import query_local_gptoss_unsloth # ADDED THIS LINE
+from .result import QueryResult
+
+__all__ = [
+ "query_anthropic",
+ "query_openai",
+ "query_deepseek",
+ "query_gemini",
+ "query_local_gptoss_unsloth", # ADDED THIS LINE
+ "QueryResult",
+]
+```
+
+---
+
+### π¬ Step 4: Update `query.py`
+
+Import and register the new local query function in query.py.
+
+#### Imports
+
+```python
+from .models import (
+ query_anthropic,
+ query_openai,
+ query_deepseek,
+ query_gemini,
+ query_local_gptoss_unsloth, # ADDED THIS LINE
+ QueryResult,
+)
+```
+
+#### Model Selection Logic
+
+```python
+elif "local-gptoss-unsloth" in model_name: # ADDED THIS LINE
+ query_fn = query_local_gptoss_unsloth
+```
+
+---
+
+### π§ Step 5: Other Observations
+
+The file **`query.py`** also defines functions such as:
+
+* `sample_model_kwargs`
+* `sample_batch_kwargs`
+
+However, these are **not referenced anywhere else** in the repository, so no modifications are required here for now.
+
+---
+
+### β
Summary
+
+| Step | File | Change | Description |
+| ---- | -------------------------------------------- | -------------------- | -------------------------------------------------------- |
+| 1 | `client.py` | Add new client block | Create OpenAI-compatible client for local LLM |
+| 2 | `models/local/query_local_gptoss_unsloth.py` | New function | Query local model, adjust tokens, extract reasoning info |
+| 3 | `__init__.py` | Add import | Expose new query function |
+| 4 | `query.py` | Register model | Add conditional for local LLM |
+| 5 | β | Review only | Ignored unused functions |
+
+---
+
+## 𧬠Adding a Local Embedding Model
+
+For embedding models, you can use **Ollama**, which follows the **OpenAI API** format.
+The only relevant file is **`embedding.py`**.
+
+### Code Addition
+
+```python
+elif model_name.startswith("local-"):
+ # Pattern: local-(model-name)-(http or https url)
+ match = re.match(r"local-(.+?)-(https?://.+)", model_name)
+ if match:
+ model_to_use = match.group(1)
+ url = match.group(2)
+ else:
+ raise ValueError(f"Invalid local model format: {model_name}")
+
+ client = openai.OpenAI(
+ base_url=url,
+ api_key="filler"
+ )
+```
+
+#### Notes
+
+* Compatible with **any Ollama model**.
+* The model name must follow this convention:
+
+ ```
+ local-model-name-url
+ ```
+* The code extracts both `model-name` and `url`, and uses them to query Ollama.
+
+---
+
+### Query Logic
+
+The existing line in **`embedding.py`** remains unchanged:
+
+```python
+response = self.client.embeddings.create(
+ model=self.model,
+ input=code,
+ encoding_format="float"
+)
+```
+
+For local embedding models, `self.model` corresponds to the extracted model name.
+The only addition to the **Embedding Client** class:
+
+```python
+elif self.model_name.startswith("local-"):
+ cost = 0.0
+```
+
+---
+
+## π Result
+
+ShinkaEvolve can now connect to **locally hosted LLMs** and **embedding models** through **OpenAI-compatible APIs**.
+This setup supports **Ollama** and other frameworks such as **gpt-oss** under **Unsloth**.
+
+If your model has different requirements, follow the same pattern with a distinct model identifier and your own custom logic.
+
From bf0c1d47576f5cb34870a9bad26592e50b3eb4cc Mon Sep 17 00:00:00 2001
From: LiaCastaneda
Date: Mon, 13 Oct 2025 11:04:22 +0200
Subject: [PATCH 08/50] Add rust to supported languages
---
shinka/core/runner.py | 9 ++++++---
shinka/database/complexity.py | 4 ++--
shinka/edit/apply_diff.py | 4 +++-
shinka/edit/apply_full.py | 4 +++-
shinka/edit/async_apply.py | 26 ++++++++++++++++++++++++++
5 files changed, 40 insertions(+), 7 deletions(-)
diff --git a/shinka/core/runner.py b/shinka/core/runner.py
index 3c818742c..37b876d00 100644
--- a/shinka/core/runner.py
+++ b/shinka/core/runner.py
@@ -231,6 +231,8 @@ def __init__(
self.lang_ext = "cpp"
elif self.evo_config.language == "python":
self.lang_ext = "py"
+ elif self.evo_config.language == "rust":
+ self.lang_ext = "rs"
else:
msg = f"Language {self.evo_config.language} not supported"
raise ValueError(msg)
@@ -1096,9 +1098,10 @@ def run_patch(
# error_attempt is already set from apply_patch or default
pass
- # Only consider the diff summary for the original.py file!!!
- if "original.py" in diff_summary:
- diff_summary = diff_summary["original.py"]
+ # Only consider the diff summary for the original source file
+ original_filename = f"original.{self.lang_ext}"
+ if original_filename in diff_summary:
+ diff_summary = diff_summary[original_filename]
meta_edit_data = {
"patch_type": patch_type,
diff --git a/shinka/database/complexity.py b/shinka/database/complexity.py
index 4116567e9..933d7f4e6 100644
--- a/shinka/database/complexity.py
+++ b/shinka/database/complexity.py
@@ -259,8 +259,8 @@ def analyze_code_metrics(code_string, language="python"):
# If Python parsing fails, fall back to C++ analysis
return analyze_cpp_complexity(code_string)
- # For C/C++/CUDA and other languages, use regex-based analysis
- elif language in ["cpp", "c", "cuda", "c++"]:
+ # For C/C++/CUDA/Rust and other languages, use regex-based analysis
+ elif language in ["cpp", "c", "cuda", "c++", "rust"]:
return analyze_cpp_complexity(code_string)
# For unknown languages, use simple line-based complexity
diff --git a/shinka/edit/apply_diff.py b/shinka/edit/apply_diff.py
index ead28e231..4b5f29148 100644
--- a/shinka/edit/apply_diff.py
+++ b/shinka/edit/apply_diff.py
@@ -698,7 +698,7 @@ def apply_diff_patch(
patch_str = _strip_trailing_whitespace(patch_str)
# Remove the EVOLVE-BLOCK START and EVOLVE-BLOCK END markers
- if language in ["cuda", "cpp"]:
+ if language in ["cuda", "cpp", "rust"]:
patch_str = re.sub(r"// EVOLVE-BLOCK START\\n", "", patch_str)
patch_str = re.sub(r"// EVOLVE-BLOCK END\\n", "", patch_str)
elif language == "python":
@@ -730,6 +730,8 @@ def apply_diff_patch(
suffix = ".cpp"
elif language == "cuda":
suffix = ".cu"
+ elif language == "rust":
+ suffix = ".rs"
else:
raise ValueError(f"Language {language} not supported")
diff --git a/shinka/edit/apply_full.py b/shinka/edit/apply_full.py
index b7e2e2b37..9b14f21ee 100644
--- a/shinka/edit/apply_full.py
+++ b/shinka/edit/apply_full.py
@@ -102,7 +102,7 @@ def apply_full_patch(
# We need to find the actual start of the comment line
if language == "python":
end_marker = "# EVOLVE-BLOCK-END"
- elif language in ["cuda", "cpp"]:
+ elif language in ["cuda", "cpp", "rust"]:
end_marker = "// EVOLVE-BLOCK-END"
else:
end_marker = "# EVOLVE-BLOCK-END" # Default fallback
@@ -146,6 +146,8 @@ def apply_full_patch(
suffix = ".cpp"
elif language == "cuda":
suffix = ".cu"
+ elif language == "rust":
+ suffix = ".rs"
else:
raise ValueError(f"Language {language} not supported")
diff --git a/shinka/edit/async_apply.py b/shinka/edit/async_apply.py
index 8e542c565..4ffd15bed 100644
--- a/shinka/edit/async_apply.py
+++ b/shinka/edit/async_apply.py
@@ -118,6 +118,32 @@ async def validate_code_async(
error_msg = stderr.decode() if stderr else "Unknown compilation error"
return False, error_msg
+ elif language == "rust":
+ # Use rustc for Rust syntax checking
+ proc = await asyncio.create_subprocess_exec(
+ "rustc",
+ "--crate-type=lib",
+ "-Zparse-only",
+ code_path,
+ stdout=asyncio.subprocess.PIPE,
+ stderr=asyncio.subprocess.PIPE,
+ )
+
+ try:
+ stdout, stderr = await asyncio.wait_for(
+ proc.communicate(), timeout=timeout
+ )
+ except asyncio.TimeoutError:
+ proc.kill()
+ await proc.wait()
+ return False, f"Validation timeout after {timeout}s"
+
+ if proc.returncode == 0:
+ return True, None
+ else:
+ error_msg = stderr.decode() if stderr else "Unknown compilation error"
+ return False, error_msg
+
elif language == "cpp":
# Use g++ for C++ compilation check
proc = await asyncio.create_subprocess_exec(
From 77d1819454673d0f007f5f9044e87475a1b56a14 Mon Sep 17 00:00:00 2001
From: Takuya Akiba
Date: Tue, 14 Oct 2025 23:44:38 +0900
Subject: [PATCH 09/50] Ensure setuptools discovers subpackages
---
pyproject.toml | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/pyproject.toml b/pyproject.toml
index e3ec455af..f05429b60 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -48,9 +48,11 @@ dependencies = [
]
[tool.setuptools]
-packages = ["shinka"]
script-files = ["shinka/shinka_launch", "shinka/shinka_visualize"]
+[tool.setuptools.packages.find]
+include = ["shinka", "shinka.*"]
+
[tool.setuptools.package-data]
"*" = ["*"]
From 929f072e7879852893b959aa4079d903c27aa76f Mon Sep 17 00:00:00 2001
From: Takuya Akiba
Date: Tue, 14 Oct 2025 23:44:59 +0900
Subject: [PATCH 10/50] Mark shinka.webui as a package
---
shinka/webui/__init__.py | 0
1 file changed, 0 insertions(+), 0 deletions(-)
create mode 100644 shinka/webui/__init__.py
diff --git a/shinka/webui/__init__.py b/shinka/webui/__init__.py
new file mode 100644
index 000000000..e69de29bb
From 23ace365b4123f6369b98b0bcc5a853984c7da72 Mon Sep 17 00:00:00 2001
From: 51616
Date: Fri, 24 Oct 2025 13:28:16 +0000
Subject: [PATCH 11/50] fix apply_full.py when the patch has incomplete (0,1)
markers instead of expected 2 (end and start) markers
---
shinka/edit/apply_full.py | 174 +++++++++++++++++++++++++++++++-------
tests/test_edit_base.py | 139 ++++++++++++++++++++++++++++++
2 files changed, 284 insertions(+), 29 deletions(-)
diff --git a/shinka/edit/apply_full.py b/shinka/edit/apply_full.py
index b7e2e2b37..e0b76c892 100644
--- a/shinka/edit/apply_full.py
+++ b/shinka/edit/apply_full.py
@@ -1,6 +1,6 @@
from pathlib import Path
from typing import Optional, Union
-from .apply_diff import write_git_diff, _mutable_ranges
+from .apply_diff import write_git_diff, _mutable_ranges, EVOLVE_START, EVOLVE_END
from shinka.llm import extract_between
import logging
@@ -72,10 +72,15 @@ def apply_full_patch(
updated_content = ""
last_end = 0
- # Check if patch_code contains EVOLVE-BLOCK markers
- patch_mutable_ranges = _mutable_ranges(patch_code)
+ # Detect EVOLVE markers presence in the patch content
+ patch_has_start = EVOLVE_START.search(patch_code) is not None
+ patch_has_end = EVOLVE_END.search(patch_code) is not None
+ patch_has_both = patch_has_start and patch_has_end
+ patch_has_none = not patch_has_start and not patch_has_end
- if patch_mutable_ranges:
+ if patch_has_both:
+ # Patch contains both EVOLVE-BLOCK markers, extract from them
+ patch_mutable_ranges = _mutable_ranges(patch_code)
# Patch contains EVOLVE-BLOCK markers, extract from them
for i, (start, end) in enumerate(mutable_ranges):
# Add immutable part before this mutable range
@@ -91,47 +96,158 @@ def apply_full_patch(
updated_content += replacement_content
last_end = end
- else:
+ elif patch_has_none:
# Patch doesn't contain EVOLVE-BLOCK markers
# Assume entire patch content should replace all mutable regions
if len(mutable_ranges) == 1:
- # Single mutable region, replace with entire patch content
+ # Single mutable region. If the patch appears to be a full-file
+ # rewrite that omitted EVOLVE markers, safely extract only the
+ # content intended for the evolve block by matching immutable
+ # prefix/suffix from the original file.
start, end = mutable_ranges[0]
- # The mutable range ends before "EVOLVE-BLOCK-END" text
- # We need to find the actual start of the comment line
- if language == "python":
- end_marker = "# EVOLVE-BLOCK-END"
- elif language in ["cuda", "cpp"]:
- end_marker = "// EVOLVE-BLOCK-END"
- else:
- end_marker = "# EVOLVE-BLOCK-END" # Default fallback
-
- end_marker_pos = original.find(end_marker, end - 5)
- if end_marker_pos == -1:
- # Fallback: use the original end position
- end_marker_pos = end
+ # Immutable portions that remain outside the evolve block
+ immutable_prefix = original[:start]
+ immutable_suffix = original[end:]
- # Ensure proper newline handling around the patch content
- if patch_code and not patch_code.startswith("\n"):
- patch_code = "\n" + patch_code
+ # Also compute the portions strictly outside the marker lines
+ # to detect full-file patches that omitted EVOLVE markers.
+ # Find the start and end marker line boundaries.
+ start_match = None
+ end_match = None
+ for m in EVOLVE_START.finditer(original):
+ if m.end() == start:
+ start_match = m
+ break
+ for m in EVOLVE_END.finditer(original):
+ if m.start() == end:
+ end_match = m
+ break
- if patch_code and not patch_code.endswith("\n"):
- patch_code = patch_code + "\n"
-
- updated_content = (
- original[:start] + patch_code + original[end_marker_pos:]
+ prefix_outside = (
+ original[: start_match.start()] if start_match else immutable_prefix
+ )
+ suffix_outside = (
+ original[end_match.end() :] if end_match else immutable_suffix
)
+
+ # Heuristic: if patch includes the same immutable prefix/suffix
+ # outside the markers, treat the middle part as the evolve-block
+ # replacement. Be tolerant to a missing trailing newline in the
+ # footer by checking both versions.
+ suffix_opts = (suffix_outside, suffix_outside.rstrip("\r\n"))
+ if patch_code.startswith(prefix_outside) and any(
+ patch_code.endswith(sfx) for sfx in suffix_opts
+ ):
+ mid_start = len(prefix_outside)
+ # choose the matching suffix option to compute end
+ sfx = next(sfx for sfx in suffix_opts if patch_code.endswith(sfx))
+ mid_end = len(patch_code) - len(sfx)
+ replacement_content = patch_code[mid_start:mid_end]
+ # Ensure marker boundaries stay on their own lines.
+ # Add a leading newline only if there is a START marker.
+ if (
+ start_match is not None
+ and replacement_content
+ and not replacement_content.startswith("\n")
+ ):
+ replacement_content = "\n" + replacement_content
+ # Add a trailing newline only if there is an END marker.
+ if (
+ end_match is not None
+ and replacement_content
+ and not replacement_content.endswith("\n")
+ ):
+ replacement_content = replacement_content + "\n"
+ updated_content = (
+ immutable_prefix + replacement_content + immutable_suffix
+ )
+ else:
+ # Otherwise, assume the patch_code represents only the
+ # evolve-block payload and insert it directly between markers.
+ # Ensure proper newline handling around the patch content.
+ payload = patch_code
+ if (
+ start_match is not None
+ and payload
+ and not payload.startswith("\n")
+ ):
+ payload = "\n" + payload
+ if end_match is not None and payload and not payload.endswith("\n"):
+ payload = payload + "\n"
+ updated_content = immutable_prefix + payload + immutable_suffix
else:
- # Multiple mutable regions, this is ambiguous
+ # Multiple EVOLVE-BLOCK regions found, ambiguous without markers
error_message = (
"Multiple EVOLVE-BLOCK regions found but patch "
"doesn't specify which to replace"
)
return original, 0, None, error_message, None, None
+ else:
+ # Patch contains exactly one marker (START xor END).
+ # Only safe to apply when original has a single evolve region.
+ if len(mutable_ranges) != 1:
+ error_message = (
+ "Patch contains only one EVOLVE-BLOCK marker, but the original "
+ f"has {len(mutable_ranges)} editable regions; cannot determine target"
+ )
+ return original, 0, None, error_message, None, None
+
+ # Single target region in original
+ start, end = mutable_ranges[0]
+ immutable_prefix = original[:start]
+ immutable_suffix = original[end:]
+
+ # Find exact marker locations in original for newline policy
+ start_match = None
+ end_match = None
+ for m in EVOLVE_START.finditer(original):
+ if m.end() == start:
+ start_match = m
+ break
+ for m in EVOLVE_END.finditer(original):
+ if m.start() == end:
+ end_match = m
+ break
+
+ # Compute outside-of-markers prefix/suffix from original
+ prefix_outside = (
+ original[: start_match.start()] if start_match else immutable_prefix
+ )
+ suffix_outside = (
+ original[end_match.end() :] if end_match else immutable_suffix
+ )
+
+ # Extract payload based on which single marker is present in patch
+ if patch_has_start and not patch_has_end:
+ m = EVOLVE_START.search(patch_code)
+ payload = patch_code[m.end() :] if m else patch_code
+ # Trim footer if the patch included it
+ for sfx in (suffix_outside, suffix_outside.rstrip("\r\n")):
+ if sfx and payload.endswith(sfx):
+ payload = payload[: -len(sfx)]
+ break
+ elif patch_has_end and not patch_has_start:
+ m = EVOLVE_END.search(patch_code)
+ payload = patch_code[: m.start()] if m else patch_code
+ # Trim header if the patch included it
+ for pfx in (prefix_outside, prefix_outside.rstrip("\r\n")):
+ if pfx and payload.startswith(pfx):
+ payload = payload[len(pfx) :]
+ break
+ else:
+ payload = patch_code
+
+ # Normalize newlines so markers remain on their own lines
+ if start_match is not None and payload and not payload.startswith("\n"):
+ payload = "\n" + payload
+ if end_match is not None and payload and not payload.endswith("\n"):
+ payload = payload + "\n"
+
+ updated_content = immutable_prefix + payload + immutable_suffix
# Add remaining immutable content after last mutable range
- if patch_mutable_ranges and mutable_ranges:
+ if patch_has_both and mutable_ranges:
updated_content += original[mutable_ranges[-1][1] :]
num_applied = 1
diff --git a/tests/test_edit_base.py b/tests/test_edit_base.py
index edc0e1178..67c6f2e20 100644
--- a/tests/test_edit_base.py
+++ b/tests/test_edit_base.py
@@ -161,6 +161,110 @@ def new_func2():
# Should have replaced both evolve blocks with new content
+def test_apply_full_patch_full_file_without_markers_extracts_block_only():
+ """Full-file patch without EVOLVE markers should not copy immutable code
+ into the evolve block; only the block payload is replaced."""
+ original_content = """# Header line\n# EVOLVE-BLOCK-START\nold_line()\n# EVOLVE-BLOCK-END\n# Footer line\n"""
+
+ # Patch is the entire file content but with the EVOLVE markers omitted.
+ patch_content = """```python
+new_line()
+another_new_line()
+```"""
+
+ expected = """# Header line
+# EVOLVE-BLOCK-START
+new_line()
+another_new_line()
+# EVOLVE-BLOCK-END
+# Footer line
+"""
+
+ result = apply_full_patch(
+ patch_str=patch_content,
+ original_str=original_content,
+ language="python",
+ verbose=False,
+ )
+ updated_content, num_applied, output_path, error, patch_txt, diff_path = result
+
+ assert error is None
+ assert num_applied == 1
+ assert updated_content == expected
+
+
+def test_apply_full_patch_patch_with_start_marker_only():
+ """Patch has only START marker; original has both markers."""
+ original_content = """# Header line
+# EVOLVE-BLOCK-START
+old_line()
+# EVOLVE-BLOCK-END
+# Footer line
+"""
+
+ patch_content = """```python
+# Header line
+# EVOLVE-BLOCK-START
+new_line()
+# Footer line
+```"""
+
+ expected = """# Header line
+# EVOLVE-BLOCK-START
+new_line()
+# EVOLVE-BLOCK-END
+# Footer line
+"""
+
+ result = apply_full_patch(
+ patch_str=patch_content,
+ original_str=original_content,
+ language="python",
+ verbose=False,
+ )
+ updated_content, num_applied, output_path, error, patch_txt, diff_path = result
+
+ assert error is None
+ assert num_applied == 1
+ assert updated_content == expected
+
+
+def test_apply_full_patch_patch_with_end_marker_only():
+ """Patch has only END marker; original has both markers."""
+ original_content = """# Header line
+# EVOLVE-BLOCK-START
+old_line()
+# EVOLVE-BLOCK-END
+# Footer line
+"""
+
+ patch_content = """```python
+# Header line
+new_line()
+# EVOLVE-BLOCK-END
+# Footer line
+```"""
+
+ expected = """# Header line
+# EVOLVE-BLOCK-START
+new_line()
+# EVOLVE-BLOCK-END
+# Footer line
+"""
+
+ result = apply_full_patch(
+ patch_str=patch_content,
+ original_str=original_content,
+ language="python",
+ verbose=False,
+ )
+ updated_content, num_applied, output_path, error, patch_txt, diff_path = result
+
+ assert error is None
+ assert num_applied == 1
+ assert updated_content == expected
+
+
def test_apply_full_patch_no_evolve_blocks():
"""Test apply_full_patch with no EVOLVE-BLOCK regions - should error."""
original_content = """# Just regular code
@@ -221,6 +325,41 @@ def new_function():
assert updated_content == original_content # Should return original content
+def test_apply_full_patch_patch_with_single_marker_ambiguous_multiple_regions():
+ """Single marker in patch is ambiguous when original has multiple regions."""
+ original_content = """# Header
+# EVOLVE-BLOCK-START
+func1()
+# EVOLVE-BLOCK-END
+
+# EVOLVE-BLOCK-START
+func2()
+# EVOLVE-BLOCK-END
+# Footer
+"""
+
+ # Patch includes only START marker
+ patch_content = """```python
+# Header
+# EVOLVE-BLOCK-START
+new_code()
+# Footer
+```"""
+
+ updated_content, num_applied, output_path, error, patch_txt, diff_path = (
+ apply_full_patch(
+ patch_str=patch_content,
+ original_str=original_content,
+ language="python",
+ verbose=False,
+ )
+ )
+
+ assert num_applied == 0
+ assert error is not None
+ assert "only one EVOLVE-BLOCK marker" in error
+
+
def test_apply_full_patch_invalid_extraction():
"""Test apply_full_patch with invalid code extraction."""
original_content = """# EVOLVE-BLOCK-START
From c5b1abe80331532aed5ce1e1fbd7fd5e7d14b087 Mon Sep 17 00:00:00 2001
From: Robert Tjarko Lange
Date: Mon, 27 Oct 2025 16:20:22 +0100
Subject: [PATCH 12/50] Update README.md
---
README.md | 1 +
1 file changed, 1 insertion(+)
diff --git a/README.md b/README.md
index b0dba5f7d..7a59f760e 100644
--- a/README.md
+++ b/README.md
@@ -26,6 +26,7 @@ The framework supports **parallel evaluation of candidates** locally or on a Slu
| π **[Tutorial Notebook](examples/shinka_tutorial.ipynb)** | Interactive walkthrough of Shinka features | Hands-on examples, configuration, best practices |
| βοΈ **[Configuration](docs/configuration.md)** | Comprehensive configuration reference | All config options, optimization settings, advanced features |
| π¨ **[WebUI](docs/webui.md)** | Interactive visualization and monitoring | Real-time tracking, result analysis, debugging tools |
+| πΉοΈ **[Local LLM Support](https://github.com/SakanaAI/ShinkaEvolve/blob/main/docs/support_local_llm.md)** | Instructions for Local LLMs | How to set up local LLMs on your machine |
## Installation & Quick Start π
From ded457647e3fe9d50d2ddf756d00d66ae890f0bd Mon Sep 17 00:00:00 2001
From: Robert Tjarko Lange
Date: Mon, 27 Oct 2025 20:36:19 +0100
Subject: [PATCH 13/50] Update inspirations.py - archive
---
shinka/database/inspirations.py | 11 +++++++----
1 file changed, 7 insertions(+), 4 deletions(-)
diff --git a/shinka/database/inspirations.py b/shinka/database/inspirations.py
index ee564dfa1..42c3859d8 100644
--- a/shinka/database/inspirations.py
+++ b/shinka/database/inspirations.py
@@ -72,6 +72,7 @@ def sample_context(self, parent: Any, n: int) -> List[Any]:
self.cursor.execute(
"""
SELECT p.id FROM programs p
+ JOIN archive a ON p.id = a.program_id
WHERE p.island_idx = ? AND p.correct = 1
ORDER BY p.combined_score DESC
LIMIT ?
@@ -93,7 +94,8 @@ def sample_context(self, parent: Any, n: int) -> List[Any]:
placeholders_rand = ",".join("?" * len(insp_ids))
sql_rand = f"""
SELECT p.id FROM programs p
- WHERE p.island_idx = ? AND p.correct = 1
+ JOIN archive a ON p.id = a.program_id
+ WHERE p.island_idx = ? AND p.correct = 1
AND p.id NOT IN ({placeholders_rand})
ORDER BY RANDOM() LIMIT ?
"""
@@ -111,9 +113,10 @@ def sample_context(self, parent: Any, n: int) -> List[Any]:
needed = n - len(inspirations)
if needed > 0:
placeholders_rand = ",".join("?" * len(insp_ids))
- sql_rand = f"""SELECT id FROM programs
- WHERE correct = 1
- AND id NOT IN ({placeholders_rand})
+ sql_rand = f"""SELECT p.id FROM programs p
+ JOIN archive a ON p.id = a.program_id
+ WHERE p.correct = 1
+ AND p.id NOT IN ({placeholders_rand})
ORDER BY RANDOM() LIMIT ?
"""
params_rand = list(insp_ids) + [needed]
From ee6e8a5e98478e53948ddacb94588a727c10521b Mon Sep 17 00:00:00 2001
From: Robert Tjarko Lange
Date: Mon, 27 Oct 2025 21:07:23 +0100
Subject: [PATCH 14/50] Update dependencies gemini embed
---
pyproject.toml | 1 +
1 file changed, 1 insertion(+)
diff --git a/pyproject.toml b/pyproject.toml
index f05429b60..f60d0b659 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -45,6 +45,7 @@ dependencies = [
"adjustText",
"markdown",
"aiofiles",
+ "google-generativeai",
]
[tool.setuptools]
From a759778b5f410528a99a878e724c5e6ac7511ed2 Mon Sep 17 00:00:00 2001
From: Robert Tjarko Lange
Date: Thu, 30 Oct 2025 11:07:50 +0100
Subject: [PATCH 15/50] Update dbase.py path default
---
shinka/database/dbase.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/shinka/database/dbase.py b/shinka/database/dbase.py
index c6a2b89bf..aef4f7219 100644
--- a/shinka/database/dbase.py
+++ b/shinka/database/dbase.py
@@ -50,7 +50,7 @@ def clean_nan_values(obj: Any) -> Any:
@dataclass
class DatabaseConfig:
- db_path: Optional[str] = None
+ db_path: str = "evolution_db.sqlite"
num_islands: int = 4
archive_size: int = 100
From c097a8821ff081c433fa285874448467e5b9f04a Mon Sep 17 00:00:00 2001
From: Robert Tjarko Lange
Date: Thu, 30 Oct 2025 21:03:34 +0100
Subject: [PATCH 16/50] Fix reasoning token sampling
---
shinka/llm/query.py | 18 ++++++------------
1 file changed, 6 insertions(+), 12 deletions(-)
diff --git a/shinka/llm/query.py b/shinka/llm/query.py
index a7288df8e..218ae33eb 100644
--- a/shinka/llm/query.py
+++ b/shinka/llm/query.py
@@ -137,16 +137,13 @@ def sample_model_kwargs(
r_effort = random.choice(reasoning_efforts)
think_bool = r_effort != "auto"
if think_bool:
- thinking_tokens = [
- t
- for t in THINKING_TOKENS.values()
- if t < kwargs_dict["max_tokens"] and t >= 1024
- ]
+ t = THINKING_TOKENS[r_effort]
+ thinking_tokens = t if t < kwargs_dict["max_tokens"] else 1024
kwargs_dict["extra_body"] = {
"extra_body": {
"google": {
"thinking_config": {
- "thinking_budget": random.choice(thinking_tokens),
+ "thinking_budget": thinking_tokens,
"include_thoughts": True,
}
}
@@ -161,15 +158,12 @@ def sample_model_kwargs(
if think_bool:
# filter thinking tokens to be smaller than max_tokens
# not auto THINKING_TOKENS
- thinking_tokens = [
- t
- for t in THINKING_TOKENS.values()
- if t < kwargs_dict["max_tokens"] and t >= 1024
- ]
+ t = THINKING_TOKENS[r_effort]
+ thinking_tokens = t if t < kwargs_dict["max_tokens"] else 1024
# sample only from thinking tokens that are valid
kwargs_dict["thinking"] = {
"type": "enabled",
- "budget_tokens": random.choice(thinking_tokens),
+ "budget_tokens": thinking_tokens,
}
else:
From 6d5e208ae04e18ba906d8f2c6e77ae6facf0afb7 Mon Sep 17 00:00:00 2001
From: Robert Tjarko Lange
Date: Thu, 30 Oct 2025 22:49:31 +0100
Subject: [PATCH 17/50] Fix anthropic budget sampling
---
shinka/llm/query.py | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/shinka/llm/query.py b/shinka/llm/query.py
index 218ae33eb..c88c7d7c3 100644
--- a/shinka/llm/query.py
+++ b/shinka/llm/query.py
@@ -154,7 +154,8 @@ def sample_model_kwargs(
REASONING_CLAUDE_MODELS + REASONING_BEDROCK_MODELS
):
kwargs_dict["max_tokens"] = min(random.choice(max_tokens), 16384)
- think_bool = random.choice(reasoning_efforts) != "auto"
+ r_effort = random.choice(reasoning_efforts)
+ think_bool = r_effort != "auto"
if think_bool:
# filter thinking tokens to be smaller than max_tokens
# not auto THINKING_TOKENS
From 9b4d7c760ab9b0d13ee0fb672c24cc0f14336c4d Mon Sep 17 00:00:00 2001
From: RobertTLange
Date: Sun, 2 Nov 2025 10:00:19 +0100
Subject: [PATCH 18/50] fix shinka_launch --help
---
configs/config.yaml | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/configs/config.yaml b/configs/config.yaml
index 9702c6617..577e1dfe2 100644
--- a/configs/config.yaml
+++ b/configs/config.yaml
@@ -2,9 +2,9 @@ defaults:
- _self_
- database@_global_: island_small
- evolution@_global_: small_budget
- - task@_global_: mad_tf
+ - task@_global_: circle_packing
- cluster@_global_: local
- - variant@_global_: mad_tf_example
+ - variant@_global_: circle_packing_example
verbose: false
results_dir: results
From d7a3f7e77d45c156b45bbc92f2a39de7e5b4e131 Mon Sep 17 00:00:00 2001
From: RobertTLange
Date: Sun, 2 Nov 2025 10:05:49 +0100
Subject: [PATCH 19/50] fix wrap_eval catch
---
shinka/core/wrap_eval.py | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/shinka/core/wrap_eval.py b/shinka/core/wrap_eval.py
index 7e1d1e5d3..bf2cf92eb 100644
--- a/shinka/core/wrap_eval.py
+++ b/shinka/core/wrap_eval.py
@@ -96,6 +96,9 @@ def run_shinka_eval(
num_valid_runs = 0
num_invalid_runs = 0
+ all_run_results: List[Any] = []
+ execution_times: List[float] = []
+
try:
module = load_program(program_path)
if not hasattr(module, experiment_fn_name):
@@ -105,9 +108,6 @@ def run_shinka_eval(
)
experiment_fn = getattr(module, experiment_fn_name)
- all_run_results: List[Any] = []
- execution_times: List[float] = []
-
for i in range(num_runs):
kwargs: Dict[str, Any] = {}
if get_experiment_kwargs:
From 397e0fd67e6c04c7b82124da715c2cdc99d53efa Mon Sep 17 00:00:00 2001
From: RobertTLange
Date: Sun, 2 Nov 2025 10:10:10 +0100
Subject: [PATCH 20/50] add documentation for resuming experiments
---
docs/getting_started.md | 69 +++++++++++++++++++++++++++++++++++++++++
1 file changed, 69 insertions(+)
diff --git a/docs/getting_started.md b/docs/getting_started.md
index a866c011f..d40c16b59 100644
--- a/docs/getting_started.md
+++ b/docs/getting_started.md
@@ -331,6 +331,75 @@ The `run_shinka_eval` function returns three values:
## Advanced Usage
+### Resuming Experiments
+
+If you need to pause and resume an evolutionary run, or extend a completed run with more generations, Shinka supports seamless resumption from existing results.
+
+#### How Resuming Works
+
+When you specify an existing `results_dir` that contains a database, Shinka will:
+- Detect the previous run automatically
+- Restore the population database and all program history
+- Resume meta-recommendations from the last checkpoint
+- Continue from the last completed generation
+
+#### Using the CLI (Hydra)
+
+```bash
+# Resume an existing run and extend to 50 generations
+shinka_launch \
+ variant=circle_packing_example \
+ evo_config.results_dir=results_20250101_120000 \
+ evo_config.num_generations=50
+
+# Or with a custom task
+shinka_launch \
+ task=circle_packing \
+ database=island_small \
+ evolution=small_budget \
+ cluster=local \
+ evo_config.results_dir=path/to/previous/results \
+ evo_config.num_generations=100
+```
+
+#### Using the Python API
+
+```python
+from shinka.core import EvolutionRunner, EvolutionConfig
+from shinka.database import DatabaseConfig
+from shinka.launch import LocalJobConfig
+
+# Point to existing results directory
+evo_config = EvolutionConfig(
+ num_generations=50, # Extend to 50 total generations
+ results_dir="results_20250101_120000", # Existing results
+ # ... other config parameters ...
+)
+
+job_config = LocalJobConfig(
+ eval_program_path="examples/circle_packing/evaluate.py",
+)
+
+db_config = DatabaseConfig(
+ archive_size=20,
+ num_islands=2,
+)
+
+# Run will automatically detect and resume
+runner = EvolutionRunner(
+ evo_config=evo_config,
+ job_config=job_config,
+ db_config=db_config,
+)
+runner.run()
+```
+
+**Important Notes:**
+- The `num_generations` parameter should be set to the **total** number of generations you want (not additional generations)
+- For example, if you completed 20 generations and want 30 more, set `num_generations=50`
+- The database configuration (number of islands, archive size, etc.) should match the original run
+- All previous progress, including the best solutions and meta-recommendations, will be preserved
+
### Environment Management for Local Jobs
When running jobs locally, you have several options for managing Python environments:
From f6896dc03d63571c12506fcc85fced52a93da4b0 Mon Sep 17 00:00:00 2001
From: RobertTLange
Date: Sun, 2 Nov 2025 10:26:33 +0100
Subject: [PATCH 21/50] fix OAI dependency db for visualization
---
shinka/database/dbase.py | 14 ++++++++++++--
1 file changed, 12 insertions(+), 2 deletions(-)
diff --git a/shinka/database/dbase.py b/shinka/database/dbase.py
index aef4f7219..2118763c4 100644
--- a/shinka/database/dbase.py
+++ b/shinka/database/dbase.py
@@ -251,12 +251,22 @@ class ProgramDatabase:
populations, and an archive of elites.
"""
- def __init__(self, config: DatabaseConfig,embedding_model: str = "text-embedding-3-small", read_only: bool = False):
+ def __init__(
+ self,
+ config: DatabaseConfig,
+ embedding_model: str = "text-embedding-3-small",
+ read_only: bool = False,
+ ):
self.config = config
self.conn: Optional[sqlite3.Connection] = None
self.cursor: Optional[sqlite3.Cursor] = None
self.read_only = read_only
- self.embedding_client = EmbeddingClient(model_name=embedding_model)
+ # Only create embedding client if not in read-only mode
+ # (e.g., WebUI doesn't need it for visualization)
+ if not read_only:
+ self.embedding_client = EmbeddingClient(model_name=embedding_model)
+ else:
+ self.embedding_client = None
self.last_iteration: int = 0
self.best_program_id: Optional[str] = None
From 1d9d498054af8da462c1cf6f14aa3cb566973108 Mon Sep 17 00:00:00 2001
From: Robert Tjarko Lange
Date: Sun, 2 Nov 2025 13:27:56 +0100
Subject: [PATCH 22/50] Fix init program island copying -> archive
---
shinka/database/islands.py | 10 ++++++++++
1 file changed, 10 insertions(+)
diff --git a/shinka/database/islands.py b/shinka/database/islands.py
index 9975eac3b..341dea79c 100644
--- a/shinka/database/islands.py
+++ b/shinka/database/islands.py
@@ -682,6 +682,16 @@ def copy_program_to_islands(self, program: Any) -> List[str]:
f"Created copy {new_id[:8]}... of program {program.id[:8]}... "
f"for island {island_idx}"
)
+
+ # Add the copied program to the archive if it's correct
+ # This ensures it can be used as inspiration for that island
+ if program.correct:
+ self.cursor.execute(
+ "INSERT OR IGNORE INTO archive (program_id) VALUES (?)",
+ (new_id,),
+ )
+ logger.debug(f"Added copy {new_id[:8]}... to archive (correct program)")
+
self.conn.commit()
logger.info(
f"Created {len(created_ids)} copies of program "
From 2f01b3ed549793fda12aa1f5b157cc617ec80eb1 Mon Sep 17 00:00:00 2001
From: "takeru.fukushima" <100330935+takeruhukushima@users.noreply.github.com>
Date: Mon, 3 Nov 2025 16:28:09 +0900
Subject: [PATCH 23/50] fix:GEMINI_API_KEY name error
---
shinka/llm/embedding.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/shinka/llm/embedding.py b/shinka/llm/embedding.py
index 1f2ad495f..4082ad58b 100644
--- a/shinka/llm/embedding.py
+++ b/shinka/llm/embedding.py
@@ -51,9 +51,9 @@ def get_client_model(model_name: str) -> tuple[Union[openai.OpenAI, str], str]:
)
elif model_name in GEMINI_EMBEDDING_MODELS:
# Configure Gemini API
- api_key = os.getenv("GOOGLE_API_KEY")
+ api_key = os.getenv("GEMINI_API_KEY")
if not api_key:
- raise ValueError("GOOGLE_API_KEY environment variable not set for Gemini models")
+ raise ValueError("GEMINI_API_KEY environment variable not set for Gemini models")
genai.configure(api_key=api_key)
client = "gemini" # Use string identifier for Gemini
model_to_use = model_name
From f5f7e68f2ec3423291ac9e98bb1836478b757df0 Mon Sep 17 00:00:00 2001
From: ifsheldon
Date: Sat, 8 Nov 2025 17:29:11 +0800
Subject: [PATCH 24/50] use dependency-groups.dev
---
pyproject.toml | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/pyproject.toml b/pyproject.toml
index f60d0b659..5802a1522 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -57,8 +57,8 @@ include = ["shinka", "shinka.*"]
[tool.setuptools.package-data]
"*" = ["*"]
-[tool.uv]
-dev-dependencies = [
+[dependency-groups]
+dev = [
"pytest>=6.0",
"black",
"isort",
From 14739fc5e364eda8fc7ff184f5811e45d0d00657 Mon Sep 17 00:00:00 2001
From: Arun Parthiban
Date: Sat, 8 Nov 2025 07:05:54 -0500
Subject: [PATCH 25/50] Add support for Claude Sonnet 4.5
(claude-sonnet-4-5-20250929)
---
shinka/llm/models/pricing.py | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/shinka/llm/models/pricing.py b/shinka/llm/models/pricing.py
index c9c101a2c..a4595a99d 100644
--- a/shinka/llm/models/pricing.py
+++ b/shinka/llm/models/pricing.py
@@ -35,6 +35,10 @@
"input_price": 3.0 / M,
"output_price": 15.0 / M,
},
+ "claude-sonnet-4-5-20250929": {
+ "input_price": 3.0 / M,
+ "output_price": 15.0 / M,
+ },
}
OPENAI_MODELS = {
@@ -176,6 +180,7 @@
REASONING_CLAUDE_MODELS = [
"claude-3-7-sonnet-20250219",
"claude-4-sonnet-20250514",
+ "claude-sonnet-4-5-20250929",
]
REASONING_DEEPSEEK_MODELS = [
From ed9f51f49305d14091f339a1487ed9e534f96591 Mon Sep 17 00:00:00 2001
From: Jeethu Rao
Date: Mon, 3 Nov 2025 16:09:08 +0000
Subject: [PATCH 26/50] Add Swift language support
---
shinka/core/runner.py | 2 ++
shinka/database/complexity.py | 4 ++--
shinka/edit/apply_diff.py | 4 +++-
shinka/edit/apply_full.py | 2 ++
shinka/edit/async_apply.py | 26 +++++++++++++++++++++++++-
5 files changed, 34 insertions(+), 4 deletions(-)
diff --git a/shinka/core/runner.py b/shinka/core/runner.py
index f1b5e947d..975ab5373 100644
--- a/shinka/core/runner.py
+++ b/shinka/core/runner.py
@@ -238,6 +238,8 @@ def __init__(
self.lang_ext = "py"
elif self.evo_config.language == "rust":
self.lang_ext = "rs"
+ elif self.evo_config.language == "swift":
+ self.lang_ext = "swift"
else:
msg = f"Language {self.evo_config.language} not supported"
raise ValueError(msg)
diff --git a/shinka/database/complexity.py b/shinka/database/complexity.py
index 933d7f4e6..30a46aa31 100644
--- a/shinka/database/complexity.py
+++ b/shinka/database/complexity.py
@@ -259,8 +259,8 @@ def analyze_code_metrics(code_string, language="python"):
# If Python parsing fails, fall back to C++ analysis
return analyze_cpp_complexity(code_string)
- # For C/C++/CUDA/Rust and other languages, use regex-based analysis
- elif language in ["cpp", "c", "cuda", "c++", "rust"]:
+ # For C/C++/CUDA/Rust/Swift and other languages, use regex-based analysis
+ elif language in ["cpp", "c", "cuda", "c++", "rust", "swift"]:
return analyze_cpp_complexity(code_string)
# For unknown languages, use simple line-based complexity
diff --git a/shinka/edit/apply_diff.py b/shinka/edit/apply_diff.py
index 4b5f29148..af1dff747 100644
--- a/shinka/edit/apply_diff.py
+++ b/shinka/edit/apply_diff.py
@@ -698,7 +698,7 @@ def apply_diff_patch(
patch_str = _strip_trailing_whitespace(patch_str)
# Remove the EVOLVE-BLOCK START and EVOLVE-BLOCK END markers
- if language in ["cuda", "cpp", "rust"]:
+ if language in ["cuda", "cpp", "rust", "swift"]:
patch_str = re.sub(r"// EVOLVE-BLOCK START\\n", "", patch_str)
patch_str = re.sub(r"// EVOLVE-BLOCK END\\n", "", patch_str)
elif language == "python":
@@ -732,6 +732,8 @@ def apply_diff_patch(
suffix = ".cu"
elif language == "rust":
suffix = ".rs"
+ elif language == "swift":
+ suffix = ".swift"
else:
raise ValueError(f"Language {language} not supported")
diff --git a/shinka/edit/apply_full.py b/shinka/edit/apply_full.py
index 4cc4ddca4..f175aec74 100644
--- a/shinka/edit/apply_full.py
+++ b/shinka/edit/apply_full.py
@@ -264,6 +264,8 @@ def apply_full_patch(
suffix = ".cu"
elif language == "rust":
suffix = ".rs"
+ elif language == "swift":
+ suffix = ".swift"
else:
raise ValueError(f"Language {language} not supported")
diff --git a/shinka/edit/async_apply.py b/shinka/edit/async_apply.py
index 4ffd15bed..e4c21202f 100644
--- a/shinka/edit/async_apply.py
+++ b/shinka/edit/async_apply.py
@@ -143,7 +143,6 @@ async def validate_code_async(
else:
error_msg = stderr.decode() if stderr else "Unknown compilation error"
return False, error_msg
-
elif language == "cpp":
# Use g++ for C++ compilation check
proc = await asyncio.create_subprocess_exec(
@@ -154,6 +153,31 @@ async def validate_code_async(
stderr=asyncio.subprocess.PIPE,
)
+ try:
+ stdout, stderr = await asyncio.wait_for(
+ proc.communicate(), timeout=timeout
+ )
+ except asyncio.TimeoutError:
+ proc.kill()
+ await proc.wait()
+ return False, f"Validation timeout after {timeout}s"
+
+ if proc.returncode == 0:
+ return True, None
+ else:
+ error_msg = stderr.decode() if stderr else "Unknown compilation error"
+ return False, error_msg
+ elif language == "swift":
+ # Use swiftc for Swift syntax checking
+ proc = await asyncio.create_subprocess_exec(
+ "swiftc",
+ "-typecheck",
+ "-parse-as-library",
+ code_path,
+ stdout=asyncio.subprocess.PIPE,
+ stderr=asyncio.subprocess.PIPE,
+ )
+
try:
stdout, stderr = await asyncio.wait_for(
proc.communicate(), timeout=timeout
From 0437118c4518139f79a96b4b44e173bb05b39745 Mon Sep 17 00:00:00 2001
From: Aladoro
Date: Tue, 11 Nov 2025 03:44:03 +0000
Subject: [PATCH 27/50] ignore warning for correct behavior when no improvement
is detected, keeping the tracked llm scores in log space to -inf
---
shinka/llm/dynamic_sampling.py | 16 +++++++++-------
1 file changed, 9 insertions(+), 7 deletions(-)
diff --git a/shinka/llm/dynamic_sampling.py b/shinka/llm/dynamic_sampling.py
index 6c038d9fa..eb0cd8cb3 100644
--- a/shinka/llm/dynamic_sampling.py
+++ b/shinka/llm/dynamic_sampling.py
@@ -28,7 +28,8 @@ def _logdiffexp(a_log, b_log):
def _logexpm1(z):
z = np.asarray(z, dtype=float)
- return np.where(z > 50.0, z, np.log(np.expm1(z)))
+ with np.errstate(divide='ignore', invalid='ignore'):
+ return np.where(z > 50.0, z, np.log(np.expm1(z)))
class BanditBase(ABC):
@@ -433,12 +434,13 @@ def decay(self, factor: float) -> None:
if self.use_exponential_scaling and self.asymmetric_scaling:
# shrink in exp space to match original score scale
s = self.s
- log1p_term = np.where(
- s > 0.0,
- s + np.log(one_minus_factor + np.exp(-s)),
- np.log1p(one_minus_factor * np.exp(s)),
- )
- self.s = s + np.log(factor) - log1p_term
+ with np.errstate(divide='ignore', invalid='ignore'):
+ log1p_term = np.where(
+ s > 0.0,
+ s + np.log(one_minus_factor + np.exp(-s)),
+ np.log1p(one_minus_factor * np.exp(s)),
+ )
+ self.s = s + np.log(factor) - log1p_term
if self.adaptive_scale and np.isfinite(self._obs_max):
means_log = self._mean()
From 259e786777cf042535d129cbdbc41653c18b8e91 Mon Sep 17 00:00:00 2001
From: Jai Menon <87035087+jm424@users.noreply.github.com>
Date: Wed, 12 Nov 2025 11:29:56 -0500
Subject: [PATCH 28/50] Allow boolean flags for eval jobs
Currently flags are passed on as key-value pairs but that approach doesn't extend to boolean flags
---
shinka/launch/scheduler.py | 8 +++++++-
1 file changed, 7 insertions(+), 1 deletion(-)
diff --git a/shinka/launch/scheduler.py b/shinka/launch/scheduler.py
index 5782613ee..4e824c3ff 100644
--- a/shinka/launch/scheduler.py
+++ b/shinka/launch/scheduler.py
@@ -138,7 +138,13 @@ def _build_command(self, exec_fname_t: str, results_dir_t: str) -> List[str]:
]
if self.config.extra_cmd_args:
for k, v in self.config.extra_cmd_args.items():
- cmd.extend([f"--{k}", str(v)])
+ # Handle boolean flags
+ if isinstance(v, bool):
+ if v: # Only append flag if True
+ cmd.append(f"--{k}")
+ else:
+ # For non-boolean values, append both flag and value
+ cmd.extend([f"--{k}", str(v)])
return cmd
def run(
From 3251a701661d2eedf77e2473bba8c2a022295cf1 Mon Sep 17 00:00:00 2001
From: Jeremy Cochoy
Date: Mon, 17 Nov 2025 15:52:53 +0100
Subject: [PATCH 29/50] Add json support
---
shinka/core/runner.py | 2 ++
shinka/database/complexity.py | 2 +-
shinka/edit/apply_diff.py | 4 +++-
shinka/edit/apply_full.py | 2 ++
4 files changed, 8 insertions(+), 2 deletions(-)
diff --git a/shinka/core/runner.py b/shinka/core/runner.py
index f1b5e947d..be76994ed 100644
--- a/shinka/core/runner.py
+++ b/shinka/core/runner.py
@@ -238,6 +238,8 @@ def __init__(
self.lang_ext = "py"
elif self.evo_config.language == "rust":
self.lang_ext = "rs"
+ elif self.evo_config.language in ["json", "json5"]:
+ self.lang_ext = "json"
else:
msg = f"Language {self.evo_config.language} not supported"
raise ValueError(msg)
diff --git a/shinka/database/complexity.py b/shinka/database/complexity.py
index 933d7f4e6..714ebaae8 100644
--- a/shinka/database/complexity.py
+++ b/shinka/database/complexity.py
@@ -260,7 +260,7 @@ def analyze_code_metrics(code_string, language="python"):
return analyze_cpp_complexity(code_string)
# For C/C++/CUDA/Rust and other languages, use regex-based analysis
- elif language in ["cpp", "c", "cuda", "c++", "rust"]:
+ elif language in ["cpp", "c", "cuda", "c++", "rust", "json", "json5"]:
return analyze_cpp_complexity(code_string)
# For unknown languages, use simple line-based complexity
diff --git a/shinka/edit/apply_diff.py b/shinka/edit/apply_diff.py
index 4b5f29148..6465ffe96 100644
--- a/shinka/edit/apply_diff.py
+++ b/shinka/edit/apply_diff.py
@@ -698,7 +698,7 @@ def apply_diff_patch(
patch_str = _strip_trailing_whitespace(patch_str)
# Remove the EVOLVE-BLOCK START and EVOLVE-BLOCK END markers
- if language in ["cuda", "cpp", "rust"]:
+ if language in ["cuda", "cpp", "rust", "json", "json5"]:
patch_str = re.sub(r"// EVOLVE-BLOCK START\\n", "", patch_str)
patch_str = re.sub(r"// EVOLVE-BLOCK END\\n", "", patch_str)
elif language == "python":
@@ -732,6 +732,8 @@ def apply_diff_patch(
suffix = ".cu"
elif language == "rust":
suffix = ".rs"
+ elif language in ["json", "json5"]:
+ suffix = ".json"
else:
raise ValueError(f"Language {language} not supported")
diff --git a/shinka/edit/apply_full.py b/shinka/edit/apply_full.py
index 4cc4ddca4..5dd336547 100644
--- a/shinka/edit/apply_full.py
+++ b/shinka/edit/apply_full.py
@@ -264,6 +264,8 @@ def apply_full_patch(
suffix = ".cu"
elif language == "rust":
suffix = ".rs"
+ elif language in ["json", "json5"]:
+ suffix = ".json"
else:
raise ValueError(f"Language {language} not supported")
From ed8f1b4ab2093ab5f489c72f0c585625b7de1fee Mon Sep 17 00:00:00 2001
From: Jai Menon <87035087+jm424@users.noreply.github.com>
Date: Wed, 19 Nov 2025 16:43:15 -0500
Subject: [PATCH 30/50] llm: Add GPT-5.1 and Gemini 3 Pro models
---
shinka/llm/models/pricing.py | 9 +++++++++
1 file changed, 9 insertions(+)
diff --git a/shinka/llm/models/pricing.py b/shinka/llm/models/pricing.py
index a4595a99d..91e965c75 100644
--- a/shinka/llm/models/pricing.py
+++ b/shinka/llm/models/pricing.py
@@ -118,6 +118,10 @@
"input_price": 0.05 / M,
"output_price": 0.4 / M,
},
+ "gpt-5.1": {
+ "input_price": 1.25 / M,
+ "output_price": 10.0 / M,
+ },
}
@@ -145,6 +149,10 @@
"input_price": 0.1 / M,
"output_price": 0.4 / M,
},
+    "gemini-3-pro-preview": {
+ "input_price": 2.0 / M,
+ "output_price": 12.0 / M,
+ },
}
BEDROCK_MODELS = {
@@ -191,6 +199,7 @@
"gemini-2.5-pro",
"gemini-2.5-flash",
"gemini-2.5-flash-lite-preview-06-17",
+ "gemini-3-pro-preview",
]
REASONING_AZURE_MODELS = [
From ecf762bc6c6af3ac92920714b6287eb04d9aa2bb Mon Sep 17 00:00:00 2001
From: Robert Tjarko Lange
Date: Sat, 22 Nov 2025 17:13:08 +0100
Subject: [PATCH 31/50] Update README.md
---
README.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/README.md b/README.md
index 7a59f760e..4404c24d9 100644
--- a/README.md
+++ b/README.md
@@ -16,7 +16,7 @@
The framework supports **parallel evaluation of candidates** locally or on a Slurm cluster. It maintains an archive of successful solutions, enabling knowledge transfer between different evolutionary islands. `ShinkaEvolve` is particularly well-suited for scientific tasks where there is a verifier available and the goal is to optimize performance metrics while maintaining code correctness and readability.
-
+
## Documentation π
From c686d7fb97e620d83730270ccfbc8e4fc253c08a Mon Sep 17 00:00:00 2001
From: Robert Tjarko Lange
Date: Sat, 22 Nov 2025 17:21:35 +0100
Subject: [PATCH 32/50] Update getting_started.md
---
docs/getting_started.md | 2 ++
1 file changed, 2 insertions(+)
diff --git a/docs/getting_started.md b/docs/getting_started.md
index d40c16b59..03bc54c80 100644
--- a/docs/getting_started.md
+++ b/docs/getting_started.md
@@ -2,6 +2,8 @@
Shinka is a framework that combines Large Language Models (LLMs) with evolutionary algorithms to drive scientific discovery. This guide will help you get started with installing, configuring, and running your first evolutionary experiments.
+
+
## Table of Contents
1. [What is Shinka?](#what-is-shinka)
From bad5b37002b482e4771eb0c4fa49d6e31d4cc30e Mon Sep 17 00:00:00 2001
From: Robert Tjarko Lange
Date: Wed, 3 Dec 2025 10:58:48 +0100
Subject: [PATCH 33/50] Update apply_diff.py
---
shinka/edit/apply_diff.py | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/shinka/edit/apply_diff.py b/shinka/edit/apply_diff.py
index d33f58042..7d2161056 100644
--- a/shinka/edit/apply_diff.py
+++ b/shinka/edit/apply_diff.py
@@ -699,11 +699,11 @@ def apply_diff_patch(
# Remove the EVOLVE-BLOCK START and EVOLVE-BLOCK END markers
if language in ["cuda", "cpp", "rust", "swift", "json", "json5"]:
- patch_str = re.sub(r"// EVOLVE-BLOCK START\\n", "", patch_str)
- patch_str = re.sub(r"// EVOLVE-BLOCK END\\n", "", patch_str)
+ patch_str = re.sub(r"// EVOLVE-BLOCK-START\\n", "", patch_str)
+ patch_str = re.sub(r"// EVOLVE-BLOCK-END\\n", "", patch_str)
elif language == "python":
- patch_str = re.sub(r"# EVOLVE-BLOCK START\\n", "", patch_str)
- patch_str = re.sub(r"# EVOLVE-BLOCK END\\n", "", patch_str)
+ patch_str = re.sub(r"# EVOLVE-BLOCK-START\\n", "", patch_str)
+ patch_str = re.sub(r"# EVOLVE-BLOCK-END\\n", "", patch_str)
else:
raise ValueError(f"Language {language} not supported")
From e72fb60a7126b6450f7f18074ec772b597e412eb Mon Sep 17 00:00:00 2001
From: Racemuis
Date: Thu, 18 Dec 2025 12:18:57 +0100
Subject: [PATCH 34/50] [example] add autoformalization example from natural
language to lean
---
configs/task/autoformalization.yaml | 30 +++++++++++++++++++++++++
examples/autoformalization/evaluate.py | 0
examples/autoformalization/initial.lean | 0
3 files changed, 30 insertions(+)
create mode 100644 configs/task/autoformalization.yaml
create mode 100644 examples/autoformalization/evaluate.py
create mode 100644 examples/autoformalization/initial.lean
diff --git a/configs/task/autoformalization.yaml b/configs/task/autoformalization.yaml
new file mode 100644
index 000000000..d454cbe30
--- /dev/null
+++ b/configs/task/autoformalization.yaml
@@ -0,0 +1,30 @@
+evaluate_function:
+ _target_: examples.math_autoformalization.evaluate.main
+ program_path: ???
+ results_dir: ???
+
+distributed_job_config:
+ _target_: shinka.launch.SlurmCondaJobConfig
+ modules:
+ - "cuda/12.4"
+ - "cudnn/8.9.7"
+ - "hpcx/2.20"
+ eval_program_path: "shinka/eval_hydra.py"
+ conda_env: "shinka"
+ time: "01:00:00"
+ cpus: 16
+ gpus: 1
+ mem: "8G"
+
+evo_config:
+ task_sys_msg: >
+ You are an expert machine learning engineer tasked with designing a new agentic system capable of solving math problems.
+
+ You will be given a program that implements an agent scaffold. Your goal is to design a new agentic system by suggesting improvements to the scaffold.
+
+ You will be given a set of performance metrics for the program. Your goal is to maximize the `combined_score` of the program.
+ language: "python"
+ init_program_path: "examples/agent_design/initial.py"
+ job_type: "slurm_conda"
+
+exp_name: "shinka_agent_design"
diff --git a/examples/autoformalization/evaluate.py b/examples/autoformalization/evaluate.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/examples/autoformalization/initial.lean b/examples/autoformalization/initial.lean
new file mode 100644
index 000000000..e69de29bb
From 4cd2dc497e11c7bb37c3ce00af0913ebcd705015 Mon Sep 17 00:00:00 2001
From: Racemuis
Date: Thu, 18 Dec 2025 12:23:25 +0100
Subject: [PATCH 35/50] [feat] add lean 4 evaluation
---
examples/autoformalization/evaluate.py | 172 +++++++++++++++++++++++
shinka/core/runner.py | 7 +-
shinka/core/wrap_eval.py | 19 ++-
shinka/eval_hydra.py | 9 ++
shinka/launch/scheduler.py | 22 ++-
shinka/utils/__init__.py | 2 +
shinka/utils/utils_lean.py | 187 +++++++++++++++++++++++++
7 files changed, 404 insertions(+), 14 deletions(-)
create mode 100644 shinka/utils/utils_lean.py
diff --git a/examples/autoformalization/evaluate.py b/examples/autoformalization/evaluate.py
index e69de29bb..bd314e3d7 100644
--- a/examples/autoformalization/evaluate.py
+++ b/examples/autoformalization/evaluate.py
@@ -0,0 +1,172 @@
+import os
+import logging
+import argparse
+from typing import Optional, List, Tuple, Dict, Any
+
+import numpy as np
+
+from shinka.llm.client import get_client_llm
+from shinka.utils import validate_lean
+from shinka.core import run_shinka_eval
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
+def validate_proof(run_output: Tuple[str, Optional[str]]) -> Tuple[bool, Optional[str]]:
+ """
+ Validates the proof generation results based on the output of ``generate_proof``.
+
+ Args:
+        run_output (Tuple[str, Optional[str]]): the run output, containing
+            - file_path (str): the path to a Lean 4 file containing an incomplete proof (may include ``sorry``s)
+            - proof_text (Optional[str]): the output of ``generate_proof``.
+
+ Returns:
+ (is_valid: bool, error_message: Optional[str])
+ """
+ file_path, proof_text = run_output
+ return validate_lean(proof_text, allow_sorry=False, timeout=60, verbose=False)
+
+
+def aggregate_hypothesis_generation_metrics(results: Tuple[str, str], results_dir: str) -> Dict[str, Any]:
+ """
+ Aggregates metrics for the generation of hypotheses. Assumes num_runs=1. Saves extra.npz with detailed generation
+ information.
+
+ Args:
+ results (Tuple[str, str]): the validated output of ``generate_proof``.
+ results_dir (str): the path to the directory where to save the results.
+
+ Returns:
+ dict: a dictionary of the results.
+
+ """
+ print("Aggregation results:", results)
+ if not results:
+ return {"combined_score": 0.0, "error": "No results to aggregate"}
+
+ path, proof = results
+
+ public_metrics = {
+ "proof_length": len(proof),
+ }
+
+ private_metrics = {}
+ metrics = {
+ "combined_score": len(proof),
+ "public": public_metrics,
+ "private": private_metrics,
+ }
+
+ extra_file = os.path.join(results_dir, "extra.npz")
+ try:
+ np.savez(extra_file, proof_length=len(results))
+ print(f"Detailed packing data saved to {extra_file}")
+ except Exception as e:
+ print(f"Error saving extra.npz: {e}")
+ metrics["extra_npz_save_error"] = str(e)
+ return metrics
+
+
+def get_proof_generation_kwargs(run_index: int) -> Dict[str, Any]:
+ """
+ Provides keyword arguments for generating proofs.
+
+ Args:
+ run_index (int): the index of the run, added for compatibility with ShinkaEvolve and not used.
+
+ Returns:
+ dict: a dictionary of hypothesis generation parameters.
+ """
+ del run_index # Unused
+ return {
+ "sampling_params": {
+ "temperature": 0.2,
+ "max_tokens": 1024,
+ "n": 1,
+ "top_p": 0.95,
+ },
+ "timeout": 60,
+ }
+
+
+def main(program_path: str, results_dir: str, prover_model: str) -> None:
+ """
+ Run the hypothesis evaluation using shinka.eval
+
+ Args:
+ program_path (str): Path to program to evaluate.
+ results_dir (str): Dir to save results (metrics.json, correct.json, extra.npz)
+ prover_model (str): LLM agent used to construct LEAN proofs based on the initial header and formalization.
+
+ Returns:
+ None
+ """
+
+ print(f"Evaluating program: {program_path} with {prover_model}")
+ print(f"Saving results to: {results_dir}")
+ os.makedirs(results_dir, exist_ok=True)
+
+ client, prover_model = get_client_llm(prover_model, False,)
+
+ # Helper functions
+ def _aggregator_with_context(
+ r: List[Tuple[str, str]],
+ ) -> Dict[str, Any]:
+ """A curried function to pass results_dir to the aggregator, extracts the tuple from the list containing 1 element"""
+ return aggregate_hypothesis_generation_metrics(r[0], results_dir)
+
+ def _kwargs_with_context(run_index: int) -> dict:
+ """A curried function to pass the proof client to the proof solver"""
+ return {"model": prover_model, "proof_client": client} | get_proof_generation_kwargs(run_index=run_index)
+
+ num_experiment_runs = 1
+
+ metrics, correct, error_msg = run_shinka_eval(
+ program_path=program_path,
+ results_dir=results_dir,
+ experiment_fn_name="run_autoformalization",
+ num_runs=num_experiment_runs,
+ get_experiment_kwargs=_kwargs_with_context,
+ validate_fn=validate_proof,
+ aggregate_metrics_fn=_aggregator_with_context,
+ )
+
+ if correct:
+ print("Evaluation and Validation completed successfully.")
+ else:
+ print(f"Evaluation or Validation failed: {error_msg}")
+
+ print("Metrics:")
+ for key, value in metrics.items():
+ if isinstance(value, str) and len(value) > 100:
+ print(f" {key}: ")
+ else:
+ print(f" {key}: {value}")
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(description="Hypothesis evaluator using shinka.eval")
+ parser.add_argument(
+ "--program_path",
+ type=str,
+ default="initial.lean",
+ help="Path to program to evaluate",
+ )
+ parser.add_argument(
+ "--results_dir",
+ type=str,
+ default="results",
+ help="Dir to save results (metrics.json, correct.json, extra.npz)",
+ )
+
+ parser.add_argument(
+ "--prover_model",
+ type=str,
+ default="gpt-5-nano", # or an actual prover like "deepseek-ai/DeepSeek-Prover-V2-7B" (requires local LLM support)
+ help="LLM agent used to construct LEAN proofs based on the initial header and formalization.",
+ )
+
+ parsed_args = parser.parse_args()
+ main(parsed_args.program_path, parsed_args.results_dir, parsed_args.prover_model)
diff --git a/shinka/core/runner.py b/shinka/core/runner.py
index a0dd5f81d..a178d7436 100644
--- a/shinka/core/runner.py
+++ b/shinka/core/runner.py
@@ -55,6 +55,7 @@ class EvolutionConfig:
meta_llm_kwargs: dict = field(default_factory=lambda: {})
meta_max_recommendations: int = 5
embedding_model: Optional[str] = None
+ prover_model: Optional[str] = None
init_program_path: Optional[str] = "initial.py"
results_dir: Optional[str] = None
max_novelty_attempts: int = 3
@@ -242,6 +243,8 @@ def __init__(
self.lang_ext = "swift"
elif self.evo_config.language in ["json", "json5"]:
self.lang_ext = "json"
+ elif self.evo_config.language == "lean":
+ self.lang_ext = "lean"
else:
msg = f"Language {self.evo_config.language} not supported"
raise ValueError(msg)
@@ -492,7 +495,7 @@ def _run_generation_0(self):
logger.info(f"Initial program generated and saved to {exec_fname}")
# Run the evaluation synchronously
- results, rtime = self.scheduler.run(exec_fname, results_dir)
+ results, rtime = self.scheduler.run(exec_fname, self.evo_config.prover_model, results_dir,)
code_embedding, e_cost = self.get_code_embedding(exec_fname)
@@ -737,7 +740,7 @@ def _submit_new_job(self):
meta_patch_data["novelty_explanation"] = novelty_explanation
# Submit the job asynchronously
- job_id = self.scheduler.submit_async(exec_fname, results_dir)
+ job_id = self.scheduler.submit_async(exec_fname, results_dir, self.evo_config.prover_model)
# Add to running jobs queue
running_job = RunningJob(
diff --git a/shinka/core/wrap_eval.py b/shinka/core/wrap_eval.py
index bf2cf92eb..cf44aca85 100644
--- a/shinka/core/wrap_eval.py
+++ b/shinka/core/wrap_eval.py
@@ -6,6 +6,8 @@
import pickle
from typing import Callable, Any, Dict, List, Tuple, Optional
+from shinka.utils.utils_lean import generate_proof
+
DEFAULT_METRICS_ON_ERROR = {
"combined_score": 0.0,
"execution_time_mean": 0.0,
@@ -100,13 +102,16 @@ def run_shinka_eval(
execution_times: List[float] = []
try:
- module = load_program(program_path)
- if not hasattr(module, experiment_fn_name):
- raise AttributeError(
- f"Experiment function '{experiment_fn_name}' not found in "
- f"{program_path}"
- )
- experiment_fn = getattr(module, experiment_fn_name)
+ if program_path.endswith(".lean"):
+ experiment_fn = generate_proof
+ else:
+ module = load_program(program_path)
+ if not hasattr(module, experiment_fn_name):
+ raise AttributeError(
+ f"Experiment function '{experiment_fn_name}' not found in "
+ f"{program_path}"
+ )
+ experiment_fn = getattr(module, experiment_fn_name)
for i in range(num_runs):
kwargs: Dict[str, Any] = {}
diff --git a/shinka/eval_hydra.py b/shinka/eval_hydra.py
index 82b3c0d4b..c09eea900 100644
--- a/shinka/eval_hydra.py
+++ b/shinka/eval_hydra.py
@@ -54,11 +54,20 @@ def chdir_to_target(cfg):
default="results",
help="Directory to save results and logs",
)
+
+ parser.add_argument(
+ "--prover_model",
+ type=str,
+ default="gpt-5-nano", # or an actual prover like "deepseek-ai/DeepSeek-Prover-V2-7B" (requires local LLM support)
+ help="LLM agent used to construct LEAN proofs based on the initial header and formalization.",
+ )
+
args = parser.parse_args()
Path(args.results_dir).mkdir(parents=True, exist_ok=True)
cfg = load_hydra_config(args.results_dir, max_parent_depth=2)
cfg.evaluate_function.program_path = os.path.abspath(args.program_path)
cfg.evaluate_function.results_dir = os.path.abspath(args.results_dir)
+ cfg.evaluate_function.prover_model = args.prover_model
print(os.getcwd())
print("Launching evaluation of function:")
print(omegaconf.OmegaConf.to_yaml(cfg.evaluate_function))
diff --git a/shinka/launch/scheduler.py b/shinka/launch/scheduler.py
index 4e824c3ff..f9b574902 100644
--- a/shinka/launch/scheduler.py
+++ b/shinka/launch/scheduler.py
@@ -95,7 +95,7 @@ def __init__(
f"Must be 'local', 'slurm_docker', or 'slurm_conda'"
)
- def _build_command(self, exec_fname_t: str, results_dir_t: str) -> List[str]:
+ def _build_command(self, exec_fname_t: str, results_dir_t: str, prover_model: Optional[str]) -> List[str]:
# Docker requires workspace to be mounted
if self.job_type == "slurm_docker":
assert isinstance(self.config, SlurmDockerJobConfig)
@@ -145,13 +145,25 @@ def _build_command(self, exec_fname_t: str, results_dir_t: str) -> List[str]:
else:
# For non-boolean values, append both flag and value
cmd.extend([f"--{k}", str(v)])
+
+ if exec_fname_t.endswith(".lean"):
+ if prover_model:
+ cmd.extend([f"--prover_model", prover_model])
+ else:
+ logger.warning(
+ f"'prover_model' has not been specified. "
+ f"Using default 'gpt-5-nano'."
+ f"You can specify the proving model in configs/evolution/*.yaml"
+ )
+ cmd.extend([f"--prover_model", "gpt-5-nano"])
+
return cmd
def run(
- self, exec_fname_t: str, results_dir_t: str
+ self, exec_fname_t: str, prover_model: Optional[str], results_dir_t: str
) -> Tuple[Dict[str, Any], float]:
job_id: Union[str, ProcessWithLogging]
- cmd = self._build_command(exec_fname_t, results_dir_t)
+ cmd = self._build_command(exec_fname_t, results_dir_t, prover_model=prover_model)
start_time = time.time()
if self.job_type == "local":
@@ -204,10 +216,10 @@ def run(
return results, rtime
def submit_async(
- self, exec_fname_t: str, results_dir_t: str
+ self, exec_fname_t: str, results_dir_t: str, prover_model: Optional[str]
) -> Union[str, ProcessWithLogging]:
"""Submit a job asynchronously and return the job ID or process."""
- cmd = self._build_command(exec_fname_t, results_dir_t)
+ cmd = self._build_command(exec_fname_t, results_dir_t, prover_model=prover_model)
if self.job_type == "local":
assert isinstance(self.config, LocalJobConfig)
return submit_local(results_dir_t, cmd, verbose=self.verbose)
diff --git a/shinka/utils/__init__.py b/shinka/utils/__init__.py
index a308c0acd..8e237b9da 100644
--- a/shinka/utils/__init__.py
+++ b/shinka/utils/__init__.py
@@ -1,6 +1,7 @@
from .load_df import load_programs_to_df, get_path_to_best_node, store_best_path
from .general import parse_time_to_seconds, load_results
from .utils_hydra import build_cfgs_from_python, add_evolve_markers, chdir_to_function_dir, wrap_object, load_hydra_config
+from .utils_lean import validate_lean
__all__ = [
"load_programs_to_df",
@@ -13,4 +14,5 @@
"chdir_to_function_dir",
"wrap_object",
"load_hydra_config",
+ "validate_lean",
]
diff --git a/shinka/utils/utils_lean.py b/shinka/utils/utils_lean.py
new file mode 100644
index 000000000..ab6aa06bd
--- /dev/null
+++ b/shinka/utils/utils_lean.py
@@ -0,0 +1,187 @@
+import json
+import logging
+from pathlib import Path
+from typing import Tuple, Optional, List
+
+from openai import OpenAI
+
+from lean_interact import LeanREPLConfig, AutoLeanServer, TempRequireProject
+from lean_interact.interface import Command, FileCommand, LeanError
+from lean_interact.utils import remove_lean_comments
+
+logger = logging.getLogger(__name__)
+
+
+def generate_prompt(file_path: str) -> str:
+ """
+ Generate a prompt for an LLM-based prover agent for the completion of the proof provided in ``file_path``.
+
+ Args:
+ file_path (str): the path to a Lean 4 file.
+
+ Returns:
+ str: a prompt for an LLM-based prover agent based on the contents of ``file_path``.
+
+ Notes:
+ - This function is based on the example provided by the DeepSeek Development team.
+ https://huggingface.co/deepseek-ai/DeepSeek-Prover-V2-7B
+ """
+ formal_statement = Path(file_path).read_text(encoding="UTF-8")
+
+ prompt = """
+ Complete the following Lean 4 code:
+
+ ```lean4
+ {}
+ ```
+
+ Please *only* output the submitted, fully proven lean program. Do not add any reasoning or explanation to your answer. You are not allowed to include "sorrys" in your result. Make sure to include all imports in your final answer.
+ """.strip()
+
+ return prompt.format(formal_statement)
+
+
+def generate_proof(
+ file_path: str, model: str, proof_client: OpenAI, sampling_params: dict, timeout: int
+) -> Tuple[str, Optional[str]]:
+ """
+ Complete the proof provided at ``file_path`` using ``model``. The ``model`` should be hosted via the vLLM-based
+ OpenAI client.
+
+ Args:
+ file_path (str): the path to a Lean 4 file containing an incomplete proof (may include ``sorry``s)
+        model (str): the name of the LLM to use. Recommended are ``DeepSeek-Prover-V2-7B``,
+            ``Goedel-LM/Goedel-Prover-V2-8B`` or ``Goedel-LM/Goedel-Prover-V2-32B``.
+ proof_client (OpenAI): the inference API.
+ sampling_params (dict): a dictionary of the sampling params that are passed as parameters to the API request.
+ timeout (int): The timeout for the request in seconds.
+
+    Returns:
+        Tuple[str, Optional[str]]: the input ``file_path`` and the generated proof (``None`` if generation failed).
+
+ """
+ try:
+ prompt = generate_prompt(file_path)
+ if not prompt:
+ return file_path, None
+ response = proof_client.chat.completions.create(
+ model=model,
+ messages=[
+ {"role": "user", "content": prompt},
+ ],
+ **sampling_params,
+ timeout=timeout,
+ )
+ proof_text = response.choices[0].message.content
+ proof_text = proof_text.split("```lean4")[1].replace("`", "")
+ proof_text = validate_imports(proof_text=proof_text)
+
+ logger.info(f"The generated proof text is:\n{proof_text}")
+
+ # Reformat proofs by removing comments, as they can sometimes cause issues when validating the proofs
+ return file_path, remove_lean_comments(proof_text)
+
+ except Exception as e:
+ logger.error(f"Error generating proof for {file_path}: {e}")
+ return file_path, None
+
+
+def validate_imports(proof_text: str, required_imports: Optional[List] = None) -> str:
+ """
+ Check whether the imports are present in the proof header. Add missing imports if required.
+
+ Args:
+ proof_text (str): the proof text.
+ required_imports (Optional[List]): a list of required imports. Default is ["import Mathlib.Data.Real.Basic",
+ "import Mathlib.Tactic"]
+
+ Returns:
+ str: the proof text including the required imports.
+
+ """
+ if not required_imports: # Default argument
+ required_imports = ["import Mathlib.Data.Real.Basic", "import Mathlib.Tactic"]
+
+ for statement in required_imports:
+ if statement not in proof_text:
+ proof_text = statement + "\n" + proof_text
+ return proof_text
+
+
+def validate_lean(
+ path_or_str: str, allow_sorry: bool, timeout: int, verbose: bool, lean_version: str = "v4.24.0"
+) -> Tuple[bool, str]:
+ """
+ Verify the validity of the Lean program found at ``path`` via LeanInteract. The function builds a Lean
+ Read-Eval-Print-Loop (REPL) for solving Lean programs. The function returns ``True`` if the lean program has run
+ successfully and when the Lean code is considered valid.
+
+ Args:
+ path_or_str (str): The path of the file to be operated on by the REPL.
+ allow_sorry (bool): True to allow for partially complete proofs that include ``sorry`` statements.
+ timeout (int): The timeout for the request in seconds.
+ verbose (bool): Whether to print additional information when downloading and building the Lean REPL,
+ and running the Lean REPL request using ``run``.
+ lean_version (str): The Lean version used. Default is ``"v4.24.0"``, which is the latest version around October
+ 2025.
+
+ Returns:
+ bool: ``True`` if the lean program associated with ``path`` is considered valid and has run successfully.
+ str: the outcome or exception associated with the validation.
+
+ Notes:
+ - Store the lean output as a json, storing the header and formalization code only, to run as a ``Command``.
+ - `Formalization_code` must end by `:=`.
+ """
+ project = TempRequireProject(lean_version=lean_version, require="mathlib")
+ config = LeanREPLConfig(project=project)
+
+ try:
+ server = AutoLeanServer(config)
+ command = FileCommand(path=path_or_str) if path_or_str.endswith(".lean") else Command(cmd=path_or_str)
+ server_output = server.run(command, timeout=timeout, verbose=verbose, add_to_session_cache=False)
+
+ logger.info(server_output.messages)
+
+ if not server_output.lean_code_is_valid(allow_sorry=allow_sorry):
+ return False, f"The provided lean file {path_or_str} is invalid or contains 'sorry'."
+ elif isinstance(server_output, LeanError):
+ return False, f"{path_or_str}:" + server_output.message
+ else:
+ return (
+ True,
+ f"Run {path_or_str} terminated successfully: {server_output}",
+ ) # The content may still contain errors
+
+ except (TimeoutError, ConnectionAbortedError, json.JSONDecodeError) as e:
+ logger.error(f"Error while checking the lean file {path_or_str}: {e}")
+ return False, e
+
+async def async_validate_lean(
+ path_or_str: str, allow_sorry: bool, timeout: int, verbose: bool, lean_version: str = "v4.24.0"
+) -> Tuple[bool, str]:
+ """
+ Verify the validity of the Lean program found at ``path`` via LeanInteract. The function builds a Lean
+ Read-Eval-Print-Loop (REPL) for solving Lean programs. The function returns ``True`` if the lean program has run
+ successfully and when the Lean code is considered valid.
+
+ Args:
+ path_or_str (str): The path of the file to be operated on by the REPL.
+ allow_sorry (bool): True to allow for partially complete proofs that include ``sorry`` statements.
+ timeout (int): The timeout for the request in seconds.
+ verbose (bool): Whether to print additional information when downloading and building the Lean REPL,
+ and running the Lean REPL request using ``run``.
+ lean_version (str): The Lean version used. Default is ``"v4.24.0"``, which is the latest version around October
+ 2025.
+
+ Returns:
+ bool: ``True`` if the lean program associated with ``path`` is considered valid and has run successfully.
+ str: the outcome or exception associated with the validation.
+
+ Notes:
+ - Store the lean output as a json, storing the header and formalization code only, to run as a ``Command``.
+ - `Formalization_code` must end by `:=`.
+ """
+ return validate_lean(
+ path_or_str, allow_sorry=allow_sorry, timeout=timeout, verbose=verbose, lean_version=lean_version
+ )
\ No newline at end of file
From 7333b8bfd0ba65aafdeea8a7b7146adef48b3ded Mon Sep 17 00:00:00 2001
From: Racemuis
Date: Thu, 18 Dec 2025 12:23:41 +0100
Subject: [PATCH 36/50] [feat] add lean 4 edit support
---
shinka/edit/apply_diff.py | 9 ++++++++-
shinka/edit/apply_full.py | 2 ++
shinka/edit/async_apply.py | 19 +++++++++++++++++++
3 files changed, 29 insertions(+), 1 deletion(-)
diff --git a/shinka/edit/apply_diff.py b/shinka/edit/apply_diff.py
index 7d2161056..d7464d6b6 100644
--- a/shinka/edit/apply_diff.py
+++ b/shinka/edit/apply_diff.py
@@ -122,9 +122,11 @@ def _clean_evolve_markers(text: str) -> str:
r"^\s*#\s*EVOLVE-BLOCK-START\s*$", # Python style
r"^\s*//\s*EVOLVE-BLOCK-START\s*$", # C/C++/CUDA style
r"^\s*EVOLVE-BLOCK-START\s*$", # Plain text
+ r"^\s*--\s*EVOLVE-BLOCK-START\s*$", # LEAN 4
r"^\s*#\s*EVOLVE-BLOCK-END\s*$", # Python style
r"^\s*//\s*EVOLVE-BLOCK-END\s*$", # C/C++/CUDA
r"^\s*EVOLVE-BLOCK-END\s*$", # Plain text
+ r"^\s*--\s*EVOLVE-BLOCK-END\s*$", # LEAN 4
]
cleaned_text = text
@@ -553,7 +555,7 @@ def _create_no_evolve_block_error(original_text: str, operation: str) -> str:
"",
"Suggestions:",
"1. Add EVOLVE-BLOCK-START and EVOLVE-BLOCK-END markers around editable code",
- "2. Ensure the markers are properly formatted (with # for Python, // for C/C++)",
+ "2. Ensure the markers are properly formatted (with # for Python, // for C/C++ and -- for LEAN)",
"3. Check that there's at least one EVOLVE-BLOCK region in the file",
]
)
@@ -704,6 +706,9 @@ def apply_diff_patch(
elif language == "python":
patch_str = re.sub(r"# EVOLVE-BLOCK-START\\n", "", patch_str)
patch_str = re.sub(r"# EVOLVE-BLOCK-END\\n", "", patch_str)
+ elif language.lower() == "lean":
+ patch_str = re.sub(r"-- EVOLVE-BLOCK START\\n", "", patch_str)
+ patch_str = re.sub(r"-- EVOLVE-BLOCK END\\n", "", patch_str)
else:
raise ValueError(f"Language {language} not supported")
@@ -736,6 +741,8 @@ def apply_diff_patch(
suffix = ".swift"
elif language in ["json", "json5"]:
suffix = ".json"
+ elif language.lower() == "lean": # Run full lean files with the `LeanInteract` `FileCommand`.
+ suffix = ".lean"
else:
raise ValueError(f"Language {language} not supported")
diff --git a/shinka/edit/apply_full.py b/shinka/edit/apply_full.py
index ac6288128..e14ce7d38 100644
--- a/shinka/edit/apply_full.py
+++ b/shinka/edit/apply_full.py
@@ -268,6 +268,8 @@ def apply_full_patch(
suffix = ".swift"
elif language in ["json", "json5"]:
suffix = ".json"
+ elif language == "lean":
+ suffix = ".lean"
else:
raise ValueError(f"Language {language} not supported")
diff --git a/shinka/edit/async_apply.py b/shinka/edit/async_apply.py
index e4c21202f..f1460027f 100644
--- a/shinka/edit/async_apply.py
+++ b/shinka/edit/async_apply.py
@@ -10,6 +10,8 @@
from .apply_diff import apply_diff_patch
from .apply_full import apply_full_patch
+from shinka.utils.utils_lean import async_validate_lean
+
try:
import aiofiles
except ImportError:
@@ -192,6 +194,23 @@ async def validate_code_async(
else:
error_msg = stderr.decode() if stderr else "Unknown compilation error"
return False, error_msg
+ elif language == "lean": # Compile LEAN 4 using LeanInteract
+ try:
+ is_valid, msg = await asyncio.wait_for(
+ async_validate_lean(
+ code_path,
+ allow_sorry=False,
+ timeout=timeout,
+ verbose=True,
+ ),
+ timeout=timeout,
+ )
+ except asyncio.TimeoutError:
+ return False, f"Validation timeout after {timeout}s"
+ if is_valid:
+ return True, None
+ else:
+ return False, msg
else:
# For other languages, just check if file exists and is readable
try:
From 7a1dc9d314ed51210799f406243cc49d97eeb120 Mon Sep 17 00:00:00 2001
From: Racemuis
Date: Thu, 18 Dec 2025 12:24:32 +0100
Subject: [PATCH 37/50] [example] add an autoformalization example based on group
theory
---
configs/task/autoformalization.yaml | 35 +++++++++++++++++++------
examples/autoformalization/initial.lean | 14 ++++++++++
2 files changed, 41 insertions(+), 8 deletions(-)
diff --git a/configs/task/autoformalization.yaml b/configs/task/autoformalization.yaml
index d454cbe30..56881fbf9 100644
--- a/configs/task/autoformalization.yaml
+++ b/configs/task/autoformalization.yaml
@@ -1,5 +1,5 @@
evaluate_function:
- _target_: examples.math_autoformalization.evaluate.main
+ _target_: examples.autoformalization.evaluate.main
program_path: ???
results_dir: ???
@@ -17,14 +17,33 @@ distributed_job_config:
mem: "8G"
evo_config:
- task_sys_msg: >
- You are an expert machine learning engineer tasked with designing a new agentic system capable of solving math problems.
+ task_sys_msg: |
+ You are an expert mathematician with a specialization in group theory. You want to prove the following statement:
+
+ "Let $H$ be the subgroup generated by two elements $a, b$ of a group $G$. Prove that if $a b=b a$, then $H$ is an abelian group."
+
+ Your task is to identify the minimal set of variables and assumptions needed to formalize the proof in Lean 4. You do not have to provide the proof itself. Therefore, your output is expected to look like:
+
+ ```lean
+ -- Variables
+ theorem abelian_group ()
+ -- Assumptions
+
+ :
+ -- Conjecture
+ ()
+ :=
+ ```
+
+ Key components to add:
+ 1. Introduce all variables you need to complete the proof.
+ 2. Write all assumptions needed to complete the proof.
+ 3. Think about a conjecture that reflects the scientific concept you want to prove.
- You will be given a program that implements an agent scaffold. Your goal is to design a new agentic system by suggesting improvements to the scaffold.
+ Make sure your output is valid Lean 4. Do not leave the conjecture empty `()`. Do **not** finish the proof, you can leave the program up to the `:=` sign.
- You will be given a set of performance metrics for the program. Your goal is to maximize the `combined_score` of the program.
- language: "python"
- init_program_path: "examples/agent_design/initial.py"
+ language: "lean"
+ init_program_path: "examples/autoformalization/initial.lean"
job_type: "slurm_conda"
-exp_name: "shinka_agent_design"
+exp_name: "shinka_autoformalization"
diff --git a/examples/autoformalization/initial.lean b/examples/autoformalization/initial.lean
index e69de29bb..19b44103c 100644
--- a/examples/autoformalization/initial.lean
+++ b/examples/autoformalization/initial.lean
@@ -0,0 +1,14 @@
+import data.real.basic
+import data.real.nnreal
+import Mathlib.Tactic
+
+-- EVOLVE-BLOCK-START
+theorem abelian_group ()
+-- Assumptions
+()
+
+:
+-- Conjecture
+()
+-- EVOLVE-BLOCK-END
+:=
\ No newline at end of file
From 91924ee9d7a92a8b7e23706ea9b4a4aafe29f19d Mon Sep 17 00:00:00 2001
From: ifsheldon
Date: Fri, 26 Dec 2025 22:30:37 +0800
Subject: [PATCH 38/50] update gemini embedding price
---
shinka/llm/embedding.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/shinka/llm/embedding.py b/shinka/llm/embedding.py
index 4082ad58b..ba751b5f3 100644
--- a/shinka/llm/embedding.py
+++ b/shinka/llm/embedding.py
@@ -34,7 +34,7 @@
# Gemini embedding costs (approximate - check current pricing)
GEMINI_EMBEDDING_COSTS = {
"gemini-embedding-exp-03-07": 0.0 / M, # Experimental model, often free
- "gemini-embedding-001": 0.0 / M, # Check current pricing
+ "gemini-embedding-001": 0.15 / M, # Check current pricing
}
def get_client_model(model_name: str) -> tuple[Union[openai.OpenAI, str], str]:
From 580cefd8ec8f9308f9bfd0bc4f66190fae7a0087 Mon Sep 17 00:00:00 2001
From: ifsheldon
Date: Fri, 26 Dec 2025 21:29:59 +0800
Subject: [PATCH 39/50] add gemini-3-flash-preview
---
shinka/llm/models/pricing.py | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/shinka/llm/models/pricing.py b/shinka/llm/models/pricing.py
index 91e965c75..8c13a5dbe 100644
--- a/shinka/llm/models/pricing.py
+++ b/shinka/llm/models/pricing.py
@@ -153,6 +153,10 @@
"input_price": 2.0 / M,
"output_price": 12.0 / M,
},
+ "gemini-3-flash-preview" : {
+ "input_price": 0.5 / M,
+ "output_price": 3.0 / M,
+ },
}
BEDROCK_MODELS = {
@@ -200,6 +204,7 @@
"gemini-2.5-flash",
"gemini-2.5-flash-lite-preview-06-17",
"gemini-3-pro-preview",
+ "gemini-3-flash-preview",
]
REASONING_AZURE_MODELS = [
From a51026263790c4a9745fc2ea63f7d6001eae3f65 Mon Sep 17 00:00:00 2001
From: ifsheldon
Date: Fri, 26 Dec 2025 23:12:01 +0800
Subject: [PATCH 40/50] add GPT 5.2
---
shinka/llm/models/pricing.py | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/shinka/llm/models/pricing.py b/shinka/llm/models/pricing.py
index 91e965c75..574c6d9fe 100644
--- a/shinka/llm/models/pricing.py
+++ b/shinka/llm/models/pricing.py
@@ -122,6 +122,10 @@
"input_price": 1.25 / M,
"output_price": 10.0 / M,
},
+ "gpt-5.2": {
+ "input_price": 1.75 / M,
+ "output_price": 14.0 / M,
+ },
}
@@ -183,6 +187,8 @@
"gpt-5",
"gpt-5-mini",
"gpt-5-nano",
+ "gpt-5.1",
+ "gpt-5.2",
]
REASONING_CLAUDE_MODELS = [
From c5ef8dc77203aa279e826f9108bd50cfff2b73c9 Mon Sep 17 00:00:00 2001
From: Racemuis
Date: Mon, 5 Jan 2026 10:40:28 +0100
Subject: [PATCH 41/50] [feat] remove specific sampling params to increase
compatibility with various LLM families
---
examples/autoformalization/evaluate.py | 9 +++------
1 file changed, 3 insertions(+), 6 deletions(-)
diff --git a/examples/autoformalization/evaluate.py b/examples/autoformalization/evaluate.py
index bd314e3d7..707aef2b3 100644
--- a/examples/autoformalization/evaluate.py
+++ b/examples/autoformalization/evaluate.py
@@ -71,7 +71,8 @@ def aggregate_hypothesis_generation_metrics(results: Tuple[str, str], results_di
def get_proof_generation_kwargs(run_index: int) -> Dict[str, Any]:
"""
- Provides keyword arguments for generating proofs.
+ Provides keyword arguments for generating proofs. Insert your sampling parameters here. The timeout is provided
+ in seconds.
Args:
run_index (int): the index of the run, added for compatibility with ShinkaEvolve and not used.
@@ -82,12 +83,8 @@ def get_proof_generation_kwargs(run_index: int) -> Dict[str, Any]:
del run_index # Unused
return {
"sampling_params": {
- "temperature": 0.2,
- "max_tokens": 1024,
- "n": 1,
- "top_p": 0.95,
},
- "timeout": 60,
+ "timeout": 180,
}
From e9e95458be553af95b2d2112cc8358f70328295a Mon Sep 17 00:00:00 2001
From: Racemuis
Date: Mon, 5 Jan 2026 10:42:51 +0100
Subject: [PATCH 42/50] [core] add lean-interact dependency
---
pyproject.toml | 1 +
1 file changed, 1 insertion(+)
diff --git a/pyproject.toml b/pyproject.toml
index 5802a1522..afbaf5f3d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -46,6 +46,7 @@ dependencies = [
"markdown",
"aiofiles",
"google-generativeai",
+ "lean-interact",
]
[tool.setuptools]
From ba29305028e2e25d6ad16ed6451d0da11081140a Mon Sep 17 00:00:00 2001
From: Racemuis
Date: Mon, 5 Jan 2026 10:43:23 +0100
Subject: [PATCH 43/50] [fix] add file name to proof generation function
---
shinka/core/wrap_eval.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/shinka/core/wrap_eval.py b/shinka/core/wrap_eval.py
index cf44aca85..c9a3f6b06 100644
--- a/shinka/core/wrap_eval.py
+++ b/shinka/core/wrap_eval.py
@@ -116,7 +116,7 @@ def run_shinka_eval(
for i in range(num_runs):
kwargs: Dict[str, Any] = {}
if get_experiment_kwargs:
- kwargs = get_experiment_kwargs(i)
+ kwargs = get_experiment_kwargs(i) | {"file_path": program_path}
else:
kwargs = {"seed": i + 1}
From 18c57ced8710c5e79774f2987e753079946ecae7 Mon Sep 17 00:00:00 2001
From: Racemuis
Date: Mon, 5 Jan 2026 10:43:49 +0100
Subject: [PATCH 44/50] [fix] add try-catch clause for proof parsing
---
shinka/utils/utils_lean.py | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/shinka/utils/utils_lean.py b/shinka/utils/utils_lean.py
index ab6aa06bd..7366a7454 100644
--- a/shinka/utils/utils_lean.py
+++ b/shinka/utils/utils_lean.py
@@ -73,7 +73,11 @@ def generate_proof(
timeout=timeout,
)
proof_text = response.choices[0].message.content
- proof_text = proof_text.split("```lean4")[1].replace("`", "")
+
+ try:
+ proof_text = proof_text.split("```lean4")[1].replace("`", "")
+ except IndexError:
+ proof_text = proof_text.replace("`", "")
proof_text = validate_imports(proof_text=proof_text)
logger.info(f"The generated proof text is:\n{proof_text}")
From e250a512348001537fc42a7fa33ab2c395e2c712 Mon Sep 17 00:00:00 2001
From: Racemuis
Date: Mon, 5 Jan 2026 10:44:56 +0100
Subject: [PATCH 45/50] [core] add autoformalization example config
---
configs/variant/autoformalization_example.yaml | 8 ++++++++
1 file changed, 8 insertions(+)
create mode 100644 configs/variant/autoformalization_example.yaml
diff --git a/configs/variant/autoformalization_example.yaml b/configs/variant/autoformalization_example.yaml
new file mode 100644
index 000000000..2983ba959
--- /dev/null
+++ b/configs/variant/autoformalization_example.yaml
@@ -0,0 +1,8 @@
+defaults:
+ - override /database@_global_: island_small
+ - override /evolution@_global_: small_budget
+ - override /task@_global_: autoformalization
+ - override /cluster@_global_: local
+ - _self_
+
+variant_suffix: "_example"
\ No newline at end of file
From 329abc87844a9c40ae975fdd3a577f1e22a85feb Mon Sep 17 00:00:00 2001
From: Racemuis
Date: Mon, 5 Jan 2026 11:39:39 +0100
Subject: [PATCH 46/50] add Lean block markers to global definition
---
shinka/edit/apply_diff.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/shinka/edit/apply_diff.py b/shinka/edit/apply_diff.py
index d7464d6b6..f9dfb97c0 100644
--- a/shinka/edit/apply_diff.py
+++ b/shinka/edit/apply_diff.py
@@ -12,8 +12,8 @@
)
-EVOLVE_START = re.compile(r"(?:#|//|)?\s*EVOLVE-BLOCK-START")
-EVOLVE_END = re.compile(r"(?:#|//|)?\s*EVOLVE-BLOCK-END")
+EVOLVE_START = re.compile(r"(?:#|//|--|)?\s*EVOLVE-BLOCK-START")
+EVOLVE_END = re.compile(r"(?:#|//|--|)?\s*EVOLVE-BLOCK-END")
def _mutable_ranges(text: str) -> list[tuple[int, int]]:
From db379556c1c855048686c5a00f6b5ea399239fe8 Mon Sep 17 00:00:00 2001
From: Racemuis
Date: Mon, 16 Feb 2026 15:52:10 +0100
Subject: [PATCH 47/50] [remove] remove lean verification from async apply
---
.../autoformalization}/utils_lean.py | 0
shinka/edit/async_apply.py | 19 -------------------
2 files changed, 19 deletions(-)
rename {shinka/utils => examples/autoformalization}/utils_lean.py (100%)
diff --git a/shinka/utils/utils_lean.py b/examples/autoformalization/utils_lean.py
similarity index 100%
rename from shinka/utils/utils_lean.py
rename to examples/autoformalization/utils_lean.py
diff --git a/shinka/edit/async_apply.py b/shinka/edit/async_apply.py
index f1460027f..e4c21202f 100644
--- a/shinka/edit/async_apply.py
+++ b/shinka/edit/async_apply.py
@@ -10,8 +10,6 @@
from .apply_diff import apply_diff_patch
from .apply_full import apply_full_patch
-from shinka.utils.utils_lean import async_validate_lean
-
try:
import aiofiles
except ImportError:
@@ -194,23 +192,6 @@ async def validate_code_async(
else:
error_msg = stderr.decode() if stderr else "Unknown compilation error"
return False, error_msg
- elif language == "lean": # Compile LEAN 4 using LeanInteract
- try:
- is_valid, msg = await asyncio.wait_for(
- async_validate_lean(
- code_path,
- allow_sorry=False,
- timeout=timeout,
- verbose=True,
- ),
- timeout=timeout,
- )
- except asyncio.TimeoutError:
- return False, f"Validation timeout after {timeout}s"
- if is_valid:
- return True, None
- else:
- return False, msg
else:
# For other languages, just check if file exists and is readable
try:
From 431e1ba12e19cf6568435c3a0f769b22fb2b3b66 Mon Sep 17 00:00:00 2001
From: Racemuis
Date: Mon, 16 Feb 2026 15:53:03 +0100
Subject: [PATCH 48/50] [remove] remove prover_model dependencies from Shinka's
core
---
shinka/core/runner.py | 5 ++---
shinka/core/wrap_eval.py | 4 +---
shinka/launch/scheduler.py | 24 ++++++------------------
3 files changed, 9 insertions(+), 24 deletions(-)
diff --git a/shinka/core/runner.py b/shinka/core/runner.py
index a178d7436..92cdee99d 100644
--- a/shinka/core/runner.py
+++ b/shinka/core/runner.py
@@ -55,7 +55,6 @@ class EvolutionConfig:
meta_llm_kwargs: dict = field(default_factory=lambda: {})
meta_max_recommendations: int = 5
embedding_model: Optional[str] = None
- prover_model: Optional[str] = None
init_program_path: Optional[str] = "initial.py"
results_dir: Optional[str] = None
max_novelty_attempts: int = 3
@@ -495,7 +494,7 @@ def _run_generation_0(self):
logger.info(f"Initial program generated and saved to {exec_fname}")
# Run the evaluation synchronously
- results, rtime = self.scheduler.run(exec_fname, self.evo_config.prover_model, results_dir,)
+ results, rtime = self.scheduler.run(exec_fname, results_dir,)
code_embedding, e_cost = self.get_code_embedding(exec_fname)
@@ -740,7 +739,7 @@ def _submit_new_job(self):
meta_patch_data["novelty_explanation"] = novelty_explanation
# Submit the job asynchronously
- job_id = self.scheduler.submit_async(exec_fname, results_dir, self.evo_config.prover_model)
+ job_id = self.scheduler.submit_async(exec_fname, results_dir,)
# Add to running jobs queue
running_job = RunningJob(
diff --git a/shinka/core/wrap_eval.py b/shinka/core/wrap_eval.py
index c9a3f6b06..c411b1bfd 100644
--- a/shinka/core/wrap_eval.py
+++ b/shinka/core/wrap_eval.py
@@ -6,8 +6,6 @@
import pickle
from typing import Callable, Any, Dict, List, Tuple, Optional
-from shinka.utils.utils_lean import generate_proof
-
DEFAULT_METRICS_ON_ERROR = {
"combined_score": 0.0,
"execution_time_mean": 0.0,
@@ -103,7 +101,7 @@ def run_shinka_eval(
try:
if program_path.endswith(".lean"):
- experiment_fn = generate_proof
+ experiment_fn = experiment_fn_name
else:
module = load_program(program_path)
if not hasattr(module, experiment_fn_name):
diff --git a/shinka/launch/scheduler.py b/shinka/launch/scheduler.py
index f9b574902..91efa1be2 100644
--- a/shinka/launch/scheduler.py
+++ b/shinka/launch/scheduler.py
@@ -95,7 +95,7 @@ def __init__(
f"Must be 'local', 'slurm_docker', or 'slurm_conda'"
)
- def _build_command(self, exec_fname_t: str, results_dir_t: str, prover_model: Optional[str]) -> List[str]:
+ def _build_command(self, exec_fname_t: str, results_dir_t: str) -> List[str]:
# Docker requires workspace to be mounted
if self.job_type == "slurm_docker":
assert isinstance(self.config, SlurmDockerJobConfig)
@@ -145,25 +145,13 @@ def _build_command(self, exec_fname_t: str, results_dir_t: str, prover_model: Op
else:
# For non-boolean values, append both flag and value
cmd.extend([f"--{k}", str(v)])
-
- if exec_fname_t.endswith(".lean"):
- if prover_model:
- cmd.extend([f"--prover_model", prover_model])
- else:
- logger.warning(
- f"'prover_model' has not been specified. "
- f"Using default 'gpt-5-nano'."
- f"You can specify the proving model in configs/evolution/*.yaml"
- )
- cmd.extend([f"--prover_model", "gpt-5-nano"])
-
return cmd
def run(
- self, exec_fname_t: str, prover_model: Optional[str], results_dir_t: str
+ self, exec_fname_t: str, results_dir_t: str
) -> Tuple[Dict[str, Any], float]:
job_id: Union[str, ProcessWithLogging]
- cmd = self._build_command(exec_fname_t, results_dir_t, prover_model=prover_model)
+ cmd = self._build_command(exec_fname_t, results_dir_t)
start_time = time.time()
if self.job_type == "local":
@@ -216,10 +204,10 @@ def run(
return results, rtime
def submit_async(
- self, exec_fname_t: str, results_dir_t: str, prover_model: Optional[str]
+ self, exec_fname_t: str, results_dir_t: str
) -> Union[str, ProcessWithLogging]:
"""Submit a job asynchronously and return the job ID or process."""
- cmd = self._build_command(exec_fname_t, results_dir_t, prover_model=prover_model)
+ cmd = self._build_command(exec_fname_t, results_dir_t)
if self.job_type == "local":
assert isinstance(self.config, LocalJobConfig)
return submit_local(results_dir_t, cmd, verbose=self.verbose)
@@ -388,4 +376,4 @@ def cancel_job():
def shutdown(self):
"""Shutdown the thread pool executor."""
- self.executor.shutdown(wait=True)
+ self.executor.shutdown(wait=True)
\ No newline at end of file
From 68b80489fc6455515c0ac98f4ceab838c0b4f1be Mon Sep 17 00:00:00 2001
From: Racemuis
Date: Mon, 16 Feb 2026 15:53:26 +0100
Subject: [PATCH 49/50] [remove] remove prover_model dependencies from Shinka's
core
---
shinka/eval_hydra.py | 11 +----------
1 file changed, 1 insertion(+), 10 deletions(-)
diff --git a/shinka/eval_hydra.py b/shinka/eval_hydra.py
index c09eea900..4c5c72d35 100644
--- a/shinka/eval_hydra.py
+++ b/shinka/eval_hydra.py
@@ -54,24 +54,15 @@ def chdir_to_target(cfg):
default="results",
help="Directory to save results and logs",
)
-
- parser.add_argument(
- "--prover_model",
- type=str,
- default="gpt-5-nano", # or an actual prover like "deepseek-ai/DeepSeek-Prover-V2-7B" (requires local LLM support)
- help="LLM agent used to construct LEAN proofs based on the initial header and formalization.",
- )
-
args = parser.parse_args()
Path(args.results_dir).mkdir(parents=True, exist_ok=True)
cfg = load_hydra_config(args.results_dir, max_parent_depth=2)
cfg.evaluate_function.program_path = os.path.abspath(args.program_path)
cfg.evaluate_function.results_dir = os.path.abspath(args.results_dir)
- cfg.evaluate_function.prover_model = args.prover_model
print(os.getcwd())
print("Launching evaluation of function:")
print(omegaconf.OmegaConf.to_yaml(cfg.evaluate_function))
# import & run under the target directory
with chdir_to_target(cfg.evaluate_function):
- hydra.utils.instantiate(cfg.evaluate_function)
+ hydra.utils.instantiate(cfg.evaluate_function)
\ No newline at end of file
From cd6e9f85be81acb14cde6fb5f039e3c8bf8b4d98 Mon Sep 17 00:00:00 2001
From: Racemuis
Date: Mon, 16 Feb 2026 15:54:24 +0100
Subject: [PATCH 50/50] [core] move lean utils to 'examples' folder
---
examples/autoformalization/evaluate.py | 50 +++++++++--
examples/autoformalization/utils_lean.py | 106 +++++++++++++++++++----
shinka/utils/__init__.py | 2 -
3 files changed, 135 insertions(+), 23 deletions(-)
diff --git a/examples/autoformalization/evaluate.py b/examples/autoformalization/evaluate.py
index 707aef2b3..e89bd1657 100644
--- a/examples/autoformalization/evaluate.py
+++ b/examples/autoformalization/evaluate.py
@@ -4,15 +4,35 @@
from typing import Optional, List, Tuple, Dict, Any
import numpy as np
+from lean_interact import LeanREPLConfig, AutoLeanServer, Command, TempRequireProject, FileCommand
+from lean_interact.interface import BaseREPLResponse, LeanError
+from .utils_lean import validate_lean, generate_proof
from shinka.llm.client import get_client_llm
-from shinka.utils import validate_lean
from shinka.core import run_shinka_eval
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
+def check_lean(path_or_str: str) -> BaseREPLResponse | LeanError:
+ """
+ Plug the generated proof through the Lean 4 compiler.
+
+ Args:
+ path_or_str (str): The path to the proof or the proof itself.
+
+ Returns:
+ BaseREPLResponse: the output of the Lean compiler.
+ """
+ project = TempRequireProject(lean_version="v4.24.0", require="mathlib")
+ config = LeanREPLConfig(project=project)
+ server = AutoLeanServer(config) # start Lean REPL
+ command = FileCommand(path=path_or_str) if path_or_str.endswith(".lean") else Command(cmd=path_or_str)
+ server_output = server.run(command)
+ logger.info(server_output.messages)
+ return server_output
+
def validate_proof(run_output: Tuple[str, Optional[str]]) -> Tuple[bool, Optional[str]]:
"""
Validates the proof generation results based on the output of ``generate_proof``.
@@ -46,17 +66,35 @@ def aggregate_hypothesis_generation_metrics(results: Tuple[str, str], results_di
if not results:
return {"combined_score": 0.0, "error": "No results to aggregate"}
- path, proof = results
+ path, lean_cmd = results
+
+ server_output = check_lean(lean_cmd)
+ if not server_output.lean_code_is_valid(allow_sorry=False):
+ penalty = 0
+ for message in server_output.messages:
+ if "error" in message.severity:
+ penalty += -1
+
+ messages = server_output
+ text_feedback = (
+        f"The generated proof:\n{lean_cmd} was invalid. Each error or sorry leads to a -1 penalty. "
+ f"Please consider the following compiler feedback and update the formalization accordingly:\n"
+ f"{messages}"
+ )
+ else:
+ text_feedback = ""
+ penalty = 0
public_metrics = {
- "proof_length": len(proof),
+ "proof_length": len(lean_cmd),
}
private_metrics = {}
metrics = {
- "combined_score": len(proof),
+ "combined_score": len(lean_cmd),
"public": public_metrics,
"private": private_metrics,
+ "text_feedback": text_feedback,
}
extra_file = os.path.join(results_dir, "extra.npz")
@@ -88,7 +126,7 @@ def get_proof_generation_kwargs(run_index: int) -> Dict[str, Any]:
}
-def main(program_path: str, results_dir: str, prover_model: str) -> None:
+def main(program_path: str, results_dir: str, prover_model: str='gpt-5-nano') -> None:
"""
Run the hypothesis evaluation using shinka.eval
@@ -123,7 +161,7 @@ def _kwargs_with_context(run_index: int) -> dict:
metrics, correct, error_msg = run_shinka_eval(
program_path=program_path,
results_dir=results_dir,
- experiment_fn_name="run_autoformalization",
+ experiment_fn_name=generate_proof,
num_runs=num_experiment_runs,
get_experiment_kwargs=_kwargs_with_context,
validate_fn=validate_proof,
diff --git a/examples/autoformalization/utils_lean.py b/examples/autoformalization/utils_lean.py
index 7366a7454..1add4517a 100644
--- a/examples/autoformalization/utils_lean.py
+++ b/examples/autoformalization/utils_lean.py
@@ -1,4 +1,6 @@
import json
+import re
+import os
import logging
from pathlib import Path
from typing import Tuple, Optional, List
@@ -74,13 +76,15 @@ def generate_proof(
)
proof_text = response.choices[0].message.content
- try:
- proof_text = proof_text.split("```lean4")[1].replace("`", "")
- except IndexError:
- proof_text = proof_text.replace("`", "")
- proof_text = validate_imports(proof_text=proof_text)
+ results_dir, _ = os.path.split(file_path)
+ fname = os.path.join(results_dir, fr"unprocessed_proof.lean")
+ with open(fname, "w", encoding="utf-8") as f:
+ f.write(proof_text)
- logger.info(f"The generated proof text is:\n{proof_text}")
+ proof_text = postprocess(proof_text)
+ fname = os.path.join(results_dir, fr"processed_proof.lean")
+ with open(fname, "w", encoding="utf-8") as f:
+ f.write(proof_text)
# Reformat proofs by removing comments, as they can sometimes cause issues when validating the proofs
return file_path, remove_lean_comments(proof_text)
@@ -90,26 +94,98 @@ def generate_proof(
return file_path, None
-def validate_imports(proof_text: str, required_imports: Optional[List] = None) -> str:
+def postprocess(proof: str) -> str:
+ """
+ Postprocess the ``proof``, fixing common syntax errors.
+
+ Args:
+ proof (str): a Lean 4 proof.
+
+ Returns:
+ (str): the post-processed and cleaned proof.
+ """
+ try:
+ proof = proof.split("```lean4")[1].replace("`", "")
+ except IndexError:
+ proof = proof.replace("`", "")
+ proof = validate_imports(proof_text=proof)
+ # Reformat proofs by removing comments, as they can sometimes cause issues when validating the proofs
+ clean_lean = remove_lean_comments(proof)
+ if clean_lean.endswith("D"):
+ clean_lean = clean_lean[:-1]
+    # Replace "∑ n in range" with "∑ n ∈ range"
+ lean_txt = fix_range_notation(clean_lean)
+    # Replace " pi " with π
+    lean_txt = re.sub(r'\s+pi\s+', ' π ', lean_txt)
+ return lean_txt
+
+
+def fix_range_notation(text: str) -> str:
+ """
+    Replace '∑/∏ variable in range' with '∑/∏ variable ∈ range' in Lean 4 code, regardless of the variable name.
+
+ Args:
+ text (str): The input string containing Lean code.
+
+ Returns:
+ str: The string with all sum notations fixed.
+ """
+    pattern = r"([∑∏]\s+)(\w+)(\s+)in(\s+)"
+    replacement = r"\1\2\3∈\4"
+ return re.sub(pattern, replacement, text)
+
+
+def validate_imports(
+ proof_text: str,
+ standard_imports: Optional[str] = None,
+ open_imports: Optional[str] = None,
+) -> str:
"""
Check whether the imports are present in the proof header. Add missing imports if required.
Args:
proof_text (str): the proof text.
- required_imports (Optional[List]): a list of required imports. Default is ["import Mathlib.Data.Real.Basic",
- "import Mathlib.Tactic"]
+ standard_imports (Optional[List]): a list of the standard imports required to solve most proofs in chemical
+ physics.
+    open_imports (Optional[List]): a list of the standard opens required to solve most proofs in chemical physics.
Returns:
str: the proof text including the required imports.
"""
- if not required_imports: # Default argument
- required_imports = ["import Mathlib.Data.Real.Basic", "import Mathlib.Tactic"]
+ if not standard_imports: # Default argument
+ standard_imports = ["import mathlib", ]
+
+ if not open_imports:
+ open_imports = ["Real", "BigOperators", "Topology", "Set", "Filter", "Finset"]
+
+ imports, opens, proof = [], [], []
+
+ proof_lines = proof_text.split("\n")
+ for line in proof_lines:
+ if line.startswith("import"): # Verify imports
+ continue
+
+ if line.lower() == "lean": # fix parsing issues
+ continue
+
+ elif line.startswith("open"): # Open statement
+ curated_line = line
+ for statement in open_imports:
+ if statement not in curated_line:
+ curated_line += f" {statement}"
+ opens.append(curated_line)
+
+ else: # The rest of the proof text
+ proof.append(line)
+
+ for statement in standard_imports: # Add the remaining standard imports
+ if statement not in imports:
+ imports.append(statement)
+
+ proof_text = "\n".join(imports) + "\n" + "\n".join(opens) + "\n" + "\n".join(proof)
- for statement in required_imports:
- if statement not in proof_text:
- proof_text = statement + "\n" + proof_text
- return proof_text
+ return proof_text.strip()
def validate_lean(
diff --git a/shinka/utils/__init__.py b/shinka/utils/__init__.py
index 8e237b9da..a308c0acd 100644
--- a/shinka/utils/__init__.py
+++ b/shinka/utils/__init__.py
@@ -1,7 +1,6 @@
from .load_df import load_programs_to_df, get_path_to_best_node, store_best_path
from .general import parse_time_to_seconds, load_results
from .utils_hydra import build_cfgs_from_python, add_evolve_markers, chdir_to_function_dir, wrap_object, load_hydra_config
-from .utils_lean import validate_lean
__all__ = [
"load_programs_to_df",
@@ -14,5 +13,4 @@
"chdir_to_function_dir",
"wrap_object",
"load_hydra_config",
- "validate_lean",
]