lean-dojo
diff --git a/‎README.md‎
Lines changed: 267 additions & 23 deletions b/‎README.md‎
Lines changed: 267 additions & 23 deletions
diff --git a/‎examples/grpo.py‎
Lines changed: 1 addition & 1 deletion b/‎examples/grpo.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎lean_dojo_v2/__init__.py‎
Lines changed: 0 additions & 15 deletions b/‎lean_dojo_v2/__init__.py‎
Lines changed: 0 additions & 15 deletions
diff --git a/‎lean_dojo_v2/agent/base_agent.py‎
Lines changed: 1 addition & 1 deletion b/‎lean_dojo_v2/agent/base_agent.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎…_dojo_v2/lean_agent/database/__init__.py‎ ‎lean_dojo_v2/database/__init__.py‎lean_dojo_v2/lean_agent/database/__init__.py renamed to lean_dojo_v2/database/__init__.py b/‎…_dojo_v2/lean_agent/database/__init__.py‎ ‎lean_dojo_v2/database/__init__.py‎lean_dojo_v2/lean_agent/database/__init__.py renamed to lean_dojo_v2/database/__init__.py
diff --git a/‎…/lean_agent/database/dynamic_database.py‎ ‎lean_dojo_v2/database/dynamic_database.py‎lean_dojo_v2/lean_agent/database/dynamic_database.py renamed to lean_dojo_v2/database/dynamic_database.py
Lines changed: 1 addition & 2 deletions b/‎…/lean_agent/database/dynamic_database.py‎ ‎lean_dojo_v2/database/dynamic_database.py‎lean_dojo_v2/lean_agent/database/dynamic_database.py renamed to lean_dojo_v2/database/dynamic_database.py
Lines changed: 1 addition & 2 deletions
diff --git a/‎…2/lean_agent/database/models/__init__.py‎ ‎lean_dojo_v2/database/models/__init__.py‎lean_dojo_v2/lean_agent/database/models/__init__.py renamed to lean_dojo_v2/database/models/__init__.py b/‎…2/lean_agent/database/models/__init__.py‎ ‎lean_dojo_v2/database/models/__init__.py‎lean_dojo_v2/lean_agent/database/models/__init__.py renamed to lean_dojo_v2/database/models/__init__.py
diff --git a/‎…ean_agent/database/models/annotations.py‎ ‎…n_dojo_v2/database/models/annotations.py‎lean_dojo_v2/lean_agent/database/models/annotations.py renamed to lean_dojo_v2/database/models/annotations.py b/‎…ean_agent/database/models/annotations.py‎ ‎…n_dojo_v2/database/models/annotations.py‎lean_dojo_v2/lean_agent/database/models/annotations.py renamed to lean_dojo_v2/database/models/annotations.py
diff --git a/‎…2/lean_agent/database/models/premises.py‎ ‎lean_dojo_v2/database/models/premises.py‎lean_dojo_v2/lean_agent/database/models/premises.py renamed to lean_dojo_v2/database/models/premises.py b/‎…2/lean_agent/database/models/premises.py‎ ‎lean_dojo_v2/database/models/premises.py‎lean_dojo_v2/lean_agent/database/models/premises.py renamed to lean_dojo_v2/database/models/premises.py
diff --git a/‎…lean_agent/database/models/repository.py‎ ‎…an_dojo_v2/database/models/repository.py‎lean_dojo_v2/lean_agent/database/models/repository.py renamed to lean_dojo_v2/database/models/repository.py b/‎…lean_agent/database/models/repository.py‎ ‎…an_dojo_v2/database/models/repository.py‎lean_dojo_v2/lean_agent/database/models/repository.py renamed to lean_dojo_v2/database/models/repository.py
@@ -1,35 +1,143 @@
 # LeanDojo-v2
+
+LeanDojo-v2 is an end-to-end framework for training, evaluating, and deploying AI-assisted theorem provers for Lean 4. It combines repository tracing, lifelong dataset management, retrieval-augmented agents, Hugging Face fine-tuning, and external inference APIs into one toolkit.
+
+---
+
+## Table of Contents
+
+1. [Overview](#overview)
+2. [Key Features](#key-features)
+3. [Repository Layout](#repository-layout)
+4. [Requirements](#requirements)
+5. [Installation](#installation)
+6. [Environment Setup](#environment-setup)
+7. [Quick Start](#quick-start)
+8. [Working with Agents and Trainers](#working-with-agents-and-trainers)
+9. [Tracing and Dataset Generation](#tracing-and-dataset-generation)
+10. [External APIs and LeanCopilot](#external-apis-and-leancopilot)
+11. [Testing](#testing)
+12. [Troubleshooting & Tips](#troubleshooting--tips)
+13. [Contributing](#contributing)
+14. [License](#license)
+
+---
+
+## Overview
+
+LeanDojo-v2 extends the original LeanDojo stack with the LeanAgent lifelong learning pipeline. It automates the entire loop of:
+
+1. Cloning Lean repositories (GitHub or local) and tracing them with Lean instrumentation.
+2. Storing structured theorem information in a dynamic database.
+3. Training agent policies with supervised fine-tuning (SFT), GRPO-style RL, or retrieval objectives.
+4. Driving Pantograph-based provers to fill in sorrys or verify solutions.
+5. Using the Hugging Face API for large-model inference.
+
+The codebase is modular: you can reuse the tracing pipeline without the agents, swap in custom trainers, or stand up your own inference service via the external API layer.
+
+---
+
+## Key Features
+
+- **Unified Agent Abstractions**: `BaseAgent` orchestrates repository setup, training, and proving. Concrete implementations (`HFAgent`, `LeanAgent`, and `ExternalAgent`) tailor the workflow to Hugging Face models, retrieval-based provers, or REST-backed models.
+- **Powerful Trainers**: `SFTTrainer`, `GRPOTrainer`, and `RetrievalTrainer` cover LoRA-enabled supervised fine-tuning, group-relative policy optimization, and retriever-only curriculum learning.
+- **Multi-Modal Provers**: `HFProver`, `RetrievalProver`, and `ExternalProver` run on top of Pantograph’s Lean RPC server to search for tactics, generate whole proofs, or delegate to custom models.
+- **Lean Tracing Pipeline**: `lean_dojo` includes the Lean 4 instrumentation (`ExtractData.lean`) and Python utilities to trace commits, normalize ASTs, and cache proof states.
+- **Dynamic Repository Database**: `database` tracks repositories, theorems, curriculum difficulty, and sorry status, enabling lifelong training schedules.
+- **External API**: The `external_api` folder exposes HTTP endpoints (FastAPI + uvicorn) and Lean frontend snippets so you can query LLMs from Lean editors.
+
+---
+
+## Repository Layout
+
+| Path | Description |
+|------|-------------|
+| `lean_dojo_v2/agent/` | Base class plus `HFAgent`, `LeanAgent`, and helpers to manage repositories and provers. |
+| `lean_dojo_v2/trainer/` | SFT, GRPO, and retrieval trainers with Hugging Face + DeepSpeed integration. |
+| `lean_dojo_v2/prover/` | Pantograph-based prover implementations (HF, retrieval, external). |
+| `lean_dojo_v2/lean_dojo/` | Lean tracing, dataset generation, caching, and AST utilities. |
+| `lean_dojo_v2/lean_agent/` | Lifelong learning pipeline (configs, retrieval stack, generator). |
+| `lean_dojo_v2/database/` | Dynamic repository database (`DynamicDatabase`) and its data models. |
+| `lean_dojo_v2/external_api/` | LeanCopilot code (Lean + Python server) to query external models. |
+| `lean_dojo_v2/utils/` | Shared helpers for Git, filesystem operations, and constants. |
+| `lean_dojo_v2/tests/` | Pytest regression suite. |
+
+For deeper documentation on the lifelong learning component, see `lean_dojo_v2/lean_agent/README.md`.
+
+---
+
 ## Requirements
-* Python >= 3.11
-* GPU
+
+- Python ≥ 3.11.
+- CUDA-capable GPU for training and inference (tested with CUDA 12.6).
+- Git ≥ 2.25 and `wget`.
+- [elan](https://github.com/leanprover/elan) Lean toolchain to trace repositories locally.
+- Adequate disk space for the `raid/` working directory (datasets, checkpoints, traces).
+
+Python dependencies are declared in `pyproject.toml` and include PyTorch, PyTorch Lightning, Transformers, DeepSpeed, TRL, PEFT, and more.
+
+---
+
 ## Installation
-To install LeanDojo-v2, run
-``` sh
-uv pip install lean-dojo-v2
-```
-install Pantograph
-``` sh
-uv add git+https://github.com/stanford-centaur/PyPantograph
-```
-make sure you've installed CUDA-compiled torch,
-``` sh
-uv pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126
-```
-export your GitHub access token,
-``` sh
-export GITHUB_ACCESS_TOKEN=<GITHUB_ACCESS_TOKEN>
+
+### Option 1: From PyPI
+
+```sh
+# Install the core package
+pip install lean-dojo-v2
+
+# Pantograph is required for Lean RPC
+pip install git+https://github.com/stanford-centaur/PyPantograph
+
+# Install a CUDA-enabled torch build (adjust the index URL for your CUDA version)
+pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126
 ```
-To use the HuggingFace API, you need to export your HuggingFace token,
-``` sh
-export HF_TOKEN=<HF_TOKEN>
+
+### Option 2: From Source (development)
+
+```sh
+git clone https://github.com/lean-dojo/LeanDojo-v2.git
+cd LeanDojo-v2
+python -m venv .venv
+source .venv/bin/activate
+pip install --upgrade pip
+pip install -e ".[dev]"
+pip install git+https://github.com/stanford-centaur/PyPantograph
+pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126
 ```
-## Example
-``` python
+
+> Tip: You can use [uv](https://github.com/astral-sh/uv) (`uv pip install lean-dojo-v2`) as an alternative Python package manager.
+
+---
+
+## Environment Setup
+
+1. **GitHub Access Token (required)**  
+   The tracing pipeline calls the GitHub API extensively. Create a personal access token and export it before running any agent:
+   ```sh
+   export GITHUB_ACCESS_TOKEN=<your-token>
+   ```
+
+2. **Hugging Face Token (optional but needed for gated models)**  
+   ```sh
+   export HF_TOKEN=<your-hf-token>
+   ```
+
+3. **Working directories**  
+   By default all datasets, caches, and checkpoints live under `<repo>/raid`. Change the layout by editing `lean_dojo_v2/utils/constants.py` or by pointing `RAID_DIR` to faster storage.
+
+4. **Lean toolchains**  
+   Ensure `elan` is configured and Lean 4 (e.g., `leanprover/lean4:nightly`) is available on your `$PATH`. The tracing scripts look under `~/.elan/toolchains/`.
+
+---
+
+## Quick Start
+
+```python
 from lean_dojo_v2.agent.hf_agent import HFAgent
 from lean_dojo_v2.trainer.sft_trainer import SFTTrainer
 
 url = "https://github.com/durant42040/lean4-example"
-commit = "b14fef0ceca29a65bc3122bf730406b33c7effe5"
+commit = "005de00d03f1aaa32cb2923d5e3cbaf0b954a192"
 
 trainer = SFTTrainer(
     model_name="deepseek-ai/DeepSeek-Prover-V2-7B",
@@ -43,5 +151,141 @@ agent = HFAgent(trainer=trainer)
 agent.setup_github_repository(url=url, commit=commit)
 agent.train()
 agent.prove()
+```
+
+This example:
 
+1. Downloads and traces the target Lean repository + commit.
+2. Builds a supervised dataset from sorry theorems.
+3. Fine-tunes the specified Hugging Face model (optionally with LoRA).
+4. Launches an `HFProver` backed by Pantograph to search for proofs.
+
+---
+
+## Working with Agents and Trainers
+
+### Supervised Fine-Tuning (`SFTTrainer`)
+
+- Accepts any Hugging Face causal LM identifier.
+- Supports LoRA by passing a `peft.LoraConfig`.
+- Key arguments: `epochs_per_repo`, `batch_size`, `max_seq_len`, `lr`, `warmup_steps`, `gradient_checkpointing`.
+- Produces checkpoints under `output_dir` that the `HFProver` consumes.
+
+### GRPO Trainer (`GRPOTrainer`)
+
+- Implements Group Relative Policy Optimization for reinforcement-style refinement.
+- Accepts `reference_model`, `reward_weights`, and `kl_beta` settings.
+- Useful for improving search policies on curated theorem batches.
+
+### Retrieval Trainer & LeanAgent
+
+- `RetrievalTrainer` trains the dense retriever that scores prior proofs.
+- `LeanAgent` wraps the trainer, maintains repository curricula, and couples it with `RetrievalProver`.
+
+Each agent inherits `BaseAgent`, so you can implement your own by overriding `_get_build_deps()` and `_setup_prover()` to register new trainer/prover pairs.
+
+---
+
+## Tracing and Dataset Generation
+
+The `lean_dojo_v2/lean_dojo/data_extraction` package powers repository tracing:
+
+- `lean.py` clones repositories (GitHub, remote, or local), validates Lean versions, and normalizes URLs.
+- `trace.py` drives Lean with the custom `ExtractData.lean` instrumented module to capture theorem states.
+- `dataset.py` converts traced files to JSONL datasets ready for trainers.
+- `cache.py` memoizes repository metadata to avoid redundant downloads.
+- `traced_data.py` exposes typed wrappers for traced AST nodes and sorrys.
+
+Typical usage:
+
+```python
+from lean_dojo_v2.database import DynamicDatabase
+
+url = "https://github.com/durant42040/lean4-example"
+commit = "005de00d03f1aaa32cb2923d5e3cbaf0b954a192"
+
+database = DynamicDatabase()
+
+database.setup_github_repository(
+    url=url,
+    commit=commit,
+    build_deps=False,
+)
+```
+
+The generated artifacts flow into the `DynamicDatabase`, which keeps repositories sorted by difficulty and appends new sorrys without retracing everything.
+
+---
+
+## External APIs and LeanCopilot
+
+`lean_dojo_v2/external_api` contains Lean and Python code to expose models through LeanCopilot:
+
+- `LeanCopilot.lean` registers RPC endpoints inside Lean.
+- `python/server.py` hosts a FastAPI service with adapters for Anthropic, OpenAI, Google Generative AI, vLLM, and custom HF models.
+- Start the service with:
+  ```sh
+  cd lean_dojo_v2/external_api/python
+  pip install -r requirements.txt
+  uvicorn server:app --port 23337
+  ```
+- Point your Lean client to the running server to interactively request tactics, proofs, or completions from external models.
+
+### LeanProgress Step-Prediction Workflow
+
+- Generate a JSONL dataset with remaining-step targets (or replace it with your own LeanProgress export):
+  ```sh
+  python -m lean_dojo_v2.lean_progress.create_sample_dataset --output raid/data/sample_leanprogress_dataset.jsonl
+  ```
+- Fine-tune a regression head that predicts `steps_remaining`:
+  ```sh
+  python -m lean_dojo_v2.lean_progress.train_steps_model \
+    --dataset raid/data/sample_leanprogress_dataset.jsonl \
+    --output-dir raid/checkpoints/leanprogress_steps \
+    --model-name bert-base-uncased
+  ```
+- Tell the LeanCopilot server where to find the checkpoint by exporting:
+  ```sh
+  export LEANPROGRESS_MODEL=raid/checkpoints/leanprogress_steps
+  uvicorn server:app --port 23337
+  ```
+- Add `use_reward=true` when calling `/generate`. Each output now includes `steps_remaining` and a reward value (currently `-steps_remaining`) so agents can minimize proof length.
+
+---
+
+## Testing
+
+We use `pytest` for regression coverage.
+
+```sh
+pip install -e ".[dev]"        # make sure dev extras like pytest/trl are present
+export GITHUB_ACCESS_TOKEN=<token>
+export HF_TOKEN=<hf-token>     # only required for tests touching HF APIs
+pytest -v
 ```
+
+---
+
+## Troubleshooting & Tips
+
+- **401 Bad Credentials / rate limits**: Ensure `GITHUB_ACCESS_TOKEN` is exported and has `repo` + `read:org` scopes.
+- **Lean tracing failures**: Confirm that the repo’s Lean version exists locally (`elan toolchain install <version>`).
+- **Missing CUDA libraries**: Install the PyTorch wheel that matches your driver and CUDA version.
+- **Dataset location**: The default `raid/` directory can grow large. Point it to high-throughput storage or use symlinks.
+- **Pantograph errors**: Reinstall Pantograph from source (`pip install git+https://github.com/stanford-centaur/PyPantograph`) whenever Lean upstream changes.
+
+---
+
+## Contributing
+
+Issues and pull requests are welcome! Please:
+
+1. Open an issue describing the bug or feature.
+2. Run formatters (`black`, `isort`) and `pytest` before submitting.
+3. Mention if your change touches Lean tracing files so reviewers can re-generate artifacts.
+
+---
+
+## License
+
+LeanDojo-v2 is released under the MIT License. See `LICENSE` for details.
@@ -12,7 +12,7 @@ def reward_func(completions, **kwargs):
     return torch.tensor([1.0] * len(completions))
 
 
-url = "https://github.com/durant42040/lean4-example",
+url = ("https://github.com/durant42040/lean4-example",)
 commit = "b14fef0ceca29a65bc3122bf730406b33c7effe5"
 
 trainer = GRPOTrainer(
 
@@ -1,17 +1,2 @@
-
 __version__ = "1.0.0"
 __author__ = "LeanDojo-v2 Contributors"
-
-# Import main components for easy access
-from .agent import BaseAgent, HFAgent, LeanAgent
-from .prover import BaseProver, ExternalProver, HFProver, RetrievalProver
-
-__all__ = [
-    "BaseAgent",
-    "HFAgent",
-    "LeanAgent",
-    "BaseProver",
-    "HFProver",
-    "RetrievalProver",
-    "ExternalProver",
-]
@@ -5,7 +5,7 @@
 from loguru import logger
 from pantograph import Server
 
-from lean_dojo_v2.lean_agent.database.dynamic_database import DynamicDatabase
+from lean_dojo_v2.database.dynamic_database import DynamicDatabase
 from lean_dojo_v2.lean_dojo.data_extraction.trace import get_traced_repo_path
 from lean_dojo_v2.utils.constants import DATA_DIR, RAID_DIR
 
 
@@ -31,7 +31,6 @@
     search_github_repositories,
 )
 from lean_dojo_v2.utils.lean import get_lean4_version_from_config
-from lean_dojo_v2.utils.repository import save_sorted_repos
 
 from .models import Repository, Theorem
 
@@ -482,7 +481,7 @@ def trace_repository(
         if (
             total_theorems < 3 * BATCH_SIZE
         ):  # Should be enough theorems for train/val/test
-            logger.info(f"Not enough theorems found in {url}")
+            logger.info(f"Not enough theorems found in {repo.url}")
             return None
 
         config = repo.get_config("lean-toolchain")