Skip to content

Commit e77ae00

Browse files
committed
add data generation examples
1 parent e4505fe commit e77ae00

File tree

8 files changed

+49
-5
lines changed

8 files changed

+49
-5
lines changed

examples/data_generation/discover_github.py

Whitespace-only changes.
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
"""
2+
Example script for generating a dataset from a Lean 4 GitHub repository.
3+
The data is saved at <RAID_DIR>/<DATA_DIR>/<repo.name>_<repo.commit>.
4+
e.g. LeanDojo-v2/raid/data/lean4-example_005de00d03f1aaa32cb2923d5e3cbaf0b954a192
5+
6+
Usage: python examples/data_generation/trace_github.py
7+
"""
8+
9+
from lean_dojo_v2.database import DynamicDatabase
10+
11+
url = "https://github.com/durant42040/lean4-example"
12+
commit = "005de00d03f1aaa32cb2923d5e3cbaf0b954a192"
13+
14+
database = DynamicDatabase()
15+
16+
database.trace_repository(
17+
url=url,
18+
commit=commit,
19+
build_deps=False,
20+
)
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
"""
2+
Example script for generating a dataset from a local Lean 4 repository.
3+
The data is saved at <RAID_DIR>/<DATA_DIR>/<repo.name>_<repo.commit>.
4+
e.g. LeanDojo-v2/raid/data/lean4-example_005de00d03f1aaa32cb2923d5e3cbaf0b954a192
5+
6+
Usage: python examples/data_generation/trace_local.py
7+
"""
8+
9+
from lean_dojo_v2.database import DynamicDatabase
10+
11+
# path = "path/to/lean4-example"
12+
path = "/Users/electron/Code/lean4-example"
13+
commit = "005de00d03f1aaa32cb2923d5e3cbaf0b954a192"
14+
15+
database = DynamicDatabase()
16+
17+
database.trace_repository(
18+
url=path,
19+
commit=commit,
20+
build_deps=False,
21+
)

lean_dojo_v2/agent/base_agent.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
from loguru import logger
66
from pantograph import Server
77

8-
from lean_dojo_v2.database.dynamic_database import DynamicDatabase
8+
from lean_dojo_v2.database import DynamicDatabase
99
from lean_dojo_v2.lean_dojo.data_extraction.lean import LeanGitRepo
1010
from lean_dojo_v2.lean_dojo.data_extraction.trace import get_traced_repo_path
1111
from lean_dojo_v2.utils.constants import DATA_DIR, RAID_DIR

lean_dojo_v2/lean_dojo/data_extraction/lean.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -624,7 +624,10 @@ def exists(self) -> bool:
624624
repo = self.repo # git repo
625625
try:
626626
repo.commit(self.commit)
627-
return repo.head.commit.hexsha == self.commit
627+
if repo.head.commit.hexsha != self.commit:
628+
self.repo.git.checkout(self.commit)
629+
print(self.repo.working_dir, self.commit)
630+
return True
628631
except BadName:
629632
logger.warning(
630633
f"Commit {self.commit} does not exist in this repository."

lean_dojo_v2/trainer/grpo_trainer.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
from transformers import AutoModelForCausalLM, AutoTokenizer
1313
from trl import GRPOConfig
1414

15-
from lean_dojo_v2.database.dynamic_database import DynamicDatabase
15+
from lean_dojo_v2.database import DynamicDatabase
1616
from lean_dojo_v2.lean_dojo.data_extraction.lean import LeanGitRepo
1717
from lean_dojo_v2.utils import remove_marks
1818
from lean_dojo_v2.utils.constants import DATA_DIR, RAID_DIR

lean_dojo_v2/trainer/retrieval_trainer.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
from pytorch_lightning import seed_everything
1010
from tqdm import tqdm
1111

12-
from lean_dojo_v2.database.dynamic_database import DynamicDatabase
12+
from lean_dojo_v2.database import DynamicDatabase
1313
from lean_dojo_v2.database.models.repository import Repository
1414
from lean_dojo_v2.lean_agent.config import TrainingConfig
1515
from lean_dojo_v2.lean_agent.retrieval.datamodule import RetrievalDataModule

lean_dojo_v2/trainer/sft_trainer.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
from transformers import AutoModelForCausalLM, AutoTokenizer
1414
from trl import SFTConfig
1515

16-
from lean_dojo_v2.database.dynamic_database import DynamicDatabase
16+
from lean_dojo_v2.database import DynamicDatabase
1717
from lean_dojo_v2.lean_dojo.data_extraction.lean import LeanGitRepo
1818
from lean_dojo_v2.utils import remove_marks
1919
from lean_dojo_v2.utils.constants import DATA_DIR, RAID_DIR

0 commit comments

Comments
 (0)