-
Notifications
You must be signed in to change notification settings - Fork 13
Expand file tree
/
Copy pathMakefile
More file actions
122 lines (90 loc) · 5.43 KB
/
Makefile
File metadata and controls
122 lines (90 loc) · 5.43 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
# All targets in this file are command names, not files. Declaring them phony
# keeps a stray file or directory of the same name from masking the command.
.PHONY: help venv \
    install install-crf install-transformers install-training install-all \
    test test-cov lint format clean \
    bench bench-ci bench-dev bench-test bench-quick bench-json bench-validate \
    diagnose \
    train-crf eval-crf bench-crf bench-transformer \
    export-bio train-transformer-subset train-transformer \
    eval-transformer bench-transformer-trained
# Interpreter used to create the virtualenv; override with `make PYTHON=...`.
PYTHON ?= python3

# Virtualenv directory and the directory holding its executables.
VENV := .venv
BIN  := $(VENV)/bin

# Tool auto-detection: UV holds whatever `command -v uv` prints, and is empty
# when the lookup prints nothing. Evaluated once at parse time.
UV := $(shell command -v uv 2>/dev/null)

# VENV_CMD creates the environment, INSTALL_CMD installs into it. Both stay
# recursively expanded so they pick up PYTHON/VENV/BIN at the point of use.
ifndef UV
# No uv found: standard-library venv, then bring pip itself up to date.
VENV_CMD    = $(PYTHON) -m venv $(VENV) && $(BIN)/pip install --upgrade pip
INSTALL_CMD = $(BIN)/pip install
else
# uv found: let it create the environment and install into its interpreter.
VENV_CMD    = uv venv $(VENV) --python $(PYTHON)
INSTALL_CMD = uv pip install --python $(BIN)/python
endif
# Self-documenting help: scans the parsed makefiles for `target: ... ## text`
# lines and prints the target name (coloured via ANSI escapes) next to its text.
#
# - The name column is padded to 26 characters so the longest target names in
#   this file (e.g. bench-transformer-trained, 25 chars) still line up.
# - Target names may contain digits as well as letters, `_` and `-`.
# - awk regular expressions have no lazy quantifier, so a plain `.*` is used
#   for both the field separator and the line filter.
# - `$$1` / `$$2` are awk fields; the doubled `$` stops Make from expanding them.
help: ## Show this help message
	@awk 'BEGIN {FS = ":.*## "; printf "Usage: make \033[36m<target>\033[0m\n\nTargets:\n"} /^[a-zA-Z0-9_-]+:.*## / {printf " \033[36m%-26s\033[0m %s\n", $$1, $$2}' $(MAKEFILE_LIST)
# Create the virtualenv only when it does not already have an interpreter.
#
# `venv` is phony and a prerequisite (directly or via `install`) of most other
# targets, so this recipe runs on nearly every `make` invocation. Re-running
# the creation command against an existing environment is redundant at best;
# NOTE(review): with uv it may also replace the environment and drop
# previously installed extras -- confirm against the uv version in use.
# To force a rebuild (e.g. after changing PYTHON), run `make clean` first.
venv: ## Create virtualenv (auto-detects uv vs pip)
	test -x $(BIN)/python || { $(VENV_CMD); }
# --- Installation -----------------------------------------------------------
# Each target performs an editable install of this project with a different
# set of extras. The extras-specific targets build on `install`, so the dev
# dependencies are installed first.

install: venv ## Install package with dev dependencies (editable)
	$(INSTALL_CMD) -e ".[dev]"

install-crf: install ## Install CRF engine extras (sklearn-crfsuite)
	$(INSTALL_CMD) -e ".[crf]"

install-transformers: install ## Install transformer engine extras (transformers, torch)
	$(INSTALL_CMD) -e ".[transformers]"

install-training: install ## Install training extras (CRF + transformers + wandb + seqeval + datasets + accelerate)
	$(INSTALL_CMD) -e ".[crf,transformers,training]"

# Depends on `venv` directly rather than on `install`.
install-all: venv ## Install package with all optional dependencies
	$(INSTALL_CMD) -e ".[all]"
# --- Tests ------------------------------------------------------------------
# Both targets run the virtualenv's pytest after (re)installing the package.

test: install ## Run pytest test suite
	$(BIN)/pytest

# Same run with coverage: a terminal report listing missing lines plus an
# HTML report.
test-cov: install ## Run tests with coverage report (HTML + terminal)
	$(BIN)/pytest \
	    --cov \
	    --cov-report=term-missing \
	    --cov-report=html
# --- Linting and formatting -------------------------------------------------
# Directories that ruff inspects or rewrites.
RUFF_PATHS := src/ tests/

# Read-only check: fails on lint findings or on files that need reformatting.
lint: install ## Check code with ruff (lint + format check)
	$(BIN)/ruff check $(RUFF_PATHS)
	$(BIN)/ruff format --check $(RUFF_PATHS)

# Mutating counterpart of `lint`: applies fixes, then reformats in place.
format: install ## Auto-fix lint issues and format code with ruff
	$(BIN)/ruff check --fix $(RUFF_PATHS)
	$(BIN)/ruff format $(RUFF_PATHS)
# --- Benchmarks -------------------------------------------------------------
# Shared command prefix for the benchmark runner module. Every target appends
# $(BENCH_ARGS), which is empty unless supplied by the caller.
BENCH_RUN = $(BIN)/python -m benchmarks.run

bench: install ## Run benchmark (pass extra args via BENCH_ARGS=...)
	$(BENCH_RUN) $(BENCH_ARGS)

# -d points the runner at the fixtures directory kept in the repository.
bench-ci: install ## Run benchmark against vendored CI fixtures (no external data)
	$(BENCH_RUN) -d benchmarks/fixtures $(BENCH_ARGS)

# -s selects the dataset split.
bench-dev: install ## Run benchmark against validation split (development)
	$(BENCH_RUN) -s validation $(BENCH_ARGS)

bench-test: install ## Run benchmark against test split (final evaluation only)
	$(BENCH_RUN) -s test $(BENCH_ARGS)
# Fast sanity check: validation split with `-n 50`.
# $(BENCH_ARGS) is appended for consistency with the other benchmarks.run
# targets in this file; it is empty unless the caller sets it, so the default
# command is unchanged.
bench-quick: install ## Quick benchmark sanity check (50 validation docs)
	$(BIN)/python -m benchmarks.run -s validation -n 50 $(BENCH_ARGS)
# --- Benchmark output / dataset diagnostics ---------------------------------
# All three targets forward $(BENCH_ARGS) to the module they invoke.

# Benchmark runner with the --json flag.
bench-json: install ## Run benchmark with JSON output
	$(BIN)/python -m benchmarks.run \
	    --json $(BENCH_ARGS)

# Separate module: benchmarks.validate.
bench-validate: install ## Run dataset integrity checks
	$(BIN)/python -m benchmarks.validate \
	    $(BENCH_ARGS)

# Separate module: benchmarks.diagnose, pinned to the validation split.
diagnose: install ## Error analysis on validation split
	$(BIN)/python -m benchmarks.diagnose \
	    --split validation $(BENCH_ARGS)
# --- CRF engine -------------------------------------------------------------
# Entry point of the CRF engine module; --train / --evaluate select the mode
# and $(CRF_ARGS) carries any caller-supplied options.
CRF_ENGINE = $(BIN)/python -m refex.engines.crf

train-crf: install-crf ## Train CRF model (pass args via CRF_ARGS=...)
	$(CRF_ENGINE) --train $(CRF_ARGS)

eval-crf: install-crf ## Evaluate trained CRF model
	$(CRF_ENGINE) --evaluate $(CRF_ARGS)

# --- Ensemble benchmarks ----------------------------------------------------
# Validation-split benchmarks; -e names the engine combination to run.

bench-crf: install-crf ## Benchmark regex+CRF ensemble on validation split
	$(BIN)/python -m benchmarks.run \
	    -s validation -e regex+crf $(BENCH_ARGS)

bench-transformer: install-transformers ## Benchmark regex+transformer ensemble (downloads weights)
	$(BIN)/python -m benchmarks.run \
	    -s validation -e regex+transformer $(BENCH_ARGS)
# --- BIO export -------------------------------------------------------------
# Runs the export script once per split, writing one JSONL file per split
# into $(BIO_DIR). The recipe stops at the first failing export.
BIO_EXPORT = $(BIN)/python scripts/export_bio.py
BIO_DIR   := data/hf_bio

export-bio: install-training ## Export BIO JSONL for train/validation/test splits
	$(BIO_EXPORT) --split train --output $(BIO_DIR)/train.jsonl
	$(BIO_EXPORT) --split validation --output $(BIO_DIR)/validation.jsonl
	$(BIO_EXPORT) --split test --output $(BIO_DIR)/test.jsonl
# --- Transformer training ---------------------------------------------------
# Device passed to the training script's --device flag. Defaults to `mps`,
# the value that used to be hard-coded; override on machines where that
# device is unavailable, e.g. `make train-transformer TRANSFORMER_DEVICE=cpu`
# (any value the script's --device flag accepts).
TRANSFORMER_DEVICE ?= mps

# Both targets read the JSONL files under data/hf_bio/ (see `export-bio`) and
# append $(TRAIN_ARGS) last so callers can pass further options.

# Smoke test: --limit 500 and 2 epochs, written to its own output directory
# and run name so it does not collide with the full run.
train-transformer-subset: install-training ## Smoke-test training (500 docs, 2 epochs)
	$(BIN)/python scripts/train_transformer.py \
	    --train data/hf_bio/train.jsonl \
	    --eval data/hf_bio/validation.jsonl \
	    --output models/refex-eurobert-210m-smoke \
	    --device $(TRANSFORMER_DEVICE) --epochs 2 --batch-size 16 --limit 500 \
	    --wandb-run-name eurobert-210m-smoke $(TRAIN_ARGS)

# Full run: no --limit, 3 epochs, output in models/refex-eurobert-210m.
train-transformer: install-training ## Full transformer training (EuroBERT-210m, 3 epochs)
	$(BIN)/python scripts/train_transformer.py \
	    --train data/hf_bio/train.jsonl \
	    --eval data/hf_bio/validation.jsonl \
	    --output models/refex-eurobert-210m \
	    --device $(TRANSFORMER_DEVICE) --epochs 3 --batch-size 16 \
	    --wandb-run-name eurobert-210m-full-e3-b16-lr3e5 $(TRAIN_ARGS)
# --- Trained-transformer evaluation -----------------------------------------
# Device exported to the benchmark as REFEX_TRANSFORMER_DEVICE. Defaults to
# `mps`, the value that used to be hard-coded in the recipes; override with
# e.g. `make eval-transformer TRANSFORMER_DEVICE=cpu` on machines where that
# device is unavailable.
TRANSFORMER_DEVICE ?= mps

# Both targets set REFEX_TRANSFORMER_MODEL / REFEX_TRANSFORMER_DEVICE in the
# environment of the benchmark process only, run on the validation split with
# --json, and write the result under logs/. $(BENCH_ARGS) is appended last.

# Transformer engine on its own (-e transformer).
eval-transformer: install-transformers ## Evaluate trained transformer on validation (full engine)
	REFEX_TRANSFORMER_MODEL=models/refex-eurobert-210m REFEX_TRANSFORMER_DEVICE=$(TRANSFORMER_DEVICE) \
	$(BIN)/python -m benchmarks.run -s validation -e transformer --json \
	    --output logs/bench-transformer-validation.json $(BENCH_ARGS)

# Regex + transformer ensemble (-e regex+transformer).
bench-transformer-trained: install-transformers ## Benchmark regex+trained transformer on validation
	REFEX_TRANSFORMER_MODEL=models/refex-eurobert-210m REFEX_TRANSFORMER_DEVICE=$(TRANSFORMER_DEVICE) \
	$(BIN)/python -m benchmarks.run -s validation -e regex+transformer --json \
	    --output logs/bench-regex+transformer-validation.json $(BENCH_ARGS)
# --- Cleanup ----------------------------------------------------------------
# First removes the virtualenv and packaging output, then deletes every
# __pycache__ directory found below the current directory. The virtualenv is
# removed first, so the subsequent `find` does not have to walk it.
clean: ## Remove virtualenv, build artifacts, and __pycache__ directories
	rm -rf \
	    $(VENV) \
	    build/ dist/ \
	    *.egg-info src/*.egg-info
	find . \
	    -type d -name __pycache__ \
	    -exec rm -rf {} +