Skip to content

Commit 325ee16

Browse files
committed
update build scripts to automate local validation of the oldest env, bump minimum tensorboardX version to 2.6.1
1 parent 1e49b94 commit 325ee16

File tree

6 files changed

+127
-36
lines changed

6 files changed

+127
-36
lines changed

.github/copilot-instructions.md

Lines changed: 55 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,9 @@ Use the provided build script for automated setup:
9797

9898
# Build from Lightning source
9999
./scripts/build_fts_env.sh --repo_home=${HOME}/repos/finetuning-scheduler --target_env_name=fts_latest --from-source="lightning:${HOME}/repos/lightning"
100+
101+
# Build with oldest compatible dependencies (Python 3.10, mirrors CI oldest matrix)
102+
./scripts/build_fts_env.sh --repo_home=${HOME}/repos/finetuning-scheduler --target_env_name=fts_oldest --oldest
100103
```
101104

102105
**Venv Location Options:**
@@ -266,17 +269,65 @@ python -m pytest src/finetuning_scheduler tests -v
266269

267270
### Coverage Collection
268271

272+
Use the `manage_standalone_processes.sh` harness with `--use-nohup` to run coverage collection in an isolated process. Output is written to `/tmp/gen_fts_coverage_<env>_<timestamp>.log`.
273+
274+
**Monitoring progress:**
275+
269276
```bash
270-
# Generate coverage with rebuild
271-
./scripts/gen_fts_coverage.sh --repo_home=${HOME}/repos/finetuning-scheduler --target_env_name=fts_latest
277+
# Tail the most recent coverage log
278+
tail -f "$(ls -rt /tmp/gen_fts_coverage_fts_* | tail -1)"
279+
```
280+
281+
**Common coverage commands:**
282+
283+
```bash
284+
# Generate coverage with rebuild (fts_latest with stable PyTorch)
285+
~/repos/finetuning-scheduler/scripts/manage_standalone_processes.sh --use-nohup \
286+
~/repos/finetuning-scheduler/scripts/gen_fts_coverage.sh \
287+
--repo_home=${HOME}/repos/finetuning-scheduler \
288+
--target_env_name=fts_latest \
289+
--venv-dir=/mnt/cache/${USER}/.venvs
272290

273291
# Generate coverage without rebuild
274-
./scripts/gen_fts_coverage.sh --repo_home=${HOME}/repos/finetuning-scheduler --target_env_name=fts_latest --no_rebuild_base
292+
~/repos/finetuning-scheduler/scripts/manage_standalone_processes.sh --use-nohup \
293+
~/repos/finetuning-scheduler/scripts/gen_fts_coverage.sh \
294+
--repo_home=${HOME}/repos/finetuning-scheduler \
295+
--target_env_name=fts_latest \
296+
--venv-dir=/mnt/cache/${USER}/.venvs \
297+
--no_rebuild_base
275298

276299
# Include experimental patch tests
277-
./scripts/gen_fts_coverage.sh --repo_home=${HOME}/repos/finetuning-scheduler --target_env_name=fts_latest --include_experimental
300+
~/repos/finetuning-scheduler/scripts/manage_standalone_processes.sh --use-nohup \
301+
~/repos/finetuning-scheduler/scripts/gen_fts_coverage.sh \
302+
--repo_home=${HOME}/repos/finetuning-scheduler \
303+
--target_env_name=fts_latest \
304+
--venv-dir=/mnt/cache/${USER}/.venvs \
305+
--include_experimental
306+
307+
# Generate coverage with oldest dependencies (Python 3.10, mirrors CI oldest matrix)
308+
~/repos/finetuning-scheduler/scripts/manage_standalone_processes.sh --use-nohup \
309+
~/repos/finetuning-scheduler/scripts/gen_fts_coverage.sh \
310+
--repo_home=${HOME}/repos/finetuning-scheduler \
311+
--target_env_name=fts_oldest \
312+
--venv-dir=/mnt/cache/${USER}/.venvs \
313+
--oldest
314+
315+
# Generate coverage with oldest deps, skip special tests (faster CI-like run)
316+
~/repos/finetuning-scheduler/scripts/manage_standalone_processes.sh --use-nohup \
317+
~/repos/finetuning-scheduler/scripts/gen_fts_coverage.sh \
318+
--repo_home=${HOME}/repos/finetuning-scheduler \
319+
--target_env_name=fts_oldest \
320+
--venv-dir=/mnt/cache/${USER}/.venvs \
321+
--oldest \
322+
--no-special
278323
```
279324

325+
**Flags:**
326+
327+
- `--oldest`: Uses Python 3.10 and `requirements/ci/requirements-oldest.txt` (mirrors CI oldest matrix)
328+
- `--no-special`: Skips the standalone and experimental-patch tests run via `special_tests.sh`, even if `--include_experimental` is set (faster iteration)
329+
- `--venv-dir`: Base directory for venvs (recommended: `/mnt/cache/${USER}/.venvs` for hardlink performance)
330+
280331
## Special Dependencies and Known Issues
281332

282333
### Lightning Dependency

pyproject.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ examples = [
6060
"transformers>=4.26.0",
6161
"scikit-learn>=1.2.0",
6262
"sentencepiece>=0.2.0",
63-
"tensorboardX>=2.5",
63+
"tensorboardX>=2.6.1", # 2.6.1+ regenerated protos for protobuf 4.x
6464
"tabulate>=0.8.9",
6565
"psutil>=5.9.0",
6666
# Include CLI deps
@@ -98,7 +98,7 @@ all = [
9898
"transformers>=4.26.0",
9999
"scikit-learn>=1.2.0",
100100
"sentencepiece>=0.2.0",
101-
"tensorboardX>=2.5",
101+
"tensorboardX>=2.6.1", # 2.6.1+ regenerated protos for protobuf 4.x
102102
"tabulate>=0.8.9",
103103
"psutil>=5.9.0",
104104
# ipynb

requirements/ci/requirements-oldest.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -391,6 +391,7 @@ packaging==23.2
391391
# mlflow
392392
# pytest
393393
# pytorch-lightning
394+
# tensorboardx
394395
# torchmetrics
395396
# transformers
396397
pandas==2.3.3
@@ -563,7 +564,6 @@ six==1.17.0
563564
# nbval
564565
# python-dateutil
565566
# querystring-parser
566-
# tensorboardx
567567
smmap==5.0.2
568568
# via gitdb
569569
sqlalchemy==2.0.44
@@ -578,7 +578,7 @@ tabulate==0.8.9
578578
# via
579579
# finetuning-scheduler (pyproject.toml)
580580
# databricks-cli
581-
tensorboardx==2.5
581+
tensorboardx==2.6.1
582582
# via finetuning-scheduler (pyproject.toml)
583583
terminado==0.18.1
584584
# via notebook

requirements/ci/requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# This file was autogenerated by uv via the following command:
2-
# uv pip compile /home/speediedan/repos/finetuning-scheduler/pyproject.toml --extra all --group dev --group test --output-file /tmp/tmp.wCfF6eMKZC --no-strip-extras --resolution highest --universal --python-version 3.10
2+
# uv pip compile /home/speediedan/repos/finetuning-scheduler/pyproject.toml --extra all --group dev --group test --output-file /tmp/tmp.flfkowf3vK --no-strip-extras --resolution highest --universal --python-version 3.10
33
aiohappyeyeballs==2.6.1
44
# via aiohttp
55
aiohttp==3.13.2

scripts/build_fts_env.sh

Lines changed: 31 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66
# ./build_fts_env.sh --repo_home=~/repos/finetuning-scheduler --target_env_name=fts_latest
77
# build latest with explicit venv directory (recommended for hardlink performance):
88
# ./build_fts_env.sh --repo_home=${HOME}/repos/finetuning-scheduler --target_env_name=fts_latest --venv-dir=/mnt/cache/${USER}/.venvs
9+
# build oldest (CI oldest build simulation with Python 3.10 and oldest deps):
10+
# ./build_fts_env.sh --repo_home=${HOME}/repos/finetuning-scheduler --target_env_name=fts_oldest --oldest
911
# build release:
1012
# ./build_fts_env.sh --repo_home=${HOME}/repos/fts-release --target_env_name=fts_release
1113
# build latest with torch test channel:
@@ -22,6 +24,7 @@ unset torch_test_channel
2224
unset uv_install_flags
2325
unset no_commit_pin
2426
unset venv_dir
27+
unset oldest
2528
declare -a from_source_specs=()
2629

2730
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
@@ -32,6 +35,7 @@ usage(){
3235
Usage: $0
3336
[ --repo_home input]
3437
[ --target_env_name input ]
38+
[ --oldest ] # Use oldest CI requirements (Python 3.10, requirements-oldest.txt)
3539
[ --torch_test_channel ] # Use PyTorch test/RC channel
3640
[ --uv_install_flags "flags" ]
3741
[ --no_commit_pin ]
@@ -43,6 +47,8 @@ Usage: $0
4347
# ./build_fts_env.sh --repo_home=\${HOME}/repos/finetuning-scheduler --target_env_name=fts_latest
4448
# build latest with explicit venv directory (recommended for hardlink performance):
4549
# ./build_fts_env.sh --repo_home=\${HOME}/repos/finetuning-scheduler --target_env_name=fts_latest --venv-dir=/mnt/cache/\${USER}/.venvs
50+
# build oldest (CI oldest build simulation):
51+
# ./build_fts_env.sh --repo_home=\${HOME}/repos/finetuning-scheduler --target_env_name=fts_oldest --oldest --venv-dir=/mnt/cache/\${USER}/.venvs
4652
# build release:
4753
# ./build_fts_env.sh --repo_home=\${HOME}/repos/fts-release --target_env_name=fts_release
4854
# build latest with torch test channel:
@@ -61,7 +67,7 @@ EOF
6167
exit 1
6268
}
6369

64-
args=$(getopt -o '' --long repo_home:,target_env_name:,torch_test_channel,uv_install_flags:,no_commit_pin,venv-dir:,from-source:,help -- "$@")
70+
args=$(getopt -o '' --long repo_home:,target_env_name:,oldest,torch_test_channel,uv_install_flags:,no_commit_pin,venv-dir:,from-source:,help -- "$@")
6571
if [[ $? -gt 0 ]]; then
6672
usage
6773
fi
@@ -72,6 +78,7 @@ do
7278
case $1 in
7379
--repo_home) repo_home=$2 ; shift 2 ;;
7480
--target_env_name) target_env_name=$2 ; shift 2 ;;
81+
--oldest) oldest=1 ; shift ;;
7582
--torch_test_channel) torch_test_channel=1 ; shift ;;
7683
--uv_install_flags) uv_install_flags=$2 ; shift 2 ;;
7784
--no_commit_pin) no_commit_pin=1 ; shift ;;
@@ -143,16 +150,26 @@ log_torch_version(){
143150
}
144151

145152
base_env_build(){
153+
# Use Python 3.10 for oldest builds, 3.12 for latest
146154
local python_version="python3.12"
155+
if [[ -n ${oldest} ]]; then
156+
python_version="python3.10"
157+
echo "Using Python 3.10 for oldest build"
158+
fi
147159

148160
clear_activate_env ${python_version}
149161

150-
# Check for torch nightly configuration
151-
read_torch_nightly_config
162+
# Check for torch nightly configuration (skip for oldest builds)
163+
if [[ -z ${oldest} ]]; then
164+
read_torch_nightly_config
165+
fi
152166

153167
# Handle PyTorch version selection (pre-install before FTS dependencies)
154-
# Priority: torch nightly from config > torch test channel > stable (via --torch-backend in fts_install)
155-
if [[ -n "${TORCH_NIGHTLY_VERSION}" ]]; then
168+
# Priority: oldest (stable from lock) > torch nightly from config > torch test channel > stable (via --torch-backend in fts_install)
169+
if [[ -n ${oldest} ]]; then
170+
# For oldest builds, torch is installed from requirements-oldest.txt (stable version)
171+
echo "Using torch stable from requirements-oldest.txt for oldest build"
172+
elif [[ -n "${TORCH_NIGHTLY_VERSION}" ]]; then
156173
# Nightly version from torch-nightly.txt with specified CUDA backend
157174
local cuda_target="${TORCH_NIGHTLY_CUDA:-cu128}" # Default to cu128 if not specified
158175
local torch_pkg="torch==${TORCH_NIGHTLY_VERSION}"
@@ -195,7 +212,13 @@ fts_install(){
195212
local req_file="${repo_home}/requirements/ci/requirements.txt"
196213
local torch_backend_flag=""
197214

198-
if [[ -n "${TORCH_NIGHTLY_VERSION}" || -n ${torch_test_channel} ]]; then
215+
# For oldest builds, use requirements-oldest.txt
216+
if [[ -n ${oldest} ]]; then
217+
req_file="${repo_home}/requirements/ci/requirements-oldest.txt"
218+
echo "Using oldest requirements file: ${req_file}"
219+
# Oldest builds use torch stable from lock file, need --torch-backend=auto
220+
torch_backend_flag="--torch-backend=auto"
221+
elif [[ -n "${TORCH_NIGHTLY_VERSION}" || -n ${torch_test_channel} ]]; then
199222
# Torch already pre-installed (nightly or test channel)
200223
# When nightly: requirements.txt already has torch filtered during lock generation
201224
# When test channel: filter at runtime
@@ -227,12 +250,11 @@ fts_install(){
227250
uv pip install ${uv_install_flags} -r requirements/docs.txt ${torch_backend_flag}
228251
log_torch_version "after docs requirements install"
229252

230-
# Install pip for mypy and pre-commit (they use pip internally)
253+
# Install pip for pre-commit (it uses pip internally)
231254
uv pip install pip
232255

233256
# Development setup
234-
rm -rf .mypy_cache
235-
mypy --install-types --non-interactive
257+
pyright -p pyproject.toml || echo "⚠ pyright check had issues, continuing..."
236258
pre-commit install
237259
git lfs install
238260

scripts/gen_fts_coverage.sh

Lines changed: 36 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@ unset uv_install_flags
1313
unset no_commit_pin
1414
unset venv_dir
1515
unset dry_run
16+
unset oldest
17+
unset no_special
1618
declare -a from_source_specs=()
1719

1820
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
@@ -23,9 +25,11 @@ usage(){
2325
Usage: $0
2426
[ --repo_home input]
2527
[ --target_env_name input ]
28+
[ --oldest ] # Use oldest CI requirements (Python 3.10, requirements-oldest.txt)
2629
[ --torch_dev_ver input ]
2730
[ --torch_test_channel ]
2831
[ --no_rebuild_base ]
32+
[ --no-special ] # Skip special tests (standalone/experimental), run only main test suite
2933
[ --include_experimental ]
3034
[ --uv_install_flags "flags" ]
3135
[ --no_commit_pin ]
@@ -35,26 +39,28 @@ Usage: $0
3539
[ --help ]
3640
Examples:
3741
# generate fts_latest coverage without rebuilding the fts_latest base environment:
38-
# ./gen_fts_coverage.sh --repo_home=${HOME}/repos/finetuning-scheduler --target_env_name=fts_latest --no_rebuild_base
42+
# ./gen_fts_coverage.sh --repo_home=\${HOME}/repos/finetuning-scheduler --target_env_name=fts_latest --no_rebuild_base
43+
# generate oldest CI build coverage (matches CI oldest matrix):
44+
# ./gen_fts_coverage.sh --repo_home=\${HOME}/repos/finetuning-scheduler --target_env_name=fts_oldest --oldest --no-special --venv-dir=/mnt/cache/\${USER}/.venvs
3945
# generate fts_latest coverage with a given torch_dev_version:
40-
# ./gen_fts_coverage.sh --repo_home=${HOME}/repos/finetuning-scheduler --target_env_name=fts_latest --torch_dev_ver=dev20240201
46+
# ./gen_fts_coverage.sh --repo_home=\${HOME}/repos/finetuning-scheduler --target_env_name=fts_latest --torch_dev_ver=dev20240201
4147
# generate fts_latest coverage, rebuilding base fts_latest with PyTorch test channel and run tests that require experimental patches:
42-
# ./gen_fts_coverage.sh --repo_home=${HOME}/repos/finetuning-scheduler --target_env_name=fts_latest --torch_test_channel --include_experimental
48+
# ./gen_fts_coverage.sh --repo_home=\${HOME}/repos/finetuning-scheduler --target_env_name=fts_latest --torch_test_channel --include_experimental
4349
# generate fts_release coverage, rebuilding the base fts_release environment with PyTorch stable channel:
44-
# ./gen_fts_coverage.sh --repo_home=${HOME}/repos/fts-release --target_env_name=fts_release
50+
# ./gen_fts_coverage.sh --repo_home=\${HOME}/repos/fts-release --target_env_name=fts_release
4551
# generate fts_release coverage, rebuilding the base fts_release environment with PyTorch test channel:
46-
# ./gen_fts_coverage.sh --repo_home=${HOME}/repos/fts-release --target_env_name=fts_release --torch_test_channel
52+
# ./gen_fts_coverage.sh --repo_home=\${HOME}/repos/fts-release --target_env_name=fts_release --torch_test_channel
4753
# generate fts_latest coverage with explicit venv directory (recommended for hardlink performance):
48-
# ./gen_fts_coverage.sh --repo_home=${HOME}/repos/finetuning-scheduler --target_env_name=fts_latest --venv-dir=/mnt/cache/\${USER}/.venvs
54+
# ./gen_fts_coverage.sh --repo_home=\${HOME}/repos/finetuning-scheduler --target_env_name=fts_latest --venv-dir=/mnt/cache/\${USER}/.venvs
4955
# generate fts_release coverage without using CI commit pinning:
50-
# ./gen_fts_coverage.sh --repo_home=${HOME}/repos/fts-release --target_env_name=fts_release --no_commit_pin
56+
# ./gen_fts_coverage.sh --repo_home=\${HOME}/repos/fts-release --target_env_name=fts_release --no_commit_pin
5157
# dry-run mode: setup environment and show what tests would run without executing them:
52-
# ./gen_fts_coverage.sh --repo_home=${HOME}/repos/finetuning-scheduler --target_env_name=fts_latest --torch_dev_ver=dev20240201 --dry-run
58+
# ./gen_fts_coverage.sh --repo_home=\${HOME}/repos/finetuning-scheduler --target_env_name=fts_latest --torch_dev_ver=dev20240201 --dry-run
5359
EOF
5460
exit 1
5561
}
5662

57-
args=$(getopt -o '' --long repo_home:,target_env_name:,torch_dev_ver:,torch_test_channel,no_rebuild_base,include_experimental,uv_install_flags:,no_commit_pin,venv-dir:,from-source:,dry-run,help -- "$@")
63+
args=$(getopt -o '' --long repo_home:,target_env_name:,oldest,torch_dev_ver:,torch_test_channel,no_rebuild_base,no-special,include_experimental,uv_install_flags:,no_commit_pin,venv-dir:,from-source:,dry-run,help -- "$@")
5864
if [[ $? -gt 0 ]]; then
5965
usage
6066
fi
@@ -65,9 +71,11 @@ do
6571
case $1 in
6672
--repo_home) repo_home=$2 ; shift 2 ;;
6773
--target_env_name) target_env_name=$2 ; shift 2 ;;
74+
--oldest) oldest=1 ; shift ;;
6875
--torch_dev_ver) torch_dev_ver=$2 ; shift 2 ;;
6976
--torch_test_channel) torch_test_channel=1 ; shift ;;
7077
--no_rebuild_base) no_rebuild_base=1 ; shift ;;
78+
--no-special) no_special=1 ; shift ;;
7179
--include_experimental) include_experimental=1 ; shift ;;
7280
--uv_install_flags) uv_install_flags=$2 ; shift 2 ;;
7381
--no_commit_pin) no_commit_pin=1 ; shift ;;
@@ -121,6 +129,11 @@ env_rebuild(){
121129
# Build command arguments array
122130
local -a cmd_args=("${repo_home}/scripts/build_fts_env.sh" "--repo_home=${repo_home}" "--target_env_name=$1")
123131

132+
# Add oldest flag if specified
133+
if [[ $oldest -eq 1 ]]; then
134+
cmd_args+=("--oldest")
135+
fi
136+
124137
# Add uv_install_flags if specified
125138
if [[ -n "${uv_install_flags}" ]]; then
126139
cmd_args+=("--uv_install_flags=${uv_install_flags}")
@@ -145,7 +158,7 @@ env_rebuild(){
145158
log_msg "Executing build command: ${cmd_args[*]}"
146159

147160
case $1 in
148-
fts_latest)
161+
fts_latest|fts_oldest)
149162
if [[ -n ${torch_dev_ver} ]]; then
150163
cmd_args+=("--torch_dev_ver=${torch_dev_ver}")
151164
elif [[ $torch_test_channel -eq 1 ]]; then
@@ -212,18 +225,23 @@ collect_env_coverage(){
212225
fi
213226

214227
case $1 in
215-
fts_latest|fts_release|$all_supported_pattern)
228+
fts_latest|fts_oldest|fts_release|$all_supported_pattern)
216229
log_msg "Erasing previous coverage data"
217230
python -m coverage erase
218231
log_msg "Running main test suite with coverage"
219232
python -m coverage run --append --source src/finetuning_scheduler -m pytest src/finetuning_scheduler tests -v 2>&1 >> $coverage_session_log
220-
log_msg "Running standalone tests (pattern: test_f)"
221-
(./tests/special_tests.sh --mark_type=standalone --filter_pattern='test_f' --log_file=${coverage_session_log} 2>&1 >> ${temp_special_log}) > /dev/null
222-
if [[ $include_experimental -eq 1 ]]; then
223-
log_msg "Running tests that require experimental patches using $1"
224-
(./tests/special_tests.sh --mark_type=exp_patch --filter_pattern='test_f' --log_file=${coverage_session_log} --experiment_patch_mask="1 0 0 1" 2>&1 >> ${temp_special_log}) > /dev/null
233+
# Skip special tests if --no-special flag is set
234+
if [[ $no_special -eq 1 ]]; then
235+
log_msg "Skipping special tests (--no-special flag set)"
225236
else
226-
log_msg "Skipping tests that require experimental patches."
237+
log_msg "Running standalone tests (pattern: test_f)"
238+
(./tests/special_tests.sh --mark_type=standalone --filter_pattern='test_f' --log_file=${coverage_session_log} 2>&1 >> ${temp_special_log}) > /dev/null
239+
if [[ $include_experimental -eq 1 ]]; then
240+
log_msg "Running tests that require experimental patches using $1"
241+
(./tests/special_tests.sh --mark_type=exp_patch --filter_pattern='test_f' --log_file=${coverage_session_log} --experiment_patch_mask="1 0 0 1" 2>&1 >> ${temp_special_log}) > /dev/null
242+
else
243+
log_msg "Skipping tests that require experimental patches."
244+
fi
227245
fi
228246
;;
229247
*)
@@ -255,7 +273,7 @@ fi
255273
log_msg "Generating base coverage for the FTS env ${target_env_name}"
256274
env_rebuild_collect "${target_env_name}"
257275
case ${target_env_name} in
258-
fts_latest|$supported_fts_latest_pattern)
276+
fts_latest|fts_oldest|$supported_fts_latest_pattern)
259277
log_msg "No env-specific additional coverage currently required for ${target_env_name}"
260278
;;
261279
fts_release|$supported_fts_release_pattern)

0 commit comments

Comments
 (0)