Skip to content

Commit 4a5aa67

Browse files
committed
bump lightning dev sha, set pytorch 2.9 as base docker image, pin jsonargparse < 4.42.0 until parsing issue addressed, disable cpu offload config with certain FSDP 2 tests on PT 2.9+ until upstream issues addressed, use new sphinx theme to address mobile dev table of contents issue
1 parent 35dc7c4 commit 4a5aa67

File tree

14 files changed

+30
-20
lines changed

14 files changed

+30
-20
lines changed

.azure-pipelines/gpu-tests.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,7 @@ jobs:
104104
python -m coverage html
105105
curl -Os https://uploader.codecov.io/latest/linux/codecov
106106
chmod +x codecov
107-
./codecov --token= $CODECOV_TOK --commit=$(Build.SourceVersion) --flags=gpu,pytest --name="GPU-coverage" --env=linux,azure
107+
./codecov -t $CODECOV_TOK --commit=$(Build.SourceVersion) --flags=gpu,pytest --name="GPU-coverage" --env=linux,azure
108108
env:
109109
CODECOV_TOK: $(CODECOV_TOKEN) # explicit mapping required for secret azure pipeline variables
110110
displayName: 'Statistics'

dockers/base-cuda/Dockerfile

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -87,11 +87,11 @@ RUN \
8787
else \
8888
# or target a specific cuda build, by specifying a particular index url w/...
8989
# ... default channel
90-
#pip install torch torchvision --index-url https://download.pytorch.org/whl/cu128; \
90+
pip install torch torchvision --index-url https://download.pytorch.org/whl/cu128; \
9191
# ... pytorch patch version
9292
# pip install torch==1.11.1+cu113 torchvision==0.11.3+cu113 -f https://download.pytorch.org/whl/cu113/torch_stable.html; \
9393
# ... pytorch nightly dev version
94-
pip install --pre torch==2.9.0.dev20250811 --index-url https://download.pytorch.org/whl/nightly/cu128; \
94+
# pip install --pre torch==2.9.0.dev20250811 --index-url https://download.pytorch.org/whl/nightly/cu128; \
9595
# ... test channel
9696
# pip install --pre torch==2.9.0 --index-url https://download.pytorch.org/whl/test/cu128; \
9797
fi && \

dockers/docker_images_release.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ maybe_build(){
4141

4242
build_eval(){
4343
# latest PyTorch image supported by release
44-
declare -A iv=(["cuda"]="12.8.1" ["python"]="3.12" ["pytorch"]="2.8.0" ["lightning"]="2.5" ["cust_build"]="0")
44+
declare -A iv=(["cuda"]="12.8.1" ["python"]="3.12" ["pytorch"]="2.9.0" ["lightning"]="2.5" ["cust_build"]="0")
4545
export latest_pt="base-cu${iv["cuda"]}-py${iv["python"]}-pt${iv["pytorch"]}-pl${iv["lightning"]}"
4646
export latest_azpl="py${iv["python"]}-pt${iv["pytorch"]}-pl${iv["lightning"]}-azpl-init"
4747
maybe_build iv "${latest_pt}" "${latest_azpl}"

requirements/cli.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
1-
jsonargparse[signatures]>=4.27.7
1+
jsonargparse[signatures]>=4.27.7,<4.42.0
22
omegaconf>=2.1.0
33
hydra-core>=1.1.0

requirements/docs.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,4 +11,4 @@ sphinx-togglebutton>=0.2
1111
sphinx-copybutton>=0.3
1212
typing-extensions # already in `base.txt` but the docs CI job does not install it
1313
jinja2>=3.0.0,<3.1.0
14-
git+https://github.com/speediedan/lightning_sphinx_theme.git@3f124e96e7f035c3391db2a3d601faf11530cd81#egg=pt_lightning_sphinx_theme
14+
git+https://github.com/speediedan/lightning_sphinx_theme.git@ad28149a5c27ed4bfb3f812760e179202024b2c9#egg=pt_lightning_sphinx_theme

requirements/lightning_pin.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
8d1a73475e0f11ed3b3c5907da54fc993717d492
1+
c943c05c1425fb73cdaef8dee1bca65dabff626b

scripts/build_fts_env.sh

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -98,9 +98,9 @@ base_env_build(){
9898
if [[ -n ${torchvision_dev_ver} ]]; then
9999
torchvision_dev_ver=${torch_dev_ver}
100100
fi
101-
pip install ${pip_install_flags} --pre torch==2.9.0.${torch_dev_ver} --index-url https://download.pytorch.org/whl/nightly/cu128
101+
pip install ${pip_install_flags} --pre torch==2.10.0.${torch_dev_ver} --index-url https://download.pytorch.org/whl/nightly/cu128
102102
elif [[ $torch_test_channel -eq 1 ]]; then
103-
pip install ${pip_install_flags} --pre torch==2.9.0 --index-url https://download.pytorch.org/whl/test/cu128
103+
pip install ${pip_install_flags} --pre torch==2.10.0 --index-url https://download.pytorch.org/whl/test/cu128
104104
else
105105
pip install ${pip_install_flags} torch torchvision --index-url https://download.pytorch.org/whl/cu128
106106
fi
@@ -129,6 +129,10 @@ base_env_build(){
129129
clear_activate_env python3.12
130130
pip install ${pip_install_flags} torch==2.8.0 torchvision --index-url https://download.pytorch.org/whl/cu128
131131
;;
132+
fts_latest_pt2_9_x | fts_release_pt2_9_x)
133+
clear_activate_env python3.12
134+
pip install ${pip_install_flags} torch==2.9.0 torchvision --index-url https://download.pytorch.org/whl/cu128
135+
;;
132136
*)
133137
echo "no matching environment found, exiting..."
134138
exit 1

scripts/gen_fts_coverage.sh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -75,8 +75,8 @@ tmp_coverage_dir="/tmp"
7575
coverage_session_log="${tmp_coverage_dir}/gen_fts_coverage_${target_env_name}_${d}.log"
7676

7777
# Define arrays of supported versions
78-
supported_fts_latest=(fts_latest_pt2_5_x fts_latest_pt2_6_x fts_latest_pt2_7_x fts_latest_pt2_8_x)
79-
supported_fts_release=(fts_release_pt2_5_x fts_release_pt2_6_x fts_release_pt2_7_x fts_release_pt2_8_x)
78+
supported_fts_latest=(fts_latest_pt2_5_x fts_latest_pt2_6_x fts_latest_pt2_7_x fts_latest_pt2_8_x fts_latest_pt2_9_x)
79+
supported_fts_release=(fts_release_pt2_5_x fts_release_pt2_6_x fts_release_pt2_7_x fts_release_pt2_8_x fts_release_pt2_9_x)
8080

8181
# Enable extended globbing for pattern matching
8282
shopt -s extglob

src/finetuning_scheduler/strategy_adapters/model_parallel.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -265,7 +265,8 @@ def _resolve_cfg_aliases(config_dict: Dict) -> Dict:
265265
for k, v in list(config_dict.items()):
266266
# currently adding a `cpu_offload_policy` option alias for convenience
267267
# open a GitHub issue if you think other policy alias options would be useful
268-
if k == "cpu_offload_policy":
268+
# TODO: re-enable coverage below when the upstream cpu offload issue is addressed
269+
if k == "cpu_offload_policy": # pragma: no cover
269270
config_dict["offload_policy"] = CPUOffloadPolicy(**v)
270271
del config_dict[k]
271272
elif k == "act_ckpt":
@@ -283,7 +284,8 @@ def _validate_fsdp_plan(self) -> None:
283284
"""
284285
if not self.fsdp_plan:
285286
return
286-
if self.fsdp_default_kwargs:
287+
# TODO: re-enable coverage below when the upstream cpu offload issue is addressed
288+
if self.fsdp_default_kwargs: # pragma: no cover
287289
self.fsdp_default_kwargs = ModelParallelStrategyAdapter._resolve_cfg_aliases(self.fsdp_default_kwargs)
288290
named_modules = dict(self.pl_module.named_modules()).keys()
289291
resolved_modules: Dict[str, Dict] = {}

src/fts_examples/model_parallel/config/fts_fsdp_auto_plan.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ trainer:
99
fsdp_default_kwargs:
1010
reshard_after_forward: True # default value of a normal ``fully_shard`` kwarg
1111
act_ckpt: ['composable'] # use composable AC with default kwargs
12-
cpu_offload_policy: {} # apply default cpu offload policy
12+
# cpu_offload_policy: {} # apply default cpu offload policy
1313
fsdp_plan: {'model.output': {}, 'model.layers.\d*$': {}}
1414
strategy:
1515
init_args:

0 commit comments

Comments
 (0)