Description
[experimental]
context_parallel_degree = 1
pipeline_parallel_degree = 4
pipeline_parallel_microbatches = 8
pipeline_parallel_schedule = "ZBVZeroBubble"
World size is 8.
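
For context: ZBVZeroBubble is a V-shaped schedule that places two pipeline stages on each pipeline rank, so pipeline_parallel_degree = 4 implies 8 stage ids (0-7). Below is a minimal sketch of the V-style rank-to-stage assignment I would expect; the mapping formula is my assumption based on the zero-bubble "V" layout, not code taken from torchtitan or PyTorch:

# Sketch: V-shaped stage placement. Assumption: rank r in a pipeline
# group of size p owns stages (r, 2*p - 1 - r), per the zero-bubble
# "V" layout; this is not torchtitan/PyTorch source code.
def v_schedule_stages(rank: int, pp_degree: int) -> tuple[int, int]:
    return (rank, 2 * pp_degree - 1 - rank)

pp_degree = 4
for rank in range(pp_degree):
    print(f"rank {rank}: stages {v_schedule_stages(rank, pp_degree)}")
# rank 0: stages (0, 7)
# rank 1: stages (1, 6)
# rank 2: stages (2, 5)
# rank 3: stages (3, 4)

Under that mapping, stage 6 belongs to rank 1, so rank 3's stage table should never be asked for key 6, which is exactly the KeyError in the traceback below.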
[rank3]:[rank3]: Traceback (most recent call last):
[rank3]:[rank3]: File "code/torchtitan/train.py", line 429, in <module>
[rank3]:[rank3]: main(config)
[rank3]:[rank3]: File "miniconda3/envs/torchtitan/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
[rank3]:[rank3]: return f(*args, **kwargs)
[rank3]:[rank3]: File "/code/torchtitan/train.py", line 290, in main
[rank3]:[rank3]: pp_schedule.step()
[rank3]:[rank3]: File "/miniconda3/envs/torchtitan/lib/python3.10/site-packages/torch/distributed/pipelining/schedules.py", line 1194, in step
[rank3]:[rank3]: self._step_microbatches(args_split, kwargs_split, targets_split, losses)
[rank3]:[rank3]: File "/miniconda3/envs/torchtitan/lib/python3.10/site-packages/torch/distributed/pipelining/schedules.py", line 1372, in _step_microbatches
[rank3]:[rank3]: raise e
[rank3]:[rank3]: File "/miniconda3/envs/torchtitan/lib/python3.10/site-packages/torch/distributed/pipelining/schedules.py", line 1251, in _step_microbatches
[rank3]:[rank3]: stage = stage_index_to_stage[stage_index]
[rank3]:[rank3]: KeyError: 6
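
The lookup that fails builds a rank-local map from stage index to stage object and then executes actions from the compiled plan. A simplified, paraphrased illustration of the failure mode (not the actual schedules.py code; the stage values here are placeholders):

# Paraphrased illustration of the failing lookup, not the actual
# torch.distributed.pipelining source. Suppose rank 3 only holds
# its local stages (3 and 4), but the plan hands it action 6F0:
stage_index_to_stage = {3: "stage3", 4: "stage4"}  # rank-local stages
action_stage_index = 6  # from action 6F0 in the error dump
try:
    stage = stage_index_to_stage[action_stage_index]
except KeyError as e:
    print(f"KeyError: {e}")  # KeyError: 6, matching the traceback

So either the actions for stage 6 are being routed to the wrong rank, or the stages handed to the schedule do not cover the stage ids the plan references.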
[rank2]:[rank2]:E0103 05:54:18.993137 491746 site-packages/torch/distributed/pipelining/schedules.py:1358] [Rank 1] pipeline schedule ScheduleZBVZeroBubble caught the following exception at time_step 6 when running action 6F0
[rank2]:[rank2]:E0103 05:54:18.996414 491746 site-packages/torch/distributed/pipelining/schedules.py:1366] Rank 0 Rank 1 Rank 2 Rank 3
[rank2]:[rank2]:E0103 05:54:18.996414 491746 site-packages/torch/distributed/pipelining/schedules.py:1366] Step 00: 0F0
[rank2]:[rank2]:E0103 05:54:18.996414 491746 site-packages/torch/distributed/pipelining/schedules.py:1366] Step 01: 0F1 1F0
[rank2]:[rank2]:E0103 05:54:18.996414 491746 site-packages/torch/distributed/pipelining/schedules.py:1366] Step 02: 0F2 1F1 2F0
[rank2]:[rank2]:E0103 05:54:18.996414 491746 site-packages/torch/distributed/pipelining/schedules.py:1366] Step 03: 0F3 1F2 2F1 3F0
[rank2]:[rank2]:E0103 05:54:18.996414 491746 site-packages/torch/distributed/pipelining/schedules.py:1366] Step 04: 0F4 1F3 2F2 4F0
[rank2]:[rank2]:E0103 05:54:18.996414 491746 site-packages/torch/distributed/pipelining/schedules.py:1366] Step 05: 0F5 1F4 5F0 3F1
[rank2]:[rank2]:E0103 05:54:18.996414 491746 site-packages/torch/distributed/pipelining/schedules.py:1366] Step 06: 0F6 6F0 2F3 4F1 <-- ERROR HERE
[rank2]:[rank2]:E0103 05:54:18.996414 491746 site-packages/torch/distributed/pipelining/schedules.py:1366] Step 07: 7F0 1F5 5F1 3F2
[rank2]:[rank2]:E0103 05:54:18.996414 491746 site-packages/torch/distributed/pipelining/schedules.py:1366] Step 08: 7I0 6F1 2F4 4F2
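
Two things stand out in the dump: the failing action at step 06 is 6F0 under the Rank 1 column, and the error header says "[Rank 1]" even though the KeyError is raised on rank 3. With the V mapping sketched above, 6F0 should execute on rank 1, so this looks like the action columns and the executing ranks getting out of sync, or stage construction not matching the schedule's expected placement. A pre-flight check along these lines could confirm which it is before pp_schedule.step(); the helper below is hypothetical, not a torchtitan or PyTorch API:

# Hypothetical pre-flight check (not a torchtitan/PyTorch API): verify
# that every stage id the V layout assigns to this rank was actually
# constructed, assuming 2 stages per rank for ZBVZeroBubble.
def check_local_stages(rank: int, pp_degree: int, built_stage_ids: set[int]) -> None:
    expected = {rank, 2 * pp_degree - 1 - rank}
    missing = expected - built_stage_ids
    if missing:
        raise RuntimeError(
            f"rank {rank}: schedule expects stages {sorted(expected)}, "
            f"but stages {sorted(missing)} were not built"
        )

try:
    check_local_stages(rank=3, pp_degree=4, built_stage_ids={3})
except RuntimeError as e:
    print(e)  # rank 3: schedule expects stages [3, 4], but stages [4] were not built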