
Commit 5764650

profiling ops on xpu (#2249)
1 parent 90fd2d3 commit 5764650

13 files changed: +34 −3 lines changed
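Every recipe change below applies the same one-line guard: `torch.cuda.memory._record_memory_history()` is only called when the active device is CUDA, so runs on XPU (or any other non-CUDA device) no longer touch the CUDA memory recorder. A condensed sketch of the pattern as it appears in each recipe's `train()` loop (names taken from the diffs below; recipe-specific conditions elided):

    # Start recording CUDA allocation history once wait + warmup steps have passed.
    if (
        curr_epoch == 0
        and self.profiler_profile_memory
        and idx == self.profiler_wait_steps + self.profiler_warmup_steps
        and self._device.type == "cuda"  # guard added by this commit
    ):
        torch.cuda.memory._record_memory_history()

    # ... run the training step ...

    # Stop recording once the active profiling window has elapsed
    # (other recipe-specific conditions elided).
    if (
        idx
        == self.profiler_wait_steps
        + self.profiler_warmup_steps
        + self.profiler_active_steps
        and self._device.type == "cuda"  # guard added by this commit
    ):
        torch.cuda.memory._record_memory_history(enabled=None)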

recipes/dev/early_exit_finetune_distributed.py

+2
@@ -870,6 +870,7 @@ def train(self) -> None:
                 and curr_epoch == 0
                 and self.profiler_profile_memory
                 and idx == self.profiler_wait_steps + self.profiler_warmup_steps
+                and self._device.type == "cuda"
             ):
                 torch.cuda.memory._record_memory_history()

@@ -1019,6 +1020,7 @@ def train(self) -> None:
                 == self.profiler_wait_steps
                 + self.profiler_warmup_steps
                 + self.profiler_active_steps
+                and self._device.type == "cuda"
             ):
                 torch.cuda.memory._record_memory_history(enabled=None)

recipes/full_finetune_distributed.py

+2
@@ -723,6 +723,7 @@ def train(self) -> None:
                 and curr_epoch == 0
                 and self.profiler_profile_memory
                 and idx == self.profiler_wait_steps + self.profiler_warmup_steps
+                and self._device.type == "cuda"
             ):
                 torch.cuda.memory._record_memory_history()

@@ -846,6 +847,7 @@ def train(self) -> None:
                 == self.profiler_wait_steps
                 + self.profiler_warmup_steps
                 + self.profiler_active_steps
+                and self._device.type == "cuda"
             ):
                 torch.cuda.memory._record_memory_history(enabled=None)

recipes/full_finetune_single_device.py

+2 −1

@@ -685,9 +685,9 @@ def train(self) -> None:
                 curr_epoch == 0
                 and self.profiler_profile_memory
                 and idx == self.profiler_wait_steps + self.profiler_warmup_steps
+                and self._device.type == "cuda"
             ):
                 torch.cuda.memory._record_memory_history()
-
             utils.batch_to_device(batch, self._device)

             # Calculate the number of unmasked tokens in the current batch

@@ -766,6 +766,7 @@ def train(self) -> None:
                 == self.profiler_wait_steps
                 + self.profiler_warmup_steps
                 + self.profiler_active_steps
+                and self._device.type == "cuda"
             ):
                 torch.cuda.memory._record_memory_history(enabled=None)

recipes/knowledge_distillation_distributed.py

+1
@@ -846,6 +846,7 @@ def train(self) -> None:
                 and curr_epoch == 0
                 and self.profiler_profile_memory
                 and idx == self.profiler_wait_steps + self.profiler_warmup_steps
+                and self._device.type == "cuda"
             ):
                 torch.cuda.memory._record_memory_history()

recipes/knowledge_distillation_single_device.py

+2
@@ -702,6 +702,7 @@ def train(self) -> None:
                 curr_epoch == 0
                 and self.profiler_profile_memory
                 and idx == self.profiler_wait_steps + self.profiler_warmup_steps
+                and self._device.type == "cuda"
             ):
                 torch.cuda.memory._record_memory_history()

@@ -784,6 +785,7 @@ def train(self) -> None:
                 == self.profiler_wait_steps
                 + self.profiler_warmup_steps
                 + self.profiler_active_steps
+                and self._device.type == "cuda"
             ):
                 torch.cuda.memory._record_memory_history(enabled=None)

recipes/lora_finetune_distributed.py

+2
@@ -776,6 +776,7 @@ def train(self) -> None:
                 and curr_epoch == 0
                 and self.profiler_profile_memory
                 and idx == self.profiler_wait_steps + self.profiler_warmup_steps
+                and self._device.type == "cuda"
             ):
                 torch.cuda.memory._record_memory_history()

@@ -880,6 +881,7 @@ def train(self) -> None:
                 == self.profiler_wait_steps
                 + self.profiler_warmup_steps
                 + self.profiler_active_steps
+                and self._device.type == "cuda"
             ):
                 torch.cuda.memory._record_memory_history(enabled=None)

recipes/lora_finetune_distributed_multi_dataset.py

+2
@@ -805,6 +805,7 @@ def train(self) -> None:
                 and curr_epoch == 0
                 and self.profiler_profile_memory
                 and idx == self.profiler_wait_steps + self.profiler_warmup_steps
+                and self._device.type == "cuda"
             ):
                 torch.cuda.memory._record_memory_history()

@@ -909,6 +910,7 @@ def train(self) -> None:
                 == self.profiler_wait_steps
                 + self.profiler_warmup_steps
                 + self.profiler_active_steps
+                and self._device.type == "cuda"
             ):
                 torch.cuda.memory._record_memory_history(enabled=None)

recipes/lora_finetune_single_device.py

+2
@@ -688,6 +688,7 @@ def train(self) -> None:
                 curr_epoch == 0
                 and self.profiler_profile_memory
                 and idx == self.profiler_wait_steps + self.profiler_warmup_steps
+                and self._device.type == "cuda"
             ):
                 torch.cuda.memory._record_memory_history()

@@ -761,6 +762,7 @@ def train(self) -> None:
                 == self.profiler_wait_steps
                 + self.profiler_warmup_steps
                 + self.profiler_active_steps
+                and self._device.type == "cuda"
             ):
                 torch.cuda.memory._record_memory_history(enabled=None)

recipes/ppo_full_finetune_single_device.py

+2
@@ -935,6 +935,7 @@ def train(self) -> None:
                 curr_epoch == 0
                 and self.profiler_profile_memory
                 and idx == self.profiler_wait_steps + self.profiler_warmup_steps
+                and self._device.type == "cuda"
             ):
                 torch.cuda.memory._record_memory_history()

@@ -1034,6 +1035,7 @@ def train(self) -> None:
                 == self.profiler_wait_steps
                 + self.profiler_warmup_steps
                 + self.profiler_active_steps
+                and self._device.type == "cuda"
             ):
                 torch.cuda.memory._record_memory_history(enabled=None)

recipes/qat_distributed.py

+2
@@ -773,6 +773,7 @@ def train(self) -> None:
                 and curr_epoch == 0
                 and self.profiler_profile_memory
                 and idx == self.profiler_wait_steps + self.profiler_warmup_steps
+                and self._device.type == "cuda"
             ):
                 torch.cuda.memory._record_memory_history()

@@ -913,6 +914,7 @@ def train(self) -> None:
                 == self.profiler_wait_steps
                 + self.profiler_warmup_steps
                 + self.profiler_active_steps
+                and self._device.type == "cuda"
             ):
                 torch.cuda.memory._record_memory_history(enabled=None)

recipes/qat_lora_finetune_distributed.py

+2
@@ -820,6 +820,7 @@ def train(self) -> None:
                 and curr_epoch == 0
                 and self.profiler_profile_memory
                 and idx == self.profiler_wait_steps + self.profiler_warmup_steps
+                and self._device.type == "cuda"
             ):
                 torch.cuda.memory._record_memory_history()

@@ -924,6 +925,7 @@ def train(self) -> None:
                 == self.profiler_wait_steps
                 + self.profiler_warmup_steps
                 + self.profiler_active_steps
+                and self._device.type == "cuda"
             ):
                 torch.cuda.memory._record_memory_history(enabled=None)

tests/torchtune/training/test_profiler.py

+5
@@ -39,6 +39,7 @@ def profiler_cfg():
     enabled: True
     cpu: True
     cuda: True
+    xpu: True
     profile_memory: False
     with_stack: False
     record_shapes: True

@@ -92,6 +93,7 @@ def reference_profiler_basic():
         activities=[
             torch.profiler.ProfilerActivity.CPU,
             torch.profiler.ProfilerActivity.CUDA,
+            torch.profiler.ProfilerActivity.XPU,
         ],
         schedule=torch.profiler.schedule(wait=3, warmup=1, active=1, repeat=0),
         profile_memory=False,

@@ -107,6 +109,7 @@ def reference_profiler_full():
         activities=[
             torch.profiler.ProfilerActivity.CPU,
             torch.profiler.ProfilerActivity.CUDA,
+            torch.profiler.ProfilerActivity.XPU,
         ],
         schedule=torch.profiler.schedule(wait=3, warmup=1, active=1, repeat=0),
         profile_memory=True,

@@ -194,10 +197,12 @@ def test_default_activities(profiler_cfg):
     # Test setup automatically adds CPU + CUDA tracing if neither CPU nor CUDA is specified
     cfg.pop("cpu")
     cfg.pop("cuda")
+    cfg.pop("xpu")
     profiler, updated_cfg = _setup_profiler(cfg)
     assert profiler.activities == DEFAULT_PROFILER_ACTIVITIES
     assert updated_cfg.cpu is True
     assert updated_cfg.cuda is True
+    assert updated_cfg.xpu is True


 def test_default_output_dir(profiler_cfg):

torchtune/training/_profiler.py

+8 −2

@@ -27,6 +27,7 @@
 DEFAULT_PROFILER_ACTIVITIES = {
     torch.profiler.ProfilerActivity.CPU,
     torch.profiler.ProfilerActivity.CUDA,
+    torch.profiler.ProfilerActivity.XPU,
 }

 DEFAULT_SCHEDULE: dict = {

@@ -111,7 +112,7 @@ def trace_handler(
     log.info(f"Finished dumping traces in {time.monotonic() - begin:.2f} seconds")

     # Memory timeline sometimes fails to export
-    if prof.profile_memory:
+    if prof.profile_memory and torch.cuda.is_available():
         if rank == 0:
             try:
                 prof.export_memory_timeline(

@@ -185,6 +186,7 @@ def setup_torch_profiler(
     enabled: bool = False,
     cpu: bool = True,
     cuda: bool = True,
+    xpu: bool = True,
     profile_memory: bool = DEFAULT_TRACE_OPTS["profile_memory"],
     with_stack: bool = DEFAULT_TRACE_OPTS["with_stack"],
     record_shapes: bool = DEFAULT_TRACE_OPTS["record_shapes"],

@@ -252,6 +254,7 @@ def setup_torch_profiler(
         enabled (bool): Enable pytorch profiler. Default is False.
         cpu (bool): Enable cpu profiling. Default is True.
         cuda (bool): Enable cuda profiling. Default is True.
+        xpu (bool): Enable xpu profiling. Default is True.
         profile_memory (bool): Profile memory usage. Default is False.
         with_stack (bool): Profile stack. Default is False.
         record_shapes (bool): Record shapes. Default is True.

@@ -276,10 +279,12 @@ def setup_torch_profiler(
         activities.append(torch.profiler.ProfilerActivity.CPU)
     if cuda:
         activities.append(torch.profiler.ProfilerActivity.CUDA)
+    if xpu:
+        activities.append(torch.profiler.ProfilerActivity.XPU)
     if len(activities) == 0:
         _warn("No activities specified, defaulting to CPU + CUDA")
         activities = DEFAULT_PROFILER_ACTIVITIES
-        cpu = cuda = True
+        cpu = cuda = xpu = True

     # Check for schedule
     # 1) If no schedule is provided, set to DEFAULT_SCHEDULE

@@ -372,6 +377,7 @@ def setup_torch_profiler(
         "output_dir": output_dir,
         "cpu": cpu,
         "cuda": cuda,
+        "xpu": xpu,
         "profile_memory": profile_memory,
         "with_stack": with_stack,
         "record_shapes": record_shapes,