Skip to content

Commit e23ac19

Browse files
[Bench] Add KernelSubmitGraphVllmMock benchmarks (intel#21412)
Also re-enable the SYCL graph benchmarks on PVC, as the issue is gone now. --------- Signed-off-by: luszczewskakasia1 <katarzyna.luszczewska@intel.com> Co-authored-by: Łukasz Stolarczuk <lukasz.stolarczuk@intel.com>
1 parent 0da22e2 commit e23ac19

3 files changed

Lines changed: 187 additions & 107 deletions

File tree

devops/scripts/benchmarks/benches/compute/compute.py

Lines changed: 129 additions & 81 deletions
Original file line numberDiff line numberDiff line change
@@ -525,90 +525,11 @@ def createTorchLinearKernelSizeBench(variant_name: str, **kwargs):
525525
),
526526
]
527527

528-
# Graph benchmarks segfault on pvc
529-
device_arch = getattr(options, "device_architecture", "")
530-
if not ("pvc" in device_arch):
531-
# Add TorchGraphSingleQueue benchmarks
532-
for runtime in filter(lambda x: x != RUNTIMES.UR, RUNTIMES):
533-
for profiler_type, kernel_name in product(
534-
list(PROFILERS), list(KERNEL_NAME)
535-
):
536-
537-
def createTorchGraphSingleQueueBench(variant_name: str, **kwargs):
538-
return TorchGraphSingleQueue(
539-
self,
540-
runtime,
541-
variant_name,
542-
profiler_type,
543-
fixed_args={
544-
"KernelWGCount": 512,
545-
"KernelWGSize": 256,
546-
"Profiling": 0,
547-
"UseEvents": 0,
548-
},
549-
**kwargs,
550-
)
551-
552-
benches += [
553-
createTorchGraphSingleQueueBench(
554-
"small",
555-
KernelName=kernel_name.value,
556-
KernelsPerQueue=10,
557-
KernelBatchSize=10,
558-
),
559-
createTorchGraphSingleQueueBench(
560-
"medium",
561-
KernelName=kernel_name.value,
562-
KernelsPerQueue=32,
563-
KernelBatchSize=32,
564-
),
565-
createTorchGraphSingleQueueBench(
566-
"large",
567-
KernelName=kernel_name.value,
568-
KernelsPerQueue=64,
569-
KernelBatchSize=64,
570-
),
571-
]
572-
573-
# Add TorchGraphMultiQueue benchmarks
574-
for runtime in filter(lambda x: x != RUNTIMES.UR, RUNTIMES):
575-
for profiler_type in list(PROFILERS):
576-
577-
def createTorchGraphMultiQueueBench(variant_name: str, **kwargs):
578-
return TorchGraphMultiQueue(
579-
self,
580-
runtime,
581-
variant_name,
582-
profiler_type,
583-
fixed_args={
584-
"KernelWGCount": 512,
585-
"KernelWGSize": 256,
586-
"Profiling": 0,
587-
"UseEvents": 0,
588-
},
589-
**kwargs,
590-
)
591-
592-
benches += [
593-
createTorchGraphMultiQueueBench(
594-
"small",
595-
KernelsPerQueue=10,
596-
),
597-
createTorchGraphMultiQueueBench(
598-
"medium",
599-
KernelsPerQueue=32,
600-
),
601-
createTorchGraphMultiQueueBench(
602-
"large",
603-
KernelsPerQueue=64,
604-
),
605-
]
606-
607-
# Add TorchSubmitEventRecordWait benchmarks
528+
# Add TorchEventRecordWait benchmarks
608529
for runtime in filter(lambda x: x != RUNTIMES.UR, RUNTIMES):
609530
for profiler_type in list(PROFILERS):
610531
benches.append(
611-
TorchSubmitEventRecordWait(
532+
TorchEventRecordWait(
612533
self,
613534
runtime,
614535
"medium",
@@ -619,6 +540,133 @@ def createTorchGraphMultiQueueBench(variant_name: str, **kwargs):
619540
)
620541
)
621542

543+
#
544+
# Note: Graph benchmarks segfault on PVC with the Level Zero (L0) runtime, so that combination is skipped below
545+
#
546+
device_arch = getattr(options, "device_architecture", "")
547+
548+
# Add TorchGraphSingleQueue benchmarks
549+
for runtime in filter(lambda x: x != RUNTIMES.UR, RUNTIMES):
550+
if "pvc" in device_arch and runtime == RUNTIMES.LEVEL_ZERO:
551+
continue
552+
553+
for profiler_type, kernel_name in product(
554+
list(PROFILERS), list(KERNEL_NAME)
555+
):
556+
557+
def createTorchGraphSingleQueueBench(variant_name: str, **kwargs):
558+
return TorchGraphSingleQueue(
559+
self,
560+
runtime,
561+
variant_name,
562+
profiler_type,
563+
fixed_args={
564+
"KernelWGCount": 512,
565+
"KernelWGSize": 256,
566+
"Profiling": 0,
567+
"UseEvents": 0,
568+
},
569+
**kwargs,
570+
)
571+
572+
benches += [
573+
createTorchGraphSingleQueueBench(
574+
"small",
575+
KernelName=kernel_name.value,
576+
KernelsPerQueue=10,
577+
KernelBatchSize=10,
578+
),
579+
createTorchGraphSingleQueueBench(
580+
"medium",
581+
KernelName=kernel_name.value,
582+
KernelsPerQueue=32,
583+
KernelBatchSize=32,
584+
),
585+
createTorchGraphSingleQueueBench(
586+
"large",
587+
KernelName=kernel_name.value,
588+
KernelsPerQueue=64,
589+
KernelBatchSize=64,
590+
),
591+
]
592+
593+
# Add TorchGraphMultiQueue benchmarks
594+
for runtime in filter(lambda x: x != RUNTIMES.UR, RUNTIMES):
595+
if "pvc" in device_arch and runtime == RUNTIMES.LEVEL_ZERO:
596+
continue
597+
598+
for profiler_type in list(PROFILERS):
599+
600+
def createTorchGraphMultiQueueBench(variant_name: str, **kwargs):
601+
return TorchGraphMultiQueue(
602+
self,
603+
runtime,
604+
variant_name,
605+
profiler_type,
606+
fixed_args={
607+
"KernelWGCount": 512,
608+
"KernelWGSize": 256,
609+
"Profiling": 0,
610+
"UseEvents": 0,
611+
},
612+
**kwargs,
613+
)
614+
615+
benches += [
616+
createTorchGraphMultiQueueBench(
617+
"small",
618+
KernelsPerQueue=10,
619+
),
620+
createTorchGraphMultiQueueBench(
621+
"medium",
622+
KernelsPerQueue=32,
623+
),
624+
createTorchGraphMultiQueueBench(
625+
"large",
626+
KernelsPerQueue=64,
627+
),
628+
]
629+
630+
# Add TorchGraphVllmMock benchmarks
631+
for runtime in filter(lambda x: x != RUNTIMES.UR, RUNTIMES):
632+
if "pvc" in device_arch and runtime == RUNTIMES.LEVEL_ZERO:
633+
continue
634+
635+
for profiler_type in list(PROFILERS):
636+
637+
def createTorchGraphVllmMockBench(variant_name: str, **kwargs):
638+
return TorchGraphVllmMock(
639+
self,
640+
runtime,
641+
variant_name,
642+
profiler_type,
643+
fixed_args={
644+
"KernelWGCount": 512,
645+
"KernelWGSize": 256,
646+
"Profiling": 0,
647+
"UseEvents": 0,
648+
},
649+
**kwargs,
650+
)
651+
652+
benches += [
653+
createTorchGraphVllmMockBench(
654+
"small", AllocCount=32, GraphScenario=0
655+
),
656+
createTorchGraphVllmMockBench(
657+
"large", AllocCount=128, GraphScenario=0
658+
),
659+
createTorchGraphVllmMockBench(
660+
"large", AllocCount=128, GraphScenario=1
661+
),
662+
createTorchGraphVllmMockBench(
663+
"large", AllocCount=128, GraphScenario=2
664+
),
665+
createTorchGraphVllmMockBench(
666+
"large", AllocCount=128, GraphScenario=3
667+
),
668+
]
669+
622670
# Add UR-specific benchmarks
623671
benches += [
624672
# TODO: multithread_benchmark_ur fails with segfault

devops/scripts/benchmarks/benches/compute/compute_torch.py

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -232,7 +232,7 @@ def __init__(
232232
)
233233

234234

235-
class TorchSubmitEventRecordWait(TorchBenchmark):
235+
class TorchEventRecordWait(TorchBenchmark):
236236
def __init__(
237237
self,
238238
suite,
@@ -249,3 +249,24 @@ def __init__(
249249
profiler_type,
250250
**kwargs,
251251
)
252+
253+
254+
class TorchGraphVllmMock(TorchBenchmark):
255+
def __init__(
256+
self,
257+
suite,
258+
runtime: RUNTIMES,
259+
variant_name: str,
260+
profiler_type: PROFILERS,
261+
fixed_args: dict | None = None,
262+
**kwargs,
263+
):
264+
super().__init__(
265+
suite,
266+
runtime,
267+
"KernelSubmitGraphVllmMock",
268+
variant_name,
269+
profiler_type,
270+
fixed_args=fixed_args,
271+
**kwargs,
272+
)

devops/scripts/benchmarks/tests/test_integration.py

Lines changed: 36 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -277,7 +277,7 @@ def test_torch_l0(self):
277277
"KernelSubmitMemoryReuse Int32Large",
278278
{"pytorch", "L0"},
279279
)
280-
# FIXME: Graph benchmarks segfault on pvc
280+
# FIXME: Graph benchmarks segfault on PVC with the Level Zero (L0) runtime
281281
if not ("pvc" in self.device_arch.lower()):
282282
self._checkCase(
283283
"torch_benchmark_l0 KernelSubmitGraphSingleQueue KernelBatchSize 10, KernelName Add, KernelsPerQueue 10 CPU count",
@@ -289,6 +289,11 @@ def test_torch_l0(self):
289289
"KernelSubmitGraphMultiQueue large, CPU count",
290290
{"pytorch", "L0"},
291291
)
292+
self._checkCase(
293+
"torch_benchmark_l0 KernelSubmitGraphVllmMock AllocCount 128, GraphScenario 3",
294+
"KernelSubmitGraphVllmMock large",
295+
{"pytorch", "L0"},
296+
)
292297

293298
def test_torch_sycl(self):
294299
self._checkCase(
@@ -326,18 +331,21 @@ def test_torch_sycl(self):
326331
"KernelSubmitMemoryReuse FloatLarge",
327332
{"pytorch", "SYCL"},
328333
)
329-
# FIXME: Graph benchmarks segfault on pvc
330-
if not ("pvc" in self.device_arch.lower()):
331-
self._checkCase(
332-
"torch_benchmark_sycl KernelSubmitGraphSingleQueue KernelBatchSize 32, KernelName Add, KernelsPerQueue 32",
333-
"KernelSubmitGraphSingleQueue medium",
334-
{"pytorch", "SYCL"},
335-
)
336-
self._checkCase(
337-
"torch_benchmark_sycl KernelSubmitGraphMultiQueue KernelsPerQueue 32 CPU count",
338-
"KernelSubmitGraphMultiQueue medium, CPU count",
339-
{"pytorch", "SYCL"},
340-
)
334+
self._checkCase(
335+
"torch_benchmark_sycl KernelSubmitGraphSingleQueue KernelBatchSize 32, KernelName Add, KernelsPerQueue 32",
336+
"KernelSubmitGraphSingleQueue medium",
337+
{"pytorch", "SYCL"},
338+
)
339+
self._checkCase(
340+
"torch_benchmark_sycl KernelSubmitGraphMultiQueue KernelsPerQueue 32 CPU count",
341+
"KernelSubmitGraphMultiQueue medium, CPU count",
342+
{"pytorch", "SYCL"},
343+
)
344+
self._checkCase(
345+
"torch_benchmark_sycl KernelSubmitGraphVllmMock AllocCount 32, GraphScenario 0",
346+
"KernelSubmitGraphVllmMock small",
347+
{"pytorch", "SYCL"},
348+
)
341349

342350
def test_torch_syclpreview(self):
343351
self._checkCase(
@@ -380,18 +388,21 @@ def test_torch_syclpreview(self):
380388
"KernelSubmitMemoryReuse FloatMedium, CPU count",
381389
{"pytorch", "SYCL"},
382390
)
383-
# FIXME: Graph benchmarks segfault on pvc
384-
if not ("pvc" in self.device_arch.lower()):
385-
self._checkCase(
386-
"torch_benchmark_syclpreview KernelSubmitGraphSingleQueue KernelBatchSize 64, KernelName Add, KernelsPerQueue 64",
387-
"KernelSubmitGraphSingleQueue large",
388-
{"pytorch", "SYCL"},
389-
)
390-
self._checkCase(
391-
"torch_benchmark_syclpreview KernelSubmitGraphMultiQueue KernelsPerQueue 10",
392-
"KernelSubmitGraphMultiQueue small",
393-
{"pytorch", "SYCL"},
394-
)
391+
self._checkCase(
392+
"torch_benchmark_syclpreview KernelSubmitGraphSingleQueue KernelBatchSize 64, KernelName Add, KernelsPerQueue 64",
393+
"KernelSubmitGraphSingleQueue large",
394+
{"pytorch", "SYCL"},
395+
)
396+
self._checkCase(
397+
"torch_benchmark_syclpreview KernelSubmitGraphMultiQueue KernelsPerQueue 10",
398+
"KernelSubmitGraphMultiQueue small",
399+
{"pytorch", "SYCL"},
400+
)
401+
self._checkCase(
402+
"torch_benchmark_syclpreview KernelSubmitGraphVllmMock AllocCount 128, GraphScenario 1 CPU count",
403+
"KernelSubmitGraphVllmMock large, CPU count",
404+
{"pytorch", "SYCL"},
405+
)
395406

396407

397408
if __name__ == "__main__":

0 commit comments

Comments
 (0)