[Bench] Add KernelSubmitGraphVllmMock benchmarks (intel#21412)

luszczewskakasia1 · lukaszstolarczuk · web-flow · commit e23ac190649d · 2026-03-25T07:54:06.000-07:00
Also re-enable the SYCL graph benchmarks on PVC, as the issue is gone now (they remain skipped on PVC for Level Zero).

---------

Signed-off-by: luszczewskakasia1 <katarzyna.luszczewska@intel.com>
Co-authored-by: Łukasz Stolarczuk <lukasz.stolarczuk@intel.com>
diff --git a/devops/scripts/benchmarks/benches/compute/compute.py b/devops/scripts/benchmarks/benches/compute/compute.py
@@ -525,90 +525,11 @@ def createTorchLinearKernelSizeBench(variant_name: str, **kwargs):
                     ),
                 ]
 
-        # Graph benchmarks segfault on pvc
-        device_arch = getattr(options, "device_architecture", "")
-        if not ("pvc" in device_arch):
-            # Add TorchGraphSingleQueue benchmarks
-            for runtime in filter(lambda x: x != RUNTIMES.UR, RUNTIMES):
-                for profiler_type, kernel_name in product(
-                    list(PROFILERS), list(KERNEL_NAME)
-                ):
-
-                    def createTorchGraphSingleQueueBench(variant_name: str, **kwargs):
-                        return TorchGraphSingleQueue(
-                            self,
-                            runtime,
-                            variant_name,
-                            profiler_type,
-                            fixed_args={
-                                "KernelWGCount": 512,
-                                "KernelWGSize": 256,
-                                "Profiling": 0,
-                                "UseEvents": 0,
-                            },
-                            **kwargs,
-                        )
-
-                    benches += [
-                        createTorchGraphSingleQueueBench(
-                            "small",
-                            KernelName=kernel_name.value,
-                            KernelsPerQueue=10,
-                            KernelBatchSize=10,
-                        ),
-                        createTorchGraphSingleQueueBench(
-                            "medium",
-                            KernelName=kernel_name.value,
-                            KernelsPerQueue=32,
-                            KernelBatchSize=32,
-                        ),
-                        createTorchGraphSingleQueueBench(
-                            "large",
-                            KernelName=kernel_name.value,
-                            KernelsPerQueue=64,
-                            KernelBatchSize=64,
-                        ),
-                    ]
-
-            # Add TorchGraphMultiQueue benchmarks
-            for runtime in filter(lambda x: x != RUNTIMES.UR, RUNTIMES):
-                for profiler_type in list(PROFILERS):
-
-                    def createTorchGraphMultiQueueBench(variant_name: str, **kwargs):
-                        return TorchGraphMultiQueue(
-                            self,
-                            runtime,
-                            variant_name,
-                            profiler_type,
-                            fixed_args={
-                                "KernelWGCount": 512,
-                                "KernelWGSize": 256,
-                                "Profiling": 0,
-                                "UseEvents": 0,
-                            },
-                            **kwargs,
-                        )
-
-                    benches += [
-                        createTorchGraphMultiQueueBench(
-                            "small",
-                            KernelsPerQueue=10,
-                        ),
-                        createTorchGraphMultiQueueBench(
-                            "medium",
-                            KernelsPerQueue=32,
-                        ),
-                        createTorchGraphMultiQueueBench(
-                            "large",
-                            KernelsPerQueue=64,
-                        ),
-                    ]
-
-        # Add TorchSubmitEventRecordWait benchmarks
+        # Add TorchEventRecordWait benchmarks
         for runtime in filter(lambda x: x != RUNTIMES.UR, RUNTIMES):
             for profiler_type in list(PROFILERS):
                 benches.append(
-                    TorchSubmitEventRecordWait(
+                    TorchEventRecordWait(
                         self,
                         runtime,
                         "medium",
@@ -619,6 +540,133 @@ def createTorchGraphMultiQueueBench(variant_name: str, **kwargs):
                     )
                 )
 
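+        # Note: Graph benchmarks segfault on PVC with the Level Zero (L0)
+        # runtime, so the graph benchmark loops below skip that combination;
+        # the other non-UR runtimes are still added on PVC.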
+        device_arch = getattr(options, "device_architecture", "")
+
+        # Add TorchGraphSingleQueue benchmarks
+        for runtime in filter(lambda x: x != RUNTIMES.UR, RUNTIMES):
+            if "pvc" in device_arch and runtime == RUNTIMES.LEVEL_ZERO:
+                continue
+
+            for profiler_type, kernel_name in product(
+                list(PROFILERS), list(KERNEL_NAME)
+            ):
+
+                def createTorchGraphSingleQueueBench(variant_name: str, **kwargs):
+                    return TorchGraphSingleQueue(
+                        self,
+                        runtime,
+                        variant_name,
+                        profiler_type,
+                        fixed_args={
+                            "KernelWGCount": 512,
+                            "KernelWGSize": 256,
+                            "Profiling": 0,
+                            "UseEvents": 0,
+                        },
+                        **kwargs,
+                    )
+
+                benches += [
+                    createTorchGraphSingleQueueBench(
+                        "small",
+                        KernelName=kernel_name.value,
+                        KernelsPerQueue=10,
+                        KernelBatchSize=10,
+                    ),
+                    createTorchGraphSingleQueueBench(
+                        "medium",
+                        KernelName=kernel_name.value,
+                        KernelsPerQueue=32,
+                        KernelBatchSize=32,
+                    ),
+                    createTorchGraphSingleQueueBench(
+                        "large",
+                        KernelName=kernel_name.value,
+                        KernelsPerQueue=64,
+                        KernelBatchSize=64,
+                    ),
+                ]
+
+        # Add TorchGraphMultiQueue benchmarks
+        for runtime in filter(lambda x: x != RUNTIMES.UR, RUNTIMES):
+            if "pvc" in device_arch and runtime == RUNTIMES.LEVEL_ZERO:
+                continue
+
+            for profiler_type in list(PROFILERS):
+
+                def createTorchGraphMultiQueueBench(variant_name: str, **kwargs):
+                    return TorchGraphMultiQueue(
+                        self,
+                        runtime,
+                        variant_name,
+                        profiler_type,
+                        fixed_args={
+                            "KernelWGCount": 512,
+                            "KernelWGSize": 256,
+                            "Profiling": 0,
+                            "UseEvents": 0,
+                        },
+                        **kwargs,
+                    )
+
+                benches += [
+                    createTorchGraphMultiQueueBench(
+                        "small",
+                        KernelsPerQueue=10,
+                    ),
+                    createTorchGraphMultiQueueBench(
+                        "medium",
+                        KernelsPerQueue=32,
+                    ),
+                    createTorchGraphMultiQueueBench(
+                        "large",
+                        KernelsPerQueue=64,
+                    ),
+                ]
+
+        # Add TorchGraphVllmMock benchmarks
+        for runtime in filter(lambda x: x != RUNTIMES.UR, RUNTIMES):
+            if "pvc" in device_arch and runtime == RUNTIMES.LEVEL_ZERO:
+                continue
+
+            for profiler_type in list(PROFILERS):
+
+                def createTorchGraphVllmMockBench(variant_name: str, **kwargs):
+                    return TorchGraphVllmMock(
+                        self,
+                        runtime,
+                        variant_name,
+                        profiler_type,
+                        fixed_args={
+                            "KernelWGCount": 512,
+                            "KernelWGSize": 256,
+                            "Profiling": 0,
+                            "UseEvents": 0,
+                        },
+                        **kwargs,
+                    )
+
+                benches += [
+                    createTorchGraphVllmMockBench(
+                        "small", AllocCount=32, GraphScenario=0
+                    ),
+                    createTorchGraphVllmMockBench(
+                        "large", AllocCount=128, GraphScenario=0
+                    ),
+                    createTorchGraphVllmMockBench(
+                        "large", AllocCount=128, GraphScenario=1
+                    ),
+                    createTorchGraphVllmMockBench(
+                        "large", AllocCount=128, GraphScenario=2
+                    ),
+                    createTorchGraphVllmMockBench(
+                        "large", AllocCount=128, GraphScenario=3
+                    ),
+                ]
+
         # Add UR-specific benchmarks
         benches += [
             # TODO: multithread_benchmark_ur fails with segfault
diff --git a/devops/scripts/benchmarks/benches/compute/compute_torch.py b/devops/scripts/benchmarks/benches/compute/compute_torch.py
@@ -232,7 +232,7 @@ def __init__(
         )
 
 
-class TorchSubmitEventRecordWait(TorchBenchmark):
+class TorchEventRecordWait(TorchBenchmark):
     def __init__(
         self,
         suite,
@@ -249,3 +249,24 @@ def __init__(
             profiler_type,
             **kwargs,
         )
+
+
+class TorchGraphVllmMock(TorchBenchmark):
+    def __init__(
+        self,
+        suite,
+        runtime: RUNTIMES,
+        variant_name: str,
+        profiler_type: PROFILERS,
+        fixed_args: dict | None = None,
+        **kwargs,
+    ):
+        super().__init__(
+            suite,
+            runtime,
+            "KernelSubmitGraphVllmMock",
+            variant_name,
+            profiler_type,
+            fixed_args=fixed_args,
+            **kwargs,
+        )
diff --git a/devops/scripts/benchmarks/tests/test_integration.py b/devops/scripts/benchmarks/tests/test_integration.py
@@ -277,7 +277,7 @@ def test_torch_l0(self):
             "KernelSubmitMemoryReuse Int32Large",
             {"pytorch", "L0"},
         )
-        # FIXME: Graph benchmarks segfault on pvc
+        # FIXME: Graph benchmarks segfault on pvc on L0
         if not ("pvc" in self.device_arch.lower()):
             self._checkCase(
                 "torch_benchmark_l0 KernelSubmitGraphSingleQueue KernelBatchSize 10, KernelName Add, KernelsPerQueue 10 CPU count",
@@ -289,6 +289,11 @@ def test_torch_l0(self):
                 "KernelSubmitGraphMultiQueue large, CPU count",
                 {"pytorch", "L0"},
             )
+            self._checkCase(
+                "torch_benchmark_l0 KernelSubmitGraphVllmMock AllocCount 128, GraphScenario 3",
+                "KernelSubmitGraphVllmMock large",
+                {"pytorch", "L0"},
+            )
 
     def test_torch_sycl(self):
         self._checkCase(
@@ -326,18 +331,21 @@ def test_torch_sycl(self):
             "KernelSubmitMemoryReuse FloatLarge",
             {"pytorch", "SYCL"},
         )
-        # FIXME: Graph benchmarks segfault on pvc
-        if not ("pvc" in self.device_arch.lower()):
-            self._checkCase(
-                "torch_benchmark_sycl KernelSubmitGraphSingleQueue KernelBatchSize 32, KernelName Add, KernelsPerQueue 32",
-                "KernelSubmitGraphSingleQueue medium",
-                {"pytorch", "SYCL"},
-            )
-            self._checkCase(
-                "torch_benchmark_sycl KernelSubmitGraphMultiQueue KernelsPerQueue 32 CPU count",
-                "KernelSubmitGraphMultiQueue medium, CPU count",
-                {"pytorch", "SYCL"},
-            )
+        self._checkCase(
+            "torch_benchmark_sycl KernelSubmitGraphSingleQueue KernelBatchSize 32, KernelName Add, KernelsPerQueue 32",
+            "KernelSubmitGraphSingleQueue medium",
+            {"pytorch", "SYCL"},
+        )
+        self._checkCase(
+            "torch_benchmark_sycl KernelSubmitGraphMultiQueue KernelsPerQueue 32 CPU count",
+            "KernelSubmitGraphMultiQueue medium, CPU count",
+            {"pytorch", "SYCL"},
+        )
+        self._checkCase(
+            "torch_benchmark_sycl KernelSubmitGraphVllmMock AllocCount 32, GraphScenario 0",
+            "KernelSubmitGraphVllmMock small",
+            {"pytorch", "SYCL"},
+        )
 
     def test_torch_syclpreview(self):
         self._checkCase(
@@ -380,18 +388,21 @@ def test_torch_syclpreview(self):
             "KernelSubmitMemoryReuse FloatMedium, CPU count",
             {"pytorch", "SYCL"},
         )
-        # FIXME: Graph benchmarks segfault on pvc
-        if not ("pvc" in self.device_arch.lower()):
-            self._checkCase(
-                "torch_benchmark_syclpreview KernelSubmitGraphSingleQueue KernelBatchSize 64, KernelName Add, KernelsPerQueue 64",
-                "KernelSubmitGraphSingleQueue large",
-                {"pytorch", "SYCL"},
-            )
-            self._checkCase(
-                "torch_benchmark_syclpreview KernelSubmitGraphMultiQueue KernelsPerQueue 10",
-                "KernelSubmitGraphMultiQueue small",
-                {"pytorch", "SYCL"},
-            )
+        self._checkCase(
+            "torch_benchmark_syclpreview KernelSubmitGraphSingleQueue KernelBatchSize 64, KernelName Add, KernelsPerQueue 64",
+            "KernelSubmitGraphSingleQueue large",
+            {"pytorch", "SYCL"},
+        )
+        self._checkCase(
+            "torch_benchmark_syclpreview KernelSubmitGraphMultiQueue KernelsPerQueue 10",
+            "KernelSubmitGraphMultiQueue small",
+            {"pytorch", "SYCL"},
+        )
+        self._checkCase(
+            "torch_benchmark_syclpreview KernelSubmitGraphVllmMock AllocCount 128, GraphScenario 1 CPU count",
+            "KernelSubmitGraphVllmMock large, CPU count",
+            {"pytorch", "SYCL"},
+        )
 
 
 if __name__ == "__main__":