|
| 1 | +import numpy as np, time, torch, torch.nn as nn, aie.iron as iron |
| 2 | +from aie.utils import NPUKernel, DefaultNPURuntime |
| 3 | + |
# Benchmark cases as (depth, height, width, cores, name) tuples.
# `name` selects the prebuilt artifacts under build/ and follows the pattern
# q_d<depth>_s<side>_c<cores>; every case uses depth 3 and a square H x W.
configs = [
    (3, side, side, n_cores, f"q_d3_s{side}_c{n_cores}")
    for side, n_cores in ((32, 1), (32, 2), (32, 4), (64, 4))
]
| 10 | + |
# Report banner followed by the column headings of the results table.
rule = "=" * 90
print("\n" + rule)
print("Conv3D Performance: NPU vs PyTorch CPU")
print(rule + "\n")
print(f"{'Volume':<15} {'PyTorch CPU':>15} {'NPU (cores)':>20} {'NPU Speedup':>15} {'Multi-Core':>15}")
print("-" * 90)
| 16 | + |
def _time_pytorch_us(depth, height, width, ci, co, warmup=5, iters=20):
    """Return the mean wall-clock time (µs) of one CPU ``nn.Conv3d`` forward pass.

    The layer uses a (1, 3, 3) kernel without bias. The input is
    replicate-padded by one element on each side of H and W, so the unpadded
    convolution produces an output with the same spatial size as the input.
    """
    model = nn.Conv3d(ci, co, kernel_size=(1, 3, 3), padding=0, bias=False)
    model.eval()
    inp = torch.randint(1, 20, (1, ci, depth, height, width)).float()
    wt = torch.randint(-50, 50, (co, ci, 1, 3, 3)).float()
    inp_pad = torch.nn.functional.pad(inp, (1, 1, 1, 1, 0, 0), mode="replicate")

    # Inference-only measurement: without no_grad() every forward pass would
    # also record an autograd graph, which inflates the CPU timing.
    with torch.no_grad():
        model.weight.copy_(wt)
        for _ in range(warmup):
            model(inp_pad)
        samples = []
        for _ in range(iters):
            start = time.perf_counter()
            model(inp_pad)  # result discarded so outputs are not kept alive
            samples.append((time.perf_counter() - start) * 1e6)
    return np.mean(samples)


def _time_npu_us(name, depth, height, width, co, warmup=5, iters=20):
    """Return the mean ``npu_time / 1000`` reported over ``iters`` NPU runs.

    Loads ``build/<name>.xclbin`` and ``build/<name>_insts.bin``. Any exception
    raised while loading or running the kernel propagates to the caller.
    """
    kernel = NPUKernel(f"build/{name}.xclbin", f"build/{name}_insts.bin", kernel_name="MLIR_AIE")
    handle = DefaultNPURuntime.load(kernel)

    np.random.seed(42)
    ifm = np.random.randint(1, 20, (depth, 1, height, 8, width), dtype=np.uint8)
    # NOTE(review): this weight buffer holds 3*3*3*8*8 values, which looks like
    # a 3x3x3 kernel, while the PyTorch reference uses a (1, 3, 3) kernel.
    # Confirm both sides execute the same workload before trusting the speedup.
    wts = np.random.randint(-50, 50, (1, 1, 3, 3, 3, 8, 8), dtype=np.int8)
    buffers = [
        iron.tensor(ifm.flatten(), dtype=np.uint8),
        iron.tensor(wts.flatten(), dtype=np.int8),
        iron.zeros(depth * height * width * co, dtype=np.uint8),
    ]

    for _ in range(warmup):
        DefaultNPURuntime.run(handle, buffers)
    # Presumably npu_time is in nanoseconds, making this microseconds - TODO confirm.
    return np.mean([DefaultNPURuntime.run(handle, buffers).npu_time / 1000.0 for _ in range(iters)])


# Benchmark every configuration and print one table row per case.
ci, co = 8, 8
one_core_us = {}  # volume label -> NPU time of the 1-core run for that volume

for depth, height, width, cores, name in configs:
    vol_str = f"{depth}×{height}×{width}"
    pt_time = _time_pytorch_us(depth, height, width, ci, co)

    # Only the NPU measurement is guarded; a failing case still reports its
    # CPU time and the remaining configurations keep running.
    try:
        npu_time = _time_npu_us(name, depth, height, width, co)
    except Exception as exc:
        print(f"{vol_str:<15} {pt_time:>13.0f}µs {'ERROR':>20} {'-':>15} {'-':>15}")
        print(f"  {name}: {type(exc).__name__}: {exc}")
        continue

    speedup = pt_time / npu_time
    npu_str = f"{npu_time:.0f}µs ({cores}c)"

    # Multi-core scaling is relative to the 1-core run of the same volume and
    # is only shown when that baseline was actually measured.
    if cores == 1:
        one_core_us[vol_str] = npu_time
    baseline = one_core_us.get(vol_str)
    if cores == 1 or baseline is None:
        mc_str = "-"
    else:
        mc_str = f"{baseline / npu_time:.2f}×"

    # Field widths include the unit suffix so each column is as wide as its
    # heading (15 / 15 / 20 / 15 / 15).
    print(f"{vol_str:<15} {pt_time:>13.0f}µs {npu_str:>20} {speedup:>14.1f}× {mc_str:>15}")
# Close the table, then print the fixed interpretation notes. These lines are
# static text; none of them is computed from the measurements above.
summary_lines = (
    "Summary:",
    " - PyTorch running on CPU (no GPU transfer overhead)",
    " - NPU times include PCIe transfer + compute",
    " - Larger volumes show better NPU scaling",
)
print("-" * 90 + "\n")
print("\n".join(summary_lines))
print()
0 commit comments