tutorial: floating-point-emulation: fix matmul bounds checks (off-by-one in C++, missing in Python)

cole-brower · cole-brower · commit 44d7c65128aa · 2026-02-18T13:34:06.000-08:00
diff --git a/tutorials/floating-point-emulation/notebooks/02-Matmul-Fundamentals/02.01-MatmulFundamentals.ipynb b/tutorials/floating-point-emulation/notebooks/02-Matmul-Fundamentals/02.01-MatmulFundamentals.ipynb
@@ -241,7 +241,7 @@
     "    auto [size_m, size_n] = tensor_c.shape();\n",
     "    auto size_k = tutorial::size<1>(tensor_a);\n",
     "\n",
-    "    if (thread_row_idx > size_m || thread_col_idx > size_n) {\n",
+    "    if (thread_row_idx >= size_m || thread_col_idx >= size_n) {\n",
     "        return;\n",
     "    }\n",
     "\n",
@@ -395,6 +395,9 @@
     "        thread_row_idx = cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x\n",
     "        thread_col_idx = cuda.threadIdx.y + cuda.blockIdx.y * cuda.blockDim.y\n",
     "\n",
+    "        if thread_row_idx >= m or thread_col_idx >= n:\n",
+    "            return\n",
+    "\n",
     "        accumulator = 0.0\n",
     "\n",
     "        # EXERCISE --> Complete the following implementation to compute the dot product between row 'thread_row_idx' of matrix A \n",