WilliamZhang20
diff --git a/‎docs/Block_Diagram.png‎
39.8 KB b/‎docs/Block_Diagram.png‎
39.8 KB
diff --git a/‎test/mac/fp8_multiplication_error_heatmap.png‎
79.1 KB b/‎test/mac/fp8_multiplication_error_heatmap.png‎
79.1 KB
diff --git a/‎test/mac/test_mac.py‎
Lines changed: 113 additions & 0 deletions b/‎test/mac/test_mac.py‎
Lines changed: 113 additions & 0 deletions
diff --git a/‎test/tpu/test.py‎
Lines changed: 129 additions & 1 deletion b/‎test/tpu/test.py‎
Lines changed: 129 additions & 1 deletion
@@ -3,6 +3,8 @@
 from cocotb.clock import Clock
 import random
 import math
+import matplotlib.pyplot as plt
+import numpy as np
 import struct
 # ---------- helpers to emulate RTL bit-exact behavior ----------
 
@@ -110,3 +112,114 @@ async def test_pe_deviation(dut):
             f"[{i}] a={fa:.6f}  b={fb:.6f}  expected={expected:.6f}  "
             f"bf16={got_float:.6f}  abs_err={abs_err:.6e}  rel_err={rel_err:.6e}"
         )
+
+async def perform_multiplication(dut, fa: float, fb: float):
+    """Run one ``fa * fb`` multiplication on the DUT and return its absolute error.
+
+    Both operands are converted with ``fp8_e4m3_encode`` and driven onto
+    ``a_in`` / ``b_in`` after ``reset_accumulator`` has completed. The low
+    16 bits of ``c_out`` are decoded with ``bf16_to_float`` and compared
+    against the Python float product of the un-encoded inputs.
+
+    Returns:
+        ``abs(dut_result - fa * fb)``. Callers that need a relative error
+        divide by the reference product themselves.
+    """
+    encoded_a = fp8_e4m3_encode(fa)
+    encoded_b = fp8_e4m3_encode(fb)
+
+    await reset_accumulator(dut)
+
+    dut.a_in.value = encoded_a
+    dut.b_in.value = encoded_b
+
+    # Two rising edges before sampling: one would be enough if c_out were
+    # assigned combinationally, two are needed if it is registered inside the PE.
+    for _ in range(2):
+        await RisingEdge(dut.clk)
+
+    measured = bf16_to_float(int(dut.c_out.value) & 0xFFFF)
+    return abs(measured - fa * fb)
+
+# Set up the clock and apply reset before a test runs
+async def setup_dut(dut):
+    """Start the clock and bring the DUT out of reset.
+
+    Starts a clock on ``dut.clk`` (constructed as ``Clock(dut.clk, 10, "ns")``),
+    holds ``rst`` and ``clear`` high with both operand inputs at zero for
+    three rising edges, then releases ``rst`` and ``clear`` and waits one
+    more rising edge before returning.
+    """
+    clock = Clock(dut.clk, 10, "ns")
+    cocotb.start_soon(clock.start())
+
+    # Hold reset and clear asserted with zeroed operands.
+    for signal, level in (
+        (dut.rst, 1),
+        (dut.clear, 1),
+        (dut.a_in, 0),
+        (dut.b_in, 0),
+    ):
+        signal.value = level
+    for _ in range(3):
+        await RisingEdge(dut.clk)
+
+    # Release reset and clear, then give the design one edge to settle.
+    for signal in (dut.rst, dut.clear):
+        signal.value = 0
+    await RisingEdge(dut.clk)
+
+@cocotb.test()
+async def test_pe_error_heatmap(dut):
+    """Sweep a grid of operand pairs through the PE and save a relative-error heatmap.
+
+    Every ``(a, b)`` pair on a ``NUM_STEPS x NUM_STEPS`` grid spanning
+    ``[MIN_VAL, MAX_VAL]`` is multiplied via ``perform_multiplication``.
+    The absolute error it returns is divided by ``|a * b|`` to obtain a
+    relative error. The grid is written to
+    ``fp8_multiplication_error_heatmap.png`` and the maximum relative error
+    is logged. No pass/fail threshold is enforced.
+    """
+    await setup_dut(dut)
+
+    # --- Sweep parameters ---
+    MIN_VAL = -10.0
+    MAX_VAL = 10.0
+    NUM_STEPS = 100  # Increase for finer resolution, decrease for a faster test
+    # Products with a magnitude below this are treated as zero, where the
+    # relative error is undefined.
+    ZERO_PRODUCT_THRESHOLD = 1e-6
+
+    a_values = np.linspace(MIN_VAL, MAX_VAL, NUM_STEPS)
+    b_values = np.linspace(MIN_VAL, MAX_VAL, NUM_STEPS)
+
+    # error_matrix[i, j] is the relative error for a_values[i] * b_values[j].
+    error_matrix = np.zeros((NUM_STEPS, NUM_STEPS))
+
+    dut._log.info(f"Starting heatmap generation with {NUM_STEPS*NUM_STEPS} points...")
+
+    # --- Data collection ---
+    for i, fa in enumerate(a_values):
+        for j, fb in enumerate(b_values):
+            abs_err = await perform_multiplication(dut, fa, fb)
+            expected = fa * fb
+
+            if abs(expected) < ZERO_PRODUCT_THRESHOLD:
+                # Relative error is undefined for a (near-)zero product;
+                # record a fixed placeholder of 1.0 so the cell still plots.
+                error_matrix[i, j] = 1.0
+            else:
+                error_matrix[i, j] = abs_err / abs(expected)
+
+    dut._log.info("Data collection complete. Generating plot...")
+
+    # --- Plotting ---
+    fig = plt.figure(figsize=(10, 8))
+
+    # imshow draws array rows along Y and columns along X. The matrix is
+    # indexed [a, b], so it is transposed to put A on the X axis and B on
+    # the Y axis, matching the axis labels below. Values are plotted on a
+    # linear scale.
+    plt.imshow(error_matrix.T, origin='lower', aspect='auto',
+               extent=[MIN_VAL, MAX_VAL, MIN_VAL, MAX_VAL],
+               cmap='viridis')
+
+    cbar = plt.colorbar()
+    cbar.set_label('Relative Error')
+
+    plt.xlabel('Multiplicand A (a_in)')
+    plt.ylabel('Multiplicand B (b_in)')
+    plt.title(f'FP8 E4M3 Multiplication Relative Error Heatmap ({NUM_STEPS}x{NUM_STEPS} Grid)')
+
+    plot_filename = "fp8_multiplication_error_heatmap.png"
+    plt.savefig(plot_filename)
+    # Close the figure so it does not stay alive for the rest of the run.
+    plt.close(fig)
+    dut._log.info(f"Error heatmap saved to: {plot_filename}")
+
+    # Report only. A threshold assertion on max_err can be added here once
+    # an acceptable relative-error bound has been agreed.
+    max_err = np.max(error_matrix)
+    dut._log.info(f"Maximum relative error in grid: {max_err:.6e}")
@@ -201,4 +201,132 @@ async def test_gemm(dut):
             f"C[{i//2}][{i%2}] = {results[i]} "
             f"!= expected {expected[i]} (relative error {rel_err:.4f})"
         )
-    dut._log.info("Test 2 passed")
+    dut._log.info("Test 2 passed")
+
+def get_expected_large_matmul(A, B, transpose=0, relu=0):
+    """Compute the NumPy reference result for a matrix multiplication.
+
+    Args:
+        A: Left operand.
+        B: Right operand. ``B.T`` is used instead when ``transpose`` is truthy.
+        transpose: When truthy, compute ``A @ B.T`` rather than ``A @ B``.
+        relu: When truthy, clamp negative entries of the product to zero.
+
+    Returns:
+        The product, with ReLU applied if requested.
+    """
+    rhs = B.T if transpose else B
+    product = A @ rhs
+    return np.maximum(product, 0) if relu else product
+
+def check_expected(A, B, result, transpose=0, relu=0):
+    """
+    Assert that a DUT result matches the NumPy reference for large matrices.
+
+    The reference comes from ``get_expected_large_matmul`` with the same
+    ``transpose`` / ``relu`` flags. Comparison is exact and element-wise;
+    ``np.testing.assert_array_equal`` raises ``AssertionError`` on mismatch.
+    """
+    reference = get_expected_large_matmul(A, B, transpose, relu)
+    np.testing.assert_array_equal(
+        result,
+        reference,
+        err_msg="Matrix multiplication result does not match expected",
+    )
+
+async def accumulate_matrix_output(dut, results_large, i, j, transpose=0, A_block=None, B_block=None):
+    """
+    Stream in the next tile while reading back the previous tile's result.
+
+    Over 8 clock cycles, one input value per cycle is driven on ``ui_in``
+    (the 4 values of A_block followed by the 4 values of B_block, or 8 zeros
+    if either block is missing or empty) and one byte per cycle is sampled
+    from ``uo_out``. The sampled bytes are paired as (high, low) into four
+    signed 16-bit values, which are added in place to
+    ``results_large[i:i+2, j:j+2]`` in the order C00, C01, C10, C11.
+
+    Returns the list of the four decoded values.
+    """
+    if A_block and B_block:
+        stream = A_block + B_block
+    else:
+        stream = [0] * 8
+
+    dut.uio_in.value = (transpose << 1) | 1  # load_en=1
+
+    # Drive one input and capture one output byte on each of the 8 cycles.
+    sampled = []
+    for cycle in range(8):
+        dut.ui_in.value = stream[cycle]
+        await ClockCycles(dut.clk, 1)
+        sampled.append(dut.uo_out.value.integer)
+
+    # Pair consecutive bytes (high first) and sign-extend from 16 bits.
+    combined_outputs = []
+    for high, low in zip(sampled[0::2], sampled[1::2]):
+        word = (high << 8) | low
+        combined_outputs.append(word - 0x10000 if word >= 0x8000 else word)
+
+    # Row-major offsets inside the 2x2 tile: C00, C01, C10, C11.
+    tile_offsets = ((0, 0), (0, 1), (1, 0), (1, 1))
+    for (d_row, d_col), word in zip(tile_offsets, combined_outputs):
+        results_large[i + d_row, j + d_col] += word
+
+    return combined_outputs
+
+async def matmul(dut, A, B, transpose=False, relu=False):
+    """
+    Multiply A by B (or by B^T when ``transpose`` is set) on the DUT using 2x2 tiles.
+
+    Both operands are zero-padded to even dimensions. For every output tile
+    (i, j) the partial products over the reduction index k are read back from
+    the DUT and summed in software. Loading of the next tile overlaps with
+    reading the previous tile's output.
+
+    Args:
+        dut: cocotb handle of the design under test.
+        A: 2-D array of shape (m, n).
+        B: 2-D array of shape (n, p), or (n_b, n) when ``transpose`` is set.
+        transpose: Compute ``A @ B.T`` instead of ``A @ B``.
+        relu: Clamp negative entries of the accumulated result to zero.
+
+    Returns:
+        Integer array of shape (m, p), or (m, n_b) when ``transpose`` is set.
+        If any dimension is zero the result is all zeros and the DUT is not
+        touched.
+
+    Raises:
+        AssertionError: if the inner dimensions of A and B do not match.
+    """
+    m, n = A.shape
+    n_b, p = B.shape
+    if transpose:
+        assert n == p, "Reminder: you are computing A*B^T"
+    else:
+        assert n == n_b, "Matrix dimension mismatch"
+
+    # Pad dimensions to multiples of 2
+    m_p = ((m + 1) // 2) * 2
+    n_p = ((n + 1) // 2) * 2
+    n_bp = ((n_b + 1) // 2) * 2
+    p_p = ((p + 1) // 2) * 2
+
+    A_padded = np.zeros((m_p, n_p), dtype=int)
+    B_padded = np.zeros((n_bp, p_p), dtype=int)
+
+    A_padded[:m, :n] = A
+    B_padded[:n_b, :p] = B
+
+    # The output has one column per row of B in transpose mode, otherwise
+    # one per column of B.
+    out_cols = n_b if transpose else p
+    out_cols_p = n_bp if transpose else p_p
+    results_large = np.zeros((m_p, out_cols_p), dtype=int)
+
+    # Tile order is i (output row), j (output column), k (reduction) in both
+    # modes. The reduction always runs over A's padded column count, because
+    # the assertions above guarantee it equals B's matching dimension.
+    tile_coords = [
+        (i, j, k)
+        for i in range(0, m_p, 2)
+        for j in range(0, out_cols_p, 2)
+        for k in range(0, n_p, 2)
+    ]
+
+    # A zero-sized dimension leaves nothing to compute; return the all-zero
+    # result instead of failing on tile_coords[0].
+    if not tile_coords:
+        return results_large[:m, :out_cols]
+
+    def tile_inputs(i, j, k):
+        """Return the flattened 2x2 A and B tiles for tile coordinate (i, j, k)."""
+        a_tile = A_padded[i:i+2, k:k+2].flatten().tolist()
+        if transpose:
+            b_tile = B_padded[j:j+2, k:k+2].flatten().tolist()
+        else:
+            b_tile = B_padded[k:k+2, j:j+2].flatten().tolist()
+        return a_tile, b_tile
+
+    # Step 1: Load first tile only (no output yet)
+    prev_i, prev_j, first_k = tile_coords[0]
+    A_block, B_block = tile_inputs(prev_i, prev_j, first_k)
+
+    # NOTE(review): relu is forwarded to load_matrix here, but
+    # accumulate_matrix_output does not drive it for later tiles and ReLU is
+    # applied again in software below. Confirm the DUT does not clamp partial
+    # sums before they are accumulated.
+    await load_matrix(dut, A_block, transpose=0, relu=relu)
+    await load_matrix(dut, B_block, transpose=transpose, relu=relu)
+
+    # Step 2: Pipelined main loop
+    for next_i, next_j, next_k in tile_coords[1:]:
+        A_next, B_next = tile_inputs(next_i, next_j, next_k)
+        # Read output from previous tile while loading next
+        await accumulate_matrix_output(dut, results_large, prev_i, prev_j, transpose, A_next, B_next)
+        prev_i, prev_j = next_i, next_j
+
+    # Final tile read (no further input)
+    await accumulate_matrix_output(dut, results_large, prev_i, prev_j, transpose)
+
+    # Apply ReLU if enabled
+    if relu:
+        results_large = np.maximum(results_large, 0)
+
+    return results_large[:m, :out_cols]