|
| 1 | +/* |
| 2 | + * driver.c — per-kernel measurement driver for the RVV 1.0 vector evidence |
| 3 | + * harness (scripts/run_e1_rvv_vector.sh). |
| 4 | + * |
| 5 | + * One kernel is selected at compile time via -DKERNEL_<name>=1. The driver |
| 6 | + * fills deterministic input buffers, then calls the kernel exactly once with |
| 7 | + * the problem size declared in kernels.json. The kernel itself lives in |
| 8 | + * kernels.c; this file only sets up data and a checksum so the optimizer |
| 9 | + * cannot delete the call. |
| 10 | + * |
| 11 | + * The kernel function is wrapped between two named symbols so the QEMU |
| 12 | + * execlog post-processor can isolate the kernel's dynamic instruction |
| 13 | + * stream by program-counter range (kernel_region_begin .. kernel_region_end) |
| 14 | + * without depending on libc symbol sizes. The kernel call sits between them |
| 15 | + * and is marked noinline so it cannot be hoisted out of the region. |
| 16 | + * |
| 17 | + * Exit code carries a 7-bit checksum of the result (the value is masked |
| 18 | + * with 0x7f) so a mismatch between the scalar (rv64gc) and vector (rv64gcv) |
| 19 | + * builds is observable from QEMU's process exit status. |
| 20 | + */ |
| 21 | +#include <stdint.h> |
| 22 | +#include <stddef.h> |
| 23 | +#include <math.h> |
| 24 | + |
| 25 | +/* Kernel prototypes (definitions in kernels.c). */ |
| 26 | +void saxpy(size_t n, float a, const float *x, float *y); |
| 27 | +void daxpy(size_t n, double a, const double *x, double *y); |
| 28 | +float dot_product(size_t n, const float *a, const float *b); |
| 29 | +float l2_norm(size_t n, const float *a); |
| 30 | +void cond_mask_add(size_t n, const float *x, float *y); |
| 31 | +void cond_mask_mul(size_t n, const float *x, float *y); |
| 32 | +float strided_load_2(size_t n, const float *x); |
| 33 | +float strided_load_4(size_t n, const float *x); |
| 34 | +float sum_reduction(size_t n, const float *x); |
| 35 | +float max_reduction(size_t n, const float *x); |
| 36 | +size_t argmax(size_t n, const float *x); |
| 37 | +void int8_quantize(size_t n, const float *x, int8_t *y, float scale); |
| 38 | +void int8_dequantize(size_t n, const int8_t *x, float *y, float scale); |
| 39 | +void bit_reverse_byte(size_t n, uint8_t *x); |
| 40 | +void packed_uint8_to_uint16(size_t n, const uint8_t *x, uint16_t *y); |
| 41 | +void softmax_inplace(size_t n, float *x); |
| 42 | +void memcpy_byte(size_t n, const uint8_t *src, uint8_t *dst); |
| 43 | +size_t strlen_simple(const char *s); |
| 44 | +float dot_product_f32_unrolled4(size_t n, const float *a, const float *b); |
| 45 | +void layernorm_f32(size_t n, float *x, const float *gamma, const float *beta, float eps); |
| 46 | +void gelu_tanh_f32(size_t n, float *x); |
| 47 | +void silu_f32(size_t n, float *x); |
| 48 | +void saxpy_i8(size_t n, int8_t a, const int8_t *x, int8_t *y); |
| 49 | +int32_t sum_i16(size_t n, const int16_t *x); |
| 50 | +float gather_sum_f32(size_t n, const float *x, const int32_t *idx); |
| 51 | +void memset_byte(size_t n, uint8_t v, uint8_t *dst); |
| 52 | + |
/*
 * Region markers. These two no-op functions bound the kernel call so the
 * execlog post-processor can find the dynamic instruction window by the
 * addresses of these symbols rather than by libc symbol metadata.
 *
 * Both markers are noinline and contain an empty volatile asm with a memory
 * clobber, so the calls stay in place and memory accesses are not moved
 * across them.
 *
 * The two bodies are textually identical, which makes them candidates for
 * identical-code folding: the compiler may turn one symbol into an alias of
 * the other, leaving begin and end at the same address. noinline does not
 * prevent that, so no_icf is added wherever the compiler supports it.
 */
#if defined(__has_attribute)
#  if __has_attribute(no_icf)
#    define KERNEL_REGION_MARKER __attribute__((noinline, no_icf))
#  endif
#endif
#ifndef KERNEL_REGION_MARKER
#  define KERNEL_REGION_MARKER __attribute__((noinline))
#endif

/* Marks the start of the measured window; does nothing at run time. */
KERNEL_REGION_MARKER void kernel_region_begin(void) { __asm__ volatile("" ::: "memory"); }

/* Marks the end of the measured window; does nothing at run time. */
KERNEL_REGION_MARKER void kernel_region_end(void) { __asm__ volatile("" ::: "memory"); }
| 62 | + |
/*
 * Deterministic input generator. A fixed seed and a 32-bit linear
 * congruential step mean the scalar and vector builds are fed exactly the
 * same sequence of values.
 */
static uint32_t lcg_state = 0x12345678u;

/* Advance the generator by one step and return the new 32-bit state. */
static uint32_t lcg(void)
{
    const uint32_t next = lcg_state * 1664525u + 1013904223u;

    lcg_state = next;
    return next;
}

/*
 * Return the top 24 bits of the next LCG output divided by 2^23, i.e. a
 * value in [0, 2).
 *
 * NOTE(review): the int32_t cast is applied after the unsigned shift, so the
 * value is never negative. If signed inputs in [-1, 1) were intended, this
 * needs changing; confirm against the kernels' expectations.
 */
static float randf(void)
{
    const int32_t top24 = (int32_t)(lcg() >> 8);
    const float scale = (float)(1 << 23);

    return (float)top24 / scale;
}
| 67 | + |
/* Element capacity of every static buffer; the largest kernel uses n = 65536. */
#define MAXN 65536

/* Single-precision working arrays. */
static float fa[MAXN];
static float fb[MAXN];
static float fc[MAXN];

/* Double-precision working arrays. */
static double da[MAXN];
static double db[MAXN];

/* Signed 8-bit working arrays. */
static int8_t ia[MAXN];
static int8_t ib[MAXN];

/*
 * Unsigned 8-bit and 16-bit arrays. ub and u16 are not written by main's
 * fill loop, so they start out zero-initialized.
 */
static uint8_t ua[MAXN];
static uint8_t ub[MAXN];
static uint16_t u16[MAXN];

/* Signed 16-bit and 32-bit arrays. */
static int16_t i16[MAXN];
static int32_t i32[MAXN];

/*
 * Volatile sinks, one per result type. Storing a kernel result here forces
 * the compiler to keep the computation that produced it.
 */
static volatile float sink_f;
static volatile double sink_d;
static volatile size_t sink_z;
static volatile int32_t sink_i;
| 82 | + |
| 83 | +int main(void) { |
| 84 | + for (size_t i = 0; i < MAXN; ++i) { |
| 85 | + fa[i] = randf(); |
| 86 | + fb[i] = randf(); |
| 87 | + fc[i] = randf(); |
| 88 | + da[i] = (double)randf(); |
| 89 | + db[i] = (double)randf(); |
| 90 | + ia[i] = (int8_t)(lcg() & 0xff); |
| 91 | + ib[i] = (int8_t)(lcg() & 0xff); |
| 92 | + ua[i] = (uint8_t)(lcg() & 0xff); |
| 93 | + i16[i] = (int16_t)(lcg() & 0xffff); |
| 94 | + i32[i] = (int32_t)(lcg() % 4096); |
| 95 | + } |
| 96 | + |
| 97 | + kernel_region_begin(); |
| 98 | +#if defined(KERNEL_saxpy) |
| 99 | + saxpy(8192, 2.5f, fa, fb); sink_f = fb[7]; |
| 100 | +#elif defined(KERNEL_daxpy) |
| 101 | + daxpy(4096, 2.5, da, db); sink_d = db[7]; |
| 102 | +#elif defined(KERNEL_dot_product) |
| 103 | + sink_f = dot_product(8192, fa, fb); |
| 104 | +#elif defined(KERNEL_l2_norm) |
| 105 | + sink_f = l2_norm(8192, fa); |
| 106 | +#elif defined(KERNEL_cond_mask_add) |
| 107 | + cond_mask_add(8192, fa, fb); sink_f = fb[7]; |
| 108 | +#elif defined(KERNEL_cond_mask_mul) |
| 109 | + cond_mask_mul(8192, fa, fb); sink_f = fb[7]; |
| 110 | +#elif defined(KERNEL_strided_load_2) |
| 111 | + sink_f = strided_load_2(8192, fa); |
| 112 | +#elif defined(KERNEL_strided_load_4) |
| 113 | + sink_f = strided_load_4(8192, fa); |
| 114 | +#elif defined(KERNEL_sum_reduction) |
| 115 | + sink_f = sum_reduction(8192, fa); |
| 116 | +#elif defined(KERNEL_max_reduction) |
| 117 | + sink_f = max_reduction(8192, fa); |
| 118 | +#elif defined(KERNEL_argmax) |
| 119 | + sink_z = argmax(8192, fa); |
| 120 | +#elif defined(KERNEL_int8_quantize) |
| 121 | + int8_quantize(16384, fa, ia, 0.5f); sink_i = ia[7]; |
| 122 | +#elif defined(KERNEL_int8_dequantize) |
| 123 | + int8_dequantize(16384, ia, fa, 0.5f); sink_f = fa[7]; |
| 124 | +#elif defined(KERNEL_bit_reverse_byte) |
| 125 | + bit_reverse_byte(16384, ua); sink_i = ua[7]; |
| 126 | +#elif defined(KERNEL_packed_uint8_to_uint16) |
| 127 | + packed_uint8_to_uint16(16384, ua, u16); sink_i = u16[7]; |
| 128 | +#elif defined(KERNEL_softmax_inplace) |
| 129 | + softmax_inplace(1024, fa); sink_f = fa[7]; |
| 130 | +#elif defined(KERNEL_memcpy_byte) |
| 131 | + memcpy_byte(65536, ua, ub); sink_i = ub[7]; |
| 132 | +#elif defined(KERNEL_strlen_simple) |
| 133 | + ua[4095] = 0; for (size_t i = 0; i < 4095; ++i) if (ua[i] == 0) ua[i] = 1; |
| 134 | + sink_z = strlen_simple((const char *)ua); |
| 135 | +#elif defined(KERNEL_dot_product_f32_unrolled4) |
| 136 | + sink_f = dot_product_f32_unrolled4(8192, fa, fb); |
| 137 | +#elif defined(KERNEL_layernorm_f32) |
| 138 | + layernorm_f32(4096, fa, fb, fc, 1e-5f); sink_f = fa[7]; |
| 139 | +#elif defined(KERNEL_gelu_tanh_f32) |
| 140 | + gelu_tanh_f32(4096, fa); sink_f = fa[7]; |
| 141 | +#elif defined(KERNEL_silu_f32) |
| 142 | + silu_f32(4096, fa); sink_f = fa[7]; |
| 143 | +#elif defined(KERNEL_saxpy_i8) |
| 144 | + saxpy_i8(16384, 3, ia, ib); sink_i = ib[7]; |
| 145 | +#elif defined(KERNEL_sum_i16) |
| 146 | + sink_i = sum_i16(16384, i16); |
| 147 | +#elif defined(KERNEL_gather_sum_f32) |
| 148 | + sink_f = gather_sum_f32(4096, fa, i32); |
| 149 | +#elif defined(KERNEL_memset_byte) |
| 150 | + memset_byte(65536, 0xab, ub); sink_i = ub[7]; |
| 151 | +#else |
| 152 | +#error "no KERNEL_<name> selected" |
| 153 | +#endif |
| 154 | + kernel_region_end(); |
| 155 | + |
| 156 | + /* Fold every sink into the exit code so the call cannot be elided and a |
| 157 | + * scalar/vector divergence in the result shows up in the exit status. */ |
| 158 | + uint32_t cs = (uint32_t)sink_i; |
| 159 | + cs ^= (uint32_t)(int32_t)(sink_f * 1024.0f); |
| 160 | + cs ^= (uint32_t)(int32_t)(sink_d * 1024.0); |
| 161 | + cs ^= (uint32_t)sink_z; |
| 162 | + return (int)(cs & 0x7f); |
| 163 | +} |
0 commit comments