diff --git a/src/f32-gemm/gen/f32-gemm-10x16c2-minmax-asm-amd64-avx512f-broadcast.S b/src/f32-gemm/gen/f32-gemm-10x16c2-minmax-asm-amd64-avx512f-broadcast.S index b253a1ea931..333a982b390 100644 --- a/src/f32-gemm/gen/f32-gemm-10x16c2-minmax-asm-amd64-avx512f-broadcast.S +++ b/src/f32-gemm/gen/f32-gemm-10x16c2-minmax-asm-amd64-avx512f-broadcast.S @@ -25,8 +25,10 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_10x16c2__asm_amd64_avx512f_broadcast .intel_syntax noprefix - # Free up GP registers. + # Save register arguments for tail call to msan annotation helper. + push rdi + push rsi push rbx push rbp push r15 @@ -35,14 +37,14 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_10x16c2__asm_amd64_avx512f_broadcast push r12 # load params to free up a GP registers - mov r13, [rsp + 80] # params + mov r13, [rsp + 96] # params vbroadcastss zmm0, DWORD PTR [r13] vbroadcastss zmm1, DWORD PTR [r13 + 4] # Load c pointer. - mov r10, [rsp + 56] + mov r10, [rsp + 72] # Load cm_stride. - mov r11, [rsp + 64] + mov r11, [rsp + 80] # Align the stack pointer. mov r13, rsp @@ -454,11 +456,16 @@ return: pop r15 pop rbp pop rbx + pop rsi + pop rdi + #if XNN_HAS_FEATURE(memory_sanitizer) + jmp xnn_gemm_ukernel_msan_sizeof_c_4 + #else ret + #endif END_FUNCTION xnn_f32_gemm_minmax_ukernel_10x16c2__asm_amd64_avx512f_broadcast - #ifdef __has_feature - #if __has_feature(dataflow_sanitizer) + #if XNN_HAS_FEATURE(dataflow_sanitizer) BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_10x16c2__asm_amd64_avx512f_broadcast.dfsan .intel_syntax noprefix # We could implement this by calling a function that implements the dfsan instrumentation. @@ -466,5 +473,4 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_10x16c2__asm_amd64_avx512f_broadcast. int 3 ret END_FUNCTION xnn_f32_gemm_minmax_ukernel_10x16c2__asm_amd64_avx512f_broadcast.dfsan - #endif #endif \ No newline at end of file diff --git a/src/f32-gemm/gen/f32-gemm-11x16c2-minmax-asm-amd64-avx512f-broadcast.S b/src/f32-gemm/gen/f32-gemm-11x16c2-minmax-asm-amd64-avx512f-broadcast.S index ca5bd294914..6af7814ee7d 100644 --- a/src/f32-gemm/gen/f32-gemm-11x16c2-minmax-asm-amd64-avx512f-broadcast.S +++ b/src/f32-gemm/gen/f32-gemm-11x16c2-minmax-asm-amd64-avx512f-broadcast.S @@ -25,8 +25,10 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_11x16c2__asm_amd64_avx512f_broadcast .intel_syntax noprefix - # Free up GP registers. + # Save register arguments for tail call to msan annotation helper. + push rdi + push rsi push rbx push rbp push r15 @@ -35,14 +37,14 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_11x16c2__asm_amd64_avx512f_broadcast push r12 # load params to free up a GP registers - mov r13, [rsp + 80] # params + mov r13, [rsp + 96] # params vbroadcastss zmm0, DWORD PTR [r13] vbroadcastss zmm1, DWORD PTR [r13 + 4] # Load c pointer. - mov r10, [rsp + 56] + mov r10, [rsp + 72] # Load cm_stride. - mov r11, [rsp + 64] + mov r11, [rsp + 80] # Align the stack pointer. mov r13, rsp @@ -487,11 +489,16 @@ return: pop r15 pop rbp pop rbx + pop rsi + pop rdi + #if XNN_HAS_FEATURE(memory_sanitizer) + jmp xnn_gemm_ukernel_msan_sizeof_c_4 + #else ret + #endif END_FUNCTION xnn_f32_gemm_minmax_ukernel_11x16c2__asm_amd64_avx512f_broadcast - #ifdef __has_feature - #if __has_feature(dataflow_sanitizer) + #if XNN_HAS_FEATURE(dataflow_sanitizer) BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_11x16c2__asm_amd64_avx512f_broadcast.dfsan .intel_syntax noprefix # We could implement this by calling a function that implements the dfsan instrumentation. @@ -499,5 +506,4 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_11x16c2__asm_amd64_avx512f_broadcast. int 3 ret END_FUNCTION xnn_f32_gemm_minmax_ukernel_11x16c2__asm_amd64_avx512f_broadcast.dfsan - #endif #endif \ No newline at end of file diff --git a/src/f32-gemm/gen/f32-gemm-1x16c2-minmax-asm-amd64-avx512f-broadcast.S b/src/f32-gemm/gen/f32-gemm-1x16c2-minmax-asm-amd64-avx512f-broadcast.S index 342ba0e5439..94cfb4c1315 100644 --- a/src/f32-gemm/gen/f32-gemm-1x16c2-minmax-asm-amd64-avx512f-broadcast.S +++ b/src/f32-gemm/gen/f32-gemm-1x16c2-minmax-asm-amd64-avx512f-broadcast.S @@ -25,8 +25,10 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_1x16c2__asm_amd64_avx512f_broadcast .intel_syntax noprefix - # Free up GP registers. + # Save register arguments for tail call to msan annotation helper. + push rdi + push rsi push rbx push rbp push r15 @@ -35,14 +37,14 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_1x16c2__asm_amd64_avx512f_broadcast push r12 # load params to free up a GP registers - mov r13, [rsp + 80] # params + mov r13, [rsp + 96] # params vbroadcastss zmm0, DWORD PTR [r13] vbroadcastss zmm1, DWORD PTR [r13 + 4] # Load c pointer. - mov r10, [rsp + 56] + mov r10, [rsp + 72] # Load cm_stride. - mov r11, [rsp + 64] + mov r11, [rsp + 80] # Align the stack pointer. mov r13, rsp @@ -52,7 +54,7 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_1x16c2__asm_amd64_avx512f_broadcast mov [rsp], r13 # Allocate some space on the stack. - sub rsp, 64 + sub rsp, 128 # Copy k and flip bit. mov r11, rdx @@ -134,7 +136,7 @@ tail: vmovups ZMMWORD PTR [r10]{k1}, zmm11 return: - add rsp, 64 + add rsp, 128 mov r13, [rsp] mov rsp, r13 # Restore the callee saved registers. @@ -144,11 +146,16 @@ return: pop r15 pop rbp pop rbx + pop rsi + pop rdi + #if XNN_HAS_FEATURE(memory_sanitizer) + jmp xnn_gemm_ukernel_msan_sizeof_c_4 + #else ret + #endif END_FUNCTION xnn_f32_gemm_minmax_ukernel_1x16c2__asm_amd64_avx512f_broadcast - #ifdef __has_feature - #if __has_feature(dataflow_sanitizer) + #if XNN_HAS_FEATURE(dataflow_sanitizer) BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_1x16c2__asm_amd64_avx512f_broadcast.dfsan .intel_syntax noprefix # We could implement this by calling a function that implements the dfsan instrumentation. @@ -156,5 +163,4 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_1x16c2__asm_amd64_avx512f_broadcast.d int 3 ret END_FUNCTION xnn_f32_gemm_minmax_ukernel_1x16c2__asm_amd64_avx512f_broadcast.dfsan - #endif #endif \ No newline at end of file diff --git a/src/f32-gemm/gen/f32-gemm-1x32c2-minmax-asm-amd64-avx512f-broadcast.S b/src/f32-gemm/gen/f32-gemm-1x32c2-minmax-asm-amd64-avx512f-broadcast.S index d31aa78e101..d5e4edbe7f5 100644 --- a/src/f32-gemm/gen/f32-gemm-1x32c2-minmax-asm-amd64-avx512f-broadcast.S +++ b/src/f32-gemm/gen/f32-gemm-1x32c2-minmax-asm-amd64-avx512f-broadcast.S @@ -25,8 +25,10 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_1x32c2__asm_amd64_avx512f_broadcast .intel_syntax noprefix - # Free up GP registers. + # Save register arguments for tail call to msan annotation helper. + push rdi + push rsi push rbx push rbp push r15 @@ -35,14 +37,14 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_1x32c2__asm_amd64_avx512f_broadcast push r12 # load params to free up a GP registers - mov r13, [rsp + 80] # params + mov r13, [rsp + 96] # params vbroadcastss zmm0, DWORD PTR [r13] vbroadcastss zmm1, DWORD PTR [r13 + 4] # Load c pointer. - mov r10, [rsp + 56] + mov r10, [rsp + 72] # Load cm_stride. - mov r11, [rsp + 64] + mov r11, [rsp + 80] # Align the stack pointer. mov r13, rsp @@ -52,7 +54,7 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_1x32c2__asm_amd64_avx512f_broadcast mov [rsp], r13 # Allocate some space on the stack. - sub rsp, 64 + sub rsp, 128 # Copy k and flip bit. mov r11, rdx @@ -157,7 +159,7 @@ tail: vmovups ZMMWORD PTR [r10 + 64]{k2}, zmm12 return: - add rsp, 64 + add rsp, 128 mov r13, [rsp] mov rsp, r13 # Restore the callee saved registers. @@ -167,11 +169,16 @@ return: pop r15 pop rbp pop rbx + pop rsi + pop rdi + #if XNN_HAS_FEATURE(memory_sanitizer) + jmp xnn_gemm_ukernel_msan_sizeof_c_4 + #else ret + #endif END_FUNCTION xnn_f32_gemm_minmax_ukernel_1x32c2__asm_amd64_avx512f_broadcast - #ifdef __has_feature - #if __has_feature(dataflow_sanitizer) + #if XNN_HAS_FEATURE(dataflow_sanitizer) BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_1x32c2__asm_amd64_avx512f_broadcast.dfsan .intel_syntax noprefix # We could implement this by calling a function that implements the dfsan instrumentation. @@ -179,5 +186,4 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_1x32c2__asm_amd64_avx512f_broadcast.d int 3 ret END_FUNCTION xnn_f32_gemm_minmax_ukernel_1x32c2__asm_amd64_avx512f_broadcast.dfsan - #endif #endif \ No newline at end of file diff --git a/src/f32-gemm/gen/f32-gemm-2x16c2-minmax-asm-amd64-avx512f-broadcast.S b/src/f32-gemm/gen/f32-gemm-2x16c2-minmax-asm-amd64-avx512f-broadcast.S index d2015697f28..b550d122de4 100644 --- a/src/f32-gemm/gen/f32-gemm-2x16c2-minmax-asm-amd64-avx512f-broadcast.S +++ b/src/f32-gemm/gen/f32-gemm-2x16c2-minmax-asm-amd64-avx512f-broadcast.S @@ -25,8 +25,10 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_2x16c2__asm_amd64_avx512f_broadcast .intel_syntax noprefix - # Free up GP registers. + # Save register arguments for tail call to msan annotation helper. + push rdi + push rsi push rbx push rbp push r15 @@ -35,14 +37,14 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_2x16c2__asm_amd64_avx512f_broadcast push r12 # load params to free up a GP registers - mov r13, [rsp + 80] # params + mov r13, [rsp + 96] # params vbroadcastss zmm0, DWORD PTR [r13] vbroadcastss zmm1, DWORD PTR [r13 + 4] # Load c pointer. - mov r10, [rsp + 56] + mov r10, [rsp + 72] # Load cm_stride. - mov r11, [rsp + 64] + mov r11, [rsp + 80] # Align the stack pointer. mov r13, rsp @@ -171,11 +173,16 @@ return: pop r15 pop rbp pop rbx + pop rsi + pop rdi + #if XNN_HAS_FEATURE(memory_sanitizer) + jmp xnn_gemm_ukernel_msan_sizeof_c_4 + #else ret + #endif END_FUNCTION xnn_f32_gemm_minmax_ukernel_2x16c2__asm_amd64_avx512f_broadcast - #ifdef __has_feature - #if __has_feature(dataflow_sanitizer) + #if XNN_HAS_FEATURE(dataflow_sanitizer) BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_2x16c2__asm_amd64_avx512f_broadcast.dfsan .intel_syntax noprefix # We could implement this by calling a function that implements the dfsan instrumentation. @@ -183,5 +190,4 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_2x16c2__asm_amd64_avx512f_broadcast.d int 3 ret END_FUNCTION xnn_f32_gemm_minmax_ukernel_2x16c2__asm_amd64_avx512f_broadcast.dfsan - #endif #endif \ No newline at end of file diff --git a/src/f32-gemm/gen/f32-gemm-2x32c2-minmax-asm-amd64-avx512f-broadcast.S b/src/f32-gemm/gen/f32-gemm-2x32c2-minmax-asm-amd64-avx512f-broadcast.S index 36f8a00ef01..ce1c9553afe 100644 --- a/src/f32-gemm/gen/f32-gemm-2x32c2-minmax-asm-amd64-avx512f-broadcast.S +++ b/src/f32-gemm/gen/f32-gemm-2x32c2-minmax-asm-amd64-avx512f-broadcast.S @@ -25,8 +25,10 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_2x32c2__asm_amd64_avx512f_broadcast .intel_syntax noprefix - # Free up GP registers. + # Save register arguments for tail call to msan annotation helper. + push rdi + push rsi push rbx push rbp push r15 @@ -35,14 +37,14 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_2x32c2__asm_amd64_avx512f_broadcast push r12 # load params to free up a GP registers - mov r13, [rsp + 80] # params + mov r13, [rsp + 96] # params vbroadcastss zmm0, DWORD PTR [r13] vbroadcastss zmm1, DWORD PTR [r13 + 4] # Load c pointer. - mov r10, [rsp + 56] + mov r10, [rsp + 72] # Load cm_stride. - mov r11, [rsp + 64] + mov r11, [rsp + 80] # Align the stack pointer. mov r13, rsp @@ -209,11 +211,16 @@ return: pop r15 pop rbp pop rbx + pop rsi + pop rdi + #if XNN_HAS_FEATURE(memory_sanitizer) + jmp xnn_gemm_ukernel_msan_sizeof_c_4 + #else ret + #endif END_FUNCTION xnn_f32_gemm_minmax_ukernel_2x32c2__asm_amd64_avx512f_broadcast - #ifdef __has_feature - #if __has_feature(dataflow_sanitizer) + #if XNN_HAS_FEATURE(dataflow_sanitizer) BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_2x32c2__asm_amd64_avx512f_broadcast.dfsan .intel_syntax noprefix # We could implement this by calling a function that implements the dfsan instrumentation. @@ -221,5 +228,4 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_2x32c2__asm_amd64_avx512f_broadcast.d int 3 ret END_FUNCTION xnn_f32_gemm_minmax_ukernel_2x32c2__asm_amd64_avx512f_broadcast.dfsan - #endif #endif \ No newline at end of file diff --git a/src/f32-gemm/gen/f32-gemm-3x16c2-minmax-asm-amd64-avx512f-broadcast.S b/src/f32-gemm/gen/f32-gemm-3x16c2-minmax-asm-amd64-avx512f-broadcast.S index d1c56d99e5e..0f67615ee1c 100644 --- a/src/f32-gemm/gen/f32-gemm-3x16c2-minmax-asm-amd64-avx512f-broadcast.S +++ b/src/f32-gemm/gen/f32-gemm-3x16c2-minmax-asm-amd64-avx512f-broadcast.S @@ -25,8 +25,10 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_3x16c2__asm_amd64_avx512f_broadcast .intel_syntax noprefix - # Free up GP registers. + # Save register arguments for tail call to msan annotation helper. + push rdi + push rsi push rbx push rbp push r15 @@ -35,14 +37,14 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_3x16c2__asm_amd64_avx512f_broadcast push r12 # load params to free up a GP registers - mov r13, [rsp + 80] # params + mov r13, [rsp + 96] # params vbroadcastss zmm0, DWORD PTR [r13] vbroadcastss zmm1, DWORD PTR [r13 + 4] # Load c pointer. - mov r10, [rsp + 56] + mov r10, [rsp + 72] # Load cm_stride. - mov r11, [rsp + 64] + mov r11, [rsp + 80] # Align the stack pointer. mov r13, rsp @@ -198,11 +200,16 @@ return: pop r15 pop rbp pop rbx + pop rsi + pop rdi + #if XNN_HAS_FEATURE(memory_sanitizer) + jmp xnn_gemm_ukernel_msan_sizeof_c_4 + #else ret + #endif END_FUNCTION xnn_f32_gemm_minmax_ukernel_3x16c2__asm_amd64_avx512f_broadcast - #ifdef __has_feature - #if __has_feature(dataflow_sanitizer) + #if XNN_HAS_FEATURE(dataflow_sanitizer) BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_3x16c2__asm_amd64_avx512f_broadcast.dfsan .intel_syntax noprefix # We could implement this by calling a function that implements the dfsan instrumentation. @@ -210,5 +217,4 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_3x16c2__asm_amd64_avx512f_broadcast.d int 3 ret END_FUNCTION xnn_f32_gemm_minmax_ukernel_3x16c2__asm_amd64_avx512f_broadcast.dfsan - #endif #endif \ No newline at end of file diff --git a/src/f32-gemm/gen/f32-gemm-3x32c2-minmax-asm-amd64-avx512f-broadcast.S b/src/f32-gemm/gen/f32-gemm-3x32c2-minmax-asm-amd64-avx512f-broadcast.S index 24954a34f7f..975b2a13901 100644 --- a/src/f32-gemm/gen/f32-gemm-3x32c2-minmax-asm-amd64-avx512f-broadcast.S +++ b/src/f32-gemm/gen/f32-gemm-3x32c2-minmax-asm-amd64-avx512f-broadcast.S @@ -25,8 +25,10 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_3x32c2__asm_amd64_avx512f_broadcast .intel_syntax noprefix - # Free up GP registers. + # Save register arguments for tail call to msan annotation helper. + push rdi + push rsi push rbx push rbp push r15 @@ -35,14 +37,14 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_3x32c2__asm_amd64_avx512f_broadcast push r12 # load params to free up a GP registers - mov r13, [rsp + 80] # params + mov r13, [rsp + 96] # params vbroadcastss zmm0, DWORD PTR [r13] vbroadcastss zmm1, DWORD PTR [r13 + 4] # Load c pointer. - mov r10, [rsp + 56] + mov r10, [rsp + 72] # Load cm_stride. - mov r11, [rsp + 64] + mov r11, [rsp + 80] # Align the stack pointer. mov r13, rsp @@ -251,11 +253,16 @@ return: pop r15 pop rbp pop rbx + pop rsi + pop rdi + #if XNN_HAS_FEATURE(memory_sanitizer) + jmp xnn_gemm_ukernel_msan_sizeof_c_4 + #else ret + #endif END_FUNCTION xnn_f32_gemm_minmax_ukernel_3x32c2__asm_amd64_avx512f_broadcast - #ifdef __has_feature - #if __has_feature(dataflow_sanitizer) + #if XNN_HAS_FEATURE(dataflow_sanitizer) BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_3x32c2__asm_amd64_avx512f_broadcast.dfsan .intel_syntax noprefix # We could implement this by calling a function that implements the dfsan instrumentation. @@ -263,5 +270,4 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_3x32c2__asm_amd64_avx512f_broadcast.d int 3 ret END_FUNCTION xnn_f32_gemm_minmax_ukernel_3x32c2__asm_amd64_avx512f_broadcast.dfsan - #endif #endif \ No newline at end of file diff --git a/src/f32-gemm/gen/f32-gemm-4x16c2-minmax-asm-amd64-avx512f-broadcast.S b/src/f32-gemm/gen/f32-gemm-4x16c2-minmax-asm-amd64-avx512f-broadcast.S index 733cd6aa078..048ac524eb2 100644 --- a/src/f32-gemm/gen/f32-gemm-4x16c2-minmax-asm-amd64-avx512f-broadcast.S +++ b/src/f32-gemm/gen/f32-gemm-4x16c2-minmax-asm-amd64-avx512f-broadcast.S @@ -25,8 +25,10 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_4x16c2__asm_amd64_avx512f_broadcast .intel_syntax noprefix - # Free up GP registers. + # Save register arguments for tail call to msan annotation helper. + push rdi + push rsi push rbx push rbp push r15 @@ -35,14 +37,14 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_4x16c2__asm_amd64_avx512f_broadcast push r12 # load params to free up a GP registers - mov r13, [rsp + 80] # params + mov r13, [rsp + 96] # params vbroadcastss zmm0, DWORD PTR [r13] vbroadcastss zmm1, DWORD PTR [r13 + 4] # Load c pointer. - mov r10, [rsp + 56] + mov r10, [rsp + 72] # Load cm_stride. - mov r11, [rsp + 64] + mov r11, [rsp + 80] # Align the stack pointer. mov r13, rsp @@ -225,11 +227,16 @@ return: pop r15 pop rbp pop rbx + pop rsi + pop rdi + #if XNN_HAS_FEATURE(memory_sanitizer) + jmp xnn_gemm_ukernel_msan_sizeof_c_4 + #else ret + #endif END_FUNCTION xnn_f32_gemm_minmax_ukernel_4x16c2__asm_amd64_avx512f_broadcast - #ifdef __has_feature - #if __has_feature(dataflow_sanitizer) + #if XNN_HAS_FEATURE(dataflow_sanitizer) BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_4x16c2__asm_amd64_avx512f_broadcast.dfsan .intel_syntax noprefix # We could implement this by calling a function that implements the dfsan instrumentation. @@ -237,5 +244,4 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_4x16c2__asm_amd64_avx512f_broadcast.d int 3 ret END_FUNCTION xnn_f32_gemm_minmax_ukernel_4x16c2__asm_amd64_avx512f_broadcast.dfsan - #endif #endif \ No newline at end of file diff --git a/src/f32-gemm/gen/f32-gemm-4x32c2-minmax-asm-amd64-avx512f-broadcast.S b/src/f32-gemm/gen/f32-gemm-4x32c2-minmax-asm-amd64-avx512f-broadcast.S index 5f0d903785d..44c42a466e2 100644 --- a/src/f32-gemm/gen/f32-gemm-4x32c2-minmax-asm-amd64-avx512f-broadcast.S +++ b/src/f32-gemm/gen/f32-gemm-4x32c2-minmax-asm-amd64-avx512f-broadcast.S @@ -25,8 +25,10 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_4x32c2__asm_amd64_avx512f_broadcast .intel_syntax noprefix - # Free up GP registers. + # Save register arguments for tail call to msan annotation helper. + push rdi + push rsi push rbx push rbp push r15 @@ -35,14 +37,14 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_4x32c2__asm_amd64_avx512f_broadcast push r12 # load params to free up a GP registers - mov r13, [rsp + 80] # params + mov r13, [rsp + 96] # params vbroadcastss zmm0, DWORD PTR [r13] vbroadcastss zmm1, DWORD PTR [r13 + 4] # Load c pointer. - mov r10, [rsp + 56] + mov r10, [rsp + 72] # Load cm_stride. - mov r11, [rsp + 64] + mov r11, [rsp + 80] # Align the stack pointer. mov r13, rsp @@ -293,11 +295,16 @@ return: pop r15 pop rbp pop rbx + pop rsi + pop rdi + #if XNN_HAS_FEATURE(memory_sanitizer) + jmp xnn_gemm_ukernel_msan_sizeof_c_4 + #else ret + #endif END_FUNCTION xnn_f32_gemm_minmax_ukernel_4x32c2__asm_amd64_avx512f_broadcast - #ifdef __has_feature - #if __has_feature(dataflow_sanitizer) + #if XNN_HAS_FEATURE(dataflow_sanitizer) BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_4x32c2__asm_amd64_avx512f_broadcast.dfsan .intel_syntax noprefix # We could implement this by calling a function that implements the dfsan instrumentation. @@ -305,5 +312,4 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_4x32c2__asm_amd64_avx512f_broadcast.d int 3 ret END_FUNCTION xnn_f32_gemm_minmax_ukernel_4x32c2__asm_amd64_avx512f_broadcast.dfsan - #endif #endif \ No newline at end of file diff --git a/src/f32-gemm/gen/f32-gemm-5x16c2-minmax-asm-amd64-avx512f-broadcast.S b/src/f32-gemm/gen/f32-gemm-5x16c2-minmax-asm-amd64-avx512f-broadcast.S index e0bc7920255..21bad11faa9 100644 --- a/src/f32-gemm/gen/f32-gemm-5x16c2-minmax-asm-amd64-avx512f-broadcast.S +++ b/src/f32-gemm/gen/f32-gemm-5x16c2-minmax-asm-amd64-avx512f-broadcast.S @@ -25,8 +25,10 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_5x16c2__asm_amd64_avx512f_broadcast .intel_syntax noprefix - # Free up GP registers. + # Save register arguments for tail call to msan annotation helper. + push rdi + push rsi push rbx push rbp push r15 @@ -35,14 +37,14 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_5x16c2__asm_amd64_avx512f_broadcast push r12 # load params to free up a GP registers - mov r13, [rsp + 80] # params + mov r13, [rsp + 96] # params vbroadcastss zmm0, DWORD PTR [r13] vbroadcastss zmm1, DWORD PTR [r13 + 4] # Load c pointer. - mov r10, [rsp + 56] + mov r10, [rsp + 72] # Load cm_stride. - mov r11, [rsp + 64] + mov r11, [rsp + 80] # Align the stack pointer. mov r13, rsp @@ -52,7 +54,7 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_5x16c2__asm_amd64_avx512f_broadcast mov [rsp], r13 # Allocate some space on the stack. - sub rsp, 128 + sub rsp, 192 # Clamp a & c pointers if mr <= 1 mov rax, rcx @@ -242,7 +244,7 @@ tail: vmovups ZMMWORD PTR [r8]{k1}, zmm15 return: - add rsp, 128 + add rsp, 192 mov r13, [rsp] mov rsp, r13 # Restore the callee saved registers. @@ -252,11 +254,16 @@ return: pop r15 pop rbp pop rbx + pop rsi + pop rdi + #if XNN_HAS_FEATURE(memory_sanitizer) + jmp xnn_gemm_ukernel_msan_sizeof_c_4 + #else ret + #endif END_FUNCTION xnn_f32_gemm_minmax_ukernel_5x16c2__asm_amd64_avx512f_broadcast - #ifdef __has_feature - #if __has_feature(dataflow_sanitizer) + #if XNN_HAS_FEATURE(dataflow_sanitizer) BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_5x16c2__asm_amd64_avx512f_broadcast.dfsan .intel_syntax noprefix # We could implement this by calling a function that implements the dfsan instrumentation. @@ -264,5 +271,4 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_5x16c2__asm_amd64_avx512f_broadcast.d int 3 ret END_FUNCTION xnn_f32_gemm_minmax_ukernel_5x16c2__asm_amd64_avx512f_broadcast.dfsan - #endif #endif \ No newline at end of file diff --git a/src/f32-gemm/gen/f32-gemm-5x32c2-minmax-asm-amd64-avx512f-broadcast.S b/src/f32-gemm/gen/f32-gemm-5x32c2-minmax-asm-amd64-avx512f-broadcast.S index a2096959051..10a01f0aa86 100644 --- a/src/f32-gemm/gen/f32-gemm-5x32c2-minmax-asm-amd64-avx512f-broadcast.S +++ b/src/f32-gemm/gen/f32-gemm-5x32c2-minmax-asm-amd64-avx512f-broadcast.S @@ -25,8 +25,10 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_5x32c2__asm_amd64_avx512f_broadcast .intel_syntax noprefix - # Free up GP registers. + # Save register arguments for tail call to msan annotation helper. + push rdi + push rsi push rbx push rbp push r15 @@ -35,14 +37,14 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_5x32c2__asm_amd64_avx512f_broadcast push r12 # load params to free up a GP registers - mov r13, [rsp + 80] # params + mov r13, [rsp + 96] # params vbroadcastss zmm0, DWORD PTR [r13] vbroadcastss zmm1, DWORD PTR [r13 + 4] # Load c pointer. - mov r10, [rsp + 56] + mov r10, [rsp + 72] # Load cm_stride. - mov r11, [rsp + 64] + mov r11, [rsp + 80] # Align the stack pointer. mov r13, rsp @@ -52,7 +54,7 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_5x32c2__asm_amd64_avx512f_broadcast mov [rsp], r13 # Allocate some space on the stack. - sub rsp, 128 + sub rsp, 192 # Clamp a & c pointers if mr <= 1 mov rax, rcx @@ -325,7 +327,7 @@ tail: vmovups ZMMWORD PTR [r8 + 64]{k2}, zmm20 return: - add rsp, 128 + add rsp, 192 mov r13, [rsp] mov rsp, r13 # Restore the callee saved registers. @@ -335,11 +337,16 @@ return: pop r15 pop rbp pop rbx + pop rsi + pop rdi + #if XNN_HAS_FEATURE(memory_sanitizer) + jmp xnn_gemm_ukernel_msan_sizeof_c_4 + #else ret + #endif END_FUNCTION xnn_f32_gemm_minmax_ukernel_5x32c2__asm_amd64_avx512f_broadcast - #ifdef __has_feature - #if __has_feature(dataflow_sanitizer) + #if XNN_HAS_FEATURE(dataflow_sanitizer) BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_5x32c2__asm_amd64_avx512f_broadcast.dfsan .intel_syntax noprefix # We could implement this by calling a function that implements the dfsan instrumentation. @@ -347,5 +354,4 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_5x32c2__asm_amd64_avx512f_broadcast.d int 3 ret END_FUNCTION xnn_f32_gemm_minmax_ukernel_5x32c2__asm_amd64_avx512f_broadcast.dfsan - #endif #endif \ No newline at end of file diff --git a/src/f32-gemm/gen/f32-gemm-6x16c2-minmax-asm-amd64-avx512f-broadcast.S b/src/f32-gemm/gen/f32-gemm-6x16c2-minmax-asm-amd64-avx512f-broadcast.S index 862e64cf3a5..14b70d14eaa 100644 --- a/src/f32-gemm/gen/f32-gemm-6x16c2-minmax-asm-amd64-avx512f-broadcast.S +++ b/src/f32-gemm/gen/f32-gemm-6x16c2-minmax-asm-amd64-avx512f-broadcast.S @@ -25,8 +25,10 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_6x16c2__asm_amd64_avx512f_broadcast .intel_syntax noprefix - # Free up GP registers. + # Save register arguments for tail call to msan annotation helper. + push rdi + push rsi push rbx push rbp push r15 @@ -35,14 +37,14 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_6x16c2__asm_amd64_avx512f_broadcast push r12 # load params to free up a GP registers - mov r13, [rsp + 80] # params + mov r13, [rsp + 96] # params vbroadcastss zmm0, DWORD PTR [r13] vbroadcastss zmm1, DWORD PTR [r13 + 4] # Load c pointer. - mov r10, [rsp + 56] + mov r10, [rsp + 72] # Load cm_stride. - mov r11, [rsp + 64] + mov r11, [rsp + 80] # Align the stack pointer. mov r13, rsp @@ -322,11 +324,16 @@ return: pop r15 pop rbp pop rbx + pop rsi + pop rdi + #if XNN_HAS_FEATURE(memory_sanitizer) + jmp xnn_gemm_ukernel_msan_sizeof_c_4 + #else ret + #endif END_FUNCTION xnn_f32_gemm_minmax_ukernel_6x16c2__asm_amd64_avx512f_broadcast - #ifdef __has_feature - #if __has_feature(dataflow_sanitizer) + #if XNN_HAS_FEATURE(dataflow_sanitizer) BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_6x16c2__asm_amd64_avx512f_broadcast.dfsan .intel_syntax noprefix # We could implement this by calling a function that implements the dfsan instrumentation. @@ -334,5 +341,4 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_6x16c2__asm_amd64_avx512f_broadcast.d int 3 ret END_FUNCTION xnn_f32_gemm_minmax_ukernel_6x16c2__asm_amd64_avx512f_broadcast.dfsan - #endif #endif \ No newline at end of file diff --git a/src/f32-gemm/gen/f32-gemm-7x16c2-minmax-asm-amd64-avx512f-broadcast.S b/src/f32-gemm/gen/f32-gemm-7x16c2-minmax-asm-amd64-avx512f-broadcast.S index 907ff8b98bd..4e04b9bcd29 100644 --- a/src/f32-gemm/gen/f32-gemm-7x16c2-minmax-asm-amd64-avx512f-broadcast.S +++ b/src/f32-gemm/gen/f32-gemm-7x16c2-minmax-asm-amd64-avx512f-broadcast.S @@ -25,8 +25,10 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_7x16c2__asm_amd64_avx512f_broadcast .intel_syntax noprefix - # Free up GP registers. + # Save register arguments for tail call to msan annotation helper. + push rdi + push rsi push rbx push rbp push r15 @@ -35,14 +37,14 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_7x16c2__asm_amd64_avx512f_broadcast push r12 # load params to free up a GP registers - mov r13, [rsp + 80] # params + mov r13, [rsp + 96] # params vbroadcastss zmm0, DWORD PTR [r13] vbroadcastss zmm1, DWORD PTR [r13 + 4] # Load c pointer. - mov r10, [rsp + 56] + mov r10, [rsp + 72] # Load cm_stride. - mov r11, [rsp + 64] + mov r11, [rsp + 80] # Align the stack pointer. mov r13, rsp @@ -355,11 +357,16 @@ return: pop r15 pop rbp pop rbx + pop rsi + pop rdi + #if XNN_HAS_FEATURE(memory_sanitizer) + jmp xnn_gemm_ukernel_msan_sizeof_c_4 + #else ret + #endif END_FUNCTION xnn_f32_gemm_minmax_ukernel_7x16c2__asm_amd64_avx512f_broadcast - #ifdef __has_feature - #if __has_feature(dataflow_sanitizer) + #if XNN_HAS_FEATURE(dataflow_sanitizer) BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_7x16c2__asm_amd64_avx512f_broadcast.dfsan .intel_syntax noprefix # We could implement this by calling a function that implements the dfsan instrumentation. @@ -367,5 +374,4 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_7x16c2__asm_amd64_avx512f_broadcast.d int 3 ret END_FUNCTION xnn_f32_gemm_minmax_ukernel_7x16c2__asm_amd64_avx512f_broadcast.dfsan - #endif #endif \ No newline at end of file diff --git a/src/f32-gemm/gen/f32-gemm-8x16c2-minmax-asm-amd64-avx512f-broadcast.S b/src/f32-gemm/gen/f32-gemm-8x16c2-minmax-asm-amd64-avx512f-broadcast.S index 61df00ab0c1..a9431898210 100644 --- a/src/f32-gemm/gen/f32-gemm-8x16c2-minmax-asm-amd64-avx512f-broadcast.S +++ b/src/f32-gemm/gen/f32-gemm-8x16c2-minmax-asm-amd64-avx512f-broadcast.S @@ -25,8 +25,10 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_8x16c2__asm_amd64_avx512f_broadcast .intel_syntax noprefix - # Free up GP registers. + # Save register arguments for tail call to msan annotation helper. + push rdi + push rsi push rbx push rbp push r15 @@ -35,14 +37,14 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_8x16c2__asm_amd64_avx512f_broadcast push r12 # load params to free up a GP registers - mov r13, [rsp + 80] # params + mov r13, [rsp + 96] # params vbroadcastss zmm0, DWORD PTR [r13] vbroadcastss zmm1, DWORD PTR [r13 + 4] # Load c pointer. - mov r10, [rsp + 56] + mov r10, [rsp + 72] # Load cm_stride. - mov r11, [rsp + 64] + mov r11, [rsp + 80] # Align the stack pointer. mov r13, rsp @@ -388,11 +390,16 @@ return: pop r15 pop rbp pop rbx + pop rsi + pop rdi + #if XNN_HAS_FEATURE(memory_sanitizer) + jmp xnn_gemm_ukernel_msan_sizeof_c_4 + #else ret + #endif END_FUNCTION xnn_f32_gemm_minmax_ukernel_8x16c2__asm_amd64_avx512f_broadcast - #ifdef __has_feature - #if __has_feature(dataflow_sanitizer) + #if XNN_HAS_FEATURE(dataflow_sanitizer) BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_8x16c2__asm_amd64_avx512f_broadcast.dfsan .intel_syntax noprefix # We could implement this by calling a function that implements the dfsan instrumentation. @@ -400,5 +407,4 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_8x16c2__asm_amd64_avx512f_broadcast.d int 3 ret END_FUNCTION xnn_f32_gemm_minmax_ukernel_8x16c2__asm_amd64_avx512f_broadcast.dfsan - #endif #endif \ No newline at end of file diff --git a/src/f32-gemm/gen/f32-gemm-9x16c2-minmax-asm-amd64-avx512f-broadcast.S b/src/f32-gemm/gen/f32-gemm-9x16c2-minmax-asm-amd64-avx512f-broadcast.S index 8ea685aa04e..31c09eddb5d 100644 --- a/src/f32-gemm/gen/f32-gemm-9x16c2-minmax-asm-amd64-avx512f-broadcast.S +++ b/src/f32-gemm/gen/f32-gemm-9x16c2-minmax-asm-amd64-avx512f-broadcast.S @@ -25,8 +25,10 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_9x16c2__asm_amd64_avx512f_broadcast .intel_syntax noprefix - # Free up GP registers. + # Save register arguments for tail call to msan annotation helper. + push rdi + push rsi push rbx push rbp push r15 @@ -35,14 +37,14 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_9x16c2__asm_amd64_avx512f_broadcast push r12 # load params to free up a GP registers - mov r13, [rsp + 80] # params + mov r13, [rsp + 96] # params vbroadcastss zmm0, DWORD PTR [r13] vbroadcastss zmm1, DWORD PTR [r13 + 4] # Load c pointer. - mov r10, [rsp + 56] + mov r10, [rsp + 72] # Load cm_stride. - mov r11, [rsp + 64] + mov r11, [rsp + 80] # Align the stack pointer. mov r13, rsp @@ -52,7 +54,7 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_9x16c2__asm_amd64_avx512f_broadcast mov [rsp], r13 # Allocate some space on the stack. - sub rsp, 192 + sub rsp, 256 # Write rsi (a pointer) to the stack as we need the register. mov [rsp + 16], rcx # Write r10 (c pointer) to the stack as we need the register. @@ -411,7 +413,7 @@ tail: vmovups ZMMWORD PTR [rbp]{k1}, zmm19 return: - add rsp, 192 + add rsp, 256 mov r13, [rsp] mov rsp, r13 # Restore the callee saved registers. @@ -421,11 +423,16 @@ return: pop r15 pop rbp pop rbx + pop rsi + pop rdi + #if XNN_HAS_FEATURE(memory_sanitizer) + jmp xnn_gemm_ukernel_msan_sizeof_c_4 + #else ret + #endif END_FUNCTION xnn_f32_gemm_minmax_ukernel_9x16c2__asm_amd64_avx512f_broadcast - #ifdef __has_feature - #if __has_feature(dataflow_sanitizer) + #if XNN_HAS_FEATURE(dataflow_sanitizer) BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_9x16c2__asm_amd64_avx512f_broadcast.dfsan .intel_syntax noprefix # We could implement this by calling a function that implements the dfsan instrumentation. @@ -433,5 +440,4 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_9x16c2__asm_amd64_avx512f_broadcast.d int 3 ret END_FUNCTION xnn_f32_gemm_minmax_ukernel_9x16c2__asm_amd64_avx512f_broadcast.dfsan - #endif #endif \ No newline at end of file