From d9d2c6ae0e3e14f3568d19e743388298006f3a13 Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Sun, 22 Jun 2025 16:51:01 -0500 Subject: [PATCH] Conditionally harden barriers. Details: - Adds a configuration flag `--harden-barriers` (disabled by default)`. - When enabled, threads record a) the currently-detected sense variable from the barrier object (in this mode, the sense variable is also incremented rather than XOR'ed between 1 and 0 to prevent ABA problems), and b) the source location of the call to `bli_thrinfo_barrier` or `bli_thrinfo_bcast` as an address to a string literal. If any thread in a team records different information from its peers, a diagnostic is printed and the program aborts. - This information requires an additional dynamically-allocated array, and some extra reads/writes during the barrier process. While I haven't measured it, the performance impact should be small though (and is opt-in). - This should detect errors related to problems such as conditionally-taken barriers within a thread team, use of the incorrect thread info object, threads escaping barriers early, etc. Limitations: - Both calls to `bli_thrcomm_barrier` within `bli_thrinfo_bcast` receive the same source line information. However, the check on sense variable should still catch any problems. - Certain problems (such as missing a broadcast) may still manifest as illegal memory accesses or memory corruption before the problem can be detected in a later barrier. - Not implemented for tree barriers yet. I would prefer to refactor the tree and non-tree barriers as a unified implementation first. --- build/bli_config.h.in | 4 +++ build/libblis-symbols.def | 2 -- configure | 24 ++++++++++++++++ frame/thread/bli_thrcomm.c | 44 +++++++++++++++++++++++------ frame/thread/bli_thrcomm.h | 29 ++++++++++++------- frame/thread/bli_thrcomm_hpx.cpp | 5 ++-- frame/thread/bli_thrcomm_hpx.h | 2 +- frame/thread/bli_thrcomm_openmp.c | 25 +++++++++++----- frame/thread/bli_thrcomm_openmp.h | 4 +-- frame/thread/bli_thrcomm_pthreads.c | 13 +++++++-- frame/thread/bli_thrcomm_pthreads.h | 4 +-- frame/thread/bli_thrcomm_single.c | 2 +- frame/thread/bli_thrcomm_single.h | 2 +- frame/thread/bli_thrinfo.h | 26 +++++++++++++++-- 14 files changed, 146 insertions(+), 40 deletions(-) diff --git a/build/bli_config.h.in b/build/bli_config.h.in index f883c492b..ceb081855 100644 --- a/build/bli_config.h.in +++ b/build/bli_config.h.in @@ -83,6 +83,10 @@ #endif #endif +#if @harden_barriers@ +#define BLIS_HARDEN_BARRIERS +#endif + #if @enable_jrir_slab@ #define BLIS_ENABLE_JRIR_SLAB #endif diff --git a/build/libblis-symbols.def b/build/libblis-symbols.def index ec68592d7..383cada50 100644 --- a/build/libblis-symbols.def +++ b/build/libblis-symbols.def @@ -990,8 +990,6 @@ bli_szcopysc bli_szipsc bli_szxpbym_md bli_szxpbym_md_ex -bli_thrcomm_barrier -bli_thrcomm_bcast bli_thread_get_ic_nt bli_thread_get_ir_nt bli_thread_get_jc_nt diff --git a/configure b/configure index a22054e75..0ceee4828 100755 --- a/configure +++ b/configure @@ -256,6 +256,15 @@ print_usage() --disable-threading if you suspect any correctness or deadlock issues. + --harden-barriers + + Keep track of additional information which enables + run-time detection of various issues due to either misuse + of BLIS thrinfo_t structures (esp. via bli_thrinfo_barrier + and bli_thrinfo_bcast) or potentially to bugs in the + BLIS threading code itself. There may be a performance + penalty. + --disable-pba-pools, --enable-pba-pools --disable-sba-pools, --enable-sba-pools @@ -3027,6 +3036,9 @@ blis_main() # The thread-local storage flag. enable_tls='yes' + # Barrier hardening flag. + harden_barriers='no' + # The threading flag. threading_model='off' @@ -3228,6 +3240,10 @@ blis_main() enable_tls='no' ;; + harden-barriers) + harden_barriers='yes' + ;; + enable-threading=*) threading_model=${OPTARG#*=} ;; @@ -3803,6 +3819,13 @@ blis_main() enable_tls_01=0 fi + # Check for barrier hardening. + harden_barriers_01=0 + if [[ ${harden_barriers} = yes ]]; then + echo "${script_name}: Barriers will be hardened. There may be some performance impact." + harden_barriers_01=1 + fi + # Check the threading model flag and standardize its value, if needed. # Note that single-threaded mode will always be enabled, but not necessarily # by default. @@ -4456,6 +4479,7 @@ blis_main() add_config_var kernel_list_defines add_config_var omit_symbol_list_defines add_config_var enable_tls enable_tls_01 + add_config_var harden_barriers harden_barriers_01 add_config_var enable_openmp enable_openmp_01 add_config_var enable_openmp_as_def enable_openmp_as_def_01 add_config_var enable_pthreads enable_pthreads_01 diff --git a/frame/thread/bli_thrcomm.c b/frame/thread/bli_thrcomm.c index 30257db39..9d3d27c49 100644 --- a/frame/thread/bli_thrcomm.c +++ b/frame/thread/bli_thrcomm.c @@ -172,7 +172,7 @@ void bli_thrcomm_cleanup( thrcomm_t* comm ) fp( comm ); } -void bli_thrcomm_barrier( dim_t tid, thrcomm_t* comm ) +void bli_thrcomm_barrier( dim_t tid, thrcomm_t* comm, const char* tag ) { const timpl_t ti = bli_thrcomm_thread_impl( comm ); const thrcomm_barrier_ft fp = barrier_fpa[ ti ]; @@ -182,25 +182,26 @@ void bli_thrcomm_barrier( dim_t tid, thrcomm_t* comm ) if ( fp == NULL ) bli_abort(); // Call the threading-specific barrier function. - fp( tid, comm ); + fp( tid, comm, tag ); } // -- Other functions ---------------------------------------------------------- void* bli_thrcomm_bcast ( - dim_t id, - void* to_send, - thrcomm_t* comm + dim_t id, + void* to_send, + thrcomm_t* comm, + const char* tag ) { if ( comm == NULL || comm->n_threads == 1 ) return to_send; if ( id == 0 ) comm->sent_object = to_send; - bli_thrcomm_barrier( id, comm ); + bli_thrcomm_barrier( id, comm, tag ); void* object = comm->sent_object; - bli_thrcomm_barrier( id, comm ); + bli_thrcomm_barrier( id, comm, tag ); return object; } @@ -222,7 +223,7 @@ void* bli_thrcomm_bcast #endif -void bli_thrcomm_barrier_atomic( dim_t t_id, thrcomm_t* comm ) +void bli_thrcomm_barrier_atomic( dim_t t_id, thrcomm_t* comm, const char* tag ) { // Return early if the comm is NULL or if there is only one // thread participating. @@ -238,6 +239,11 @@ void bli_thrcomm_barrier_atomic( dim_t t_id, thrcomm_t* comm ) // decremented back to 0, and so forth). gint_t orig_sense = __atomic_load_n( &comm->barrier_sense, __ATOMIC_RELAXED ); + #ifdef BLIS_HARDEN_BARRIERS + comm->status[ t_id ].barrier_sense = orig_sense; + comm->status[ t_id ].tag = tag; + #endif + // Register ourselves (the current thread) as having arrived by // incrementing the barrier_threads_arrived variable. We must perform // this increment (and a subsequent read) atomically. @@ -248,13 +254,35 @@ void bli_thrcomm_barrier_atomic( dim_t t_id, thrcomm_t* comm ) // it will take actions that effectively ends and resets the barrier. if ( my_threads_arrived == comm->n_threads ) { + #ifdef BLIS_HARDEN_BARRIERS + // Check that all threads a) called bli_thrinfo_barrier or + // bli_thrinfo_bcast from the same source location, and b) + // encountered the same original sense variable. + for ( dim_t i = 0;i < comm->n_threads; i++ ) + { + if ( comm->status[ i ].barrier_sense != orig_sense || + comm->status[ i ].tag != tag ) + { + printf( "Inconsistency detected in barrier:\n" ); + for ( dim_t j = 0;j < comm->n_threads; j++ ) + printf( "Thread %d detected sense %lld at %s\n", + (int)(j+1), (long long)comm->status[ j ].barrier_sense, comm->status[ j ].tag ); + bli_abort(); + } + } + #endif + // Reset the variable tracking the number of threads that have arrived // to zero (which returns the barrier to the "empty" state. Then // atomically toggle the barrier sense variable. This will signal to // the other threads (which are spinning in the branch elow) that it // is now safe to exit the barrier. comm->barrier_threads_arrived = 0; + #ifdef BLIS_HARDEN_BARRIERS + __atomic_fetch_add( &comm->barrier_sense, 1, __ATOMIC_RELEASE ); + #else __atomic_fetch_xor( &comm->barrier_sense, 1, __ATOMIC_RELEASE ); + #endif } else { diff --git a/frame/thread/bli_thrcomm.h b/frame/thread/bli_thrcomm.h index 22b02d97e..8fd2b1daf 100644 --- a/frame/thread/bli_thrcomm.h +++ b/frame/thread/bli_thrcomm.h @@ -81,13 +81,22 @@ typedef struct hpx_barrier_t // Define the thrcomm_t structure, which will be common to all threading // implementations. +typedef struct thrcomm_status_s +{ + gint_t barrier_sense; + const char* tag; + char padding[ BLIS_CACHE_LINE_SIZE ]; +} thrcomm_status_t; + typedef struct thrcomm_s { // -- Fields common to all threading implementations -- - - void* sent_object; - dim_t n_threads; - timpl_t ti; + void* sent_object; + dim_t n_threads; + timpl_t ti; + #ifdef BLIS_HARDEN_BARRIERS + thrcomm_status_t* status; + #endif // We insert a cache line of padding here to eliminate false sharing between // the fields above and fields below. @@ -155,7 +164,7 @@ typedef struct thrcomm_s // "overloaded" by each method of multithreading. typedef void (*thrcomm_init_ft)( dim_t nt, thrcomm_t* comm ); typedef void (*thrcomm_cleanup_ft)( thrcomm_t* comm ); -typedef void (*thrcomm_barrier_ft)( dim_t tid, thrcomm_t* comm ); +typedef void (*thrcomm_barrier_ft)( dim_t tid, thrcomm_t* comm, const char* tag ); // thrcomm_t query (field only) @@ -180,13 +189,13 @@ void bli_thrcomm_free( pool_t* sba_pool, thrcomm_t* comm ); // require the timpl_t as an argument. The threading-specific functions can // (and do) omit the timpl_t from their function signatures since their // threading implementation is intrinsically known. -void bli_thrcomm_init( timpl_t ti, dim_t n_threads, thrcomm_t* comm ); -void bli_thrcomm_cleanup( thrcomm_t* comm ); -BLIS_EXPORT_BLIS void bli_thrcomm_barrier( dim_t thread_id, thrcomm_t* comm ); +void bli_thrcomm_init( timpl_t ti, dim_t n_threads, thrcomm_t* comm ); +void bli_thrcomm_cleanup( thrcomm_t* comm ); +void bli_thrcomm_barrier( dim_t thread_id, thrcomm_t* comm, const char* tag ); // Other function prototypes. -BLIS_EXPORT_BLIS void* bli_thrcomm_bcast( dim_t inside_id, void* to_send, thrcomm_t* comm ); -void bli_thrcomm_barrier_atomic( dim_t thread_id, thrcomm_t* comm ); +void* bli_thrcomm_bcast( dim_t inside_id, void* to_send, thrcomm_t* comm, const char* tag ); +void bli_thrcomm_barrier_atomic( dim_t thread_id, thrcomm_t* comm, const char* tag ); #endif diff --git a/frame/thread/bli_thrcomm_hpx.cpp b/frame/thread/bli_thrcomm_hpx.cpp index 0947dc81d..9605075cd 100644 --- a/frame/thread/bli_thrcomm_hpx.cpp +++ b/frame/thread/bli_thrcomm_hpx.cpp @@ -55,7 +55,7 @@ void hpx_barrier_destroy( hpx_barrier_t* barrier ) auto* barrier_ = reinterpret_cast*>( barrier->handle ); barrier->handle = nullptr; - delete barrier_; + delete barrier_; } void hpx_barrier_arrive_and_wait( hpx_barrier_t* barrier ) @@ -86,8 +86,9 @@ void bli_thrcomm_cleanup_hpx( thrcomm_t* comm ) hpx_barrier_destroy( &comm->barrier ); } -void bli_thrcomm_barrier_hpx( dim_t t_id, thrcomm_t* comm ) +void bli_thrcomm_barrier_hpx( dim_t t_id, thrcomm_t* comm, const char* tag ) { + ( void )tag; hpx_barrier_arrive_and_wait( &comm->barrier ); } diff --git a/frame/thread/bli_thrcomm_hpx.h b/frame/thread/bli_thrcomm_hpx.h index d80cd2268..544dc87a1 100644 --- a/frame/thread/bli_thrcomm_hpx.h +++ b/frame/thread/bli_thrcomm_hpx.h @@ -40,7 +40,7 @@ void bli_thrcomm_init_hpx( dim_t nt, thrcomm_t* comm ); void bli_thrcomm_cleanup_hpx( thrcomm_t* comm ); -void bli_thrcomm_barrier_hpx( dim_t tid, thrcomm_t* comm ); +void bli_thrcomm_barrier_hpx( dim_t tid, thrcomm_t* comm, const char* tag ); #endif diff --git a/frame/thread/bli_thrcomm_openmp.c b/frame/thread/bli_thrcomm_openmp.c index 382d6c9a7..47006abf8 100644 --- a/frame/thread/bli_thrcomm_openmp.c +++ b/frame/thread/bli_thrcomm_openmp.c @@ -52,17 +52,26 @@ void bli_thrcomm_init_openmp( dim_t n_threads, thrcomm_t* comm ) comm->ti = BLIS_OPENMP; comm->barrier_sense = 0; comm->barrier_threads_arrived = 0; + + #ifdef BLIS_HARDEN_BARRIERS + err_t r_val; + comm->status = ( thrcomm_status_t* )bli_malloc_intl( n_threads * sizeof( thrcomm_status_t ), &r_val ); + #endif } void bli_thrcomm_cleanup_openmp( thrcomm_t* comm ) { + #ifdef BLIS_HARDEN_BARRIERS + bli_free_intl( comm->status ); + #endif + return; } -void bli_thrcomm_barrier_openmp( dim_t t_id, thrcomm_t* comm ) +void bli_thrcomm_barrier_openmp( dim_t t_id, thrcomm_t* comm, const char* tag ) { - bli_thrcomm_barrier_atomic( t_id, comm ); + bli_thrcomm_barrier_atomic( t_id, comm, tag ); } #else @@ -99,13 +108,13 @@ void bli_thrcomm_cleanup_openmp( thrcomm_t* comm ) bli_free_intl( comm->barriers ); } -void bli_thrcomm_barrier_openmp( dim_t t_id, thrcomm_t* comm ) +void bli_thrcomm_barrier_openmp( dim_t t_id, thrcomm_t* comm, const char* tag ) { // Return early if the comm is NULL or if there is only one // thread participating. if ( comm == NULL || comm->n_threads == 1 ) return; - bli_thrcomm_tree_barrier( comm->barriers[t_id] ); + bli_thrcomm_tree_barrier( comm->barriers[t_id], tag ); } // -- Helper functions --------------------------------------------------------- @@ -146,10 +155,10 @@ barrier_t* bli_thrcomm_tree_barrier_create( int num_threads, int arity, barrier_ kid->dad = me; leaf_index += threads_this_kid; - } + } me->count = arity; me->arity = arity; - } + } return me; } @@ -184,8 +193,10 @@ void bli_thrcomm_tree_barrier_free( barrier_t* barrier ) #endif -void bli_thrcomm_tree_barrier( barrier_t* barack ) +void bli_thrcomm_tree_barrier( barrier_t* barack, const char* tag ) { + //TODO + gint_t my_signal = __atomic_load_n( &barack->signal, __ATOMIC_RELAXED ); dim_t my_count = diff --git a/frame/thread/bli_thrcomm_openmp.h b/frame/thread/bli_thrcomm_openmp.h index 8c33d0c2f..296820c70 100644 --- a/frame/thread/bli_thrcomm_openmp.h +++ b/frame/thread/bli_thrcomm_openmp.h @@ -45,13 +45,13 @@ // OpenMP-specific function prototypes. void bli_thrcomm_init_openmp( dim_t nt, thrcomm_t* comm ); void bli_thrcomm_cleanup_openmp( thrcomm_t* comm ); -void bli_thrcomm_barrier_openmp( dim_t tid, thrcomm_t* comm ); +void bli_thrcomm_barrier_openmp( dim_t tid, thrcomm_t* comm, const char* tag ); // Prototypes specific to the OpenMP tree barrier implementation. #ifdef BLIS_TREE_BARRIER barrier_t* bli_thrcomm_tree_barrier_create( int num_threads, int arity, barrier_t** leaves, int leaf_index ); void bli_thrcomm_tree_barrier_free( barrier_t* barrier ); -void bli_thrcomm_tree_barrier( barrier_t* barack ); +void bli_thrcomm_tree_barrier( barrier_t* barack, const char* tag ); #endif #endif diff --git a/frame/thread/bli_thrcomm_pthreads.c b/frame/thread/bli_thrcomm_pthreads.c index 8e45a2782..94ab2d0bf 100644 --- a/frame/thread/bli_thrcomm_pthreads.c +++ b/frame/thread/bli_thrcomm_pthreads.c @@ -82,16 +82,25 @@ void bli_thrcomm_init_pthreads( dim_t n_threads, thrcomm_t* comm ) comm->ti = BLIS_POSIX; comm->barrier_sense = 0; comm->barrier_threads_arrived = 0; + + #ifdef BLIS_HARDEN_BARRIERS + err_t r_val; + comm->status = ( thrcomm_status_t* )bli_malloc_intl( n_threads * sizeof( thrcomm_status_t ), &r_val ); + #endif } void bli_thrcomm_cleanup_pthreads( thrcomm_t* comm ) { + #ifdef BLIS_HARDEN_BARRIERS + bli_free_intl( comm->status ); + #endif + return; } -void bli_thrcomm_barrier_pthreads( dim_t t_id, thrcomm_t* comm ) +void bli_thrcomm_barrier_pthreads( dim_t t_id, thrcomm_t* comm, const char* tag ) { - bli_thrcomm_barrier_atomic( t_id, comm ); + bli_thrcomm_barrier_atomic( t_id, comm, tag ); } #endif diff --git a/frame/thread/bli_thrcomm_pthreads.h b/frame/thread/bli_thrcomm_pthreads.h index 9a2447b99..9b8dc9f19 100644 --- a/frame/thread/bli_thrcomm_pthreads.h +++ b/frame/thread/bli_thrcomm_pthreads.h @@ -36,12 +36,12 @@ #define BLIS_THRCOMM_PTHREADS_H // Define these prototypes for situations when POSIX multithreading is enabled. -#ifdef BLIS_ENABLE_PTHREADS +#ifdef BLIS_ENABLE_PTHREADS // pthreads-specific function prototypes. void bli_thrcomm_init_pthreads( dim_t nt, thrcomm_t* comm ); void bli_thrcomm_cleanup_pthreads( thrcomm_t* comm ); -void bli_thrcomm_barrier_pthreads( dim_t tid, thrcomm_t* comm ); +void bli_thrcomm_barrier_pthreads( dim_t tid, thrcomm_t* comm, const char* tag ); #endif diff --git a/frame/thread/bli_thrcomm_single.c b/frame/thread/bli_thrcomm_single.c index 3116a5d06..3e383f0a6 100644 --- a/frame/thread/bli_thrcomm_single.c +++ b/frame/thread/bli_thrcomm_single.c @@ -51,7 +51,7 @@ void bli_thrcomm_cleanup_single( thrcomm_t* comm ) if ( comm == NULL ) return; } -void bli_thrcomm_barrier_single( dim_t t_id, thrcomm_t* comm ) +void bli_thrcomm_barrier_single( dim_t t_id, thrcomm_t* comm, const char* tag ) { return; } diff --git a/frame/thread/bli_thrcomm_single.h b/frame/thread/bli_thrcomm_single.h index fffb3fb75..2a1a19a99 100644 --- a/frame/thread/bli_thrcomm_single.h +++ b/frame/thread/bli_thrcomm_single.h @@ -41,7 +41,7 @@ // Sequential-specific function prototypes. void bli_thrcomm_init_single( dim_t nt, thrcomm_t* comm ); void bli_thrcomm_cleanup_single( thrcomm_t* comm ); -void bli_thrcomm_barrier_single( dim_t tid, thrcomm_t* comm ); +void bli_thrcomm_barrier_single( dim_t tid, thrcomm_t* comm, const char* tag ); #endif diff --git a/frame/thread/bli_thrinfo.h b/frame/thread/bli_thrinfo.h index 694bf282d..11a5ff23b 100644 --- a/frame/thread/bli_thrinfo.h +++ b/frame/thread/bli_thrinfo.h @@ -183,16 +183,38 @@ void bli_thrinfo_attach_sub_node( thrinfo_t* sub_node, thrinfo_t* t ); // other thrinfo_t-related functions +#ifdef BLIS_HARDEN_BARRIERS + +BLIS_INLINE void* bli_thrinfo_broadcast_( const thrinfo_t* t, void* p, const char* tag ) +{ + return bli_thrcomm_bcast( t->thread_id, p, t->comm, tag ); +} + +BLIS_INLINE void bli_thrinfo_barrier_( const thrinfo_t* t, const char* tag ) +{ + bli_thrcomm_barrier( t->thread_id, t->comm, tag ); +} + +#define bli_thrinfo_broadcast( t, p ) \ + bli_thrinfo_broadcast_( t, p, STRINGIFY_INT(__FILE__) ":" STRINGIFY_INT(__LINE__) ) + +#define bli_thrinfo_barrier( t ) \ + bli_thrinfo_barrier_( t, STRINGIFY_INT(__FILE__) ":" STRINGIFY_INT(__LINE__) ) + +#else + BLIS_INLINE void* bli_thrinfo_broadcast( const thrinfo_t* t, void* p ) { - return bli_thrcomm_bcast( t->thread_id, p, t->comm ); + return bli_thrcomm_bcast( t->thread_id, p, t->comm, "" ); } BLIS_INLINE void bli_thrinfo_barrier( const thrinfo_t* t ) { - bli_thrcomm_barrier( t->thread_id, t->comm ); + bli_thrcomm_barrier( t->thread_id, t->comm, "" ); } +#endif + // // Prototypes for level-3 thrinfo functions not specific to any operation.