@@ -47,6 +47,16 @@ OPENSSL_STATIC_ASSERT((sizeof((struct rand_thread_local_state*)0)->generate_call
4747DEFINE_BSS_GET (struct rand_thread_local_state * , thread_states_list_head )
4848DEFINE_STATIC_MUTEX (thread_local_states_list_lock )
4949
50+ // thread_local_drbg_shutdown_started is set to a non-zero value during process
51+ // exit by |rand_thread_local_state_clear_all|, while the linked-list lock is
52+ // held. All reads and writes occur under |thread_local_states_list_lock|, so
53+ // no atomic accessors are needed; doing the check under the lock also avoids
54+ // the otherwise-racy window where a TLS destructor could observe the flag as
55+ // unset, then block on the lock while shutdown zeroization runs, and then
56+ // free a state whose |state_clear_lock| has been intentionally write-locked
57+ // forever.
58+ DEFINE_BSS_GET (int , thread_local_drbg_shutdown_started )
59+
5060#if defined(_MSC_VER )
5161#pragma section(".CRT$XCU", read)
5262static void rand_thread_local_state_clear_all (void );
@@ -67,19 +77,42 @@ static void rand_thread_local_state_clear_all(void) __attribute__ ((destructor))
6777// randomness from a non-valid state. The linked application should obviously
6878// arrange that all threads are gracefully exited before exiting the process.
6979// Yet, in cases where such graceful exit does not happen we ensure that no
70- // output can be returned by locking all thread-local states and deliberately
71- // not releasing the lock. A synchronization step in the core randomness
72- // generation routine |RAND_bytes_core| then ensures that no randomness
73- // generation can occur after a thread-local state has been locked. It also
74- // ensures |rand_thread_local_state_free| cannot free any thread state while we
75- // own the lock.
80+ // output can be returned by locking each thread-local state's
81+ // |state_clear_lock| and deliberately not releasing it. A synchronization step
82+ // in the core randomness generation routine |RAND_bytes_core| then ensures
83+ // that no randomness generation can occur after a thread-local state has been
84+ // locked.
85+ //
86+ // We additionally set |thread_local_drbg_shutdown_started| under the
87+ // linked-list lock so that any thread which has not yet registered a
88+ // thread-local state cannot do so after this routine has begun zeroization;
89+ // without this, a fresh thread could allocate a state and bypass zeroization.
90+ // The linked-list lock itself is released at the end of this function. If we
91+ // instead held it forever, |thread_local_list_delete_node| (called from a
92+ // thread's TLS destructor in |rand_thread_local_state_free|) would block
93+ // indefinitely. On Windows, TLS destructors run under the loader lock, so
94+ // blocking there causes |ExitProcess| to hang waiting for the loader lock.
95+ // See https://github.com/aws/aws-lc/issues/3197.
7696//
77- // When a thread-local DRBGs is gated from returning output, we can invoke the
97+ // When a thread-local DRBG is gated from returning output, we can invoke the
7898// entropy source zeroization from |state->entropy_source|. The entropy source
7999// implementation can assume that any returned seed is never used to generate
80100// any randomness that is later returned to a consumer.
81101static void rand_thread_local_state_clear_all (void ) {
82102 CRYPTO_STATIC_MUTEX_lock_write (thread_local_states_list_lock_bss_get ());
103+
104+ // Idempotency guard. Under normal operation this routine runs exactly once
105+ // (via |atexit|), but it is also exposed for testing via
106+ // |rand_thread_local_state_clear_all_FOR_TESTING|, and re-running would
107+ // attempt to write-lock per-state |state_clear_lock|s that are already held
108+ // write-locked by the first invocation -- which is UB for a non-recursive
109+ // rwlock.
110+ if (* thread_local_drbg_shutdown_started_bss_get () != 0 ) {
111+ CRYPTO_STATIC_MUTEX_unlock_write (thread_local_states_list_lock_bss_get ());
112+ return ;
113+ }
114+ * thread_local_drbg_shutdown_started_bss_get () = 1 ;
115+
83116 for (struct rand_thread_local_state * state = * thread_states_list_head_bss_get ();
84117 state != NULL ; state = state -> next ) {
85118 CRYPTO_MUTEX_lock_write (& state -> state_clear_lock );
@@ -90,13 +123,34 @@ static void rand_thread_local_state_clear_all(void) {
90123 state != NULL ; state = state -> next ) {
91124 state -> entropy_source -> methods -> zeroize_thread (state -> entropy_source );
92125 }
126+
127+ CRYPTO_STATIC_MUTEX_unlock_write (thread_local_states_list_lock_bss_get ());
128+ }
129+
// rand_thread_local_state_clear_all_FOR_TESTING lets tests invoke the
// otherwise file-static |rand_thread_local_state_clear_all| directly. It takes
// no arguments, returns nothing, and simply forwards the call.
130+ void rand_thread_local_state_clear_all_FOR_TESTING (void ) {
131+ rand_thread_local_state_clear_all ();
93132}
94133
95- static void thread_local_list_delete_node (
134+ // thread_local_list_delete_node removes |node_delete| from the global
135+ // linked list and returns 1. If process-wide shutdown zeroization has already
136+ // begun, the node is left in place and 0 is returned -- the caller must not
137+ // free the node in that case because |rand_thread_local_state_clear_all| has
138+ // write-locked its |state_clear_lock| and intentionally never releases it.
139+ static int thread_local_list_delete_node (
96140 struct rand_thread_local_state * node_delete ) {
97141
98142 // Mutating the global linked list. Need to synchronize over all threads.
99143 CRYPTO_STATIC_MUTEX_lock_write (thread_local_states_list_lock_bss_get ());
144+
145+ // Re-check the shutdown flag under the lock. This makes the
146+ // "free vs. shutdown-zeroize" decision atomic with respect to
147+ // |rand_thread_local_state_clear_all|: either we delete and free before
148+ // shutdown begins, or shutdown wins and we leak the node deliberately.
149+ if (* thread_local_drbg_shutdown_started_bss_get () != 0 ) {
150+ CRYPTO_STATIC_MUTEX_unlock_write (thread_local_states_list_lock_bss_get ());
151+ return 0 ;
152+ }
153+
100154 struct rand_thread_local_state * node_head = * thread_states_list_head_bss_get ();
101155
102156 // We have [node_delete->previous] <--> [node_delete] <--> [node_delete->next]
@@ -127,6 +181,7 @@ static void thread_local_list_delete_node(
127181 }
128182
129183 CRYPTO_STATIC_MUTEX_unlock_write (thread_local_states_list_lock_bss_get ());
184+ return 1 ;
130185}
131186
132187// thread_local_list_add_node adds the state |node_add| to the linked list. Note that
@@ -141,6 +196,19 @@ static void thread_local_list_add_node(
141196 // Mutating the global linked list. Need to synchronize over all threads.
142197 CRYPTO_STATIC_MUTEX_lock_write (thread_local_states_list_lock_bss_get ());
143198
199+ // If process-wide zeroization has already started, do not add a new state to
200+ // the list -- it would not have been zeroized by
201+ // |rand_thread_local_state_clear_all|. Instead, write-lock the state's
202+ // |state_clear_lock| and never release it. This causes any subsequent
203+ // |RAND_bytes_core| call on this thread to block forever on the read lock,
204+ // matching the FIPS-derived guarantee that no output is returned after
205+ // shutdown zeroization.
206+ if (* thread_local_drbg_shutdown_started_bss_get () != 0 ) {
207+ CRYPTO_MUTEX_lock_write (& node_add -> state_clear_lock );
208+ CRYPTO_STATIC_MUTEX_unlock_write (thread_local_states_list_lock_bss_get ());
209+ return ;
210+ }
211+
144212 // First get a reference to the pointer of the head of the linked list.
145213 // That is, the pointer to the head node node_head is *thread_states_head.
146214 struct rand_thread_local_state * * thread_states_head = thread_states_list_head_bss_get ();
@@ -171,7 +239,16 @@ static void rand_thread_local_state_free(void *state_in) {
171239 return ;
172240 }
173241
174- thread_local_list_delete_node (state );
242+ // If process-wide shutdown zeroization has begun, the per-state
243+ // |state_clear_lock| has been write-locked and is intentionally never
244+ // released (so any in-flight |RAND_bytes| call cannot return output from a
245+ // zeroized state). |thread_local_list_delete_node| therefore detects this
246+ // case under the linked-list lock and refuses to delete; we must then leak
247+ // the node, since freeing it would destroy a held mutex. The OS reclaims
248+ // the memory at process exit. See issue #3197.
249+ if (thread_local_list_delete_node (state ) == 0 ) {
250+ return ;
251+ }
175252
176253 // Potentially, something could kill the thread before an entropy source has
177254 // been associated to the thread-local randomness generator object.
0 commit comments