Skip to content

Commit 0e2d67c

Browse files
committed
extmod/zephyr_kernel: Use static thread pool to prevent GC heap corruption.
Thread structures allocated in the GC heap caused corruption during concurrent gc.collect() operations. When thread A runs GC and scans thread B's structure, the GC could relocate/compact thread B's structure mid-scan, invalidating pointers and causing memory corruption. Solution: allocate all thread structures from a static pool outside the GC heap. This matches the ports/zephyr pattern and ensures thread structures remain stable during GC operations. Changes: - Add static thread_pool[MP_THREAD_POOL_SIZE] array - Add thread_pool_alloc() and thread_pool_free() functions - Replace m_new_obj() calls with static pool allocation - Free threads back to the pool in cleanup instead of relying on GC reclamation. This fixes string/QSTR corruption with 2-3 concurrent threads calling gc.collect(). Tests with 4+ threads still exhibit crashes, indicating additional issues remain to be investigated. Signed-off-by: Andrew Leech <[email protected]>
1 parent 4e29364 commit 0e2d67c

File tree

866 files changed

+13685
-26
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

866 files changed

+13685
-26
lines changed

=== Exit code: ===

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
Starting thread_gc1 test...
2+
True

Timeout exit code:

417 Bytes
Binary file not shown.

echo

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
Starting thread_gc1 test...
2+
True

extmod/zephyr_kernel/kernel/mpthread_zephyr.c

Lines changed: 66 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -173,6 +173,35 @@ static mp_thread_stack_slot_t stack_slot[MP_THREAD_MAXIMUM_USER_THREADS];
173173
// Pre-allocated stack pool
174174
K_THREAD_STACK_ARRAY_DEFINE(mp_thread_stack_array, MP_THREAD_MAXIMUM_USER_THREADS, MP_THREAD_DEFAULT_STACK_SIZE);
175175

176+
// Static thread pool (main + user threads) - CRITICAL for GC safety
177+
// Thread structures MUST be outside GC heap to avoid corruption during concurrent gc.collect()
178+
// When thread A runs GC and scans thread B's structure, heap allocation could move B mid-scan
179+
#define MP_THREAD_POOL_SIZE (MP_THREAD_MAXIMUM_USER_THREADS + 1)
180+
static mp_thread_protected_t thread_pool[MP_THREAD_POOL_SIZE];
181+
static uint32_t thread_pool_bitmap = 0; // Bit N set = slot N allocated
182+
183+
// Allocate thread from static pool (thread_mutex must be held by caller)
184+
static mp_thread_protected_t *thread_pool_alloc(void) {
185+
for (int i = 0; i < MP_THREAD_POOL_SIZE; i++) {
186+
if (!(thread_pool_bitmap & (1U << i))) {
187+
thread_pool_bitmap |= (1U << i);
188+
memset(&thread_pool[i], 0, sizeof(mp_thread_protected_t));
189+
thread_pool[i].canary_before = CANARY_BEFORE;
190+
thread_pool[i].canary_after = CANARY_AFTER;
191+
return &thread_pool[i];
192+
}
193+
}
194+
return NULL; // Pool exhausted
195+
}
196+
197+
// Free thread back to static pool (thread_mutex must be held by caller)
198+
static void thread_pool_free(mp_thread_protected_t *protected) {
199+
if (protected >= thread_pool && protected < thread_pool + MP_THREAD_POOL_SIZE) {
200+
int index = protected - thread_pool;
201+
thread_pool_bitmap &= ~(1U << index);
202+
}
203+
}
204+
176205
// Forward declarations
177206
static void mp_thread_iterate_threads_cb(const struct k_thread *thread, void *user_data);
178207
static int32_t mp_thread_find_stack_slot(void);
@@ -195,13 +224,14 @@ bool mp_thread_init_early(void) {
195224
bool mp_thread_init(void *stack) {
196225
// Note: mp_zephyr_kernel_init() must be called by main() BEFORE mp_thread_init_early()
197226
// to properly initialize the Zephyr kernel and bootstrap thread.
198-
// GC must be initialized BEFORE calling this function (heap allocation required)
199227
// mp_thread_init_early() must be called BEFORE gc_init() to set thread-local state
200228

201-
// Allocate main thread with canary protection (same as all other threads)
202-
mp_thread_protected_t *protected = m_new_obj(mp_thread_protected_t);
203-
protected->canary_before = CANARY_BEFORE;
204-
protected->canary_after = CANARY_AFTER;
229+
// Allocate main thread from static pool (GC-safe, no heap allocation)
230+
// mutex not yet initialized, but this is single-threaded during init
231+
mp_thread_protected_t *protected = thread_pool_alloc();
232+
if (protected == NULL) {
233+
return false; // Should never happen for main thread
234+
}
205235
mp_thread_t *th = &protected->thread;
206236

207237
// Initialize main thread entry in linked list
@@ -290,7 +320,8 @@ void mp_thread_gc_others(void) {
290320
}
291321
mp_thread_counter--;
292322
DEBUG_printf("GC: Collected thread %s\n", k_thread_name_get(th->id));
293-
// The "th" memory will eventually be reclaimed by the GC
323+
// Free thread back to static pool (GC-safe, no heap involved)
324+
thread_pool_free(th->protected);
294325
// Don't update prev - we removed this node
295326
} else {
296327
th->alive = 0; // Reset for next GC cycle
@@ -331,40 +362,45 @@ void mp_thread_gc_others(void) {
331362
}
332363
}
333364

334-
gc_collect_root((void **)&th, 1);
335-
gc_collect_root(&th->arg, 1);
336-
gc_collect_root((void **)&th->protected, 1); // Mark full protected structure
337-
365+
// Skip current thread - its stack is scanned separately via gc_helper_collect_regs_and_stack()
338366
if (th->id == k_current_get()) {
339-
continue; // Don't scan current thread's stack (done separately)
367+
// Still need to mark thread structure and arg as roots
368+
gc_collect_root((void **)&th, 1);
369+
gc_collect_root(&th->arg, 1);
370+
gc_collect_root((void **)&th->protected, 1);
371+
continue;
340372
}
341373

342374
if (th->status != MP_THREAD_STATUS_READY) {
343375
continue; // Thread not running
344376
}
345377

378+
// Mark thread structure, arg, and protected as GC roots
379+
gc_collect_root((void **)&th, 1);
380+
gc_collect_root(&th->arg, 1);
381+
gc_collect_root((void **)&th->protected, 1);
382+
346383
// Validate stack_len matches Zephyr's reported stack size
347-
// This catches corruption where stack_len is modified or uninitialized
348-
// Query actual allocated stack size from Zephyr
349384
size_t actual_stack_size = th->id->stack_info.size / sizeof(uintptr_t);
350385
const size_t min_stack_words = 256; // Minimum 1KB
351386

352387
if (th->stack_len < min_stack_words) {
353388
mp_printf(&mp_plat_print,
354-
"WARNING: stack_len=0x%x too small (th=%p id=%p status=%d)\r\n",
355-
(unsigned int)th->stack_len, th, th->id, th->status);
356-
continue; // Skip corrupted stack_len values
389+
"WARNING: stack_len=0x%x too small (th=%s)\r\n",
390+
(unsigned int)th->stack_len, k_thread_name_get(th->id));
391+
continue;
357392
}
358393

359394
if (th->stack_len != actual_stack_size) {
360395
mp_printf(&mp_plat_print,
361-
"WARNING: stack_len=0x%x mismatch (th=%p actual=0x%x)\r\n",
362-
(unsigned int)th->stack_len, th, (unsigned int)actual_stack_size);
363-
// Use actual size from Zephyr for safety
396+
"WARNING: stack_len mismatch (th=%s expected=0x%x actual=0x%x)\r\n",
397+
k_thread_name_get(th->id),
398+
(unsigned int)th->stack_len, (unsigned int)actual_stack_size);
364399
th->stack_len = actual_stack_size;
365400
}
366401

367-
// Scan the thread's stack
402+
// Scan the full stack (conservative approach matching ports/zephyr)
403+
// This may scan some stale data below the current SP, but ensures all live references are found
368404
gc_collect_root(th->stack, th->stack_len);
369405
}
370406

@@ -431,12 +467,16 @@ mp_uint_t mp_thread_create_ex(void *(*entry)(void *), void *arg, size_t *stack_s
431467
// trying to lock thread_mutex while main thread was in gc_collect(). The normal GC
432468
// cycles will clean up finished threads anyway.
433469

434-
// Heap allocation with canary protection for memory corruption detection
435-
// Allocate mp_thread_protected_t (contains canaries before/after mp_thread_t)
436-
// Static pool testing showed: 25/42 pass (worse than 26/40 with heap)
437-
mp_thread_protected_t *protected = m_new_obj(mp_thread_protected_t);
438-
protected->canary_before = CANARY_BEFORE;
439-
protected->canary_after = CANARY_AFTER;
470+
// Allocate thread from static pool (GC-safe, outside GC heap)
471+
// CRITICAL: Thread structures in GC heap cause corruption during concurrent gc.collect()
472+
// When thread A scans thread B's structure, heap compaction can move B mid-scan
473+
mp_thread_mutex_lock(&thread_mutex, 1);
474+
mp_thread_protected_t *protected = thread_pool_alloc();
475+
mp_thread_mutex_unlock(&thread_mutex);
476+
477+
if (protected == NULL) {
478+
mp_raise_OSError(ENOMEM); // Thread pool exhausted
479+
}
440480
mp_thread_t *th = &protected->thread;
441481

442482
// Initialize ALL fields to safe values before adding to list

0 commit comments

Comments
 (0)