Fix up a nasty little race on thread exit. After the thread sets the status to Empty and releases run_sema, it could get recycled. Since there's a little bit of code which needs to run after unlocking, there's a chance that it could get its stack stolen from under it. The solution it to use a temporary Zombie state, which prevents the thread structure from being reallocated. The thread exit code then sets the state to Empty and exits without touching the stack in the meantime. coregrind/core.h | 3 ++- coregrind/linux/core_os.c | 6 ------ coregrind/vg_main.c | 2 ++ coregrind/vg_scheduler.c | 19 +++++++++++++++---- coregrind/x86-linux/syscalls.c | 24 +++++++++++++++--------- 5 files changed, 34 insertions(+), 20 deletions(-) diff -puN coregrind/core.h~fix-exit-race coregrind/core.h --- valgrind/coregrind/core.h~fix-exit-race 2005-01-12 18:40:54.000000000 -0800 +++ valgrind-jeremy/coregrind/core.h 2005-01-12 18:40:54.000000000 -0800 @@ -603,7 +603,7 @@ typedef struct Empty -> Init -> Runnable <=> WaitSys/Yielding ^ | - \-----------------/ + \---- Zombie -----/ */ typedef enum ThreadStatus { @@ -612,6 +612,7 @@ typedef VgTs_Runnable, /* ready to run */ VgTs_WaitSys, /* waiting for a syscall to complete */ VgTs_Yielding, /* temporarily yielding the CPU */ + VgTs_Zombie, /* transient state just before exiting */ } ThreadStatus; diff -puN coregrind/vg_scheduler.c~fix-exit-race coregrind/vg_scheduler.c --- valgrind/coregrind/vg_scheduler.c~fix-exit-race 2005-01-12 18:40:54.000000000 -0800 +++ valgrind-jeremy/coregrind/vg_scheduler.c 2005-01-12 18:40:54.000000000 -0800 @@ -211,6 +211,7 @@ const Char* name_of_thread_state ( Threa case VgTs_Runnable: return "VgTs_Runnable"; case VgTs_WaitSys: return "VgTs_WaitSys"; case VgTs_Yielding: return "VgTs_Yielding"; + case VgTs_Zombie: return "VgTs_Zombie"; default: return "VgTs_???"; } } @@ -320,7 +321,8 @@ Int VG_(count_living_threads)(void) ThreadId tid; for(tid = 1; tid < VG_N_THREADS; tid++) - if (VG_(threads)[tid].status != VgTs_Empty) + if (VG_(threads)[tid].status != VgTs_Empty && + VG_(threads)[tid].status != VgTs_Zombie) count++; return count; @@ -366,7 +368,9 @@ inline Bool VG_(is_exiting)(ThreadId tid return VG_(threads)[tid].exitreason != VgSrc_None; } -/* Mark the the ThreadState as unused and release the semaphore */ +/* Clear out the ThreadState and release the semaphore. Leaves the + ThreadState in VgTs_Zombie state, so that it doesn't get + reallocated until the caller is really ready. */ void VG_(exit_thread)(ThreadId tid) { vg_assert(VG_(is_valid_tid)(tid)); @@ -403,7 +407,9 @@ void VG_(kill_thread)(ThreadId tid) vg_assert(VG_(is_exiting)(tid)); if (VG_(threads)[tid].status == VgTs_WaitSys) { - //VG_(printf)("kill_thread zaps tid %d lwp %d\n", tid, VG_(threads)[tid].os_state.lwpid); + if (VG_(clo_trace_signals)) + VG_(message)(Vg_DebugMsg, "kill_thread zaps tid %d lwp %d", + tid, VG_(threads)[tid].os_state.lwpid); VG_(tkill)(VG_(threads)[tid].os_state.lwpid, VKI_SIGVGKILL); } } @@ -545,7 +551,11 @@ void mostly_clear_thread_record ( Thread vg_assert(tid >= 0 && tid < VG_N_THREADS); VGA_(clear_thread)(&VG_(threads)[tid].arch); VG_(threads)[tid].tid = tid; - VG_(threads)[tid].status = VgTs_Empty; + + /* Leave the thread in Zombie, so that it doesn't get reallocated + until the caller is finally done with the thread stack. */ + VG_(threads)[tid].status = VgTs_Zombie; + VG_(sigemptyset)(&VG_(threads)[tid].sig_mask); VGA_(os_state_init)(&VG_(threads)[tid]); @@ -607,6 +617,7 @@ void VG_(scheduler_init) ( void ) VGA_(os_state_init)(&VG_(threads)[i]); mostly_clear_thread_record(i); + VG_(threads)[i].status = VgTs_Empty; VG_(threads)[i].stack_size = 0; VG_(threads)[i].stack_base = (Addr)NULL; VG_(threads)[i].stack_guard_size = 0; diff -puN coregrind/x86-linux/syscalls.c~fix-exit-race coregrind/x86-linux/syscalls.c --- valgrind/coregrind/x86-linux/syscalls.c~fix-exit-race 2005-01-12 18:40:54.000000000 -0800 +++ valgrind-jeremy/coregrind/x86-linux/syscalls.c 2005-01-12 18:40:54.000000000 -0800 @@ -179,19 +179,25 @@ static Int start_thread(void *arg) /* OK, thread is dead; this releases the run lock */ VG_(exit_thread)(tid); - /* XXX There's a nasty little race here. Once we release the lock, - in principle this thread structure could be reused for a new - thread, recycling the stack we're actually using. The only way - to fix this would be to release the lock, poke the reaper and - exit without using any stack... - Fortunately this is quite a small race. - */ + vg_assert(tst->status == VgTs_Zombie); /* Poke the reaper */ VG_(tkill)(VG_(threads)[VG_(master_tid)].os_state.lwpid, VKI_SIGVGCHLD); - /* bye-bye */ - return tst->os_state.exitcode; + /* We have to use this sequence to terminate the thread to prevent + a subtle race. If VG_(exit_thread)() had left the ThreadState + as Empty, then it could have been reallocated, reusing the stack + while we're doing these last cleanups. Instead, + VG_(exit_thread) leaves it as Zombie to prevent reallocation. + We need to make sure we don't touch the stack between marking it + Empty and exiting. Hence the assembler. */ + asm volatile ( + "movl %1, %0\n" /* set tst->status = VgTs_Empty */ + "int $0x80\n" /* exit(tst->os_state.exitcode) */ + : "=m" (tst->status) + : "n" (VgTs_Empty), "a" (__NR_exit), "b" (tst->os_state.exitcode)); + + VG_(core_panic)("Thread exit failed?\n"); } /* diff -puN coregrind/vg_main.c~fix-exit-race coregrind/vg_main.c --- valgrind/coregrind/vg_main.c~fix-exit-race 2005-01-12 18:40:54.000000000 -0800 +++ valgrind-jeremy/coregrind/vg_main.c 2005-01-12 18:40:54.000000000 -0800 @@ -2839,6 +2839,8 @@ void VG_(shutdown_actions)(ThreadId tid) /* should be no threads left */ vg_assert(VG_(count_living_threads)() == 0); + VG_(threads)[tid].status = VgTs_Empty; + //-------------------------------------------------------------- // Finalisation: cleanup, messages, etc. Order no so important, only // affects what order the messages come. diff -puN coregrind/linux/core_os.c~fix-exit-race coregrind/linux/core_os.c --- valgrind/coregrind/linux/core_os.c~fix-exit-race 2005-01-12 18:40:54.000000000 -0800 +++ valgrind-jeremy/coregrind/linux/core_os.c 2005-01-12 18:40:54.000000000 -0800 @@ -55,12 +55,6 @@ void VGA_(thread_wrapper)(ThreadId tid) if (tid == VG_(master_tid)) { VG_(shutdown_actions)(tid); VGA_(terminate)(tid, ret); - } else { - /* OK, disappear, but tell the reaper */ - vg_assert(tst->status == VgTs_Runnable); - VG_(exit_thread)(tid); - VG_(tkill)(VG_(threads)[VG_(master_tid)].os_state.lwpid, VKI_SIGVGCHLD); - VG_(exit_single)(tst->os_state.exitcode); } } _