The test allred from smpi-mpich3-coll-thread (with thread contexts factory and
mmap privatization) occasionally segfaults on thread exit:
teshsuite/smpi/mpich3-test/coll$ ../../../../smpi_script/bin/smpirun \
-platform ../../../../examples/platforms/small_platform_with_routers.xml \
-hostfile ../../hostfile_coll --cfg=smpi/running-power:1e9 \
--cfg=smpi/async-small-thresh:65536 --cfg=contexts/factory:thread \
--cfg=smpi/privatization:1 -np 4 ./allred
[...]
No Errors
Segmentation fault.
Backtrace:
#0 do_lookup_x (...) at dl-lookup.c:98
#1 0x00007ffff7de596d in _dl_lookup_symbol_x (...) at dl-lookup.c:737
#2 0x00007ffff6a4da75 in do_dlsym (...) at dl-libc.c:97
#3 0x00007ffff7deaaa4 in _dl_catch_error (...) at dl-error.c:187
#4 0x00007ffff6a4dabf in dlerror_run (...) at dl-libc.c:46
#5 0x00007ffff6a4db78 in __GI___libc_dlsym (...) at dl-libc.c:210
#6 0x00007ffff730dfbe in pthread_cancel_init ()
at ../nptl/sysdeps/pthread/unwind-forcedunwind.c:55
#7 0x00007ffff730e16c in _Unwind_ForcedUnwind (...)
1at ../nptl/sysdeps/pthread/unwind-forcedunwind.c:129
#8 0x00007ffff730c5d0 in __GI___pthread_unwind (...>) at unwind.c:129
#9 0x00007ffff7307365 in __do_cancel () at pthreadP.h:280
#10 __pthread_exit (...) at pthread_exit.c:29
#11 0x00007ffff7c12df9 in xbt_os_thread_exit (...)
at [...]/src/xbt/xbt_os_thread.c:251
#12 0x00007ffff7c938ab in stop (...)
at [...]/src/kernel/context/ContextThread.cpp:205
#13 simgrid::kernel::context::ThreadContext::wrapper(void*) (...)
at [...]/src/kernel/context/ContextThread.cpp:159
#14 0x00007ffff7c16131 in wrapper_start_routine (...)
at [...]/src/xbt/xbt_os_thread.c:154
#15 0x00007ffff7306064 in start_thread (...) at pthread_create.c:309
#16 0x00007ffff6a1862d in clone ()
at ../sysdeps/unix/sysv/linux/x86_64/clone.S:111
Even if I don't fully understand what's going on, I suspect that a segment
switch is occurring while the thread is running some cleanup handlers. Thus the
proposed fix tries to delay the call to xbt_os_sem_release as late as possible.
XBT_PUBLIC(xbt_os_thread_t) xbt_os_thread_create(const char *name, pvoid_f_pvoid_t start_routine, void *param, void *data);
XBT_PUBLIC(void) xbt_os_thread_exit(int *retcode);
XBT_PUBLIC(void) xbt_os_thread_detach(xbt_os_thread_t thread);
+#define xbt_os_thread_cleanup_push(routine, arg) pthread_cleanup_push(routine, arg)
+#define xbt_os_thread_cleanup_pop(execute) pthread_cleanup_pop(execute)
XBT_PUBLIC(xbt_os_thread_t) xbt_os_thread_self(void);
XBT_PUBLIC(const char *) xbt_os_thread_self_name(void);
xbt_os_sem_release(smx_ctx_thread_sem);
// Signal to the maestro that it has finished:
- xbt_os_sem_release(this->end_);
-
+ xbt_os_thread_cleanup_push((void (*)(void*))xbt_os_sem_release, this->end_);
xbt_os_thread_exit(nullptr);
+ xbt_os_thread_cleanup_pop(0);
}
void ThreadContext::suspend()