From: Arnaud Giersch Date: Wed, 19 Jul 2017 20:31:33 +0000 (+0200) Subject: Give back control to maestro as late as possible. X-Git-Tag: v3_17~347 X-Git-Url: http://info.iut-bm.univ-fcomte.fr/pub/gitweb/simgrid.git/commitdiff_plain/c7829c256a041aae9a60454263de30188b3f2fcc Give back control to maestro as late as possible. The test allred from smpi-mpich3-coll-thread (with thread contexts factory and mmap privatization) occasionally segfaults on thread exit: teshsuite/smpi/mpich3-test/coll$ ../../../../smpi_script/bin/smpirun \ -platform ../../../../examples/platforms/small_platform_with_routers.xml \ -hostfile ../../hostfile_coll --cfg=smpi/running-power:1e9 \ --cfg=smpi/async-small-thresh:65536 --cfg=contexts/factory:thread \ --cfg=smpi/privatization:1 -np 4 ./allred [...] No Errors Segmentation fault. Backtrace: #0 do_lookup_x (...) at dl-lookup.c:98 #1 0x00007ffff7de596d in _dl_lookup_symbol_x (...) at dl-lookup.c:737 #2 0x00007ffff6a4da75 in do_dlsym (...) at dl-libc.c:97 #3 0x00007ffff7deaaa4 in _dl_catch_error (...) at dl-error.c:187 #4 0x00007ffff6a4dabf in dlerror_run (...) at dl-libc.c:46 #5 0x00007ffff6a4db78 in __GI___libc_dlsym (...) at dl-libc.c:210 #6 0x00007ffff730dfbe in pthread_cancel_init () at ../nptl/sysdeps/pthread/unwind-forcedunwind.c:55 #7 0x00007ffff730e16c in _Unwind_ForcedUnwind (...) 1at ../nptl/sysdeps/pthread/unwind-forcedunwind.c:129 #8 0x00007ffff730c5d0 in __GI___pthread_unwind (...>) at unwind.c:129 #9 0x00007ffff7307365 in __do_cancel () at pthreadP.h:280 #10 __pthread_exit (...) at pthread_exit.c:29 #11 0x00007ffff7c12df9 in xbt_os_thread_exit (...) at [...]/src/xbt/xbt_os_thread.c:251 #12 0x00007ffff7c938ab in stop (...) at [...]/src/kernel/context/ContextThread.cpp:205 #13 simgrid::kernel::context::ThreadContext::wrapper(void*) (...) at [...]/src/kernel/context/ContextThread.cpp:159 #14 0x00007ffff7c16131 in wrapper_start_routine (...) at [...]/src/xbt/xbt_os_thread.c:154 #15 0x00007ffff7306064 in start_thread (...) at pthread_create.c:309 #16 0x00007ffff6a1862d in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:111 Even if I don't fully understand what's going on, I suspect that a segment switch is occurring while the thread is running some cleanup handlers. Thus the proposed fix tries to delay the call to xbt_os_sem_release as late as possible. --- diff --git a/include/xbt/xbt_os_thread.h b/include/xbt/xbt_os_thread.h index fa4c014184..e73022d140 100644 --- a/include/xbt/xbt_os_thread.h +++ b/include/xbt/xbt_os_thread.h @@ -31,6 +31,8 @@ typedef struct xbt_os_thread_ *xbt_os_thread_t; XBT_PUBLIC(xbt_os_thread_t) xbt_os_thread_create(const char *name, pvoid_f_pvoid_t start_routine, void *param, void *data); XBT_PUBLIC(void) xbt_os_thread_exit(int *retcode); XBT_PUBLIC(void) xbt_os_thread_detach(xbt_os_thread_t thread); +#define xbt_os_thread_cleanup_push(routine, arg) pthread_cleanup_push(routine, arg) +#define xbt_os_thread_cleanup_pop(execute) pthread_cleanup_pop(execute) XBT_PUBLIC(xbt_os_thread_t) xbt_os_thread_self(void); XBT_PUBLIC(const char *) xbt_os_thread_self_name(void); diff --git a/src/kernel/context/ContextThread.cpp b/src/kernel/context/ContextThread.cpp index 85ac35a62a..012fbcc7ce 100644 --- a/src/kernel/context/ContextThread.cpp +++ b/src/kernel/context/ContextThread.cpp @@ -200,9 +200,9 @@ void ThreadContext::stop() xbt_os_sem_release(smx_ctx_thread_sem); // Signal to the maestro that it has finished: - xbt_os_sem_release(this->end_); - + xbt_os_thread_cleanup_push((void (*)(void*))xbt_os_sem_release, this->end_); xbt_os_thread_exit(nullptr); + xbt_os_thread_cleanup_pop(0); } void ThreadContext::suspend()