From: Gabriel Corona Date: Fri, 19 Jun 2015 09:34:54 +0000 (+0200) Subject: Fix thread safety issue in process cleanup X-Git-Tag: v3_12~561 X-Git-Url: http://info.iut-bm.univ-fcomte.fr/pub/gitweb/simgrid.git/commitdiff_plain/be00fecb16c89d44253052889f4815b35ab9abbe?ds=sidebyside Fix thread safety issue in process cleanup While trying to find the root cause of the occasional failure of the kademlia test, I found a thread-safety issue here: When running in parallel mode all worker threads can call the `SIMIX_process_cleanup()` concurrently which does things such as: ~~~c++ XBT_DEBUG("%p should not be run anymore",process); xbt_swag_remove(process, simix_global->process_list); xbt_swag_remove(process, SIMIX_host_priv(process->smx_host)->process_list); xbt_swag_insert(process, simix_global->process_to_destroy); ~~~ The SWAG are modified concurrently by the worker threads without synchronization. Sometimes, they go into inconsistent states (for example, `swag->count` is no more consistent with the number of elements really in the swag). This fix might not be the best way to do it. --- diff --git a/src/simix/smx_global.c b/src/simix/smx_global.c index 9aea6cac5f..c7abb8a0c6 100644 --- a/src/simix/smx_global.c +++ b/src/simix/smx_global.c @@ -178,6 +178,7 @@ void SIMIX_global_init(int *argc, char **argv) SIMIX_synchro_mallocator_new_f, SIMIX_synchro_mallocator_free_f, SIMIX_synchro_mallocator_reset_f); simix_global->autorestart = SIMIX_host_restart_processes; + simix_global->mutex = xbt_os_mutex_init(); surf_init(argc, argv); /* Initialize SURF structures */ SIMIX_context_mod_init(); @@ -259,6 +260,9 @@ void SIMIX_clean(void) simix_global->process_to_destroy = NULL; xbt_dict_free(&(simix_global->registered_functions)); + xbt_os_mutex_destroy(simix_global->mutex); + simix_global->mutex = NULL; + /* Let's free maestro now */ SIMIX_context_free(simix_global->maestro_process->context); xbt_free(simix_global->maestro_process->running_ctx); diff --git a/src/simix/smx_private.h b/src/simix/smx_private.h index 9576a92e2b..afc5e64af3 100644 --- a/src/simix/smx_private.h +++ b/src/simix/smx_private.h @@ -56,6 +56,8 @@ typedef struct s_smx_global { xbt_os_timer_t timer_seq; /* used to bench the sequential and parallel parts of the simulation, if requested to */ xbt_os_timer_t timer_par; #endif + + xbt_os_mutex_t mutex; } s_smx_global_t, *smx_global_t; XBT_PUBLIC_DATA(smx_global_t) simix_global; diff --git a/src/simix/smx_process.c b/src/simix/smx_process.c index 21a4ef8eb9..df8d7ecaf7 100644 --- a/src/simix/smx_process.c +++ b/src/simix/smx_process.c @@ -54,6 +54,8 @@ void SIMIX_process_cleanup(smx_process_t process) if (process->kill_timer != NULL) SIMIX_timer_remove(process->kill_timer); + xbt_os_mutex_acquire(simix_global->mutex); + /* cancel non-blocking communications */ smx_synchro_t synchro; while ((synchro = xbt_fifo_pop(process->comms))) { @@ -97,6 +99,8 @@ void SIMIX_process_cleanup(smx_process_t process) xbt_swag_remove(process, SIMIX_host_priv(process->smx_host)->process_list); xbt_swag_insert(process, simix_global->process_to_destroy); process->context->iwannadie = 0; + + xbt_os_mutex_release(simix_global->mutex); } /**