Logo AND Algorithmique Numérique Distribuée

Public GIT Repository
Fix thread safety issue in process cleanup
authorGabriel Corona <gabriel.corona@loria.fr>
Fri, 19 Jun 2015 09:34:54 +0000 (11:34 +0200)
committerGabriel Corona <gabriel.corona@loria.fr>
Thu, 25 Jun 2015 12:36:57 +0000 (14:36 +0200)
While trying to find the root cause of the occasional failure of the
kademlia test, I found a thread-safety issue here:

When running in parallel mode all worker threads can call the
`SIMIX_process_cleanup()` concurrently which does things such as:

~~~c++
XBT_DEBUG("%p should not be run anymore",process);
xbt_swag_remove(process, simix_global->process_list);
xbt_swag_remove(process, SIMIX_host_priv(process->smx_host)->process_list);
xbt_swag_insert(process, simix_global->process_to_destroy);
~~~

The SWAG are modified concurrently by the worker threads without
synchronization. Sometimes, they go into inconsistent states (for
example, `swag->count` is no more consistent with the number of
elements really in the swag).

This fix might not be the best way to do it.

src/simix/smx_global.c
src/simix/smx_private.h
src/simix/smx_process.c

index 9aea6ca..c7abb8a 100644 (file)
@@ -178,6 +178,7 @@ void SIMIX_global_init(int *argc, char **argv)
         SIMIX_synchro_mallocator_new_f, SIMIX_synchro_mallocator_free_f,
         SIMIX_synchro_mallocator_reset_f);
     simix_global->autorestart = SIMIX_host_restart_processes;
         SIMIX_synchro_mallocator_new_f, SIMIX_synchro_mallocator_free_f,
         SIMIX_synchro_mallocator_reset_f);
     simix_global->autorestart = SIMIX_host_restart_processes;
+    simix_global->mutex = xbt_os_mutex_init();
 
     surf_init(argc, argv);      /* Initialize SURF structures */
     SIMIX_context_mod_init();
 
     surf_init(argc, argv);      /* Initialize SURF structures */
     SIMIX_context_mod_init();
@@ -259,6 +260,9 @@ void SIMIX_clean(void)
   simix_global->process_to_destroy = NULL;
   xbt_dict_free(&(simix_global->registered_functions));
 
   simix_global->process_to_destroy = NULL;
   xbt_dict_free(&(simix_global->registered_functions));
 
+  xbt_os_mutex_destroy(simix_global->mutex);
+  simix_global->mutex = NULL;
+
   /* Let's free maestro now */
   SIMIX_context_free(simix_global->maestro_process->context);
   xbt_free(simix_global->maestro_process->running_ctx);
   /* Let's free maestro now */
   SIMIX_context_free(simix_global->maestro_process->context);
   xbt_free(simix_global->maestro_process->running_ctx);
index 9576a92..afc5e64 100644 (file)
@@ -56,6 +56,8 @@ typedef struct s_smx_global {
   xbt_os_timer_t timer_seq; /* used to bench the sequential and parallel parts of the simulation, if requested to */
   xbt_os_timer_t timer_par;
 #endif
   xbt_os_timer_t timer_seq; /* used to bench the sequential and parallel parts of the simulation, if requested to */
   xbt_os_timer_t timer_par;
 #endif
+
+  xbt_os_mutex_t mutex;
 } s_smx_global_t, *smx_global_t;
 
 XBT_PUBLIC_DATA(smx_global_t) simix_global;
 } s_smx_global_t, *smx_global_t;
 
 XBT_PUBLIC_DATA(smx_global_t) simix_global;
index 21a4ef8..df8d7ec 100644 (file)
@@ -54,6 +54,8 @@ void SIMIX_process_cleanup(smx_process_t process)
   if (process->kill_timer != NULL)
          SIMIX_timer_remove(process->kill_timer);
 
   if (process->kill_timer != NULL)
          SIMIX_timer_remove(process->kill_timer);
 
+  xbt_os_mutex_acquire(simix_global->mutex);
+
   /* cancel non-blocking communications */
   smx_synchro_t synchro;
   while ((synchro = xbt_fifo_pop(process->comms))) {
   /* cancel non-blocking communications */
   smx_synchro_t synchro;
   while ((synchro = xbt_fifo_pop(process->comms))) {
@@ -97,6 +99,8 @@ void SIMIX_process_cleanup(smx_process_t process)
   xbt_swag_remove(process, SIMIX_host_priv(process->smx_host)->process_list);
   xbt_swag_insert(process, simix_global->process_to_destroy);
   process->context->iwannadie = 0;
   xbt_swag_remove(process, SIMIX_host_priv(process->smx_host)->process_list);
   xbt_swag_insert(process, simix_global->process_to_destroy);
   process->context->iwannadie = 0;
+
+  xbt_os_mutex_release(simix_global->mutex);
 }
 
 /**
 }
 
 /**