While trying to find the root cause of the occasional failure of the
kademlia test, I found a thread-safety issue here.
When running in parallel mode, all worker threads can call
`SIMIX_process_cleanup()` concurrently, which does things such as:
~~~c++
XBT_DEBUG("%p should not be run anymore",process);
xbt_swag_remove(process, simix_global->process_list);
xbt_swag_remove(process, SIMIX_host_priv(process->smx_host)->process_list);
xbt_swag_insert(process, simix_global->process_to_destroy);
~~~
The swags are modified concurrently by the worker threads without
synchronization. Sometimes they end up in inconsistent states (for
example, `swag->count` no longer matches the number of elements
actually in the swag).
This fix (holding a global mutex, `simix_global->mutex`, during the
process cleanup) might not be the best way to do it.
SIMIX_synchro_mallocator_new_f, SIMIX_synchro_mallocator_free_f,
SIMIX_synchro_mallocator_reset_f);
simix_global->autorestart = SIMIX_host_restart_processes;
SIMIX_synchro_mallocator_new_f, SIMIX_synchro_mallocator_free_f,
SIMIX_synchro_mallocator_reset_f);
simix_global->autorestart = SIMIX_host_restart_processes;
+ simix_global->mutex = xbt_os_mutex_init();
surf_init(argc, argv); /* Initialize SURF structures */
SIMIX_context_mod_init();
surf_init(argc, argv); /* Initialize SURF structures */
SIMIX_context_mod_init();
simix_global->process_to_destroy = NULL;
xbt_dict_free(&(simix_global->registered_functions));
simix_global->process_to_destroy = NULL;
xbt_dict_free(&(simix_global->registered_functions));
+ xbt_os_mutex_destroy(simix_global->mutex);
+ simix_global->mutex = NULL;
+
/* Let's free maestro now */
SIMIX_context_free(simix_global->maestro_process->context);
xbt_free(simix_global->maestro_process->running_ctx);
/* Let's free maestro now */
SIMIX_context_free(simix_global->maestro_process->context);
xbt_free(simix_global->maestro_process->running_ctx);
xbt_os_timer_t timer_seq; /* used to bench the sequential and parallel parts of the simulation, if requested to */
xbt_os_timer_t timer_par;
#endif
xbt_os_timer_t timer_seq; /* used to bench the sequential and parallel parts of the simulation, if requested to */
xbt_os_timer_t timer_par;
#endif
+
+ xbt_os_mutex_t mutex;
} s_smx_global_t, *smx_global_t;
XBT_PUBLIC_DATA(smx_global_t) simix_global;
} s_smx_global_t, *smx_global_t;
XBT_PUBLIC_DATA(smx_global_t) simix_global;
if (process->kill_timer != NULL)
SIMIX_timer_remove(process->kill_timer);
if (process->kill_timer != NULL)
SIMIX_timer_remove(process->kill_timer);
+ xbt_os_mutex_acquire(simix_global->mutex);
+
/* cancel non-blocking communications */
smx_synchro_t synchro;
while ((synchro = xbt_fifo_pop(process->comms))) {
/* cancel non-blocking communications */
smx_synchro_t synchro;
while ((synchro = xbt_fifo_pop(process->comms))) {
xbt_swag_remove(process, SIMIX_host_priv(process->smx_host)->process_list);
xbt_swag_insert(process, simix_global->process_to_destroy);
process->context->iwannadie = 0;
xbt_swag_remove(process, SIMIX_host_priv(process->smx_host)->process_list);
xbt_swag_insert(process, simix_global->process_to_destroy);
process->context->iwannadie = 0;
+
+ xbt_os_mutex_release(simix_global->mutex);