From: Gabriel Corona
Date: Fri, 3 Jul 2015 11:36:53 +0000 (+0200)
Subject: Move the host-on-off fix above
X-Git-Tag: v3_12~539
X-Git-Url: http://info.iut-bm.univ-fcomte.fr/pub/gitweb/simgrid.git/commitdiff_plain/24ee99596e6ca507bf1e517d49c7e8110a8a410b

Move the host-on-off fix above

If our host is OFF, we should always get a HOST_FAILURE. This seems to fix issues where a host was turned off but its processes were not woken up with HOST_FAILURE.
---

diff --git a/examples/msg/masterslave/masterslave_cpu_ti_crosstraffic.tesh b/examples/msg/masterslave/masterslave_cpu_ti_crosstraffic.tesh
index c93afef039..94063dbfed 100644
--- a/examples/msg/masterslave/masterslave_cpu_ti_crosstraffic.tesh
+++ b/examples/msg/masterslave/masterslave_cpu_ti_crosstraffic.tesh
@@ -290,6 +290,7 @@ $ ${bindir:=.}/masterslave_failure$EXEEXT --log=xbt_cfg.thres:critical --log=no_
 > [ 0.000000] (1:master@Tremblay) Got 20 task to process :
 > [ 1.000000] (0:@) Restart processes on host: Fafard
 > [ 1.000000] (1:master@Tremblay) Mmh. Something went wrong with 'Jupiter'. Nevermind. Let's keep going!
+> [ 1.000000] (3:slave@Jupiter) Gloups. The cpu on which I'm running just turned off!. See you!
 > [ 2.000000] (0:@) Restart processes on host: Jupiter
 > [ 11.000000] (1:master@Tremblay) Mmh. Got timeouted while speaking to 'Fafard'. Nevermind. Let's keep going!
 > [ 12.082474] (1:master@Tremblay) Send completed
diff --git a/examples/msg/masterslave/masterslave_failure.tesh b/examples/msg/masterslave/masterslave_failure.tesh
index 7b1b5f7e5b..f6596fff95 100644
--- a/examples/msg/masterslave/masterslave_failure.tesh
+++ b/examples/msg/masterslave/masterslave_failure.tesh
@@ -14,6 +14,7 @@ $ masterslave/masterslave_failure$EXEEXT --log=xbt_cfg.thres:critical --log=no_l
 > [ 0.000000] (1:master@Tremblay) Got 20 task to process :
 > [ 1.000000] (0:@) Restart processes on host: Fafard
 > [ 1.000000] (1:master@Tremblay) Mmh. Something went wrong with 'Jupiter'. Nevermind. Let's keep going!
+> [ 1.000000] (3:slave@Jupiter) Gloups. The cpu on which I'm running just turned off!. See you!
 > [ 2.000000] (0:@) Restart processes on host: Jupiter
 > [ 11.000000] (1:master@Tremblay) Mmh. Got timeouted while speaking to 'Fafard'. Nevermind. Let's keep going!
 > [ 12.030928] (1:master@Tremblay) Send completed
diff --git a/examples/msg/masterslave/masterslave_failure_crosstraffic.tesh b/examples/msg/masterslave/masterslave_failure_crosstraffic.tesh
index 43feb15002..ad02729c3f 100644
--- a/examples/msg/masterslave/masterslave_failure_crosstraffic.tesh
+++ b/examples/msg/masterslave/masterslave_failure_crosstraffic.tesh
@@ -14,6 +14,7 @@ $ masterslave/masterslave_failure$EXEEXT --log=xbt_cfg.thres:critical --log=no_l
 > [ 0.000000] (1:master@Tremblay) Got 20 task to process :
 > [ 1.000000] (0:@) Restart processes on host: Fafard
 > [ 1.000000] (1:master@Tremblay) Mmh. Something went wrong with 'Jupiter'. Nevermind. Let's keep going!
+> [ 1.000000] (3:slave@Jupiter) Gloups. The cpu on which I'm running just turned off!. See you!
 > [ 2.000000] (0:@) Restart processes on host: Jupiter
 > [ 11.000000] (1:master@Tremblay) Mmh. Got timeouted while speaking to 'Fafard'. Nevermind. Let's keep going!
 > [ 12.082474] (1:master@Tremblay) Send completed
diff --git a/src/simix/smx_network.c b/src/simix/smx_network.c
index 6d1b84928b..42e07bebd9 100644
--- a/src/simix/smx_network.c
+++ b/src/simix/smx_network.c
@@ -777,7 +777,6 @@ void SIMIX_comm_finish(smx_synchro_t synchro)
   unsigned int destroy_count = 0;
   smx_simcall_t simcall;
 
-
   while ((simcall = xbt_fifo_shift(synchro->simcalls))) {
 
     /* If a waitany simcall is waiting for this synchro to finish, then remove
@@ -801,6 +800,12 @@ void SIMIX_comm_finish(smx_synchro_t synchro)
 
     /* Check out for errors */
 
+    if (surf_resource_get_state(surf_workstation_resource_priv(
+          simcall->issuer->smx_host)) != SURF_RESOURCE_ON) {
+      simcall->issuer->context->iwannadie = 1;
+      SMX_EXCEPTION(simcall->issuer, host_error, 0, "Host failed");
+    } else
+
     switch (synchro->state) {
 
       case SIMIX_DONE:
@@ -836,14 +841,6 @@ void SIMIX_comm_finish(smx_synchro_t synchro)
 
       case SIMIX_LINK_FAILURE:
 
-        // There should be a cleaner way to do this.
-        // We should handle this in SIMIX_post_comm instead.
-        if (surf_resource_get_state(surf_workstation_resource_priv(
-              simcall->issuer->smx_host)) != SURF_RESOURCE_ON) {
-          SMX_EXCEPTION(simcall->issuer, host_error, 0, "Host failed");
-          break;
-        }
-
         XBT_DEBUG("Link failure in synchro %p between '%s' and '%s': posting an exception to the issuer: %s (%p) detached:%d",
                   synchro,
                   synchro->comm.src_proc ? sg_host_name(synchro->comm.src_proc->smx_host) : NULL,