From: Martin Quinson Date: Fri, 15 Mar 2019 10:48:11 +0000 (+0100) Subject: actors on failing hosts should die silently and with no delay X-Git-Tag: v3_22~90 X-Git-Url: http://info.iut-bm.univ-fcomte.fr/pub/gitweb/simgrid.git/commitdiff_plain/5fa46b25d81881c427ca3233a43ce12aad4a0242 actors on failing hosts should die silently and with no delay --- diff --git a/examples/deprecated/msg/platform-failures/platform-failures.tesh b/examples/deprecated/msg/platform-failures/platform-failures.tesh index 9c64e17672..b903537584 100644 --- a/examples/deprecated/msg/platform-failures/platform-failures.tesh +++ b/examples/deprecated/msg/platform-failures/platform-failures.tesh @@ -19,9 +19,7 @@ $ $SG_TEST_EXENV ${bindir:=.}/platform-failures$EXEEXT --log=xbt_cfg.thres:criti > [ 1.000000] (7:worker@Fafard) Waiting a message on worker-2 > [ 1.000000] (1:master@Tremblay) Mmh. Something went wrong with 'worker-1'. Nevermind. Let's keep going! > [ 1.000000] (1:master@Tremblay) Send a message to worker-2 -> [ 1.000000] (3:worker@Jupiter) Gloups. The cpu on which I'm running just turned off!. See you! > [ 2.000000] (1:master@Tremblay) Mmh. Something went wrong with 'worker-2'. Nevermind. Let's keep going! -> [ 2.000000] (7:worker@Fafard) Gloups. The cpu on which I'm running just turned off!. See you! > [ 2.000000] (0:maestro@) Restart processes on host Jupiter > [ 2.000000] (1:master@Tremblay) Send a message to worker-3 > [ 2.000000] (8:worker@Jupiter) Waiting a message on worker-1 @@ -126,12 +124,10 @@ $ $SG_TEST_EXENV ${bindir:=.}/platform-failures$EXEEXT --log=xbt_cfg.thres:criti > [ 1.000000] (7:worker@Fafard) Waiting a message on worker-2 > [ 1.000000] (1:master@Tremblay) Mmh. Something went wrong with 'worker-1'. Nevermind. Let's keep going! > [ 1.000000] (1:master@Tremblay) Send a message to worker-2 -> [ 1.000000] (3:worker@Jupiter) Gloups. The cpu on which I'm running just turned off!. See you! > [ 2.000000] (0:maestro@) Restart processes on host Jupiter > [ 2.000000] (8:worker@Jupiter) Waiting a message on worker-1 > [ 2.000000] (1:master@Tremblay) Mmh. Something went wrong with 'worker-2'. Nevermind. Let's keep going! > [ 2.000000] (1:master@Tremblay) Send a message to worker-3 -> [ 2.000000] (7:worker@Fafard) Gloups. The cpu on which I'm running just turned off!. See you! > [ 2.010825] (2:worker@Tremblay) Execution complete. > [ 2.010825] (2:worker@Tremblay) Waiting a message on worker-0 > [ 3.082474] (5:worker@Ginette) Start execution... diff --git a/examples/s4u/platform-failures/s4u-platform-failures.cpp b/examples/s4u/platform-failures/s4u-platform-failures.cpp index f0150d8d94..75669f5597 100644 --- a/examples/s4u/platform-failures/s4u-platform-failures.cpp +++ b/examples/s4u/platform-failures/s4u-platform-failures.cpp @@ -40,9 +40,6 @@ static int master(int argc, char* argv[]) XBT_INFO("Send a message to %s", mailbox->get_cname()); mailbox->put(payload, comm_size, 10.0); XBT_INFO("Send to %s completed", mailbox->get_cname()); - } catch (simgrid::HostFailureException& e) { - XBT_INFO("Gloups. The cpu on which I'm running just turned off!. See you!"); - return -1; } catch (simgrid::TimeoutError& e) { delete payload; XBT_INFO("Mmh. Got timeouted while speaking to '%s'. Nevermind. Let's keep going!", mailbox->get_cname()); diff --git a/examples/s4u/platform-failures/s4u-platform-failures.tesh b/examples/s4u/platform-failures/s4u-platform-failures.tesh index 9d30015e26..b1acb0b80b 100644 --- a/examples/s4u/platform-failures/s4u-platform-failures.tesh +++ b/examples/s4u/platform-failures/s4u-platform-failures.tesh @@ -19,9 +19,7 @@ $ $SG_TEST_EXENV ${bindir:=.}/s4u-platform-failures$EXEEXT --log=xbt_cfg.thres:c > [ 1.000000] (7:worker@Fafard) Waiting a message on worker-2 > [ 1.000000] (1:master@Tremblay) Mmh. The communication with 'worker-1' failed. Nevermind. Let's keep going! > [ 1.000000] (1:master@Tremblay) Send a message to worker-2 -> [ 1.000000] (3:worker@Jupiter) Gloups. The cpu on which I'm running just turned off!. See you! > [ 2.000000] (1:master@Tremblay) Mmh. The communication with 'worker-2' failed. Nevermind. Let's keep going! -> [ 2.000000] (7:worker@Fafard) Gloups. The cpu on which I'm running just turned off!. See you! > [ 2.000000] (0:maestro@) Restart processes on host Jupiter > [ 2.000000] (1:master@Tremblay) Send a message to worker-3 > [ 2.000000] (8:worker@Jupiter) Waiting a message on worker-1 @@ -126,12 +124,10 @@ $ $SG_TEST_EXENV ${bindir:=.}/s4u-platform-failures$EXEEXT --log=xbt_cfg.thres:c > [ 1.000000] (7:worker@Fafard) Waiting a message on worker-2 > [ 1.000000] (1:master@Tremblay) Mmh. The communication with 'worker-1' failed. Nevermind. Let's keep going! > [ 1.000000] (1:master@Tremblay) Send a message to worker-2 -> [ 1.000000] (3:worker@Jupiter) Gloups. The cpu on which I'm running just turned off!. See you! > [ 2.000000] (0:maestro@) Restart processes on host Jupiter > [ 2.000000] (8:worker@Jupiter) Waiting a message on worker-1 > [ 2.000000] (1:master@Tremblay) Mmh. The communication with 'worker-2' failed. Nevermind. Let's keep going! > [ 2.000000] (1:master@Tremblay) Send a message to worker-3 -> [ 2.000000] (7:worker@Fafard) Gloups. The cpu on which I'm running just turned off!. See you! > [ 2.010825] (2:worker@Tremblay) Execution complete. > [ 2.010825] (2:worker@Tremblay) Waiting a message on worker-0 > [ 3.082474] (5:worker@Ginette) Start execution... diff --git a/src/kernel/activity/CommImpl.cpp b/src/kernel/activity/CommImpl.cpp index 5fbd6b33c9..8079463a78 100644 --- a/src/kernel/activity/CommImpl.cpp +++ b/src/kernel/activity/CommImpl.cpp @@ -585,8 +585,6 @@ void CommImpl::finish() if (not simcall->issuer->get_host()->is_on()) { simcall->issuer->context_->iwannadie = true; - simcall->issuer->exception_ = - std::make_exception_ptr(simgrid::HostFailureException(XBT_THROW_POINT, "Host failed")); } else { switch (state_) { diff --git a/src/kernel/actor/ActorImpl.cpp b/src/kernel/actor/ActorImpl.cpp index 4569badaab..8371aaeae5 100644 --- a/src/kernel/actor/ActorImpl.cpp +++ b/src/kernel/actor/ActorImpl.cpp @@ -302,7 +302,6 @@ void ActorImpl::yield() XBT_DEBUG("Control returned to me: '%s'", get_cname()); if (context_->iwannadie) { - XBT_DEBUG("Actor %s@%s is dead", get_cname(), host_->get_cname()); // throw simgrid::kernel::context::ForcefulKillException(); Does not seem to properly kill the actor context_->stop(); diff --git a/teshsuite/msg/host_on_off_processes/host_on_off_processes.tesh b/teshsuite/msg/host_on_off_processes/host_on_off_processes.tesh index f4857f0225..ef60921ccb 100644 --- a/teshsuite/msg/host_on_off_processes/host_on_off_processes.tesh +++ b/teshsuite/msg/host_on_off_processes/host_on_off_processes.tesh @@ -39,8 +39,6 @@ $ ${bindir}/host_on_off_processes ${platfdir}/small_platform.xml 5 --log=no_loc > [Tremblay:test_launcher:(1) 20.000000] [msg_test/INFO] Turn Jupiter off > [Tremblay:test_launcher:(1) 20.000000] [msg_test/INFO] Test 5 seems ok (number of Process: 2, it should be 2) > [Tremblay:test_launcher:(1) 20.000000] [msg_test/INFO] Test done. See you! -> [Jupiter:commRX:(2) 20.000000] [msg_test/INFO] Receive message: HOST_FAILURE -> [Jupiter:commRX:(2) 20.000000] [msg_test/INFO] RX Done > [Tremblay:commTX:(3) 40.000000] [msg_test/INFO] TX done > [40.000000] [msg_test/INFO] Simulation time 40