From: Martin Quinson Date: Thu, 28 Feb 2019 01:30:53 +0000 (+0100) Subject: Don't give actors a chance to survive their exec if their host is turned off X-Git-Tag: v3_22~216 X-Git-Url: http://info.iut-bm.univ-fcomte.fr/pub/gitweb/simgrid.git/commitdiff_plain/fd654461ba21a5d93bb5182e128e3f9bff385d45 Don't give actors a chance to survive their exec if their host is turned off Before, they received a HostFailureException that they could catch to survive. Now, they are simply killed (once their on_exit callbacks are executed). This is more consistent with what happens with sleep actions, and thus fixes the GitHub issue #325. --- diff --git a/ChangeLog b/ChangeLog index 9f4001bec8..cce8b0810f 100644 --- a/ChangeLog +++ b/ChangeLog @@ -44,6 +44,7 @@ Fixed bugs: - #314: SMPI args internal cleanup - #316: Fix a bug related to the CPU utilization of multi-core VM - #318: Invalid trace file when using option --cfg=tracing/smpi/display-sizes:yes + - #325: Turning off a host has different behavior on sleeping actors and computing actors ---------------------------------------------------------------------------- diff --git a/examples/deprecated/msg/platform-failures/platform-failures.tesh b/examples/deprecated/msg/platform-failures/platform-failures.tesh index 9e36930074..6aa6f0b402 100644 --- a/examples/deprecated/msg/platform-failures/platform-failures.tesh +++ b/examples/deprecated/msg/platform-failures/platform-failures.tesh @@ -94,7 +94,6 @@ $ $SG_TEST_EXENV ${bindir:=.}/platform-failures$EXEEXT --log=xbt_cfg.thres:criti > [ 40.278351] (1:master@Tremblay) Send to worker-3 completed > [ 40.278351] (1:master@Tremblay) Send a message to worker-4 > [ 40.278351] (4:worker@Ginette) Start execution... -> [ 41.000000] (4:worker@Ginette) Gloups. The cpu on which I'm running just turned off!. See you! > [ 41.309278] (1:master@Tremblay) Send to worker-4 completed > [ 41.309278] (1:master@Tremblay) All tasks have been dispatched. Let's tell everybody the computation is over. 
> [ 41.309278] (2:worker@Tremblay) I'm done. See you! @@ -202,7 +201,6 @@ $ $SG_TEST_EXENV ${bindir:=.}/platform-failures$EXEEXT --log=xbt_cfg.thres:criti > [ 40.692268] (4:worker@Ginette) Start execution... > [ 40.692268] (1:master@Tremblay) Send to worker-3 completed > [ 40.692268] (1:master@Tremblay) Send a message to worker-4 -> [ 41.000000] (4:worker@Ginette) Gloups. The cpu on which I'm running just turned off!. See you! > [ 41.774742] (5:worker@Bourassa) Start execution... > [ 41.774742] (1:master@Tremblay) Send to worker-4 completed > [ 41.774742] (1:master@Tremblay) All tasks have been dispatched. Let's tell everybody the computation is over. diff --git a/examples/s4u/platform-failures/s4u-platform-failures.cpp b/examples/s4u/platform-failures/s4u-platform-failures.cpp index e4cca6a80a..cd949f8edb 100644 --- a/examples/s4u/platform-failures/s4u-platform-failures.cpp +++ b/examples/s4u/platform-failures/s4u-platform-failures.cpp @@ -93,16 +93,15 @@ static int worker(int argc, char* argv[]) payload = static_cast<double*>(mailbox->get()); xbt_assert(payload != nullptr, "mailbox->get() failed"); comp_size = *payload; + delete payload; if (comp_size < 0) { /* - Exit when -1.0 is received */ XBT_INFO("I'm done. See you!"); - delete payload; break; } /* - Otherwise, process the task */ XBT_INFO("Start execution..."); simgrid::s4u::this_actor::execute(comp_size); XBT_INFO("Execution complete."); - delete payload; } catch (simgrid::HostFailureException& e) { XBT_INFO("Gloups. The cpu on which I'm running just turned off!. 
See you!"); delete payload; diff --git a/examples/s4u/platform-failures/s4u-platform-failures.tesh b/examples/s4u/platform-failures/s4u-platform-failures.tesh index 007a7c1cd3..ed8a8cfa98 100644 --- a/examples/s4u/platform-failures/s4u-platform-failures.tesh +++ b/examples/s4u/platform-failures/s4u-platform-failures.tesh @@ -94,7 +94,6 @@ $ $SG_TEST_EXENV ${bindir:=.}/s4u-platform-failures$EXEEXT --log=xbt_cfg.thres:c > [ 40.278351] (1:master@Tremblay) Send to worker-3 completed > [ 40.278351] (1:master@Tremblay) Send a message to worker-4 > [ 40.278351] (4:worker@Ginette) Start execution... -> [ 41.000000] (4:worker@Ginette) Gloups. The cpu on which I'm running just turned off!. See you! > [ 41.309278] (1:master@Tremblay) Send to worker-4 completed > [ 41.309278] (1:master@Tremblay) All tasks have been dispatched. Let's tell everybody the computation is over. > [ 41.309278] (2:worker@Tremblay) I'm done. See you! @@ -202,7 +201,6 @@ $ $SG_TEST_EXENV ${bindir:=.}/s4u-platform-failures$EXEEXT --log=xbt_cfg.thres:c > [ 40.692268] (4:worker@Ginette) Start execution... > [ 40.692268] (1:master@Tremblay) Send to worker-3 completed > [ 40.692268] (1:master@Tremblay) Send a message to worker-4 -> [ 41.000000] (4:worker@Ginette) Gloups. The cpu on which I'm running just turned off!. See you! > [ 41.774742] (5:worker@Bourassa) Start execution... > [ 41.774742] (1:master@Tremblay) Send to worker-4 completed > [ 41.774742] (1:master@Tremblay) All tasks have been dispatched. Let's tell everybody the computation is over. 
diff --git a/src/kernel/activity/ExecImpl.cpp b/src/kernel/activity/ExecImpl.cpp index a55c1f3483..6645e768ba 100644 --- a/src/kernel/activity/ExecImpl.cpp +++ b/src/kernel/activity/ExecImpl.cpp @@ -175,8 +175,10 @@ void ExecImpl::finish() case SIMIX_FAILED: XBT_DEBUG("ExecImpl::finish(): host '%s' failed", simcall->issuer->get_host()->get_cname()); simcall->issuer->context_->iwannadie = true; - simcall->issuer->exception_ = - std::make_exception_ptr(simgrid::HostFailureException(XBT_THROW_POINT, "Host failed")); + if (simcall->issuer->get_host()->is_on()) + simcall->issuer->exception_ = + std::make_exception_ptr(simgrid::HostFailureException(XBT_THROW_POINT, "Host failed")); + /* else, the actor will be killed with no possibility to survive */ break; case SIMIX_CANCELED: diff --git a/teshsuite/msg/host_on_off_processes/host_on_off_processes.cpp b/teshsuite/msg/host_on_off_processes/host_on_off_processes.cpp index 85eab76b4f..87cb748648 100644 --- a/teshsuite/msg/host_on_off_processes/host_on_off_processes.cpp +++ b/teshsuite/msg/host_on_off_processes/host_on_off_processes.cpp @@ -27,13 +27,9 @@ static int process_daemon(int /*argc*/, char** /*argv*/) msg_task_t task = MSG_task_create("daemon", MSG_host_get_speed(MSG_host_self()), 0, NULL); MSG_process_set_data(self, task); XBT_INFO(" Execute daemon"); - msg_error_t res = MSG_task_execute(task); + MSG_task_execute(task); MSG_task_destroy(task); tasks_done++; - if (res == MSG_HOST_FAILURE) { - XBT_INFO("Host has died as expected, do nothing else"); - return 0; - } } XBT_INFO(" daemon done. 
See you!"); return 0; diff --git a/teshsuite/msg/host_on_off_processes/host_on_off_processes.tesh b/teshsuite/msg/host_on_off_processes/host_on_off_processes.tesh index 261b76ad6c..1156e2d9dc 100644 --- a/teshsuite/msg/host_on_off_processes/host_on_off_processes.tesh +++ b/teshsuite/msg/host_on_off_processes/host_on_off_processes.tesh @@ -61,7 +61,6 @@ $ ${bindir}/host_on_off_processes ${platfdir}/small_platform.xml 6 --log=no_loc > [Jupiter:process_daemonJUPI:(3) 9.000011] [msg_test/INFO] Execute daemon > [Tremblay:test_launcher:(1) 10.000000] [msg_test/INFO] Turn Jupiter off > [Tremblay:test_launcher:(1) 10.000000] [msg_test/INFO] Shutdown vm0 -> [Jupiter:process_daemonJUPI:(3) 10.000000] [msg_test/INFO] Host has died as expected, do nothing else > [Tremblay:test_launcher:(1) 10.000000] [msg_test/INFO] Destroy vm0 > [Tremblay:test_launcher:(1) 10.000000] [msg_test/INFO] Test 6 is also weird: when the node Jupiter is turned off once again, the VM and its daemon are not killed. However, the issue regarding the shutdown of hosted VMs can be seen a feature not a bug ;) > [Tremblay:test_launcher:(1) 10.000000] [msg_test/INFO] Test done. See you!