From 3e79165b3c7d3ebaa59d0bf6dd132987a14a9373 Mon Sep 17 00:00:00 2001 From: Frederic Suter Date: Tue, 26 Feb 2019 11:28:47 +0100 Subject: [PATCH] fix (#324) Throw a HostFailureException when one tries to create (or attach) an actor on a host that is not on. If this happens during the parsing of the deployment file, just let it go. For MSG, we could die if this happens (we don't for now). --- .../platform-failures/platform-failures.tesh | 2 ++ .../s4u-platform-failures.tesh | 2 ++ src/msg/msg_process.cpp | 18 ++++++++++++------ src/s4u/s4u_Actor.cpp | 9 +++++++-- src/simix/ActorImpl.cpp | 4 ++++ src/surf/sg_platf.cpp | 11 ++++++++--- 6 files changed, 35 insertions(+), 11 deletions(-) diff --git a/examples/deprecated/msg/platform-failures/platform-failures.tesh b/examples/deprecated/msg/platform-failures/platform-failures.tesh index f8561ea3f6..0dbaee5be9 100644 --- a/examples/deprecated/msg/platform-failures/platform-failures.tesh +++ b/examples/deprecated/msg/platform-failures/platform-failures.tesh @@ -5,6 +5,7 @@ p Testing a simple master/worker example application handling failures TCP cross ! 
output sort 19 $ $SG_TEST_EXENV ${bindir:=.}/platform-failures$EXEEXT --log=xbt_cfg.thres:critical --log=no_loc ${platfdir}/small_platform_failures.xml ${srcdir}/../app-masterworker/app-masterworker_d.xml --cfg=path:${srcdir} "--log=root.fmt:[%10.6r]%e(%i:%P@%h)%e%m%n" --log=surf_cpu.t:verbose > [ 0.000000] (0:maestro@) Cannot launch actor 'worker' on failed host 'Fafard' +> [ 0.000000] (0:maestro@) Deployment include some initially turned off Hosts, nevermind. > [ 0.000000] (1:master@Tremblay) Got 5 workers and 20 tasks to process > [ 0.000000] (1:master@Tremblay) Send a message to worker-0 > [ 0.000000] (2:worker@Tremblay) Waiting a message on worker-0 diff --git a/examples/s4u/platform-failures/s4u-platform-failures.tesh b/examples/s4u/platform-failures/s4u-platform-failures.tesh index 0b757a6466..cf6ff662a4 100644 --- a/examples/s4u/platform-failures/s4u-platform-failures.tesh +++ b/examples/s4u/platform-failures/s4u-platform-failures.tesh @@ -5,6 +5,7 @@ p Testing a simple master/worker example application handling failures TCP cross ! output sort 19 $ $SG_TEST_EXENV ${bindir:=.}/s4u-platform-failures$EXEEXT --log=xbt_cfg.thres:critical --log=no_loc ${platfdir}/small_platform_failures.xml ${srcdir:=.}/s4u-platform-failures_d.xml --cfg=path:${srcdir} --cfg=network/crosstraffic:0 "--log=root.fmt:[%10.6r]%e(%i:%P@%h)%e%m%n" --log=surf_cpu.t:verbose > [ 0.000000] (0:maestro@) Cannot launch actor 'worker' on failed host 'Fafard' +> [ 0.000000] (0:maestro@) Deployment include some initially turned off Hosts, nevermind. > [ 0.000000] (1:master@Tremblay) Got 5 workers and 20 tasks to process > [ 0.000000] (1:master@Tremblay) Send a message to worker-0 > [ 0.010309] (1:master@Tremblay) Send to worker-0 completed @@ -112,6 +113,7 @@ p Testing a simple master/worker example application handling failures. TCP cros ! 
output sort 19 $ $SG_TEST_EXENV ${bindir:=.}/s4u-platform-failures$EXEEXT --log=xbt_cfg.thres:critical --log=no_loc ${platfdir}/small_platform_failures.xml ${srcdir:=.}/s4u-platform-failures_d.xml --cfg=path:${srcdir} "--log=root.fmt:[%10.6r]%e(%i:%P@%h)%e%m%n" --log=surf_cpu.t:verbose > [ 0.000000] (0:maestro@) Cannot launch actor 'worker' on failed host 'Fafard' +> [ 0.000000] (0:maestro@) Deployment include some initially turned off Hosts, nevermind. > [ 0.000000] (1:master@Tremblay) Got 5 workers and 20 tasks to process > [ 0.000000] (1:master@Tremblay) Send a message to worker-0 > [ 0.000000] (2:worker@Tremblay) Waiting a message on worker-0 diff --git a/src/msg/msg_process.cpp b/src/msg/msg_process.cpp index 83fbcdbb21..0604ec941a 100644 --- a/src/msg/msg_process.cpp +++ b/src/msg/msg_process.cpp @@ -4,6 +4,7 @@ * under the terms of the license (GNU LGPL) which comes with this package. */ #include "msg_private.hpp" +#include "simgrid/Exception.hpp" #include "simgrid/s4u/Host.hpp" #include "src/instr/instr_private.hpp" #include "src/simix/ActorImpl.hpp" @@ -82,20 +83,25 @@ msg_process_t MSG_process_create_with_environment(const char *name, xbt_main_fun xbt_dict_free(&properties); smx_actor_t self = SIMIX_process_self(); - smx_actor_t process = simgrid::simix::simcall([name, function, data, host, &props, self] { - return simgrid::kernel::actor::ActorImpl::create(std::move(name), std::move(function), data, host, &props, self) - .get(); - }); + smx_actor_t actor = nullptr; + try { + actor = simgrid::simix::simcall([name, function, data, host, &props, self] { + return simgrid::kernel::actor::ActorImpl::create(std::move(name), std::move(function), data, host, &props, self) + .get(); + }); + } catch (simgrid::HostFailureException const&) { + XBT_DEBUG("The warning has already been issued. 
Do nothing more than catching the exception."); + } for (int i = 0; i != argc; ++i) xbt_free(argv[i]); xbt_free(argv); - if (process == nullptr) + if (actor == nullptr) return nullptr; MSG_process_yield(); - return process->ciface(); + return actor->ciface(); } /** @brief Returns the user data of a process. diff --git a/src/s4u/s4u_Actor.cpp b/src/s4u/s4u_Actor.cpp index 1df4ce5bb5..a6a4495bfa 100644 --- a/src/s4u/s4u_Actor.cpp +++ b/src/s4u/s4u_Actor.cpp @@ -3,6 +3,7 @@ /* This program is free software; you can redistribute it and/or modify it * under the terms of the license (GNU LGPL) which comes with this package. */ +#include "simgrid/Exception.hpp" #include "simgrid/actor.h" #include "simgrid/s4u/Actor.hpp" #include "simgrid/s4u/Exec.hpp" @@ -663,9 +664,13 @@ sg_actor_t sg_actor_attach(const char* name, void* data, sg_host_t host, xbt_dic xbt_dict_free(&properties); /* Let's create the process: SIMIX may decide to start it right now, even before returning the flow control to us */ - smx_actor_t actor = simgrid::kernel::actor::ActorImpl::attach(name, data, host, &props).get(); - if (not actor) + smx_actor_t actor = nullptr; + try { + actor = simgrid::kernel::actor::ActorImpl::attach(name, data, host, &props).get(); + } catch (simgrid::HostFailureException const&) { xbt_die("Could not attach"); + } + simgrid::s4u::this_actor::yield(); return actor->ciface(); } diff --git a/src/simix/ActorImpl.cpp b/src/simix/ActorImpl.cpp index 00e499d2d9..1123281656 100644 --- a/src/simix/ActorImpl.cpp +++ b/src/simix/ActorImpl.cpp @@ -79,6 +79,8 @@ ActorImplPtr ActorImpl::attach(std::string name, void* data, s4u::Host* host, if (not host->is_on()) { XBT_WARN("Cannot launch process '%s' on failed host '%s'", name.c_str(), host->get_cname()); + std::rethrow_exception( + std::make_exception_ptr(simgrid::HostFailureException(XBT_THROW_POINT, "Cannot attach actor on failed host."))); return nullptr; } @@ -450,6 +452,8 @@ ActorImplPtr ActorImpl::create(std::string name, 
simix::ActorCode code, void* da if (not host->is_on()) { XBT_WARN("Cannot launch actor '%s' on failed host '%s'", name.c_str(), host->get_cname()); + std::rethrow_exception( + std::make_exception_ptr(simgrid::HostFailureException(XBT_THROW_POINT, "Cannot create actor on failed host."))); return nullptr; } diff --git a/src/surf/sg_platf.cpp b/src/surf/sg_platf.cpp index 9b6bc757fd..7d09fa5676 100644 --- a/src/surf/sg_platf.cpp +++ b/src/surf/sg_platf.cpp @@ -3,6 +3,7 @@ /* This program is free software; you can redistribute it and/or modify it * under the terms of the license (GNU LGPL) which comes with this package. */ +#include "simgrid/Exception.hpp" #include "simgrid/kernel/routing/ClusterZone.hpp" #include "simgrid/kernel/routing/DijkstraZone.hpp" #include "simgrid/kernel/routing/DragonflyZone.hpp" @@ -463,9 +464,13 @@ void sg_platf_new_actor(simgrid::kernel::routing::ActorCreationArgs* actor) } else { // start_time <= SIMIX_get_clock() XBT_DEBUG("Starting Process %s(%s) right now", arg->name.c_str(), host->get_cname()); - simgrid::kernel::actor::ActorImplPtr actor = simgrid::kernel::actor::ActorImpl::create( - arg->name.c_str(), std::move(code), nullptr, host, arg->properties.get(), nullptr); - + simgrid::kernel::actor::ActorImplPtr actor = nullptr; + try { + actor = simgrid::kernel::actor::ActorImpl::create(arg->name.c_str(), std::move(code), nullptr, host, + arg->properties.get(), nullptr); + } catch (simgrid::HostFailureException const&) { + XBT_WARN("Deployment include some initially turned off Hosts, nevermind."); + } /* The actor creation will fail if the host is currently dead, but that's fine */ if (actor != nullptr) { if (arg->kill_time >= 0) -- 2.20.1