Logo AND Algorithmique Numérique Distribuée

Public GIT Repository
forcefully kill exiting actors even if their host is not off
[simgrid.git] / src / kernel / actor / ActorImpl.cpp
index 9e386fa..c8640b4 100644 (file)
@@ -33,7 +33,7 @@ static unsigned long simix_process_maxpid = 0;
  */
 smx_actor_t SIMIX_process_self()
 {
-  smx_context_t self_context = simgrid::kernel::context::Context::self();
+  simgrid::kernel::context::Context* self_context = simgrid::kernel::context::Context::self();
 
   return (self_context != nullptr) ? self_context->get_actor() : nullptr;
 }
@@ -59,7 +59,16 @@ ActorImpl::ActorImpl(const simgrid::xbt::string& name, s4u::Host* host) : host_(
   simcall.issuer = this;
 }
 
-ActorImpl::~ActorImpl() = default;
+ActorImpl::~ActorImpl()
+{
+  if (simix_global != nullptr && this != simix_global->maestro_process) {
+    if (context_.get() != nullptr) /* the actor was not start()ed yet. This happens if its host was initially off */
+      context_->iwannadie = false; // don't let the simcall's yield() do a Context::stop(), to avoid infinite loops
+    simgrid::simix::simcall([this] { simgrid::s4u::Actor::on_destruction(*ciface()); });
+    if (context_.get() != nullptr)
+      context_->iwannadie = true;
+  }
+}
 
 /* Become an actor in the simulation
  *
@@ -70,7 +79,7 @@ ActorImpl::~ActorImpl() = default;
  */
 
 ActorImplPtr ActorImpl::attach(const std::string& name, void* data, s4u::Host* host,
-                               std::unordered_map<std::string, std::string>* properties)
+                               const std::unordered_map<std::string, std::string>* properties)
 {
   // This is mostly a copy/paste from create(), it'd be nice to share some code between those two functions.
 
@@ -78,14 +87,13 @@ ActorImplPtr ActorImpl::attach(const std::string& name, void* data, s4u::Host* h
 
   if (not host->is_on()) {
     XBT_WARN("Cannot launch process '%s' on failed host '%s'", name.c_str(), host->get_cname());
-    std::rethrow_exception(
-        std::make_exception_ptr(simgrid::HostFailureException(XBT_THROW_POINT, "Cannot attach actor on failed host.")));
+    throw simgrid::HostFailureException(XBT_THROW_POINT, "Cannot attach actor on failed host.");
   }
 
   ActorImpl* actor = new ActorImpl(xbt::string(name), host);
   /* Actor data */
   actor->set_user_data(data);
-  actor->code = nullptr;
+  actor->code_ = nullptr;
 
   XBT_VERB("Create context %s", actor->get_cname());
   xbt_assert(simix_global != nullptr, "simix is not initialized, please call MSG_init first");
@@ -93,8 +101,7 @@ ActorImplPtr ActorImpl::attach(const std::string& name, void* data, s4u::Host* h
 
   /* Add properties */
   if (properties != nullptr)
-    for (auto const& kv : *properties)
-      actor->set_property(kv.first, kv.second);
+    actor->set_properties(*properties);
 
   /* Add the process to it's host process list */
   host->pimpl_->process_list_.push_back(*actor);
@@ -110,8 +117,7 @@ ActorImplPtr ActorImpl::attach(const std::string& name, void* data, s4u::Host* h
   context->attach_start();
 
   /* The on_creation() signal must be delayed until there, where the pid and everything is set */
-  simgrid::s4u::ActorPtr tmp = actor->iface(); // Passing this directly to on_creation will lead to crashes
-  simgrid::s4u::Actor::on_creation(tmp);
+  simgrid::s4u::Actor::on_creation(*actor->ciface());
 
   return ActorImplPtr(actor);
 }
@@ -133,11 +139,41 @@ void ActorImpl::detach()
 
 void ActorImpl::cleanup()
 {
+  finished_ = true;
+
+  if (has_to_auto_restart() && not get_host()->is_on()) {
+    XBT_DEBUG("Insert host %s to watched_hosts because it's off and %s needs to restart", get_host()->get_cname(),
+              get_cname());
+    watched_hosts.insert(get_host()->get_name());
+  }
+
+  if (on_exit) {
+    // Execute the termination callbacks
+    bool failed = context_->iwannadie;
+    for (auto exit_fun = on_exit->crbegin(); exit_fun != on_exit->crend(); ++exit_fun)
+      (*exit_fun)(failed);
+    on_exit.reset();
+  }
+  undaemonize();
+
+  /* cancel non-blocking activities */
+  for (auto activity : comms)
+    boost::static_pointer_cast<activity::CommImpl>(activity)->cancel();
+  comms.clear();
+
+  XBT_DEBUG("%s@%s(%ld) should not run anymore", get_cname(), get_host()->get_cname(), get_pid());
+
   if (this == simix_global->maestro_process) /* Do not cleanup maestro */
     return;
 
   XBT_DEBUG("Cleanup actor %s (%p), waiting synchro %p", get_cname(), this, waiting_synchro.get());
 
+  /* Unregister from the kill timer if any */
+  if (kill_timer != nullptr) {
+    kill_timer->remove();
+    kill_timer = nullptr;
+  }
+
   simix_global->mutex.lock();
 
   simix_global->process_list.erase(pid_);
@@ -145,62 +181,49 @@ void ActorImpl::cleanup()
     simgrid::xbt::intrusive_erase(host_->pimpl_->process_list_, *this);
   if (not smx_destroy_list_hook.is_linked()) {
 #if SIMGRID_HAVE_MC
-    xbt_dynar_push_as(simix_global->dead_actors_vector, smx_actor_t, this);
+    xbt_dynar_push_as(simix_global->dead_actors_vector, ActorImpl*, this);
 #endif
     simix_global->actors_to_destroy.push_back(*this);
   }
-  context_->iwannadie = false;
 
   simix_global->mutex.unlock();
+
+  context_->iwannadie = false; // don't let the simcall's yield() do a Context::stop(), to avoid infinite loops
+  simgrid::simix::simcall([this] { simgrid::s4u::Actor::on_termination(*ciface()); });
+  context_->iwannadie = true;
 }
 
 void ActorImpl::exit()
 {
   context_->iwannadie = true;
-  blocked_            = false;
   suspended_          = false;
   exception_          = nullptr;
 
-  // Forcefully kill the actor if its host is turned off. Not a HostFailureException because you should not survive that
-  if (not host_->is_on())
-    this->throw_exception(std::make_exception_ptr(ForcefulKillException("host failed")));
-
   /* destroy the blocking synchro if any */
   if (waiting_synchro != nullptr) {
+    waiting_synchro->cancel();
+    waiting_synchro->state_ = SIMIX_FAILED;
 
     activity::ExecImplPtr exec   = boost::dynamic_pointer_cast<activity::ExecImpl>(waiting_synchro);
     activity::CommImplPtr comm   = boost::dynamic_pointer_cast<activity::CommImpl>(waiting_synchro);
-    activity::SleepImplPtr sleep = boost::dynamic_pointer_cast<activity::SleepImpl>(waiting_synchro);
-    activity::RawImplPtr raw     = boost::dynamic_pointer_cast<activity::RawImpl>(waiting_synchro);
-    activity::IoImplPtr io       = boost::dynamic_pointer_cast<activity::IoImpl>(waiting_synchro);
-
-    if (exec != nullptr && exec->surf_action_) {
-      exec->cancel();
-      exec->surf_action_->unref();
-      exec->surf_action_ = nullptr;
+
+    if (exec != nullptr) {
+      exec->clean_action();
     } else if (comm != nullptr) {
       comms.remove(waiting_synchro);
-      comm->cancel();
       // Remove first occurrence of &actor->simcall:
       auto i = boost::range::find(waiting_synchro->simcalls_, &simcall);
       if (i != waiting_synchro->simcalls_.end())
         waiting_synchro->simcalls_.remove(&simcall);
-    } else if (sleep != nullptr) {
-      if (sleep->surf_action_)
-        sleep->surf_action_->cancel();
-      sleep->post();
-    } else if (raw != nullptr) {
-      raw->finish();
-    } else if (io != nullptr) {
-      io->cancel();
     } else {
-      simgrid::kernel::activity::ActivityImplPtr activity = waiting_synchro;
-      xbt_die("Activity %s is of unknown type %s", activity->get_cname(),
-              simgrid::xbt::demangle(typeid(activity).name()).get());
+      activity::ActivityImplPtr(waiting_synchro)->finish();
     }
 
     waiting_synchro = nullptr;
   }
+
+  // Forcefully kill the actor if its host is turned off. Not a HostFailureException because you should not survive that
+  this->throw_exception(std::make_exception_ptr(ForcefulKillException(host_->is_on() ? "exited" : "host failed")));
 }
 
 void ActorImpl::kill(ActorImpl* actor)
@@ -247,18 +270,6 @@ double ActorImpl::get_kill_time()
   return kill_timer ? kill_timer->get_date() : 0;
 }
 
-static void dying_daemon(int /*exit_status*/, void* data)
-{
-  std::vector<ActorImpl*>* vect = &simix_global->daemons;
-
-  auto it = std::find(vect->begin(), vect->end(), static_cast<ActorImpl*>(data));
-  xbt_assert(it != vect->end(), "The dying daemon is not a daemon after all. Please report that bug.");
-
-  /* Don't move the whole content since we don't really care about the order */
-  std::swap(*it, vect->back());
-  vect->pop_back();
-}
-
 void ActorImpl::yield()
 {
   XBT_DEBUG("Yield actor '%s'", get_cname());
@@ -270,7 +281,6 @@ void ActorImpl::yield()
   XBT_DEBUG("Control returned to me: '%s'", get_cname());
 
   if (context_->iwannadie) {
-
     XBT_DEBUG("Actor %s@%s is dead", get_cname(), host_->get_cname());
     // throw simgrid::kernel::context::ForcefulKillException(); Does not seem to properly kill the actor
     context_->stop();
@@ -303,23 +313,39 @@ void ActorImpl::daemonize()
   if (not daemon_) {
     daemon_ = true;
     simix_global->daemons.push_back(this);
-    SIMIX_process_on_exit(this, dying_daemon, this);
+  }
+}
+
+void ActorImpl::undaemonize()
+{
+  if (daemon_) {
+    auto& vect = simix_global->daemons;
+    auto it    = std::find(vect.begin(), vect.end(), this);
+    xbt_assert(it != vect.end(), "The dying daemon is not a daemon after all. Please report that bug.");
+    /* Don't move the whole content since we don't really care about the order */
+
+    std::swap(*it, vect.back());
+    vect.pop_back();
+    daemon_ = false;
   }
 }
 
 s4u::Actor* ActorImpl::restart()
 {
+  xbt_assert(this != simix_global->maestro_process, "Restarting maestro is not supported");
+
   XBT_DEBUG("Restarting actor %s on %s", get_cname(), host_->get_cname());
 
   // retrieve the arguments of the old actor
   ProcessArg arg = ProcessArg(host_, this);
 
   // kill the old actor
-  (this == simix_global->maestro_process) ? this->exit() : SIMIX_process_self()->kill(this);
+  context::Context::self()->get_actor()->kill(this);
 
   // start the new actor
   ActorImplPtr actor =
       ActorImpl::create(arg.name, std::move(arg.code), arg.data, arg.host, arg.properties.get(), nullptr);
+  *actor->on_exit = std::move(*arg.on_exit);
   actor->set_kill_time(arg.kill_time);
   actor->set_auto_restart(arg.auto_restart);
 
@@ -343,7 +369,9 @@ activity::ActivityImplPtr ActorImpl::suspend(ActorImpl* issuer)
 
     return nullptr;
   } else {
-    return activity::ExecImplPtr(new activity::ExecImpl("suspend", ""))->set_host(host_)->start(0.0, 1.0, 0.0);
+    activity::ExecImpl* exec = new activity::ExecImpl();
+    (*exec).set_name("suspend").set_host(host_).set_flops_amount(0.0).start();
+    return activity::ExecImplPtr(exec);
   }
 }
 
@@ -367,19 +395,14 @@ void ActorImpl::resume()
   XBT_OUT();
 }
 
-activity::ActivityImplPtr ActorImpl::join(smx_actor_t actor, double timeout)
+activity::ActivityImplPtr ActorImpl::join(ActorImpl* actor, double timeout)
 {
-  activity::ActivityImplPtr res = this->sleep(timeout);
-  intrusive_ptr_add_ref(res.get());
-  SIMIX_process_on_exit(actor,
-                        [](int, void* arg) {
-                          auto sleep = static_cast<simgrid::kernel::activity::SleepImpl*>(arg);
-                          if (sleep->surf_action_)
-                            sleep->surf_action_->finish(simgrid::kernel::resource::Action::State::FINISHED);
-                          intrusive_ptr_release(sleep);
-                        },
-                        res.get());
-  return res;
+  activity::ActivityImplPtr sleep = this->sleep(timeout);
+  SIMIX_process_on_exit(actor, [sleep](bool) {
+    if (sleep->surf_action_)
+      sleep->surf_action_->finish(resource::Action::State::FINISHED);
+  });
+  return sleep;
 }
 
 activity::ActivityImplPtr ActorImpl::sleep(double duration)
@@ -388,8 +411,9 @@ activity::ActivityImplPtr ActorImpl::sleep(double duration)
     throw_exception(std::make_exception_ptr(simgrid::HostFailureException(
         XBT_THROW_POINT, std::string("Host ") + host_->get_cname() + " failed, you cannot sleep there.")));
 
-  return simgrid::kernel::activity::SleepImplPtr(new simgrid::kernel::activity::SleepImpl("sleep", host_))
-      ->start(duration);
+  activity::SleepImpl* sleep = new activity::SleepImpl();
+  (*sleep).set_name("sleep").set_host(host_).set_duration(duration).start();
+  return activity::SleepImplPtr(sleep);
 }
 
 void ActorImpl::throw_exception(std::exception_ptr e)
@@ -401,56 +425,32 @@ void ActorImpl::throw_exception(std::exception_ptr e)
 
   /* cancel the blocking synchro if any */
   if (waiting_synchro) {
-
-    activity::ExecImplPtr exec = boost::dynamic_pointer_cast<activity::ExecImpl>(waiting_synchro);
-    if (exec != nullptr)
-      exec->cancel();
+    waiting_synchro->cancel();
 
     activity::CommImplPtr comm = boost::dynamic_pointer_cast<activity::CommImpl>(waiting_synchro);
-    if (comm != nullptr) {
-      comms.remove(comm);
-      comm->cancel();
-    }
 
-    activity::SleepImplPtr sleep = boost::dynamic_pointer_cast<activity::SleepImpl>(waiting_synchro);
-    if (sleep != nullptr) {
-      SIMIX_process_sleep_destroy(waiting_synchro);
-      if (std::find(begin(simix_global->actors_to_run), end(simix_global->actors_to_run), this) ==
-              end(simix_global->actors_to_run) &&
-          this != SIMIX_process_self()) {
-        XBT_DEBUG("Inserting [%p] %s in the to_run list", this, get_cname());
-        simix_global->actors_to_run.push_back(this);
-      }
-    }
-
-    activity::RawImplPtr raw = boost::dynamic_pointer_cast<activity::RawImpl>(waiting_synchro);
-    if (raw != nullptr) {
-      raw->finish();
-    }
+    if (comm != nullptr)
+      comms.remove(comm);
 
-    activity::IoImplPtr io = boost::dynamic_pointer_cast<activity::IoImpl>(waiting_synchro);
-    if (io != nullptr) {
-      io->cancel();
-    }
+    waiting_synchro = nullptr;
   }
-  waiting_synchro = nullptr;
 }
 
 void ActorImpl::set_host(s4u::Host* dest)
 {
-  simgrid::xbt::intrusive_erase(host_->pimpl_->process_list_, *this);
+  xbt::intrusive_erase(host_->pimpl_->process_list_, *this);
   host_ = dest;
   dest->pimpl_->process_list_.push_back(*this);
 }
 
 ActorImplPtr ActorImpl::init(const std::string& name, s4u::Host* host)
 {
-  ActorImpl* actor = new ActorImpl(simgrid::xbt::string(name), host);
+  ActorImpl* actor = new ActorImpl(xbt::string(name), host);
   actor->set_ppid(this->pid_);
 
   intrusive_ptr_add_ref(actor);
   /* The on_creation() signal must be delayed until there, where the pid and everything is set */
-  s4u::Actor::on_creation(actor->iface());
+  s4u::Actor::on_creation(*actor->ciface());
 
   return ActorImplPtr(actor);
 }
@@ -462,11 +462,10 @@ ActorImpl* ActorImpl::start(const simix::ActorCode& code)
   if (not host_->is_on()) {
     XBT_WARN("Cannot launch actor '%s' on failed host '%s'", name_.c_str(), host_->get_cname());
     intrusive_ptr_release(this);
-    std::rethrow_exception(
-        std::make_exception_ptr(simgrid::HostFailureException(XBT_THROW_POINT, "Cannot start actor on failed host.")));
+    throw simgrid::HostFailureException(XBT_THROW_POINT, "Cannot start actor on failed host.");
   }
 
-  this->code = code;
+  this->code_ = code;
   XBT_VERB("Create context %s", get_cname());
   context_.reset(simix_global->context_factory->create_context(simix::ActorCode(code), this));
 
@@ -484,23 +483,22 @@ ActorImpl* ActorImpl::start(const simix::ActorCode& code)
 }
 
 ActorImplPtr ActorImpl::create(const std::string& name, const simix::ActorCode& code, void* data, s4u::Host* host,
-                               std::unordered_map<std::string, std::string>* properties, ActorImpl* parent_actor)
+                               const std::unordered_map<std::string, std::string>* properties, ActorImpl* parent_actor)
 {
   XBT_DEBUG("Start actor %s@'%s'", name.c_str(), host->get_cname());
 
   ActorImplPtr actor;
   if (parent_actor != nullptr)
-    actor = parent_actor->init(simgrid::xbt::string(name), host);
+    actor = parent_actor->init(xbt::string(name), host);
   else
-    actor = SIMIX_process_self()->init(simgrid::xbt::string(name), host);
+    actor = SIMIX_process_self()->init(xbt::string(name), host);
 
   /* actor data */
   actor->set_user_data(data);
 
   /* Add properties */
   if (properties != nullptr)
-    for (auto const& kv : *properties)
-      actor->set_property(kv.first, kv.second);
+    actor->set_properties(*properties);
 
   actor->start(code);
 
@@ -538,57 +536,6 @@ smx_actor_t SIMIX_process_attach(const char* name, void* data, const char* hostn
   return simgrid::kernel::actor::ActorImpl::attach(name, data, sg_host_by_name(hostname), properties).get();
 }
 
-/** @deprecated When this function gets removed, also remove the xbt_ex class, that is only there to help users to
- * transition */
-void SIMIX_process_throw(smx_actor_t actor, xbt_errcat_t cat, int value, const char* msg)
-{
-  SMX_EXCEPTION(actor, cat, value, msg);
-
-  if (actor->is_suspended())
-    actor->resume();
-
-  /* cancel the blocking synchro if any */
-  if (actor->waiting_synchro) {
-
-    simgrid::kernel::activity::ExecImplPtr exec =
-        boost::dynamic_pointer_cast<simgrid::kernel::activity::ExecImpl>(actor->waiting_synchro);
-    if (exec != nullptr)
-      exec->cancel();
-
-    simgrid::kernel::activity::CommImplPtr comm =
-        boost::dynamic_pointer_cast<simgrid::kernel::activity::CommImpl>(actor->waiting_synchro);
-    if (comm != nullptr) {
-      actor->comms.remove(comm);
-      comm->cancel();
-    }
-
-    simgrid::kernel::activity::SleepImplPtr sleep =
-        boost::dynamic_pointer_cast<simgrid::kernel::activity::SleepImpl>(actor->waiting_synchro);
-    if (sleep != nullptr) {
-      SIMIX_process_sleep_destroy(actor->waiting_synchro);
-      if (std::find(begin(simix_global->actors_to_run), end(simix_global->actors_to_run), actor) ==
-              end(simix_global->actors_to_run) &&
-          actor != SIMIX_process_self()) {
-        XBT_DEBUG("Inserting [%p] %s in the to_run list", actor, actor->get_cname());
-        simix_global->actors_to_run.push_back(actor);
-      }
-    }
-
-    simgrid::kernel::activity::RawImplPtr raw =
-        boost::dynamic_pointer_cast<simgrid::kernel::activity::RawImpl>(actor->waiting_synchro);
-    if (raw != nullptr) {
-      raw->finish();
-    }
-
-    simgrid::kernel::activity::IoImplPtr io =
-        boost::dynamic_pointer_cast<simgrid::kernel::activity::IoImpl>(actor->waiting_synchro);
-    if (io != nullptr) {
-      io->cancel();
-    }
-  }
-  actor->waiting_synchro = nullptr;
-}
-
 void simcall_HANDLER_process_suspend(smx_simcall_t simcall, smx_actor_t actor)
 {
   smx_activity_t sync_suspend = actor->suspend(simcall->issuer);
@@ -666,18 +613,6 @@ void simcall_HANDLER_process_sleep(smx_simcall_t simcall, double duration)
   simcall->issuer->waiting_synchro = sync;
 }
 
-void SIMIX_process_sleep_destroy(smx_activity_t synchro)
-{
-  XBT_DEBUG("Destroy sleep synchro %p", synchro.get());
-  simgrid::kernel::activity::SleepImplPtr sleep =
-      boost::static_pointer_cast<simgrid::kernel::activity::SleepImpl>(synchro);
-
-  if (sleep->surf_action_) {
-    sleep->surf_action_->unref();
-    sleep->surf_action_ = nullptr;
-  }
-}
-
 /**
  * @brief Calling this function makes the process to yield.
  *
@@ -697,20 +632,33 @@ const std::vector<smx_actor_t>& simgrid::simix::process_get_runnable()
 /** @brief Returns the process from PID. */
 smx_actor_t SIMIX_process_from_PID(aid_t PID)
 {
-  auto actor = simix_global->process_list.find(PID);
-  return actor == simix_global->process_list.end() ? nullptr : actor->second;
+  auto item = simix_global->process_list.find(PID);
+  if (item == simix_global->process_list.end()) {
+    for (auto& a : simix_global->actors_to_destroy)
+      if (a.get_pid() == PID)
+        return &a;
+    return nullptr; // Not found, even in the trash
+  }
+  return item->second;
 }
 
 void SIMIX_process_on_exit(smx_actor_t actor, int_f_pvoid_pvoid_t fun, void* data)
 {
-  SIMIX_process_on_exit(actor, [fun](int a, void* b) { fun((void*)(intptr_t)a, b); }, data);
+  SIMIX_process_on_exit(actor, [fun, data](bool failed) {
+    intptr_t status = failed ? SMX_EXIT_FAILURE : SMX_EXIT_SUCCESS;
+    fun(reinterpret_cast<void*>(status), data);
+  });
+}
+
+void SIMIX_process_on_exit(smx_actor_t actor, const std::function<void(int, void*)>& fun, void* data)
+{
+  SIMIX_process_on_exit(actor, [fun, data](bool failed) { fun(failed ? SMX_EXIT_FAILURE : SMX_EXIT_SUCCESS, data); });
 }
 
-void SIMIX_process_on_exit(smx_actor_t actor, const std::function<void(bool, void*)>& fun, void* data)
+void SIMIX_process_on_exit(smx_actor_t actor, const std::function<void(bool /*failed*/)>& fun)
 {
   xbt_assert(actor, "current process not found: are you in maestro context ?");
-
-  actor->on_exit.emplace_back(s_smx_process_exit_fun_t{fun, data});
+  actor->on_exit->emplace_back(fun);
 }
 
 /** @brief Restart a process, starting it again from the beginning. */