Logo AND Algorithmique Numérique Distribuée

Public GIT Repository
Remote Exec should not survive to a host failure
[simgrid.git] / src / s4u / s4u_Exec.cpp
index 9d1a552..ab680d8 100644 (file)
@@ -7,9 +7,10 @@
 #include "simgrid/exec.h"
 #include "simgrid/s4u/Actor.hpp"
 #include "simgrid/s4u/Exec.hpp"
+#include "simgrid/s4u/Host.hpp"
 #include "src/kernel/activity/ExecImpl.hpp"
 #include "src/kernel/actor/ActorImpl.hpp"
-#include "src/mc/checker/SimcallObserver.hpp"
+#include "src/kernel/actor/SimcallObserver.hpp"
 #include "xbt/log.h"
 
 XBT_LOG_NEW_DEFAULT_SUBCATEGORY(s4u_exec, s4u_activity, "S4U asynchronous executions");
@@ -24,59 +25,60 @@ Exec::Exec(kernel::activity::ExecImplPtr pimpl)
   pimpl_ = pimpl;
 }
 
+void Exec::complete(Activity::State state)
+{
+  Activity::complete(state);
+  on_completion(*this);
+}
+
 ExecPtr Exec::init()
 {
   auto pimpl = kernel::activity::ExecImplPtr(new kernel::activity::ExecImpl());
+  Host::on_state_change.connect([pimpl](s4u::Host const& h) {
+    if (not h.is_on() && pimpl->state_ == kernel::activity::State::RUNNING &&
+        std::find(pimpl->get_hosts().begin(), pimpl->get_hosts().end(), &h) != pimpl->get_hosts().end()) {
+      pimpl->state_ = kernel::activity::State::FAILED;
+      pimpl->post();
+    }
+  });
   return ExecPtr(pimpl->get_iface());
 }
 
-Exec* Exec::wait()
+Exec* Exec::start()
 {
-  return this->wait_for(-1);
-}
+  kernel::actor::simcall([this] {
+    (*boost::static_pointer_cast<kernel::activity::ExecImpl>(pimpl_))
+        .set_name(get_name())
+        .set_tracing_category(get_tracing_category())
+        .start();
+  });
 
-Exec* Exec::wait_for(double timeout)
-{
-  if (state_ == State::INITED)
-    vetoable_start();
+  if (suspended_)
+    pimpl_->suspend();
 
-  kernel::actor::ActorImpl* issuer = kernel::actor::ActorImpl::self();
-  kernel::actor::simcall_blocking<void>([this, issuer, timeout] { this->get_impl()->wait_for(issuer, timeout); });
-  state_ = State::FINISHED;
-  on_completion(*this);
-  this->release_dependencies();
+  state_      = State::STARTED;
+  on_start(*this);
   return this;
 }
 
-int Exec::wait_any_for(std::vector<ExecPtr>* execs, double timeout)
+ssize_t Exec::wait_any_for(const std::vector<ExecPtr>& execs, double timeout)
 {
-  std::vector<kernel::activity::ExecImpl*> rexecs(execs->size());
-  std::transform(begin(*execs), end(*execs), begin(rexecs),
+  std::vector<kernel::activity::ExecImpl*> rexecs(execs.size());
+  std::transform(begin(execs), end(execs), begin(rexecs),
                  [](const ExecPtr& exec) { return static_cast<kernel::activity::ExecImpl*>(exec->pimpl_.get()); });
 
   kernel::actor::ActorImpl* issuer = kernel::actor::ActorImpl::self();
-  mc::ExecutionWaitanySimcall observer{issuer, &rexecs, timeout};
-  kernel::actor::simcall_blocking<void>(
+  kernel::actor::ExecutionWaitanySimcall observer{issuer, rexecs, timeout};
+  ssize_t changed_pos = kernel::actor::simcall_blocking(
       [&observer] {
         kernel::activity::ExecImpl::wait_any_for(observer.get_issuer(), observer.get_execs(), observer.get_timeout());
       },
       &observer);
-  int changed_pos = observer.get_result();
-  if (changed_pos != -1) {
-    on_completion(*(execs->at(changed_pos)));
-    execs->at(changed_pos)->release_dependencies();
-  }
+  if (changed_pos != -1)
+    execs.at(changed_pos)->complete(State::FINISHED);
   return changed_pos;
 }
 
-Exec* Exec::cancel()
-{
-  kernel::actor::simcall([this] { boost::static_pointer_cast<kernel::activity::ExecImpl>(pimpl_)->cancel(); });
-  state_ = State::CANCELED;
-  on_completion(*this);
-  return this;
-}
-
 /** @brief change the execution bound
  * This means changing the maximal amount of flops per second that it may consume, regardless of what the host may
  * deliver. Currently, this cannot be changed once the exec started.
@@ -84,15 +86,25 @@ Exec* Exec::cancel()
 ExecPtr Exec::set_bound(double bound)
 {
   xbt_assert(state_ == State::INITED || state_ == State::STARTING,
-      "Cannot change the bound of an exec after its start");
-  bound_ = bound;
+             "Cannot change the bound of an exec after its start");
+  kernel::actor::simcall(
+      [this, bound] { boost::static_pointer_cast<kernel::activity::ExecImpl>(pimpl_)->set_bound(bound); });
   return this;
 }
-ExecPtr Exec::set_timeout(double timeout) // XBT_ATTRIB_DEPRECATED_v329
+
+/** @brief  Change the execution priority, don't you think?
+ *
+ * An execution with twice the priority will get twice the amount of flops when the resource is shared.
+ * The default priority is 1.
+ *
+ * Currently, this cannot be changed once the exec started. */
+ExecPtr Exec::set_priority(double priority)
 {
-  xbt_assert(state_ == State::INITED|| state_ == State::STARTING,
-      "Cannot change the bound of an exec after its start");
-  timeout_ = timeout;
+  xbt_assert(state_ == State::INITED || state_ == State::STARTING,
+             "Cannot change the priority of an exec after its start");
+  kernel::actor::simcall([this, priority] {
+    boost::static_pointer_cast<kernel::activity::ExecImpl>(pimpl_)->set_sharing_penalty(1. / priority);
+  });
   return this;
 }
 
@@ -100,8 +112,10 @@ ExecPtr Exec::set_flops_amount(double flops_amount)
 {
   xbt_assert(state_ == State::INITED || state_ == State::STARTING,
       "Cannot change the flop_amount of an exec after its start");
-  flops_amounts_.assign(1, flops_amount);
-  Activity::set_remaining(flops_amounts_.front());
+  kernel::actor::simcall([this, flops_amount] {
+    boost::static_pointer_cast<kernel::activity::ExecImpl>(pimpl_)->set_flops_amount(flops_amount);
+  });
+  Activity::set_remaining(flops_amount);
   return this;
 }
 
@@ -109,7 +123,9 @@ ExecPtr Exec::set_flops_amounts(const std::vector<double>& flops_amounts)
 {
   xbt_assert(state_ == State::INITED || state_ == State::STARTING,
       "Cannot change the flops_amounts of an exec after its start");
-  flops_amounts_ = flops_amounts;
+  kernel::actor::simcall([this, flops_amounts] {
+    boost::static_pointer_cast<kernel::activity::ExecImpl>(pimpl_)->set_flops_amounts(flops_amounts);
+  });
   parallel_      = true;
   return this;
 }
@@ -118,7 +134,9 @@ ExecPtr Exec::set_bytes_amounts(const std::vector<double>& bytes_amounts)
 {
   xbt_assert(state_ == State::INITED || state_ == State::STARTING,
       "Cannot change the bytes_amounts of an exec after its start");
-  bytes_amounts_ = bytes_amounts;
+  kernel::actor::simcall([this, bytes_amounts] {
+    boost::static_pointer_cast<kernel::activity::ExecImpl>(pimpl_)->set_bytes_amounts(bytes_amounts);
+  });
   parallel_      = true;
   return this;
 }
@@ -134,22 +152,15 @@ unsigned int Exec::get_host_number() const
 {
   return static_cast<kernel::activity::ExecImpl*>(pimpl_.get())->get_host_number();
 }
-double Exec::get_cost() const
+
+double Exec::get_start_time() const
 {
-  return (pimpl_->surf_action_ == nullptr) ? -1 : pimpl_->surf_action_->get_cost();
+  return static_cast<kernel::activity::ExecImpl*>(pimpl_.get())->get_start_time();
 }
 
-/** @brief  Change the execution priority, don't you think?
- *
- * An execution with twice the priority will get twice the amount of flops when the resource is shared.
- * The default priority is 1.
- *
- * Currently, this cannot be changed once the exec started. */
-ExecPtr Exec::set_priority(double priority)
+double Exec::get_finish_time() const
 {
-  xbt_assert(state_ == State::INITED, "Cannot change the priority of an exec after its start");
-  priority_ = priority;
-  return this;
+  return static_cast<kernel::activity::ExecImpl*>(pimpl_.get())->get_finish_time();
 }
 
 /** @brief Change the host on which this activity takes place.
@@ -158,13 +169,13 @@ ExecPtr Exec::set_priority(double priority)
 ExecPtr Exec::set_host(Host* host)
 {
   xbt_assert(state_ == State::INITED || state_ == State::STARTING || state_ == State::STARTED,
-             "Cannot change the host of an exec once it's done (state: %d)", (int)state_);
-  hosts_.assign(1, host);
+             "Cannot change the host of an exec once it's done (state: %s)", to_c_str(state_));
 
   if (state_ == State::STARTED)
     boost::static_pointer_cast<kernel::activity::ExecImpl>(pimpl_)->migrate(host);
 
-  boost::static_pointer_cast<kernel::activity::ExecImpl>(pimpl_)->set_host(host);
+  kernel::actor::simcall(
+      [this, host] { boost::static_pointer_cast<kernel::activity::ExecImpl>(pimpl_)->set_host(host); });
 
   if (state_ == State::STARTING)
   // Setting the host may allow to start the activity, let's try
@@ -176,9 +187,10 @@ ExecPtr Exec::set_host(Host* host)
 ExecPtr Exec::set_hosts(const std::vector<Host*>& hosts)
 {
   xbt_assert(state_ == State::INITED || state_ == State::STARTING,
-      "Cannot change the hosts of an exec once it's done (state: %d)", (int)state_);
+             "Cannot change the hosts of an exec once it's done (state: %s)", to_c_str(state_));
 
-  hosts_    = hosts;
+  kernel::actor::simcall(
+      [this, hosts] { boost::static_pointer_cast<kernel::activity::ExecImpl>(pimpl_)->set_hosts(hosts); });
   parallel_ = true;
 
   // Setting the host may allow to start the activity, let's try
@@ -188,35 +200,9 @@ ExecPtr Exec::set_hosts(const std::vector<Host*>& hosts)
   return this;
 }
 
-Exec* Exec::start()
+double Exec::get_cost() const
 {
-  if (is_parallel())
-    kernel::actor::simcall([this] {
-      (*boost::static_pointer_cast<kernel::activity::ExecImpl>(pimpl_))
-          .set_hosts(hosts_)
-          .set_timeout(timeout_)
-          .set_flops_amounts(flops_amounts_)
-          .set_bytes_amounts(bytes_amounts_)
-          .start();
-    });
-  else
-    kernel::actor::simcall([this] {
-      (*boost::static_pointer_cast<kernel::activity::ExecImpl>(pimpl_))
-          .set_name(get_name())
-          .set_tracing_category(get_tracing_category())
-          .set_sharing_penalty(1. / priority_)
-          .set_bound(bound_)
-          .set_flops_amount(flops_amounts_.front())
-          .start();
-    });
-
-  if (suspended_)
-    pimpl_->suspend();
-
-  state_ = State::STARTED;
-  start_time_ = pimpl_->surf_action_->get_start_time();
-  on_start(*this);
-  return this;
+  return (pimpl_->surf_action_ == nullptr) ? -1 : pimpl_->surf_action_->get_cost();
 }
 
 double Exec::get_remaining() const
@@ -243,6 +229,10 @@ double Exec::get_remaining_ratio() const
         [this]() { return boost::static_pointer_cast<kernel::activity::ExecImpl>(pimpl_)->get_seq_remaining_ratio(); });
 }
 
+bool Exec::is_assigned() const
+{
+  return not boost::static_pointer_cast<kernel::activity::ExecImpl>(pimpl_)->get_hosts().empty();
+}
 } // namespace s4u
 } // namespace simgrid
 
@@ -298,19 +288,7 @@ int sg_exec_test(sg_exec_t exec)
 
 sg_error_t sg_exec_wait(sg_exec_t exec)
 {
-  sg_error_t status = SG_OK;
-
-  simgrid::s4u::ExecPtr s4u_exec(exec, false);
-  try {
-    s4u_exec->wait_for(-1);
-  } catch (const simgrid::TimeoutException&) {
-    status = SG_ERROR_TIMEOUT;
-  } catch (const simgrid::CancelException&) {
-    status = SG_ERROR_CANCELED;
-  } catch (const simgrid::HostFailureException&) {
-    status = SG_ERROR_HOST;
-  }
-  return status;
+  return sg_exec_wait_for(exec, -1.0);
 }
 
 sg_error_t sg_exec_wait_for(sg_exec_t exec, double timeout)
@@ -321,6 +299,7 @@ sg_error_t sg_exec_wait_for(sg_exec_t exec, double timeout)
   try {
     s4u_exec->wait_for(timeout);
   } catch (const simgrid::TimeoutException&) {
+    s4u_exec->add_ref(); // the wait_for timeouted, keep the exec alive
     status = SG_ERROR_TIMEOUT;
   } catch (const simgrid::CancelException&) {
     status = SG_ERROR_CANCELED;
@@ -330,20 +309,20 @@ sg_error_t sg_exec_wait_for(sg_exec_t exec, double timeout)
   return status;
 }
 
-int sg_exec_wait_any(sg_exec_t* execs, size_t count)
+ssize_t sg_exec_wait_any(sg_exec_t* execs, size_t count)
 {
-  return sg_exec_wait_any_for(execs, count, -1);
+  return sg_exec_wait_any_for(execs, count, -1.0);
 }
 
-int sg_exec_wait_any_for(sg_exec_t* execs, size_t count, double timeout)
+ssize_t sg_exec_wait_any_for(sg_exec_t* execs, size_t count, double timeout)
 {
   std::vector<simgrid::s4u::ExecPtr> s4u_execs;
-  for (unsigned int i = 0; i < count; i++)
+  for (size_t i = 0; i < count; i++)
     s4u_execs.emplace_back(execs[i], false);
 
-  int pos = simgrid::s4u::Exec::wait_any_for(&s4u_execs, timeout);
-  for (unsigned i = 0; i < count; i++) {
-    if (pos != -1 && static_cast<unsigned>(pos) != i)
+  ssize_t pos = simgrid::s4u::Exec::wait_any_for(s4u_execs, timeout);
+  for (size_t i = 0; i < count; i++) {
+    if (pos != -1 && static_cast<size_t>(pos) != i)
       s4u_execs[i]->add_ref();
   }
   return pos;