Logo AND Algorithmique Numérique Distribuée

Public GIT Repository
Introduce the SimGrid Chaos Monkey
authorMartin Quinson <martin.quinson@ens-rennes.fr>
Sun, 27 Feb 2022 10:04:54 +0000 (11:04 +0100)
committerMartin Quinson <martin.quinson@ens-rennes.fr>
Sun, 27 Feb 2022 11:44:58 +0000 (12:44 +0100)
See the tools/simgrid-monkey script and its comments for more info.

MANIFEST.in
src/plugins/chaos_monkey.cpp [new file with mode: 0644]
teshsuite/s4u/CMakeLists.txt
teshsuite/s4u/monkey-masterworkers/monkey-masterworkers.cpp [new file with mode: 0644]
tools/cmake/DefinePackages.cmake
tools/simgrid-monkey [new file with mode: 0755]

index eac44c5..60a81f4 100644 (file)
@@ -808,6 +808,7 @@ include teshsuite/s4u/issue71/issue71.tesh
 include teshsuite/s4u/issue71/platform_bad.xml
 include teshsuite/s4u/listen_async/listen_async.cpp
 include teshsuite/s4u/listen_async/listen_async.tesh
+include teshsuite/s4u/monkey-masterworkers/monkey-masterworkers.cpp
 include teshsuite/s4u/ns3-from-src-to-itself/ns3-from-src-to-itself.cpp
 include teshsuite/s4u/ns3-from-src-to-itself/ns3-from-src-to-itself.tesh
 include teshsuite/s4u/ns3-simultaneous-send-rcv/ns3-simultaneous-send-rcv.cpp
@@ -2432,6 +2433,7 @@ include src/msg/msg_private.hpp
 include src/msg/msg_process.cpp
 include src/msg/msg_task.cpp
 include src/plugins/ProducerConsumer.cpp
+include src/plugins/chaos_monkey.cpp
 include src/plugins/file_system/s4u_FileSystem.cpp
 include src/plugins/host_dvfs.cpp
 include src/plugins/host_energy.cpp
@@ -2824,6 +2826,7 @@ include tools/cmake/test_prog/prog_stacksetup.c
 include tools/cmake/test_prog/prog_tsan.cpp
 include tools/doxygen/list_routing_models_examples.sh
 include tools/graphicator/CMakeLists.txt
+include tools/simgrid-monkey
 include tools/smpi/generate_smpi_defines.pl
 include tools/stack-cleaner/README
 include tools/stack-cleaner/as
diff --git a/src/plugins/chaos_monkey.cpp b/src/plugins/chaos_monkey.cpp
new file mode 100644 (file)
index 0000000..32c9ba7
--- /dev/null
@@ -0,0 +1,77 @@
+/* Copyright (c) 2022-2022. The SimGrid Team. All rights reserved.          */
+
+/* This program is free software; you can redistribute it and/or modify it
+ * under the terms of the license (GNU LGPL) which comes with this package. */
+
+// Chaos Monkey plugin: See the simgrid-monkey script for more information
+
+#include <simgrid/kernel/Timer.hpp>
+#include <simgrid/s4u/Engine.hpp>
+#include <simgrid/s4u/Host.hpp>
+#include <xbt/config.hpp>
+
+#include "src/surf/surf_interface.hpp" // SIMGRID_REGISTER_PLUGIN
+
+namespace sg4 = simgrid::s4u;
+// Command-line parameters of the plugin (set with --cfg=cmonkey/...).
+// cmonkey/tell: when true, the plugin logs the host count, the link count and every timestamp.
+static simgrid::config::Flag<bool> cfg_tell{"cmonkey/tell", "Request the Chaos Monkey to display all timestamps",
+                                            false};
+// cmonkey/time: a negative value (the default) means that no kill is scheduled.
+static simgrid::config::Flag<double> cfg_time{"cmonkey/time", "When should the chaos monkey kill a resource", -1.};
+// cmonkey/link and cmonkey/host: index of the victim in the engine's link / host list; negative means unset.
+static simgrid::config::Flag<int> cfg_link{"cmonkey/link", "Which link should be killed (number)", -1};
+static simgrid::config::Flag<int> cfg_host{"cmonkey/host", "Which host should be killed (number)", -1};
+static void sg_chaos_monkey_plugin_init();
+// Makes sure that this plugin can be activated from the command line with ``--cfg=plugin:chaos_monkey``
+SIMGRID_REGISTER_PLUGIN(cmonkey, "Chaos monkey", &sg_chaos_monkey_plugin_init)
+
+XBT_LOG_NEW_DEFAULT_SUBCATEGORY(cmonkey, kernel, "Chaos Monkey plugin");
+
+static void sg_chaos_monkey_plugin_init()
+{
+  XBT_INFO("Initializing the chaos monkey");
+
+  // delay the initialization until after the parameter are parsed
+  sg4::Engine::on_platform_created_cb([]() {
+    auto engine = sg4::Engine::get_instance();
+    auto hosts  = engine->get_all_hosts();
+    auto links  = engine->get_all_links();
+
+    sg4::Engine::on_deadlock_cb([]() { exit(2); });
+
+    if (cfg_tell) {
+      XBT_INFO("HOST_COUNT=%zu", hosts.size());
+      XBT_INFO("LINK_COUNT=%zu", links.size());
+      sg4::Engine::on_time_advance_cb([engine](double /* delta*/) { XBT_INFO("TIMESTAMP=%lf", engine->get_clock()); });
+    }
+
+    if (cfg_time >= 0) {
+      int host = cfg_host;
+      int link = cfg_link;
+      xbt_assert(host >= 0 || link >= 0,
+                 "If a kill time is given, you must also specify a resource to kill (either a link or an host)");
+      xbt_assert(host < 0 || link < 0, "Cannot specify both a link and an host to kill");
+      if (host >= 0) {
+        auto* h = hosts[host];
+        simgrid::kernel::timer::Timer::set(cfg_time, [h]() {
+          XBT_INFO("Kill host %s", h->get_cname());
+          h->turn_off();
+        });
+        simgrid::kernel::timer::Timer::set(cfg_time + 30, [h]() {
+          XBT_INFO("Restart host %s", h->get_cname());
+          h->turn_on();
+        });
+      }
+      if (link >= 0) {
+        auto* l = links[link];
+        simgrid::kernel::timer::Timer::set(cfg_time, [l]() {
+          XBT_INFO("Kill link %s", l->get_cname());
+          l->turn_off();
+        });
+        simgrid::kernel::timer::Timer::set(cfg_time + 30, [l]() {
+          XBT_INFO("Restart host %s", l->get_cname());
+          l->turn_on();
+        });
+      }
+    }
+
+    sg4::Engine::on_simulation_end_cb([]() { XBT_INFO("Chaos Monkey done!"); });
+  });
+}
index bf142a3..447c066 100644 (file)
@@ -8,6 +8,7 @@ foreach(x actor actor-autorestart actor-suspend
         activity-lifecycle
         comm-get-sender comm-pt2pt wait-all-for wait-any-for
         cloud-interrupt-migration cloud-two-execs
+       monkey-masterworkers
         concurrent_rw
         dag-incomplete-simulation dependencies
         host-on-off host-on-off-actors host-on-off-recv host-multicore-speed-file io-set-bw
diff --git a/teshsuite/s4u/monkey-masterworkers/monkey-masterworkers.cpp b/teshsuite/s4u/monkey-masterworkers/monkey-masterworkers.cpp
new file mode 100644 (file)
index 0000000..7471705
--- /dev/null
@@ -0,0 +1,128 @@
+/* Copyright (c) 2007-2022. The SimGrid Team. All rights reserved.          */
+
+/* This program is free software; you can redistribute it and/or modify it
+ * under the terms of the license (GNU LGPL) which comes with this package. */
+
+/* This is a version of the masterworkers that (hopefully) survives the chaos monkey.
+ * It tests synchronous send/receive as well as synchronous computations.
+ *
+ * It is not written to be pleasant to read, but instead to resist the aggressions of the monkey:
+ * - Workers keep going until after a global variable `todo` reaches 0.
+ * - The master is a daemon that just sends tasks endlessly
+ *   (simgrid simulations stop as soon as all non-daemon actors are done).
+ * - The platform is created programmatically to remove path issues and control the problem size.
+ *
+ * Command-line configuration items:
+ * - host-count: how many actors to start (including the master)
+ * - task-count: initial value of the `todo` global
+ * - deadline: time at which the simulation is known to be failed (to detect infinite loops).
+ *
+ * See the simgrid-monkey script for more information.
+ */
+
+#include <simgrid/s4u.hpp>
+#include <xbt/config.hpp>
+#include <xbt/string.hpp>
+
+namespace sg4 = simgrid::s4u;
+
+XBT_LOG_NEW_DEFAULT_CATEGORY(s4u_test, "Messages specific for this s4u example");
+
+// Command-line parameters of this test program.
+// host-count: number of hosts created by main(); the first one runs the master, each other one runs a worker.
+static simgrid::config::Flag<int> cfg_host_count{"host-count", "Host count (master on one, workers on the others)", 2};
+// deadline: simulated date after which the master and the workers fail an assertion.
+static simgrid::config::Flag<double> cfg_deadline{"deadline", "When to fail the simulation (infinite loop detection)",
+                                                  120};
+// task-count: initial value given to `todo` in main().
+static simgrid::config::Flag<int> cfg_task_count{"task-count", "Amount of tasks that must be executed to succeed", 1};
+
+// Decremented by the workers after each completed execution; the workers stop looping once it is no longer positive.
+int todo; // remaining amount of tasks to execute, a global variable
+
+/** @brief Master actor: sends tasks on the mailbox named "mailbox", forever.
+ *
+ * @param comp_size value stored in each payload (the worker passes it to execute())
+ * @param comm_size simulated size given to each put()
+ *
+ * The actor daemonizes itself, so it never returns by itself. It fails an assertion if the simulated clock
+ * reaches the configured deadline.
+ */
+static void master(double comp_size, long comm_size)
+{
+  XBT_INFO("Master booting");
+  sg4::Actor::self()->daemonize();
+
+  auto mailbox = sg4::Mailbox::by_name("mailbox");
+  while (true) { // This is a daemon
+    xbt_assert(sg4::Engine::get_clock() < cfg_deadline,
+               "Failed to run all tasks in less than %d seconds. Is this an infinite loop?", (int)cfg_deadline);
+
+    // A fresh payload is allocated for each attempt; it is only freed here when the send fails
+    auto* payload = new double(comp_size);
+    try {
+      XBT_INFO("Try to send a message");
+      mailbox->put(payload, comm_size, 10.0); // give up after a timeout of 10.0
+    } catch (const simgrid::TimeoutException&) {
+      delete payload;
+      XBT_INFO("Timeouted while sending a task");
+    } catch (const simgrid::NetworkFailureException&) {
+      delete payload;
+      XBT_INFO("Network error while sending a task");
+    }
+  }
+  // The loop above has no exit, so this line is not expected to be reached
+  THROW_IMPOSSIBLE;
+}
+
+/** @brief Worker actor: receives tasks from the mailbox named "mailbox" and executes them while `todo` is positive.
+ *
+ * @param id worker number given by main(); it is not used in the body
+ *
+ * Timeouts and network failures are logged and the loop goes on. The actor fails an assertion if the
+ * simulated clock reaches the configured deadline.
+ */
+static void worker(int id)
+{
+  XBT_INFO("Worker booting");
+  sg4::Mailbox* mailbox = sg4::Mailbox::by_name("mailbox");
+  while (todo > 0) {
+    xbt_assert(sg4::Engine::get_clock() < cfg_deadline,
+               "Failed to run all tasks in less than %d seconds. Is this an infinite loop?", (int)cfg_deadline);
+    try {
+      XBT_INFO("Waiting a message on %s", mailbox->get_cname());
+      auto payload = mailbox->get_unique<double>(10); // wait for a task, with a timeout of 10
+      xbt_assert(payload != nullptr, "mailbox->get() failed");
+      double comp_size = *payload;
+      if (comp_size < 0) { /* - Exit when a negative size is received */
+        XBT_INFO("I'm done. See you!");
+        break;
+      }
+      /*  - Otherwise, process the task */
+      XBT_INFO("Start execution...");
+      sg4::this_actor::execute(comp_size);
+      XBT_INFO("Execution complete.");
+      todo--; // only counted once the execution went through without exception
+    } catch (const simgrid::TimeoutException&) {
+      XBT_INFO("Timeouted while getting a task.");
+
+    } catch (const simgrid::NetworkFailureException&) {
+      XBT_INFO("Mmh. Something went wrong. Nevermind. Let's keep going!");
+    }
+  }
+}
+
+int main(int argc, char* argv[])
+{
+  sg4::Engine e(&argc, argv);
+
+  XBT_INFO("host count: %d ", (int)cfg_host_count);
+
+  auto* rootzone = sg4::create_full_zone("root");
+  sg4::Host* main; // First host created, where the master will stay
+  std::vector<sg4::Host*> worker_hosts;
+  for (int i = 0; i < cfg_host_count; i++) {
+    auto hostname = std::string("lilibeth ") + std::to_string(i);
+    auto* host    = rootzone->create_host(hostname, 1e15);
+    if (i == 0) {
+      main = host;
+    } else {
+      sg4::LinkInRoute link(rootzone->create_link(hostname, "1MBps")->set_latency("24us")->seal());
+      rootzone->add_route(main->get_netpoint(), host->get_netpoint(), nullptr, nullptr, {link}, true);
+      worker_hosts.push_back(host);
+    }
+  }
+  rootzone->seal();
+  sg4::Engine::get_instance()->on_platform_created(); // FIXME this should not be necessary
+
+  sg4::Actor::create("master", main, master, 50000000, 1000000)->set_auto_restart(true);
+  int id = 0;
+  for (auto* h : worker_hosts)
+    sg4::Actor::create("worker", h, worker, id++)->set_auto_restart(true);
+
+  todo = cfg_task_count;
+  xbt_assert(todo > 0, "Please give more than %d tasks to run", todo);
+
+  e.run();
+
+  XBT_INFO("WE SURVIVED!");
+  return 0;
+}
index 9469044..87ca7ac 100644 (file)
@@ -364,6 +364,7 @@ set(SURF_SRC
 
 set(PLUGINS_SRC
   src/plugins/ProducerConsumer.cpp
+  src/plugins/chaos_monkey.cpp
   src/plugins/host_dvfs.cpp
   src/plugins/host_energy.cpp
   src/plugins/link_energy.cpp
@@ -1107,6 +1108,7 @@ set(CMAKE_SOURCE_FILES
   tools/cmake/test_prog/prog_stacksetup.c
   tools/cmake/test_prog/prog_tsan.cpp
   tools/cmake/cross-mingw.cmake
+  tools/simgrid-monkey
   tools/smpi/generate_smpi_defines.pl
   tools/stack-cleaner/as
   tools/stack-cleaner/cc
diff --git a/tools/simgrid-monkey b/tools/simgrid-monkey
new file mode 100755 (executable)
index 0000000..04567c2
--- /dev/null
@@ -0,0 +1,98 @@
+#! /usr/bin/python3
+
+# The goal is to introduce random failures in a simulation, to test simgrid under extreme conditions.
+# 
+# It is made of several components.
+# 
+# * a plugin: cmonkey. Can be used from the command line as follows:
+#   * --cfg=plugin:cmonkey --cfg=cmonkey/tell:1
+#     Get information about the resource count and the timestamps of each scheduling rounds.
+#   * --cfg=plugin:cmonkey --cfg=cmonkey/time:42 --cfg=cmonkey/host:1
+#     Kill the host #1 after 42 seconds (using a kernel::Timer so that no actor gets involved in the killing)
+#   * --cfg=plugin:cmonkey --cfg=cmonkey/time:42 --cfg=cmonkey/link:0
+#     Kill the link #0 after 42 seconds (using a kernel::Timer)
+# 
+# * a python script: tools/simgrid-monkey (this file)
+#   * It takes a regular simgrid simulation as a parameter, uses the cmonkey plugin to get the information about it,
+#     and then starts many runs, with one resource being turned off and back on (turn_off() + turn_on()) in each run.
+#   * Each resource gets killed between consecutive timestamps, and on each timestamp.
+#   * So the amount of simulations is: 1 + (host_c+link_c) * timestamps * 2
+# 
+# * Test program, written to resist these extreme conditions:
+#   * teshsuite/s4u/monkey-masterworkers: tests synchronous comms and execs
+
+import multiprocessing as mp
+import sys
+import os
+import argparse
+import subprocess
+import copy
+import re
+
+
+def get_info(cmd):
+    cmd_tell = copy.deepcopy(cmd)
+    cmd_tell.append("--cfg=plugin:cmonkey")
+    cmd_tell.append("--cfg=cmonkey/tell:1")
+    cmd_tell.append("--log=root.t:critical")
+    cmd_tell.append("--log=cmonkey.t:info")
+    cmd_tell.append("--log=cmonkey.fmt:%m%n")
+    print(f"Get the initial info from the command ``{' '.join(cmd_tell)}``")
+    first_run = subprocess.run(cmd_tell, shell=False, cwd=os.getcwd(), stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
+
+    if first_run.returncode != 0:
+        msg = f"Peek run of the command raised an error (retcode: {first_run.returncode})\n"
+        msg += f"Full command was {' '.join(cmd_tell)}\n"
+        if first_run.stdout is not None:
+            msg += str(first_run.stdout, errors='replace')
+        raise Exception(msg)
+
+    host_count=0
+    link_count=0
+    timestamps=[]
+    for line in str(first_run.stdout, errors='replace').split("\n"):
+        if re.match("^HOST_COUNT=(.*)", line):
+            m = re.match("^HOST_COUNT=(.*)", line)
+            host_count = int(m.group(1))
+        if re.match("^LINK_COUNT=(.*)", line):
+            m = re.match("^LINK_COUNT=(.*)", line)
+            link_count = int(m.group(1))
+        if re.match("^TIMESTAMP=(.*)", line):
+            m = re.match("^TIMESTAMP=(.*)", line)
+            timestamps.append(float(m.group(1)))
+
+    #print(f"hosts:{host_count} links:{link_count} timestamps:{' '.join(([str(i) for i in timestamps]))}")
+    return (host_count,  link_count,  timestamps)
+
+# Command line of this script: an optional --valgrind value, then the simulation command to run and its arguments.
+# NOTE(review): args.valgrind is not read anywhere in the visible part of this script -- confirm whether it is still to be implemented.
+parser = argparse.ArgumentParser(description='Run a simgrid simulation, and turn off/on resources at random.')
+parser.add_argument('--valgrind', help="Run the simulations in valgrind")
+parser.add_argument('command', nargs='*')
+args = parser.parse_args()
+
+# Peek run: ask the cmonkey plugin for the resource counts and for the timestamps of the simulation
+(host_count,  link_count,  timestamps) = get_info(args.command)
+print(f"hosts:{host_count} links:{link_count} timestamps:{' '.join(([str(i) for i in timestamps]))}")
+
+def do_run(cmd, extra_params):
+    cmd = copy.deepcopy(cmd)
+    cmd.append("--cfg=plugin:cmonkey")
+    for p in extra_params:
+        cmd.append(p)
+    print(f"\n#################################################################################\nStart {' '.join(cmd)}")
+    run = subprocess.run(cmd, shell=False, cwd=os.getcwd(), stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
+
+    if run.returncode != 0:
+        msg = f"ERROR (retcode: {run.returncode}). Output:\n"
+        msg += str(run.stdout, errors='replace')
+        print(msg)
+        os.exit(1)
+    print ("Success.")
+
+def doit():
+    prev = 0
+    for pos in range(len(timestamps)):
+        now = timestamps[pos]
+        for host in range(host_count):
+            do_run(args.command, [f"--cfg=cmonkey/time:{(now-prev)/2}",f"--cfg=cmonkey/host:{host}"])
+        for link in range(link_count):
+            do_run(args.command, [f"--cfg=cmonkey/time:{(now-prev)/2}",f"--cfg=cmonkey/link:{link}"])
+doit()