See the tools/simgrid-monkey script and its comments for more info.
include teshsuite/s4u/issue71/platform_bad.xml
include teshsuite/s4u/listen_async/listen_async.cpp
include teshsuite/s4u/listen_async/listen_async.tesh
+include teshsuite/s4u/monkey-masterworkers/monkey-masterworkers.cpp
include teshsuite/s4u/ns3-from-src-to-itself/ns3-from-src-to-itself.cpp
include teshsuite/s4u/ns3-from-src-to-itself/ns3-from-src-to-itself.tesh
include teshsuite/s4u/ns3-simultaneous-send-rcv/ns3-simultaneous-send-rcv.cpp
include src/msg/msg_process.cpp
include src/msg/msg_task.cpp
include src/plugins/ProducerConsumer.cpp
+include src/plugins/chaos_monkey.cpp
include src/plugins/file_system/s4u_FileSystem.cpp
include src/plugins/host_dvfs.cpp
include src/plugins/host_energy.cpp
include tools/cmake/test_prog/prog_tsan.cpp
include tools/doxygen/list_routing_models_examples.sh
include tools/graphicator/CMakeLists.txt
+include tools/simgrid-monkey
include tools/smpi/generate_smpi_defines.pl
include tools/stack-cleaner/README
include tools/stack-cleaner/as
--- /dev/null
+/* Copyright (c) 2022-2022. The SimGrid Team. All rights reserved. */
+
+/* This program is free software; you can redistribute it and/or modify it
+ * under the terms of the license (GNU LGPL) which comes with this package. */
+
+// Chaos Monkey plugin: See the simgrid-monkey script for more information
+
+#include <simgrid/kernel/Timer.hpp>
+#include <simgrid/s4u/Engine.hpp>
+#include <simgrid/s4u/Host.hpp>
+#include <xbt/config.hpp>
+
+#include "src/surf/surf_interface.hpp" // SIMGRID_REGISTER_PLUGIN
+
+// Short alias for the S4U namespace, used throughout this file
+namespace sg4 = simgrid::s4u;
+// When set, the plugin logs HOST_COUNT, LINK_COUNT and one TIMESTAMP line per time advance
+static simgrid::config::Flag<bool> cfg_tell{"cmonkey/tell", "Request the Chaos Monkey to display all timestamps",
+ false};
+// Date at which the selected resource is turned off; a negative value (the default) means that nothing gets killed
+static simgrid::config::Flag<double> cfg_time{"cmonkey/time", "When should the chaos monkey kill a resource", -1.};
+// Rank of the victim in Engine::get_all_links(); a negative value (the default) means that no link is selected
+static simgrid::config::Flag<int> cfg_link{"cmonkey/link", "Which link should be killed (number)", -1};
+// Rank of the victim in Engine::get_all_hosts(); a negative value (the default) means that no host is selected
+static simgrid::config::Flag<int> cfg_host{"cmonkey/host", "Which host should be killed (number)", -1};
+// Forward declaration: the registration macro below needs the address of the init function
+static void sg_chaos_monkey_plugin_init();
+// Makes sure that this plugin can be activated from the command line with ``--cfg=plugin:cmonkey``
+SIMGRID_REGISTER_PLUGIN(cmonkey, "Chaos monkey", &sg_chaos_monkey_plugin_init)
+
+// Logging category of this plugin, declared as a child of the `kernel` category
+XBT_LOG_NEW_DEFAULT_SUBCATEGORY(cmonkey, kernel, "Chaos Monkey plugin");
+
+/** Initialization function of the Chaos Monkey plugin (the one registered with SIMGRID_REGISTER_PLUGIN).
+ *
+ * The real setup is deferred to the on_platform_created signal, so that the cmonkey/... flags are parsed and the
+ * hosts and links exist. At that point, the plugin:
+ *  - makes the process exit with code 2 on deadlock;
+ *  - if cmonkey/tell is set, logs HOST_COUNT, LINK_COUNT and a TIMESTAMP line at each time advance;
+ *  - if cmonkey/time is non-negative, schedules kernel timers that turn the selected host or link off at that date,
+ *    and back on 30 seconds later;
+ *  - logs a final message when the simulation ends.
+ */
+static void sg_chaos_monkey_plugin_init()
+{
+  XBT_INFO("Initializing the chaos monkey");
+
+  // delay the initialization until after the parameters are parsed
+  sg4::Engine::on_platform_created_cb([]() {
+    auto engine = sg4::Engine::get_instance();
+    auto hosts  = engine->get_all_hosts();
+    auto links  = engine->get_all_links();
+
+    // A deadlock must be reported through a non-zero exit code
+    sg4::Engine::on_deadlock_cb([]() { exit(2); });
+
+    if (cfg_tell) {
+      XBT_INFO("HOST_COUNT=%zu", hosts.size());
+      XBT_INFO("LINK_COUNT=%zu", links.size());
+      sg4::Engine::on_time_advance_cb([engine](double /* delta*/) { XBT_INFO("TIMESTAMP=%lf", engine->get_clock()); });
+    }
+
+    if (cfg_time >= 0) {
+      int host = cfg_host;
+      int link = cfg_link;
+      xbt_assert(host >= 0 || link >= 0,
+                 "If a kill time is given, you must also specify a resource to kill (either a link or an host)");
+      xbt_assert(host < 0 || link < 0, "Cannot specify both a link and an host to kill");
+      if (host >= 0) {
+        // The rank comes from the command line: refuse to index past the end of the host list
+        xbt_assert(static_cast<size_t>(host) < hosts.size(), "Cannot kill host #%d: the platform only has %zu host(s)",
+                   host, hosts.size());
+        auto* h = hosts[host];
+        simgrid::kernel::timer::Timer::set(cfg_time, [h]() {
+          XBT_INFO("Kill host %s", h->get_cname());
+          h->turn_off();
+        });
+        simgrid::kernel::timer::Timer::set(cfg_time + 30, [h]() {
+          XBT_INFO("Restart host %s", h->get_cname());
+          h->turn_on();
+        });
+      }
+      if (link >= 0) {
+        // Same sanity check for the link rank
+        xbt_assert(static_cast<size_t>(link) < links.size(), "Cannot kill link #%d: the platform only has %zu link(s)",
+                   link, links.size());
+        auto* l = links[link];
+        simgrid::kernel::timer::Timer::set(cfg_time, [l]() {
+          XBT_INFO("Kill link %s", l->get_cname());
+          l->turn_off();
+        });
+        simgrid::kernel::timer::Timer::set(cfg_time + 30, [l]() {
+          XBT_INFO("Restart link %s", l->get_cname());
+          l->turn_on();
+        });
+      }
+    }
+
+    sg4::Engine::on_simulation_end_cb([]() { XBT_INFO("Chaos Monkey done!"); });
+  });
+}
activity-lifecycle
comm-get-sender comm-pt2pt wait-all-for wait-any-for
cloud-interrupt-migration cloud-two-execs
+ monkey-masterworkers
concurrent_rw
dag-incomplete-simulation dependencies
host-on-off host-on-off-actors host-on-off-recv host-multicore-speed-file io-set-bw
--- /dev/null
+/* Copyright (c) 2007-2022. The SimGrid Team. All rights reserved. */
+
+/* This program is free software; you can redistribute it and/or modify it
+ * under the terms of the license (GNU LGPL) which comes with this package. */
+
+/* This is a version of the masterworkers that (hopefully) survives the chaos monkey.
+ * It tests synchronous send/receive as well as synchronous computations.
+ *
+ * It is not written to be pleasant to read, but instead to resist the aggressions of the monkey:
+ * - Workers keep going until after a global variable `todo` reaches 0.
+ * - The master is a daemon that endlessly sends tasks
+ * (simgrid simulations stop as soon as all non-daemon actors are done).
+ * - The platform is created programmatically to remove path issues and control the problem size.
+ *
+ * Command-line configuration items:
+ * - host-count: how many actors to start (including the master)
+ * - task-count: initial value of the `todo` global
+ * - deadline: time at which the simulation is known to be failed (to detect infinite loops).
+ *
+ * See the simgrid-monkey script for more information.
+ */
+
+#include <simgrid/s4u.hpp>
+#include <xbt/config.hpp>
+#include <xbt/string.hpp>
+
+namespace sg4 = simgrid::s4u;
+
+XBT_LOG_NEW_DEFAULT_CATEGORY(s4u_test, "Messages specific for this s4u example");
+
+// Number of hosts to create: the first one runs the master, each other one runs a worker
+static simgrid::config::Flag<int> cfg_host_count{"host-count", "Host count (master on one, workers on the others)", 2};
+// Simulated date after which master and workers assert-fail, to detect simulations that never terminate
+static simgrid::config::Flag<double> cfg_deadline{"deadline", "When to fail the simulation (infinite loop detection)",
+ 120};
+// Initial value of the `todo` counter below
+static simgrid::config::Flag<int> cfg_task_count{"task-count", "Amount of tasks that must be executed to succeed", 1};
+
+// Set from cfg_task_count in main(), decremented by the workers after each completed execution
+int todo; // remaining amount of tasks to execute, a global variable
+
+/** Master actor: a daemon that keeps pushing tasks to the mailbox named "mailbox", forever.
+ *
+ * @param comp_size value carried by each task (the workers use it as the amount of work to execute)
+ * @param comm_size simulated size of each message
+ *
+ * Each send is given a timeout of 10. Timeouts and network failures are logged and the master simply tries again
+ * with a fresh payload. The function asserts that the simulated clock stays below the `deadline` flag.
+ */
+static void master(double comp_size, long comm_size)
+{
+  XBT_INFO("Master booting");
+  sg4::Actor::self()->daemonize();
+
+  sg4::Mailbox* mbox = sg4::Mailbox::by_name("mailbox");
+  for (;;) { // Never ends on its own: this actor is a daemon
+    xbt_assert(sg4::Engine::get_clock() < cfg_deadline,
+               "Failed to run all tasks in less than %d seconds. Is this an infinite loop?",
+               static_cast<int>(cfg_deadline));
+
+    // The payload is released here only when the send fails
+    double* task = new double(comp_size);
+    try {
+      XBT_INFO("Try to send a message");
+      mbox->put(task, comm_size, 10.0);
+    } catch (const simgrid::TimeoutException&) {
+      delete task;
+      XBT_INFO("Timeouted while sending a task");
+    } catch (const simgrid::NetworkFailureException&) {
+      delete task;
+      XBT_INFO("Network error while sending a task");
+    }
+  }
+  THROW_IMPOSSIBLE;
+}
+
+/** Worker actor: fetches tasks from the mailbox named "mailbox" and executes them while `todo` is positive.
+ *
+ * @param id rank given by main(); it is not used in the body
+ *
+ * Each reception is given a timeout of 10. A negative payload makes the worker stop; any other value is executed
+ * and `todo` is then decremented. Timeouts and network failures are logged and the loop goes on. The function
+ * asserts that the simulated clock stays below the `deadline` flag.
+ */
+static void worker(int id)
+{
+  XBT_INFO("Worker booting");
+  auto* mbox = sg4::Mailbox::by_name("mailbox");
+  while (todo > 0) {
+    xbt_assert(sg4::Engine::get_clock() < cfg_deadline,
+               "Failed to run all tasks in less than %d seconds. Is this an infinite loop?",
+               static_cast<int>(cfg_deadline));
+    try {
+      XBT_INFO("Waiting a message on %s", mbox->get_cname());
+      auto task = mbox->get_unique<double>(10);
+      xbt_assert(task != nullptr, "mailbox->get() failed");
+      const double flops = *task;
+      if (flops < 0) { // A negative amount of work is the termination signal
+        XBT_INFO("I'm done. See you!");
+        break;
+      }
+      // Regular task: run it, and only then account for it
+      XBT_INFO("Start execution...");
+      sg4::this_actor::execute(flops);
+      XBT_INFO("Execution complete.");
+      --todo;
+    } catch (const simgrid::TimeoutException&) {
+      XBT_INFO("Timeouted while getting a task.");
+    } catch (const simgrid::NetworkFailureException&) {
+      XBT_INFO("Mmh. Something went wrong. Nevermind. Let's keep going!");
+    }
+  }
+}
+
+/** Builds the platform programmatically, deploys the actors and runs the simulation.
+ *
+ * `host-count` hosts are created in a full zone named "root". The first one hosts the master; every other host is
+ * connected to the first one through its own link and runs one worker. All actors are set to auto-restart.
+ *
+ * @return 0 once the simulation is over. Invalid flag values (no host, no task) abort through xbt_assert.
+ */
+int main(int argc, char* argv[])
+{
+  sg4::Engine e(&argc, argv);
+
+  const int host_count = cfg_host_count;
+  XBT_INFO("host count: %d ", host_count);
+  // Without a first host, the master host pointer below would be dereferenced while still null
+  xbt_assert(host_count >= 1, "Please give at least one host (got %d)", host_count);
+
+  auto* rootzone = sg4::create_full_zone("root");
+  sg4::Host* master_host = nullptr; // First host created, where the master will stay
+  std::vector<sg4::Host*> worker_hosts;
+  worker_hosts.reserve(host_count - 1);
+  for (int i = 0; i < host_count; i++) {
+    auto hostname = std::string("lilibeth ") + std::to_string(i);
+    auto* host    = rootzone->create_host(hostname, 1e15);
+    if (i == 0) {
+      master_host = host;
+    } else {
+      // Each worker host gets a dedicated link (named after the host) toward the master host
+      sg4::LinkInRoute link(rootzone->create_link(hostname, "1MBps")->set_latency("24us")->seal());
+      rootzone->add_route(master_host->get_netpoint(), host->get_netpoint(), nullptr, nullptr, {link}, true);
+      worker_hosts.push_back(host);
+    }
+  }
+  rootzone->seal();
+  sg4::Engine::get_instance()->on_platform_created(); // FIXME this should not be necessary
+
+  sg4::Actor::create("master", master_host, master, 50000000, 1000000)->set_auto_restart(true);
+  int id = 0;
+  for (auto* h : worker_hosts)
+    sg4::Actor::create("worker", h, worker, id++)->set_auto_restart(true);
+
+  todo = cfg_task_count;
+  xbt_assert(todo > 0, "Please give more than %d tasks to run", todo);
+
+  e.run();
+
+  XBT_INFO("WE SURVIVED!");
+  return 0;
+}
set(PLUGINS_SRC
src/plugins/ProducerConsumer.cpp
+ src/plugins/chaos_monkey.cpp
src/plugins/host_dvfs.cpp
src/plugins/host_energy.cpp
src/plugins/link_energy.cpp
tools/cmake/test_prog/prog_stacksetup.c
tools/cmake/test_prog/prog_tsan.cpp
tools/cmake/cross-mingw.cmake
+ tools/simgrid-monkey
tools/smpi/generate_smpi_defines.pl
tools/stack-cleaner/as
tools/stack-cleaner/cc
--- /dev/null
+#! /usr/bin/python3
+
+# The goal is to introduce random failures in a simulation, to test simgrid under extreme conditions.
+#
+# It is made of several components.
+#
+# * a plugin: cmonkey. Can be used from the command line as follows:
+# * --cfg=plugin:cmonkey --cfg=cmonkey/tell:1
+# Get information about the resource count and the timestamps of each scheduling round.
+# * --cfg=plugin:cmonkey --cfg=cmonkey/time:42 --cfg=cmonkey/host:1
+# Kill the host #1 after 42 seconds (using a kernel::Timer so that no actor gets involved in the killing)
+# * --cfg=plugin:cmonkey --cfg=cmonkey/time:42 --cfg=cmonkey/link:0
+# Kill the link #0 after 42 seconds (using a kernel::Timer)
+#
+# * a python script: tools/simgrid-monkey (this file)
+# * It takes a regular simgrid simulation as a parameter, uses the cmonkey plugin to get the information about it,
+# and then launches many runs, with one resource being turn_off() + turn_on() in each run.
+# * Each resource gets killed between consecutive timestamps, and on each timestamp.
+# * So the amount of simulations is: 1 + (host_c+link_c) * timestamps * 2
+#
+# * Test program, written to resist these extreme conditions:
+# * teshsuite/s4u/monkey-masterworkers: tests synchronous comms and execs
+
+import multiprocessing as mp
+import sys
+import os
+import argparse
+import subprocess
+import copy
+import re
+
+
+def get_info(cmd):
+ cmd_tell = copy.deepcopy(cmd)
+ cmd_tell.append("--cfg=plugin:cmonkey")
+ cmd_tell.append("--cfg=cmonkey/tell:1")
+ cmd_tell.append("--log=root.t:critical")
+ cmd_tell.append("--log=cmonkey.t:info")
+ cmd_tell.append("--log=cmonkey.fmt:%m%n")
+ print(f"Get the initial info from the command ``{' '.join(cmd_tell)}``")
+ first_run = subprocess.run(cmd_tell, shell=False, cwd=os.getcwd(), stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
+
+ if first_run.returncode != 0:
+ msg = f"Peek run of the command raised an error (retcode: {first_run.returncode})\n"
+ msg += f"Full command was {' '.join(cmd_tell)}\n"
+ if first_run.stdout is not None:
+ msg += str(first_run.stdout, errors='replace')
+ raise Exception(msg)
+
+ host_count=0
+ link_count=0
+ timestamps=[]
+ for line in str(first_run.stdout, errors='replace').split("\n"):
+ if re.match("^HOST_COUNT=(.*)", line):
+ m = re.match("^HOST_COUNT=(.*)", line)
+ host_count = int(m.group(1))
+ if re.match("^LINK_COUNT=(.*)", line):
+ m = re.match("^LINK_COUNT=(.*)", line)
+ link_count = int(m.group(1))
+ if re.match("^TIMESTAMP=(.*)", line):
+ m = re.match("^TIMESTAMP=(.*)", line)
+ timestamps.append(float(m.group(1)))
+
+ #print(f"hosts:{host_count} links:{link_count} timestamps:{' '.join(([str(i) for i in timestamps]))}")
+ return (host_count, link_count, timestamps)
+
+parser = argparse.ArgumentParser(description='Run a simgrid simulation, and turn off/on resources at random.')
+parser.add_argument('--valgrind', help="Run the simulations in valgrind")
+parser.add_argument('command', nargs='*')
+args = parser.parse_args()
+
+(host_count, link_count, timestamps) = get_info(args.command)
+print(f"hosts:{host_count} links:{link_count} timestamps:{' '.join(([str(i) for i in timestamps]))}")
+
+def do_run(cmd, extra_params):
+ cmd = copy.deepcopy(cmd)
+ cmd.append("--cfg=plugin:cmonkey")
+ for p in extra_params:
+ cmd.append(p)
+ print(f"\n#################################################################################\nStart {' '.join(cmd)}")
+ run = subprocess.run(cmd, shell=False, cwd=os.getcwd(), stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
+
+ if run.returncode != 0:
+ msg = f"ERROR (retcode: {run.returncode}). Output:\n"
+ msg += str(run.stdout, errors='replace')
+ print(msg)
+ os.exit(1)
+ print ("Success.")
+
+def doit():
+ prev = 0
+ for pos in range(len(timestamps)):
+ now = timestamps[pos]
+ for host in range(host_count):
+ do_run(args.command, [f"--cfg=cmonkey/time:{(now-prev)/2}",f"--cfg=cmonkey/host:{host}"])
+ for link in range(link_count):
+ do_run(args.command, [f"--cfg=cmonkey/time:{(now-prev)/2}",f"--cfg=cmonkey/link:{link}"])
+doit()