From 016d020461cd4d21ea5a617d8f9099d47d7c880c Mon Sep 17 00:00:00 2001 From: Martin Quinson Date: Sun, 27 Feb 2022 11:04:54 +0100 Subject: [PATCH] Introduce the SimGrid Chaos Monkey See the tools/simgrid-monkey script and its comments for more info. --- MANIFEST.in | 3 + src/plugins/chaos_monkey.cpp | 77 +++++++++++ teshsuite/s4u/CMakeLists.txt | 1 + .../monkey-masterworkers.cpp | 128 ++++++++++++++++++ tools/cmake/DefinePackages.cmake | 2 + tools/simgrid-monkey | 98 ++++++++++++++ 6 files changed, 309 insertions(+) create mode 100644 src/plugins/chaos_monkey.cpp create mode 100644 teshsuite/s4u/monkey-masterworkers/monkey-masterworkers.cpp create mode 100755 tools/simgrid-monkey diff --git a/MANIFEST.in b/MANIFEST.in index eac44c5f74..60a81f4616 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -808,6 +808,7 @@ include teshsuite/s4u/issue71/issue71.tesh include teshsuite/s4u/issue71/platform_bad.xml include teshsuite/s4u/listen_async/listen_async.cpp include teshsuite/s4u/listen_async/listen_async.tesh +include teshsuite/s4u/monkey-masterworkers/monkey-masterworkers.cpp include teshsuite/s4u/ns3-from-src-to-itself/ns3-from-src-to-itself.cpp include teshsuite/s4u/ns3-from-src-to-itself/ns3-from-src-to-itself.tesh include teshsuite/s4u/ns3-simultaneous-send-rcv/ns3-simultaneous-send-rcv.cpp @@ -2432,6 +2433,7 @@ include src/msg/msg_private.hpp include src/msg/msg_process.cpp include src/msg/msg_task.cpp include src/plugins/ProducerConsumer.cpp +include src/plugins/chaos_monkey.cpp include src/plugins/file_system/s4u_FileSystem.cpp include src/plugins/host_dvfs.cpp include src/plugins/host_energy.cpp @@ -2824,6 +2826,7 @@ include tools/cmake/test_prog/prog_stacksetup.c include tools/cmake/test_prog/prog_tsan.cpp include tools/doxygen/list_routing_models_examples.sh include tools/graphicator/CMakeLists.txt +include tools/simgrid-monkey include tools/smpi/generate_smpi_defines.pl include tools/stack-cleaner/README include tools/stack-cleaner/as diff --git 
a/src/plugins/chaos_monkey.cpp b/src/plugins/chaos_monkey.cpp
new file mode 100644
index 0000000000..32c9ba7edd
--- /dev/null
+++ b/src/plugins/chaos_monkey.cpp
@@ -0,0 +1,86 @@
+/* Copyright (c) 2022-2022. The SimGrid Team. All rights reserved. */
+
+/* This program is free software; you can redistribute it and/or modify it
+ * under the terms of the license (GNU LGPL) which comes with this package. */
+
+// Chaos Monkey plugin: See the simgrid-monkey script for more information
+
+#include
+#include
+#include
+#include
+
+#include "src/surf/surf_interface.hpp" // SIMGRID_REGISTER_PLUGIN
+
+namespace sg4 = simgrid::s4u;
+static simgrid::config::Flag<bool> cfg_tell{"cmonkey/tell", "Request the Chaos Monkey to display all timestamps",
+                                            false};
+static simgrid::config::Flag<double> cfg_time{"cmonkey/time", "When should the chaos monkey kill a resource", -1.};
+static simgrid::config::Flag<int> cfg_link{"cmonkey/link", "Which link should be killed (number)", -1};
+static simgrid::config::Flag<int> cfg_host{"cmonkey/host", "Which host should be killed (number)", -1};
+static void sg_chaos_monkey_plugin_init();
+// Makes sure that this plugin can be activated from the command line with ``--cfg=plugin:cmonkey``
+SIMGRID_REGISTER_PLUGIN(cmonkey, "Chaos monkey", &sg_chaos_monkey_plugin_init)
+
+XBT_LOG_NEW_DEFAULT_SUBCATEGORY(cmonkey, kernel, "Chaos Monkey plugin");
+
+/* Plugin entry point. Once the platform is created, it optionally prints the resource counts and every timestamp
+ * (cmonkey/tell), and schedules the kill then the restart of the host or link selected on the command line. */
+static void sg_chaos_monkey_plugin_init()
+{
+  XBT_INFO("Initializing the chaos monkey");
+
+  // delay the initialization until after the parameters are parsed
+  sg4::Engine::on_platform_created_cb([]() {
+    auto engine = sg4::Engine::get_instance();
+    auto hosts = engine->get_all_hosts();
+    auto links = engine->get_all_links();
+
+    // A deadlock must make the run fail: leave with a non-zero exit status
+    sg4::Engine::on_deadlock_cb([]() { exit(2); });
+
+    if (cfg_tell) {
+      XBT_INFO("HOST_COUNT=%zu", hosts.size());
+      XBT_INFO("LINK_COUNT=%zu", links.size());
+      sg4::Engine::on_time_advance_cb([engine](double /* delta*/) { XBT_INFO("TIMESTAMP=%lf", engine->get_clock()); });
+    }
+
+    if (cfg_time >= 0) {
+      int host = cfg_host;
+      int link = cfg_link;
+      xbt_assert(host >= 0 || link >= 0,
+                 "If a kill time is given, you must also specify a resource to kill (either a link or an host)");
+      xbt_assert(host < 0 || link < 0, "Cannot specify both a link and an host to kill");
+      if (host >= 0) {
+        // The rank comes straight from the command line: reject it before indexing the vector
+        xbt_assert(host < static_cast<int>(hosts.size()), "Cannot kill host #%d: the platform only has %zu hosts", host,
+                   hosts.size());
+        auto* h = hosts[host];
+        simgrid::kernel::timer::Timer::set(cfg_time, [h]() {
+          XBT_INFO("Kill host %s", h->get_cname());
+          h->turn_off();
+        });
+        simgrid::kernel::timer::Timer::set(cfg_time + 30, [h]() {
+          XBT_INFO("Restart host %s", h->get_cname());
+          h->turn_on();
+        });
+      }
+      if (link >= 0) {
+        // Same sanity check as for hosts
+        xbt_assert(link < static_cast<int>(links.size()), "Cannot kill link #%d: the platform only has %zu links", link,
+                   links.size());
+        auto* l = links[link];
+        simgrid::kernel::timer::Timer::set(cfg_time, [l]() {
+          XBT_INFO("Kill link %s", l->get_cname());
+          l->turn_off();
+        });
+        simgrid::kernel::timer::Timer::set(cfg_time + 30, [l]() {
+          XBT_INFO("Restart link %s", l->get_cname());
+          l->turn_on();
+        });
+      }
+    }
+
+    sg4::Engine::on_simulation_end_cb([]() { XBT_INFO("Chaos Monkey done!"); });
+  });
+}
diff --git a/teshsuite/s4u/CMakeLists.txt b/teshsuite/s4u/CMakeLists.txt
index bf142a31e2..447c066e9b 100644
--- a/teshsuite/s4u/CMakeLists.txt
+++ b/teshsuite/s4u/CMakeLists.txt
@@ -8,6 +8,7 @@ foreach(x actor actor-autorestart actor-suspend activity-lifecycle comm-get-sender comm-pt2pt wait-all-for wait-any-for cloud-interrupt-migration cloud-two-execs + monkey-masterworkers concurrent_rw dag-incomplete-simulation dependencies host-on-off host-on-off-actors host-on-off-recv host-multicore-speed-file io-set-bw
diff --git a/teshsuite/s4u/monkey-masterworkers/monkey-masterworkers.cpp b/teshsuite/s4u/monkey-masterworkers/monkey-masterworkers.cpp
new file mode 100644
index 0000000000..74717056aa
--- /dev/null
+++ b/teshsuite/s4u/monkey-masterworkers/monkey-masterworkers.cpp
@@ -0,0 +1,128 @@
+/* Copyright (c) 2007-2022. The SimGrid Team. All rights reserved. */
+
+/* This program is free software; you can redistribute it and/or modify it
+ * under the terms of the license (GNU LGPL) which comes with this package.
*/
+
+/* This is a version of the masterworkers that (hopefully) survives to the chaos monkey.
+ * It tests synchronous send/receive as well as synchronous computations.
+ *
+ * It is not written to be pleasant to read, but instead to resist the aggressions of the monkey:
+ * - Workers keep going until after a global variable `todo` reaches 0.
+ * - The master is a daemon that just sends infinitely tasks
+ *   (simgrid simulations stop as soon as all non-daemon actors are done).
+ * - The platform is created programmatically to remove path issues and control the problem size.
+ *
+ * Command-line configuration items:
+ * - host-count: how many actors to start (including the master)
+ * - task-count: initial value of the `todo` global
+ * - deadline: time at which the simulation is known to be failed (to detect infinite loops).
+ *
+ * See the simgrid-monkey script for more information.
+ */
+
+#include
+#include
+#include
+
+namespace sg4 = simgrid::s4u;
+
+XBT_LOG_NEW_DEFAULT_CATEGORY(s4u_test, "Messages specific for this s4u example");
+
+static simgrid::config::Flag<int> cfg_host_count{"host-count", "Host count (master on one, workers on the others)", 2};
+static simgrid::config::Flag<double> cfg_deadline{"deadline", "When to fail the simulation (infinite loop detection)",
+                                                  120};
+static simgrid::config::Flag<int> cfg_task_count{"task-count", "Amount of tasks that must be executed to succeed", 1};
+
+int todo; // remaining amount of tasks to execute, a global variable
+
+static void master(double comp_size, long comm_size) // daemon: keeps sending comp_size in messages of size comm_size
+{
+  XBT_INFO("Master booting");
+  sg4::Actor::self()->daemonize();
+
+  auto mailbox = sg4::Mailbox::by_name("mailbox");
+  while (true) { // This is a daemon
+    xbt_assert(sg4::Engine::get_clock() < cfg_deadline,
+               "Failed to run all tasks in less than %d seconds. Is this an infinite loop?", (int)cfg_deadline);
+
+    auto* payload = new double(comp_size);
+    try {
+      XBT_INFO("Try to send a message");
+      mailbox->put(payload, comm_size, 10.0);
+    } catch (const simgrid::TimeoutException&) {
+      delete payload; // the send failed, so the payload is still ours to free
+      XBT_INFO("Timeouted while sending a task");
+    } catch (const simgrid::NetworkFailureException&) {
+      delete payload;
+      XBT_INFO("Network error while sending a task");
+    }
+  }
+  THROW_IMPOSSIBLE;
+}
+
+static void worker(int id) // executes the received tasks until the global `todo` reaches 0 (id is not used)
+{
+  XBT_INFO("Worker booting");
+  sg4::Mailbox* mailbox = sg4::Mailbox::by_name("mailbox");
+  while (todo > 0) {
+    xbt_assert(sg4::Engine::get_clock() < cfg_deadline,
+               "Failed to run all tasks in less than %d seconds. Is this an infinite loop?", (int)cfg_deadline);
+    try {
+      XBT_INFO("Waiting a message on %s", mailbox->get_cname());
+      auto payload = mailbox->get_unique<double>(10);
+      xbt_assert(payload != nullptr, "mailbox->get() failed");
+      double comp_size = *payload;
+      if (comp_size < 0) { /* - Exit when -1.0 is received */
+        XBT_INFO("I'm done. See you!");
+        break;
+      }
+      /*  - Otherwise, process the task */
+      XBT_INFO("Start execution...");
+      sg4::this_actor::execute(comp_size);
+      XBT_INFO("Execution complete.");
+      todo--;
+    } catch (const simgrid::TimeoutException&) {
+      XBT_INFO("Timeouted while getting a task.");
+
+    } catch (const simgrid::NetworkFailureException&) {
+      XBT_INFO("Mmh. Something went wrong. Nevermind. Let's keep going!");
+    }
+  }
+}
+
+int main(int argc, char* argv[])
+{
+  sg4::Engine e(&argc, argv);
+
+  XBT_INFO("host count: %d ", (int)cfg_host_count);
+  xbt_assert((int)cfg_host_count >= 1, "At least one host is needed: the master must run somewhere");
+  auto* rootzone = sg4::create_full_zone("root");
+  sg4::Host* main = nullptr; // First host created, where the master will stay
+  std::vector<sg4::Host*> worker_hosts;
+  for (int i = 0; i < cfg_host_count; i++) {
+    auto hostname = std::string("lilibeth ") + std::to_string(i);
+    auto* host = rootzone->create_host(hostname, 1e15);
+    if (i == 0) {
+      main = host;
+    } else {
+      sg4::LinkInRoute link(rootzone->create_link(hostname, "1MBps")->set_latency("24us")->seal());
+      rootzone->add_route(main->get_netpoint(), host->get_netpoint(), nullptr, nullptr, {link}, true);
+      worker_hosts.push_back(host);
+    }
+  }
+  rootzone->seal();
+  sg4::Engine::get_instance()->on_platform_created(); // FIXME this should not be necessary
+
+  sg4::Actor::create("master", main, master, 50000000, 1000000)->set_auto_restart(true);
+  int id = 0;
+  for (auto* h : worker_hosts)
+    sg4::Actor::create("worker", h, worker, id++)->set_auto_restart(true);
+
+  todo = cfg_task_count;
+  xbt_assert(todo > 0, "Please give more than %d tasks to run", todo);
+
+  e.run();
+
+  XBT_INFO("WE SURVIVED!");
+  return 0;
+}
diff --git a/tools/cmake/DefinePackages.cmake b/tools/cmake/DefinePackages.cmake
index 9469044bcd..87ca7accab 100644
--- a/tools/cmake/DefinePackages.cmake
+++ b/tools/cmake/DefinePackages.cmake
@@ -364,6 +364,7 @@ set(SURF_SRC set(PLUGINS_SRC src/plugins/ProducerConsumer.cpp + src/plugins/chaos_monkey.cpp src/plugins/host_dvfs.cpp src/plugins/host_energy.cpp src/plugins/link_energy.cpp
@@ -1107,6 +1108,7 @@ set(CMAKE_SOURCE_FILES tools/cmake/test_prog/prog_stacksetup.c tools/cmake/test_prog/prog_tsan.cpp tools/cmake/cross-mingw.cmake + tools/simgrid-monkey tools/smpi/generate_smpi_defines.pl tools/stack-cleaner/as tools/stack-cleaner/cc
diff --git a/tools/simgrid-monkey b/tools/simgrid-monkey
new file mode 100755
index 0000000000..04567c2382
--- /dev/null
+++ 
b/tools/simgrid-monkey
@@ -0,0 +1,106 @@
+#! /usr/bin/python3
+
+# The goal is to introduce random failures in a simulation, to test simgrid under extreme conditions.
+#
+# It is made of several components.
+#
+# * a plugin: cmonkey. Can be used from the command line as follows:
+#   * --cfg=plugin:cmonkey --cfg=cmonkey/tell:1
+#     Get information about the resource count and the timestamps of each scheduling rounds.
+#   * --cfg=plugin:cmonkey --cfg=cmonkey/time:42 --cfg=cmonkey/host:1
+#     Kill the host #1 after 42 seconds (using a kernel::Timer so that no actor gets involved in the killing)
+#   * --cfg=plugin:cmonkey --cfg=cmonkey/time:42 --cfg=cmonkey/link:0
+#     Kill the link #0 after 42 seconds (using a kernel::Timer)
+#
+# * a python script: tools/simgrid-monkey (this file)
+#   * It takes a regular simgrid simulation as a parameter, use the cmonkey plugin to get the information about it,
+#     and then restart many runs, with one resource being turn_off() + turn_on() in each run.
+#   * Each resource gets killed between consecutive timestamps, and on each timestamp.
+#   * So the amount of simulations is: 1 + (host_c+link_c) * timestamps * 2
+#
+# * Test program, written to resist these extreme conditions:
+#   * teshsuite/s4u/monkey-masterworkers: tests synchronous comms and execs
+
+import multiprocessing as mp
+import sys
+import os
+import argparse
+import subprocess
+import copy
+import re
+
+
+def get_info(cmd):
+    """Run cmd once with cmonkey/tell enabled and return (host_count, link_count, timestamps).
+
+    Raises an Exception carrying the captured output if this first run exits with a non-zero status."""
+    cmd_tell = copy.deepcopy(cmd)
+    cmd_tell.append("--cfg=plugin:cmonkey")
+    cmd_tell.append("--cfg=cmonkey/tell:1")
+    cmd_tell.append("--log=root.t:critical")
+    cmd_tell.append("--log=cmonkey.t:info")
+    cmd_tell.append("--log=cmonkey.fmt:%m%n")
+    print(f"Get the initial info from the command ``{' '.join(cmd_tell)}``")
+    first_run = subprocess.run(cmd_tell, shell=False, cwd=os.getcwd(), stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
+
+    if first_run.returncode != 0:
+        msg = f"Peek run of the command raised an error (retcode: {first_run.returncode})\n"
+        msg += f"Full command was {' '.join(cmd_tell)}\n"
+        if first_run.stdout is not None:
+            msg += str(first_run.stdout, errors='replace')
+        raise Exception(msg)
+
+    host_count=0
+    link_count=0
+    timestamps=[]
+    for line in str(first_run.stdout, errors='replace').split("\n"):
+        if re.match("^HOST_COUNT=(.*)", line):
+            m = re.match("^HOST_COUNT=(.*)", line)
+            host_count = int(m.group(1))
+        if re.match("^LINK_COUNT=(.*)", line):
+            m = re.match("^LINK_COUNT=(.*)", line)
+            link_count = int(m.group(1))
+        if re.match("^TIMESTAMP=(.*)", line):
+            m = re.match("^TIMESTAMP=(.*)", line)
+            timestamps.append(float(m.group(1)))
+
+    #print(f"hosts:{host_count} links:{link_count} timestamps:{' '.join(([str(i) for i in timestamps]))}")
+    return (host_count, link_count, timestamps)
+
+parser = argparse.ArgumentParser(description='Run a simgrid simulation, and turn off/on resources at random.')
+parser.add_argument('--valgrind', help="Run the simulations in valgrind")
+parser.add_argument('command', nargs='*')
+args = parser.parse_args()
+
+(host_count, link_count, timestamps) = get_info(args.command)
+print(f"hosts:{host_count} links:{link_count} timestamps:{' '.join(([str(i) for i in timestamps]))}")
+
+def do_run(cmd, extra_params):
+    """Run cmd with the cmonkey plugin plus extra_params; on failure, print the output and exit with status 1."""
+    cmd = copy.deepcopy(cmd)
+    cmd.append("--cfg=plugin:cmonkey")
+    for p in extra_params:
+        cmd.append(p)
+    print(f"\n#################################################################################\nStart {' '.join(cmd)}")
+    run = subprocess.run(cmd, shell=False, cwd=os.getcwd(), stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
+
+    if run.returncode != 0:
+        msg = f"ERROR (retcode: {run.returncode}). Output:\n"
+        msg += str(run.stdout, errors='replace')
+        print(msg)
+        sys.exit(1)  # sys.exit, not os.exit: the os module has no exit() function
+    print ("Success.")
+
+def doit():
+    """Kill every host, then every link, one per run: once in the middle of each interval between two
+    consecutive timestamps (the first interval starts at 0), and once on each timestamp."""
+    prev = 0
+    for now in timestamps:
+        # Two kill dates per interval: its middle, and the timestamp that closes it
+        for kill_time in [prev + (now - prev) / 2, now]:
+            for host in range(host_count):
+                do_run(args.command, [f"--cfg=cmonkey/time:{kill_time}", f"--cfg=cmonkey/host:{host}"])
+            for link in range(link_count):
+                do_run(args.command, [f"--cfg=cmonkey/time:{kill_time}", f"--cfg=cmonkey/link:{link}"])
+        prev = now
+doit()
-- 
2.20.1