tools/simgrid-monkey

   1 #! /usr/bin/env python3
   2
   3 # The goal is to introduce random failures in a simulation, to test simgrid under extreme conditions.
   4 #
   5 # It is made of several components.
   6 #
   7 # * a plugin: cmonkey. Can be used from the command line as follows:
   8 #   * --cfg=plugin:cmonkey --cfg=cmonkey/tell:1
#     Get information about the resource count and the timestamp of each scheduling round.
  10 #   * --cfg=plugin:cmonkey --cfg=cmonkey/time:42 --cfg=cmonkey/host:1
  11 #     Kill the host #1 after 42 seconds (using a kernel::Timer so that no actor gets involved in the killing)
  12 #   * --cfg=plugin:cmonkey --cfg=cmonkey/time:42 --cfg=cmonkey/link:0
  13 #     Kill the link #0 after 42 seconds (using a kernel::Timer)
  14 #
  15 # * a python script: tools/simgrid-monkey (this file)
#   * It takes a regular simgrid simulation as a parameter, uses the cmonkey plugin to get the information about it,
#     and then restarts many runs, with one resource being turned off and back on (turn_off() + turn_on()) in each run.
#   * Each resource gets killed once between each pair of consecutive timestamps, and once on each timestamp.
#   * So the number of simulations is: 1 + (host_count + link_count) * len(timestamps) * 2
#     (the extra one being the initial information-gathering run).
  20 #
# * Test programs, written to resist these extreme conditions:
  22 #   * teshsuite/s4u/monkey-masterworkers: tests synchronous comms and execs (C++ and python)
  23 #   * teshsuite/s4u/monkey-semaphore: tests async semaphores (C++ only)
  24
  25 import multiprocessing as mp
  26 import sys
  27 import os
  28 import argparse
  29 import subprocess
  30 import copy
  31 import re
  32
  33
  34 def get_info(cmd):
  35     cmd_tell = copy.deepcopy(cmd)
  36     cmd_tell.append("--cfg=plugin:cmonkey")
  37     cmd_tell.append("--cfg=cmonkey/tell:1")
  38     cmd_tell.append("--log=root.t:critical")
  39     cmd_tell.append("--log=cmonkey.t:info")
  40     cmd_tell.append("--log=cmonkey.fmt:%m%n")
  41     print(f"Get the initial info from the command.")
  42     # print(f"from ``{' '.join(cmd_tell)}``")
  43     first_run = subprocess.run(cmd_tell, shell=False, cwd=os.getcwd(), stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
  44
  45     if first_run.returncode != 0:
  46         msg = f"Peek run of the command raised an error (retcode: {first_run.returncode})\n"
  47         msg += f"Full command was {' '.join(cmd_tell)}\n"
  48         if first_run.stdout is not None:
  49             msg += str(first_run.stdout, errors='replace')
  50         raise Exception(msg)
  51
  52     host_count=0
  53     link_count=0
  54     timestamps=[]
  55     for line in str(first_run.stdout, errors='replace').split("\n"):
  56         if re.match("^HOST_COUNT=(.*)", line):
  57             m = re.match("^HOST_COUNT=(.*)", line)
  58             host_count = int(m.group(1))
  59         if re.match("^LINK_COUNT=(.*)", line):
  60             m = re.match("^LINK_COUNT=(.*)", line)
  61             link_count = int(m.group(1))
  62         if re.match("^TIMESTAMP=(.*)", line):
  63             m = re.match("^TIMESTAMP=(.*)", line)
  64             timestamps.append(float(m.group(1)))
  65
  66     #print(f"hosts:{host_count} links:{link_count} timestamps:{' '.join(([str(i) for i in timestamps]))}")
  67     return (host_count,  link_count,  timestamps)
  68
  69 parser = argparse.ArgumentParser(description='Run a simgrid simulation, and turn off/on resources at random.')
  70 parser.add_argument('--valgrind', help="Run the simulations in valgrind")
  71 parser.add_argument('command', nargs='*')
  72 args = parser.parse_args()
  73
  74 (host_count,  link_count,  timestamps) = get_info(args.command)
  75 timestamps = sorted([*{*timestamps}]) # kill duplicates
  76 print(f"Monkey informations: hosts:{host_count} links:{link_count} timestamps:{' '.join(([str(i) for i in timestamps]))}")
  77
  78 error_count = 0
  79 test_count = 0
  80 def do_run(cmd, extra_params, test_todo):
  81     global test_count, error_count
  82     test_count = test_count + 1
  83     cmd = copy.deepcopy(cmd)
  84     cmd.append("--cfg=plugin:cmonkey")
  85     for p in extra_params:
  86         cmd.append(p)
  87     print(f"Start {' '.join(cmd)}")
  88     sys.stdout.flush()
  89
  90     run = subprocess.run(cmd, shell=False, cwd=os.getcwd(), stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
  91
  92     out = str(run.stdout, errors='replace')
  93     if run.returncode != 0:
  94         msg = f"ERROR (retcode: {run.returncode}). Output:\n"
  95         msg += out
  96         print(msg)
  97         sys.exit(1)
  98     for line in out.split("\n"):
  99         if re.match("==.*    in use at exit: ", line) and not re.match("==.* in use at exit: 0 bytes in 0 blocks", line):
 100             m = re.match("==.*    in use at exit: (.*)", line)
 101             print(f"LEAK SUMMARY: {m.group(1)} in use at exit")
 102             error_count += 1
 103
 104         if re.match("==.* ERROR SUMMARY: ", line):
 105             m = re.match("==.* ERROR SUMMARY: (.*)", line)
 106             print(f"valgrind summary: {m.group(1)}")
 107             if not re.match("==.* 0 errors from 0 contexts", line):
 108                 error_count += 1
 109     print (f"Test {test_count} out of {test_todo} succeded.\n")
 110
 111
 112 def doit():
 113     prev_time = 0
 114     test_count = 0
 115     test_todo = 2 * len(timestamps) * (host_count + link_count)
 116     for pos in range(len(timestamps)):
 117         now = timestamps[pos]
 118         for host in range(host_count):
 119             do_run(args.command, [f"--cfg=cmonkey/time:{(now-prev_time)/2}", f"--cfg=cmonkey/host:{host}"], test_todo)
 120         for link in range(link_count):
 121             do_run(args.command, [f"--cfg=cmonkey/time:{(now-prev_time)/2}", f"--cfg=cmonkey/link:{link}"], test_todo)
 122         for host in range(host_count):
 123             do_run(args.command, [f"--cfg=cmonkey/time:{now}", f"--cfg=cmonkey/host:{host}"], test_todo)
 124         for link in range(link_count):
 125             do_run(args.command, [f"--cfg=cmonkey/time:{now}", f"--cfg=cmonkey/link:{link}"], test_todo)
 126 doit()
 127
 128 print(f"In total, the monkey found {error_count} errors.")
 129 sys.exit(error_count)