tools/simgrid-monkey

   1 #! /usr/bin/env python3
   2
   3 # The goal is to introduce random failures in a simulation, to test simgrid under extreme conditions.
   4 #
   5 # It is made of several components.
   6 #
   7 # * a plugin: cmonkey. Can be used from the command line as follows:
   8 #   * --cfg=plugin:cmonkey --cfg=cmonkey/tell:1
#     Get information about the resource count and the timestamp of each scheduling round.
  10 #   * --cfg=plugin:cmonkey --cfg=cmonkey/time:42 --cfg=cmonkey/host:1
  11 #     Kill the host #1 after 42 seconds (using a kernel::Timer so that no actor gets involved in the killing)
  12 #   * --cfg=plugin:cmonkey --cfg=cmonkey/time:42 --cfg=cmonkey/link:0
  13 #     Kill the link #0 after 42 seconds (using a kernel::Timer)
  14 #
  15 # * a python script: tools/simgrid-monkey (this file)
#   * It takes a regular simgrid simulation as a parameter, uses the cmonkey plugin to get information about it,
#     and then runs it again many times, with one resource being turned off and back on (turn_off() + turn_on())
#     in each run.
#   * Each resource gets killed between consecutive timestamps, and on each timestamp.
#   * So the number of simulations is: 1 + (host_c+link_c) * timestamps * 2
  20 #
  21 # * Test program, written to resist these extreme conditions:
  22 #   * teshsuite/s4u/monkey-masterworkers: tests synchronous comms and execs
  23
  24 import multiprocessing as mp
  25 import sys
  26 import os
  27 import argparse
  28 import subprocess
  29 import copy
  30 import re
  31
  32
  33 def get_info(cmd):
  34     cmd_tell = copy.deepcopy(cmd)
  35     cmd_tell.append("--cfg=plugin:cmonkey")
  36     cmd_tell.append("--cfg=cmonkey/tell:1")
  37     cmd_tell.append("--log=root.t:critical")
  38     cmd_tell.append("--log=cmonkey.t:info")
  39     cmd_tell.append("--log=cmonkey.fmt:%m%n")
  40     print(f"Get the initial info from the command.")
  41     # print(f"from ``{' '.join(cmd_tell)}``")
  42     first_run = subprocess.run(cmd_tell, shell=False, cwd=os.getcwd(), stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
  43
  44     if first_run.returncode != 0:
  45         msg = f"Peek run of the command raised an error (retcode: {first_run.returncode})\n"
  46         msg += f"Full command was {' '.join(cmd_tell)}\n"
  47         if first_run.stdout is not None:
  48             msg += str(first_run.stdout, errors='replace')
  49         raise Exception(msg)
  50
  51     host_count=0
  52     link_count=0
  53     timestamps=[]
  54     for line in str(first_run.stdout, errors='replace').split("\n"):
  55         if re.match("^HOST_COUNT=(.*)", line):
  56             m = re.match("^HOST_COUNT=(.*)", line)
  57             host_count = int(m.group(1))
  58         if re.match("^LINK_COUNT=(.*)", line):
  59             m = re.match("^LINK_COUNT=(.*)", line)
  60             link_count = int(m.group(1))
  61         if re.match("^TIMESTAMP=(.*)", line):
  62             m = re.match("^TIMESTAMP=(.*)", line)
  63             timestamps.append(float(m.group(1)))
  64
  65     #print(f"hosts:{host_count} links:{link_count} timestamps:{' '.join(([str(i) for i in timestamps]))}")
  66     return (host_count,  link_count,  timestamps)
  67
  68 parser = argparse.ArgumentParser(description='Run a simgrid simulation, and turn off/on resources at random.')
  69 parser.add_argument('--valgrind', help="Run the simulations in valgrind")
  70 parser.add_argument('command', nargs='*')
  71 args = parser.parse_args()
  72
  73 (host_count,  link_count,  timestamps) = get_info(args.command)
  74 timestamps = sorted([*{*timestamps}]) # kill duplicates
  75 print(f"Monkey informations: hosts:{host_count} links:{link_count} timestamps:{' '.join(([str(i) for i in timestamps]))}")
  76
  77 error_count = 0
  78 test_count = 0
  79 def do_run(cmd, extra_params, test_todo):
  80     global test_count, error_count
  81     test_count = test_count + 1
  82     cmd = copy.deepcopy(cmd)
  83     cmd.append("--cfg=plugin:cmonkey")
  84     for p in extra_params:
  85         cmd.append(p)
  86     print(f"Start {' '.join(cmd)}")
  87     sys.stdout.flush()
  88
  89     run = subprocess.run(cmd, shell=False, cwd=os.getcwd(), stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
  90
  91     out = str(run.stdout, errors='replace')
  92     if run.returncode != 0:
  93         msg = f"ERROR (retcode: {run.returncode}). Output:\n"
  94         msg += out
  95         print(msg)
  96         sys.exit(1)
  97     for line in out.split("\n"):
  98         if re.match("==.*    in use at exit: ", line) and not re.match("==.* in use at exit: 0 bytes in 0 blocks", line):
  99             m = re.match("==.*    in use at exit: (.*)", line)
 100             print(f"LEAK SUMMARY: {m.group(1)} in use at exit")
 101             error_count += 1
 102
 103         if re.match("==.* ERROR SUMMARY: ", line):
 104             m = re.match("==.* ERROR SUMMARY: (.*)", line)
 105             print(f"valgrind summary: {m.group(1)}")
 106             if not re.match("==.* 0 errors from 0 contexts", line):
 107                 error_count += 1
 108     print (f"Test {test_count} out of {test_todo} succeded.\n")
 109
 110
 111 def doit():
 112     prev_time = 0
 113     test_count = 0
 114     test_todo = 2 * len(timestamps) * (host_count + link_count)
 115     for pos in range(len(timestamps)):
 116         now = timestamps[pos]
 117         for host in range(host_count):
 118             do_run(args.command, [f"--cfg=cmonkey/time:{(now-prev_time)/2}", f"--cfg=cmonkey/host:{host}"], test_todo)
 119         for link in range(link_count):
 120             do_run(args.command, [f"--cfg=cmonkey/time:{(now-prev_time)/2}", f"--cfg=cmonkey/link:{link}"], test_todo)
 121         for host in range(host_count):
 122             do_run(args.command, [f"--cfg=cmonkey/time:{now}", f"--cfg=cmonkey/host:{host}"], test_todo)
 123         for link in range(link_count):
 124             do_run(args.command, [f"--cfg=cmonkey/time:{now}", f"--cfg=cmonkey/link:{link}"], test_todo)
 125 doit()
 126
 127 print(f"In total, the monkey found {error_count} errors.")
 128 sys.exit(error_count)