3 # The goal is to introduce random failures in a simulation, to test simgrid under extreme conditions.
5 # It is made of several components.
7 # * a plugin: cmonkey. Can be used from the command line as follows:
8 # * --cfg=plugin:cmonkey --cfg=cmonkey/tell:1
9 # Get information about the resource count and the timestamp of each scheduling round.
10 # * --cfg=plugin:cmonkey --cfg=cmonkey/time:42 --cfg=cmonkey/host:1
11 # Kill the host #1 after 42 seconds (using a kernel::Timer so that no actor gets involved in the killing)
12 # * --cfg=plugin:cmonkey --cfg=cmonkey/time:42 --cfg=cmonkey/link:0
13 # Kill the link #0 after 42 seconds (using a kernel::Timer)
15 # * a python script: tools/simgrid-monkey (this file)
16 # * It takes a regular simgrid simulation as a parameter, uses the cmonkey plugin to get information about it,
17 # and then restarts many runs, with one resource being turned off and back on (turn_off() + turn_on()) in each run.
18 # * Each resource gets killed between each pair of consecutive timestamps, and on each timestamp.
19 # * So the number of simulations is: 1 + (host_c+link_c) * timestamps * 2
21 # * Test program, written to resist these extreme conditions:
22 # * teshsuite/s4u/monkey-masterworkers: tests synchronous comms and execs
24 import multiprocessing as mp
34 cmd_tell = copy.deepcopy(cmd)
35 cmd_tell.append("--cfg=plugin:cmonkey")
36 cmd_tell.append("--cfg=cmonkey/tell:1")
37 cmd_tell.append("--log=root.t:critical")
38 cmd_tell.append("--log=cmonkey.t:info")
39 cmd_tell.append("--log=cmonkey.fmt:%m%n")
40 print(f"Get the initial info from the command.")
41 # print(f"from ``{' '.join(cmd_tell)}``")
42 first_run = subprocess.run(cmd_tell, shell=False, cwd=os.getcwd(), stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
44 if first_run.returncode != 0:
45 msg = f"Peek run of the command raised an error (retcode: {first_run.returncode})\n"
46 msg += f"Full command was {' '.join(cmd_tell)}\n"
47 if first_run.stdout is not None:
48 msg += str(first_run.stdout, errors='replace')
54 for line in str(first_run.stdout, errors='replace').split("\n"):
55 if re.match("^HOST_COUNT=(.*)", line):
56 m = re.match("^HOST_COUNT=(.*)", line)
57 host_count = int(m.group(1))
58 if re.match("^LINK_COUNT=(.*)", line):
59 m = re.match("^LINK_COUNT=(.*)", line)
60 link_count = int(m.group(1))
61 if re.match("^TIMESTAMP=(.*)", line):
62 m = re.match("^TIMESTAMP=(.*)", line)
63 timestamps.append(float(m.group(1)))
65 #print(f"hosts:{host_count} links:{link_count} timestamps:{' '.join(([str(i) for i in timestamps]))}")
66 return (host_count, link_count, timestamps)
# Command-line interface: an optional --valgrind setting followed by the
# simulation command (and its own arguments) to put under test.
parser = argparse.ArgumentParser(
    description='Run a simgrid simulation, and turn off/on resources at random.',
)
parser.add_argument('--valgrind', help="Run the simulations in valgrind")
parser.add_argument('command', nargs='*')
args = parser.parse_args()

# Probe run: get_info() returns the host count, the link count and the list
# of timestamps reported for the given command.
host_count, link_count, timestamps = get_info(args.command)
print(f"Monkey informations: hosts:{host_count} links:{link_count} timestamps:{' '.join(map(str, timestamps))}")
76 def do_run(cmd, extra_params):
77 cmd = copy.deepcopy(cmd)
78 cmd.append("--cfg=plugin:cmonkey")
79 for p in extra_params:
81 print(f"Start {' '.join(cmd)}")
82 run = subprocess.run(cmd, shell=False, cwd=os.getcwd(), stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
84 if run.returncode != 0:
85 msg = f"ERROR (retcode: {run.returncode}). Output:\n"
86 msg += str(run.stdout, errors='replace')
93 for pos in range(len(timestamps)):
95 for host in range(host_count):
96 do_run(args.command, [f"--cfg=cmonkey/time:{(now-prev)/2}", f"--cfg=cmonkey/host:{host}"])
97 do_run(args.command, [f"--cfg=cmonkey/time:{now}", f"--cfg=cmonkey/host:{host}"])
98 for link in range(link_count):
99 do_run(args.command, [f"--cfg=cmonkey/time:{(now-prev)/2}", f"--cfg=cmonkey/link:{link}"])
100 do_run(args.command, [f"--cfg=cmonkey/time:{now}", f"--cfg=cmonkey/link:{link}"])