Logo AND Algorithmique Numérique Distribuée

Public GIT Repository
Python version of the masterworkers monkey
author Martin Quinson <martin.quinson@ens-rennes.fr>
Wed, 2 Mar 2022 22:28:18 +0000 (23:28 +0100)
committer Martin Quinson <martin.quinson@ens-rennes.fr>
Wed, 2 Mar 2022 23:57:21 +0000 (00:57 +0100)
src/bindings/python/simgrid_python.cpp
teshsuite/s4u/CMakeLists.txt
teshsuite/s4u/monkey-masterworkers/monkey-masterworkers.cpp
teshsuite/s4u/monkey-masterworkers/monkey-masterworkers.py [new file with mode: 0644]
teshsuite/s4u/monkey-masterworkers/monkey-masterworkers.py.tesh [new file with mode: 0644]
teshsuite/s4u/monkey-masterworkers/monkey-masterworkers.tesh

index 206f1d0..dbffb1f 100644 (file)
@@ -155,8 +155,9 @@ PYBIND11_MODULE(simgrid, m)
       .def_static("get_clock",
                   []() // XBT_ATTRIB_DEPRECATED_v334
                   {
-                    PyErr_WarnEx(PyExc_DeprecationWarning,
-                                 "get_clock() is deprecated and  will be dropped after v3.33, use clock instead.", 1);
+                    PyErr_WarnEx(
+                        PyExc_DeprecationWarning,
+                        "get_clock() is deprecated and  will be dropped after v3.33, use `Engine.clock` instead.", 1);
                     return Engine::get_clock();
                   })
       .def_property_readonly_static(
index 86987b7..270d6f0 100644 (file)
@@ -49,12 +49,38 @@ foreach(x basic-link-test basic-parsing-test host-on-off host-on-off-actors host
         monkey-masterworkers
         pid storage_client_server trace-integration seal-platform issue71)
   set(tesh_files    ${tesh_files}    ${CMAKE_CURRENT_SOURCE_DIR}/${x}/${x}.tesh)
-  ADD_TESH(tesh-s4u-${x} --setenv bindir=${CMAKE_BINARY_DIR}/teshsuite/s4u/${x} --setenv srcdir=${CMAKE_HOME_DIRECTORY}/teshsuite/s4u/${x} --setenv rootdir=${CMAKE_HOME_DIRECTORY} --setenv platfdir=${CMAKE_HOME_DIRECTORY}/examples/platforms --cd ${CMAKE_BINARY_DIR}/teshsuite/s4u/${x} ${CMAKE_HOME_DIRECTORY}/teshsuite/s4u/${x}/${x}.tesh)
+  ADD_TESH(tesh-s4u-${x}
+           --setenv bindir=${CMAKE_BINARY_DIR}/teshsuite/s4u/${x}
+           --setenv srcdir=${CMAKE_CURRENT_SOURCE_DIR}/${x}
+           --setenv rootdir=${CMAKE_HOME_DIRECTORY}
+           --setenv platfdir=${CMAKE_HOME_DIRECTORY}/examples/platforms
+           --cd ${CMAKE_BINARY_DIR}/teshsuite/s4u/${x}
+           ${CMAKE_HOME_DIRECTORY}/teshsuite/s4u/${x}/${x}.tesh)
 endforeach()
 
+# Python tesh tests
+foreach(x monkey-masterworkers)
+  if(enable_python)
+    ADD_TESH(tesh-python-${x}
+             --setenv srcdir=${CMAKE_CURRENT_SOURCE_DIR}/${x}
+             --setenv pythoncmd=${PYTHON_EXECUTABLE}
+             --setenv LD_LIBRARY_PATH=${TESH_LIBRARY_PATH}
+             --setenv PYTHONPATH=${CMAKE_BINARY_DIR}/lib
+             --setenv platfdir=${CMAKE_HOME_DIRECTORY}/examples/platforms
+             --cd ${CMAKE_BINARY_DIR}/teshsuite/s4u/${x}
+             ${CMAKE_HOME_DIRECTORY}/teshsuite/s4u/${x}/${x}.py.tesh)
+
+  endif()
+endforeach()
+
+
 # Monkey tests are launched directly, not with tesh
 foreach(x  monkey-masterworkers)
   ADD_TEST(monkey-s4u-${x} "${PYTHON_EXECUTABLE}" ${CMAKE_HOME_DIRECTORY}/tools/simgrid-monkey ${CMAKE_BINARY_DIR}/teshsuite/s4u/${x}/${x})
+  if(enable_python)
+    ADD_TEST(monkey-python-${x} "${PYTHON_EXECUTABLE}" ${CMAKE_HOME_DIRECTORY}/tools/simgrid-monkey "${PYTHON_EXECUTABLE}" ${CMAKE_CURRENT_SOURCE_DIR}/${x}/${x}.py)
+    set_tests_properties(monkey-python-${x} PROPERTIES ENVIRONMENT "PYTHONPATH=${CMAKE_BINARY_DIR}/lib")
+  endif()
 endforeach()
 
 
index 8e14acd..aef2566 100644 (file)
@@ -96,8 +96,6 @@ int main(int argc, char* argv[])
 {
   sg4::Engine e(&argc, argv);
 
-  XBT_INFO("host count: %d ", (int)cfg_host_count);
-
   auto* rootzone = sg4::create_full_zone("root");
   sg4::Host* main; // First host created, where the master will stay
   std::vector<sg4::Host*> worker_hosts;
diff --git a/teshsuite/s4u/monkey-masterworkers/monkey-masterworkers.py b/teshsuite/s4u/monkey-masterworkers/monkey-masterworkers.py
new file mode 100644 (file)
index 0000000..e8605f9
--- /dev/null
@@ -0,0 +1,102 @@
+# Copyright (c) 2007-2022. The SimGrid Team. All rights reserved.          
+#
+# This program is free software; you can redistribute it and/or modify it
+# under the terms of the license (GNU LGPL) which comes with this package. 
+
+"""
+ This is a version of the masterworkers that (hopefully) survives the chaos monkey.
+ It tests synchronous send/receive as well as synchronous computations.
+
+ It is not written to be pleasant to read, but instead to resist the aggressions of the monkey:
+ - Workers keep going until the global variable `todo` reaches 0.
+ - The master is a daemon that just keeps sending tasks forever
+   (simgrid simulations stop as soon as all non-daemon actors are done).
+ - The platform is created programmatically to remove path issues and control the problem size.
+
+ See the simgrid-monkey script for more information.
+
+ Inline configuration items:
+ - host-count: how many actors to start (including the master)
+ - task-count: initial value of the `todo` global
+ - deadline: time after which the simulation is considered to have failed (to detect infinite loops).
+
+"""
+
+# Configuration items:
+host_count = 3 # Host count (master on one, workers on the others)
+task_count = 1 # Amount of tasks that must be executed to succeed
+deadline = 120 # When to fail the simulation (infinite loop detection)
+# End of configuration
+
+import sys
+from simgrid import Actor, Engine, Host, this_actor, Mailbox, NetZone, LinkInRoute, TimeoutException, NetworkFailureException
+
+todo = task_count # remaining amount of tasks to execute, a global variable
+
+def master():
+  comp_size = int(1e6)
+  comm_size = int(1e6)
+  this_actor.info("Master booting")
+  Actor.self().daemonize()
+  this_actor.on_exit(lambda killed: this_actor.info("Master dying forcefully." if killed else "Master dying peacefully."))
+
+  while True: # This is a daemon
+    assert Engine.clock < deadline, f"Failed to run all tasks in less than {deadline} seconds. Is this an infinite loop?"
+
+    try: 
+      this_actor.info("Try to send a message")
+      mailbox.put(comp_size, comm_size, 10.)
+    except TimeoutException:
+      this_actor.info("Timeouted while sending a task")
+    except NetworkFailureException:
+      this_actor.info("Got a NetworkFailureException. Wait a second before starting again.")
+      this_actor.sleep_for(1.)
+
+  assert False, "The impossible just happened (yet again): daemons shall not finish."
+
+def worker(id):
+  global todo
+  this_actor.info(f"Worker {id} booting")
+  this_actor.on_exit(lambda killed: this_actor.info(f"Worker {id} dying {'forcefully' if killed else 'peacefully'}."))
+
+  while todo > 0:
+    assert Engine.clock < deadline, f"Failed to run all tasks in less than {deadline} seconds. Is this an infinite loop?"
+
+    try:
+      this_actor.info(f"Waiting a message on mailbox")
+      compute_cost = mailbox.get()
+
+      this_actor.info("Start execution...")
+      this_actor.execute(compute_cost)
+      todo = todo - 1
+      this_actor.info(f"Execution complete. Still {todo} to go.")
+
+    except NetworkFailureException:
+      this_actor.info("Got a NetworkFailureException. Wait a second before starting again.")
+      this_actor.sleep_for(1.)
+    except TimeoutException:
+      this_actor.info("Timeouted while getting a task.")
+
+if __name__ == '__main__':
+  global mailbox
+  e = Engine(sys.argv)
+
+  assert host_count > 2, "You need at least 2 workers (i.e., 3 hosts) or the master will be auto-killed when the only worker gets killed."
+  assert todo > 0, "Please give some tasks to do to the workers."
+
+  mailbox = Mailbox.by_name("mailbox")
+
+  rootzone = NetZone.create_full_zone("Zone1")
+  main = rootzone.create_host("lilibeth 0", 1e9)
+  Actor.create("master", main, master).set_auto_restart(True)
+
+  for i in range(1, host_count):
+    link = rootzone.create_split_duplex_link(f"link {i}", "1MBps").set_latency("24us")
+    host = rootzone.create_host(f"lilibeth {i}", 1e9)
+    rootzone.add_route(main.netpoint, host.netpoint, None, None, [LinkInRoute(link, LinkInRoute.Direction.UP)], True)
+    Actor.create("worker", host, worker, i).set_auto_restart(True)
+
+  e.netzone_root.seal()
+  e.run()
+
+  this_actor.info("WE SURVIVED!")
diff --git a/teshsuite/s4u/monkey-masterworkers/monkey-masterworkers.py.tesh b/teshsuite/s4u/monkey-masterworkers/monkey-masterworkers.py.tesh
new file mode 100644 (file)
index 0000000..eb92dde
--- /dev/null
@@ -0,0 +1,30 @@
+
+p Smoke test: do one arbitrary run of the monkey, just to make sure that *something* is happening.
+
+$ ${pythoncmd:=python3} ${PYTHON_TOOL_OPTIONS:=} ${srcdir:=.}/monkey-masterworkers.py --cfg=plugin:cmonkey --cfg=cmonkey/time:1 --cfg=cmonkey/host:1
+> [0.000000] [xbt_cfg/INFO] Configuration change: Set 'plugin' to 'cmonkey'
+> [0.000000] [cmonkey/INFO] Initializing the chaos monkey
+> [0.000000] [xbt_cfg/INFO] Configuration change: Set 'cmonkey/time' to '1'
+> [0.000000] [xbt_cfg/INFO] Configuration change: Set 'cmonkey/host' to '1'
+> [lilibeth 0:master:(1) 0.000000] [python/INFO] Master booting
+> [lilibeth 1:worker:(2) 0.000000] [python/INFO] Worker 1 booting
+> [lilibeth 2:worker:(3) 0.000000] [python/INFO] Worker 2 booting
+> [lilibeth 1:worker:(2) 0.000000] [python/INFO] Waiting a message on mailbox
+> [lilibeth 2:worker:(3) 0.000000] [python/INFO] Waiting a message on mailbox
+> [lilibeth 0:master:(1) 0.000000] [python/INFO] Try to send a message
+> [1.000000] [cmonkey/INFO] Kill host lilibeth 1
+> [lilibeth 0:master:(1) 1.000000] [python/INFO] Got a NetworkFailureException. Wait a second before starting again.
+> [lilibeth 1:worker:(2) 1.000000] [python/INFO] Worker 1 dying forcefully.
+> [lilibeth 0:master:(1) 2.000000] [python/INFO] Try to send a message
+> [lilibeth 2:worker:(3) 3.031240] [python/INFO] Start execution...
+> [lilibeth 0:master:(1) 3.031240] [python/INFO] Try to send a message
+> [lilibeth 2:worker:(3) 3.032240] [python/INFO] Execution complete. Still 0 to go.
+> [lilibeth 2:worker:(3) 3.032240] [python/INFO] Worker 2 dying peacefully.
+> [lilibeth 0:master:(1) 3.032240] [python/INFO] Master dying forcefully.
+> [31.000000] [cmonkey/INFO] Restart host lilibeth 1
+> [lilibeth 1:worker:(4) 31.000000] [python/INFO] Worker 1 booting
+> [lilibeth 1:worker:(4) 31.000000] [python/INFO] Worker 1 dying peacefully.
+> [lilibeth 1:worker:(4) 31.000000] [python/INFO] Worker 1 dying peacefully.
+> [31.000000] [cmonkey/INFO] Chaos Monkey done!
+> [31.000000] [python/INFO] WE SURVIVED!
+
index 1d43a5b..4d09ca2 100644 (file)
@@ -6,7 +6,6 @@ $ ${bindir:=.}/monkey-masterworkers --cfg=plugin:cmonkey --cfg=cmonkey/time:1 --
 > [0.000000] [cmonkey/INFO] Initializing the chaos monkey
 > [0.000000] [xbt_cfg/INFO] Configuration change: Set 'cmonkey/time' to '1'
 > [0.000000] [xbt_cfg/INFO] Configuration change: Set 'cmonkey/host' to '1'
-> [0.000000] [s4u_test/INFO] host count: 3 
 > [lilibeth 0:master:(1) 0.000000] [s4u_test/INFO] Master booting
 > [lilibeth 1:worker:(2) 0.000000] [s4u_test/INFO] Worker booting
 > [lilibeth 2:worker:(3) 0.000000] [s4u_test/INFO] Worker booting