From: Martin Quinson Date: Sun, 12 Aug 2018 17:32:33 +0000 (+0200) Subject: Merge branch 'master' of github.com:simgrid/simgrid X-Git-Tag: v3_21~259 X-Git-Url: http://info.iut-bm.univ-fcomte.fr/pub/gitweb/simgrid.git/commitdiff_plain/d43e84e981024537b56f15f7dcc937b0202cad2b?hp=7f1e24039c708e51889d620f29dfba9116065157 Merge branch 'master' of github.com:simgrid/simgrid --- diff --git a/.gitignore b/.gitignore index 4d94ed5308..5f2d918713 100644 --- a/.gitignore +++ b/.gitignore @@ -199,9 +199,11 @@ examples/simdag/scheduling/sd_scheduling examples/simdag/test/sd_test examples/simdag/throttling/sd_throttling examples/simdag/typed_tasks/sd_typed_tasks +examples/smpi/ampi/smpi_ampi examples/smpi/energy/f77/sef examples/smpi/energy/f90/sef90 examples/smpi/energy/smpi_energy +examples/smpi/load_balancer_replay/load_balancer_replay examples/smpi/mc/smpi_bugged1 examples/smpi/mc/smpi_bugged1_liveness examples/smpi/mc/smpi_bugged2 @@ -309,6 +311,7 @@ teshsuite/smpi/coll-gather/coll-gather teshsuite/smpi/coll-reduce/coll-reduce teshsuite/smpi/coll-reduce-scatter/coll-reduce-scatter teshsuite/smpi/coll-scatter/coll-scatter +teshsuite/smpi/fort_args/fort_args teshsuite/smpi/isp/umpire/abort teshsuite/smpi/isp/umpire/abort1 teshsuite/smpi/isp/umpire/abort2 diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 0085c43ae7..3ef1530238 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -3,13 +3,8 @@ image: debian:testing-slim .build: &build script: - apt-get update - - apt-get install -y python3-sphinx doxygen python3-breathe python3-sphinx-rtd-theme pip3 - - apt-get install -y cmake doxygen libboost-all-dev libboost-dev - - pip3 install --no-deps exhale - - cmake -Denable_documentation=ON . 
- - make documentation - - mkdir docs/doxyoutput - - mv doc/xml docs/doxyoutput + - apt-get install -y python3-pip doxygen libboost-all-dev libboost-dev fig2dev + - pip3 install --requirement docs/requirements.txt - cd docs - sphinx-build -M html source/ build/ - mv build/html ../public diff --git a/doc/Doxyfile.in b/doc/Doxyfile.in index ca97d87a7d..b5f7607ac5 100644 --- a/doc/Doxyfile.in +++ b/doc/Doxyfile.in @@ -686,43 +686,11 @@ INPUT = @CMAKE_HOME_DIRECTORY@/doc/doxygen/index.doc \ @CMAKE_HOME_DIRECTORY@/doc/doxygen/module-trace.doc \ @CMAKE_BINARY_DIR@/doc/doxygen/logcategories.doc \ @CMAKE_HOME_DIRECTORY@/include/ \ - @CMAKE_HOME_DIRECTORY@/include/simgrid/ \ - @CMAKE_HOME_DIRECTORY@/include/simgrid/jedule/ \ - @CMAKE_HOME_DIRECTORY@/include/simgrid/kernel/ \ - @CMAKE_HOME_DIRECTORY@/include/simgrid/kernel/resource/ \ - @CMAKE_HOME_DIRECTORY@/include/simgrid/kernel/routing/ \ - @CMAKE_HOME_DIRECTORY@/include/simgrid/plugins/ \ - @CMAKE_HOME_DIRECTORY@/include/simgrid/s4u/ \ - @CMAKE_HOME_DIRECTORY@/include/simgrid/simix/ \ - @CMAKE_HOME_DIRECTORY@/include/simgrid/smpi/ \ - @CMAKE_HOME_DIRECTORY@/include/xbt \ - @CMAKE_HOME_DIRECTORY@/src/include/simgrid/ \ - @CMAKE_HOME_DIRECTORY@/src/include/surf \ - @CMAKE_HOME_DIRECTORY@/src/include/xbt \ - @CMAKE_HOME_DIRECTORY@/src/instr/ \ - @CMAKE_HOME_DIRECTORY@/src/instr/jedule/ \ - @CMAKE_HOME_DIRECTORY@/src/kernel/ \ - @CMAKE_HOME_DIRECTORY@/src/kernel/activity/ \ - @CMAKE_HOME_DIRECTORY@/src/kernel/context/ \ - @CMAKE_HOME_DIRECTORY@/src/kernel/lmm/ \ - @CMAKE_HOME_DIRECTORY@/src/kernel/resource/ \ - @CMAKE_HOME_DIRECTORY@/src/kernel/routing/ \ - @CMAKE_HOME_DIRECTORY@/src/msg/ \ - @CMAKE_HOME_DIRECTORY@/src/plugins/ \ - @CMAKE_HOME_DIRECTORY@/src/plugins/file_system/ \ - @CMAKE_HOME_DIRECTORY@/src/plugins/vm/ \ - @CMAKE_HOME_DIRECTORY@/src/s4u/ \ - @CMAKE_HOME_DIRECTORY@/src/simdag/ \ - @CMAKE_HOME_DIRECTORY@/src/simgrid/ \ - @CMAKE_HOME_DIRECTORY@/src/simix/ \ - @CMAKE_HOME_DIRECTORY@/src/smpi/ \ - 
@CMAKE_HOME_DIRECTORY@/src/surf/ \ - @CMAKE_HOME_DIRECTORY@/src/xbt/ \ - @CMAKE_BINARY_DIR@/include \ - @CMAKE_BINARY_DIR@/src \ + @CMAKE_HOME_DIRECTORY@/src/plugins/ \ @CMAKE_HOME_DIRECTORY@/examples/msg/README.doc \ @CMAKE_HOME_DIRECTORY@/examples/s4u/README.doc + # This tag can be used to specify the character encoding of the source files # that doxygen parses. Internally doxygen uses the UTF-8 encoding, which is # also the default input encoding. Doxygen uses libiconv (or the iconv built @@ -753,7 +721,7 @@ RECURSIVE = YES # Note that relative paths are relative to the directory from which doxygen is # run. -EXCLUDE = +EXCLUDE = # The EXCLUDE_SYMLINKS tag can be used to select whether or not files or # directories that are symbolic links (a Unix file system feature) are excluded @@ -767,7 +735,18 @@ EXCLUDE_SYMLINKS = YES # against the file with absolute path, so to exclude all test directories # for example use the pattern */test/* -EXCLUDE_PATTERNS = +EXCLUDE_PATTERNS = */include/simgrid/forward.h +EXCLUDE_PATTERNS += */include/smpi/* + +EXCLUDE_PATTERNS += @CMAKE_HOME_DIRECTORY@/src/surf/xml/simgrid_dtd.* +EXCLUDE_PATTERNS += @CMAKE_HOME_DIRECTORY@/src/simdag/dax_dtd.* +EXCLUDE_PATTERNS += @CMAKE_HOME_DIRECTORY@/src/xbt/automaton/parserPromela.* +EXCLUDE_PATTERNS += @CMAKE_HOME_DIRECTORY@/src/bindings/java/*.cpp @CMAKE_HOME_DIRECTORY@/src/bindings/java/*.h +EXCLUDE_PATTERNS += @CMAKE_HOME_DIRECTORY@/src/simix/popping_accessors.hpp \ + @CMAKE_HOME_DIRECTORY@/src/simix/popping_bodies.cpp \ + @CMAKE_HOME_DIRECTORY@/src/simix/popping_enum.h \ + @CMAKE_HOME_DIRECTORY@/src/simix/popping_generated.cpp + # The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names # (namespaces, classes, functions, etc.) that should be excluded from the @@ -775,7 +754,10 @@ EXCLUDE_PATTERNS = # wildcard * is used, a substring. 
Examples: ANamespace, AClass, # AClass::ANamespace, ANamespace::*Test -EXCLUDE_SYMBOLS = +EXCLUDE_SYMBOLS = s_xbt_dict_cursor s_xbt_dictelm xbt_dynar_s +EXCLUDE_SYMBOLS += xbt_edge xbt_graph xbt_node +EXCLUDE_SYMBOLS += e_xbt_parmap_mode_t +EXCLUDE_SYMBOLS += MPI_* # The EXAMPLE_PATH tag can be used to specify one or more files or # directories that contain example code fragments that are included (see @@ -1470,11 +1452,14 @@ INCLUDE_FILE_PATTERNS = # instead of the = operator. PREDEFINED = __cplusplus \ + DOXYGEN \ XBT_PUBLIC= \ XBT_EXPORT_NO_IMPORT= \ XBT_IMPORT_NO_EXPORT= \ XBT_PUBLIC_DATA=extern \ + XBT_PUBLIC= \ XBT_INLINE= \ + XBT_ALWAYS_INLINE= \ XBT_PRIVATE= \ XBT_ATTRIB_NORETURN= \ XBT_ATTRIB_UNUSED= \ @@ -1489,7 +1474,8 @@ PREDEFINED = __cplusplus \ # overrules the definition found in the source code. EXPAND_AS_DEFINED = COLL_APPLY COLL_GATHERS COLL_ALLGATHERS COLL_ALLGATHERVS COLL_ALLREDUCES COLL_ALLTOALLS \ - COLL_ALLTOALLVS COLL_BCASTS COLL_REDUCES COLL_REDUCE_SCATTERS COLL_SCATTERS COLL_BARRIERS + COLL_ALLTOALLVS COLL_BCASTS COLL_REDUCES COLL_REDUCE_SCATTERS COLL_SCATTERS COLL_BARRIERS \ + MPI_CALL # If the SKIP_FUNCTION_MACROS tag is set to YES (the default) then # doxygen's preprocessor will remove all references to function-like macros @@ -1549,7 +1535,7 @@ PERL_PATH = /usr/bin/perl # this option also works with HAVE_DOT disabled, but it is recommended to # install and use dot, since it yields more powerful graphs. -CLASS_DIAGRAMS = YES +CLASS_DIAGRAMS = NO # You can define message sequence charts within doxygen comments using the \msc # command. Doxygen will then run the mscgen tool (see @@ -1668,7 +1654,7 @@ CALL_GRAPH = NO # the time of a run. So in most cases it will be better to enable caller # graphs for selected functions only using the \callergraph command. 
-CALLER_GRAPH = YES +CALLER_GRAPH = NO # If the GRAPHICAL_HIERARCHY and HAVE_DOT tags are set to YES then doxygen # will generate a graphical hierarchy of all classes instead of a textual one. @@ -1688,7 +1674,7 @@ DIRECTORY_GRAPH = YES # HTML_FILE_EXTENSION to xhtml in order to make the SVG files # visible in IE 9+ (other browsers do not have this requirement). -DOT_IMAGE_FORMAT = png +DOT_IMAGE_FORMAT = svg # If DOT_IMAGE_FORMAT is set to svg, then this option can be set to YES to # enable generation of interactive SVG images that allow zooming and panning. diff --git a/doc/doxygen/examples.doc b/doc/doxygen/examples.doc deleted file mode 100644 index 78ca077bec..0000000000 --- a/doc/doxygen/examples.doc +++ /dev/null @@ -1,113 +0,0 @@ -/*! @page examples SimGrid Examples - -@tableofcontents - -SimGrid comes with many examples provided in the examples/ directory. -Those examples are described in section @ref msg_examples. Those -examples are commented and should be easy to understand. for a first -step into SimGrid we also provide some more detailed examples in the -sections below. - -@htmlonly -You should also check our online tutorial section that contains a generic tutorial about using SimGrid. -@endhtmlonly - -@section using_msg Using MSG - -@htmlonly -You should also check our online tutorial section that contains a dedicated tutorial. -@endhtmlonly - -Here are some examples on how to use MSG, the most used API. - -MSG comes with an extensive set of examples. It is sometimes difficult -to find the one you need. This list aims at helping you finding the -example from which you can learn what you want to. - -@subsection MSG_ex_basics Basic examples and features - -@subsubsection MSG_ex_master_worker Basic Master/Workers - -Simulation of a master-worker application using a realistic platform and an external description of the deployment. 
- -@paragraph MSG_ex_mw_TOC Table of contents: - - - @ref MSG_ext_mw_preliminary - - @ref MSG_ext_mw_master - - @ref MSG_ext_mw_worker - - @ref MSG_ext_mw_core - - @ref MSG_ext_mw_platform - - @ref MSG_ext_mw_application - -
- -@dontinclude msg/app-masterworker/app-masterworker.c - -@paragraph MSG_ext_mw_preliminary Preliminary declarations - -@skip include -@until example"); -@skipline Master expects - -@paragraph MSG_ext_mw_master Master code - -This function has to be assigned to a #msg_process_t that will behave as the master. It should not be called directly -but either given as a parameter to #MSG_process_create() or registered as a public function through -#MSG_function_register() and then automatically assigned to a process through #MSG_launch_application(). - -C style arguments (argc/argv) are interpreted as: - - the number of tasks to distribute - - the computational size of each task - - the communication size of each task - - the number of workers managed by the master. - -Tasks are evenly sent in a round-robin style. - -@until return 0; -@until } -@skipline Worker expects - -@paragraph MSG_ext_mw_worker Worker code - -This function has to be assigned to a #msg_process_t that has to behave as a worker. Just like the master function -(described in @ref MSG_ext_mw_master), it should not be called directly. - -C style arguments (argc/argv) are interpreted as: - - a unique id used to build the mailbox name of the worker - -This function keeps waiting for tasks and executes them as it receives them. When a special task named 'finalize' is -received from the master, the process ends its execution. - -@until return 0; -@until } - -@paragraph MSG_ext_mw_core Main function - -This function is the core of the simulation and is divided only into 3 parts: - -# Simulation settings : #MSG_create_environment() creates a realistic - environment - -# Application deployment : create the processes on the right locations with - #MSG_launch_application() - -# The simulation is run with #MSG_main() - -Its arguments are: - - platform_file: the name of a file containing an valid platform description. 
- - deployment_file: the name of a file containing a valid application description -@line main -@until OK; -@until } - -@paragraph MSG_ext_mw_platform Example of a platform file - -The following platform description can be found in @c examples/msg/platforms/small_platform.xml -@include platforms/small_platform.xml - -@paragraph MSG_ext_mw_application Example of a deployment file - -The following application description can be found in @c examples/msg/app-masterworker/app-masterworker_d.xml: - -@include msg/app-masterworker/app-masterworker_d.xml - -*/ - - diff --git a/docs/requirements.txt b/docs/requirements.txt index cd6467ed82..98294c5608 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1 +1,5 @@ breathe +exhale +sphinx +sphinx_rtd_theme + diff --git a/docs/source/conf.py b/docs/source/conf.py index e5bcf6c9b0..8fd2797e59 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -39,6 +39,7 @@ release = u'3.21' # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. 
extensions = [ + 'sphinx.ext.todo', # 'sphinx.ext.coverage', 'sphinx.ext.mathjax', # 'sphinx.ext.ifconfig', @@ -46,7 +47,9 @@ extensions = [ 'exhale', ] -breathe_projects = { 'simgrid': '../../doc/xml' } +todo_include_todos = True + +breathe_projects = { 'simgrid': '../build/doxy/xml' } breathe_default_project = "simgrid" # Setup the exhale extension @@ -61,9 +64,32 @@ exhale_args = { # TIP: if using the sphinx-bootstrap-theme, you need # "treeViewIsBootstrap": True, "exhaleExecutesDoxygen": True, - "exhaleDoxygenStdin": "INPUT = ../../include" + "exhaleDoxygenStdin": """ + INPUT = ../../include/simgrid/s4u + GENERATE_XML = YES + PREDEFINED += \ + __cplusplus \ + DOXYGEN \ + XBT_PUBLIC= \ + XBT_EXPORT_NO_IMPORT= \ + XBT_IMPORT_NO_EXPORT= \ + XBT_PUBLIC_DATA=extern \ + XBT_PUBLIC= \ + XBT_INLINE= \ + XBT_ALWAYS_INLINE= \ + XBT_PRIVATE= \ + XBT_ATTRIB_NORETURN= \ + XBT_ATTRIB_UNUSED= \ + XBT_ATTRIB_DEPRECATED_v322(m)= \ + XBT_ATTRIB_DEPRECATED_v323(m)= \ + XBT_ATTRIB_DEPRECATED_v324(m)= + """ } +# For cross-ref generation +primary_domain = 'cpp' + + # Add any paths that contain templates here, relative to this directory. 
# templates_path = ['_templates'] diff --git a/docs/source/images/tuto-masterworkers-intro.svg b/docs/source/images/tuto-masterworkers-intro.svg new file mode 100644 index 0000000000..331071e575 --- /dev/null +++ b/docs/source/images/tuto-masterworkers-intro.svg @@ -0,0 +1,1221 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + image/svg+xml + + + + + + + Master + + + + T + + + + T + + + + Worker + + + + Worker + + + + Worker + + + + Worker + + + + Worker + + + + + + + + + The master dispatchesthe tasks to the workers + + + T + + + + T + + + + T + + + + T + + + + T + + + + T + + + + T + + + + T + + + + T + + + + T + + + + T + + + + T + + + + T + + + + T + + + + T + + + + T + + + + T + + + + T + + + + T + + + + T + + + + T + + + + T + + + + T + + + + T + + + + T + + + diff --git a/docs/source/images/tuto-masterworkers-question.svg b/docs/source/images/tuto-masterworkers-question.svg new file mode 100644 index 0000000000..c3188764f9 --- /dev/null +++ b/docs/source/images/tuto-masterworkers-question.svg @@ -0,0 +1,1240 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + image/svg+xml + + + + + + + Master + + + + T + + + + T + + + + Worker + + + + Worker + + + + Worker + + + + Worker + + + + Worker + + + + + + + + + How should the masterdistribute the tasks? + + + T + + + + T + + + + T + + + + T + + + + T + + + + T + + + + T + + + + T + + + + T + + + + T + + + + T + + + + T + + + + T + + + + T + + + + T + + + + T + + + + T + + + + T + + + + T + + + + T + + + + T + + + + T + + + + T + + + + T + + + + T + + + ? + + diff --git a/doc/sc3-description.png b/docs/source/images/tuto-masterworkers-result.png similarity index 100% rename from doc/sc3-description.png rename to docs/source/images/tuto-masterworkers-result.png diff --git a/docs/source/index.rst b/docs/source/index.rst index 9b50a14aeb..0be8937cbc 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -24,7 +24,6 @@ Welcome to SimGrid's documentation! 
:caption: API Reference: API - S4U Indices and tables ================== diff --git a/docs/source/install_yours.rst b/docs/source/install_yours.rst index 920bbbff71..2258c4c68d 100644 --- a/docs/source/install_yours.rst +++ b/docs/source/install_yours.rst @@ -138,7 +138,9 @@ modify the examples directly but instead create your own project in eclipse. This will make it easier to upgrade to another version of SimGrid. -Troubleshooting your project setup +.. _install_yours_troubleshooting: + +Troubleshooting your Project Setup ---------------------------------- Library not found diff --git a/docs/source/usecase_algorithms.rst b/docs/source/usecase_algorithms.rst new file mode 100644 index 0000000000..6a15168afd --- /dev/null +++ b/docs/source/usecase_algorithms.rst @@ -0,0 +1,429 @@ +.. _usecase_simalgo: + +Simulating Algorithms +===================== + +SimGrid was conceived as a tool to study distributed algorithms. Its +modern S4U interface makes it easy to assess Cloud, P2P, HPC, IoT and +similar settings. + +A typical SimGrid simulation is composed of several **Actors** +|api_s4u_Actor|_ , that execute user-provided functions. The actors +have to explicitly use the S4U interface to express their computation, +communication, disk usage and other **Activities** |api_s4u_Activity|_ +, so that they get reflected within the simulator. These activities +take place on **Resources** (CPUs, links, disks). SimGrid predicts the +time taken by each activity and orchestrates accordingly the actors +waiting for the completion of these activities. + +.. |api_s4u_Actor| image:: /images/extlink.png + :align: middle + :width: 12 +.. _api_s4u_Actor: api/classsimgrid_1_1s4u_1_1Actor.html#class-documentation + +.. |api_s4u_Activity| image:: /images/extlink.png + :align: middle + :width: 12 +.. 
_api_s4u_Activity: api/classsimgrid_1_1s4u_1_1Activity.html#class-documentation + + +Each actor executes a user-provided function on a simulated **Host** +|api_s4u_Host|_ with which it can interact. Communications are not +directly sent to actors, but posted onto **Mailboxes** +|api_s4u_Mailbox|_ that serve as rendez-vous points between +communicating processes. + +.. |api_s4u_Host| image:: /images/extlink.png + :align: middle + :width: 12 +.. _api_s4u_Host: api/classsimgrid_1_1s4u_1_1Host.html#class-documentation + +.. |api_s4u_Mailbox| image:: /images/extlink.png + :align: middle + :width: 12 +.. _api_s4u_Mailbox: api/classsimgrid_1_1s4u_1_1Mailbox.html#class-documentation + + +Discover the Master/Workers +--------------------------- + +This section introduces a first example of SimGrid simulation. This +simple application is composed of two kinds of actors: the **master** +is in charge of distributing some computational tasks to a set of +**workers** that execute them. + +.. image:: /images/tuto-masterworkers-intro.svg + :align: center + +We first present a round-robin version of this application, where the +master dispatches the tasks to the workers, one after the other, until +all tasks are dispatched. Later in this tutorial, you will be given +the opportunity to improve this scheme. + +The Actors +.......... + +Let's start with the code of the master. It is represented by the +*master* function below. This simple function takes 4 parameters, +given as a vector of strings: + + - the number of workers managed by the master. + - the number of tasks to dispatch + - the computational size (in flops to compute) of each task + - the communication size (in bytes to exchange) of each task + +Then, the tasks are sent one after the other, each on a mailbox named +"worker-XXX" where XXX is the number of an existing worker. On the +other side, a given worker (whose code is given below) waits for +incoming tasks on its own mailbox.
Notice how this mailbox mechanism +allows the actors to find each other without having all information: +the master does not have to know the actors nor even where they are, it +simply pushes the messages to a mailbox whose name is predetermined. + +At the end, once all tasks are dispatched, the master dispatches +another task per worker, but this time with a negative amount of flops +to compute. Indeed, this application decided by convention, that the +workers should stop when encountering such a negative compute_size. + +At the end of the day, the only SimGrid specific functions used in +this example are :cpp:func:`simgrid::s4u::Mailbox::by_name` and +:cpp:func:`simgrid::s4u::Mailbox::put`. Also, :c:macro:`XBT_INFO` is used +as a replacement to printf() or to cout to ensure that the messages +are nicely logged along with the simulated time and actor name. + + +.. literalinclude:: ../../examples/s4u/app-masterworkers/s4u-app-masterworkers-fun.cpp + :language: c++ + :start-after: master-begin + :end-before: master-end + +Here comes the code of the worker actors. This function expects only one +parameter from its vector of strings: its identifier so that it knows +on which mailbox its incoming tasks will arrive. Its code is very +simple: as long as it gets valid computation requests (whose +compute_amount is positive), it computes this task and waits for the +next one. + +.. literalinclude:: ../../examples/s4u/app-masterworkers/s4u-app-masterworkers-fun.cpp + :language: c++ + :start-after: worker-begin + :end-before: worker-end + +Starting the Simulation +....................... + +And this is it. In only a few lines, we defined the algorithm of our +master/workers example. Well, this is true, but an algorithm alone is +not enough to define a simulation. + +First, SimGrid is a library, not a program. So you need to define your +own `main()` function, as follows.
This function is in charge of +creating a SimGrid simulation engine (on line 3), registering the actor +functions to the engine (on lines 7 and 8), loading the virtual platform +from its description file (on line 11), mapping actors onto that platform +(on line 12) and running the simulation until its completion on line 15. + +.. literalinclude:: ../../examples/s4u/app-masterworkers/s4u-app-masterworkers-fun.cpp + :language: c++ + :start-after: main-begin + :end-before: main-end + :linenos: + +After that, the missing pieces are the platform and deployment +files. + +Platform File +............. + +Platform files define the virtual platform on which the provided +application will take place. It contains one or several **Network +Zones** |api_s4u_NetZone|_ that contain both **Host-** |api_s4u_Host|_ +and **Link-** |api_s4u_Link|_ Resources, as well as routing +information. + +Such files can get rather long and boring, so the example below is +only an excerpt of the full ``examples/platforms/small_platform.xml`` +file. For example, most routing information is missing, and only the +route between the hosts Tremblay and Fafard is given. This path +traverses 6 links (4, 3, 2, 0, 1 and 8). The full file, along with +other examples, can be found in the archive under +``examples/platforms``. + +.. |api_s4u_NetZone| image:: /images/extlink.png + :align: middle + :width: 12 +.. _api_s4u_NetZone: api/classsimgrid_1_1s4u_1_1NetZone.html#class-documentation + +.. |api_s4u_Link| image:: /images/extlink.png + :align: middle + :width: 12 +.. _api_s4u_Link: api/classsimgrid_1_1s4u_1_1Link.html#class-documentation + +.. literalinclude:: ../../examples/platforms/small_platform.xml + :language: xml + :lines: 1-10,12-20,56-63,192- + :caption: (excerpts of the small_platform.xml file) + +Deployment File +............... + +Deployment files specify the execution scenario: they list the actors +that should be started, along with their parameters.
In the following +example, we start 6 actors: one master and 5 workers. + +.. literalinclude:: ../../examples/s4u/app-masterworkers/s4u-app-masterworkers_d.xml + :language: xml + +Execution Example +................. + +This time, we have all parts: once the program is compiled, we can +execute it as follows. Note how the XBT_INFO() requests turned into +informative messages. + +.. literalinclude:: ../../examples/s4u/app-masterworkers/s4u-app-masterworkers.tesh + :language: shell + :start-after: s4u-app-masterworkers-fun + :prepend: $$$ ./masterworkers platform.xml deploy.xml + :append: $$$ + :dedent: 2 + + +Improve it Yourself +------------------- + +In this section, you will modify the example presented earlier to +explore the quality of the proposed algorithm. For now, it works and +the simulation prints things, but the truth is that we have no idea +whether this is a good algorithm to dispatch tasks to the workers. +This very simple setting raises many interesting questions: + +.. image:: /images/tuto-masterworkers-question.svg + :align: center + +- Which algorithm should the master use? Or should the workers decide + by themselves? + + Round Robin is not an efficient algorithm when all tasks are not + processed at the same speed. It would probably be more efficient + if the workers were asking for tasks when ready. + +- Should tasks be grouped in batches or sent separately? + + The workers will starve if they don't get the tasks fast + enough. One possibility to reduce latency would be to send tasks + in pools instead of one by one. But if the pools are too big, the + load balancing will likely get uneven, in particular when + distributing the last tasks. + +- How does the quality of such an algorithm depend on the platform + characteristics and on the task characteristics? + + Whenever the input communication time is very small compared to + processing time and workers are homogeneous, it is likely that the + round-robin algorithm performs very well.
Would it still hold true + when transfer time is not negligible? What if some tasks are + performed faster on some specific nodes? + +- The network topology interconnecting the master and the workers + may be quite complicated. How does such a topology impact the + previous result? + + When data transfers are the bottleneck, it is likely that a good + modeling of the platform becomes essential. The SimGrid platform + models are particularly handy to account for complex platform + topologies. + +- What is the best applicative topology? + + Is a flat master worker deployment sufficient? Should we go for a + hierarchical algorithm, with some forwarders taking large pools of + tasks from the master, each of them distributing their tasks to a + sub-pool of workers? Or should we introduce super-peers, + duplicating the master's role in a peer-to-peer manner? Do the + algorithms require a perfect knowledge of the network? + +- How sensitive is such an algorithm to external workload variation? + + What if bandwidth, latency and computing speed can vary with no + warning? Shouldn't you study whether your algorithm is sensitive + to such load variations? + +- Although an algorithm may be more efficient than another, how does + it interfere with unrelated applications executing on the same + facilities? + +**SimGrid was invented to answer such questions.** Do not believe the +fools saying that all you need to study such settings is a simple +discrete event simulator. Do you really want to reinvent the wheel, +debug your own tool, optimize it and validate its models against real +settings for ages, or do you prefer to sit on the shoulders of a +giant? With SimGrid, you can focus on your algorithm. The whole +simulation mechanism is already working. + +Here is the visualization of a SimGrid simulation of two master worker +applications (one in light gray and the other in dark gray) running +concurrently and showing resource usage over a long period of time.
It +was obtained with the Triva software. + +.. image:: /images/tuto-masterworkers-result.png + :align: center + +Prerequisite +............ + +Before you proceed, you need to :ref:`install SimGrid `, a +C++ compiler and also ``pajeng`` to visualize the traces. The provided +code template requires cmake to compile. On Debian and Ubuntu for +example, you can get them as follows: + +.. code-block:: shell + + sudo apt install simgrid pajeng cmake g++ + +An initial version of the source code is provided on framagit. This +template compiles with cmake. If SimGrid is correctly installed, you +should be able to clone the `repository +<https://framagit.org/simgrid/simgrid-template-s4u>`_ and recompile +everything as follows: + +.. code-block:: shell + + git clone git@framagit.org:simgrid/simgrid-template-s4u.git + cd simgrid-template-s4u/ + cmake . + make + +If you struggle with the compilation, then you should double check +your :ref:`SimGrid installation `. If needed, please refer to +the :ref:`Troubleshooting your Project Setup +<install_yours_troubleshooting>` section. + +Discovering the Provided Code +............................. + +Please compile and execute the provided simulator as follows: + + +.. code-block:: shell + + make master-workers + ./master-workers small_platform.xml master-workers_d.xml + +For a more "fancy" output, you can use simgrid-colorizer. + +.. code-block:: shell + + ./master-workers small_platform.xml master-workers_d.xml 2>&1 | simgrid-colorizer + +If you installed SimGrid to a non-standard path, you may have to +specify the full path to simgrid-colorizer on the above line, such as +``/opt/simgrid/bin/simgrid-colorizer``. If you did not install it at all, +you can find it in /bin/colorize. + +.. todo:: + + Explain how to generate a Gantt-Chart with S4U and pajeng. + +Exercise 1: Simplifying the deployment file +...........................................
+ +In the provided example, the deployment file is tightly connected to +the platform file ``small_platform.xml`` and adding more workers +quickly becomes a pain: You need to start them (at the bottom of the +file), and to inform the master that they are available by increasing +the right parameter. + +Instead, modify the simulator ``master-workers.c`` into +``master-workers-exo1.c`` so that the master launches a worker process +on `all` the other machines at startup. The new deployment file should +be as simple as: + +.. code-block:: xml + + + + + + + + + + + +Creating the workers from the master +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +For that, the master needs to retrieve the list of hosts declared in +the platform with :cpp:func:`simgrid::s4u::Engine::get_all_hosts()`. +Then, the master should start the worker processes with +:cpp:func:`simgrid::s4u::Actor::create`. + +``Actor::create(name, host, func, params...)`` is a very flexible +function. Its third parameter is the function that the actor should +execute. This function can take any kind of parameter, provided that +you pass similar parameters to ``Actor::create()``. For example, you +could have something like this: + +.. code-block:: cpp + + void my_actor(int param1, double param2, std::string param3) { + ... + } + int main(int argc, char** argv) { + ... + simgrid::s4u::ActorPtr actor; + actor = simgrid::s4u::Actor::create("name", simgrid::s4u::Host::by_name("the_host"), + &my_actor, 42, 3.14, "thevalue"); + ... + } + + +Master-Workers Communication +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Previously, the workers got from their parameter the name of the +mailbox they should use. We can still do so: the master should build +such a parameter before using it in the ``Actor::create()`` call. The +master could even directly pass the mailbox as a parameter to the +workers.
+ +Since we want later to study concurrent applications, it is advised to +use a mailbox name that is unique over the simulation even if there is +more than one master. + +One possibility for that is to use the actor ID (aid) of each worker +as a mailbox name. The master can retrieve the aid of the newly +created actor with ``actor->get_pid()`` while the actor itself can +retrieve its own aid with ``simgrid::s4u::this_actor::get_pid()``. +The retrieved value is an ``aid_t``, which is an alias for ``long``. + +Instead of having one mailbox per worker, you could also reorganize +completely your application to have only one mailbox per master. All +the workers of a given master would pull their work from the same +mailbox, which should be passed as parameter to the workers. This +reduces the amount of mailboxes, but prevents the master from taking +any scheduling decision. It really depends on how you want to organize +your application and what you want to study with your simulator. + +Wrap up +^^^^^^^ + +In this exercise, we reduced the amount of configuration that our +simulator requests. This is both a good idea, and a dangerous +trend. This simplification is an application of the good old DRY/SPOT +programming principle (Don't Repeat Yourself / Single Point Of Truth +-- `more on wikipedia +`_), and you +really want your programming artefacts to follow these software +engineering principles. + +But at the same time, you should be careful in separating your +scientific contribution (the master/workers algorithm) and the +artefacts used to test it (platform, deployment and workload). This is +why SimGrid forces you to express your platform and deployment files +in XML instead of using a programming interface: it forces a clear +separation of concerns between things of very different nature. + + +.. 
LocalWords: SimGrid diff --git a/examples/msg/README.doc b/examples/msg/README.doc index 203c2495d2..53ea4d7f2a 100644 --- a/examples/msg/README.doc +++ b/examples/msg/README.doc @@ -1,5 +1,5 @@ -This file follows the Doxygen syntax to be included in the -documentation, but it should remain readable directly. +// This file follows the Doxygen syntax to be included in the +// documentation, but it should remain readable directly. /** @defgroup msg_examples MSG examples @@ -114,12 +114,12 @@ TODO: show the XML files instead if it's what is interesting. On a "XML example */ -As a human, you can stop reading at this point. The rest is garbage: - -Every example must be listed in the following, but it's not possible -to move this content upper as each @example directive seems to eat -everything until the next */ marker (and the content is placed at the -top of the example file). +// As a human, you can stop reading at this point. The rest is garbage: +// +// Every example must be listed in the following, but it's not possible +// to move this content upper as each @example directive seems to eat +// everything until the next */ marker (and the content is placed at the +// top of the example file). 
/** diff --git a/examples/msg/platform-failures/platform-failures.c b/examples/msg/platform-failures/platform-failures.c index 7771e50925..f1d5d5baa3 100644 --- a/examples/msg/platform-failures/platform-failures.c +++ b/examples/msg/platform-failures/platform-failures.c @@ -25,13 +25,13 @@ static int master(int argc, char *argv[]) for (i = 0; i < number_of_tasks; i++) { char mailbox[256]; snprintf(mailbox, 255, "worker-%ld", i % workers_count); - + XBT_INFO("Send a message to %s", mailbox); msg_task_t task = MSG_task_create("Task", task_comp_size, task_comm_size, xbt_new0(double, 1)); *((double *) task->data) = MSG_get_clock(); switch ( MSG_task_send_with_timeout(task,mailbox,10.0) ) { case MSG_OK: - XBT_INFO("Send completed"); + XBT_INFO("Send to %s completed", mailbox); break; case MSG_HOST_FAILURE: @@ -103,21 +103,20 @@ static int worker(int argc, char *argv[]) while (1) { double time1 = MSG_get_clock(); msg_task_t task = NULL; + XBT_INFO("Waiting a message on %s", mailbox); int retcode = MSG_task_receive( &(task), mailbox); double time2 = MSG_get_clock(); if (retcode == MSG_OK) { - XBT_INFO("Received \"%s\"", MSG_task_get_name(task)); if (MSG_task_get_data(task) == FINALIZE) { MSG_task_destroy(task); break; } if (time1 < *((double *) task->data)) time1 = *((double *) task->data); - XBT_INFO("Communication time : \"%f\"", time2 - time1); - XBT_INFO("Processing \"%s\"", MSG_task_get_name(task)); + XBT_INFO("Start execution..."); retcode = MSG_task_execute(task); if (retcode == MSG_OK) { - XBT_INFO("\"%s\" done", MSG_task_get_name(task)); + XBT_INFO("Execution complete."); free(task->data); MSG_task_destroy(task); } else if (retcode == MSG_HOST_FAILURE) { diff --git a/examples/msg/platform-failures/platform-failures.tesh b/examples/msg/platform-failures/platform-failures.tesh index a169ce4015..33043dbc08 100644 --- a/examples/msg/platform-failures/platform-failures.tesh +++ b/examples/msg/platform-failures/platform-failures.tesh @@ -6,94 +6,106 @@ p Testing a 
simple master/worker example application handling failures TCP cross $ $SG_TEST_EXENV ${bindir:=.}/platform-failures$EXEEXT --log=xbt_cfg.thres:critical --log=no_loc ${platfdir}/small_platform_with_failures.xml ${srcdir}/../app-masterworker/app-masterworker_d.xml --cfg=path:${srcdir} --cfg=network/crosstraffic:0 "--log=root.fmt:[%10.6r]%e(%i:%P@%h)%e%m%n" > [ 0.000000] (0:maestro@) Cannot launch process 'worker' on failed host 'Fafard' > [ 0.000000] (1:master@Tremblay) Got 5 workers and 20 tasks to process -> [ 0.010309] (1:master@Tremblay) Send completed -> [ 0.010309] (2:worker@Tremblay) Received "Task" -> [ 0.010309] (2:worker@Tremblay) Communication time : "0.010309" -> [ 0.010309] (2:worker@Tremblay) Processing "Task" +> [ 0.000000] (1:master@Tremblay) Send a message to worker-0 +> [ 0.010309] (1:master@Tremblay) Send to worker-0 completed +> [ 0.010309] (2:worker@Tremblay) Start execution... +> [ 0.000000] (2:worker@Tremblay) Waiting a message on worker-0 +> [ 0.000000] (3:worker@Jupiter) Waiting a message on worker-1 +> [ 0.000000] (4:worker@Ginette) Waiting a message on worker-3 +> [ 0.000000] (5:worker@Bourassa) Waiting a message on worker-4 +> [ 0.010309] (1:master@Tremblay) Send a message to worker-1 > [ 1.000000] (0:maestro@) Restart processes on host Fafard +> [ 1.000000] (6:worker@Fafard) Waiting a message on worker-2 > [ 1.000000] (1:master@Tremblay) Mmh. Something went wrong with 'worker-1'. Nevermind. Let's keep going! +> [ 1.000000] (1:master@Tremblay) Send a message to worker-2 > [ 1.000000] (3:worker@Jupiter) Gloups. The cpu on which I'm running just turned off!. See you! +> [ 2.000000] (1:master@Tremblay) Mmh. Something went wrong with 'worker-2'. Nevermind. Let's keep going! +> [ 2.000000] (6:worker@Fafard) Gloups. The cpu on which I'm running just turned off!. See you! > [ 2.000000] (0:maestro@) Restart processes on host Jupiter -> [ 2.010309] (2:worker@Tremblay) "Task" done -> [ 11.000000] (1:master@Tremblay) Mmh. 
Got timeouted while speaking to 'worker-2'. Nevermind. Let's keep going! -> [ 12.030928] (1:master@Tremblay) Send completed -> [ 12.030928] (4:worker@Ginette) Received "Task" -> [ 12.030928] (4:worker@Ginette) Communication time : "1.030928" -> [ 12.030928] (4:worker@Ginette) Processing "Task" -> [ 13.061856] (1:master@Tremblay) Send completed -> [ 13.061856] (5:worker@Bourassa) Received "Task" -> [ 13.061856] (5:worker@Bourassa) Communication time : "1.030928" -> [ 13.061856] (5:worker@Bourassa) Processing "Task" -> [ 13.072165] (1:master@Tremblay) Send completed -> [ 13.072165] (2:worker@Tremblay) Received "Task" -> [ 13.072165] (2:worker@Tremblay) Communication time : "0.010309" -> [ 13.072165] (2:worker@Tremblay) Processing "Task" -> [ 14.030928] (4:worker@Ginette) "Task" done -> [ 14.103093] (1:master@Tremblay) Send completed -> [ 14.103093] (6:worker@Jupiter) Received "Task" -> [ 14.103093] (6:worker@Jupiter) Communication time : "1.030928" -> [ 14.103093] (6:worker@Jupiter) Processing "Task" -> [ 15.061856] (5:worker@Bourassa) "Task" done -> [ 15.072165] (2:worker@Tremblay) "Task" done -> [ 16.103093] (6:worker@Jupiter) "Task" done -> [ 24.103093] (1:master@Tremblay) Mmh. Got timeouted while speaking to 'worker-2'. Nevermind. Let's keep going! -> [ 24.103093] (1:master@Tremblay) Mmh. Something went wrong with 'worker-3'. Nevermind. Let's keep going! -> [ 24.103093] (4:worker@Ginette) Mmh. Something went wrong. Nevermind. Let's keep going! 
-> [ 25.134021] (1:master@Tremblay) Send completed -> [ 25.134021] (5:worker@Bourassa) Received "Task" -> [ 25.134021] (5:worker@Bourassa) Communication time : "1.030928" -> [ 25.134021] (5:worker@Bourassa) Processing "Task" -> [ 25.144330] (1:master@Tremblay) Send completed -> [ 25.144330] (2:worker@Tremblay) Received "Task" -> [ 25.144330] (2:worker@Tremblay) Communication time : "0.010309" -> [ 25.144330] (2:worker@Tremblay) Processing "Task" -> [ 26.175258] (1:master@Tremblay) Send completed -> [ 26.175258] (6:worker@Jupiter) Received "Task" -> [ 26.175258] (6:worker@Jupiter) Communication time : "1.030928" -> [ 26.175258] (6:worker@Jupiter) Processing "Task" -> [ 27.134021] (5:worker@Bourassa) "Task" done -> [ 27.144330] (2:worker@Tremblay) "Task" done -> [ 28.175258] (6:worker@Jupiter) "Task" done -> [ 36.175258] (1:master@Tremblay) Mmh. Got timeouted while speaking to 'worker-2'. Nevermind. Let's keep going! -> [ 37.206186] (1:master@Tremblay) Send completed -> [ 37.206186] (1:master@Tremblay) Mmh. Something went wrong with 'worker-4'. Nevermind. Let's keep going! -> [ 37.206186] (4:worker@Ginette) Received "Task" -> [ 37.206186] (4:worker@Ginette) Communication time : "1.030928" -> [ 37.206186] (4:worker@Ginette) Processing "Task" -> [ 37.206186] (5:worker@Bourassa) Mmh. Something went wrong. Nevermind. Let's keep going! -> [ 37.216495] (1:master@Tremblay) Send completed -> [ 37.216495] (2:worker@Tremblay) Received "Task" -> [ 37.216495] (2:worker@Tremblay) Communication time : "0.010309" -> [ 37.216495] (2:worker@Tremblay) Processing "Task" -> [ 38.247423] (1:master@Tremblay) Send completed -> [ 38.247423] (6:worker@Jupiter) Received "Task" -> [ 38.247423] (6:worker@Jupiter) Communication time : "1.030928" -> [ 38.247423] (6:worker@Jupiter) Processing "Task" -> [ 39.206186] (4:worker@Ginette) "Task" done -> [ 39.216495] (2:worker@Tremblay) "Task" done -> [ 40.247423] (6:worker@Jupiter) "Task" done -> [ 48.247423] (1:master@Tremblay) Mmh. 
Got timeouted while speaking to 'worker-2'. Nevermind. Let's keep going! -> [ 49.278351] (1:master@Tremblay) Send completed -> [ 49.278351] (4:worker@Ginette) Received "Task" -> [ 49.278351] (4:worker@Ginette) Communication time : "1.030928" -> [ 49.278351] (4:worker@Ginette) Processing "Task" -> [ 50.000000] (4:worker@Ginette) Gloups. The cpu on which I'm running just turned off!. See you! -> [ 50.309278] (1:master@Tremblay) Send completed -> [ 50.309278] (1:master@Tremblay) All tasks have been dispatched. Let's tell everybody the computation is over. -> [ 50.309278] (2:worker@Tremblay) Received "finalize" -> [ 50.309278] (2:worker@Tremblay) I'm done. See you! -> [ 50.309278] (5:worker@Bourassa) Received "Task" -> [ 50.309278] (5:worker@Bourassa) Communication time : "1.030928" -> [ 50.309278] (5:worker@Bourassa) Processing "Task" -> [ 50.309278] (6:worker@Jupiter) Received "finalize" -> [ 50.309278] (6:worker@Jupiter) I'm done. See you! -> [ 51.309278] (1:master@Tremblay) Mmh. Got timeouted while speaking to 'worker-2'. Nevermind. Let's keep going! -> [ 52.309278] (0:maestro@) Simulation time 52.3093 -> [ 52.309278] (1:master@Tremblay) Mmh. Got timeouted while speaking to 'worker-3'. Nevermind. Let's keep going! -> [ 52.309278] (1:master@Tremblay) Goodbye now! -> [ 52.309278] (5:worker@Bourassa) "Task" done -> [ 52.309278] (5:worker@Bourassa) Received "finalize" -> [ 52.309278] (5:worker@Bourassa) I'm done. See you! +> [ 2.000000] (1:master@Tremblay) Send a message to worker-3 +> [ 2.000000] (7:worker@Jupiter) Waiting a message on worker-1 +> [ 2.010309] (2:worker@Tremblay) Execution complete. +> [ 2.010309] (2:worker@Tremblay) Waiting a message on worker-0 +> [ 3.030928] (1:master@Tremblay) Send to worker-3 completed +> [ 3.030928] (1:master@Tremblay) Send a message to worker-4 +> [ 3.030928] (4:worker@Ginette) Start execution... 
+> [ 4.061856] (1:master@Tremblay) Send to worker-4 completed +> [ 4.061856] (1:master@Tremblay) Send a message to worker-0 +> [ 4.061856] (5:worker@Bourassa) Start execution... +> [ 4.072165] (1:master@Tremblay) Send to worker-0 completed +> [ 4.072165] (1:master@Tremblay) Send a message to worker-1 +> [ 4.072165] (2:worker@Tremblay) Start execution... +> [ 5.030928] (4:worker@Ginette) Execution complete. +> [ 5.030928] (4:worker@Ginette) Waiting a message on worker-3 +> [ 5.103093] (1:master@Tremblay) Send to worker-1 completed +> [ 5.103093] (1:master@Tremblay) Send a message to worker-2 +> [ 5.103093] (7:worker@Jupiter) Start execution... +> [ 6.061856] (5:worker@Bourassa) Execution complete. +> [ 6.061856] (5:worker@Bourassa) Waiting a message on worker-4 +> [ 6.072165] (2:worker@Tremblay) Execution complete. +> [ 6.072165] (2:worker@Tremblay) Waiting a message on worker-0 +> [ 7.103093] (7:worker@Jupiter) Execution complete. +> [ 7.103093] (7:worker@Jupiter) Waiting a message on worker-1 +> [ 15.103093] (1:master@Tremblay) Mmh. Got timeouted while speaking to 'worker-2'. Nevermind. Let's keep going! +> [ 15.103093] (1:master@Tremblay) Send a message to worker-3 +> [ 15.103093] (1:master@Tremblay) Mmh. Something went wrong with 'worker-3'. Nevermind. Let's keep going! +> [ 15.103093] (1:master@Tremblay) Send a message to worker-4 +> [ 15.103093] (4:worker@Ginette) Mmh. Something went wrong. Nevermind. Let's keep going! +> [ 15.103093] (4:worker@Ginette) Waiting a message on worker-3 +> [ 16.134021] (1:master@Tremblay) Send to worker-4 completed +> [ 16.134021] (1:master@Tremblay) Send a message to worker-0 +> [ 16.134021] (5:worker@Bourassa) Start execution... +> [ 16.144330] (1:master@Tremblay) Send to worker-0 completed +> [ 16.144330] (1:master@Tremblay) Send a message to worker-1 +> [ 16.144330] (2:worker@Tremblay) Start execution... 
+> [ 17.175258] (1:master@Tremblay) Send to worker-1 completed +> [ 17.175258] (1:master@Tremblay) Send a message to worker-2 +> [ 17.175258] (7:worker@Jupiter) Start execution... +> [ 18.134021] (5:worker@Bourassa) Execution complete. +> [ 18.134021] (5:worker@Bourassa) Waiting a message on worker-4 +> [ 18.144330] (2:worker@Tremblay) Execution complete. +> [ 18.144330] (2:worker@Tremblay) Waiting a message on worker-0 +> [ 19.175258] (7:worker@Jupiter) Execution complete. +> [ 19.175258] (7:worker@Jupiter) Waiting a message on worker-1 +> [ 27.175258] (1:master@Tremblay) Mmh. Got timeouted while speaking to 'worker-2'. Nevermind. Let's keep going! +> [ 27.175258] (1:master@Tremblay) Send a message to worker-3 +> [ 28.206186] (1:master@Tremblay) Send to worker-3 completed +> [ 28.206186] (1:master@Tremblay) Send a message to worker-4 +> [ 28.206186] (1:master@Tremblay) Mmh. Something went wrong with 'worker-4'. Nevermind. Let's keep going! +> [ 28.206186] (1:master@Tremblay) Send a message to worker-0 +> [ 28.206186] (4:worker@Ginette) Start execution... +> [ 28.206186] (5:worker@Bourassa) Mmh. Something went wrong. Nevermind. Let's keep going! +> [ 28.206186] (5:worker@Bourassa) Waiting a message on worker-4 +> [ 28.216495] (1:master@Tremblay) Send to worker-0 completed +> [ 28.216495] (1:master@Tremblay) Send a message to worker-1 +> [ 28.216495] (2:worker@Tremblay) Start execution... +> [ 29.247423] (1:master@Tremblay) Send to worker-1 completed +> [ 29.247423] (1:master@Tremblay) Send a message to worker-2 +> [ 29.247423] (7:worker@Jupiter) Start execution... +> [ 30.206186] (4:worker@Ginette) Execution complete. +> [ 30.206186] (4:worker@Ginette) Waiting a message on worker-3 +> [ 30.216495] (2:worker@Tremblay) Execution complete. +> [ 30.216495] (2:worker@Tremblay) Waiting a message on worker-0 +> [ 31.247423] (7:worker@Jupiter) Execution complete. +> [ 31.247423] (7:worker@Jupiter) Waiting a message on worker-1 +> [ 39.247423] (1:master@Tremblay) Mmh. 
Got timeouted while speaking to 'worker-2'. Nevermind. Let's keep going! +> [ 39.247423] (1:master@Tremblay) Send a message to worker-3 +> [ 40.278351] (1:master@Tremblay) Send to worker-3 completed +> [ 40.278351] (1:master@Tremblay) Send a message to worker-4 +> [ 40.278351] (4:worker@Ginette) Start execution... +> [ 41.000000] (4:worker@Ginette) Gloups. The cpu on which I'm running just turned off!. See you! +> [ 41.309278] (1:master@Tremblay) Send to worker-4 completed +> [ 41.309278] (1:master@Tremblay) All tasks have been dispatched. Let's tell everybody the computation is over. +> [ 41.309278] (2:worker@Tremblay) I'm done. See you! +> [ 41.309278] (5:worker@Bourassa) Start execution... +> [ 41.309278] (7:worker@Jupiter) I'm done. See you! +> [ 42.309278] (1:master@Tremblay) Mmh. Got timeouted while speaking to 'worker-2'. Nevermind. Let's keep going! +> [ 43.309278] (0:maestro@) Simulation time 43.3093 +> [ 43.309278] (1:master@Tremblay) Mmh. Got timeouted while speaking to 'worker-3'. Nevermind. Let's keep going! +> [ 43.309278] (1:master@Tremblay) Goodbye now! +> [ 43.309278] (5:worker@Bourassa) Execution complete. +> [ 43.309278] (5:worker@Bourassa) Waiting a message on worker-4 +> [ 43.309278] (5:worker@Bourassa) I'm done. See you! p Testing a simple master/worker example application handling failures. TCP crosstraffic ENABLED @@ -101,186 +113,210 @@ p Testing a simple master/worker example application handling failures. 
TCP cros $ $SG_TEST_EXENV ${bindir:=.}/platform-failures$EXEEXT --log=xbt_cfg.thres:critical --log=no_loc ${platfdir}/small_platform_with_failures.xml ${srcdir}/../app-masterworker/app-masterworker_d.xml --cfg=path:${srcdir} "--log=root.fmt:[%10.6r]%e(%i:%P@%h)%e%m%n" > [ 0.000000] (0:maestro@) Cannot launch process 'worker' on failed host 'Fafard' > [ 0.000000] (1:master@Tremblay) Got 5 workers and 20 tasks to process -> [ 0.010825] (1:master@Tremblay) Send completed -> [ 0.010825] (2:worker@Tremblay) Received "Task" -> [ 0.010825] (2:worker@Tremblay) Communication time : "0.010825" -> [ 0.010825] (2:worker@Tremblay) Processing "Task" +> [ 0.000000] (1:master@Tremblay) Send a message to worker-0 +> [ 0.000000] (2:worker@Tremblay) Waiting a message on worker-0 +> [ 0.000000] (3:worker@Jupiter) Waiting a message on worker-1 +> [ 0.000000] (4:worker@Ginette) Waiting a message on worker-3 +> [ 0.000000] (5:worker@Bourassa) Waiting a message on worker-4 +> [ 0.010825] (2:worker@Tremblay) Start execution... +> [ 0.010825] (1:master@Tremblay) Send to worker-0 completed +> [ 0.010825] (1:master@Tremblay) Send a message to worker-1 > [ 1.000000] (0:maestro@) Restart processes on host Fafard +> [ 1.000000] (6:worker@Fafard) Waiting a message on worker-2 > [ 1.000000] (1:master@Tremblay) Mmh. Something went wrong with 'worker-1'. Nevermind. Let's keep going! +> [ 1.000000] (1:master@Tremblay) Send a message to worker-2 > [ 1.000000] (3:worker@Jupiter) Gloups. The cpu on which I'm running just turned off!. See you! > [ 2.000000] (0:maestro@) Restart processes on host Jupiter -> [ 2.010825] (2:worker@Tremblay) "Task" done -> [ 11.000000] (1:master@Tremblay) Mmh. Got timeouted while speaking to 'worker-2'. Nevermind. Let's keep going! 
-> [ 12.082474] (1:master@Tremblay) Send completed -> [ 12.082474] (4:worker@Ginette) Received "Task" -> [ 12.082474] (4:worker@Ginette) Communication time : "1.082474" -> [ 12.082474] (4:worker@Ginette) Processing "Task" -> [ 13.164948] (1:master@Tremblay) Send completed -> [ 13.164948] (5:worker@Bourassa) Received "Task" -> [ 13.164948] (5:worker@Bourassa) Communication time : "1.082474" -> [ 13.164948] (5:worker@Bourassa) Processing "Task" -> [ 13.175773] (1:master@Tremblay) Send completed -> [ 13.175773] (2:worker@Tremblay) Received "Task" -> [ 13.175773] (2:worker@Tremblay) Communication time : "0.010825" -> [ 13.175773] (2:worker@Tremblay) Processing "Task" -> [ 14.082474] (4:worker@Ginette) "Task" done -> [ 14.258247] (1:master@Tremblay) Send completed -> [ 14.258247] (6:worker@Jupiter) Received "Task" -> [ 14.258247] (6:worker@Jupiter) Communication time : "1.082474" -> [ 14.258247] (6:worker@Jupiter) Processing "Task" -> [ 15.164948] (5:worker@Bourassa) "Task" done -> [ 15.175773] (2:worker@Tremblay) "Task" done -> [ 16.258247] (6:worker@Jupiter) "Task" done -> [ 24.258247] (1:master@Tremblay) Mmh. Got timeouted while speaking to 'worker-2'. Nevermind. Let's keep going! -> [ 24.258247] (1:master@Tremblay) Mmh. Something went wrong with 'worker-3'. Nevermind. Let's keep going! -> [ 24.258247] (4:worker@Ginette) Mmh. Something went wrong. Nevermind. Let's keep going! 
-> [ 25.340722] (1:master@Tremblay) Send completed -> [ 25.340722] (5:worker@Bourassa) Received "Task" -> [ 25.340722] (5:worker@Bourassa) Communication time : "1.082474" -> [ 25.340722] (5:worker@Bourassa) Processing "Task" -> [ 25.351546] (1:master@Tremblay) Send completed -> [ 25.351546] (2:worker@Tremblay) Received "Task" -> [ 25.351546] (2:worker@Tremblay) Communication time : "0.010825" -> [ 25.351546] (2:worker@Tremblay) Processing "Task" -> [ 26.434021] (1:master@Tremblay) Send completed -> [ 26.434021] (6:worker@Jupiter) Received "Task" -> [ 26.434021] (6:worker@Jupiter) Communication time : "1.082474" -> [ 26.434021] (6:worker@Jupiter) Processing "Task" -> [ 27.340722] (5:worker@Bourassa) "Task" done -> [ 27.351546] (2:worker@Tremblay) "Task" done -> [ 28.434021] (6:worker@Jupiter) "Task" done -> [ 36.434021] (1:master@Tremblay) Mmh. Got timeouted while speaking to 'worker-2'. Nevermind. Let's keep going! -> [ 37.516495] (1:master@Tremblay) Send completed -> [ 37.516495] (1:master@Tremblay) Mmh. Something went wrong with 'worker-4'. Nevermind. Let's keep going! -> [ 37.516495] (4:worker@Ginette) Received "Task" -> [ 37.516495] (4:worker@Ginette) Communication time : "1.082474" -> [ 37.516495] (4:worker@Ginette) Processing "Task" -> [ 37.516495] (5:worker@Bourassa) Mmh. Something went wrong. Nevermind. Let's keep going! -> [ 37.527320] (1:master@Tremblay) Send completed -> [ 37.527320] (2:worker@Tremblay) Received "Task" -> [ 37.527320] (2:worker@Tremblay) Communication time : "0.010825" -> [ 37.527320] (2:worker@Tremblay) Processing "Task" -> [ 38.609794] (1:master@Tremblay) Send completed -> [ 38.609794] (6:worker@Jupiter) Received "Task" -> [ 38.609794] (6:worker@Jupiter) Communication time : "1.082474" -> [ 38.609794] (6:worker@Jupiter) Processing "Task" -> [ 39.516495] (4:worker@Ginette) "Task" done -> [ 39.527320] (2:worker@Tremblay) "Task" done -> [ 40.609794] (6:worker@Jupiter) "Task" done -> [ 48.609794] (1:master@Tremblay) Mmh. 
Got timeouted while speaking to 'worker-2'. Nevermind. Let's keep going! -> [ 49.692268] (1:master@Tremblay) Send completed -> [ 49.692268] (4:worker@Ginette) Received "Task" -> [ 49.692268] (4:worker@Ginette) Communication time : "1.082474" -> [ 49.692268] (4:worker@Ginette) Processing "Task" -> [ 50.000000] (4:worker@Ginette) Gloups. The cpu on which I'm running just turned off!. See you! -> [ 50.774742] (1:master@Tremblay) Send completed -> [ 50.774742] (1:master@Tremblay) All tasks have been dispatched. Let's tell everybody the computation is over. -> [ 50.774742] (2:worker@Tremblay) Received "finalize" -> [ 50.774742] (2:worker@Tremblay) I'm done. See you! -> [ 50.774742] (5:worker@Bourassa) Received "Task" -> [ 50.774742] (5:worker@Bourassa) Communication time : "1.082474" -> [ 50.774742] (5:worker@Bourassa) Processing "Task" -> [ 50.774742] (6:worker@Jupiter) Received "finalize" -> [ 50.774742] (6:worker@Jupiter) I'm done. See you! -> [ 51.774742] (1:master@Tremblay) Mmh. Got timeouted while speaking to 'worker-2'. Nevermind. Let's keep going! -> [ 52.774742] (0:maestro@) Simulation time 52.7747 -> [ 52.774742] (1:master@Tremblay) Mmh. Got timeouted while speaking to 'worker-3'. Nevermind. Let's keep going! -> [ 52.774742] (1:master@Tremblay) Goodbye now! -> [ 52.774742] (5:worker@Bourassa) "Task" done -> [ 52.774742] (5:worker@Bourassa) Received "finalize" -> [ 52.774742] (5:worker@Bourassa) I'm done. See you! +> [ 2.000000] (7:worker@Jupiter) Waiting a message on worker-1 +> [ 2.000000] (1:master@Tremblay) Mmh. Something went wrong with 'worker-2'. Nevermind. Let's keep going! +> [ 2.000000] (1:master@Tremblay) Send a message to worker-3 +> [ 2.000000] (6:worker@Fafard) Gloups. The cpu on which I'm running just turned off!. See you! +> [ 2.010825] (2:worker@Tremblay) Execution complete. +> [ 2.010825] (2:worker@Tremblay) Waiting a message on worker-0 +> [ 3.082474] (4:worker@Ginette) Start execution... 
+> [ 3.082474] (1:master@Tremblay) Send to worker-3 completed +> [ 3.082474] (1:master@Tremblay) Send a message to worker-4 +> [ 4.164948] (5:worker@Bourassa) Start execution... +> [ 4.164948] (1:master@Tremblay) Send to worker-4 completed +> [ 4.164948] (1:master@Tremblay) Send a message to worker-0 +> [ 4.175773] (2:worker@Tremblay) Start execution... +> [ 4.175773] (1:master@Tremblay) Send to worker-0 completed +> [ 4.175773] (1:master@Tremblay) Send a message to worker-1 +> [ 5.082474] (4:worker@Ginette) Execution complete. +> [ 5.082474] (4:worker@Ginette) Waiting a message on worker-3 +> [ 5.258247] (7:worker@Jupiter) Start execution... +> [ 5.258247] (1:master@Tremblay) Send to worker-1 completed +> [ 5.258247] (1:master@Tremblay) Send a message to worker-2 +> [ 6.164948] (5:worker@Bourassa) Execution complete. +> [ 6.164948] (5:worker@Bourassa) Waiting a message on worker-4 +> [ 6.175773] (2:worker@Tremblay) Execution complete. +> [ 6.175773] (2:worker@Tremblay) Waiting a message on worker-0 +> [ 7.258247] (7:worker@Jupiter) Execution complete. +> [ 7.258247] (7:worker@Jupiter) Waiting a message on worker-1 +> [ 15.258247] (1:master@Tremblay) Mmh. Got timeouted while speaking to 'worker-2'. Nevermind. Let's keep going! +> [ 15.258247] (1:master@Tremblay) Send a message to worker-3 +> [ 15.258247] (4:worker@Ginette) Mmh. Something went wrong. Nevermind. Let's keep going! +> [ 15.258247] (4:worker@Ginette) Waiting a message on worker-3 +> [ 15.258247] (1:master@Tremblay) Mmh. Something went wrong with 'worker-3'. Nevermind. Let's keep going! +> [ 15.258247] (1:master@Tremblay) Send a message to worker-4 +> [ 16.340722] (5:worker@Bourassa) Start execution... +> [ 16.340722] (1:master@Tremblay) Send to worker-4 completed +> [ 16.340722] (1:master@Tremblay) Send a message to worker-0 +> [ 16.351546] (2:worker@Tremblay) Start execution... 
+> [ 16.351546] (1:master@Tremblay) Send to worker-0 completed +> [ 16.351546] (1:master@Tremblay) Send a message to worker-1 +> [ 17.434021] (7:worker@Jupiter) Start execution... +> [ 17.434021] (1:master@Tremblay) Send to worker-1 completed +> [ 17.434021] (1:master@Tremblay) Send a message to worker-2 +> [ 18.340722] (5:worker@Bourassa) Execution complete. +> [ 18.340722] (5:worker@Bourassa) Waiting a message on worker-4 +> [ 18.351546] (2:worker@Tremblay) Execution complete. +> [ 18.351546] (2:worker@Tremblay) Waiting a message on worker-0 +> [ 19.434021] (7:worker@Jupiter) Execution complete. +> [ 19.434021] (7:worker@Jupiter) Waiting a message on worker-1 +> [ 27.434021] (1:master@Tremblay) Mmh. Got timeouted while speaking to 'worker-2'. Nevermind. Let's keep going! +> [ 27.434021] (1:master@Tremblay) Send a message to worker-3 +> [ 28.516495] (4:worker@Ginette) Start execution... +> [ 28.516495] (1:master@Tremblay) Send to worker-3 completed +> [ 28.516495] (1:master@Tremblay) Send a message to worker-4 +> [ 28.516495] (5:worker@Bourassa) Mmh. Something went wrong. Nevermind. Let's keep going! +> [ 28.516495] (5:worker@Bourassa) Waiting a message on worker-4 +> [ 28.516495] (1:master@Tremblay) Mmh. Something went wrong with 'worker-4'. Nevermind. Let's keep going! +> [ 28.516495] (1:master@Tremblay) Send a message to worker-0 +> [ 28.527320] (2:worker@Tremblay) Start execution... +> [ 28.527320] (1:master@Tremblay) Send to worker-0 completed +> [ 28.527320] (1:master@Tremblay) Send a message to worker-1 +> [ 29.609794] (7:worker@Jupiter) Start execution... +> [ 29.609794] (1:master@Tremblay) Send to worker-1 completed +> [ 29.609794] (1:master@Tremblay) Send a message to worker-2 +> [ 30.516495] (4:worker@Ginette) Execution complete. +> [ 30.516495] (4:worker@Ginette) Waiting a message on worker-3 +> [ 30.527320] (2:worker@Tremblay) Execution complete. 
+> [ 30.527320] (2:worker@Tremblay) Waiting a message on worker-0 +> [ 31.609794] (7:worker@Jupiter) Execution complete. +> [ 31.609794] (7:worker@Jupiter) Waiting a message on worker-1 +> [ 39.609794] (1:master@Tremblay) Mmh. Got timeouted while speaking to 'worker-2'. Nevermind. Let's keep going! +> [ 39.609794] (1:master@Tremblay) Send a message to worker-3 +> [ 40.692268] (4:worker@Ginette) Start execution... +> [ 40.692268] (1:master@Tremblay) Send to worker-3 completed +> [ 40.692268] (1:master@Tremblay) Send a message to worker-4 +> [ 41.000000] (4:worker@Ginette) Gloups. The cpu on which I'm running just turned off!. See you! +> [ 41.774742] (5:worker@Bourassa) Start execution... +> [ 41.774742] (1:master@Tremblay) Send to worker-4 completed +> [ 41.774742] (1:master@Tremblay) All tasks have been dispatched. Let's tell everybody the computation is over. +> [ 41.774742] (2:worker@Tremblay) I'm done. See you! +> [ 41.774742] (7:worker@Jupiter) I'm done. See you! +> [ 42.774742] (1:master@Tremblay) Mmh. Got timeouted while speaking to 'worker-2'. Nevermind. Let's keep going! +> [ 43.774742] (5:worker@Bourassa) Execution complete. +> [ 43.774742] (5:worker@Bourassa) Waiting a message on worker-4 +> [ 43.774742] (1:master@Tremblay) Mmh. Got timeouted while speaking to 'worker-3'. Nevermind. Let's keep going! +> [ 43.774742] (5:worker@Bourassa) I'm done. See you! +> [ 43.774742] (1:master@Tremblay) Goodbye now! +> [ 43.774742] (0:maestro@) Simulation time 43.7747 p Testing a simple master/worker example application handling failures. CPU_TI optimization enabled ! 
output sort 19 -$ $SG_TEST_EXENV ${bindir:=.}/platform-failures$EXEEXT --log=xbt_cfg.thres:critical --log=no_loc ${platfdir}/small_platform_with_failures.xml ${srcdir}/../app-masterworker/app-masterworker_d.xml --cfg=path:${srcdir} -cfg=cpu/optim:TI "--log=root.fmt:[%10.6r]%e(%i:%P@%h)%e%m%n" +$ $SG_TEST_EXENV ${bindir:=.}/platform-failures$EXEEXT --log=xbt_cfg.thres:critical --log=no_loc ${platfdir}/small_platform_with_failures.xml ${srcdir}/../app-masterworker/app-masterworker_d.xml --cfg=path:${srcdir} --cfg=cpu/optim:TI "--log=root.fmt:[%10.6r]%e(%i:%P@%h)%e%m%n" > [ 0.000000] (0:maestro@) Cannot launch process 'worker' on failed host 'Fafard' > [ 0.000000] (1:master@Tremblay) Got 5 workers and 20 tasks to process -> [ 0.010825] (1:master@Tremblay) Send completed -> [ 0.010825] (2:worker@Tremblay) Received "Task" -> [ 0.010825] (2:worker@Tremblay) Communication time : "0.010825" -> [ 0.010825] (2:worker@Tremblay) Processing "Task" +> [ 0.000000] (1:master@Tremblay) Send a message to worker-0 +> [ 0.000000] (2:worker@Tremblay) Waiting a message on worker-0 +> [ 0.000000] (3:worker@Jupiter) Waiting a message on worker-1 +> [ 0.000000] (4:worker@Ginette) Waiting a message on worker-3 +> [ 0.000000] (5:worker@Bourassa) Waiting a message on worker-4 +> [ 0.010825] (1:master@Tremblay) Send to worker-0 completed +> [ 0.010825] (1:master@Tremblay) Send a message to worker-1 +> [ 0.010825] (2:worker@Tremblay) Start execution... > [ 1.000000] (0:maestro@) Restart processes on host Fafard > [ 1.000000] (1:master@Tremblay) Mmh. Something went wrong with 'worker-1'. Nevermind. Let's keep going! +> [ 1.000000] (1:master@Tremblay) Send a message to worker-2 > [ 1.000000] (3:worker@Jupiter) Gloups. The cpu on which I'm running just turned off!. See you! +> [ 1.000000] (6:worker@Fafard) Waiting a message on worker-2 > [ 2.000000] (0:maestro@) Restart processes on host Jupiter -> [ 2.010825] (2:worker@Tremblay) "Task" done -> [ 11.000000] (1:master@Tremblay) Mmh. 
Got timeouted while speaking to 'worker-2'. Nevermind. Let's keep going! -> [ 12.082474] (1:master@Tremblay) Send completed -> [ 12.082474] (4:worker@Ginette) Received "Task" -> [ 12.082474] (4:worker@Ginette) Communication time : "1.082474" -> [ 12.082474] (4:worker@Ginette) Processing "Task" -> [ 13.164948] (1:master@Tremblay) Send completed -> [ 13.164948] (5:worker@Bourassa) Received "Task" -> [ 13.164948] (5:worker@Bourassa) Communication time : "1.082474" -> [ 13.164948] (5:worker@Bourassa) Processing "Task" -> [ 13.175773] (1:master@Tremblay) Send completed -> [ 13.175773] (2:worker@Tremblay) Received "Task" -> [ 13.175773] (2:worker@Tremblay) Communication time : "0.010825" -> [ 13.175773] (2:worker@Tremblay) Processing "Task" -> [ 14.082474] (4:worker@Ginette) "Task" done -> [ 14.258247] (1:master@Tremblay) Send completed -> [ 14.258247] (6:worker@Jupiter) Received "Task" -> [ 14.258247] (6:worker@Jupiter) Communication time : "1.082474" -> [ 14.258247] (6:worker@Jupiter) Processing "Task" -> [ 15.164948] (5:worker@Bourassa) "Task" done -> [ 15.175773] (2:worker@Tremblay) "Task" done -> [ 16.258247] (6:worker@Jupiter) "Task" done -> [ 24.258247] (1:master@Tremblay) Mmh. Got timeouted while speaking to 'worker-2'. Nevermind. Let's keep going! -> [ 24.258247] (1:master@Tremblay) Mmh. Something went wrong with 'worker-3'. Nevermind. Let's keep going! -> [ 24.258247] (4:worker@Ginette) Mmh. Something went wrong. Nevermind. Let's keep going! 
-> [ 25.340722] (1:master@Tremblay) Send completed -> [ 25.340722] (5:worker@Bourassa) Received "Task" -> [ 25.340722] (5:worker@Bourassa) Communication time : "1.082474" -> [ 25.340722] (5:worker@Bourassa) Processing "Task" -> [ 25.351546] (1:master@Tremblay) Send completed -> [ 25.351546] (2:worker@Tremblay) Received "Task" -> [ 25.351546] (2:worker@Tremblay) Communication time : "0.010825" -> [ 25.351546] (2:worker@Tremblay) Processing "Task" -> [ 26.434021] (1:master@Tremblay) Send completed -> [ 26.434021] (6:worker@Jupiter) Received "Task" -> [ 26.434021] (6:worker@Jupiter) Communication time : "1.082474" -> [ 26.434021] (6:worker@Jupiter) Processing "Task" -> [ 27.340722] (5:worker@Bourassa) "Task" done -> [ 27.351546] (2:worker@Tremblay) "Task" done -> [ 28.434021] (6:worker@Jupiter) "Task" done -> [ 36.434021] (1:master@Tremblay) Mmh. Got timeouted while speaking to 'worker-2'. Nevermind. Let's keep going! -> [ 37.516495] (1:master@Tremblay) Send completed -> [ 37.516495] (1:master@Tremblay) Mmh. Something went wrong with 'worker-4'. Nevermind. Let's keep going! -> [ 37.516495] (4:worker@Ginette) Received "Task" -> [ 37.516495] (4:worker@Ginette) Communication time : "1.082474" -> [ 37.516495] (4:worker@Ginette) Processing "Task" -> [ 37.516495] (5:worker@Bourassa) Mmh. Something went wrong. Nevermind. Let's keep going! -> [ 37.527320] (1:master@Tremblay) Send completed -> [ 37.527320] (2:worker@Tremblay) Received "Task" -> [ 37.527320] (2:worker@Tremblay) Communication time : "0.010825" -> [ 37.527320] (2:worker@Tremblay) Processing "Task" -> [ 38.609794] (1:master@Tremblay) Send completed -> [ 38.609794] (6:worker@Jupiter) Received "Task" -> [ 38.609794] (6:worker@Jupiter) Communication time : "1.082474" -> [ 38.609794] (6:worker@Jupiter) Processing "Task" -> [ 39.516495] (4:worker@Ginette) "Task" done -> [ 39.527320] (2:worker@Tremblay) "Task" done -> [ 40.609794] (6:worker@Jupiter) "Task" done -> [ 48.609794] (1:master@Tremblay) Mmh. 
Got timeouted while speaking to 'worker-2'. Nevermind. Let's keep going! -> [ 49.692268] (1:master@Tremblay) Send completed -> [ 49.692268] (4:worker@Ginette) Received "Task" -> [ 49.692268] (4:worker@Ginette) Communication time : "1.082474" -> [ 49.692268] (4:worker@Ginette) Processing "Task" -> [ 50.000000] (4:worker@Ginette) Gloups. The cpu on which I'm running just turned off!. See you! -> [ 50.774742] (1:master@Tremblay) Send completed -> [ 50.774742] (1:master@Tremblay) All tasks have been dispatched. Let's tell everybody the computation is over. -> [ 50.774742] (2:worker@Tremblay) Received "finalize" -> [ 50.774742] (2:worker@Tremblay) I'm done. See you! -> [ 50.774742] (5:worker@Bourassa) Received "Task" -> [ 50.774742] (5:worker@Bourassa) Communication time : "1.082474" -> [ 50.774742] (5:worker@Bourassa) Processing "Task" -> [ 50.774742] (6:worker@Jupiter) Received "finalize" -> [ 50.774742] (6:worker@Jupiter) I'm done. See you! -> [ 51.774742] (1:master@Tremblay) Mmh. Got timeouted while speaking to 'worker-2'. Nevermind. Let's keep going! -> [ 52.774742] (0:maestro@) Simulation time 52.7747 -> [ 52.774742] (1:master@Tremblay) Mmh. Got timeouted while speaking to 'worker-3'. Nevermind. Let's keep going! -> [ 52.774742] (1:master@Tremblay) Goodbye now! -> [ 52.774742] (5:worker@Bourassa) "Task" done -> [ 52.774742] (5:worker@Bourassa) Received "finalize" -> [ 52.774742] (5:worker@Bourassa) I'm done. See you! +> [ 2.000000] (1:master@Tremblay) Mmh. Something went wrong with 'worker-2'. Nevermind. Let's keep going! +> [ 2.000000] (1:master@Tremblay) Send a message to worker-3 +> [ 2.000000] (6:worker@Fafard) Gloups. The cpu on which I'm running just turned off!. See you! +> [ 2.000000] (7:worker@Jupiter) Waiting a message on worker-1 +> [ 2.010825] (2:worker@Tremblay) Execution complete. 
+> [ 2.010825] (2:worker@Tremblay) Waiting a message on worker-0 +> [ 3.082474] (1:master@Tremblay) Send to worker-3 completed +> [ 3.082474] (1:master@Tremblay) Send a message to worker-4 +> [ 3.082474] (4:worker@Ginette) Start execution... +> [ 4.164948] (1:master@Tremblay) Send to worker-4 completed +> [ 4.164948] (1:master@Tremblay) Send a message to worker-0 +> [ 4.164948] (5:worker@Bourassa) Start execution... +> [ 4.175773] (1:master@Tremblay) Send to worker-0 completed +> [ 4.175773] (1:master@Tremblay) Send a message to worker-1 +> [ 4.175773] (2:worker@Tremblay) Start execution... +> [ 5.082474] (4:worker@Ginette) Execution complete. +> [ 5.082474] (4:worker@Ginette) Waiting a message on worker-3 +> [ 5.258247] (1:master@Tremblay) Send to worker-1 completed +> [ 5.258247] (1:master@Tremblay) Send a message to worker-2 +> [ 5.258247] (7:worker@Jupiter) Start execution... +> [ 6.164948] (5:worker@Bourassa) Execution complete. +> [ 6.164948] (5:worker@Bourassa) Waiting a message on worker-4 +> [ 6.175773] (2:worker@Tremblay) Execution complete. +> [ 6.175773] (2:worker@Tremblay) Waiting a message on worker-0 +> [ 7.258247] (7:worker@Jupiter) Execution complete. +> [ 7.258247] (7:worker@Jupiter) Waiting a message on worker-1 +> [ 15.258247] (1:master@Tremblay) Mmh. Got timeouted while speaking to 'worker-2'. Nevermind. Let's keep going! +> [ 15.258247] (1:master@Tremblay) Send a message to worker-3 +> [ 15.258247] (1:master@Tremblay) Mmh. Something went wrong with 'worker-3'. Nevermind. Let's keep going! +> [ 15.258247] (1:master@Tremblay) Send a message to worker-4 +> [ 15.258247] (4:worker@Ginette) Mmh. Something went wrong. Nevermind. Let's keep going! +> [ 15.258247] (4:worker@Ginette) Waiting a message on worker-3 +> [ 16.340722] (1:master@Tremblay) Send to worker-4 completed +> [ 16.340722] (1:master@Tremblay) Send a message to worker-0 +> [ 16.340722] (5:worker@Bourassa) Start execution... 
+> [ 16.351546] (1:master@Tremblay) Send to worker-0 completed +> [ 16.351546] (1:master@Tremblay) Send a message to worker-1 +> [ 16.351546] (2:worker@Tremblay) Start execution... +> [ 17.434021] (1:master@Tremblay) Send to worker-1 completed +> [ 17.434021] (1:master@Tremblay) Send a message to worker-2 +> [ 17.434021] (7:worker@Jupiter) Start execution... +> [ 18.340722] (5:worker@Bourassa) Execution complete. +> [ 18.340722] (5:worker@Bourassa) Waiting a message on worker-4 +> [ 18.351546] (2:worker@Tremblay) Execution complete. +> [ 18.351546] (2:worker@Tremblay) Waiting a message on worker-0 +> [ 19.434021] (7:worker@Jupiter) Execution complete. +> [ 19.434021] (7:worker@Jupiter) Waiting a message on worker-1 +> [ 27.434021] (1:master@Tremblay) Mmh. Got timeouted while speaking to 'worker-2'. Nevermind. Let's keep going! +> [ 27.434021] (1:master@Tremblay) Send a message to worker-3 +> [ 28.516495] (1:master@Tremblay) Send to worker-3 completed +> [ 28.516495] (1:master@Tremblay) Send a message to worker-4 +> [ 28.516495] (1:master@Tremblay) Mmh. Something went wrong with 'worker-4'. Nevermind. Let's keep going! +> [ 28.516495] (1:master@Tremblay) Send a message to worker-0 +> [ 28.516495] (4:worker@Ginette) Start execution... +> [ 28.516495] (5:worker@Bourassa) Mmh. Something went wrong. Nevermind. Let's keep going! +> [ 28.516495] (5:worker@Bourassa) Waiting a message on worker-4 +> [ 28.527320] (1:master@Tremblay) Send to worker-0 completed +> [ 28.527320] (1:master@Tremblay) Send a message to worker-1 +> [ 28.527320] (2:worker@Tremblay) Start execution... +> [ 29.609794] (1:master@Tremblay) Send to worker-1 completed +> [ 29.609794] (1:master@Tremblay) Send a message to worker-2 +> [ 29.609794] (7:worker@Jupiter) Start execution... +> [ 30.516495] (4:worker@Ginette) Execution complete. +> [ 30.516495] (4:worker@Ginette) Waiting a message on worker-3 +> [ 30.527320] (2:worker@Tremblay) Execution complete. 
+> [ 30.527320] (2:worker@Tremblay) Waiting a message on worker-0 +> [ 31.609794] (7:worker@Jupiter) Execution complete. +> [ 31.609794] (7:worker@Jupiter) Waiting a message on worker-1 +> [ 39.609794] (1:master@Tremblay) Mmh. Got timeouted while speaking to 'worker-2'. Nevermind. Let's keep going! +> [ 39.609794] (1:master@Tremblay) Send a message to worker-3 +> [ 40.692268] (1:master@Tremblay) Send to worker-3 completed +> [ 40.692268] (1:master@Tremblay) Send a message to worker-4 +> [ 40.692268] (4:worker@Ginette) Start execution... +> [ 41.000000] (4:worker@Ginette) Gloups. The cpu on which I'm running just turned off!. See you! +> [ 41.774742] (1:master@Tremblay) Send to worker-4 completed +> [ 41.774742] (1:master@Tremblay) All tasks have been dispatched. Let's tell everybody the computation is over. +> [ 41.774742] (2:worker@Tremblay) I'm done. See you! +> [ 41.774742] (5:worker@Bourassa) Start execution... +> [ 41.774742] (7:worker@Jupiter) I'm done. See you! +> [ 42.774742] (1:master@Tremblay) Mmh. Got timeouted while speaking to 'worker-2'. Nevermind. Let's keep going! +> [ 43.774742] (0:maestro@) Simulation time 43.7747 +> [ 43.774742] (1:master@Tremblay) Mmh. Got timeouted while speaking to 'worker-3'. Nevermind. Let's keep going! +> [ 43.774742] (1:master@Tremblay) Goodbye now! +> [ 43.774742] (5:worker@Bourassa) Execution complete. +> [ 43.774742] (5:worker@Bourassa) Waiting a message on worker-4 +> [ 43.774742] (5:worker@Bourassa) I'm done. See you! 
diff --git a/examples/platforms/trace/ginette_state.trace b/examples/platforms/trace/ginette_state.trace index e3fbe4028b..108bce3cf1 100644 --- a/examples/platforms/trace/ginette_state.trace +++ b/examples/platforms/trace/ginette_state.trace @@ -1,2 +1,2 @@ -50 0 -60 1 +41 0 +50 1 diff --git a/examples/platforms/trace/link3_state.trace b/examples/platforms/trace/link3_state.trace index 8cad9d0248..aa52029ac3 100644 --- a/examples/platforms/trace/link3_state.trace +++ b/examples/platforms/trace/link3_state.trace @@ -1,4 +1,6 @@ 13 0 14 1 +15 0 +16 1 20 0 25 1 diff --git a/examples/platforms/trace/link4_state.trace b/examples/platforms/trace/link4_state.trace index 006aa06fe7..fab0dd2e22 100644 --- a/examples/platforms/trace/link4_state.trace +++ b/examples/platforms/trace/link4_state.trace @@ -1,2 +1,2 @@ -35 0 -40 1 +25 0 +30 1 diff --git a/examples/s4u/CMakeLists.txt b/examples/s4u/CMakeLists.txt index c8a987ec4f..c33de9ee87 100644 --- a/examples/s4u/CMakeLists.txt +++ b/examples/s4u/CMakeLists.txt @@ -32,7 +32,6 @@ foreach(variant fun class) endforeach() set(tesh_files ${tesh_files} ${CMAKE_CURRENT_SOURCE_DIR}/app-masterworkers/s4u-app-masterworkers.tesh) - # CHORD EXAMPLE add_executable (s4u-dht-chord dht-chord/s4u-dht-chord.cpp dht-chord/s4u-dht-chord-node.cpp) target_link_libraries(s4u-dht-chord simgrid) @@ -102,7 +101,7 @@ foreach(example actor-create actor-daemon actor-join actor-kill energy-exec energy-boot energy-link energy-vm engine-filtering exec-async exec-basic exec-dvfs exec-monitor exec-ptask exec-remote - platform-properties plugin-hostload mutex + platform-failures platform-properties plugin-hostload mutex io-async io-file-system io-file-remote io-storage-raw replay-comm replay-storage routing-get-clusters diff --git a/examples/s4u/README.doc b/examples/s4u/README.doc index 9b302e5cf5..3d564cadff 100644 --- a/examples/s4u/README.doc +++ b/examples/s4u/README.doc @@ -1,12 +1,12 @@ -S4U (Simgrid for you) is the next interface of SimGrid, expected 
to be released with SimGrid 4.0. - -Even if it is not completely rock stable yet, it may well already fit -your needs. You are welcome to try it and report any interface -glitches that you see. Be however warned that the interface may change -until the final release. You will have to adapt your code on the way. - -This file follows the Doxygen syntax to be included in the -documentation, but it should remain readable directly. +// S4U (Simgrid for you) is the next interface of SimGrid, expected to be released with SimGrid 4.0. +// +// Even if it is not completely rock stable yet, it may well already fit +// your needs. You are welcome to try it and report any interface +// glitches that you see. Be however warned that the interface may change +// until the final release. You will have to adapt your code on the way. +// +// This file follows the Doxygen syntax to be included in the +// documentation, but it should remain readable directly. /** @defgroup s4u_examples S4U examples @@ -286,9 +286,14 @@ than the previous examples. Shows how to implement a classical communication pattern, where a token is exchanged along a ring to reach every participant. - - Master Workers: @ref examples/s4u/app-masterworker/s4u-app-masterworker.cpp @n + - Master Workers: @ref examples/s4u/app-masterworkers/s4u-app-masterworkers-class.cpp + @ref examples/s4u/app-masterworkers/s4u-app-masterworkers-fun.cpp @n Another good old example, where one Master process has a bunch of task to dispatch to a set of several Worker - processes. + processes. This example comes in two equivalent variants, one + where the actors are specified as simple functions (which is easier to + understand for newcomers) and one where the actors are specified + as classes (which is more powerful for the users wanting to build + their own projects upon the example). @subsection s4u_ex_app_data Data diffusion @@ -324,7 +329,8 @@ than the previous examples. 
@example examples/s4u/async-waitany/s4u-async-waitany.cpp @example examples/s4u/app-bittorrent/s4u-bittorrent.cpp @example examples/s4u/app-chainsend/s4u-app-chainsend.cpp -@example examples/s4u/app-masterworker/s4u-app-masterworker.cpp +@example examples/s4u/app-masterworkers/s4u-app-masterworkers-class.cpp +@example examples/s4u/app-masterworkers/s4u-app-masterworkers-fun.cpp @example examples/s4u/app-pingpong/s4u-app-pingpong.cpp @example examples/s4u/app-token-ring/s4u-app-token-ring.cpp @example examples/s4u/dht-chord/s4u-dht-chord.cpp diff --git a/examples/s4u/app-masterworkers/s4u-app-masterworkers.tesh b/examples/s4u/app-masterworkers/s4u-app-masterworkers.tesh index a0f32d6956..206dd72563 100644 --- a/examples/s4u/app-masterworkers/s4u-app-masterworkers.tesh +++ b/examples/s4u/app-masterworkers/s4u-app-masterworkers.tesh @@ -63,4 +63,3 @@ $ $SG_TEST_EXENV ${bindir:=.}/s4u-app-masterworkers-fun$EXEEXT ${platfdir}/small > [ 4.965689] (worker@Ginette) Exiting now. > [ 5.133855] (maestro@) Simulation is over > [ 5.133855] (worker@Bourassa) Exiting now. 
- diff --git a/examples/s4u/app-masterworkers/s4u-app-masterworkers_d.xml b/examples/s4u/app-masterworkers/s4u-app-masterworkers_d.xml index 3d93a78b7e..29fddcaef6 100644 --- a/examples/s4u/app-masterworkers/s4u-app-masterworkers_d.xml +++ b/examples/s4u/app-masterworkers/s4u-app-masterworkers_d.xml @@ -9,19 +9,19 @@ - + - + - + - + - + diff --git a/examples/s4u/platform-failures/s4u-platform-failures.cpp b/examples/s4u/platform-failures/s4u-platform-failures.cpp index 941ea2aba8..8b54502938 100644 --- a/examples/s4u/platform-failures/s4u-platform-failures.cpp +++ b/examples/s4u/platform-failures/s4u-platform-failures.cpp @@ -25,8 +25,9 @@ static int master(int argc, char* argv[]) mailbox = simgrid::s4u::Mailbox::by_name(std::string("worker-") + std::to_string(i % workers_count)); double* payload = new double(comp_size); try { + XBT_INFO("Send a message to %s", mailbox->get_cname()); mailbox->put(payload, comm_size, 10.0); - XBT_INFO("Send completed"); + XBT_INFO("Send to %s completed", mailbox->get_cname()); } catch (xbt_ex& e) { switch (e.category) { case host_error: @@ -84,8 +85,27 @@ static int worker(int argc, char* argv[]) double comp_size = -1; while (1) { try { + XBT_INFO("Waiting a message on %s", mailbox->get_cname()); payload = static_cast(mailbox->get()); comp_size = *payload; + xbt_assert(payload != nullptr, "mailbox->get() failed"); + if (comp_size < 0) { /* - Exit when -1.0 is received */ + XBT_INFO("I'm done. See you!"); + break; + } + /* - Otherwise, process the task */ + try { + XBT_INFO("Start execution..."); + simgrid::s4u::this_actor::execute(comp_size); + XBT_INFO("Execution complete."); + } catch (xbt_ex& e) { + if (e.category == host_error) { + XBT_INFO("Gloups. The cpu on which I'm running just turned off!. 
See you!"); + return -1; + } else + xbt_die("Unexpected behavior"); + } + delete payload; } catch (xbt_ex& e) { switch (e.category) { @@ -99,23 +119,7 @@ static int worker(int argc, char* argv[]) xbt_die("Unexpected behavior"); } } - xbt_assert(payload != nullptr, "mailbox->get() failed"); - if (comp_size < 0) { /* - Exit when -1.0 is received */ - XBT_INFO("I'm done. See you!"); - break; - } - /* - Otherwise, process the task */ - try { - simgrid::s4u::this_actor::execute(comp_size); - } catch (xbt_ex& e) { - if (e.category == host_error) { - XBT_INFO("Gloups. The cpu on which I'm running just turned off!. See you!"); - return -1; - } else - xbt_die("Unexpected behavior"); - } } - XBT_INFO("I'm done. See you!"); return 0; } diff --git a/examples/s4u/platform-failures/s4u-platform-failures.tesh b/examples/s4u/platform-failures/s4u-platform-failures.tesh index 5738a23318..4b365e854f 100644 --- a/examples/s4u/platform-failures/s4u-platform-failures.tesh +++ b/examples/s4u/platform-failures/s4u-platform-failures.tesh @@ -6,37 +6,210 @@ p Testing a simple master/worker example application handling failures TCP cross $ $SG_TEST_EXENV ${bindir:=.}/s4u-platform-failures$EXEEXT --log=xbt_cfg.thres:critical --log=no_loc ${platfdir}/small_platform_with_failures.xml ${bindir}/../app-masterworker/s4u-app-masterworker_d.xml --cfg=path:${srcdir} --cfg=network/crosstraffic:0 "--log=root.fmt:[%10.6r]%e(%i:%P@%h)%e%m%n" > [ 0.000000] (0:maestro@) Cannot launch process 'worker' on failed host 'Fafard' > [ 0.000000] (1:master@Tremblay) Got 5 workers and 20 tasks to process -> [ 0.010309] (1:master@Tremblay) Send completed +> [ 0.000000] (1:master@Tremblay) Send a message to worker-0 +> [ 0.010309] (1:master@Tremblay) Send to worker-0 completed +> [ 0.010309] (2:worker@Tremblay) Start execution... 
+> [ 0.000000] (2:worker@Tremblay) Waiting a message on worker-0 +> [ 0.000000] (3:worker@Jupiter) Waiting a message on worker-1 +> [ 0.000000] (4:worker@Ginette) Waiting a message on worker-3 +> [ 0.000000] (5:worker@Bourassa) Waiting a message on worker-4 +> [ 0.010309] (1:master@Tremblay) Send a message to worker-1 > [ 1.000000] (0:maestro@) Restart processes on host Fafard +> [ 1.000000] (6:worker@Fafard) Waiting a message on worker-2 > [ 1.000000] (1:master@Tremblay) Mmh. Something went wrong with 'worker-1'. Nevermind. Let's keep going! +> [ 1.000000] (1:master@Tremblay) Send a message to worker-2 > [ 1.000000] (3:worker@Jupiter) Gloups. The cpu on which I'm running just turned off!. See you! +> [ 2.000000] (1:master@Tremblay) Mmh. Something went wrong with 'worker-2'. Nevermind. Let's keep going! +> [ 2.000000] (6:worker@Fafard) Gloups. The cpu on which I'm running just turned off!. See you! > [ 2.000000] (0:maestro@) Restart processes on host Jupiter -> [ 11.000000] (1:master@Tremblay) Mmh. Got timeouted while speaking to 'worker-2'. Nevermind. Let's keep going! -> [ 12.030928] (1:master@Tremblay) Send completed -> [ 13.061856] (1:master@Tremblay) Send completed -> [ 13.072165] (1:master@Tremblay) Send completed -> [ 14.103093] (1:master@Tremblay) Send completed -> [ 24.103093] (1:master@Tremblay) Mmh. Got timeouted while speaking to 'worker-2'. Nevermind. Let's keep going! -> [ 24.103093] (1:master@Tremblay) Mmh. Something went wrong with 'worker-3'. Nevermind. Let's keep going! -> [ 24.103093] (4:worker@Ginette) Mmh. Something went wrong. Nevermind. Let's keep going! -> [ 25.134021] (1:master@Tremblay) Send completed -> [ 25.144330] (1:master@Tremblay) Send completed -> [ 26.175258] (1:master@Tremblay) Send completed -> [ 36.175258] (1:master@Tremblay) Mmh. Got timeouted while speaking to 'worker-2'. Nevermind. Let's keep going! -> [ 37.206186] (1:master@Tremblay) Send completed -> [ 37.206186] (1:master@Tremblay) Mmh. 
Something went wrong with 'worker-4'. Nevermind. Let's keep going! -> [ 37.206186] (5:worker@Bourassa) Mmh. Something went wrong. Nevermind. Let's keep going! -> [ 38.247423] (1:master@Tremblay) Send completed -> [ 48.247423] (1:master@Tremblay) Mmh. Got timeouted while speaking to 'worker-2'. Nevermind. Let's keep going! -> [ 49.278351] (1:master@Tremblay) Send completed -> [ 50.000000] (4:worker@Ginette) Gloups. The cpu on which I'm running just turned off!. See you! -> [ 50.309278] (1:master@Tremblay) Send completed -> [ 50.309278] (1:master@Tremblay) All tasks have been dispatched. Let's tell everybody the computation is over. -> [ 50.309278] (2:worker@Tremblay) I'm done. See you! -> [ 50.309278] (6:worker@Jupiter) I'm done. See you! -> [ 51.309278] (1:master@Tremblay) Mmh. Got timeouted while speaking to 'worker-2'. Nevermind. Let's keep going! -> [ 52.309278] (0:maestro@) Simulation time 52.3093 -> [ 52.309278] (1:master@Tremblay) Mmh. Got timeouted while speaking to 'worker-3'. Nevermind. Let's keep going! -> [ 52.309278] (1:master@Tremblay) Goodbye now! -> [ 52.309278] (5:worker@Bourassa) I'm done. See you! +> [ 2.000000] (1:master@Tremblay) Send a message to worker-3 +> [ 2.000000] (7:worker@Jupiter) Waiting a message on worker-1 +> [ 2.010309] (2:worker@Tremblay) Execution complete. +> [ 2.010309] (2:worker@Tremblay) Waiting a message on worker-0 +> [ 3.030928] (1:master@Tremblay) Send to worker-3 completed +> [ 3.030928] (1:master@Tremblay) Send a message to worker-4 +> [ 3.030928] (4:worker@Ginette) Start execution... +> [ 4.061856] (1:master@Tremblay) Send to worker-4 completed +> [ 4.061856] (1:master@Tremblay) Send a message to worker-0 +> [ 4.061856] (5:worker@Bourassa) Start execution... +> [ 4.072165] (1:master@Tremblay) Send to worker-0 completed +> [ 4.072165] (1:master@Tremblay) Send a message to worker-1 +> [ 4.072165] (2:worker@Tremblay) Start execution... +> [ 5.030928] (4:worker@Ginette) Execution complete. 
+> [ 5.030928] (4:worker@Ginette) Waiting a message on worker-3 +> [ 5.103093] (1:master@Tremblay) Send to worker-1 completed +> [ 5.103093] (1:master@Tremblay) Send a message to worker-2 +> [ 5.103093] (7:worker@Jupiter) Start execution... +> [ 6.061856] (5:worker@Bourassa) Execution complete. +> [ 6.061856] (5:worker@Bourassa) Waiting a message on worker-4 +> [ 6.072165] (2:worker@Tremblay) Execution complete. +> [ 6.072165] (2:worker@Tremblay) Waiting a message on worker-0 +> [ 7.103093] (7:worker@Jupiter) Execution complete. +> [ 7.103093] (7:worker@Jupiter) Waiting a message on worker-1 +> [ 15.103093] (1:master@Tremblay) Mmh. Got timeouted while speaking to 'worker-2'. Nevermind. Let's keep going! +> [ 15.103093] (1:master@Tremblay) Send a message to worker-3 +> [ 15.103093] (1:master@Tremblay) Mmh. Something went wrong with 'worker-3'. Nevermind. Let's keep going! +> [ 15.103093] (1:master@Tremblay) Send a message to worker-4 +> [ 15.103093] (4:worker@Ginette) Mmh. Something went wrong. Nevermind. Let's keep going! +> [ 15.103093] (4:worker@Ginette) Waiting a message on worker-3 +> [ 16.134021] (1:master@Tremblay) Send to worker-4 completed +> [ 16.134021] (1:master@Tremblay) Send a message to worker-0 +> [ 16.134021] (5:worker@Bourassa) Start execution... +> [ 16.144330] (1:master@Tremblay) Send to worker-0 completed +> [ 16.144330] (1:master@Tremblay) Send a message to worker-1 +> [ 16.144330] (2:worker@Tremblay) Start execution... +> [ 17.175258] (1:master@Tremblay) Send to worker-1 completed +> [ 17.175258] (1:master@Tremblay) Send a message to worker-2 +> [ 17.175258] (7:worker@Jupiter) Start execution... +> [ 18.134021] (5:worker@Bourassa) Execution complete. +> [ 18.134021] (5:worker@Bourassa) Waiting a message on worker-4 +> [ 18.144330] (2:worker@Tremblay) Execution complete. +> [ 18.144330] (2:worker@Tremblay) Waiting a message on worker-0 +> [ 19.175258] (7:worker@Jupiter) Execution complete. 
+> [ 19.175258] (7:worker@Jupiter) Waiting a message on worker-1 +> [ 27.175258] (1:master@Tremblay) Mmh. Got timeouted while speaking to 'worker-2'. Nevermind. Let's keep going! +> [ 27.175258] (1:master@Tremblay) Send a message to worker-3 +> [ 28.206186] (1:master@Tremblay) Send to worker-3 completed +> [ 28.206186] (1:master@Tremblay) Send a message to worker-4 +> [ 28.206186] (1:master@Tremblay) Mmh. Something went wrong with 'worker-4'. Nevermind. Let's keep going! +> [ 28.206186] (1:master@Tremblay) Send a message to worker-0 +> [ 28.206186] (4:worker@Ginette) Start execution... +> [ 28.206186] (5:worker@Bourassa) Mmh. Something went wrong. Nevermind. Let's keep going! +> [ 28.206186] (5:worker@Bourassa) Waiting a message on worker-4 +> [ 28.216495] (1:master@Tremblay) Send to worker-0 completed +> [ 28.216495] (1:master@Tremblay) Send a message to worker-1 +> [ 28.216495] (2:worker@Tremblay) Start execution... +> [ 29.247423] (1:master@Tremblay) Send to worker-1 completed +> [ 29.247423] (1:master@Tremblay) Send a message to worker-2 +> [ 29.247423] (7:worker@Jupiter) Start execution... +> [ 30.206186] (4:worker@Ginette) Execution complete. +> [ 30.206186] (4:worker@Ginette) Waiting a message on worker-3 +> [ 30.216495] (2:worker@Tremblay) Execution complete. +> [ 30.216495] (2:worker@Tremblay) Waiting a message on worker-0 +> [ 31.247423] (7:worker@Jupiter) Execution complete. +> [ 31.247423] (7:worker@Jupiter) Waiting a message on worker-1 +> [ 39.247423] (1:master@Tremblay) Mmh. Got timeouted while speaking to 'worker-2'. Nevermind. Let's keep going! +> [ 39.247423] (1:master@Tremblay) Send a message to worker-3 +> [ 40.278351] (1:master@Tremblay) Send to worker-3 completed +> [ 40.278351] (1:master@Tremblay) Send a message to worker-4 +> [ 40.278351] (4:worker@Ginette) Start execution... +> [ 41.000000] (4:worker@Ginette) Gloups. The cpu on which I'm running just turned off!. See you! 
+> [ 41.309278] (1:master@Tremblay) Send to worker-4 completed +> [ 41.309278] (1:master@Tremblay) All tasks have been dispatched. Let's tell everybody the computation is over. +> [ 41.309278] (2:worker@Tremblay) I'm done. See you! +> [ 41.309278] (5:worker@Bourassa) Start execution... +> [ 41.309278] (7:worker@Jupiter) I'm done. See you! +> [ 42.309278] (1:master@Tremblay) Mmh. Got timeouted while speaking to 'worker-2'. Nevermind. Let's keep going! +> [ 43.309278] (0:maestro@) Simulation time 43.3093 +> [ 43.309278] (1:master@Tremblay) Mmh. Got timeouted while speaking to 'worker-3'. Nevermind. Let's keep going! +> [ 43.309278] (1:master@Tremblay) Goodbye now! +> [ 43.309278] (5:worker@Bourassa) Execution complete. +> [ 43.309278] (5:worker@Bourassa) Waiting a message on worker-4 +> [ 43.309278] (5:worker@Bourassa) I'm done. See you! +p Testing a simple master/worker example application handling failures. TCP crosstraffic ENABLED + +! output sort 19 +$ $SG_TEST_EXENV ${bindir:=.}/s4u-platform-failures$EXEEXT --log=xbt_cfg.thres:critical --log=no_loc ${platfdir}/small_platform_with_failures.xml ${bindir}/../app-masterworker/s4u-app-masterworker_d.xml --cfg=path:${srcdir} "--log=root.fmt:[%10.6r]%e(%i:%P@%h)%e%m%n" +> [ 0.000000] (0:maestro@) Cannot launch process 'worker' on failed host 'Fafard' +> [ 0.000000] (1:master@Tremblay) Got 5 workers and 20 tasks to process +> [ 0.000000] (1:master@Tremblay) Send a message to worker-0 +> [ 0.000000] (2:worker@Tremblay) Waiting a message on worker-0 +> [ 0.000000] (3:worker@Jupiter) Waiting a message on worker-1 +> [ 0.000000] (4:worker@Ginette) Waiting a message on worker-3 +> [ 0.000000] (5:worker@Bourassa) Waiting a message on worker-4 +> [ 0.010825] (2:worker@Tremblay) Start execution... 
+> [ 0.010825] (1:master@Tremblay) Send to worker-0 completed +> [ 0.010825] (1:master@Tremblay) Send a message to worker-1 +> [ 1.000000] (0:maestro@) Restart processes on host Fafard +> [ 1.000000] (6:worker@Fafard) Waiting a message on worker-2 +> [ 1.000000] (1:master@Tremblay) Mmh. Something went wrong with 'worker-1'. Nevermind. Let's keep going! +> [ 1.000000] (1:master@Tremblay) Send a message to worker-2 +> [ 1.000000] (3:worker@Jupiter) Gloups. The cpu on which I'm running just turned off!. See you! +> [ 2.000000] (0:maestro@) Restart processes on host Jupiter +> [ 2.000000] (7:worker@Jupiter) Waiting a message on worker-1 +> [ 2.000000] (1:master@Tremblay) Mmh. Something went wrong with 'worker-2'. Nevermind. Let's keep going! +> [ 2.000000] (1:master@Tremblay) Send a message to worker-3 +> [ 2.000000] (6:worker@Fafard) Gloups. The cpu on which I'm running just turned off!. See you! +> [ 2.010825] (2:worker@Tremblay) Execution complete. +> [ 2.010825] (2:worker@Tremblay) Waiting a message on worker-0 +> [ 3.082474] (4:worker@Ginette) Start execution... +> [ 3.082474] (1:master@Tremblay) Send to worker-3 completed +> [ 3.082474] (1:master@Tremblay) Send a message to worker-4 +> [ 4.164948] (5:worker@Bourassa) Start execution... +> [ 4.164948] (1:master@Tremblay) Send to worker-4 completed +> [ 4.164948] (1:master@Tremblay) Send a message to worker-0 +> [ 4.175773] (2:worker@Tremblay) Start execution... +> [ 4.175773] (1:master@Tremblay) Send to worker-0 completed +> [ 4.175773] (1:master@Tremblay) Send a message to worker-1 +> [ 5.082474] (4:worker@Ginette) Execution complete. +> [ 5.082474] (4:worker@Ginette) Waiting a message on worker-3 +> [ 5.258247] (7:worker@Jupiter) Start execution... +> [ 5.258247] (1:master@Tremblay) Send to worker-1 completed +> [ 5.258247] (1:master@Tremblay) Send a message to worker-2 +> [ 6.164948] (5:worker@Bourassa) Execution complete. 
+> [ 6.164948] (5:worker@Bourassa) Waiting a message on worker-4 +> [ 6.175773] (2:worker@Tremblay) Execution complete. +> [ 6.175773] (2:worker@Tremblay) Waiting a message on worker-0 +> [ 7.258247] (7:worker@Jupiter) Execution complete. +> [ 7.258247] (7:worker@Jupiter) Waiting a message on worker-1 +> [ 15.258247] (1:master@Tremblay) Mmh. Got timeouted while speaking to 'worker-2'. Nevermind. Let's keep going! +> [ 15.258247] (1:master@Tremblay) Send a message to worker-3 +> [ 15.258247] (4:worker@Ginette) Mmh. Something went wrong. Nevermind. Let's keep going! +> [ 15.258247] (4:worker@Ginette) Waiting a message on worker-3 +> [ 15.258247] (1:master@Tremblay) Mmh. Something went wrong with 'worker-3'. Nevermind. Let's keep going! +> [ 15.258247] (1:master@Tremblay) Send a message to worker-4 +> [ 16.340722] (5:worker@Bourassa) Start execution... +> [ 16.340722] (1:master@Tremblay) Send to worker-4 completed +> [ 16.340722] (1:master@Tremblay) Send a message to worker-0 +> [ 16.351546] (2:worker@Tremblay) Start execution... +> [ 16.351546] (1:master@Tremblay) Send to worker-0 completed +> [ 16.351546] (1:master@Tremblay) Send a message to worker-1 +> [ 17.434021] (7:worker@Jupiter) Start execution... +> [ 17.434021] (1:master@Tremblay) Send to worker-1 completed +> [ 17.434021] (1:master@Tremblay) Send a message to worker-2 +> [ 18.340722] (5:worker@Bourassa) Execution complete. +> [ 18.340722] (5:worker@Bourassa) Waiting a message on worker-4 +> [ 18.351546] (2:worker@Tremblay) Execution complete. +> [ 18.351546] (2:worker@Tremblay) Waiting a message on worker-0 +> [ 19.434021] (7:worker@Jupiter) Execution complete. +> [ 19.434021] (7:worker@Jupiter) Waiting a message on worker-1 +> [ 27.434021] (1:master@Tremblay) Mmh. Got timeouted while speaking to 'worker-2'. Nevermind. Let's keep going! +> [ 27.434021] (1:master@Tremblay) Send a message to worker-3 +> [ 28.516495] (4:worker@Ginette) Start execution... 
+> [ 28.516495] (1:master@Tremblay) Send to worker-3 completed +> [ 28.516495] (1:master@Tremblay) Send a message to worker-4 +> [ 28.516495] (5:worker@Bourassa) Mmh. Something went wrong. Nevermind. Let's keep going! +> [ 28.516495] (5:worker@Bourassa) Waiting a message on worker-4 +> [ 28.516495] (1:master@Tremblay) Mmh. Something went wrong with 'worker-4'. Nevermind. Let's keep going! +> [ 28.516495] (1:master@Tremblay) Send a message to worker-0 +> [ 28.527320] (2:worker@Tremblay) Start execution... +> [ 28.527320] (1:master@Tremblay) Send to worker-0 completed +> [ 28.527320] (1:master@Tremblay) Send a message to worker-1 +> [ 29.609794] (7:worker@Jupiter) Start execution... +> [ 29.609794] (1:master@Tremblay) Send to worker-1 completed +> [ 29.609794] (1:master@Tremblay) Send a message to worker-2 +> [ 30.516495] (4:worker@Ginette) Execution complete. +> [ 30.516495] (4:worker@Ginette) Waiting a message on worker-3 +> [ 30.527320] (2:worker@Tremblay) Execution complete. +> [ 30.527320] (2:worker@Tremblay) Waiting a message on worker-0 +> [ 31.609794] (7:worker@Jupiter) Execution complete. +> [ 31.609794] (7:worker@Jupiter) Waiting a message on worker-1 +> [ 39.609794] (1:master@Tremblay) Mmh. Got timeouted while speaking to 'worker-2'. Nevermind. Let's keep going! +> [ 39.609794] (1:master@Tremblay) Send a message to worker-3 +> [ 40.692268] (4:worker@Ginette) Start execution... +> [ 40.692268] (1:master@Tremblay) Send to worker-3 completed +> [ 40.692268] (1:master@Tremblay) Send a message to worker-4 +> [ 41.000000] (4:worker@Ginette) Gloups. The cpu on which I'm running just turned off!. See you! +> [ 41.774742] (5:worker@Bourassa) Start execution... +> [ 41.774742] (1:master@Tremblay) Send to worker-4 completed +> [ 41.774742] (1:master@Tremblay) All tasks have been dispatched. Let's tell everybody the computation is over. +> [ 41.774742] (2:worker@Tremblay) I'm done. See you! +> [ 41.774742] (7:worker@Jupiter) I'm done. See you! 
+> [ 42.774742] (1:master@Tremblay) Mmh. Got timeouted while speaking to 'worker-2'. Nevermind. Let's keep going! +> [ 43.774742] (5:worker@Bourassa) Execution complete. +> [ 43.774742] (5:worker@Bourassa) Waiting a message on worker-4 +> [ 43.774742] (1:master@Tremblay) Mmh. Got timeouted while speaking to 'worker-3'. Nevermind. Let's keep going! +> [ 43.774742] (5:worker@Bourassa) I'm done. See you! +> [ 43.774742] (1:master@Tremblay) Goodbye now! +> [ 43.774742] (0:maestro@) Simulation time 43.7747 diff --git a/examples/s4u/platform-properties/s4u-platform-properties.cpp b/examples/s4u/platform-properties/s4u-platform-properties.cpp index d35bf9f5b1..085464e95a 100644 --- a/examples/s4u/platform-properties/s4u-platform-properties.cpp +++ b/examples/s4u/platform-properties/s4u-platform-properties.cpp @@ -109,7 +109,7 @@ int main(int argc, char* argv[]) e.register_function("carole", carole); e.register_function("david", david); - size_t totalHosts = sg_host_count(); + size_t totalHosts = e.get_host_count(); XBT_INFO("There are %zu hosts in the environment", totalHosts); std::vector hosts = e.get_all_hosts(); diff --git a/include/simgrid/s4u/Activity.hpp b/include/simgrid/s4u/Activity.hpp index 615f7bb3ba..6cd6276393 100644 --- a/include/simgrid/s4u/Activity.hpp +++ b/include/simgrid/s4u/Activity.hpp @@ -27,6 +27,7 @@ namespace s4u { * - Synchronization activities may possibly be connected to no action. 
*/ class XBT_PUBLIC Activity { +#ifndef DOXYGEN friend Comm; friend XBT_PUBLIC void intrusive_ptr_release(Comm * c); friend XBT_PUBLIC void intrusive_ptr_add_ref(Comm * c); @@ -36,6 +37,7 @@ class XBT_PUBLIC Activity { friend Io; friend XBT_PUBLIC void intrusive_ptr_release(Io* i); friend XBT_PUBLIC void intrusive_ptr_add_ref(Io* i); +#endif protected: Activity() = default; diff --git a/include/simgrid/s4u/Actor.hpp b/include/simgrid/s4u/Actor.hpp index 09e3df6803..cf5f0f10ef 100644 --- a/include/simgrid/s4u/Actor.hpp +++ b/include/simgrid/s4u/Actor.hpp @@ -18,7 +18,7 @@ namespace simgrid { namespace s4u { -/** @ingroup s4u_api +/** * * An actor is an independent stream of execution in your distributed application. * @@ -121,10 +121,12 @@ namespace s4u { /** @brief Simulation Agent */ class XBT_PUBLIC Actor : public simgrid::xbt::Extendable { +#ifndef DOXYGEN friend Exec; friend Mailbox; friend simgrid::kernel::actor::ActorImpl; friend simgrid::kernel::activity::MailboxImpl; +#endif kernel::actor::ActorImpl* pimpl_ = nullptr; /** Wrap a (possibly non-copyable) single-use task into a `std::function` */ @@ -169,12 +171,16 @@ public: /** Signal indicating that the given actor is about to disappear */ static simgrid::xbt::signal on_destruction; - /** Create an actor using a function + /** Create an actor from a std::function * * If the actor is restarted, the actor has a fresh copy of the function. */ static ActorPtr create(std::string name, s4u::Host* host, std::function code); + /** Create an actor from a std::function + * + * If the actor is restarted, the actor has a fresh copy of the function. 
+ */ static ActorPtr create(std::string name, s4u::Host* host, std::function*)> code, std::vector* args) { @@ -245,6 +251,15 @@ public: /** Retrieves the time at which that actor will be killed (or -1 if not set) */ double get_kill_time(); + /** @brief Moves the actor to another host + * + * If the actor is currently blocked on an execution activity, the activity is also + * migrated to the new host. If it's blocked on another kind of activity, an error is + * raised as the mandated code is not written yet. Please report that bug if you need it. + * + * Asynchronous activities started by the actor are not migrated automatically, so you have + * to take care of this yourself (only you knows which ones should be migrated). + */ void migrate(Host * new_host); /** Ask the actor to die. @@ -262,7 +277,7 @@ public: /** Retrieves the actor that have the given PID (or nullptr if not existing) */ static ActorPtr by_pid(aid_t pid); - /** @brief Wait for the actor to finish. + /** Wait for the actor to finish. * * This blocks the calling actor until the actor on which we call join() is terminated */ @@ -282,6 +297,7 @@ public: const char* get_property(std::string key); void set_property(std::string key, std::string value); +#ifndef DOXYGEN /** @deprecated See Actor::create() */ XBT_ATTRIB_DEPRECATED_v323("Please use Actor::create()") static ActorPtr createActor( const char* name, s4u::Host* host, std::function code) @@ -373,6 +389,7 @@ public: { set_property(key, value); } +#endif }; /** @ingroup s4u_api @@ -439,6 +456,7 @@ XBT_PUBLIC bool is_suspended(); /** @brief kill the actor. 
*/ XBT_PUBLIC void exit(); +#ifndef DOXYGEN /** @deprecated Please use std::function for first parameter */ XBT_ATTRIB_DEPRECATED_v323("Please use std::function for first parameter.") XBT_PUBLIC void on_exit(int_f_pvoid_pvoid_t fun, void* data); @@ -466,6 +484,7 @@ XBT_ATTRIB_DEPRECATED_v323("Please use this_actor::is_suspended()") XBT_PUBLIC b XBT_ATTRIB_DEPRECATED_v323("Please use this_actor::on_exit()") XBT_PUBLIC void onExit(int_f_pvoid_pvoid_t fun, void* data); /** @deprecated See this_actor::exit() */ XBT_ATTRIB_DEPRECATED_v324("Please use this_actor::exit()") XBT_PUBLIC void kill(); +#endif } /** @} */ diff --git a/include/simgrid/s4u/ConditionVariable.hpp b/include/simgrid/s4u/ConditionVariable.hpp index 06037981ee..e2f5cce6c1 100644 --- a/include/simgrid/s4u/ConditionVariable.hpp +++ b/include/simgrid/s4u/ConditionVariable.hpp @@ -23,8 +23,10 @@ namespace s4u { */ class XBT_PUBLIC ConditionVariable { private: +#ifndef DOXYGEN friend kernel::activity::ConditionVariableImpl; smx_cond_t cond_; +#endif explicit ConditionVariable(smx_cond_t cond) : cond_(cond) {} public: ConditionVariable(ConditionVariable const&) = delete; diff --git a/include/simgrid/s4u/Engine.hpp b/include/simgrid/s4u/Engine.hpp index fe486f403a..6bb6e896ca 100644 --- a/include/simgrid/s4u/Engine.hpp +++ b/include/simgrid/s4u/Engine.hpp @@ -84,23 +84,26 @@ public: void load_deployment(std::string deploy); protected: - friend s4u::Host; - friend s4u::Link; - friend s4u::Storage; +#ifndef DOXYGEN + friend Host; + friend Link; + friend Storage; friend kernel::routing::NetPoint; friend kernel::routing::NetZoneImpl; friend kernel::resource::LinkImpl; - void host_register(std::string name, simgrid::s4u::Host* host); + void host_register(std::string name, Host* host); void host_unregister(std::string name); - void link_register(std::string name, simgrid::s4u::Link* link); + void link_register(std::string name, Link* link); void link_unregister(std::string name); - void storage_register(std::string 
name, simgrid::s4u::Storage* storage); + void storage_register(std::string name, Storage* storage); void storage_unregister(std::string name); void netpoint_register(simgrid::kernel::routing::NetPoint* card); void netpoint_unregister(simgrid::kernel::routing::NetPoint* card); +#endif /*DOXYGEN*/ public: size_t get_host_count(); + /** @brief Returns the list of all hosts found in the platform */ std::vector get_all_hosts(); std::vector get_filtered_hosts(std::function filter); simgrid::s4u::Host* host_by_name(std::string name); diff --git a/include/simgrid/s4u/Exec.hpp b/include/simgrid/s4u/Exec.hpp index b47ac41e9e..26af5d02e4 100644 --- a/include/simgrid/s4u/Exec.hpp +++ b/include/simgrid/s4u/Exec.hpp @@ -14,6 +14,11 @@ namespace simgrid { namespace s4u { +/** Computation #Activity, representing the asynchronous disk access. + * + * They are generated from simgrid::s4u::this_actor::exec_init() or simgrid::s4u::Host::execute(). + */ + class XBT_PUBLIC Exec : public Activity { Exec() : Activity() {} public: diff --git a/include/simgrid/s4u/Host.hpp b/include/simgrid/s4u/Host.hpp index 4098162236..26568e8b1d 100644 --- a/include/simgrid/s4u/Host.hpp +++ b/include/simgrid/s4u/Host.hpp @@ -35,8 +35,10 @@ namespace s4u { * and actors can retrieve the host on which they run using simgrid::s4u::Host::current(). 
*/ class XBT_PUBLIC Host : public simgrid::xbt::Extendable { +#ifndef DOXYGEN friend simgrid::vm::VMModel; // Use the pimpl_cpu to compute the VM sharing friend simgrid::vm::VirtualMachineImpl; // creates the the pimpl_cpu +#endif public: explicit Host(std::string name); @@ -95,6 +97,8 @@ public: const char* get_property(std::string key) const; void set_property(std::string key, std::string value); std::unordered_map* get_properties(); + +#ifndef DOXYGEN /** @deprecated See Host::get_properties() */ XBT_ATTRIB_DEPRECATED_v323("Please use Host::get_properties()") std::map* getProperties() { @@ -104,6 +108,7 @@ public: res->insert(kv); return res; } +#endif double get_speed() const; double get_available_speed() const; @@ -115,6 +120,7 @@ public: void set_pstate(int pstate_index); int get_pstate() const; +#ifndef DOXYGEN /** @deprecated See Host::get_speed() */ XBT_ATTRIB_DEPRECATED_v323("Please use Host::get_speed() instead.") double getSpeed() { return get_speed(); } /** @deprecated See Host::get_pstate_speed() */ @@ -122,6 +128,7 @@ public: { return get_pstate_speed(pstate_index); } +#endif std::vector get_attached_storages() const; XBT_ATTRIB_DEPRECATED_v323("Please use Host::get_attached_storages() instead.") void getAttachedStorages( @@ -151,6 +158,7 @@ public: void execute(double flops, double priority); // Deprecated functions +#ifndef DOXYGEN /** @deprecated See Host::get_name() */ XBT_ATTRIB_DEPRECATED_v323("Please use Host::get_name()") simgrid::xbt::string const& getName() const { @@ -203,6 +211,7 @@ public: { return get_pstate_count(); } +#endif /* !DOXYGEN */ private: simgrid::xbt::string name_ {"noname"}; diff --git a/include/simgrid/s4u/Io.hpp b/include/simgrid/s4u/Io.hpp index 17fd1f2605..72b5ea2296 100644 --- a/include/simgrid/s4u/Io.hpp +++ b/include/simgrid/s4u/Io.hpp @@ -15,6 +15,11 @@ namespace simgrid { namespace s4u { +/** I/O Activity, representing the asynchronous disk access. 
+ * + * They are generated from simgrid::s4u::Storage::read() and simgrid::s4u::Storage::write(). + */ + class XBT_PUBLIC Io : public Activity { public: enum class OpType { READ, WRITE }; diff --git a/include/simgrid/s4u/Link.hpp b/include/simgrid/s4u/Link.hpp index bd4a84bac9..c6dd56d3d1 100644 --- a/include/simgrid/s4u/Link.hpp +++ b/include/simgrid/s4u/Link.hpp @@ -22,7 +22,9 @@ namespace simgrid { namespace s4u { /** @brief A Link represents the network facilities between [hosts](@ref simgrid::s4u::Host) */ class XBT_PUBLIC Link : public simgrid::xbt::Extendable { +#ifndef DOXYGEN friend simgrid::kernel::resource::LinkImpl; +#endif // Links are created from the NetZone, and destroyed by their private implementation when the simulation ends explicit Link(kernel::resource::LinkImpl* pimpl) : pimpl_(pimpl) {} diff --git a/include/simgrid/s4u/Mailbox.hpp b/include/simgrid/s4u/Mailbox.hpp index 6dd039475b..78950269a5 100644 --- a/include/simgrid/s4u/Mailbox.hpp +++ b/include/simgrid/s4u/Mailbox.hpp @@ -102,8 +102,10 @@ namespace s4u { * @section s4u_mb_api The API */ class XBT_PUBLIC Mailbox { +#ifndef DOXYGEN friend Comm; friend simgrid::kernel::activity::MailboxImpl; +#endif simgrid::kernel::activity::MailboxImpl* pimpl_; diff --git a/include/simgrid/s4u/Mutex.hpp b/include/simgrid/s4u/Mutex.hpp index b4df2e8b42..ae785f4841 100644 --- a/include/simgrid/s4u/Mutex.hpp +++ b/include/simgrid/s4u/Mutex.hpp @@ -29,8 +29,10 @@ class ConditionVariable; * */ class XBT_PUBLIC Mutex { +#ifndef DOXYGEN friend ConditionVariable; friend simgrid::kernel::activity::MutexImpl; +#endif simgrid::kernel::activity::MutexImpl* pimpl_; explicit Mutex(simgrid::kernel::activity::MutexImpl* mutex) : pimpl_(mutex) {} diff --git a/include/simgrid/s4u/NetZone.hpp b/include/simgrid/s4u/NetZone.hpp index 8156699c38..ab145b9de4 100644 --- a/include/simgrid/s4u/NetZone.hpp +++ b/include/simgrid/s4u/NetZone.hpp @@ -25,7 +25,9 @@ namespace s4u { */ class XBT_PUBLIC NetZone { protected: +#ifndef 
DOXYGEN friend simgrid::kernel::routing::NetZoneImpl; +#endif explicit NetZone(kernel::routing::NetZoneImpl* impl); ~NetZone(); diff --git a/include/simgrid/s4u/Storage.hpp b/include/simgrid/s4u/Storage.hpp index b807c7fa3f..85005c1259 100644 --- a/include/simgrid/s4u/Storage.hpp +++ b/include/simgrid/s4u/Storage.hpp @@ -17,18 +17,21 @@ #include namespace simgrid { -namespace xbt { -extern template class XBT_PUBLIC Extendable; -} namespace s4u { +#ifndef DOXYGEN /** @deprecated Engine::get_all_storages() */ XBT_ATTRIB_DEPRECATED_v322("Please use Engine::get_all_storages()") XBT_PUBLIC void getStorageList(std::map* whereTo); +#endif + +/** Storage represent the disk resources, usually associated to a given host */ class XBT_PUBLIC Storage : public simgrid::xbt::Extendable { +#ifndef DOXYGEN friend s4u::Engine; friend s4u::Io; friend simgrid::surf::StorageImpl; +#endif /* DOXYGEN */ public: explicit Storage(std::string name, surf::StorageImpl * pimpl); diff --git a/src/mc/sosp/PageStore.cpp b/src/mc/sosp/PageStore.cpp index f0191c4889..e9a7fc16aa 100644 --- a/src/mc/sosp/PageStore.cpp +++ b/src/mc/sosp/PageStore.cpp @@ -1,5 +1,4 @@ -/* Copyright (c) 2015-2018. The SimGrid Team. - * All rights reserved. */ +/* Copyright (c) 2015-2018. The SimGrid Team. All rights reserved. */ /* This program is free software; you can redistribute it and/or modify it * under the terms of the license (GNU LGPL) which comes with this package. 
*/ @@ -49,7 +48,7 @@ static XBT_ALWAYS_INLINE PageStore::hash_type mc_hash_page(const void* data) // ***** snapshot_page_manager -PageStore::PageStore(size_t size) : memory_(nullptr), capacity_(size), top_index_(0) +PageStore::PageStore(std::size_t size) : memory_(nullptr), capacity_(size), top_index_(0) { // Using mmap in order to be able to expand the region by relocating it somewhere else in the virtual memory space: void* memory = diff --git a/src/mc/sosp/mc_snapshot.cpp b/src/mc/sosp/mc_snapshot.cpp index a17e3fa974..68e80aa6dc 100644 --- a/src/mc/sosp/mc_snapshot.cpp +++ b/src/mc/sosp/mc_snapshot.cpp @@ -23,8 +23,7 @@ * * @param addr Pointer * @param snapshot Snapshot - * @param Snapshot region in the snapshot this pointer belongs to - * (or nullptr if it does not belong to any snapshot region) + * @param process_index rank requesting the region * */ mc_mem_region_t mc_get_snapshot_region(const void* addr, const simgrid::mc::Snapshot* snapshot, int process_index) { @@ -102,11 +101,11 @@ const void* MC_region_read_fragmented(mc_mem_region_t region, void* target, cons /** Compare memory between snapshots (with known regions) * * @param addr1 Address in the first snapshot - * @param snapshot2 Region of the address in the first snapshot + * @param region1 Region of the address in the first snapshot * @param addr2 Address in the second snapshot - * @param snapshot2 Region of the address in the second snapshot - * @return same as memcmp - * */ + * @param region2 Region of the address in the second snapshot + * @return same semantic as memcmp + */ int MC_snapshot_region_memcmp(const void* addr1, mc_mem_region_t region1, const void* addr2, mc_mem_region_t region2, size_t size) { diff --git a/src/s4u/s4u_Actor.cpp b/src/s4u/s4u_Actor.cpp index b1f74569ab..b933c632c4 100644 --- a/src/s4u/s4u_Actor.cpp +++ b/src/s4u/s4u_Actor.cpp @@ -8,7 +8,9 @@ #include "simgrid/s4u/Exec.hpp" #include "simgrid/s4u/Host.hpp" #include "src/kernel/activity/ExecImpl.hpp" +#include 
"src/simix/smx_host_private.hpp" #include "src/simix/smx_private.hpp" +#include "src/surf/HostImpl.hpp" #include @@ -73,7 +75,16 @@ void Actor::join(double timeout) void Actor::set_auto_restart(bool autorestart) { - simgrid::simix::simcall([this, autorestart]() { pimpl_->set_auto_restart(autorestart); }); + simgrid::simix::simcall([this, autorestart]() { + pimpl_->set_auto_restart(autorestart); + + std::map actors_map = pimpl_->host_->pimpl_->actors_at_boot_; + if (actors_map.find(pimpl_->name_) == actors_map.end()) { + simgrid::kernel::actor::ProcessArg* arg = new simgrid::kernel::actor::ProcessArg(pimpl_->host_, pimpl_); + XBT_DEBUG("Adding Process %s to the actors_at_boot_ list of Host %s", arg->name.c_str(), arg->host->get_cname()); + actors_map.insert({arg->name, arg}); + } + }); } void Actor::on_exit(int_f_pvoid_pvoid_t fun, void* data) /* deprecated */ @@ -86,15 +97,6 @@ void Actor::on_exit(std::function fun, void* data) simgrid::simix::simcall([this, fun, data] { SIMIX_process_on_exit(pimpl_, fun, data); }); } -/** @brief Moves the actor to another host - * - * If the actor is currently blocked on an execution activity, the activity is also - * migrated to the new host. If it's blocked on another kind of activity, an error is - * raised as the mandated code is not written yet. Please report that bug if you need it. - * - * Asynchronous activities started by the actor are not migrated automatically, so you have - * to take care of this yourself (only you knows which ones should be migrated). 
- */ void Actor::migrate(Host* new_host) { s4u::Actor::on_migration_start(this); diff --git a/src/s4u/s4u_Engine.cpp b/src/s4u/s4u_Engine.cpp index afdc16fc64..3f8d5677e8 100644 --- a/src/s4u/s4u_Engine.cpp +++ b/src/s4u/s4u_Engine.cpp @@ -106,7 +106,6 @@ void Engine::getHostList(std::vector* list) list->push_back(kv.second); } -/** @brief Returns the list of all hosts found in the platform */ std::vector Engine::get_all_hosts() { std::vector res; diff --git a/src/s4u/s4u_Host.cpp b/src/s4u/s4u_Host.cpp index 8e5edcbecd..e78c4eb956 100644 --- a/src/s4u/s4u_Host.cpp +++ b/src/s4u/s4u_Host.cpp @@ -86,8 +86,8 @@ void Host::turn_on() { if (is_off()) { simgrid::simix::simcall([this] { - this->pimpl_->turn_on(); this->pimpl_cpu->turn_on(); + this->pimpl_->turn_on(); on_state_change(*this); }); } diff --git a/src/simix/ActorImpl.cpp b/src/simix/ActorImpl.cpp index c27e6d0386..8c60d57d16 100644 --- a/src/simix/ActorImpl.cpp +++ b/src/simix/ActorImpl.cpp @@ -450,7 +450,9 @@ void SIMIX_process_kill(smx_actor_t process, smx_actor_t issuer) { /* destroy the blocking synchro if any */ if (process->waiting_synchro != nullptr) { - + if (process->host_->is_off()) { + SMX_EXCEPTION(process, host_error, 0, "Host failed"); + } simgrid::kernel::activity::ExecImplPtr exec = boost::dynamic_pointer_cast(process->waiting_synchro); simgrid::kernel::activity::CommImplPtr comm = @@ -698,10 +700,14 @@ void SIMIX_process_yield(smx_actor_t self) self->finished_ = true; /* execute the on_exit functions */ SIMIX_process_on_exit_runall(self); - /* Add the process to the list of process to restart, only if the host is down */ - if (self->auto_restart_ && self->host_->is_off()) { - SIMIX_host_add_auto_restart_process(self->host_, self); + + if (self->auto_restart_ && self->host_->is_off() && + watched_hosts.find(self->host_->get_cname()) == watched_hosts.end()) { + XBT_DEBUG("Push host %s to watched_hosts because it's off and %s needs to restart", self->host_->get_cname(), + self->get_cname()); + 
watched_hosts.insert(self->host_->get_cname()); } + XBT_DEBUG("Process %s@%s is dead", self->get_cname(), self->host_->get_cname()); self->context_->stop(); } diff --git a/src/simix/libsmx.cpp b/src/simix/libsmx.cpp index 68badd050e..d3409086df 100644 --- a/src/simix/libsmx.cpp +++ b/src/simix/libsmx.cpp @@ -34,9 +34,10 @@ XBT_LOG_EXTERNAL_DEFAULT_CATEGORY(simix); * to create the SIMIX synchro. It can raise a host_error exception if the host crashed. * * @param name Name of the execution synchro to create + * @param category Tracing category * @param flops_amount amount Computation amount (in flops) * @param priority computation priority - * @param bound + * @param bound Maximal speed for this execution (in flops) or -1 if no limit * @param host host where the synchro will be executed * @return A new SIMIX execution synchronization */ diff --git a/src/simix/smx_global.cpp b/src/simix/smx_global.cpp index c527f79be5..eecd1d4c2b 100644 --- a/src/simix/smx_global.cpp +++ b/src/simix/smx_global.cpp @@ -506,7 +506,7 @@ void SIMIX_run() /* Autorestart all process */ for (auto const& host : host_that_restart) { XBT_INFO("Restart processes on host %s", host->get_cname()); - SIMIX_host_autorestart(host); + host->turn_on(); } host_that_restart.clear(); diff --git a/src/simix/smx_host.cpp b/src/simix/smx_host.cpp index 05dc306c6f..dc7f150213 100644 --- a/src/simix/smx_host.cpp +++ b/src/simix/smx_host.cpp @@ -24,42 +24,6 @@ const char* sg_host_self_get_name() return host->get_cname(); } -/** - * @brief Add a process to the list of the processes that the host will restart when it comes back - * This function add a process to the list of the processes that will be restarted when the host comes - * back. It is expected that this function is called when the host is down. - * The processes will only be restarted once, meaning that you will have to register the process - * again to restart the process again. 
- */ -void SIMIX_host_add_auto_restart_process(sg_host_t host, simgrid::kernel::actor::ActorImpl* actor) -{ - simgrid::kernel::actor::ProcessArg* arg = new simgrid::kernel::actor::ProcessArg(host, actor); - - if (host->is_off() && watched_hosts.find(host->get_cname()) == watched_hosts.end()) { - watched_hosts.insert(host->get_cname()); - XBT_DEBUG("Push host %s to watched_hosts because state == SURF_RESOURCE_OFF", host->get_cname()); - } - XBT_DEBUG("Adding Process %s to the auto-restart list of Host %s", arg->name.c_str(), arg->host->get_cname()); - host->pimpl_->auto_restart_processes_.push_back(arg); -} - -/** @brief Restart the list of processes that have been registered to the host */ -void SIMIX_host_autorestart(sg_host_t host) -{ - std::vector process_list = host->pimpl_->auto_restart_processes_; - - for (auto const& arg : process_list) { - XBT_DEBUG("Restarting Process %s@%s right now", arg->name.c_str(), arg->host->get_cname()); - smx_actor_t actor = simix_global->create_process_function(arg->name.c_str(), arg->code, nullptr, arg->host, - arg->properties.get(), nullptr); - if (arg->kill_time >= 0) - simcall_process_set_kill_time(actor, arg->kill_time); - if (arg->auto_restart) - actor->auto_restart_ = arg->auto_restart; - } - process_list.clear(); -} - simgrid::kernel::activity::ExecImplPtr SIMIX_execution_start(std::string name, std::string category, double flops_amount, double priority, double bound, sg_host_t host) diff --git a/src/simix/smx_host_private.hpp b/src/simix/smx_host_private.hpp index fe4c5dcdc3..1a6da742c3 100644 --- a/src/simix/smx_host_private.hpp +++ b/src/simix/smx_host_private.hpp @@ -10,9 +10,6 @@ #include -XBT_PRIVATE void SIMIX_host_add_auto_restart_process(sg_host_t host, simgrid::kernel::actor::ActorImpl* actor); -XBT_PRIVATE void SIMIX_host_autorestart(sg_host_t host); - XBT_PRIVATE void SIMIX_execution_finish(smx_activity_t synchro); XBT_PRIVATE void SIMIX_set_category(smx_activity_t synchro, std::string category); diff --git 
a/src/surf/HostImpl.cpp b/src/surf/HostImpl.cpp index 1084bc0a83..8fa93261b5 100644 --- a/src/surf/HostImpl.cpp +++ b/src/surf/HostImpl.cpp @@ -102,6 +102,7 @@ HostImpl::HostImpl(s4u::Host* host) : piface_(host) delete piface_->pimpl_; piface_->pimpl_ = this; } + HostImpl::~HostImpl() { /* All processes should be gone when the host is turned off (by the end of the simulation). */ @@ -113,12 +114,9 @@ HostImpl::~HostImpl() SIMIX_display_process_status(); THROWF(arg_error, 0, "%s", msg.c_str()); } - for (auto const& arg : auto_restart_processes_) - delete arg; - auto_restart_processes_.clear(); - for (auto const& arg : boot_processes_) - delete arg; - boot_processes_.clear(); + for (auto const& arg : actors_at_boot_) + delete arg.second; + actors_at_boot_.clear(); } /** Re-starts all the actors that are marked as restartable. @@ -127,8 +125,9 @@ HostImpl::~HostImpl() */ void HostImpl::turn_on() { - for (auto const& arg : boot_processes_) { - XBT_DEBUG("Booting Process %s(%s) right now", arg->name.c_str(), arg->host->get_cname()); + for (auto const& elm : actors_at_boot_) { + kernel::actor::ProcessArg* arg = elm.second; + XBT_DEBUG("Booting Actor %s(%s) right now", arg->name.c_str(), arg->host->get_cname()); smx_actor_t actor = simix_global->create_process_function(arg->name.c_str(), arg->code, nullptr, arg->host, arg->properties.get(), nullptr); if (arg->kill_time >= 0) @@ -142,11 +141,21 @@ void HostImpl::turn_off() { if (not process_list_.empty()) { for (auto& actor : process_list_) { - SIMIX_process_kill(&actor, SIMIX_process_self()); - XBT_DEBUG("Killing %s@%s on behalf of %s which turned off that host.", actor.get_cname(), + XBT_DEBUG("Killing Actor %s@%s on behalf of %s which turned off that host.", actor.get_cname(), actor.host_->get_cname(), SIMIX_process_self()->get_cname()); + SIMIX_process_kill(&actor, SIMIX_process_self()); } } + // When a host is turned off, we want to keep only the actors that should restart for when it will boot again. 
+ // Then get rid of the others. + auto elm = actors_at_boot_.begin(); + while (elm != actors_at_boot_.end()) { + if (not elm->second->auto_restart) { + delete elm->second; + actors_at_boot_.erase(elm); + } else + ++elm; + } } std::vector HostImpl::get_all_actors() diff --git a/src/surf/HostImpl.hpp b/src/surf/HostImpl.hpp index 6672d7afdd..cca54d6485 100644 --- a/src/surf/HostImpl.hpp +++ b/src/surf/HostImpl.hpp @@ -66,8 +66,7 @@ public: // FIXME: make these private ActorList process_list_; - std::vector auto_restart_processes_; - std::vector boot_processes_; + std::map actors_at_boot_; }; } } diff --git a/src/surf/cpu_cas01.cpp b/src/surf/cpu_cas01.cpp index 62e69e23c7..b59498075f 100644 --- a/src/surf/cpu_cas01.cpp +++ b/src/surf/cpu_cas01.cpp @@ -130,16 +130,17 @@ void CpuCas01::apply_event(tmgr_trace_event_t event, double value) xbt_assert(get_core_count() == 1, "FIXME: add state change code also for constraint_core[i]"); if (value > 0) { - if (is_off()) + if (is_off()) { host_that_restart.push_back(get_host()); - turn_on(); + get_host()->turn_on(); + } } else { kernel::lmm::Constraint* cnst = get_constraint(); kernel::lmm::Variable* var = nullptr; const kernel::lmm::Element* elem = nullptr; double date = surf_get_clock(); - turn_off(); + get_host()->turn_off(); while ((var = cnst->get_variable(&elem))) { kernel::resource::Action* action = static_cast(var->get_id()); diff --git a/src/surf/cpu_ti.cpp b/src/surf/cpu_ti.cpp index 99d32a6738..c7f6266ce2 100644 --- a/src/surf/cpu_ti.cpp +++ b/src/surf/cpu_ti.cpp @@ -398,11 +398,12 @@ void CpuTi::apply_event(tmgr_trace_event_t event, double value) } else if (event == state_event_) { if (value > 0) { - if (is_off()) + if (is_off()) { host_that_restart.push_back(get_host()); - turn_on(); + get_host()->turn_on(); + } } else { - turn_off(); + get_host()->turn_off(); double date = surf_get_clock(); /* put all action running on cpu to failed */ diff --git a/src/surf/ptask_L07.cpp b/src/surf/ptask_L07.cpp index 
bffa33376d..6e7546186d 100644 --- a/src/surf/ptask_L07.cpp +++ b/src/surf/ptask_L07.cpp @@ -315,10 +315,13 @@ void CpuL07::apply_event(tmgr_trace_event_t triggered, double value) tmgr_trace_event_unref(&speed_.event); } else if (triggered == state_event_) { - if (value > 0) - turn_on(); - else - turn_off(); + if (value > 0) { + if (is_off()) { + host_that_restart.push_back(get_host()); + get_host()->turn_on(); + } + } else + get_host()->turn_off(); tmgr_trace_event_unref(&state_event_); } else { diff --git a/src/surf/sg_platf.cpp b/src/surf/sg_platf.cpp index a3a9c52ac7..55de70536e 100644 --- a/src/surf/sg_platf.cpp +++ b/src/surf/sg_platf.cpp @@ -444,7 +444,7 @@ void sg_platf_new_actor(simgrid::kernel::routing::ActorCreationArgs* actor) simgrid::kernel::actor::ProcessArg* arg = new simgrid::kernel::actor::ProcessArg(actor_name, code, nullptr, host, kill_time, properties, auto_restart); - host->pimpl_->boot_processes_.push_back(arg); + host->pimpl_->actors_at_boot_.insert({actor_name, arg}); if (start_time > SIMIX_get_clock()) { diff --git a/teshsuite/msg/host_on_off/host_on_off.c b/teshsuite/msg/host_on_off/host_on_off.c index 8c34453a53..9922503d12 100644 --- a/teshsuite/msg/host_on_off/host_on_off.c +++ b/teshsuite/msg/host_on_off/host_on_off.c @@ -17,6 +17,10 @@ static int slave(int argc, char *argv[]) while (1) { res = MSG_task_receive(&(task), mailbox); + if (res == MSG_HOST_FAILURE) { + XBT_DEBUG("The host has been turned off, this was expected"); + return 1; + } xbt_assert(res == MSG_OK, "MSG_task_get failed"); if (!strcmp(MSG_task_get_name(task), "finalize")) { diff --git a/teshsuite/msg/host_on_off_processes/host_on_off_processes.cpp b/teshsuite/msg/host_on_off_processes/host_on_off_processes.cpp index f84aaed537..8f0b5b2d36 100644 --- a/teshsuite/msg/host_on_off_processes/host_on_off_processes.cpp +++ b/teshsuite/msg/host_on_off_processes/host_on_off_processes.cpp @@ -27,10 +27,13 @@ static int process_daemon(int /*argc*/, char** /*argv*/) msg_task_t 
task = MSG_task_create("daemon", MSG_host_get_speed(MSG_host_self()), 0, NULL); MSG_process_set_data(self, task); XBT_INFO(" Execute daemon"); - MSG_task_execute(task); - MSG_process_set_data(self, NULL); + msg_error_t res = MSG_task_execute(task); MSG_task_destroy(task); tasks_done++; + if (res == MSG_HOST_FAILURE) { + XBT_INFO("Host as died as expected, do nothing else"); + return 0; + } } XBT_INFO(" daemon done. See you!"); return 0; @@ -247,7 +250,6 @@ int main(int argc, char* argv[]) MSG_create_environment(argv[1]); - MSG_process_set_data_cleanup(task_cleanup_handler); MSG_process_create("test_launcher", test_launcher, NULL, MSG_get_host_by_name("Tremblay")); res = MSG_main(); diff --git a/teshsuite/msg/host_on_off_processes/host_on_off_processes.tesh b/teshsuite/msg/host_on_off_processes/host_on_off_processes.tesh index ba402ab5a0..f89a77e731 100644 --- a/teshsuite/msg/host_on_off_processes/host_on_off_processes.tesh +++ b/teshsuite/msg/host_on_off_processes/host_on_off_processes.tesh @@ -65,6 +65,8 @@ $ ${bindir}/host_on_off_processes ${platfdir}/small_platform.xml 5 --log=no_loc > [Tremblay:test_launcher:(1) 20.000000] [msg_test/INFO] Turn Jupiter off > [Tremblay:test_launcher:(1) 20.000000] [msg_test/INFO] Test 5 seems ok (number of Process: 2, it should be 2) > [Tremblay:test_launcher:(1) 20.000000] [msg_test/INFO] Test done. See you! 
+> [Jupiter:commRX:(2) 20.000000] [msg_test/INFO] Receive message: HOST_FAILURE +> [Jupiter:commRX:(2) 20.000000] [msg_test/INFO] RX Done > [Tremblay:commTX:(3) 40.000000] [msg_test/INFO] TX done > [40.000000] [msg_test/INFO] Simulation time 40 @@ -85,6 +87,7 @@ $ ${bindir}/host_on_off_processes ${platfdir}/small_platform.xml 6 --log=no_loc > [Jupiter:process_daemonJUPI:(3) 9.000011] [msg_test/INFO] Execute daemon > [Tremblay:test_launcher:(1) 10.000000] [msg_test/INFO] Turn Jupiter off > [Tremblay:test_launcher:(1) 10.000000] [msg_test/INFO] Shutdown vm0 +> [Jupiter:process_daemonJUPI:(3) 10.000000] [msg_test/INFO] Host as died as expected, do nothing else > [Tremblay:test_launcher:(1) 10.000000] [msg_test/INFO] Destroy vm0 > [Tremblay:test_launcher:(1) 10.000000] [msg_test/INFO] Test 6 is also weird: when the node Jupiter is turned off once again, the VM and its daemon are not killed. However, the issue regarding the shutdown of hosted VMs can be seen a feature not a bug ;) > [Tremblay:test_launcher:(1) 10.000000] [msg_test/INFO] Test done. See you! diff --git a/teshsuite/msg/host_on_off_recv/host_on_off_recv.c b/teshsuite/msg/host_on_off_recv/host_on_off_recv.c index 97363f8ab2..9755c8d3f6 100644 --- a/teshsuite/msg/host_on_off_recv/host_on_off_recv.c +++ b/teshsuite/msg/host_on_off_recv/host_on_off_recv.c @@ -41,7 +41,10 @@ static int slave(int argc, char *argv[]) msg_task_t task = NULL; msg_error_t error = MSG_task_receive(&(task), mailbox); if (error) { - XBT_ERROR("Error while receiving message"); + if (error != MSG_HOST_FAILURE) + XBT_ERROR("Error while receiving message"); + else + XBT_DEBUG("The host has been turned off, this was expected"); return 1; } diff --git a/teshsuite/s4u/CMakeLists.txt b/teshsuite/s4u/CMakeLists.txt index f54101688e..a0eb9f6c1b 100644 --- a/teshsuite/s4u/CMakeLists.txt +++ b/teshsuite/s4u/CMakeLists.txt @@ -11,7 +11,7 @@ endforeach() ## Add the tests. 
## Some need to be run with all factories, some need not tesh to run -foreach(x actor actor-migration cloud-interrupt-migration concurrent_rw) # TODO: actor-autorestart is disabled for now +foreach(x actor actor-autorestart actor-migration cloud-interrupt-migration concurrent_rw) # TODO: actor-autorestart is disabled for now set(tesh_files ${tesh_files} ${CMAKE_CURRENT_SOURCE_DIR}/${x}/${x}.tesh) ADD_TESH_FACTORIES(tesh-s4u-${x} "thread;ucontext;raw;boost" --setenv srcdir=${CMAKE_HOME_DIRECTORY}/teshsuite/s4u/${x} --setenv platfdir=${CMAKE_HOME_DIRECTORY}/examples/platforms --cd ${CMAKE_BINARY_DIR}/teshsuite/s4u/${x} ${CMAKE_HOME_DIRECTORY}/teshsuite/s4u/${x}/${x}.tesh) endforeach() diff --git a/teshsuite/s4u/actor-autorestart/actor-autorestart.cpp b/teshsuite/s4u/actor-autorestart/actor-autorestart.cpp index bb2d96414d..47e132a9e1 100644 --- a/teshsuite/s4u/actor-autorestart/actor-autorestart.cpp +++ b/teshsuite/s4u/actor-autorestart/actor-autorestart.cpp @@ -4,14 +4,24 @@ * under the terms of the license (GNU LGPL) which comes with this package. */ #include "simgrid/s4u.hpp" +#include XBT_LOG_NEW_DEFAULT_CATEGORY(s4u_test, "Messages specific for this s4u example"); static void dummy() { XBT_INFO("I start"); - simgrid::s4u::this_actor::sleep_for(200); - XBT_INFO("I stop"); + try { + simgrid::s4u::this_actor::sleep_for(200); + XBT_INFO("I stop"); + } catch (xbt_ex& e) { + if (e.category == host_error) { + XBT_DEBUG("The host has died ... as expected. 
This actor silently stops"); + } else { + XBT_ERROR("An unexpected exception has been raised."); + throw; + } + } } static void autostart() diff --git a/tools/cmake/DefinePackages.cmake b/tools/cmake/DefinePackages.cmake index c9974ce8cf..bd9de442b3 100644 --- a/tools/cmake/DefinePackages.cmake +++ b/tools/cmake/DefinePackages.cmake @@ -877,7 +877,6 @@ set(DOC_SOURCES doc/doxygen/application.doc doc/doxygen/community.doc doc/doxygen/deployment.doc - doc/doxygen/examples.doc doc/doxygen/footer.html doc/doxygen/getting_started.doc doc/doxygen/header.html @@ -954,7 +953,6 @@ set(DOC_TOOLS # these files get copied automatically to the html documentation set(DOC_IMG - ${CMAKE_HOME_DIRECTORY}/doc/sc3-description.png ${CMAKE_HOME_DIRECTORY}/doc/webcruft/AS_hierarchy.png ${CMAKE_HOME_DIRECTORY}/doc/webcruft/eclipseScreenShot.png ${CMAKE_HOME_DIRECTORY}/doc/webcruft/Paje_MSG_screenshot.jpg diff --git a/tools/cmake/Documentation.cmake b/tools/cmake/Documentation.cmake index 9efb34430b..c25c77769d 100644 --- a/tools/cmake/Documentation.cmake +++ b/tools/cmake/Documentation.cmake @@ -24,6 +24,8 @@ if(enable_documentation) COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_BINARY_DIR}/doc/example_lists COMMAND ${CMAKE_COMMAND} -E remove_directory ${CMAKE_BINARY_DIR}/doc/html COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_BINARY_DIR}/doc/html + COMMAND ${CMAKE_COMMAND} -E remove_directory ${CMAKE_BINARY_DIR}/doc/xml + COMMAND ${CMAKE_COMMAND} -E remove_directory ${CMAKE_BINARY_DIR}/docs/source/api WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/doc )