From: Martin Quinson Date: Sat, 11 Aug 2018 20:49:38 +0000 (+0200) Subject: Merge branches 'auto_restart' and 'auto_restart' of framagit.org:simgrid/simgrid X-Git-Tag: v3_21~261^2~1 X-Git-Url: http://info.iut-bm.univ-fcomte.fr/pub/gitweb/simgrid.git/commitdiff_plain/937f2eb5b429c3c03fc989a25fbc26fabd0cf529?hp=6bc18b576fa4a890d8b9916d292ca67dfbf60dd3 Merge branches 'auto_restart' and 'auto_restart' of framagit.org:simgrid/simgrid Also, don't run doxygen manually on framagit/gitlab-ci --- diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index be346507e2..3ef1530238 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -3,12 +3,8 @@ image: debian:testing-slim .build: &build script: - apt-get update - - apt-get install -y python3-pip cmake doxygen libboost-all-dev libboost-dev fig2dev + - apt-get install -y python3-pip doxygen libboost-all-dev libboost-dev fig2dev - pip3 install --requirement docs/requirements.txt - - cmake -Denable_documentation=ON . - - make documentation - - mkdir docs/doxyoutput - - cp -r doc/xml docs/doxyoutput - cd docs - sphinx-build -M html source/ build/ - mv build/html ../public diff --git a/examples/msg/platform-failures/platform-failures.c b/examples/msg/platform-failures/platform-failures.c index 7771e50925..f1d5d5baa3 100644 --- a/examples/msg/platform-failures/platform-failures.c +++ b/examples/msg/platform-failures/platform-failures.c @@ -25,13 +25,13 @@ static int master(int argc, char *argv[]) for (i = 0; i < number_of_tasks; i++) { char mailbox[256]; snprintf(mailbox, 255, "worker-%ld", i % workers_count); - + XBT_INFO("Send a message to %s", mailbox); msg_task_t task = MSG_task_create("Task", task_comp_size, task_comm_size, xbt_new0(double, 1)); *((double *) task->data) = MSG_get_clock(); switch ( MSG_task_send_with_timeout(task,mailbox,10.0) ) { case MSG_OK: - XBT_INFO("Send completed"); + XBT_INFO("Send to %s completed", mailbox); break; case MSG_HOST_FAILURE: @@ -103,21 +103,20 @@ static int worker(int argc, char *argv[]) while (1) { double time1 = MSG_get_clock(); msg_task_t task = NULL; + XBT_INFO("Waiting a message on %s", mailbox); int retcode = MSG_task_receive( &(task), mailbox); double time2 = MSG_get_clock(); if (retcode == MSG_OK) { - XBT_INFO("Received \"%s\"", MSG_task_get_name(task)); if (MSG_task_get_data(task) == FINALIZE) { MSG_task_destroy(task); break; } if (time1 < *((double *) task->data)) time1 = *((double *) task->data); - XBT_INFO("Communication time : \"%f\"", time2 - time1); - XBT_INFO("Processing \"%s\"", MSG_task_get_name(task)); + XBT_INFO("Start execution..."); retcode = MSG_task_execute(task); if (retcode == MSG_OK) { - XBT_INFO("\"%s\" done", MSG_task_get_name(task)); + XBT_INFO("Execution complete."); free(task->data); MSG_task_destroy(task); } else if (retcode == MSG_HOST_FAILURE) { diff --git a/examples/msg/platform-failures/platform-failures.tesh b/examples/msg/platform-failures/platform-failures.tesh index a169ce4015..33043dbc08 100644 --- a/examples/msg/platform-failures/platform-failures.tesh +++ b/examples/msg/platform-failures/platform-failures.tesh @@ -6,94 +6,106 @@ p Testing a simple master/worker example application handling failures TCP cross $ $SG_TEST_EXENV ${bindir:=.}/platform-failures$EXEEXT --log=xbt_cfg.thres:critical --log=no_loc ${platfdir}/small_platform_with_failures.xml ${srcdir}/../app-masterworker/app-masterworker_d.xml --cfg=path:${srcdir} --cfg=network/crosstraffic:0 "--log=root.fmt:[%10.6r]%e(%i:%P@%h)%e%m%n" > [ 0.000000] (0:maestro@) Cannot launch process 'worker' on failed host 'Fafard' > [ 0.000000] (1:master@Tremblay) Got 5 workers and 20 tasks to process -> [ 0.010309] (1:master@Tremblay) Send completed -> [ 0.010309] (2:worker@Tremblay) Received "Task" -> [ 0.010309] (2:worker@Tremblay) Communication time : "0.010309" -> [ 0.010309] (2:worker@Tremblay) Processing "Task" +> [ 0.000000] (1:master@Tremblay) Send a message to worker-0 +> [ 0.010309] (1:master@Tremblay) Send to worker-0 completed +> [ 0.010309] (2:worker@Tremblay) Start execution... +> [ 0.000000] (2:worker@Tremblay) Waiting a message on worker-0 +> [ 0.000000] (3:worker@Jupiter) Waiting a message on worker-1 +> [ 0.000000] (4:worker@Ginette) Waiting a message on worker-3 +> [ 0.000000] (5:worker@Bourassa) Waiting a message on worker-4 +> [ 0.010309] (1:master@Tremblay) Send a message to worker-1 > [ 1.000000] (0:maestro@) Restart processes on host Fafard +> [ 1.000000] (6:worker@Fafard) Waiting a message on worker-2 > [ 1.000000] (1:master@Tremblay) Mmh. Something went wrong with 'worker-1'. Nevermind. Let's keep going! +> [ 1.000000] (1:master@Tremblay) Send a message to worker-2 > [ 1.000000] (3:worker@Jupiter) Gloups. The cpu on which I'm running just turned off!. See you! +> [ 2.000000] (1:master@Tremblay) Mmh. Something went wrong with 'worker-2'. Nevermind. Let's keep going! +> [ 2.000000] (6:worker@Fafard) Gloups. The cpu on which I'm running just turned off!. See you! > [ 2.000000] (0:maestro@) Restart processes on host Jupiter -> [ 2.010309] (2:worker@Tremblay) "Task" done -> [ 11.000000] (1:master@Tremblay) Mmh. Got timeouted while speaking to 'worker-2'. Nevermind. Let's keep going! -> [ 12.030928] (1:master@Tremblay) Send completed -> [ 12.030928] (4:worker@Ginette) Received "Task" -> [ 12.030928] (4:worker@Ginette) Communication time : "1.030928" -> [ 12.030928] (4:worker@Ginette) Processing "Task" -> [ 13.061856] (1:master@Tremblay) Send completed -> [ 13.061856] (5:worker@Bourassa) Received "Task" -> [ 13.061856] (5:worker@Bourassa) Communication time : "1.030928" -> [ 13.061856] (5:worker@Bourassa) Processing "Task" -> [ 13.072165] (1:master@Tremblay) Send completed -> [ 13.072165] (2:worker@Tremblay) Received "Task" -> [ 13.072165] (2:worker@Tremblay) Communication time : "0.010309" -> [ 13.072165] (2:worker@Tremblay) Processing "Task" -> [ 14.030928] (4:worker@Ginette) "Task" done -> [ 14.103093] (1:master@Tremblay) Send completed -> [ 14.103093] (6:worker@Jupiter) Received "Task" -> [ 14.103093] (6:worker@Jupiter) Communication time : "1.030928" -> [ 14.103093] (6:worker@Jupiter) Processing "Task" -> [ 15.061856] (5:worker@Bourassa) "Task" done -> [ 15.072165] (2:worker@Tremblay) "Task" done -> [ 16.103093] (6:worker@Jupiter) "Task" done -> [ 24.103093] (1:master@Tremblay) Mmh. Got timeouted while speaking to 'worker-2'. Nevermind. Let's keep going! -> [ 24.103093] (1:master@Tremblay) Mmh. Something went wrong with 'worker-3'. Nevermind. Let's keep going! -> [ 24.103093] (4:worker@Ginette) Mmh. Something went wrong. Nevermind. Let's keep going! -> [ 25.134021] (1:master@Tremblay) Send completed -> [ 25.134021] (5:worker@Bourassa) Received "Task" -> [ 25.134021] (5:worker@Bourassa) Communication time : "1.030928" -> [ 25.134021] (5:worker@Bourassa) Processing "Task" -> [ 25.144330] (1:master@Tremblay) Send completed -> [ 25.144330] (2:worker@Tremblay) Received "Task" -> [ 25.144330] (2:worker@Tremblay) Communication time : "0.010309" -> [ 25.144330] (2:worker@Tremblay) Processing "Task" -> [ 26.175258] (1:master@Tremblay) Send completed -> [ 26.175258] (6:worker@Jupiter) Received "Task" -> [ 26.175258] (6:worker@Jupiter) Communication time : "1.030928" -> [ 26.175258] (6:worker@Jupiter) Processing "Task" -> [ 27.134021] (5:worker@Bourassa) "Task" done -> [ 27.144330] (2:worker@Tremblay) "Task" done -> [ 28.175258] (6:worker@Jupiter) "Task" done -> [ 36.175258] (1:master@Tremblay) Mmh. Got timeouted while speaking to 'worker-2'. Nevermind. Let's keep going! -> [ 37.206186] (1:master@Tremblay) Send completed -> [ 37.206186] (1:master@Tremblay) Mmh. Something went wrong with 'worker-4'. Nevermind. Let's keep going! -> [ 37.206186] (4:worker@Ginette) Received "Task" -> [ 37.206186] (4:worker@Ginette) Communication time : "1.030928" -> [ 37.206186] (4:worker@Ginette) Processing "Task" -> [ 37.206186] (5:worker@Bourassa) Mmh. Something went wrong. Nevermind. Let's keep going! -> [ 37.216495] (1:master@Tremblay) Send completed -> [ 37.216495] (2:worker@Tremblay) Received "Task" -> [ 37.216495] (2:worker@Tremblay) Communication time : "0.010309" -> [ 37.216495] (2:worker@Tremblay) Processing "Task" -> [ 38.247423] (1:master@Tremblay) Send completed -> [ 38.247423] (6:worker@Jupiter) Received "Task" -> [ 38.247423] (6:worker@Jupiter) Communication time : "1.030928" -> [ 38.247423] (6:worker@Jupiter) Processing "Task" -> [ 39.206186] (4:worker@Ginette) "Task" done -> [ 39.216495] (2:worker@Tremblay) "Task" done -> [ 40.247423] (6:worker@Jupiter) "Task" done -> [ 48.247423] (1:master@Tremblay) Mmh. Got timeouted while speaking to 'worker-2'. Nevermind. Let's keep going! -> [ 49.278351] (1:master@Tremblay) Send completed -> [ 49.278351] (4:worker@Ginette) Received "Task" -> [ 49.278351] (4:worker@Ginette) Communication time : "1.030928" -> [ 49.278351] (4:worker@Ginette) Processing "Task" -> [ 50.000000] (4:worker@Ginette) Gloups. The cpu on which I'm running just turned off!. See you! -> [ 50.309278] (1:master@Tremblay) Send completed -> [ 50.309278] (1:master@Tremblay) All tasks have been dispatched. Let's tell everybody the computation is over. -> [ 50.309278] (2:worker@Tremblay) Received "finalize" -> [ 50.309278] (2:worker@Tremblay) I'm done. See you! -> [ 50.309278] (5:worker@Bourassa) Received "Task" -> [ 50.309278] (5:worker@Bourassa) Communication time : "1.030928" -> [ 50.309278] (5:worker@Bourassa) Processing "Task" -> [ 50.309278] (6:worker@Jupiter) Received "finalize" -> [ 50.309278] (6:worker@Jupiter) I'm done. See you! -> [ 51.309278] (1:master@Tremblay) Mmh. Got timeouted while speaking to 'worker-2'. Nevermind. Let's keep going! -> [ 52.309278] (0:maestro@) Simulation time 52.3093 -> [ 52.309278] (1:master@Tremblay) Mmh. Got timeouted while speaking to 'worker-3'. Nevermind. Let's keep going! -> [ 52.309278] (1:master@Tremblay) Goodbye now! -> [ 52.309278] (5:worker@Bourassa) "Task" done -> [ 52.309278] (5:worker@Bourassa) Received "finalize" -> [ 52.309278] (5:worker@Bourassa) I'm done. See you! +> [ 2.000000] (1:master@Tremblay) Send a message to worker-3 +> [ 2.000000] (7:worker@Jupiter) Waiting a message on worker-1 +> [ 2.010309] (2:worker@Tremblay) Execution complete. +> [ 2.010309] (2:worker@Tremblay) Waiting a message on worker-0 +> [ 3.030928] (1:master@Tremblay) Send to worker-3 completed +> [ 3.030928] (1:master@Tremblay) Send a message to worker-4 +> [ 3.030928] (4:worker@Ginette) Start execution... +> [ 4.061856] (1:master@Tremblay) Send to worker-4 completed +> [ 4.061856] (1:master@Tremblay) Send a message to worker-0 +> [ 4.061856] (5:worker@Bourassa) Start execution... +> [ 4.072165] (1:master@Tremblay) Send to worker-0 completed +> [ 4.072165] (1:master@Tremblay) Send a message to worker-1 +> [ 4.072165] (2:worker@Tremblay) Start execution... +> [ 5.030928] (4:worker@Ginette) Execution complete. +> [ 5.030928] (4:worker@Ginette) Waiting a message on worker-3 +> [ 5.103093] (1:master@Tremblay) Send to worker-1 completed +> [ 5.103093] (1:master@Tremblay) Send a message to worker-2 +> [ 5.103093] (7:worker@Jupiter) Start execution... +> [ 6.061856] (5:worker@Bourassa) Execution complete. +> [ 6.061856] (5:worker@Bourassa) Waiting a message on worker-4 +> [ 6.072165] (2:worker@Tremblay) Execution complete. +> [ 6.072165] (2:worker@Tremblay) Waiting a message on worker-0 +> [ 7.103093] (7:worker@Jupiter) Execution complete. +> [ 7.103093] (7:worker@Jupiter) Waiting a message on worker-1 +> [ 15.103093] (1:master@Tremblay) Mmh. Got timeouted while speaking to 'worker-2'. Nevermind. Let's keep going! +> [ 15.103093] (1:master@Tremblay) Send a message to worker-3 +> [ 15.103093] (1:master@Tremblay) Mmh. Something went wrong with 'worker-3'. Nevermind. Let's keep going! +> [ 15.103093] (1:master@Tremblay) Send a message to worker-4 +> [ 15.103093] (4:worker@Ginette) Mmh. Something went wrong. Nevermind. Let's keep going! +> [ 15.103093] (4:worker@Ginette) Waiting a message on worker-3 +> [ 16.134021] (1:master@Tremblay) Send to worker-4 completed +> [ 16.134021] (1:master@Tremblay) Send a message to worker-0 +> [ 16.134021] (5:worker@Bourassa) Start execution... +> [ 16.144330] (1:master@Tremblay) Send to worker-0 completed +> [ 16.144330] (1:master@Tremblay) Send a message to worker-1 +> [ 16.144330] (2:worker@Tremblay) Start execution... +> [ 17.175258] (1:master@Tremblay) Send to worker-1 completed +> [ 17.175258] (1:master@Tremblay) Send a message to worker-2 +> [ 17.175258] (7:worker@Jupiter) Start execution... +> [ 18.134021] (5:worker@Bourassa) Execution complete. +> [ 18.134021] (5:worker@Bourassa) Waiting a message on worker-4 +> [ 18.144330] (2:worker@Tremblay) Execution complete. +> [ 18.144330] (2:worker@Tremblay) Waiting a message on worker-0 +> [ 19.175258] (7:worker@Jupiter) Execution complete. +> [ 19.175258] (7:worker@Jupiter) Waiting a message on worker-1 +> [ 27.175258] (1:master@Tremblay) Mmh. Got timeouted while speaking to 'worker-2'. Nevermind. Let's keep going! +> [ 27.175258] (1:master@Tremblay) Send a message to worker-3 +> [ 28.206186] (1:master@Tremblay) Send to worker-3 completed +> [ 28.206186] (1:master@Tremblay) Send a message to worker-4 +> [ 28.206186] (1:master@Tremblay) Mmh. Something went wrong with 'worker-4'. Nevermind. Let's keep going! +> [ 28.206186] (1:master@Tremblay) Send a message to worker-0 +> [ 28.206186] (4:worker@Ginette) Start execution... +> [ 28.206186] (5:worker@Bourassa) Mmh. Something went wrong. Nevermind. Let's keep going! +> [ 28.206186] (5:worker@Bourassa) Waiting a message on worker-4 +> [ 28.216495] (1:master@Tremblay) Send to worker-0 completed +> [ 28.216495] (1:master@Tremblay) Send a message to worker-1 +> [ 28.216495] (2:worker@Tremblay) Start execution... +> [ 29.247423] (1:master@Tremblay) Send to worker-1 completed +> [ 29.247423] (1:master@Tremblay) Send a message to worker-2 +> [ 29.247423] (7:worker@Jupiter) Start execution... +> [ 30.206186] (4:worker@Ginette) Execution complete. +> [ 30.206186] (4:worker@Ginette) Waiting a message on worker-3 +> [ 30.216495] (2:worker@Tremblay) Execution complete. +> [ 30.216495] (2:worker@Tremblay) Waiting a message on worker-0 +> [ 31.247423] (7:worker@Jupiter) Execution complete. +> [ 31.247423] (7:worker@Jupiter) Waiting a message on worker-1 +> [ 39.247423] (1:master@Tremblay) Mmh. Got timeouted while speaking to 'worker-2'. Nevermind. Let's keep going! +> [ 39.247423] (1:master@Tremblay) Send a message to worker-3 +> [ 40.278351] (1:master@Tremblay) Send to worker-3 completed +> [ 40.278351] (1:master@Tremblay) Send a message to worker-4 +> [ 40.278351] (4:worker@Ginette) Start execution... +> [ 41.000000] (4:worker@Ginette) Gloups. The cpu on which I'm running just turned off!. See you! +> [ 41.309278] (1:master@Tremblay) Send to worker-4 completed +> [ 41.309278] (1:master@Tremblay) All tasks have been dispatched. Let's tell everybody the computation is over. +> [ 41.309278] (2:worker@Tremblay) I'm done. See you! +> [ 41.309278] (5:worker@Bourassa) Start execution... +> [ 41.309278] (7:worker@Jupiter) I'm done. See you! +> [ 42.309278] (1:master@Tremblay) Mmh. Got timeouted while speaking to 'worker-2'. Nevermind. Let's keep going! +> [ 43.309278] (0:maestro@) Simulation time 43.3093 +> [ 43.309278] (1:master@Tremblay) Mmh. Got timeouted while speaking to 'worker-3'. Nevermind. Let's keep going! +> [ 43.309278] (1:master@Tremblay) Goodbye now! +> [ 43.309278] (5:worker@Bourassa) Execution complete. +> [ 43.309278] (5:worker@Bourassa) Waiting a message on worker-4 +> [ 43.309278] (5:worker@Bourassa) I'm done. See you! p Testing a simple master/worker example application handling failures. TCP crosstraffic ENABLED @@ -101,186 +113,210 @@ p Testing a simple master/worker example application handling failures. TCP cros $ $SG_TEST_EXENV ${bindir:=.}/platform-failures$EXEEXT --log=xbt_cfg.thres:critical --log=no_loc ${platfdir}/small_platform_with_failures.xml ${srcdir}/../app-masterworker/app-masterworker_d.xml --cfg=path:${srcdir} "--log=root.fmt:[%10.6r]%e(%i:%P@%h)%e%m%n" > [ 0.000000] (0:maestro@) Cannot launch process 'worker' on failed host 'Fafard' > [ 0.000000] (1:master@Tremblay) Got 5 workers and 20 tasks to process -> [ 0.010825] (1:master@Tremblay) Send completed -> [ 0.010825] (2:worker@Tremblay) Received "Task" -> [ 0.010825] (2:worker@Tremblay) Communication time : "0.010825" -> [ 0.010825] (2:worker@Tremblay) Processing "Task" +> [ 0.000000] (1:master@Tremblay) Send a message to worker-0 +> [ 0.000000] (2:worker@Tremblay) Waiting a message on worker-0 +> [ 0.000000] (3:worker@Jupiter) Waiting a message on worker-1 +> [ 0.000000] (4:worker@Ginette) Waiting a message on worker-3 +> [ 0.000000] (5:worker@Bourassa) Waiting a message on worker-4 +> [ 0.010825] (2:worker@Tremblay) Start execution... +> [ 0.010825] (1:master@Tremblay) Send to worker-0 completed +> [ 0.010825] (1:master@Tremblay) Send a message to worker-1 > [ 1.000000] (0:maestro@) Restart processes on host Fafard +> [ 1.000000] (6:worker@Fafard) Waiting a message on worker-2 > [ 1.000000] (1:master@Tremblay) Mmh. Something went wrong with 'worker-1'. Nevermind. Let's keep going! +> [ 1.000000] (1:master@Tremblay) Send a message to worker-2 > [ 1.000000] (3:worker@Jupiter) Gloups. The cpu on which I'm running just turned off!. See you! > [ 2.000000] (0:maestro@) Restart processes on host Jupiter -> [ 2.010825] (2:worker@Tremblay) "Task" done -> [ 11.000000] (1:master@Tremblay) Mmh. Got timeouted while speaking to 'worker-2'. Nevermind. Let's keep going! -> [ 12.082474] (1:master@Tremblay) Send completed -> [ 12.082474] (4:worker@Ginette) Received "Task" -> [ 12.082474] (4:worker@Ginette) Communication time : "1.082474" -> [ 12.082474] (4:worker@Ginette) Processing "Task" -> [ 13.164948] (1:master@Tremblay) Send completed -> [ 13.164948] (5:worker@Bourassa) Received "Task" -> [ 13.164948] (5:worker@Bourassa) Communication time : "1.082474" -> [ 13.164948] (5:worker@Bourassa) Processing "Task" -> [ 13.175773] (1:master@Tremblay) Send completed -> [ 13.175773] (2:worker@Tremblay) Received "Task" -> [ 13.175773] (2:worker@Tremblay) Communication time : "0.010825" -> [ 13.175773] (2:worker@Tremblay) Processing "Task" -> [ 14.082474] (4:worker@Ginette) "Task" done -> [ 14.258247] (1:master@Tremblay) Send completed -> [ 14.258247] (6:worker@Jupiter) Received "Task" -> [ 14.258247] (6:worker@Jupiter) Communication time : "1.082474" -> [ 14.258247] (6:worker@Jupiter) Processing "Task" -> [ 15.164948] (5:worker@Bourassa) "Task" done -> [ 15.175773] (2:worker@Tremblay) "Task" done -> [ 16.258247] (6:worker@Jupiter) "Task" done -> [ 24.258247] (1:master@Tremblay) Mmh. Got timeouted while speaking to 'worker-2'. Nevermind. Let's keep going! -> [ 24.258247] (1:master@Tremblay) Mmh. Something went wrong with 'worker-3'. Nevermind. Let's keep going! -> [ 24.258247] (4:worker@Ginette) Mmh. Something went wrong. Nevermind. Let's keep going! -> [ 25.340722] (1:master@Tremblay) Send completed -> [ 25.340722] (5:worker@Bourassa) Received "Task" -> [ 25.340722] (5:worker@Bourassa) Communication time : "1.082474" -> [ 25.340722] (5:worker@Bourassa) Processing "Task" -> [ 25.351546] (1:master@Tremblay) Send completed -> [ 25.351546] (2:worker@Tremblay) Received "Task" -> [ 25.351546] (2:worker@Tremblay) Communication time : "0.010825" -> [ 25.351546] (2:worker@Tremblay) Processing "Task" -> [ 26.434021] (1:master@Tremblay) Send completed -> [ 26.434021] (6:worker@Jupiter) Received "Task" -> [ 26.434021] (6:worker@Jupiter) Communication time : "1.082474" -> [ 26.434021] (6:worker@Jupiter) Processing "Task" -> [ 27.340722] (5:worker@Bourassa) "Task" done -> [ 27.351546] (2:worker@Tremblay) "Task" done -> [ 28.434021] (6:worker@Jupiter) "Task" done -> [ 36.434021] (1:master@Tremblay) Mmh. Got timeouted while speaking to 'worker-2'. Nevermind. Let's keep going! -> [ 37.516495] (1:master@Tremblay) Send completed -> [ 37.516495] (1:master@Tremblay) Mmh. Something went wrong with 'worker-4'. Nevermind. Let's keep going! -> [ 37.516495] (4:worker@Ginette) Received "Task" -> [ 37.516495] (4:worker@Ginette) Communication time : "1.082474" -> [ 37.516495] (4:worker@Ginette) Processing "Task" -> [ 37.516495] (5:worker@Bourassa) Mmh. Something went wrong. Nevermind. Let's keep going! -> [ 37.527320] (1:master@Tremblay) Send completed -> [ 37.527320] (2:worker@Tremblay) Received "Task" -> [ 37.527320] (2:worker@Tremblay) Communication time : "0.010825" -> [ 37.527320] (2:worker@Tremblay) Processing "Task" -> [ 38.609794] (1:master@Tremblay) Send completed -> [ 38.609794] (6:worker@Jupiter) Received "Task" -> [ 38.609794] (6:worker@Jupiter) Communication time : "1.082474" -> [ 38.609794] (6:worker@Jupiter) Processing "Task" -> [ 39.516495] (4:worker@Ginette) "Task" done -> [ 39.527320] (2:worker@Tremblay) "Task" done -> [ 40.609794] (6:worker@Jupiter) "Task" done -> [ 48.609794] (1:master@Tremblay) Mmh. Got timeouted while speaking to 'worker-2'. Nevermind. Let's keep going! -> [ 49.692268] (1:master@Tremblay) Send completed -> [ 49.692268] (4:worker@Ginette) Received "Task" -> [ 49.692268] (4:worker@Ginette) Communication time : "1.082474" -> [ 49.692268] (4:worker@Ginette) Processing "Task" -> [ 50.000000] (4:worker@Ginette) Gloups. The cpu on which I'm running just turned off!. See you! -> [ 50.774742] (1:master@Tremblay) Send completed -> [ 50.774742] (1:master@Tremblay) All tasks have been dispatched. Let's tell everybody the computation is over. -> [ 50.774742] (2:worker@Tremblay) Received "finalize" -> [ 50.774742] (2:worker@Tremblay) I'm done. See you! -> [ 50.774742] (5:worker@Bourassa) Received "Task" -> [ 50.774742] (5:worker@Bourassa) Communication time : "1.082474" -> [ 50.774742] (5:worker@Bourassa) Processing "Task" -> [ 50.774742] (6:worker@Jupiter) Received "finalize" -> [ 50.774742] (6:worker@Jupiter) I'm done. See you! -> [ 51.774742] (1:master@Tremblay) Mmh. Got timeouted while speaking to 'worker-2'. Nevermind. Let's keep going! -> [ 52.774742] (0:maestro@) Simulation time 52.7747 -> [ 52.774742] (1:master@Tremblay) Mmh. Got timeouted while speaking to 'worker-3'. Nevermind. Let's keep going! -> [ 52.774742] (1:master@Tremblay) Goodbye now! -> [ 52.774742] (5:worker@Bourassa) "Task" done -> [ 52.774742] (5:worker@Bourassa) Received "finalize" -> [ 52.774742] (5:worker@Bourassa) I'm done. See you! +> [ 2.000000] (7:worker@Jupiter) Waiting a message on worker-1 +> [ 2.000000] (1:master@Tremblay) Mmh. Something went wrong with 'worker-2'. Nevermind. Let's keep going! +> [ 2.000000] (1:master@Tremblay) Send a message to worker-3 +> [ 2.000000] (6:worker@Fafard) Gloups. The cpu on which I'm running just turned off!. See you! +> [ 2.010825] (2:worker@Tremblay) Execution complete. +> [ 2.010825] (2:worker@Tremblay) Waiting a message on worker-0 +> [ 3.082474] (4:worker@Ginette) Start execution... +> [ 3.082474] (1:master@Tremblay) Send to worker-3 completed +> [ 3.082474] (1:master@Tremblay) Send a message to worker-4 +> [ 4.164948] (5:worker@Bourassa) Start execution... +> [ 4.164948] (1:master@Tremblay) Send to worker-4 completed +> [ 4.164948] (1:master@Tremblay) Send a message to worker-0 +> [ 4.175773] (2:worker@Tremblay) Start execution... +> [ 4.175773] (1:master@Tremblay) Send to worker-0 completed +> [ 4.175773] (1:master@Tremblay) Send a message to worker-1 +> [ 5.082474] (4:worker@Ginette) Execution complete. +> [ 5.082474] (4:worker@Ginette) Waiting a message on worker-3 +> [ 5.258247] (7:worker@Jupiter) Start execution... +> [ 5.258247] (1:master@Tremblay) Send to worker-1 completed +> [ 5.258247] (1:master@Tremblay) Send a message to worker-2 +> [ 6.164948] (5:worker@Bourassa) Execution complete. +> [ 6.164948] (5:worker@Bourassa) Waiting a message on worker-4 +> [ 6.175773] (2:worker@Tremblay) Execution complete. +> [ 6.175773] (2:worker@Tremblay) Waiting a message on worker-0 +> [ 7.258247] (7:worker@Jupiter) Execution complete. +> [ 7.258247] (7:worker@Jupiter) Waiting a message on worker-1 +> [ 15.258247] (1:master@Tremblay) Mmh. Got timeouted while speaking to 'worker-2'. Nevermind. Let's keep going! +> [ 15.258247] (1:master@Tremblay) Send a message to worker-3 +> [ 15.258247] (4:worker@Ginette) Mmh. Something went wrong. Nevermind. Let's keep going! +> [ 15.258247] (4:worker@Ginette) Waiting a message on worker-3 +> [ 15.258247] (1:master@Tremblay) Mmh. Something went wrong with 'worker-3'. Nevermind. Let's keep going! +> [ 15.258247] (1:master@Tremblay) Send a message to worker-4 +> [ 16.340722] (5:worker@Bourassa) Start execution... +> [ 16.340722] (1:master@Tremblay) Send to worker-4 completed +> [ 16.340722] (1:master@Tremblay) Send a message to worker-0 +> [ 16.351546] (2:worker@Tremblay) Start execution... +> [ 16.351546] (1:master@Tremblay) Send to worker-0 completed +> [ 16.351546] (1:master@Tremblay) Send a message to worker-1 +> [ 17.434021] (7:worker@Jupiter) Start execution... +> [ 17.434021] (1:master@Tremblay) Send to worker-1 completed +> [ 17.434021] (1:master@Tremblay) Send a message to worker-2 +> [ 18.340722] (5:worker@Bourassa) Execution complete. +> [ 18.340722] (5:worker@Bourassa) Waiting a message on worker-4 +> [ 18.351546] (2:worker@Tremblay) Execution complete. +> [ 18.351546] (2:worker@Tremblay) Waiting a message on worker-0 +> [ 19.434021] (7:worker@Jupiter) Execution complete. +> [ 19.434021] (7:worker@Jupiter) Waiting a message on worker-1 +> [ 27.434021] (1:master@Tremblay) Mmh. Got timeouted while speaking to 'worker-2'. Nevermind. Let's keep going! +> [ 27.434021] (1:master@Tremblay) Send a message to worker-3 +> [ 28.516495] (4:worker@Ginette) Start execution... +> [ 28.516495] (1:master@Tremblay) Send to worker-3 completed +> [ 28.516495] (1:master@Tremblay) Send a message to worker-4 +> [ 28.516495] (5:worker@Bourassa) Mmh. Something went wrong. Nevermind. Let's keep going! +> [ 28.516495] (5:worker@Bourassa) Waiting a message on worker-4 +> [ 28.516495] (1:master@Tremblay) Mmh. Something went wrong with 'worker-4'. Nevermind. Let's keep going! +> [ 28.516495] (1:master@Tremblay) Send a message to worker-0 +> [ 28.527320] (2:worker@Tremblay) Start execution... +> [ 28.527320] (1:master@Tremblay) Send to worker-0 completed +> [ 28.527320] (1:master@Tremblay) Send a message to worker-1 +> [ 29.609794] (7:worker@Jupiter) Start execution... +> [ 29.609794] (1:master@Tremblay) Send to worker-1 completed +> [ 29.609794] (1:master@Tremblay) Send a message to worker-2 +> [ 30.516495] (4:worker@Ginette) Execution complete. +> [ 30.516495] (4:worker@Ginette) Waiting a message on worker-3 +> [ 30.527320] (2:worker@Tremblay) Execution complete. +> [ 30.527320] (2:worker@Tremblay) Waiting a message on worker-0 +> [ 31.609794] (7:worker@Jupiter) Execution complete. +> [ 31.609794] (7:worker@Jupiter) Waiting a message on worker-1 +> [ 39.609794] (1:master@Tremblay) Mmh. Got timeouted while speaking to 'worker-2'. Nevermind. Let's keep going! +> [ 39.609794] (1:master@Tremblay) Send a message to worker-3 +> [ 40.692268] (4:worker@Ginette) Start execution... +> [ 40.692268] (1:master@Tremblay) Send to worker-3 completed +> [ 40.692268] (1:master@Tremblay) Send a message to worker-4 +> [ 41.000000] (4:worker@Ginette) Gloups. The cpu on which I'm running just turned off!. See you! +> [ 41.774742] (5:worker@Bourassa) Start execution... +> [ 41.774742] (1:master@Tremblay) Send to worker-4 completed +> [ 41.774742] (1:master@Tremblay) All tasks have been dispatched. Let's tell everybody the computation is over. +> [ 41.774742] (2:worker@Tremblay) I'm done. See you! +> [ 41.774742] (7:worker@Jupiter) I'm done. See you! +> [ 42.774742] (1:master@Tremblay) Mmh. Got timeouted while speaking to 'worker-2'. Nevermind. Let's keep going! +> [ 43.774742] (5:worker@Bourassa) Execution complete. +> [ 43.774742] (5:worker@Bourassa) Waiting a message on worker-4 +> [ 43.774742] (1:master@Tremblay) Mmh. Got timeouted while speaking to 'worker-3'. Nevermind. Let's keep going! +> [ 43.774742] (5:worker@Bourassa) I'm done. See you! +> [ 43.774742] (1:master@Tremblay) Goodbye now! +> [ 43.774742] (0:maestro@) Simulation time 43.7747 p Testing a simple master/worker example application handling failures. CPU_TI optimization enabled ! output sort 19 -$ $SG_TEST_EXENV ${bindir:=.}/platform-failures$EXEEXT --log=xbt_cfg.thres:critical --log=no_loc ${platfdir}/small_platform_with_failures.xml ${srcdir}/../app-masterworker/app-masterworker_d.xml --cfg=path:${srcdir} -cfg=cpu/optim:TI "--log=root.fmt:[%10.6r]%e(%i:%P@%h)%e%m%n" +$ $SG_TEST_EXENV ${bindir:=.}/platform-failures$EXEEXT --log=xbt_cfg.thres:critical --log=no_loc ${platfdir}/small_platform_with_failures.xml ${srcdir}/../app-masterworker/app-masterworker_d.xml --cfg=path:${srcdir} --cfg=cpu/optim:TI "--log=root.fmt:[%10.6r]%e(%i:%P@%h)%e%m%n" > [ 0.000000] (0:maestro@) Cannot launch process 'worker' on failed host 'Fafard' > [ 0.000000] (1:master@Tremblay) Got 5 workers and 20 tasks to process -> [ 0.010825] (1:master@Tremblay) Send completed -> [ 0.010825] (2:worker@Tremblay) Received "Task" -> [ 0.010825] (2:worker@Tremblay) Communication time : "0.010825" -> [ 0.010825] (2:worker@Tremblay) Processing "Task" +> [ 0.000000] (1:master@Tremblay) Send a message to worker-0 +> [ 0.000000] (2:worker@Tremblay) Waiting a message on worker-0 +> [ 0.000000] (3:worker@Jupiter) Waiting a message on worker-1 +> [ 0.000000] (4:worker@Ginette) Waiting a message on worker-3 +> [ 0.000000] (5:worker@Bourassa) Waiting a message on worker-4 +> [ 0.010825] (1:master@Tremblay) Send to worker-0 completed +> [ 0.010825] (1:master@Tremblay) Send a message to worker-1 +> [ 0.010825] (2:worker@Tremblay) Start execution... > [ 1.000000] (0:maestro@) Restart processes on host Fafard > [ 1.000000] (1:master@Tremblay) Mmh. Something went wrong with 'worker-1'. Nevermind. Let's keep going! +> [ 1.000000] (1:master@Tremblay) Send a message to worker-2 > [ 1.000000] (3:worker@Jupiter) Gloups. The cpu on which I'm running just turned off!. See you! +> [ 1.000000] (6:worker@Fafard) Waiting a message on worker-2 > [ 2.000000] (0:maestro@) Restart processes on host Jupiter -> [ 2.010825] (2:worker@Tremblay) "Task" done -> [ 11.000000] (1:master@Tremblay) Mmh. Got timeouted while speaking to 'worker-2'. Nevermind. Let's keep going! -> [ 12.082474] (1:master@Tremblay) Send completed -> [ 12.082474] (4:worker@Ginette) Received "Task" -> [ 12.082474] (4:worker@Ginette) Communication time : "1.082474" -> [ 12.082474] (4:worker@Ginette) Processing "Task" -> [ 13.164948] (1:master@Tremblay) Send completed -> [ 13.164948] (5:worker@Bourassa) Received "Task" -> [ 13.164948] (5:worker@Bourassa) Communication time : "1.082474" -> [ 13.164948] (5:worker@Bourassa) Processing "Task" -> [ 13.175773] (1:master@Tremblay) Send completed -> [ 13.175773] (2:worker@Tremblay) Received "Task" -> [ 13.175773] (2:worker@Tremblay) Communication time : "0.010825" -> [ 13.175773] (2:worker@Tremblay) Processing "Task" -> [ 14.082474] (4:worker@Ginette) "Task" done -> [ 14.258247] (1:master@Tremblay) Send completed -> [ 14.258247] (6:worker@Jupiter) Received "Task" -> [ 14.258247] (6:worker@Jupiter) Communication time : "1.082474" -> [ 14.258247] (6:worker@Jupiter) Processing "Task" -> [ 15.164948] (5:worker@Bourassa) "Task" done -> [ 15.175773] (2:worker@Tremblay) "Task" done -> [ 16.258247] (6:worker@Jupiter) "Task" done -> [ 24.258247] (1:master@Tremblay) Mmh. Got timeouted while speaking to 'worker-2'. Nevermind. Let's keep going! -> [ 24.258247] (1:master@Tremblay) Mmh. Something went wrong with 'worker-3'. Nevermind. Let's keep going! -> [ 24.258247] (4:worker@Ginette) Mmh. Something went wrong. Nevermind. Let's keep going! -> [ 25.340722] (1:master@Tremblay) Send completed -> [ 25.340722] (5:worker@Bourassa) Received "Task" -> [ 25.340722] (5:worker@Bourassa) Communication time : "1.082474" -> [ 25.340722] (5:worker@Bourassa) Processing "Task" -> [ 25.351546] (1:master@Tremblay) Send completed -> [ 25.351546] (2:worker@Tremblay) Received "Task" -> [ 25.351546] (2:worker@Tremblay) Communication time : "0.010825" -> [ 25.351546] (2:worker@Tremblay) Processing "Task" -> [ 26.434021] (1:master@Tremblay) Send completed -> [ 26.434021] (6:worker@Jupiter) Received "Task" -> [ 26.434021] (6:worker@Jupiter) Communication time : "1.082474" -> [ 26.434021] (6:worker@Jupiter) Processing "Task" -> [ 27.340722] (5:worker@Bourassa) "Task" done -> [ 27.351546] (2:worker@Tremblay) "Task" done -> [ 28.434021] (6:worker@Jupiter) "Task" done -> [ 36.434021] (1:master@Tremblay) Mmh. Got timeouted while speaking to 'worker-2'. Nevermind. Let's keep going! -> [ 37.516495] (1:master@Tremblay) Send completed -> [ 37.516495] (1:master@Tremblay) Mmh. Something went wrong with 'worker-4'. Nevermind. Let's keep going! -> [ 37.516495] (4:worker@Ginette) Received "Task" -> [ 37.516495] (4:worker@Ginette) Communication time : "1.082474" -> [ 37.516495] (4:worker@Ginette) Processing "Task" -> [ 37.516495] (5:worker@Bourassa) Mmh. Something went wrong. Nevermind. Let's keep going! -> [ 37.527320] (1:master@Tremblay) Send completed -> [ 37.527320] (2:worker@Tremblay) Received "Task" -> [ 37.527320] (2:worker@Tremblay) Communication time : "0.010825" -> [ 37.527320] (2:worker@Tremblay) Processing "Task" -> [ 38.609794] (1:master@Tremblay) Send completed -> [ 38.609794] (6:worker@Jupiter) Received "Task" -> [ 38.609794] (6:worker@Jupiter) Communication time : "1.082474" -> [ 38.609794] (6:worker@Jupiter) Processing "Task" -> [ 39.516495] (4:worker@Ginette) "Task" done -> [ 39.527320] (2:worker@Tremblay) "Task" done -> [ 40.609794] (6:worker@Jupiter) "Task" done -> [ 48.609794] (1:master@Tremblay) Mmh. Got timeouted while speaking to 'worker-2'. Nevermind. Let's keep going! -> [ 49.692268] (1:master@Tremblay) Send completed -> [ 49.692268] (4:worker@Ginette) Received "Task" -> [ 49.692268] (4:worker@Ginette) Communication time : "1.082474" -> [ 49.692268] (4:worker@Ginette) Processing "Task" -> [ 50.000000] (4:worker@Ginette) Gloups. The cpu on which I'm running just turned off!. See you! -> [ 50.774742] (1:master@Tremblay) Send completed -> [ 50.774742] (1:master@Tremblay) All tasks have been dispatched. Let's tell everybody the computation is over. -> [ 50.774742] (2:worker@Tremblay) Received "finalize" -> [ 50.774742] (2:worker@Tremblay) I'm done. See you! -> [ 50.774742] (5:worker@Bourassa) Received "Task" -> [ 50.774742] (5:worker@Bourassa) Communication time : "1.082474" -> [ 50.774742] (5:worker@Bourassa) Processing "Task" -> [ 50.774742] (6:worker@Jupiter) Received "finalize" -> [ 50.774742] (6:worker@Jupiter) I'm done. See you! -> [ 51.774742] (1:master@Tremblay) Mmh. Got timeouted while speaking to 'worker-2'. Nevermind. Let's keep going! -> [ 52.774742] (0:maestro@) Simulation time 52.7747 -> [ 52.774742] (1:master@Tremblay) Mmh. Got timeouted while speaking to 'worker-3'. Nevermind. Let's keep going! -> [ 52.774742] (1:master@Tremblay) Goodbye now! -> [ 52.774742] (5:worker@Bourassa) "Task" done -> [ 52.774742] (5:worker@Bourassa) Received "finalize" -> [ 52.774742] (5:worker@Bourassa) I'm done. See you! +> [ 2.000000] (1:master@Tremblay) Mmh. Something went wrong with 'worker-2'. Nevermind. Let's keep going! +> [ 2.000000] (1:master@Tremblay) Send a message to worker-3 +> [ 2.000000] (6:worker@Fafard) Gloups. The cpu on which I'm running just turned off!. See you! +> [ 2.000000] (7:worker@Jupiter) Waiting a message on worker-1 +> [ 2.010825] (2:worker@Tremblay) Execution complete. +> [ 2.010825] (2:worker@Tremblay) Waiting a message on worker-0 +> [ 3.082474] (1:master@Tremblay) Send to worker-3 completed +> [ 3.082474] (1:master@Tremblay) Send a message to worker-4 +> [ 3.082474] (4:worker@Ginette) Start execution... +> [ 4.164948] (1:master@Tremblay) Send to worker-4 completed +> [ 4.164948] (1:master@Tremblay) Send a message to worker-0 +> [ 4.164948] (5:worker@Bourassa) Start execution... +> [ 4.175773] (1:master@Tremblay) Send to worker-0 completed +> [ 4.175773] (1:master@Tremblay) Send a message to worker-1 +> [ 4.175773] (2:worker@Tremblay) Start execution... +> [ 5.082474] (4:worker@Ginette) Execution complete. +> [ 5.082474] (4:worker@Ginette) Waiting a message on worker-3 +> [ 5.258247] (1:master@Tremblay) Send to worker-1 completed +> [ 5.258247] (1:master@Tremblay) Send a message to worker-2 +> [ 5.258247] (7:worker@Jupiter) Start execution... +> [ 6.164948] (5:worker@Bourassa) Execution complete. +> [ 6.164948] (5:worker@Bourassa) Waiting a message on worker-4 +> [ 6.175773] (2:worker@Tremblay) Execution complete. +> [ 6.175773] (2:worker@Tremblay) Waiting a message on worker-0 +> [ 7.258247] (7:worker@Jupiter) Execution complete. +> [ 7.258247] (7:worker@Jupiter) Waiting a message on worker-1 +> [ 15.258247] (1:master@Tremblay) Mmh. Got timeouted while speaking to 'worker-2'. Nevermind. Let's keep going! +> [ 15.258247] (1:master@Tremblay) Send a message to worker-3 +> [ 15.258247] (1:master@Tremblay) Mmh. Something went wrong with 'worker-3'. Nevermind. Let's keep going! +> [ 15.258247] (1:master@Tremblay) Send a message to worker-4 +> [ 15.258247] (4:worker@Ginette) Mmh. Something went wrong. Nevermind. Let's keep going! +> [ 15.258247] (4:worker@Ginette) Waiting a message on worker-3 +> [ 16.340722] (1:master@Tremblay) Send to worker-4 completed +> [ 16.340722] (1:master@Tremblay) Send a message to worker-0 +> [ 16.340722] (5:worker@Bourassa) Start execution... +> [ 16.351546] (1:master@Tremblay) Send to worker-0 completed +> [ 16.351546] (1:master@Tremblay) Send a message to worker-1 +> [ 16.351546] (2:worker@Tremblay) Start execution... +> [ 17.434021] (1:master@Tremblay) Send to worker-1 completed +> [ 17.434021] (1:master@Tremblay) Send a message to worker-2 +> [ 17.434021] (7:worker@Jupiter) Start execution... +> [ 18.340722] (5:worker@Bourassa) Execution complete. +> [ 18.340722] (5:worker@Bourassa) Waiting a message on worker-4 +> [ 18.351546] (2:worker@Tremblay) Execution complete. +> [ 18.351546] (2:worker@Tremblay) Waiting a message on worker-0 +> [ 19.434021] (7:worker@Jupiter) Execution complete. +> [ 19.434021] (7:worker@Jupiter) Waiting a message on worker-1 +> [ 27.434021] (1:master@Tremblay) Mmh. Got timeouted while speaking to 'worker-2'. Nevermind. Let's keep going! +> [ 27.434021] (1:master@Tremblay) Send a message to worker-3 +> [ 28.516495] (1:master@Tremblay) Send to worker-3 completed +> [ 28.516495] (1:master@Tremblay) Send a message to worker-4 +> [ 28.516495] (1:master@Tremblay) Mmh. Something went wrong with 'worker-4'. Nevermind. Let's keep going! +> [ 28.516495] (1:master@Tremblay) Send a message to worker-0 +> [ 28.516495] (4:worker@Ginette) Start execution... +> [ 28.516495] (5:worker@Bourassa) Mmh. Something went wrong. Nevermind. Let's keep going! +> [ 28.516495] (5:worker@Bourassa) Waiting a message on worker-4 +> [ 28.527320] (1:master@Tremblay) Send to worker-0 completed +> [ 28.527320] (1:master@Tremblay) Send a message to worker-1 +> [ 28.527320] (2:worker@Tremblay) Start execution... +> [ 29.609794] (1:master@Tremblay) Send to worker-1 completed +> [ 29.609794] (1:master@Tremblay) Send a message to worker-2 +> [ 29.609794] (7:worker@Jupiter) Start execution... +> [ 30.516495] (4:worker@Ginette) Execution complete. +> [ 30.516495] (4:worker@Ginette) Waiting a message on worker-3 +> [ 30.527320] (2:worker@Tremblay) Execution complete. +> [ 30.527320] (2:worker@Tremblay) Waiting a message on worker-0 +> [ 31.609794] (7:worker@Jupiter) Execution complete. +> [ 31.609794] (7:worker@Jupiter) Waiting a message on worker-1 +> [ 39.609794] (1:master@Tremblay) Mmh. Got timeouted while speaking to 'worker-2'. Nevermind. Let's keep going! +> [ 39.609794] (1:master@Tremblay) Send a message to worker-3 +> [ 40.692268] (1:master@Tremblay) Send to worker-3 completed +> [ 40.692268] (1:master@Tremblay) Send a message to worker-4 +> [ 40.692268] (4:worker@Ginette) Start execution... +> [ 41.000000] (4:worker@Ginette) Gloups. The cpu on which I'm running just turned off!. See you! +> [ 41.774742] (1:master@Tremblay) Send to worker-4 completed +> [ 41.774742] (1:master@Tremblay) All tasks have been dispatched. Let's tell everybody the computation is over. +> [ 41.774742] (2:worker@Tremblay) I'm done. See you! +> [ 41.774742] (5:worker@Bourassa) Start execution... +> [ 41.774742] (7:worker@Jupiter) I'm done. See you! +> [ 42.774742] (1:master@Tremblay) Mmh. Got timeouted while speaking to 'worker-2'. Nevermind. Let's keep going! +> [ 43.774742] (0:maestro@) Simulation time 43.7747 +> [ 43.774742] (1:master@Tremblay) Mmh. Got timeouted while speaking to 'worker-3'. Nevermind. Let's keep going! +> [ 43.774742] (1:master@Tremblay) Goodbye now! +> [ 43.774742] (5:worker@Bourassa) Execution complete. +> [ 43.774742] (5:worker@Bourassa) Waiting a message on worker-4 +> [ 43.774742] (5:worker@Bourassa) I'm done. See you! diff --git a/examples/platforms/trace/ginette_state.trace b/examples/platforms/trace/ginette_state.trace index e3fbe4028b..108bce3cf1 100644 --- a/examples/platforms/trace/ginette_state.trace +++ b/examples/platforms/trace/ginette_state.trace @@ -1,2 +1,2 @@ -50 0 -60 1 +41 0 +50 1 diff --git a/examples/platforms/trace/link3_state.trace b/examples/platforms/trace/link3_state.trace index 8cad9d0248..aa52029ac3 100644 --- a/examples/platforms/trace/link3_state.trace +++ b/examples/platforms/trace/link3_state.trace @@ -1,4 +1,6 @@ 13 0 14 1 +15 0 +16 1 20 0 25 1 diff --git a/examples/platforms/trace/link4_state.trace b/examples/platforms/trace/link4_state.trace index 006aa06fe7..fab0dd2e22 100644 --- a/examples/platforms/trace/link4_state.trace +++ b/examples/platforms/trace/link4_state.trace @@ -1,2 +1,2 @@ -35 0 -40 1 +25 0 +30 1 diff --git a/examples/s4u/CMakeLists.txt b/examples/s4u/CMakeLists.txt index c8a987ec4f..dae1364bf5 100644 --- a/examples/s4u/CMakeLists.txt +++ b/examples/s4u/CMakeLists.txt @@ -32,7 +32,6 @@ foreach(variant fun class) endforeach() set(tesh_files ${tesh_files} ${CMAKE_CURRENT_SOURCE_DIR}/app-masterworkers/s4u-app-masterworkers.tesh) - # CHORD EXAMPLE add_executable (s4u-dht-chord dht-chord/s4u-dht-chord.cpp dht-chord/s4u-dht-chord-node.cpp) target_link_libraries(s4u-dht-chord simgrid) @@ -102,8 +101,13 @@ foreach(example actor-create actor-daemon actor-join actor-kill energy-exec energy-boot energy-link energy-vm engine-filtering exec-async exec-basic exec-dvfs exec-monitor exec-ptask exec-remote +<<<<<<< HEAD platform-properties plugin-hostload mutex io-async io-file-system io-file-remote io-storage-raw +======= + platform-failures platform-properties plugin-hostload mutex + io-file-system io-file-remote io-storage-raw +>>>>>>> 4ccbacb51eb49323847a906c3e79ea838d76e2a7 replay-comm replay-storage routing-get-clusters ) diff --git a/examples/s4u/platform-failures/s4u-platform-failures.cpp b/examples/s4u/platform-failures/s4u-platform-failures.cpp index 941ea2aba8..8b54502938 100644 --- a/examples/s4u/platform-failures/s4u-platform-failures.cpp +++ b/examples/s4u/platform-failures/s4u-platform-failures.cpp @@ -25,8 +25,9 @@ static int master(int argc, char* argv[]) mailbox = simgrid::s4u::Mailbox::by_name(std::string("worker-") + std::to_string(i % workers_count)); double* payload = new double(comp_size); try { + XBT_INFO("Send a message to %s", mailbox->get_cname()); mailbox->put(payload, comm_size, 10.0); - XBT_INFO("Send completed"); + XBT_INFO("Send to %s completed", mailbox->get_cname()); } catch (xbt_ex& e) { switch (e.category) { case host_error: @@ -84,8 +85,27 @@ static int worker(int argc, char* argv[]) double comp_size = -1; while (1) { try { + XBT_INFO("Waiting a message on %s", mailbox->get_cname()); payload = static_cast(mailbox->get()); comp_size = *payload; + xbt_assert(payload != nullptr, "mailbox->get() failed"); + if (comp_size < 0) { /* - Exit when -1.0 is received */ + XBT_INFO("I'm done. See you!"); + break; + } + /* - Otherwise, process the task */ + try { + XBT_INFO("Start execution..."); + simgrid::s4u::this_actor::execute(comp_size); + XBT_INFO("Execution complete."); + } catch (xbt_ex& e) { + if (e.category == host_error) { + XBT_INFO("Gloups. The cpu on which I'm running just turned off!. See you!"); + return -1; + } else + xbt_die("Unexpected behavior"); + } + delete payload; } catch (xbt_ex& e) { switch (e.category) { @@ -99,23 +119,7 @@ static int worker(int argc, char* argv[]) xbt_die("Unexpected behavior"); } } - xbt_assert(payload != nullptr, "mailbox->get() failed"); - if (comp_size < 0) { /* - Exit when -1.0 is received */ - XBT_INFO("I'm done. See you!"); - break; - } - /* - Otherwise, process the task */ - try { - simgrid::s4u::this_actor::execute(comp_size); - } catch (xbt_ex& e) { - if (e.category == host_error) { - XBT_INFO("Gloups. The cpu on which I'm running just turned off!. See you!"); - return -1; - } else - xbt_die("Unexpected behavior"); - } } - XBT_INFO("I'm done. See you!"); return 0; } diff --git a/examples/s4u/platform-failures/s4u-platform-failures.tesh b/examples/s4u/platform-failures/s4u-platform-failures.tesh index 5738a23318..4b365e854f 100644 --- a/examples/s4u/platform-failures/s4u-platform-failures.tesh +++ b/examples/s4u/platform-failures/s4u-platform-failures.tesh @@ -6,37 +6,210 @@ p Testing a simple master/worker example application handling failures TCP cross $ $SG_TEST_EXENV ${bindir:=.}/s4u-platform-failures$EXEEXT --log=xbt_cfg.thres:critical --log=no_loc ${platfdir}/small_platform_with_failures.xml ${bindir}/../app-masterworker/s4u-app-masterworker_d.xml --cfg=path:${srcdir} --cfg=network/crosstraffic:0 "--log=root.fmt:[%10.6r]%e(%i:%P@%h)%e%m%n" > [ 0.000000] (0:maestro@) Cannot launch process 'worker' on failed host 'Fafard' > [ 0.000000] (1:master@Tremblay) Got 5 workers and 20 tasks to process -> [ 0.010309] (1:master@Tremblay) Send completed +> [ 0.000000] (1:master@Tremblay) Send a message to worker-0 +> [ 0.010309] (1:master@Tremblay) Send to worker-0 completed +> [ 0.010309] (2:worker@Tremblay) Start execution... +> [ 0.000000] (2:worker@Tremblay) Waiting a message on worker-0 +> [ 0.000000] (3:worker@Jupiter) Waiting a message on worker-1 +> [ 0.000000] (4:worker@Ginette) Waiting a message on worker-3 +> [ 0.000000] (5:worker@Bourassa) Waiting a message on worker-4 +> [ 0.010309] (1:master@Tremblay) Send a message to worker-1 > [ 1.000000] (0:maestro@) Restart processes on host Fafard +> [ 1.000000] (6:worker@Fafard) Waiting a message on worker-2 > [ 1.000000] (1:master@Tremblay) Mmh. Something went wrong with 'worker-1'. Nevermind. Let's keep going! +> [ 1.000000] (1:master@Tremblay) Send a message to worker-2 > [ 1.000000] (3:worker@Jupiter) Gloups. The cpu on which I'm running just turned off!. See you! +> [ 2.000000] (1:master@Tremblay) Mmh. Something went wrong with 'worker-2'. Nevermind. Let's keep going! +> [ 2.000000] (6:worker@Fafard) Gloups. The cpu on which I'm running just turned off!. See you! > [ 2.000000] (0:maestro@) Restart processes on host Jupiter -> [ 11.000000] (1:master@Tremblay) Mmh. Got timeouted while speaking to 'worker-2'. Nevermind. Let's keep going! -> [ 12.030928] (1:master@Tremblay) Send completed -> [ 13.061856] (1:master@Tremblay) Send completed -> [ 13.072165] (1:master@Tremblay) Send completed -> [ 14.103093] (1:master@Tremblay) Send completed -> [ 24.103093] (1:master@Tremblay) Mmh. Got timeouted while speaking to 'worker-2'. Nevermind. Let's keep going! -> [ 24.103093] (1:master@Tremblay) Mmh. Something went wrong with 'worker-3'. Nevermind. Let's keep going! -> [ 24.103093] (4:worker@Ginette) Mmh. Something went wrong. Nevermind. Let's keep going! -> [ 25.134021] (1:master@Tremblay) Send completed -> [ 25.144330] (1:master@Tremblay) Send completed -> [ 26.175258] (1:master@Tremblay) Send completed -> [ 36.175258] (1:master@Tremblay) Mmh. Got timeouted while speaking to 'worker-2'. Nevermind. Let's keep going! -> [ 37.206186] (1:master@Tremblay) Send completed -> [ 37.206186] (1:master@Tremblay) Mmh. Something went wrong with 'worker-4'. Nevermind. Let's keep going! -> [ 37.206186] (5:worker@Bourassa) Mmh. Something went wrong. Nevermind. Let's keep going! -> [ 38.247423] (1:master@Tremblay) Send completed -> [ 48.247423] (1:master@Tremblay) Mmh. Got timeouted while speaking to 'worker-2'. Nevermind. Let's keep going! -> [ 49.278351] (1:master@Tremblay) Send completed -> [ 50.000000] (4:worker@Ginette) Gloups. The cpu on which I'm running just turned off!. See you! -> [ 50.309278] (1:master@Tremblay) Send completed -> [ 50.309278] (1:master@Tremblay) All tasks have been dispatched. Let's tell everybody the computation is over. -> [ 50.309278] (2:worker@Tremblay) I'm done. See you! -> [ 50.309278] (6:worker@Jupiter) I'm done. See you! -> [ 51.309278] (1:master@Tremblay) Mmh. Got timeouted while speaking to 'worker-2'. Nevermind. Let's keep going! -> [ 52.309278] (0:maestro@) Simulation time 52.3093 -> [ 52.309278] (1:master@Tremblay) Mmh. Got timeouted while speaking to 'worker-3'. Nevermind. Let's keep going! -> [ 52.309278] (1:master@Tremblay) Goodbye now! -> [ 52.309278] (5:worker@Bourassa) I'm done. See you! +> [ 2.000000] (1:master@Tremblay) Send a message to worker-3 +> [ 2.000000] (7:worker@Jupiter) Waiting a message on worker-1 +> [ 2.010309] (2:worker@Tremblay) Execution complete. +> [ 2.010309] (2:worker@Tremblay) Waiting a message on worker-0 +> [ 3.030928] (1:master@Tremblay) Send to worker-3 completed +> [ 3.030928] (1:master@Tremblay) Send a message to worker-4 +> [ 3.030928] (4:worker@Ginette) Start execution... +> [ 4.061856] (1:master@Tremblay) Send to worker-4 completed +> [ 4.061856] (1:master@Tremblay) Send a message to worker-0 +> [ 4.061856] (5:worker@Bourassa) Start execution... +> [ 4.072165] (1:master@Tremblay) Send to worker-0 completed +> [ 4.072165] (1:master@Tremblay) Send a message to worker-1 +> [ 4.072165] (2:worker@Tremblay) Start execution... +> [ 5.030928] (4:worker@Ginette) Execution complete. +> [ 5.030928] (4:worker@Ginette) Waiting a message on worker-3 +> [ 5.103093] (1:master@Tremblay) Send to worker-1 completed +> [ 5.103093] (1:master@Tremblay) Send a message to worker-2 +> [ 5.103093] (7:worker@Jupiter) Start execution... +> [ 6.061856] (5:worker@Bourassa) Execution complete. +> [ 6.061856] (5:worker@Bourassa) Waiting a message on worker-4 +> [ 6.072165] (2:worker@Tremblay) Execution complete. +> [ 6.072165] (2:worker@Tremblay) Waiting a message on worker-0 +> [ 7.103093] (7:worker@Jupiter) Execution complete. +> [ 7.103093] (7:worker@Jupiter) Waiting a message on worker-1 +> [ 15.103093] (1:master@Tremblay) Mmh. Got timeouted while speaking to 'worker-2'. Nevermind. Let's keep going! +> [ 15.103093] (1:master@Tremblay) Send a message to worker-3 +> [ 15.103093] (1:master@Tremblay) Mmh. Something went wrong with 'worker-3'. Nevermind. Let's keep going! +> [ 15.103093] (1:master@Tremblay) Send a message to worker-4 +> [ 15.103093] (4:worker@Ginette) Mmh. Something went wrong. Nevermind. Let's keep going! +> [ 15.103093] (4:worker@Ginette) Waiting a message on worker-3 +> [ 16.134021] (1:master@Tremblay) Send to worker-4 completed +> [ 16.134021] (1:master@Tremblay) Send a message to worker-0 +> [ 16.134021] (5:worker@Bourassa) Start execution... +> [ 16.144330] (1:master@Tremblay) Send to worker-0 completed +> [ 16.144330] (1:master@Tremblay) Send a message to worker-1 +> [ 16.144330] (2:worker@Tremblay) Start execution... +> [ 17.175258] (1:master@Tremblay) Send to worker-1 completed +> [ 17.175258] (1:master@Tremblay) Send a message to worker-2 +> [ 17.175258] (7:worker@Jupiter) Start execution... +> [ 18.134021] (5:worker@Bourassa) Execution complete. +> [ 18.134021] (5:worker@Bourassa) Waiting a message on worker-4 +> [ 18.144330] (2:worker@Tremblay) Execution complete. +> [ 18.144330] (2:worker@Tremblay) Waiting a message on worker-0 +> [ 19.175258] (7:worker@Jupiter) Execution complete. +> [ 19.175258] (7:worker@Jupiter) Waiting a message on worker-1 +> [ 27.175258] (1:master@Tremblay) Mmh. Got timeouted while speaking to 'worker-2'. Nevermind. Let's keep going! +> [ 27.175258] (1:master@Tremblay) Send a message to worker-3 +> [ 28.206186] (1:master@Tremblay) Send to worker-3 completed +> [ 28.206186] (1:master@Tremblay) Send a message to worker-4 +> [ 28.206186] (1:master@Tremblay) Mmh. Something went wrong with 'worker-4'. Nevermind. Let's keep going! +> [ 28.206186] (1:master@Tremblay) Send a message to worker-0 +> [ 28.206186] (4:worker@Ginette) Start execution... +> [ 28.206186] (5:worker@Bourassa) Mmh. Something went wrong. Nevermind. Let's keep going! +> [ 28.206186] (5:worker@Bourassa) Waiting a message on worker-4 +> [ 28.216495] (1:master@Tremblay) Send to worker-0 completed +> [ 28.216495] (1:master@Tremblay) Send a message to worker-1 +> [ 28.216495] (2:worker@Tremblay) Start execution... +> [ 29.247423] (1:master@Tremblay) Send to worker-1 completed +> [ 29.247423] (1:master@Tremblay) Send a message to worker-2 +> [ 29.247423] (7:worker@Jupiter) Start execution... +> [ 30.206186] (4:worker@Ginette) Execution complete. +> [ 30.206186] (4:worker@Ginette) Waiting a message on worker-3 +> [ 30.216495] (2:worker@Tremblay) Execution complete. +> [ 30.216495] (2:worker@Tremblay) Waiting a message on worker-0 +> [ 31.247423] (7:worker@Jupiter) Execution complete. +> [ 31.247423] (7:worker@Jupiter) Waiting a message on worker-1 +> [ 39.247423] (1:master@Tremblay) Mmh. Got timeouted while speaking to 'worker-2'. Nevermind. Let's keep going! +> [ 39.247423] (1:master@Tremblay) Send a message to worker-3 +> [ 40.278351] (1:master@Tremblay) Send to worker-3 completed +> [ 40.278351] (1:master@Tremblay) Send a message to worker-4 +> [ 40.278351] (4:worker@Ginette) Start execution... +> [ 41.000000] (4:worker@Ginette) Gloups. The cpu on which I'm running just turned off!. See you! +> [ 41.309278] (1:master@Tremblay) Send to worker-4 completed +> [ 41.309278] (1:master@Tremblay) All tasks have been dispatched. Let's tell everybody the computation is over. +> [ 41.309278] (2:worker@Tremblay) I'm done. See you! +> [ 41.309278] (5:worker@Bourassa) Start execution... +> [ 41.309278] (7:worker@Jupiter) I'm done. See you! +> [ 42.309278] (1:master@Tremblay) Mmh. Got timeouted while speaking to 'worker-2'. Nevermind. Let's keep going! +> [ 43.309278] (0:maestro@) Simulation time 43.3093 +> [ 43.309278] (1:master@Tremblay) Mmh. Got timeouted while speaking to 'worker-3'. Nevermind. Let's keep going! +> [ 43.309278] (1:master@Tremblay) Goodbye now! +> [ 43.309278] (5:worker@Bourassa) Execution complete. +> [ 43.309278] (5:worker@Bourassa) Waiting a message on worker-4 +> [ 43.309278] (5:worker@Bourassa) I'm done. See you! +p Testing a simple master/worker example application handling failures. TCP crosstraffic ENABLED + +! output sort 19 +$ $SG_TEST_EXENV ${bindir:=.}/s4u-platform-failures$EXEEXT --log=xbt_cfg.thres:critical --log=no_loc ${platfdir}/small_platform_with_failures.xml ${bindir}/../app-masterworker/s4u-app-masterworker_d.xml --cfg=path:${srcdir} "--log=root.fmt:[%10.6r]%e(%i:%P@%h)%e%m%n" +> [ 0.000000] (0:maestro@) Cannot launch process 'worker' on failed host 'Fafard' +> [ 0.000000] (1:master@Tremblay) Got 5 workers and 20 tasks to process +> [ 0.000000] (1:master@Tremblay) Send a message to worker-0 +> [ 0.000000] (2:worker@Tremblay) Waiting a message on worker-0 +> [ 0.000000] (3:worker@Jupiter) Waiting a message on worker-1 +> [ 0.000000] (4:worker@Ginette) Waiting a message on worker-3 +> [ 0.000000] (5:worker@Bourassa) Waiting a message on worker-4 +> [ 0.010825] (2:worker@Tremblay) Start execution... +> [ 0.010825] (1:master@Tremblay) Send to worker-0 completed +> [ 0.010825] (1:master@Tremblay) Send a message to worker-1 +> [ 1.000000] (0:maestro@) Restart processes on host Fafard +> [ 1.000000] (6:worker@Fafard) Waiting a message on worker-2 +> [ 1.000000] (1:master@Tremblay) Mmh. Something went wrong with 'worker-1'. Nevermind. Let's keep going! +> [ 1.000000] (1:master@Tremblay) Send a message to worker-2 +> [ 1.000000] (3:worker@Jupiter) Gloups. The cpu on which I'm running just turned off!. See you! +> [ 2.000000] (0:maestro@) Restart processes on host Jupiter +> [ 2.000000] (7:worker@Jupiter) Waiting a message on worker-1 +> [ 2.000000] (1:master@Tremblay) Mmh. Something went wrong with 'worker-2'. Nevermind. Let's keep going! +> [ 2.000000] (1:master@Tremblay) Send a message to worker-3 +> [ 2.000000] (6:worker@Fafard) Gloups. The cpu on which I'm running just turned off!. See you! +> [ 2.010825] (2:worker@Tremblay) Execution complete. +> [ 2.010825] (2:worker@Tremblay) Waiting a message on worker-0 +> [ 3.082474] (4:worker@Ginette) Start execution... +> [ 3.082474] (1:master@Tremblay) Send to worker-3 completed +> [ 3.082474] (1:master@Tremblay) Send a message to worker-4 +> [ 4.164948] (5:worker@Bourassa) Start execution... +> [ 4.164948] (1:master@Tremblay) Send to worker-4 completed +> [ 4.164948] (1:master@Tremblay) Send a message to worker-0 +> [ 4.175773] (2:worker@Tremblay) Start execution... +> [ 4.175773] (1:master@Tremblay) Send to worker-0 completed +> [ 4.175773] (1:master@Tremblay) Send a message to worker-1 +> [ 5.082474] (4:worker@Ginette) Execution complete. +> [ 5.082474] (4:worker@Ginette) Waiting a message on worker-3 +> [ 5.258247] (7:worker@Jupiter) Start execution... +> [ 5.258247] (1:master@Tremblay) Send to worker-1 completed +> [ 5.258247] (1:master@Tremblay) Send a message to worker-2 +> [ 6.164948] (5:worker@Bourassa) Execution complete. +> [ 6.164948] (5:worker@Bourassa) Waiting a message on worker-4 +> [ 6.175773] (2:worker@Tremblay) Execution complete. +> [ 6.175773] (2:worker@Tremblay) Waiting a message on worker-0 +> [ 7.258247] (7:worker@Jupiter) Execution complete. +> [ 7.258247] (7:worker@Jupiter) Waiting a message on worker-1 +> [ 15.258247] (1:master@Tremblay) Mmh. Got timeouted while speaking to 'worker-2'. Nevermind. Let's keep going! +> [ 15.258247] (1:master@Tremblay) Send a message to worker-3 +> [ 15.258247] (4:worker@Ginette) Mmh. Something went wrong. Nevermind. Let's keep going! +> [ 15.258247] (4:worker@Ginette) Waiting a message on worker-3 +> [ 15.258247] (1:master@Tremblay) Mmh. Something went wrong with 'worker-3'. Nevermind. Let's keep going! +> [ 15.258247] (1:master@Tremblay) Send a message to worker-4 +> [ 16.340722] (5:worker@Bourassa) Start execution... +> [ 16.340722] (1:master@Tremblay) Send to worker-4 completed +> [ 16.340722] (1:master@Tremblay) Send a message to worker-0 +> [ 16.351546] (2:worker@Tremblay) Start execution... +> [ 16.351546] (1:master@Tremblay) Send to worker-0 completed +> [ 16.351546] (1:master@Tremblay) Send a message to worker-1 +> [ 17.434021] (7:worker@Jupiter) Start execution... +> [ 17.434021] (1:master@Tremblay) Send to worker-1 completed +> [ 17.434021] (1:master@Tremblay) Send a message to worker-2 +> [ 18.340722] (5:worker@Bourassa) Execution complete. +> [ 18.340722] (5:worker@Bourassa) Waiting a message on worker-4 +> [ 18.351546] (2:worker@Tremblay) Execution complete. +> [ 18.351546] (2:worker@Tremblay) Waiting a message on worker-0 +> [ 19.434021] (7:worker@Jupiter) Execution complete. +> [ 19.434021] (7:worker@Jupiter) Waiting a message on worker-1 +> [ 27.434021] (1:master@Tremblay) Mmh. Got timeouted while speaking to 'worker-2'. Nevermind. Let's keep going! +> [ 27.434021] (1:master@Tremblay) Send a message to worker-3 +> [ 28.516495] (4:worker@Ginette) Start execution... +> [ 28.516495] (1:master@Tremblay) Send to worker-3 completed +> [ 28.516495] (1:master@Tremblay) Send a message to worker-4 +> [ 28.516495] (5:worker@Bourassa) Mmh. Something went wrong. Nevermind. Let's keep going! +> [ 28.516495] (5:worker@Bourassa) Waiting a message on worker-4 +> [ 28.516495] (1:master@Tremblay) Mmh. Something went wrong with 'worker-4'. Nevermind. Let's keep going! +> [ 28.516495] (1:master@Tremblay) Send a message to worker-0 +> [ 28.527320] (2:worker@Tremblay) Start execution... +> [ 28.527320] (1:master@Tremblay) Send to worker-0 completed +> [ 28.527320] (1:master@Tremblay) Send a message to worker-1 +> [ 29.609794] (7:worker@Jupiter) Start execution... +> [ 29.609794] (1:master@Tremblay) Send to worker-1 completed +> [ 29.609794] (1:master@Tremblay) Send a message to worker-2 +> [ 30.516495] (4:worker@Ginette) Execution complete. +> [ 30.516495] (4:worker@Ginette) Waiting a message on worker-3 +> [ 30.527320] (2:worker@Tremblay) Execution complete. +> [ 30.527320] (2:worker@Tremblay) Waiting a message on worker-0 +> [ 31.609794] (7:worker@Jupiter) Execution complete. +> [ 31.609794] (7:worker@Jupiter) Waiting a message on worker-1 +> [ 39.609794] (1:master@Tremblay) Mmh. Got timeouted while speaking to 'worker-2'. Nevermind. Let's keep going! +> [ 39.609794] (1:master@Tremblay) Send a message to worker-3 +> [ 40.692268] (4:worker@Ginette) Start execution... +> [ 40.692268] (1:master@Tremblay) Send to worker-3 completed +> [ 40.692268] (1:master@Tremblay) Send a message to worker-4 +> [ 41.000000] (4:worker@Ginette) Gloups. The cpu on which I'm running just turned off!. See you! +> [ 41.774742] (5:worker@Bourassa) Start execution... +> [ 41.774742] (1:master@Tremblay) Send to worker-4 completed +> [ 41.774742] (1:master@Tremblay) All tasks have been dispatched. Let's tell everybody the computation is over. +> [ 41.774742] (2:worker@Tremblay) I'm done. See you! +> [ 41.774742] (7:worker@Jupiter) I'm done. See you! +> [ 42.774742] (1:master@Tremblay) Mmh. Got timeouted while speaking to 'worker-2'. Nevermind. Let's keep going! +> [ 43.774742] (5:worker@Bourassa) Execution complete. +> [ 43.774742] (5:worker@Bourassa) Waiting a message on worker-4 +> [ 43.774742] (1:master@Tremblay) Mmh. Got timeouted while speaking to 'worker-3'. Nevermind. Let's keep going! +> [ 43.774742] (5:worker@Bourassa) I'm done. See you! +> [ 43.774742] (1:master@Tremblay) Goodbye now! +> [ 43.774742] (0:maestro@) Simulation time 43.7747 diff --git a/src/s4u/s4u_Actor.cpp b/src/s4u/s4u_Actor.cpp index c1d7128b9d..b933c632c4 100644 --- a/src/s4u/s4u_Actor.cpp +++ b/src/s4u/s4u_Actor.cpp @@ -8,7 +8,9 @@ #include "simgrid/s4u/Exec.hpp" #include "simgrid/s4u/Host.hpp" #include "src/kernel/activity/ExecImpl.hpp" +#include "src/simix/smx_host_private.hpp" #include "src/simix/smx_private.hpp" +#include "src/surf/HostImpl.hpp" #include @@ -73,7 +75,16 @@ void Actor::join(double timeout) void Actor::set_auto_restart(bool autorestart) { - simgrid::simix::simcall([this, autorestart]() { pimpl_->set_auto_restart(autorestart); }); + simgrid::simix::simcall([this, autorestart]() { + pimpl_->set_auto_restart(autorestart); + + std::map actors_map = pimpl_->host_->pimpl_->actors_at_boot_; + if (actors_map.find(pimpl_->name_) == actors_map.end()) { + simgrid::kernel::actor::ProcessArg* arg = new simgrid::kernel::actor::ProcessArg(pimpl_->host_, pimpl_); + XBT_DEBUG("Adding Process %s to the actors_at_boot_ list of Host %s", arg->name.c_str(), arg->host->get_cname()); + actors_map.insert({arg->name, arg}); + } + }); } void Actor::on_exit(int_f_pvoid_pvoid_t fun, void* data) /* deprecated */ diff --git a/src/s4u/s4u_Host.cpp b/src/s4u/s4u_Host.cpp index 8e5edcbecd..e78c4eb956 100644 --- a/src/s4u/s4u_Host.cpp +++ b/src/s4u/s4u_Host.cpp @@ -86,8 +86,8 @@ void Host::turn_on() { if (is_off()) { simgrid::simix::simcall([this] { - this->pimpl_->turn_on(); this->pimpl_cpu->turn_on(); + this->pimpl_->turn_on(); on_state_change(*this); }); } diff --git a/src/simix/ActorImpl.cpp b/src/simix/ActorImpl.cpp index c27e6d0386..8c60d57d16 100644 --- a/src/simix/ActorImpl.cpp +++ b/src/simix/ActorImpl.cpp @@ -450,7 +450,9 @@ void SIMIX_process_kill(smx_actor_t process, smx_actor_t issuer) { /* destroy the blocking synchro if any */ if (process->waiting_synchro != nullptr) { - + if (process->host_->is_off()) { + SMX_EXCEPTION(process, host_error, 0, "Host failed"); + } simgrid::kernel::activity::ExecImplPtr exec = boost::dynamic_pointer_cast(process->waiting_synchro); simgrid::kernel::activity::CommImplPtr comm = @@ -698,10 +700,14 @@ void SIMIX_process_yield(smx_actor_t self) self->finished_ = true; /* execute the on_exit functions */ SIMIX_process_on_exit_runall(self); - /* Add the process to the list of process to restart, only if the host is down */ - if (self->auto_restart_ && self->host_->is_off()) { - SIMIX_host_add_auto_restart_process(self->host_, self); + + if (self->auto_restart_ && self->host_->is_off() && + watched_hosts.find(self->host_->get_cname()) == watched_hosts.end()) { + XBT_DEBUG("Push host %s to watched_hosts because it's off and %s needs to restart", self->host_->get_cname(), + self->get_cname()); + watched_hosts.insert(self->host_->get_cname()); } + XBT_DEBUG("Process %s@%s is dead", self->get_cname(), self->host_->get_cname()); self->context_->stop(); } diff --git a/src/simix/smx_global.cpp b/src/simix/smx_global.cpp index c527f79be5..eecd1d4c2b 100644 --- a/src/simix/smx_global.cpp +++ b/src/simix/smx_global.cpp @@ -506,7 +506,7 @@ void SIMIX_run() /* Autorestart all process */ for (auto const& host : host_that_restart) { XBT_INFO("Restart processes on host %s", host->get_cname()); - SIMIX_host_autorestart(host); + host->turn_on(); } host_that_restart.clear(); diff --git a/src/simix/smx_host.cpp b/src/simix/smx_host.cpp index 05dc306c6f..dc7f150213 100644 --- a/src/simix/smx_host.cpp +++ b/src/simix/smx_host.cpp @@ -24,42 +24,6 @@ const char* sg_host_self_get_name() return host->get_cname(); } -/** - * @brief Add a process to the list of the processes that the host will restart when it comes back - * This function add a process to the list of the processes that will be restarted when the host comes - * back. It is expected that this function is called when the host is down. - * The processes will only be restarted once, meaning that you will have to register the process - * again to restart the process again. - */ -void SIMIX_host_add_auto_restart_process(sg_host_t host, simgrid::kernel::actor::ActorImpl* actor) -{ - simgrid::kernel::actor::ProcessArg* arg = new simgrid::kernel::actor::ProcessArg(host, actor); - - if (host->is_off() && watched_hosts.find(host->get_cname()) == watched_hosts.end()) { - watched_hosts.insert(host->get_cname()); - XBT_DEBUG("Push host %s to watched_hosts because state == SURF_RESOURCE_OFF", host->get_cname()); - } - XBT_DEBUG("Adding Process %s to the auto-restart list of Host %s", arg->name.c_str(), arg->host->get_cname()); - host->pimpl_->auto_restart_processes_.push_back(arg); -} - -/** @brief Restart the list of processes that have been registered to the host */ -void SIMIX_host_autorestart(sg_host_t host) -{ - std::vector process_list = host->pimpl_->auto_restart_processes_; - - for (auto const& arg : process_list) { - XBT_DEBUG("Restarting Process %s@%s right now", arg->name.c_str(), arg->host->get_cname()); - smx_actor_t actor = simix_global->create_process_function(arg->name.c_str(), arg->code, nullptr, arg->host, - arg->properties.get(), nullptr); - if (arg->kill_time >= 0) - simcall_process_set_kill_time(actor, arg->kill_time); - if (arg->auto_restart) - actor->auto_restart_ = arg->auto_restart; - } - process_list.clear(); -} - simgrid::kernel::activity::ExecImplPtr SIMIX_execution_start(std::string name, std::string category, double flops_amount, double priority, double bound, sg_host_t host) diff --git a/src/simix/smx_host_private.hpp b/src/simix/smx_host_private.hpp index fe4c5dcdc3..1a6da742c3 100644 --- a/src/simix/smx_host_private.hpp +++ b/src/simix/smx_host_private.hpp @@ -10,9 +10,6 @@ #include -XBT_PRIVATE void SIMIX_host_add_auto_restart_process(sg_host_t host, simgrid::kernel::actor::ActorImpl* actor); -XBT_PRIVATE void SIMIX_host_autorestart(sg_host_t host); - XBT_PRIVATE void SIMIX_execution_finish(smx_activity_t synchro); XBT_PRIVATE void SIMIX_set_category(smx_activity_t synchro, std::string category); diff --git a/src/surf/HostImpl.cpp b/src/surf/HostImpl.cpp index 1084bc0a83..8fa93261b5 100644 --- a/src/surf/HostImpl.cpp +++ b/src/surf/HostImpl.cpp @@ -102,6 +102,7 @@ HostImpl::HostImpl(s4u::Host* host) : piface_(host) delete piface_->pimpl_; piface_->pimpl_ = this; } + HostImpl::~HostImpl() { /* All processes should be gone when the host is turned off (by the end of the simulation). */ @@ -113,12 +114,9 @@ HostImpl::~HostImpl() SIMIX_display_process_status(); THROWF(arg_error, 0, "%s", msg.c_str()); } - for (auto const& arg : auto_restart_processes_) - delete arg; - auto_restart_processes_.clear(); - for (auto const& arg : boot_processes_) - delete arg; - boot_processes_.clear(); + for (auto const& arg : actors_at_boot_) + delete arg.second; + actors_at_boot_.clear(); } /** Re-starts all the actors that are marked as restartable. @@ -127,8 +125,9 @@ HostImpl::~HostImpl() */ void HostImpl::turn_on() { - for (auto const& arg : boot_processes_) { - XBT_DEBUG("Booting Process %s(%s) right now", arg->name.c_str(), arg->host->get_cname()); + for (auto const& elm : actors_at_boot_) { + kernel::actor::ProcessArg* arg = elm.second; + XBT_DEBUG("Booting Actor %s(%s) right now", arg->name.c_str(), arg->host->get_cname()); smx_actor_t actor = simix_global->create_process_function(arg->name.c_str(), arg->code, nullptr, arg->host, arg->properties.get(), nullptr); if (arg->kill_time >= 0) @@ -142,11 +141,21 @@ void HostImpl::turn_off() { if (not process_list_.empty()) { for (auto& actor : process_list_) { - SIMIX_process_kill(&actor, SIMIX_process_self()); - XBT_DEBUG("Killing %s@%s on behalf of %s which turned off that host.", actor.get_cname(), + XBT_DEBUG("Killing Actor %s@%s on behalf of %s which turned off that host.", actor.get_cname(), actor.host_->get_cname(), SIMIX_process_self()->get_cname()); + SIMIX_process_kill(&actor, SIMIX_process_self()); } } + // When a host is turned off, we want to keep only the actors that should restart for when it will boot again. + // Then get rid of the others. + auto elm = actors_at_boot_.begin(); + while (elm != actors_at_boot_.end()) { + if (not elm->second->auto_restart) { + delete elm->second; + actors_at_boot_.erase(elm); + } else + ++elm; + } } std::vector HostImpl::get_all_actors() diff --git a/src/surf/HostImpl.hpp b/src/surf/HostImpl.hpp index 6672d7afdd..cca54d6485 100644 --- a/src/surf/HostImpl.hpp +++ b/src/surf/HostImpl.hpp @@ -66,8 +66,7 @@ public: // FIXME: make these private ActorList process_list_; - std::vector auto_restart_processes_; - std::vector boot_processes_; + std::map actors_at_boot_; }; } } diff --git a/src/surf/cpu_cas01.cpp b/src/surf/cpu_cas01.cpp index 62e69e23c7..b59498075f 100644 --- a/src/surf/cpu_cas01.cpp +++ b/src/surf/cpu_cas01.cpp @@ -130,16 +130,17 @@ void CpuCas01::apply_event(tmgr_trace_event_t event, double value) xbt_assert(get_core_count() == 1, "FIXME: add state change code also for constraint_core[i]"); if (value > 0) { - if (is_off()) + if (is_off()) { host_that_restart.push_back(get_host()); - turn_on(); + get_host()->turn_on(); + } } else { kernel::lmm::Constraint* cnst = get_constraint(); kernel::lmm::Variable* var = nullptr; const kernel::lmm::Element* elem = nullptr; double date = surf_get_clock(); - turn_off(); + get_host()->turn_off(); while ((var = cnst->get_variable(&elem))) { kernel::resource::Action* action = static_cast(var->get_id()); diff --git a/src/surf/cpu_ti.cpp b/src/surf/cpu_ti.cpp index 99d32a6738..c7f6266ce2 100644 --- a/src/surf/cpu_ti.cpp +++ b/src/surf/cpu_ti.cpp @@ -398,11 +398,12 @@ void CpuTi::apply_event(tmgr_trace_event_t event, double value) } else if (event == state_event_) { if (value > 0) { - if (is_off()) + if (is_off()) { host_that_restart.push_back(get_host()); - turn_on(); + get_host()->turn_on(); + } } else { - turn_off(); + get_host()->turn_off(); double date = surf_get_clock(); /* put all action running on cpu to failed */ diff --git a/src/surf/ptask_L07.cpp b/src/surf/ptask_L07.cpp index bffa33376d..6e7546186d 100644 --- a/src/surf/ptask_L07.cpp +++ b/src/surf/ptask_L07.cpp @@ -315,10 +315,13 @@ void CpuL07::apply_event(tmgr_trace_event_t triggered, double value) tmgr_trace_event_unref(&speed_.event); } else if (triggered == state_event_) { - if (value > 0) - turn_on(); - else - turn_off(); + if (value > 0) { + if (is_off()) { + host_that_restart.push_back(get_host()); + get_host()->turn_on(); + } + } else + get_host()->turn_off(); tmgr_trace_event_unref(&state_event_); } else { diff --git a/src/surf/sg_platf.cpp b/src/surf/sg_platf.cpp index a3a9c52ac7..55de70536e 100644 --- a/src/surf/sg_platf.cpp +++ b/src/surf/sg_platf.cpp @@ -444,7 +444,7 @@ void sg_platf_new_actor(simgrid::kernel::routing::ActorCreationArgs* actor) simgrid::kernel::actor::ProcessArg* arg = new simgrid::kernel::actor::ProcessArg(actor_name, code, nullptr, host, kill_time, properties, auto_restart); - host->pimpl_->boot_processes_.push_back(arg); + host->pimpl_->actors_at_boot_.insert({actor_name, arg}); if (start_time > SIMIX_get_clock()) { diff --git a/teshsuite/msg/host_on_off/host_on_off.c b/teshsuite/msg/host_on_off/host_on_off.c index 8c34453a53..9922503d12 100644 --- a/teshsuite/msg/host_on_off/host_on_off.c +++ b/teshsuite/msg/host_on_off/host_on_off.c @@ -17,6 +17,10 @@ static int slave(int argc, char *argv[]) while (1) { res = MSG_task_receive(&(task), mailbox); + if (res == MSG_HOST_FAILURE) { + XBT_DEBUG("The host has been turned off, this was expected"); + return 1; + } xbt_assert(res == MSG_OK, "MSG_task_get failed"); if (!strcmp(MSG_task_get_name(task), "finalize")) { diff --git a/teshsuite/msg/host_on_off_processes/host_on_off_processes.cpp b/teshsuite/msg/host_on_off_processes/host_on_off_processes.cpp index f84aaed537..8f0b5b2d36 100644 --- a/teshsuite/msg/host_on_off_processes/host_on_off_processes.cpp +++ b/teshsuite/msg/host_on_off_processes/host_on_off_processes.cpp @@ -27,10 +27,13 @@ static int process_daemon(int /*argc*/, char** /*argv*/) msg_task_t task = MSG_task_create("daemon", MSG_host_get_speed(MSG_host_self()), 0, NULL); MSG_process_set_data(self, task); XBT_INFO(" Execute daemon"); - MSG_task_execute(task); - MSG_process_set_data(self, NULL); + msg_error_t res = MSG_task_execute(task); MSG_task_destroy(task); tasks_done++; + if (res == MSG_HOST_FAILURE) { + XBT_INFO("Host as died as expected, do nothing else"); + return 0; + } } XBT_INFO(" daemon done. See you!"); return 0; @@ -247,7 +250,6 @@ int main(int argc, char* argv[]) MSG_create_environment(argv[1]); - MSG_process_set_data_cleanup(task_cleanup_handler); MSG_process_create("test_launcher", test_launcher, NULL, MSG_get_host_by_name("Tremblay")); res = MSG_main(); diff --git a/teshsuite/msg/host_on_off_processes/host_on_off_processes.tesh b/teshsuite/msg/host_on_off_processes/host_on_off_processes.tesh index ba402ab5a0..f89a77e731 100644 --- a/teshsuite/msg/host_on_off_processes/host_on_off_processes.tesh +++ b/teshsuite/msg/host_on_off_processes/host_on_off_processes.tesh @@ -65,6 +65,8 @@ $ ${bindir}/host_on_off_processes ${platfdir}/small_platform.xml 5 --log=no_loc > [Tremblay:test_launcher:(1) 20.000000] [msg_test/INFO] Turn Jupiter off > [Tremblay:test_launcher:(1) 20.000000] [msg_test/INFO] Test 5 seems ok (number of Process: 2, it should be 2) > [Tremblay:test_launcher:(1) 20.000000] [msg_test/INFO] Test done. See you! +> [Jupiter:commRX:(2) 20.000000] [msg_test/INFO] Receive message: HOST_FAILURE +> [Jupiter:commRX:(2) 20.000000] [msg_test/INFO] RX Done > [Tremblay:commTX:(3) 40.000000] [msg_test/INFO] TX done > [40.000000] [msg_test/INFO] Simulation time 40 @@ -85,6 +87,7 @@ $ ${bindir}/host_on_off_processes ${platfdir}/small_platform.xml 6 --log=no_loc > [Jupiter:process_daemonJUPI:(3) 9.000011] [msg_test/INFO] Execute daemon > [Tremblay:test_launcher:(1) 10.000000] [msg_test/INFO] Turn Jupiter off > [Tremblay:test_launcher:(1) 10.000000] [msg_test/INFO] Shutdown vm0 +> [Jupiter:process_daemonJUPI:(3) 10.000000] [msg_test/INFO] Host as died as expected, do nothing else > [Tremblay:test_launcher:(1) 10.000000] [msg_test/INFO] Destroy vm0 > [Tremblay:test_launcher:(1) 10.000000] [msg_test/INFO] Test 6 is also weird: when the node Jupiter is turned off once again, the VM and its daemon are not killed. However, the issue regarding the shutdown of hosted VMs can be seen a feature not a bug ;) > [Tremblay:test_launcher:(1) 10.000000] [msg_test/INFO] Test done. See you! diff --git a/teshsuite/msg/host_on_off_recv/host_on_off_recv.c b/teshsuite/msg/host_on_off_recv/host_on_off_recv.c index 97363f8ab2..9755c8d3f6 100644 --- a/teshsuite/msg/host_on_off_recv/host_on_off_recv.c +++ b/teshsuite/msg/host_on_off_recv/host_on_off_recv.c @@ -41,7 +41,10 @@ static int slave(int argc, char *argv[]) msg_task_t task = NULL; msg_error_t error = MSG_task_receive(&(task), mailbox); if (error) { - XBT_ERROR("Error while receiving message"); + if (error != MSG_HOST_FAILURE) + XBT_ERROR("Error while receiving message"); + else + XBT_DEBUG("The host has been turned off, this was expected"); return 1; } diff --git a/teshsuite/s4u/CMakeLists.txt b/teshsuite/s4u/CMakeLists.txt index f54101688e..a0eb9f6c1b 100644 --- a/teshsuite/s4u/CMakeLists.txt +++ b/teshsuite/s4u/CMakeLists.txt @@ -11,7 +11,7 @@ endforeach() ## Add the tests. ## Some need to be run with all factories, some need not tesh to run -foreach(x actor actor-migration cloud-interrupt-migration concurrent_rw) # TODO: actor-autorestart is disabled for now +foreach(x actor actor-autorestart actor-migration cloud-interrupt-migration concurrent_rw) # TODO: actor-autorestart is disabled for now set(tesh_files ${tesh_files} ${CMAKE_CURRENT_SOURCE_DIR}/${x}/${x}.tesh) ADD_TESH_FACTORIES(tesh-s4u-${x} "thread;ucontext;raw;boost" --setenv srcdir=${CMAKE_HOME_DIRECTORY}/teshsuite/s4u/${x} --setenv platfdir=${CMAKE_HOME_DIRECTORY}/examples/platforms --cd ${CMAKE_BINARY_DIR}/teshsuite/s4u/${x} ${CMAKE_HOME_DIRECTORY}/teshsuite/s4u/${x}/${x}.tesh) endforeach() diff --git a/teshsuite/s4u/actor-autorestart/actor-autorestart.cpp b/teshsuite/s4u/actor-autorestart/actor-autorestart.cpp index bb2d96414d..47e132a9e1 100644 --- a/teshsuite/s4u/actor-autorestart/actor-autorestart.cpp +++ b/teshsuite/s4u/actor-autorestart/actor-autorestart.cpp @@ -4,14 +4,24 @@ * under the terms of the license (GNU LGPL) which comes with this package. */ #include "simgrid/s4u.hpp" +#include XBT_LOG_NEW_DEFAULT_CATEGORY(s4u_test, "Messages specific for this s4u example"); static void dummy() { XBT_INFO("I start"); - simgrid::s4u::this_actor::sleep_for(200); - XBT_INFO("I stop"); + try { + simgrid::s4u::this_actor::sleep_for(200); + XBT_INFO("I stop"); + } catch (xbt_ex& e) { + if (e.category == host_error) { + XBT_DEBUG("The host has died ... as expected. This actor silently stops"); + } else { + XBT_ERROR("An unexpected exception has been raised."); + throw; + } + } } static void autostart()