From bf4e9e11d5a5851f8adea9bdaa8f4cfef417add3 Mon Sep 17 00:00:00 2001 From: Frederic Suter Date: Sun, 5 Aug 2018 14:46:25 +0200 Subject: [PATCH] add more info and revalidate to where it diverges --- .../s4u-platform-failures.cpp | 4 +++- .../s4u-platform-failures.tesh | 22 ++++++++++++++----- 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/examples/s4u/platform-failures/s4u-platform-failures.cpp b/examples/s4u/platform-failures/s4u-platform-failures.cpp index 941ea2aba8..db0ee4cf33 100644 --- a/examples/s4u/platform-failures/s4u-platform-failures.cpp +++ b/examples/s4u/platform-failures/s4u-platform-failures.cpp @@ -25,8 +25,9 @@ static int master(int argc, char* argv[]) mailbox = simgrid::s4u::Mailbox::by_name(std::string("worker-") + std::to_string(i % workers_count)); double* payload = new double(comp_size); try { + XBT_INFO("Send a message to %s", mailbox->get_cname()); mailbox->put(payload, comm_size, 10.0); - XBT_INFO("Send completed"); + XBT_INFO("Send to %s completed", mailbox->get_cname()); } catch (xbt_ex& e) { switch (e.category) { case host_error: @@ -84,6 +85,7 @@ static int worker(int argc, char* argv[]) double comp_size = -1; while (1) { try { + XBT_INFO("Waiting a message on %s", mailbox->get_cname()); payload = static_cast<double*>(mailbox->get()); comp_size = *payload; delete payload; diff --git a/examples/s4u/platform-failures/s4u-platform-failures.tesh b/examples/s4u/platform-failures/s4u-platform-failures.tesh index 5738a23318..e51427bf26 100644 --- a/examples/s4u/platform-failures/s4u-platform-failures.tesh +++ b/examples/s4u/platform-failures/s4u-platform-failures.tesh @@ -6,16 +6,28 @@ p Testing a simple master/worker example application handling failures TCP cross $ $SG_TEST_EXENV ${bindir:=.}/s4u-platform-failures$EXEEXT --log=xbt_cfg.thres:critical --log=no_loc ${platfdir}/small_platform_with_failures.xml ${bindir}/../app-masterworker/s4u-app-masterworker_d.xml --cfg=path:${srcdir} --cfg=network/crosstraffic:0 
"--log=root.fmt:[%10.6r]%e(%i:%P@%h)%e%m%n" > [ 0.000000] (0:maestro@) Cannot launch process 'worker' on failed host 'Fafard' > [ 0.000000] (1:master@Tremblay) Got 5 workers and 20 tasks to process -> [ 0.010309] (1:master@Tremblay) Send completed +> [ 0.000000] (1:master@Tremblay) Send a message to worker-0 +> [ 0.010309] (1:master@Tremblay) Send to worker-0 completed +> [ 0.000000] (2:worker@Tremblay) Waiting a message on worker-0 +> [ 0.000000] (3:worker@Jupiter) Waiting a message on worker-1 +> [ 0.000000] (4:worker@Ginette) Waiting a message on worker-3 +> [ 0.000000] (5:worker@Bourassa) Waiting a message on worker-4 +> [ 0.010309] (1:master@Tremblay) Send a message to worker-1 > [ 1.000000] (0:maestro@) Restart processes on host Fafard > [ 1.000000] (1:master@Tremblay) Mmh. Something went wrong with 'worker-1'. Nevermind. Let's keep going! +> [ 1.000000] (1:master@Tremblay) Send a message to worker-2 > [ 1.000000] (3:worker@Jupiter) Gloups. The cpu on which I'm running just turned off!. See you! > [ 2.000000] (0:maestro@) Restart processes on host Jupiter +> [ 2.010309] (2:worker@Tremblay) Waiting a message on worker-0 > [ 11.000000] (1:master@Tremblay) Mmh. Got timeouted while speaking to 'worker-2'. Nevermind. Let's keep going! 
-> [ 12.030928] (1:master@Tremblay) Send completed -> [ 13.061856] (1:master@Tremblay) Send completed -> [ 13.072165] (1:master@Tremblay) Send completed -> [ 14.103093] (1:master@Tremblay) Send completed +> [ 11.000000] (1:master@Tremblay) Send a message to worker-3 +> [ 12.030928] (1:master@Tremblay) Send to worker-3 completed +> [ 12.030928] (1:master@Tremblay) Send a message to worker-4 +> [ 13.061856] (1:master@Tremblay) Send to worker-4 completed +> [ 13.061856] (1:master@Tremblay) Send a message to worker-0 +> [ 13.072165] (1:master@Tremblay) Send to worker-0 completed +> [ 13.072165] (1:master@Tremblay) Send a message to worker-1 +> [ 14.103093] (1:master@Tremblay) Send to worker-1 completed > [ 24.103093] (1:master@Tremblay) Mmh. Got timeouted while speaking to 'worker-2'. Nevermind. Let's keep going! > [ 24.103093] (1:master@Tremblay) Mmh. Something went wrong with 'worker-3'. Nevermind. Let's keep going! > [ 24.103093] (4:worker@Ginette) Mmh. Something went wrong. Nevermind. Let's keep going! -- 2.20.1