From 921f78391cfb481ec3305c8e4129e744a50db522 Mon Sep 17 00:00:00 2001 From: Gabriel Corona Date: Mon, 29 Jun 2015 13:08:51 +0200 Subject: [PATCH] Fix host-on-off This is a temporary workaround. This should probably be fixed at a lower layer instead. The test still has a refcount handling bug (two +1 but only one -1). --- src/msg/msg_gos.c | 2 ++ src/msg/msg_mailbox.c | 2 ++ src/simix/smx_network.c | 10 ++++++++++ .../msg/host_on_off_processes/host_on_off_processes.c | 2 -- .../host_on_off_processes/host_on_off_processes.tesh | 9 ++++++++- 5 files changed, 22 insertions(+), 3 deletions(-) diff --git a/src/msg/msg_gos.c b/src/msg/msg_gos.c index 11218c98b1..6c9427a39a 100644 --- a/src/msg/msg_gos.c +++ b/src/msg/msg_gos.c @@ -338,6 +338,8 @@ MSG_task_receive_ext(msg_task_t * task, const char *alias, double timeout, } CATCH(e) { switch (e.category) { + case host_error: + ret = MSG_HOST_FAILURE; case cancel_error: /* may be thrown by MSG_mailbox_get_by_alias */ ret = MSG_HOST_FAILURE; break; diff --git a/src/msg/msg_mailbox.c b/src/msg/msg_mailbox.c index 4a3f158d79..ea94cefd22 100644 --- a/src/msg/msg_mailbox.c +++ b/src/msg/msg_mailbox.c @@ -145,6 +145,8 @@ MSG_mailbox_get_task_ext_bounded(msg_mailbox_t mailbox, msg_task_t * task, case timeout_error: ret = MSG_TIMEOUT; break; + case host_error: + ret = MSG_HOST_FAILURE; default: RETHROW; } diff --git a/src/simix/smx_network.c b/src/simix/smx_network.c index fbeb548248..6d1b84928b 100644 --- a/src/simix/smx_network.c +++ b/src/simix/smx_network.c @@ -800,6 +800,7 @@ void SIMIX_comm_finish(smx_synchro_t synchro) XBT_DEBUG("SIMIX_comm_finish: synchro state = %d", (int)synchro->state); /* Check out for errors */ + switch (synchro->state) { case SIMIX_DONE: @@ -834,6 +835,15 @@ void SIMIX_comm_finish(smx_synchro_t synchro) break; case SIMIX_LINK_FAILURE: + + // There should be a cleaner way to do this. + // We should handle this in SIMIX_post_comm instead. 
+ if (surf_resource_get_state(surf_workstation_resource_priv( + simcall->issuer->smx_host)) != SURF_RESOURCE_ON) { + SMX_EXCEPTION(simcall->issuer, host_error, 0, "Host failed"); + break; + } + XBT_DEBUG("Link failure in synchro %p between '%s' and '%s': posting an exception to the issuer: %s (%p) detached:%d", synchro, synchro->comm.src_proc ? sg_host_name(synchro->comm.src_proc->smx_host) : NULL, diff --git a/teshsuite/msg/host_on_off_processes/host_on_off_processes.c b/teshsuite/msg/host_on_off_processes/host_on_off_processes.c index 959bf5ec6a..f058c12e67 100644 --- a/teshsuite/msg/host_on_off_processes/host_on_off_processes.c +++ b/teshsuite/msg/host_on_off_processes/host_on_off_processes.c @@ -104,8 +104,6 @@ int test_launcher(int argc, char *argv[]) test = 5; if (xbt_dynar_search_or_negative(tests, &test)!=-1){ XBT_INFO("Test 5 (turn off dest during a communication : Create a Process/task to make a communication between Tremblay and Jupiter and turn off Jupiter during the communication"); - XBT_INFO("Warning! I think this test is completely broken and it was revealed by exception/exception test."); - XBT_INFO("At time 20, Jupiter should wake up with a HOST_FAILURE and it gets a TRANSFERT_FAILURE. 
This is because when turning off Jupiter, its processes are killed, which cancels/destroys the corresponding surf communication instead of canceling a src_ or dst_timeout."); MSG_host_on(jupiter); MSG_process_sleep(10); argvF = xbt_new(char*, 2); diff --git a/teshsuite/msg/host_on_off_processes/host_on_off_processes.tesh b/teshsuite/msg/host_on_off_processes/host_on_off_processes.tesh index 5881f4d40b..299e8500c0 100644 --- a/teshsuite/msg/host_on_off_processes/host_on_off_processes.tesh +++ b/teshsuite/msg/host_on_off_processes/host_on_off_processes.tesh @@ -54,7 +54,12 @@ $ ./host_on_off_processes ${srcdir:=.}/../../../examples/platforms/small_platfor > [Tremblay:commRX:(2) 20.000000] [msg_test/INFO] RX Done > [20.000000] [msg_test/INFO] Simulation time 20 -# This test is broken: +# This test is broken (refcount error): +# Warning! I think this test is completely broken and it was revealed by +# exception/exception test. At time 20, Jupiter should wake up with a +# HOST_FAILURE and it gets a TRANSFERT_FAILURE. This is because when turning +# off Jupiter, its processes are killed, which cancels/destroys the +# corresponding surf communication instead of canceling a src_ or dst_timeout. 
$ ./host_on_off_processes ${srcdir:=.}/../../../examples/platforms/small_platform.xml ${srcdir:=.}/host_on_off_processes_d.xml 5 --log=no_loc --log=msg.thresh:error --log=surf_maxmin.thresh:error > [Tremblay:test_launcher:(1) 0.000000] [msg_test/INFO] Test 5 (turn off dest during a communication : Create a Process/task to make a communication between Tremblay and Jupiter and turn off Jupiter during the communication > [Jupiter:commRX:(2) 10.000000] [msg_test/INFO] Start RX @@ -63,6 +68,8 @@ $ ./host_on_off_processes ${srcdir:=.}/../../../examples/platforms/small_platfor > [Tremblay:test_launcher:(1) 20.000000] [msg_test/INFO] Turn Jupiter off > [Tremblay:test_launcher:(1) 20.000000] [msg_test/INFO] Test 5 seems ok, cool !(number of Process : 2, it should be 2 > [Tremblay:test_launcher:(1) 20.000000] [msg_test/INFO] Test done. See you! +> [Jupiter:commRX:(2) 20.000000] [msg_test/INFO] Receive message: HOST_FAILURE +> [Jupiter:commRX:(2) 20.000000] [msg_test/INFO] RX Done > [Tremblay:commTX:(3) 40.000000] [msg_test/INFO] TX done > [40.000000] [msg_test/INFO] Simulation time 40 -- 2.20.1