}
CATCH(e) {
switch (e.category) {
+ case host_error:
+ ret = MSG_HOST_FAILURE;
case cancel_error: /* may be thrown by MSG_mailbox_get_by_alias */
ret = MSG_HOST_FAILURE;
break;
case timeout_error:
ret = MSG_TIMEOUT;
break;
+ case host_error:
+ ret = MSG_HOST_FAILURE;
default:
RETHROW;
}
XBT_DEBUG("SIMIX_comm_finish: synchro state = %d", (int)synchro->state);
/* Check out for errors */
+
switch (synchro->state) {
case SIMIX_DONE:
break;
case SIMIX_LINK_FAILURE:
+
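+ // FIXME: there should be a cleaner way to do this. When the issuer's own
+ // host is no longer ON, raise a host_error for the issuer rather than
+ // reporting a link failure. Ideally this case would be handled in
+ // SIMIX_post_comm instead.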
+ if (surf_resource_get_state(surf_workstation_resource_priv(
+ simcall->issuer->smx_host)) != SURF_RESOURCE_ON) {
+ SMX_EXCEPTION(simcall->issuer, host_error, 0, "Host failed");
+ break;
+ }
+
XBT_DEBUG("Link failure in synchro %p between '%s' and '%s': posting an exception to the issuer: %s (%p) detached:%d",
synchro,
synchro->comm.src_proc ? sg_host_name(synchro->comm.src_proc->smx_host) : NULL,
test = 5;
if (xbt_dynar_search_or_negative(tests, &test)!=-1){
XBT_INFO("Test 5 (turn off dest during a communication : Create a Process/task to make a communication between Tremblay and Jupiter and turn off Jupiter during the communication");
- XBT_INFO("Warning! I think this test is completely broken and it was revealed by exception/exception test.");
- XBT_INFO("At time 20, Jupiter should wake up with a HOST_FAILURE and it gets a TRANSFERT_FAILURE. This is because when turning off Jupiter, its processes are killed, which cancels/destroys the corresponding surf communication instead of canceling a src_ or dst_timeout.");
MSG_host_on(jupiter);
MSG_process_sleep(10);
argvF = xbt_new(char*, 2);
> [Tremblay:commRX:(2) 20.000000] [msg_test/INFO] RX Done
> [20.000000] [msg_test/INFO] Simulation time 20
-# This test is broken:
+# This test is broken (refcount error):
+# Warning! I think this test is completely broken, as revealed by the
+# exception/exception test. At time 20, Jupiter should wake up with a
+# HOST_FAILURE, but it gets a TRANSFERT_FAILURE instead. This is because, when
+# Jupiter is turned off, its processes are killed, which cancels/destroys the
+# corresponding surf communication instead of canceling a src_ or dst_timeout.
$ ./host_on_off_processes ${srcdir:=.}/../../../examples/platforms/small_platform.xml ${srcdir:=.}/host_on_off_processes_d.xml 5 --log=no_loc --log=msg.thresh:error --log=surf_maxmin.thresh:error
> [Tremblay:test_launcher:(1) 0.000000] [msg_test/INFO] Test 5 (turn off dest during a communication : Create a Process/task to make a communication between Tremblay and Jupiter and turn off Jupiter during the communication
> [Jupiter:commRX:(2) 10.000000] [msg_test/INFO] Start RX
> [Tremblay:test_launcher:(1) 20.000000] [msg_test/INFO] Turn Jupiter off
> [Tremblay:test_launcher:(1) 20.000000] [msg_test/INFO] Test 5 seems ok, cool !(number of Process : 2, it should be 2
> [Tremblay:test_launcher:(1) 20.000000] [msg_test/INFO] Test done. See you!
+> [Jupiter:commRX:(2) 20.000000] [msg_test/INFO] Receive message: HOST_FAILURE
+> [Jupiter:commRX:(2) 20.000000] [msg_test/INFO] RX Done
> [Tremblay:commTX:(3) 40.000000] [msg_test/INFO] TX done
> [40.000000] [msg_test/INFO] Simulation time 40