Logo AND Algorithmique Numérique Distribuée

Public GIT Repository
Handle host failures in communications. (bug raised by Alan Malta)
[simgrid.git] / src / msg / msg_mailbox.c
index 1b75fd6..13b605f 100644 (file)
@@ -2,7 +2,7 @@
 #include "msg/private.h"
 
 XBT_LOG_NEW_DEFAULT_SUBCATEGORY(msg_mailbox, msg,
-                               "Logging specific to MSG (mailbox)");
+                                "Logging specific to MSG (mailbox)");
 
 static xbt_dict_t msg_mailboxes = NULL;
 
@@ -82,14 +82,13 @@ m_task_t MSG_mailbox_get_head(msg_mailbox_t mailbox)
 }
 
 
-m_task_t
-MSG_mailbox_get_first_host_task(msg_mailbox_t mailbox, m_host_t host)
+m_task_t MSG_mailbox_get_first_host_task(msg_mailbox_t mailbox, m_host_t host)
 {
   m_task_t task = NULL;
   xbt_fifo_item_t item = NULL;
 
   xbt_fifo_foreach(mailbox->tasks, item, task, m_task_t)
-      if (task->simdata->source == host) {
+    if (task->simdata->source == host) {
     xbt_fifo_remove_item(mailbox->tasks, item);
     return task;
   }
@@ -98,8 +97,7 @@ MSG_mailbox_get_first_host_task(msg_mailbox_t mailbox, m_host_t host)
 }
 
 int
-MSG_mailbox_get_count_host_waiting_tasks(msg_mailbox_t mailbox,
-                                        m_host_t host)
+MSG_mailbox_get_count_host_waiting_tasks(msg_mailbox_t mailbox, m_host_t host)
 {
   m_task_t task = NULL;
   xbt_fifo_item_t item = NULL;
@@ -146,20 +144,19 @@ msg_mailbox_t MSG_mailbox_get_by_alias(const char *alias)
   return mailbox;
 }
 
-msg_mailbox_t
-MSG_mailbox_get_by_channel(m_host_t host, m_channel_t channel)
+msg_mailbox_t MSG_mailbox_get_by_channel(m_host_t host, m_channel_t channel)
 {
   xbt_assert0((host != NULL), "Invalid host");
   xbt_assert1((channel >= 0)
-             && (channel < msg_global->max_channel), "Invalid channel %d",
-             channel);
+              && (channel < msg_global->max_channel), "Invalid channel %d",
+              channel);
 
   return host->simdata->mailboxes[(size_t) channel];
 }
 
 MSG_error_t
 MSG_mailbox_get_task_ext(msg_mailbox_t mailbox, m_task_t * task,
-                        m_host_t host, double timeout)
+                         m_host_t host, double timeout)
 {
   m_process_t process = MSG_process_self();
   m_task_t t = NULL;
@@ -168,7 +165,7 @@ MSG_mailbox_get_task_ext(msg_mailbox_t mailbox, m_task_t * task,
   simdata_host_t h_simdata = NULL;
   double start_time = SIMIX_get_clock();
 
-  smx_cond_t cond = NULL;      //conditional wait if the task isn't on the channel yet
+  smx_cond_t cond = NULL;       //conditional wait if the task isn't on the channel yet
 
   CHECK_HOST();
 
@@ -177,7 +174,7 @@ MSG_mailbox_get_task_ext(msg_mailbox_t mailbox, m_task_t * task,
 
   if (*task)
     CRITICAL0
-       ("MSG_task_get() was asked to write in a non empty task struct.");
+      ("MSG_task_get() was asked to write in a non empty task struct.");
 
   /* Get the task */
   h = MSG_host_self();
@@ -187,7 +184,7 @@ MSG_mailbox_get_task_ext(msg_mailbox_t mailbox, m_task_t * task,
 
   if (MSG_mailbox_get_cond(mailbox)) {
     CRITICAL1("A process is already blocked on the channel %s",
-             MSG_mailbox_get_alias(mailbox));
+              MSG_mailbox_get_alias(mailbox));
     SIMIX_cond_display_info(MSG_mailbox_get_cond(mailbox));
     xbt_die("Go fix your code!");
   }
@@ -196,30 +193,30 @@ MSG_mailbox_get_task_ext(msg_mailbox_t mailbox, m_task_t * task,
     /* if the mailbox is empty (has no task */
     if (!MSG_mailbox_is_empty(mailbox)) {
       if (!host) {
-       /* pop the head of the mailbox */
-       t = MSG_mailbox_pop_head(mailbox);
-       break;
+        /* pop the head of the mailbox */
+        t = MSG_mailbox_pop_head(mailbox);
+        break;
       } else {
-       /* get the first task of the host */
-       if ((t = MSG_mailbox_get_first_host_task(mailbox, host)))
-         break;
+        /* get the first task of the host */
+        if ((t = MSG_mailbox_get_first_host_task(mailbox, host)))
+          break;
       }
     }
 
-    if ((timeout > 0) && (SIMIX_get_clock()-start_time>=timeout)) {
+    if ((timeout > 0) && (SIMIX_get_clock() - start_time >= timeout)) {
       SIMIX_mutex_unlock(h->simdata->mutex);
       MSG_mailbox_set_cond(mailbox, NULL);
       SIMIX_cond_destroy(cond);
       MSG_RETURN(MSG_TRANSFER_FAILURE);
     }
 
-    if(!cond) {
+    if (!cond) {
       cond = SIMIX_cond_init();
       MSG_mailbox_set_cond(mailbox, cond);
     }
 
     if (timeout > 0)
-      SIMIX_cond_wait_timeout(cond, h->simdata->mutex, timeout-start_time);
+      SIMIX_cond_wait_timeout(cond, h->simdata->mutex, timeout - start_time);
     else
       SIMIX_cond_wait(MSG_mailbox_get_cond(mailbox), h->simdata->mutex);
 
@@ -250,11 +247,11 @@ MSG_mailbox_get_task_ext(msg_mailbox_t mailbox, m_task_t * task,
   /* Transfer */
   /* create SIMIX action to the communication */
   t_simdata->comm =
-      SIMIX_action_communicate(t_simdata->sender->simdata->m_host->
-                              simdata->smx_host,
-                              process->simdata->m_host->simdata->smx_host,
-                              t->name, t_simdata->message_size,
-                              t_simdata->rate);
+    SIMIX_action_communicate(t_simdata->sender->simdata->m_host->simdata->
+                             smx_host,
+                             process->simdata->m_host->simdata->smx_host,
+                             t->name, t_simdata->message_size,
+                             t_simdata->rate);
 
   /* This is a hack. We know that both the receiver and the sender will
      need to look at the content of t_simdata->comm. And it needs to be
@@ -280,6 +277,10 @@ MSG_mailbox_get_task_ext(msg_mailbox_t mailbox, m_task_t * task,
 
     if (SIMIX_action_get_state(t_simdata->comm) != SURF_ACTION_RUNNING)
       break;
+    if (!SIMIX_host_get_state(h_simdata->smx_host))
+      break;
+    if (!SIMIX_host_get_state(process->simdata->m_host->simdata->smx_host))
+      break;
   }
 
   SIMIX_unregister_action_to_condition(t_simdata->comm, t_simdata->cond);
@@ -299,34 +300,34 @@ MSG_mailbox_get_task_ext(msg_mailbox_t mailbox, m_task_t * task,
       SIMIX_action_destroy(t_simdata->comm);
       t_simdata->comm = NULL;
     } else {
-      t_simdata->comm->refcount --;
+      t_simdata->comm->refcount--;
     }
-    t_simdata->refcount --;
+    t_simdata->refcount--;
     MSG_RETURN(MSG_OK);
   } else if (SIMIX_host_get_state(h_simdata->smx_host) == 0) {
     if (t_simdata->comm->refcount == 1) {
       SIMIX_action_destroy(t_simdata->comm);
       t_simdata->comm = NULL;
     } else {
-      t_simdata->comm->refcount --;
+      t_simdata->comm->refcount--;
     }
-    t_simdata->refcount --;
+    t_simdata->refcount--;
     MSG_RETURN(MSG_HOST_FAILURE);
   } else {
-    if (t_simdata->comm->refcount ==1) {
+    if (t_simdata->comm->refcount == 1) {
       SIMIX_action_destroy(t_simdata->comm);
       t_simdata->comm = NULL;
     } else {
-      t_simdata->comm->refcount --;
+      t_simdata->comm->refcount--;
     }
-    t_simdata->refcount --;
+    t_simdata->refcount--;
     MSG_RETURN(MSG_TRANSFER_FAILURE);
   }
 }
 
 MSG_error_t
 MSG_mailbox_put_with_timeout(msg_mailbox_t mailbox, m_task_t task,
-                            double timeout)
+                             double timeout)
 {
   m_process_t process = MSG_process_self();
   const char *hostname;
@@ -341,12 +342,12 @@ MSG_mailbox_put_with_timeout(msg_mailbox_t mailbox, m_task_t task,
   t_simdata->sender = process;
   t_simdata->source = MSG_process_get_host(process);
 
-  xbt_assert0(t_simdata->refcount  == 1,
-             "This task is still being used somewhere else. You cannot send it now. Go fix your code!");
+  xbt_assert0(t_simdata->refcount == 1,
+              "This task is still being used somewhere else. You cannot send it now. Go fix your code!");
 
   t_simdata->comm = NULL;
 
-  t_simdata->refcount ++;
+  t_simdata->refcount++;
   local_host = ((simdata_process_t) process->simdata)->m_host;
   msg_global->sent_msg++;
 
@@ -360,8 +361,8 @@ MSG_mailbox_put_with_timeout(msg_mailbox_t mailbox, m_task_t task,
 
 
   DEBUG4("Trying to send a task (%g kB) from %s to %s on the channel %s",
-                 t_simdata->message_size / 1000, local_host->name,
-                 remote_host->name, MSG_mailbox_get_alias(mailbox));
+         t_simdata->message_size / 1000, local_host->name,
+         remote_host->name, MSG_mailbox_get_alias(mailbox));
 
   SIMIX_mutex_lock(remote_host->simdata->mutex);
 
@@ -388,43 +389,50 @@ MSG_mailbox_put_with_timeout(msg_mailbox_t mailbox, m_task_t task,
     TRY {
       /*verify if the action that ends is the correct. Call the wait_timeout with the new time. If the timeout occurs, an exception is raised */
       while (1) {
-       time_elapsed = SIMIX_get_clock() - time;
-       SIMIX_cond_wait_timeout(t_simdata->cond, t_simdata->mutex,
-                               timeout - time_elapsed);
-
-       if ((t_simdata->comm != NULL)
-           && (SIMIX_action_get_state(t_simdata->comm) !=
-               SURF_ACTION_RUNNING))
+        time_elapsed = SIMIX_get_clock() - time;
+        SIMIX_cond_wait_timeout(t_simdata->cond, t_simdata->mutex,
+                                timeout - time_elapsed);
+
+        if ((t_simdata->comm != NULL)
+            && (SIMIX_action_get_state(t_simdata->comm) !=
+                SURF_ACTION_RUNNING))
+          break;
+       if (!SIMIX_host_get_state(local_host->simdata->smx_host))
+         break;
+       if (!SIMIX_host_get_state(remote_host->simdata->smx_host))
          break;
       }
     }
     CATCH(e) {
       if (e.category == timeout_error) {
-       xbt_ex_free(e);
-       /* verify if the timeout happened and the communication didn't started yet */
-       if (t_simdata->comm == NULL) {
-         process->simdata->waiting_task = NULL;
+        xbt_ex_free(e);
+        /* verify if the timeout happened and the communication didn't started yet */
+        if (t_simdata->comm == NULL) {
+          process->simdata->waiting_task = NULL;
 
-         /* remove the task from the mailbox */
-         MSG_mailbox_remove(mailbox, task);
+          /* remove the task from the mailbox */
+          MSG_mailbox_remove(mailbox, task);
 
-         if (t_simdata->receiver && t_simdata->receiver->simdata) { /* receiver still around */
-           t_simdata->receiver->simdata->waiting_task = NULL;
-         }
+          if (t_simdata->receiver && t_simdata->receiver->simdata) {    /* receiver still around */
+            t_simdata->receiver->simdata->waiting_task = NULL;
+          }
 
-         SIMIX_mutex_unlock(t_simdata->mutex);
-         MSG_RETURN(MSG_TRANSFER_FAILURE);
-       }
+          SIMIX_mutex_unlock(t_simdata->mutex);
+          MSG_RETURN(MSG_TRANSFER_FAILURE);
+        }
       } else {
-       RETHROW;
+        RETHROW;
       }
     }
   } else {
     while (1) {
       SIMIX_cond_wait(t_simdata->cond, t_simdata->mutex);
 
-      if (SIMIX_action_get_state(t_simdata->comm) !=
-         SURF_ACTION_RUNNING)
+      if (SIMIX_action_get_state(t_simdata->comm) != SURF_ACTION_RUNNING)
+        break;
+      if (!SIMIX_host_get_state(local_host->simdata->smx_host))
+       break;
+      if (!SIMIX_host_get_state(remote_host->simdata->smx_host))
        break;
     }
   }
@@ -443,7 +451,7 @@ MSG_mailbox_put_with_timeout(msg_mailbox_t mailbox, m_task_t task,
       SIMIX_action_destroy(t_simdata->comm);
       t_simdata->comm = NULL;
     } else {
-      t_simdata->comm->refcount --;
+      t_simdata->comm->refcount--;
     }
     MSG_RETURN(MSG_OK);
   } else if (SIMIX_host_get_state(local_host->simdata->smx_host) == 0) {
@@ -451,7 +459,7 @@ MSG_mailbox_put_with_timeout(msg_mailbox_t mailbox, m_task_t task,
       SIMIX_action_destroy(t_simdata->comm);
       t_simdata->comm = NULL;
     } else {
-      t_simdata->comm->refcount --;
+      t_simdata->comm->refcount--;
     }
     MSG_RETURN(MSG_HOST_FAILURE);
   } else {
@@ -459,7 +467,7 @@ MSG_mailbox_put_with_timeout(msg_mailbox_t mailbox, m_task_t task,
       SIMIX_action_destroy(t_simdata->comm);
       t_simdata->comm = NULL;
     } else {
-      t_simdata->comm->refcount --;
+      t_simdata->comm->refcount--;
     }
     MSG_RETURN(MSG_TRANSFER_FAILURE);
   }