Logo AND Algorithmique Numérique Distribuée

Public GIT Repository
Fix possible crashes and leaks with dsends during processes cleanup
author: Christophe Thiéry <christopho128@gmail.com>
Wed, 9 Nov 2011 10:46:02 +0000 (11:46 +0100)
committer: Christophe Thiéry <christopho128@gmail.com>
Wed, 9 Nov 2011 11:23:49 +0000 (12:23 +0100)
src/simix/smx_global.c
src/simix/smx_network.c
src/simix/smx_process.c

diff --git a/src/simix/smx_global.c b/src/simix/smx_global.c
index 3485c6b..24f9df4 100644 (file)
@@ -360,8 +360,10 @@ void SIMIX_display_process_status(void)
          action_description = "I/O";
          break;
       }
-      XBT_INFO("Process %ld (%s@%s): waiting for %s action %p (%s) to finish", process->pid, process->name, process->smx_host->name,
-         action_description, process->waiting_action, process->waiting_action->name);
+      XBT_INFO("Process %ld (%s@%s): waiting for %s action %p (%s) in state %d to finish",
+          process->pid, process->name, process->smx_host->name,
+         action_description, process->waiting_action,
+         process->waiting_action->name, process->waiting_action->state);
     }
     else {
       XBT_INFO("Process %ld (%s@%s)", process->pid, process->name, process->smx_host->name);
diff --git a/src/simix/smx_network.c b/src/simix/smx_network.c
index d5d4631..56f4086 100644 (file)
@@ -18,7 +18,6 @@ unsigned long int smx_total_comms = 0;
 static void SIMIX_waitany_req_remove_from_actions(smx_req_t req);
 static void SIMIX_comm_copy_data(smx_action_t comm);
 static smx_action_t SIMIX_comm_new(e_smx_comm_type_t type);
-static void SIMIX_comm_remove_from_processes(smx_action_t action);
 static XBT_INLINE void SIMIX_rdv_push(smx_rdv_t rdv, smx_action_t comm);
 static XBT_INLINE void SIMIX_rdv_remove(smx_rdv_t rdv, smx_action_t comm);
 static smx_action_t SIMIX_rdv_get_request(smx_rdv_t rdv, e_smx_comm_type_t type,
@@ -135,7 +134,7 @@ smx_action_t SIMIX_rdv_get_request(smx_rdv_t rdv, e_smx_comm_type_t type,
   xbt_fifo_item_t item;
   void* req_data = NULL;
 
-  xbt_fifo_foreach(rdv->comm_fifo, item, action, smx_action_t){
+  xbt_fifo_foreach(rdv->comm_fifo, item, action, smx_action_t) {
     if (action->comm.type == SIMIX_COMM_SEND) {
       req_data = action->comm.src_data;
     } else if (action->comm.type == SIMIX_COMM_RECEIVE) {
@@ -188,7 +187,7 @@ int SIMIX_comm_has_recv_match(smx_rdv_t rdv, int (*match_fun)(void*, void*), voi
   smx_action_t action;
   xbt_fifo_item_t item;
 
-  xbt_fifo_foreach(rdv->comm_fifo, item, action, smx_action_t){
+  xbt_fifo_foreach(rdv->comm_fifo, item, action, smx_action_t) {
     if (action->comm.type == SIMIX_COMM_RECEIVE
         && (!match_fun || match_fun(data, action->comm.dst_data))) {
       XBT_DEBUG("Found a matching communication action %p", action);
@@ -214,6 +213,7 @@ smx_action_t SIMIX_comm_new(e_smx_comm_type_t type)
 
   /* alloc structures */
   act = xbt_mallocator_get(simix_global->action_mallocator);
+
   act->type = SIMIX_ACTION_COMMUNICATE;
   act->state = SIMIX_WAITING;
 
@@ -242,11 +242,12 @@ smx_action_t SIMIX_comm_new(e_smx_comm_type_t type)
  */
 void SIMIX_comm_destroy(smx_action_t action)
 {
-  XBT_DEBUG("Destroy action %p (refcount:%d)", action, action->comm.refcount);
+  XBT_DEBUG("Destroy action %p (refcount: %d), state: %d",
+      action, action->comm.refcount, action->state);
 
-  if (action->comm.refcount <= 0)
-    xbt_die("the refcount of comm %p is already 0 before decreasing it. "
-            "That's a bug!", action);
+  xbt_assert(action->comm.refcount > 0,
+      "The refcount of comm %p is already 0 before decreasing it. "
+      "That's a bug!", action);
 
   action->comm.refcount--;
   if (action->comm.refcount > 0)
@@ -547,13 +548,15 @@ XBT_INLINE void SIMIX_comm_start(smx_action_t action)
   }
 }
 
+/**
+ * \brief Answers the SIMIX requests associated to a communication action.
+ * \param action a finished communication action
+ */
 void SIMIX_comm_finish(smx_action_t action)
 {
   unsigned int destroy_count = 0;
   smx_req_t req;
 
-  SIMIX_comm_remove_from_processes(action);
-
   while ((req = xbt_fifo_shift(action->request_list))) {
 
     /* If a waitany request is waiting for this action to finish, then remove
@@ -636,8 +639,22 @@ void SIMIX_comm_finish(smx_action_t action)
         }
         break;
 
+      case SIMIX_CANCELED:
+        TRY {
+          if (req->issuer == action->comm.dst_proc) {
+            THROWF(cancel_error, 0, "Communication canceled by the sender");
+          }
+          else {
+            THROWF(cancel_error, 0, "Communication canceled by the receiver");
+          }
+        }
+        CATCH(req->issuer->running_ctx->exception) {
+          req->issuer->doexception = 1;
+        }
+        break;
+
       default:
-        THROW_IMPOSSIBLE;
+        xbt_die("Unexpected action state in SIMIX_comm_finish: %d", action->state);
     }
 
     /* if there is an exception during a waitany or a testany, indicate the position of the failed communication */
@@ -659,6 +676,10 @@ void SIMIX_comm_finish(smx_action_t action)
     SIMIX_comm_destroy(action);
 }
 
+/**
+ * \brief This function is called when a Surf communication action is finished.
+ * \param action the corresponding Simix communication
+ */
 void SIMIX_post_comm(smx_action_t action)
 {
   /* Update action state */
@@ -686,28 +707,19 @@ void SIMIX_post_comm(smx_action_t action)
   /* destroy the surf actions associated with the Simix communication */
   SIMIX_comm_destroy_internal_actions(action);
 
-  /* if there are requests associated with the action, then answer them */
-  if (xbt_fifo_size(action->request_list)) {
-    SIMIX_comm_finish(action);
-  }
-  else {
-    SIMIX_comm_remove_from_processes(action);
-  }
-}
-
-/**
- * \brief Removes a communication action from the list of pending communications
- * of both processes (if they still exist)
- * \param action a communication action
- */
-static void SIMIX_comm_remove_from_processes(smx_action_t action) {
-
+  /* remove the communication action from the list of pending communications
+   * of both processes (if they still exist) */
   if (action->comm.src_proc) {
     xbt_fifo_remove(action->comm.src_proc->comms, action);
   }
   if (action->comm.dst_proc) {
     xbt_fifo_remove(action->comm.dst_proc->comms, action);
   }
+
+  /* if there are requests associated with the action, then answer them */
+  if (xbt_fifo_size(action->request_list)) {
+    SIMIX_comm_finish(action);
+  }
 }
 
 void SIMIX_comm_cancel(smx_action_t action)
@@ -718,10 +730,9 @@ void SIMIX_comm_cancel(smx_action_t action)
     SIMIX_rdv_remove(action->comm.rdv, action);
     action->state = SIMIX_CANCELED;
   }
-  else if (!MC_IS_ENABLED
+  else if (!MC_IS_ENABLED /* when running the MC there are no surf actions */
       && (action->state == SIMIX_READY || action->state == SIMIX_RUNNING)) {
 
-    /* when running the MC there are no surf actions */
     surf_workstation_model->action_cancel(action->comm.surf_comm);
   }
 }
diff --git a/src/simix/smx_process.c b/src/simix/smx_process.c
index 0e8d20a..f2f550c 100644 (file)
@@ -46,28 +46,38 @@ void SIMIX_process_cleanup(smx_process_t process)
     SIMIX_comm_cancel(action);
 
     if (action->comm.src_proc == process) {
-      XBT_DEBUG("Found an unfinished send comm %p (detached = %d), state %d",
-          action, action->comm.detached, action->state);
+      XBT_DEBUG("Found an unfinished send comm %p (detached = %d), state %d, src = %p, dst = %p",
+          action, action->comm.detached, action->state, action->comm.src_proc, action->comm.dst_proc);
       action->comm.src_proc = NULL;
 
       if (action->comm.detached) {
-        /* the receiver was supposed to destroy the comm after completion,
-         * but the comm will actually never finish */
-        action->comm.refcount++;
+         if (action->comm.refcount == 0) {
+           /* I'm not supposed to destroy a detached comm from the sender side,
+            * unless there is no receiver matching the rdv */
+           action->comm.refcount++;
+           SIMIX_comm_destroy(action);
+         }
+      }
+      else {
+        SIMIX_comm_destroy(action);
       }
     }
     else if (action->comm.dst_proc == process){
-      XBT_DEBUG("Found an unfinished recv comm %p, state %d", action, action->state);
+      XBT_DEBUG("Found an unfinished recv comm %p, state %d, src = %p, dst = %p",
+          action, action->state, action->comm.src_proc, action->comm.dst_proc);
       action->comm.dst_proc = NULL;
+
+      if (action->comm.detached && action->comm.refcount == 1
+          && action->comm.src_proc != NULL) {
+        /* the comm will be freed right now, remove it from the sender */
+        xbt_fifo_remove(action->comm.src_proc->comms, action);
+      }
+      SIMIX_comm_destroy(action);
     }
     else {
-      XBT_DEBUG("Strange, I'm not in comm %p, state = %d, src = %p, dst = %p", action,
-          action->state, action->comm.src_proc, action->comm.dst_proc);
-      THROW_IMPOSSIBLE;
+      xbt_die("Communication action %p is in my list but I'm not the sender "
+          "or the receiver", action);
     }
-
-    /* FIXME uncommenting this instruction crashes complex simulations
-    SIMIX_comm_destroy(action); */
   }
 
   /*xbt_swag_remove(process, simix_global->process_to_run);*/