Logo AND Algorithmique Numérique Distribuée

Public GIT Repository
Test for failed host is wrong for parallel tasks.
[simgrid.git] / src / simix / smx_host.c
index db4429b..f663ea2 100644 (file)
@@ -4,7 +4,7 @@
 /* This program is free software; you can redistribute it and/or modify it
  * under the terms of the license (GNU LGPL) which comes with this package. */
 
-#include "private.h"
+#include "smx_private.h"
 #include "xbt/sysdep.h"
 #include "xbt/log.h"
 #include "xbt/dict.h"
@@ -67,7 +67,7 @@ void SIMIX_host_destroy(void *h)
     SIMIX_display_process_status();
     THROWF(arg_error, 0, "%s", msg);
   }
-
+  xbt_dynar_free(&host->auto_restart_processes);
   xbt_swag_free(host->process_list);
 
   /* Clean host structure */
@@ -90,7 +90,7 @@ xbt_dict_t SIMIX_host_get_dict(void)
   void **host = NULL;
 
   xbt_lib_foreach(host_lib, cursor, name, host){
-         if(host[SIMIX_HOST_LEVEL])
+    if(host[SIMIX_HOST_LEVEL])
             xbt_dict_set(host_dict,name,host[SIMIX_HOST_LEVEL], NULL);
   }
   return host_dict;
@@ -111,7 +111,7 @@ smx_host_t SIMIX_host_self(void)
   return (process == NULL) ? NULL : SIMIX_process_get_host(process);
 }
 
-/* needs to be public and without request because it is called
+/* needs to be public and without simcall because it is called
    by exceptions and logging events */
 const char* SIMIX_host_self_get_name(void)
 {
@@ -176,6 +176,50 @@ void* SIMIX_host_get_data(smx_host_t host)
 
   return host->data;
 }
+void _SIMIX_host_free_process_arg(void *);
+void _SIMIX_host_free_process_arg(void *data) {
+  smx_process_arg_t arg = *(void**)data;
+  int i;
+  xbt_free(arg->name);
+  for (i = 0; i < arg->argc; i++) {
+    xbt_free(arg->argv[i]);
+  }
+  xbt_free(arg->argv);
+  xbt_free(arg);
+}
+void SIMIX_host_add_auto_restart_process(smx_host_t host,
+                                         const char *name,
+                                         xbt_main_func_t code,
+                                         void *data,
+                                         const char *hostname,
+                                         double kill_time,
+                                         int argc, char **argv,
+                                         xbt_dict_t properties,
+                                         int auto_restart) {
+  if (!host->auto_restart_processes) {
+    host->auto_restart_processes = xbt_dynar_new(sizeof(smx_process_arg_t),_SIMIX_host_free_process_arg);
+  }
+  smx_process_arg_t arg = xbt_new(s_smx_process_arg_t,1);
+
+  arg->name = xbt_strdup(name);
+  arg->code = code;
+  arg->data = data;
+  arg->hostname = hostname;
+  arg->kill_time = kill_time;
+  arg->argc = argc;
+  arg->argv = xbt_new(char*,argc + 1);
+
+  int i;
+  for (i = 0; i < argc; i++) {
+    arg->argv[i] = xbt_strdup(argv[i]);
+  }
+
+  arg->properties = properties;
+  arg->auto_restart = auto_restart;
+
+  xbt_dynar_push_as(host->auto_restart_processes,smx_process_arg_t,arg);
+}
+
 
 void SIMIX_host_set_data(smx_host_t host, void *data)
 {
@@ -204,7 +248,7 @@ smx_action_t SIMIX_host_execute(const char *name, smx_host_t host,
   if (!MC_IS_ENABLED) {
     action->execution.surf_exec =
       surf_workstation_model->extension.workstation.execute(host->host,
-         computation_amount);
+    computation_amount);
     surf_workstation_model->action_data_set(action->execution.surf_exec, action);
     surf_workstation_model->set_priority(action->execution.surf_exec, priority);
   }
@@ -243,7 +287,7 @@ smx_action_t SIMIX_host_parallel_execute( const char *name,
     action->execution.surf_exec =
       surf_workstation_model->extension.workstation.
       execute_parallel_task(host_nb, workstation_list, computation_amount,
-                           communication_amount, amount, rate);
+                      communication_amount, rate);
 
     surf_workstation_model->action_data_set(action->execution.surf_exec, action);
   }
@@ -254,16 +298,19 @@ smx_action_t SIMIX_host_parallel_execute( const char *name,
 
 void SIMIX_host_execution_destroy(smx_action_t action)
 {
+  int destroyed=0;
   XBT_DEBUG("Destroy action %p", action);
 
-  xbt_free(action->name);
 
   if (action->execution.surf_exec) {
-    surf_workstation_model->action_unref(action->execution.surf_exec);
+    destroyed = surf_workstation_model->action_unref(action->execution.surf_exec);
     action->execution.surf_exec = NULL;
   }
 
-  xbt_mallocator_release(simix_global->action_mallocator, action);
+  if (destroyed) {
+    xbt_free(action->name);
+    xbt_mallocator_release(simix_global->action_mallocator, action);
+  }
 }
 
 void SIMIX_host_execution_cancel(smx_action_t action)
@@ -295,18 +342,18 @@ void SIMIX_host_execution_set_priority(smx_action_t action, double priority)
     surf_workstation_model->set_priority(action->execution.surf_exec, priority);
 }
 
-void SIMIX_pre_host_execution_wait(smx_req_t req)
+void SIMIX_pre_host_execution_wait(smx_simcall_t simcall)
 {
-  smx_action_t action = req->host_execution_wait.execution;
+  smx_action_t action = simcall->host_execution_wait.execution;
 
-  XBT_DEBUG("Wait for execution of action %p, state %d", action, action->state);
+  XBT_DEBUG("Wait for execution of action %p, state %d", action, (int)action->state);
 
-  /* Associate this request to the action */
-  xbt_fifo_push(action->request_list, req);
-  req->issuer->waiting_action = action;
+  /* Associate this simcall to the action */
+  xbt_fifo_push(action->simcalls, simcall);
+  simcall->issuer->waiting_action = action;
 
   /* set surf's action */
-  if (MC_IS_ENABLED){
+  if (MC_IS_ENABLED) {
     action->state = SIMIX_DONE;
     SIMIX_execution_finish(action);
     return;
@@ -326,50 +373,47 @@ void SIMIX_host_execution_suspend(smx_action_t action)
 void SIMIX_host_execution_resume(smx_action_t action)
 {
   if(action->execution.surf_exec)
-    surf_workstation_model->suspend(action->execution.surf_exec);
+    surf_workstation_model->resume(action->execution.surf_exec);
 }
 
 void SIMIX_execution_finish(smx_action_t action)
 {
-  volatile xbt_fifo_item_t item;
-  smx_req_t req;
+  xbt_fifo_item_t item;
+  smx_simcall_t simcall;
 
-  xbt_fifo_foreach(action->request_list, item, req, smx_req_t) {
+  xbt_fifo_foreach(action->simcalls, item, simcall, smx_simcall_t) {
 
     switch (action->state) {
 
       case SIMIX_DONE:
         /* do nothing, action done */
-       XBT_DEBUG("SIMIX_execution_finished: execution successful");
+  XBT_DEBUG("SIMIX_execution_finished: execution successful");
         break;
 
       case SIMIX_FAILED:
-        XBT_DEBUG("SIMIX_execution_finished: host '%s' failed", req->issuer->smx_host->name);
-        TRY {
-          THROWF(host_error, 0, "Host failed");
-        }
-       CATCH(req->issuer->running_ctx->exception) {
-         req->issuer->doexception = 1;
-       }
-      break;
+        XBT_DEBUG("SIMIX_execution_finished: host '%s' failed", simcall->issuer->smx_host->name);
+        simcall->issuer->context->iwannadie = 1;
+        //SMX_EXCEPTION(simcall->issuer, host_error, 0, "Host failed");
+        break;
 
       case SIMIX_CANCELED:
         XBT_DEBUG("SIMIX_execution_finished: execution canceled");
-        TRY {
-          THROWF(cancel_error, 0, "Canceled");
-        }
-       CATCH(req->issuer->running_ctx->exception) {
-         req->issuer->doexception = 1;
-        }
-       break;
+        SMX_EXCEPTION(simcall->issuer, cancel_error, 0, "Canceled");
+        break;
 
       default:
         xbt_die("Internal error in SIMIX_execution_finish: unexpected action state %d",
-            action->state);
+            (int)action->state);
     }
-    req->issuer->waiting_action = NULL;
-    req->host_execution_wait.result = action->state;
-    SIMIX_request_answer(req);
+    /* check if the host is down */
+    if (surf_workstation_model->extension.
+        workstation.get_state(simcall->issuer->smx_host->host) != SURF_RESOURCE_ON) {
+      simcall->issuer->context->iwannadie = 1;
+    }
+
+    simcall->issuer->waiting_action =    NULL;
+    simcall->host_execution_wait.result = action->state;
+    SIMIX_simcall_answer(simcall);
   }
 
   /* We no longer need it */
@@ -378,23 +422,29 @@ void SIMIX_execution_finish(smx_action_t action)
 
 void SIMIX_post_host_execute(smx_action_t action)
 {
-  /* FIXME: check if the host running the action failed or not*/
-  /*if(surf_workstation_model->extension.workstation.get_state(action->host->host))*/
-
-  /* If the host running the action didn't fail, then the action was canceled */
-  if (surf_workstation_model->action_state_get(action->execution.surf_exec) == SURF_ACTION_FAILED)
-     action->state = SIMIX_CANCELED;
-  else
-     action->state = SIMIX_DONE;
+  if (action->type == SIMIX_ACTION_EXECUTE && /* FIXME: handle resource failure
+                                               * for parallel tasks too */
+      surf_workstation_model->extension.workstation.get_state(action->execution.host->host) == SURF_RESOURCE_OFF) {
+    /* If the host running the action failed, notice it so that the asking
+     * process can be killed if it runs on that host itself */
+    action->state = SIMIX_FAILED;
+  } else if (surf_workstation_model->action_state_get(action->execution.surf_exec) == SURF_ACTION_FAILED) {
+    /* If the host running the action didn't fail, then the action was
+     * canceled */
+    action->state = SIMIX_CANCELED;
+  } else {
+    action->state = SIMIX_DONE;
+  }
 
   if (action->execution.surf_exec) {
     surf_workstation_model->action_unref(action->execution.surf_exec);
     action->execution.surf_exec = NULL;
   }
 
-  /* If there are requests associated with the action, then answer them */
-  if (xbt_fifo_size(action->request_list))
+  /* If there are simcalls associated with the action, then answer them */
+  if (xbt_fifo_size(action->simcalls)) {
     SIMIX_execution_finish(action);
+  }
 }