Revalidate the tesh files which were actually wrong since hosts were not restarted at the right time...
> [ 0.000000] (1:master@Tremblay) Bourassa
> [ 0.000000] (1:master@Tremblay) Tremblay
> [ 0.000000] (1:master@Tremblay) Got 20 task to process :
-> [ 1.000000] (0:@) Restart processes on host: Jupiter
+> [ 1.000000] (0:@) Restart processes on host: Fafard
> [ 1.000000] (1:master@Tremblay) Mmh. Something went wrong with 'Jupiter'. Nevermind. Let's keep going!
+> [ 2.000000] (0:@) Restart processes on host: Jupiter
> [ 11.000000] (1:master@Tremblay) Mmh. Got timeouted while speaking to 'Fafard'. Nevermind. Let's keep going!
> [ 12.082474] (1:master@Tremblay) Send completed
> [ 12.082474] (4:slave@Ginette) Received "Task"
> [ 0.000000] (1:master@Tremblay) Bourassa
> [ 0.000000] (1:master@Tremblay) Tremblay
> [ 0.000000] (1:master@Tremblay) Got 20 task to process :
-> [ 1.000000] (0:@) Restart processes on host: Jupiter
+> [ 1.000000] (0:@) Restart processes on host: Fafard
> [ 1.000000] (1:master@Tremblay) Mmh. Something went wrong with 'Jupiter'. Nevermind. Let's keep going!
+> [ 2.000000] (0:@) Restart processes on host: Jupiter
> [ 11.000000] (1:master@Tremblay) Mmh. Got timeouted while speaking to 'Fafard'. Nevermind. Let's keep going!
> [ 12.030928] (1:master@Tremblay) Send completed
> [ 12.030928] (4:slave@Ginette) Received "Task"
> [ 0.000000] (1:master@Tremblay) Bourassa
> [ 0.000000] (1:master@Tremblay) Tremblay
> [ 0.000000] (1:master@Tremblay) Got 20 task to process :
-> [ 1.000000] (0:@) Restart processes on host: Jupiter
+> [ 1.000000] (0:@) Restart processes on host: Fafard
> [ 1.000000] (1:master@Tremblay) Mmh. Something went wrong with 'Jupiter'. Nevermind. Let's keep going!
+> [ 2.000000] (0:@) Restart processes on host: Jupiter
> [ 11.000000] (1:master@Tremblay) Mmh. Got timeouted while speaking to 'Fafard'. Nevermind. Let's keep going!
> [ 12.082474] (1:master@Tremblay) Send completed
> [ 12.082474] (4:slave@Ginette) Received "Task"
*/
XBT_PUBLIC_DATA(xbt_dynar_t) model_list;
+/** \ingroup SURF_simulation
+ * \brief List of hosts that have just restarted and whose autorestart processes should be restarted.
+ */
+XBT_PUBLIC_DATA(xbt_dynar_t) host_that_restart;
+
/** \ingroup SURF_simulation
* \brief List of hosts for which one want to be notified if they ever restart.
*/
int surf_get_nthreads(void);
void surf_set_nthreads(int nthreads);
-void surf_watched_hosts(void);
-
/*
* Returns the initial path. On Windows the initial path is
* the current directory for the current process in the other
SIMIX_simcall_post((smx_action_t) action->data);
}
+ /* Autorestart all processes of the hosts listed in host_that_restart */
+ char *hostname = NULL;
+ xbt_dynar_foreach(host_that_restart,iter,hostname) {
+ XBT_INFO("Restart processes on host: %s",hostname);
+ SIMIX_host_autorestart(SIMIX_host_get_by_name(hostname));
+ }
+ xbt_dynar_reset(host_that_restart);
+
/* Clean processes to destroy */
SIMIX_process_empty_trash();
{
unsigned int cpt;
smx_process_arg_t arg;
- xbt_dynar_foreach(SIMIX_host_priv(host)->auto_restart_processes,cpt,arg) {
+ xbt_dynar_t process_list = SIMIX_host_priv(host)->auto_restart_processes;
+ if(!process_list) return;
+
+ xbt_dynar_foreach(process_list,cpt,arg) {
smx_process_t process;
}
}
- xbt_dynar_reset(SIMIX_host_priv(host)->auto_restart_processes);
+ xbt_dynar_reset(process_list);
}
void SIMIX_host_autorestart(smx_host_t host)
lmm_variable_t var = NULL;
lmm_element_t elem = NULL;
- surf_watched_hosts();
-
if (event_type == cpu->power_event) {
cpu->power_scale = value;
lmm_update_constraint_bound(surf_cpu_model->model_private->maxmin_system, cpu->constraint,
if (tmgr_trace_event_free(event_type))
cpu->power_event = NULL;
} else if (event_type == cpu->state_event) {
- if (value > 0)
+ if (value > 0) {
+ if(cpu->state_current == SURF_RESOURCE_OFF)
+ xbt_dynar_push_as(host_that_restart, char*, (cpu->generic_resource.name));
cpu->state_current = SURF_RESOURCE_ON;
- else {
+ } else {
lmm_constraint_t cnst = cpu->constraint;
cpu->state_current = SURF_RESOURCE_OFF;
cpu_ti_t cpu = id;
surf_action_cpu_ti_t action;
- surf_watched_hosts();
-
if (event_type == cpu->power_event) {
tmgr_trace_t power_trace;
surf_cpu_ti_tgmr_t trace;
cpu->power_event = NULL;
} else if (event_type == cpu->state_event) {
- if (value > 0)
+ if (value > 0) {
+ if(cpu->state_current == SURF_RESOURCE_OFF)
+ xbt_dynar_push_as(host_that_restart, char*, (cpu->generic_resource.name));
cpu->state_current = SURF_RESOURCE_ON;
- else {
+ } else {
cpu->state_current = SURF_RESOURCE_OFF;
/* put all action running on cpu to failed */
tmgr_history_t history = NULL;
lmm_system_t maxmin_system = NULL;
xbt_dynar_t surf_path = NULL;
+xbt_dynar_t host_that_restart = NULL;
xbt_dict_t watched_hosts_lib;
/* Don't forget to update the option description in smx_config when you change this */
tmgr_trace_event_t event = NULL;
unsigned int iter;
+ if(!host_that_restart)
+ host_that_restart = xbt_dynar_new(sizeof(char*), NULL);
+
if (max_date != -1.0 && max_date != NOW) {
min = max_date - NOW;
}
tmgr_history_get_next_event_leq(history, next_event_date,
&value,
(void **) &resource))) {
- if (resource->model->model_private->resource_used(resource)) {
+ if (resource->model->model_private->resource_used(resource) ||
+ xbt_dict_get_or_null(watched_hosts_lib,resource->name)
+ ) {
min = next_event_date - NOW;
XBT_DEBUG
("This event will modify model state. Next event set to %f",
/* update state of model_obj according to new value. Does not touch lmm.
It will be modified if needed when updating actions */
XBT_DEBUG("Calling update_resource_state for resource %s with min %lf",
- resource->model->name, min);
+ resource->name, min);
+
resource->model->model_private->update_resource_state(resource,
event, value,
next_event_date);
model->model_private->update_actions_state(NOW, min);
}
-
-/* This function is a pimple that we ought to fix. But it won't be easy.
- *
- * The surf_solve() function does properly return the set of actions that changed.
- * Instead, each model change a global data, and then the caller of surf_solve must
- * pick into these sets of action_failed and action_done.
- *
- * This was not clean but ok as long as we didn't had to restart the processes when the resource comes back up.
- * We worked by putting sentinel actions on every resources we are interested in,
- * so that surf informs us if/when the corresponding resource fails.
- *
- * But this does not work to get Simix informed of when a resource comes back up, and this is where this pimple comes.
- * We have a set of resources that are currently down and for which simix needs to know when it comes back up.
- * And the current function is called *at every simulation step* to sweep over that set, searching for a resource
- * that was turned back up in the meanwhile. This is UGLY and slow.
- *
- * The proper solution would be to not rely on globals for the action_failed and action_done swags.
- * They must be passed as parameter by the caller (the handling of these actions in simix may let you
- * think that these two sets can be merged, but their handling in SimDag induce the contrary unless this
- * simdag code can check by itself whether the action is done of failed -- seems very doable, but yet more
- * cleanup to do).
- *
- * Once surf_solve() is passed the set of actions that changed, you want to add a new set of resources back up
- * as parameter to this function. You also want to add a boolean field "restart_watched" to each resource, and
- * make sure that whenever a resource with this field enabled comes back up, it's added to that set so that Simix
- * sees it and react accordingly. This would kill that need for surf to call simix.
- *
- */
-
-static void remove_watched_host(void *key)
-{
- xbt_dict_remove(watched_hosts_lib, *(char**)key);
-}
-
-void surf_watched_hosts(void)
-{
- char *key;
- void *host;
- xbt_dict_cursor_t cursor;
- xbt_dynar_t hosts = xbt_dynar_new(sizeof(char*), NULL);
-
- XBT_DEBUG("Check for host SURF_RESOURCE_ON on watched_hosts_lib");
- xbt_dict_foreach(watched_hosts_lib,cursor,key,host)
- {
- if(SIMIX_host_get_state(host) == SURF_RESOURCE_ON){
- XBT_INFO("Restart processes on host: %s",SIMIX_host_get_name(host));
- SIMIX_host_autorestart(host);
- xbt_dynar_push_as(hosts, char*, key);
- }
- else
- XBT_DEBUG("See SURF_RESOURCE_OFF on host: %s",key);
- }
- xbt_dynar_map(hosts, remove_watched_host);
- xbt_dynar_free(&hosts);
-}