Logo AND Algorithmique Numérique Distribuée

Public GIT Repository
Use the xbt barrier to finalize smpi processes cleanly.
author: Augustin Degomme <degomme@idpann.imag.fr>
Tue, 6 May 2014 13:41:53 +0000 (15:41 +0200)
committer: Augustin Degomme <degomme@idpann.imag.fr>
Tue, 6 May 2014 13:56:16 +0000 (15:56 +0200)
This removes the MPI_Barrier that was used before in smpi_process_finalize (a zero-length send/receive exchange over MPI_COMM_WORLD), which might have caused problems in some cases.

src/smpi/private.h
src/smpi/smpi_deployment.c
src/smpi/smpi_global.c

index 2c53164..aec9e4b 100644 (file)
@@ -10,6 +10,7 @@
 #include "internal_config.h"
 #include "xbt.h"
 #include "xbt/xbt_os_time.h"
+#include "xbt/synchro_core.h"
 #include "simgrid/simix.h"
 #include "smpi/smpi_interface.h"
 #include "smpi/smpi.h"
@@ -143,7 +144,8 @@ double smpi_process_simulated_elapsed(void);
 void smpi_process_set_sampling(int s);
 int smpi_process_get_sampling(void);
 
-MPI_Comm* smpi_deployment_register_process(const char* instance_id, int rank, int index);
+void smpi_deployment_register_process(const char* instance_id, int rank, int index, MPI_Comm**, xbt_bar_t*);
+void smpi_deployment_cleanup_instances(void);
 
 void smpi_comm_copy_buffer_callback(smx_action_t comm,
                                            void *buff, size_t buff_size);
index 42f679f..83714ad 100644 (file)
@@ -6,6 +6,7 @@
 
 #include "private.h"
 #include "xbt/sysdep.h"
+#include "xbt/synchro_core.h"
 #include "xbt/log.h"
 #include "xbt/dict.h"
 
@@ -19,6 +20,7 @@ typedef struct s_smpi_mpi_instance{
   int present_processes;
   int index;
   MPI_Comm comm_world;
+  xbt_bar_t finalization_barrier;
 } s_smpi_mpi_instance_t;
 
 
@@ -42,6 +44,8 @@ void SMPI_app_instance_register(const char *name, xbt_main_func_t code, int num_
   instance->present_processes = 0;
   instance->index = process_count;
   instance->comm_world = MPI_COMM_NULL;
+  instance->finalization_barrier=xbt_barrier_init(num_processes);
+
   process_count+=num_processes;
 
   if(!smpi_instances){
@@ -54,11 +58,13 @@ void SMPI_app_instance_register(const char *name, xbt_main_func_t code, int num_
 
 
 //get the index of the process in the process_data array
-MPI_Comm* smpi_deployment_register_process(const char* instance_id, int rank, int index){
+void smpi_deployment_register_process(const char* instance_id, int rank, int index,MPI_Comm** comm, xbt_bar_t* bar){
 
   if(!smpi_instances){//no instance registered, we probably used smpirun.
-      index_to_process_data[index]=index;
-      return NULL;
+    index_to_process_data[index]=index;
+    *bar = NULL;
+    *comm = NULL;
+    return;
   }
 
   s_smpi_mpi_instance_t* instance = xbt_dict_get_or_null(smpi_instances, instance_id);
@@ -72,5 +78,19 @@ MPI_Comm* smpi_deployment_register_process(const char* instance_id, int rank, in
   instance->present_processes++;
   index_to_process_data[index]=instance->index+rank;
   smpi_group_set_mapping(smpi_comm_group(instance->comm_world), index, rank);
-  return & instance->comm_world;
+  *bar = instance->finalization_barrier;
+  *comm = &instance->comm_world;
+  return;
+}
+
+void smpi_deployment_cleanup_instances(){
+  xbt_dict_cursor_t cursor = NULL;
+  s_smpi_mpi_instance_t* instance = NULL;
+  char *name = NULL;
+  xbt_dict_foreach((xbt_dict_t) smpi_instances, cursor, name, instance) {
+    while (smpi_group_unuse(smpi_comm_group(instance->comm_world)) > 0);
+    xbt_free(instance->comm_world);
+    xbt_barrier_destroy(instance->finalization_barrier);
+  }
 }
+
index 1ac052f..fbf18b4 100644 (file)
@@ -34,6 +34,7 @@ typedef struct s_smpi_process_data {
   char state;
   int sampling;                 /* inside an SMPI_SAMPLE_ block? */
   char* instance_id;
+  xbt_bar_t finalization_barrier;
 } s_smpi_process_data_t;
 
 static smpi_process_data_t *process_data = NULL;
@@ -84,9 +85,12 @@ void smpi_process_init(int *argc, char ***argv)
     if(!index_to_process_data){
         index_to_process_data=(int*)xbt_malloc(SIMIX_process_count()*sizeof(int));
     }
-    MPI_Comm* temp_comm_world = smpi_deployment_register_process(instance_id, rank, index);
+    MPI_Comm* temp_comm_world;
+    xbt_bar_t temp_bar;
+    smpi_deployment_register_process(instance_id, rank, index, &temp_comm_world ,&temp_bar);
     data = smpi_process_remote_data(index);
     data->comm_world = temp_comm_world;
+    if(temp_bar != NULL) data->finalization_barrier = temp_bar;
     data->index = index;
     data->instance_id = instance_id;
     simcall_process_set_data(proc, data);
@@ -128,44 +132,10 @@ void smpi_process_destroy(void)
  */
 void smpi_process_finalize(void)
 {
-  if(MC_is_active()){
+    int index = smpi_process_index();
     // wait for all pending asynchronous comms to finish
-    while (SIMIX_process_has_pending_comms(SIMIX_process_self())) {
-      simcall_process_sleep(0.01);
-    }
-  }else{
-    int i;
-    int size = smpi_comm_size(MPI_COMM_WORLD);
-    int rank = smpi_comm_rank(MPI_COMM_WORLD);
-    /* All non-root send & receive zero-length message. */
-    if (rank > 0) {
-      smpi_mpi_ssend (NULL, 0, MPI_BYTE, 0,
-                      COLL_TAG_BARRIER,
-                      MPI_COMM_WORLD);
-      smpi_mpi_recv (NULL, 0, MPI_BYTE, 0,
-                     COLL_TAG_BARRIER,
-                     MPI_COMM_WORLD, MPI_STATUS_IGNORE);
-    }
-    /* The root collects and broadcasts the messages. */
-    else {
-      MPI_Request* requests;
-      requests = (MPI_Request*)malloc( size * sizeof(MPI_Request) );
-      for (i = 1; i < size; ++i) {
-        requests[i] = smpi_mpi_irecv(NULL, 0, MPI_BYTE, MPI_ANY_SOURCE,
-                                     COLL_TAG_BARRIER, MPI_COMM_WORLD
-                                     );
-      }
-      smpi_mpi_waitall( size-1, requests+1, MPI_STATUSES_IGNORE );
-      for (i = 1; i < size; ++i) {
-        requests[i] = smpi_mpi_issend(NULL, 0, MPI_BYTE, i,
-                                      COLL_TAG_BARRIER,
-                                      MPI_COMM_WORLD
-                                      );
-      }
-      smpi_mpi_waitall( size-1, requests+1, MPI_STATUSES_IGNORE );
-      free( requests );
-    }
-  }
+    xbt_barrier_wait(process_data[index_to_process_data[index]]->finalization_barrier);
+
 }
 
 /**
@@ -441,6 +411,7 @@ void smpi_global_init(void)
     process_data[i]->comm_world = NULL;
     process_data[i]->state = SMPI_UNINITIALIZED;
     process_data[i]->sampling = 0;
+    process_data[i]->finalization_barrier = NULL;
   }
   //if the process was launched through smpirun script
   //we generate a global mpi_comm_world
@@ -449,9 +420,12 @@ void smpi_global_init(void)
   if(smpirun){
     group = smpi_group_new(process_count);
     MPI_COMM_WORLD = smpi_comm_new(group, NULL);
+    xbt_bar_t bar=xbt_barrier_init(process_count);
+
     MPI_UNIVERSE_SIZE = smpi_comm_size(MPI_COMM_WORLD);
     for (i = 0; i < process_count; i++) {
       smpi_group_set_mapping(group, i, i);
+      process_data[i]->finalization_barrier = bar;
     }
   }
 }
@@ -465,6 +439,9 @@ void smpi_global_destroy(void)
   if (MPI_COMM_WORLD != MPI_COMM_UNINITIALIZED){
       while (smpi_group_unuse(smpi_comm_group(MPI_COMM_WORLD)) > 0);
       xbt_free(MPI_COMM_WORLD);
+      xbt_barrier_destroy(process_data[0]->finalization_barrier);
+  }else{
+      smpi_deployment_cleanup_instances();
   }
   MPI_COMM_WORLD = MPI_COMM_NULL;
   for (i = 0; i < count; i++) {