#define RMA_TAG -1234
/* Barrier shared by all processes during window creation, used to make sure
 * every peer has published its window pointer before any of them proceeds. */
xbt_bar_t creation_bar = NULL;
/* Simulated MPI window. Every process of the communicator holds one of these;
 * the connected_wins table lets any process reach the window of any peer, so
 * remote addresses can be computed locally.
 * NOTE(review): put/get dereference a ->base member (window memory base
 * pointer) that is not visible in this hunk — confirm it is declared here. */
typedef struct s_smpi_mpi_win{
  MPI_Aint size;           /* size in bytes of the window memory */
  int disp_unit;           /* unit used to scale target displacements */
  MPI_Comm comm;           /* communicator the window was created on */
  MPI_Info info;           /* info attached at creation (refcounted), or MPI_INFO_NULL */
  int assert;              /* assertion flags from the last synchronization call */
  xbt_dynar_t requests;    /* RMA requests pending on this window */
  xbt_bar_t bar;           /* barrier shared by all processes of the window */
  MPI_Win* connected_wins; /* windows of all peers, indexed by rank in comm */
  char* name;              /* user name (MPI_Win_set_name), NULL if unset */
  int opened;              /* >0 while inside an access/exposure epoch */
  MPI_Group group;         /* group passed to win_start/win_post, else MPI_GROUP_NULL */
} s_smpi_mpi_win_t;
win->size = size;
win->disp_unit = disp_unit;
win->assert = 0;
- //win->info = info;
+ win->info = info;
+ if(info!=MPI_INFO_NULL)
+ info->refcount++;
win->comm = comm;
+ win->name = NULL;
+ win->opened = 0;
+ win->group = MPI_GROUP_NULL;
win->requests = xbt_dynar_new(sizeof(MPI_Request), NULL);
win->connected_wins = xbt_malloc0(comm_size*sizeof(MPI_Win));
win->connected_wins[rank] = win;
xbt_barrier_wait((*win)->bar);
xbt_dynar_free(&(*win)->requests);
xbt_free((*win)->connected_wins);
+ if ((*win)->name != NULL){
+ xbt_free((*win)->name);
+ }
+ if((*win)->info!=MPI_INFO_NULL){
+ MPI_Info_free(&(*win)->info);
+ }
xbt_free(*win);
win = MPI_WIN_NULL;
return MPI_SUCCESS;
}
+void smpi_mpi_win_get_name(MPI_Win win, char* name, int* length){
+ if(win->name==NULL){
+ *length=0;
+ name=NULL;
+ return;
+ }
+ *length = strlen(win->name);
+ strcpy(name, win->name);
+}
+
+void smpi_mpi_win_get_group(MPI_Win win, MPI_Group* group){
+ if(win->comm != MPI_COMM_NULL){
+ *group = smpi_comm_group(win->comm);
+ smpi_group_use(*group);
+ }
+}
+
+void smpi_mpi_win_set_name(MPI_Win win, char* name){
+ win->name = xbt_strdup(name);;
+}
+
int smpi_mpi_win_fence( int assert, MPI_Win win){
XBT_DEBUG("Entering fence");
-
+ if(!win->opened)
+ win->opened=1;
if(assert != MPI_MODE_NOPRECEDE){
xbt_barrier_wait(win->bar);
/* MPI_Put: write origin data into the window of target_rank.
 * All windows are reachable through win->connected_wins, so the remote address
 * is computed locally; the transfer is simulated by a matched send/recv pair
 * (tag RMA_TAG+1) that is completed at the next synchronization call.
 * Returns MPI_ERR_WIN if no epoch is open, MPI_SUCCESS otherwise. */
int smpi_mpi_put( void *origin_addr, int origin_count, MPI_Datatype origin_datatype, int target_rank,
               MPI_Aint target_disp, int target_count, MPI_Datatype target_datatype, MPI_Win win)
{

  if(!win->opened)//check that post/start has been done
    return MPI_ERR_WIN;
  //get receiver pointer
  MPI_Win recv_win = win->connected_wins[target_rank];
  //target address = target window base + displacement scaled by the TARGET's disp_unit
  void* recv_addr = (void*) ( ((char*)recv_win->base) + target_disp * recv_win->disp_unit);
  smpi_datatype_use(origin_datatype);
  smpi_datatype_use(target_datatype);
  XBT_DEBUG("Entering MPI_Put to %d", target_rank);
  if(target_rank != smpi_comm_rank(win->comm)){
    //prepare send_request (MPI_OP_NULL: plain copy, no accumulation)
    MPI_Request sreq = smpi_rma_send_init(origin_addr, origin_count, origin_datatype,
        smpi_process_index(), smpi_group_index(smpi_comm_group(win->comm),target_rank), RMA_TAG+1, win->comm, MPI_OP_NULL);
    //prepare receiver request (created here, started later by the target at synchronization)
    MPI_Request rreq = smpi_rma_recv_init(recv_addr, target_count, target_datatype,
        smpi_process_index(), smpi_group_index(smpi_comm_group(win->comm),target_rank), RMA_TAG+1, recv_win->comm, MPI_OP_NULL);
    //push request to receiver's win
    xbt_dynar_push_as(recv_win->requests, MPI_Request, rreq);
    //push request to sender's win
    xbt_dynar_push_as(win->requests, MPI_Request, sreq);
  }else{
    //local target: perform the copy immediately, no simulated communication
    smpi_datatype_copy(origin_addr, origin_count, origin_datatype,
        recv_addr, target_count, target_datatype);
  }
  return MPI_SUCCESS;
}
/* MPI_Get: read data from the window of target_rank into origin_addr.
 * Mirror of smpi_mpi_put: a send is prepared on behalf of the target (and
 * started right away, since the target won't do it for us) and a matching
 * receive is queued locally (tag RMA_TAG+2), completed at synchronization.
 * Returns MPI_ERR_WIN if no epoch is open, MPI_SUCCESS otherwise. */
int smpi_mpi_get( void *origin_addr, int origin_count, MPI_Datatype origin_datatype, int target_rank,
              MPI_Aint target_disp, int target_count, MPI_Datatype target_datatype, MPI_Win win)
{
  if(!win->opened)//check that post/start has been done
    return MPI_ERR_WIN;
  //get sender pointer
  MPI_Win send_win = win->connected_wins[target_rank];
  //source address = target window base + displacement scaled by the TARGET's disp_unit
  void* send_addr = (void*)( ((char*)send_win->base) + target_disp * send_win->disp_unit);
  smpi_datatype_use(origin_datatype);
  smpi_datatype_use(target_datatype);
  XBT_DEBUG("Entering MPI_Get from %d", target_rank);
  if(target_rank != smpi_comm_rank(win->comm)){
    //prepare send_request (src/dst swapped: the TARGET is the sender here)
    MPI_Request sreq = smpi_rma_send_init(send_addr, target_count, target_datatype,
        smpi_group_index(smpi_comm_group(win->comm),target_rank), smpi_process_index(), RMA_TAG+2, send_win->comm, MPI_OP_NULL);
    //prepare receiver request
    MPI_Request rreq = smpi_rma_recv_init(origin_addr, origin_count, origin_datatype,
        smpi_group_index(smpi_comm_group(win->comm),target_rank), smpi_process_index(), RMA_TAG+2, win->comm, MPI_OP_NULL);

    //start the send, with another process than us as sender.
    smpi_mpi_start(sreq);

    //push request to receiver's win
    xbt_dynar_push_as(send_win->requests, MPI_Request, sreq);
    //push request to sender's win
    xbt_dynar_push_as(win->requests, MPI_Request, rreq);
  }else{
    //local target: perform the copy immediately, no simulated communication
    smpi_datatype_copy(send_addr, target_count, target_datatype,
        origin_addr, origin_count, origin_datatype);
  }
  return MPI_SUCCESS;
}
/* MPI_Accumulate: combine origin data into the target window using 'op'.
 * Same communication scheme as smpi_mpi_put (tag RMA_TAG+3), but the
 * requests carry 'op' so the reduction is applied on delivery.
 * Returns MPI_ERR_WIN if no epoch is open, MPI_SUCCESS otherwise. */
int smpi_mpi_accumulate( void *origin_addr, int origin_count, MPI_Datatype origin_datatype, int target_rank,
              MPI_Aint target_disp, int target_count, MPI_Datatype target_datatype, MPI_Op op, MPI_Win win)
{
  if(!win->opened)//check that post/start has been done
    return MPI_ERR_WIN;
  //FIXME: local version — unlike put/get there is no immediate-copy path when
  //target_rank is ourselves; a self-targeted accumulate goes through the
  //send/recv machinery (NOTE(review): confirm this is intended).
  //get receiver pointer
  MPI_Win recv_win = win->connected_wins[target_rank];
  //target address = target window base + displacement scaled by the TARGET's disp_unit
  void* recv_addr = (void*)( ((char*)recv_win->base) + target_disp * recv_win->disp_unit);
  XBT_DEBUG("Entering MPI_Accumulate to %d", target_rank);
  smpi_datatype_use(origin_datatype);
  smpi_datatype_use(target_datatype);

  //prepare send_request, carrying the reduction operator
  MPI_Request sreq = smpi_rma_send_init(origin_addr, origin_count, origin_datatype,
      smpi_process_index(), smpi_group_index(smpi_comm_group(win->comm),target_rank), RMA_TAG+3, win->comm, op);
  //prepare receiver request, pointed at the target window memory
  MPI_Request rreq = smpi_rma_recv_init(recv_addr, target_count, target_datatype,
      smpi_process_index(), smpi_group_index(smpi_comm_group(win->comm),target_rank), RMA_TAG+3, recv_win->comm, op);
  //push request to receiver's win
  xbt_dynar_push_as(recv_win->requests, MPI_Request, rreq);
  //start send
  smpi_mpi_start(sreq);

  //push request to sender's win
  xbt_dynar_push_as(win->requests, MPI_Request, sreq);

  return MPI_SUCCESS;
}
+
+int smpi_mpi_win_start(MPI_Group group, int assert, MPI_Win win){
+ /* From MPI forum advices
+ The call to MPI_WIN_COMPLETE does not return until the put call has completed at
+ the origin; and the target window will be accessed by the put operation only
+ after the call to MPI_WIN_START has matched a call to MPI_WIN_POST by the target
+ process. This still leaves much choice to implementors. The call to
+ MPI_WIN_START can block until the matching call to MPI_WIN_POST occurs at all
+ target processes. One can also have implementations where the call to
+ MPI_WIN_START is nonblocking, but the call to MPI_PUT blocks until the matching
+ call to MPI_WIN_POST occurred; or implementations where the first two calls are
+ nonblocking, but the call to MPI_WIN_COMPLETE blocks until the call to
+ MPI_WIN_POST occurred; or even implementations where all three calls can
+ complete before any target process called MPI_WIN_POST --- the data put must be
+ buffered, in this last case, so as to allow the put to complete at the origin
+ ahead of its completion at the target. However, once the call to MPI_WIN_POST is
+ issued, the sequence above must complete, without further dependencies.
+ */
+
+ //naive, blocking implementation.
+ int i=0,j=0;
+ int size = smpi_group_size(group);
+ MPI_Request* reqs = xbt_new0(MPI_Request, size);
+
+// for(i=0;i<size;i++){
+ while(j!=size){
+ int src=smpi_group_index(group,j);
+ if(src!=smpi_process_index()){
+ reqs[i]=smpi_irecv_init(NULL, 0, MPI_CHAR, src,RMA_TAG+4, MPI_COMM_WORLD);
+ i++;
+ }
+ j++;
+ }
+ size=i;
+ smpi_mpi_startall(size, reqs);
+ smpi_mpi_waitall(size, reqs, MPI_STATUSES_IGNORE);
+ for(i=0;i<size;i++){
+ smpi_mpi_request_free(&reqs[i]);
+ }
+ xbt_free(reqs);
+ win->opened++; //we're open for business !
+ win->group=group;
+ smpi_group_use(group);
+ return MPI_SUCCESS;
+}
+int smpi_mpi_win_post(MPI_Group group, int assert, MPI_Win win){
+ //let's make a synchronous send here
+ int i=0,j=0;
+ int size = smpi_group_size(group);
+ MPI_Request* reqs = xbt_new0(MPI_Request, size);
+
+ while(j!=size){
+ int dst=smpi_group_index(group,j);
+ if(dst!=smpi_process_index()){
+ reqs[i]=smpi_mpi_send_init(NULL, 0, MPI_CHAR, dst,
+ RMA_TAG+4, MPI_COMM_WORLD);
+ i++;
+ }
+ j++;
+ }
+ size=i;
+
+ smpi_mpi_startall(size, reqs);
+ smpi_mpi_waitall(size, reqs, MPI_STATUSES_IGNORE);
+ for(i=0;i<size;i++){
+ smpi_mpi_request_free(&reqs[i]);
+ }
+ xbt_free(reqs);
+ win->opened++; //we're open for business !
+ win->group=group;
+ smpi_group_use(group);
+ return MPI_SUCCESS;
+}
+
+int smpi_mpi_win_complete(MPI_Win win){
+ if(win->opened==0)
+ xbt_die("Complete called on already opened MPI_Win");
+// xbt_barrier_wait(win->bar);
+ //MPI_Comm comm = smpi_comm_new(win->group, NULL);
+ //mpi_coll_barrier_fun(comm);
+ //smpi_comm_destroy(comm);
+
+ XBT_DEBUG("Entering MPI_Win_Complete");
+ int i=0,j=0;
+ int size = smpi_group_size(win->group);
+ MPI_Request* reqs = xbt_new0(MPI_Request, size);
+
+ while(j!=size){
+ int dst=smpi_group_index(win->group,j);
+ if(dst!=smpi_process_index()){
+ reqs[i]=smpi_mpi_send_init(NULL, 0, MPI_CHAR, dst,
+ RMA_TAG+5, MPI_COMM_WORLD);
+ i++;
+ }
+ j++;
+ }
+ size=i;
+ XBT_DEBUG("Win_complete - Sending sync messages to %d processes", size);
+ smpi_mpi_startall(size, reqs);
+ smpi_mpi_waitall(size, reqs, MPI_STATUSES_IGNORE);
+
+ for(i=0;i<size;i++){
+ smpi_mpi_request_free(&reqs[i]);
+ }
+ xbt_free(reqs);
+
+ //now we can finish RMA calls
+
+ xbt_dynar_t reqqs = win->requests;
+ size = xbt_dynar_length(reqqs);
+
+ XBT_DEBUG("Win_complete - Finishing %d RMA calls", size);
+ unsigned int cpt=0;
+ MPI_Request req;
+ // start all requests that have been prepared by another process
+ xbt_dynar_foreach(reqqs, cpt, req){
+ if (req->flags & PREPARED) smpi_mpi_start(req);
+ }
+
+ MPI_Request* treqs = xbt_dynar_to_array(reqqs);
+ smpi_mpi_waitall(size,treqs,MPI_STATUSES_IGNORE);
+ xbt_free(treqs);
+ win->requests=xbt_dynar_new(sizeof(MPI_Request), NULL);
+ win->opened--; //we're closed for business !
return MPI_SUCCESS;
}
+
+
+int smpi_mpi_win_wait(MPI_Win win){
+// xbt_barrier_wait(win->bar);
+ //MPI_Comm comm = smpi_comm_new(win->group, NULL);
+ //mpi_coll_barrier_fun(comm);
+ //smpi_comm_destroy(comm);
+ //naive, blocking implementation.
+ XBT_DEBUG("Entering MPI_Win_Wait");
+ int i=0,j=0;
+ int size = smpi_group_size(win->group);
+ MPI_Request* reqs = xbt_new0(MPI_Request, size);
+
+// for(i=0;i<size;i++){
+ while(j!=size){
+ int src=smpi_group_index(win->group,j);
+ if(src!=smpi_process_index()){
+ reqs[i]=smpi_irecv_init(NULL, 0, MPI_CHAR, src,RMA_TAG+5, MPI_COMM_WORLD);
+ i++;
+ }
+ j++;
+ }
+ size=i;
+ XBT_DEBUG("Win_wait - Receiving sync messages from %d processes", size);
+ smpi_mpi_startall(size, reqs);
+ smpi_mpi_waitall(size, reqs, MPI_STATUSES_IGNORE);
+ for(i=0;i<size;i++){
+ smpi_mpi_request_free(&reqs[i]);
+ }
+ xbt_free(reqs);
+
+ xbt_dynar_t reqqs = win->requests;
+ size = xbt_dynar_length(reqqs);
+
+ XBT_DEBUG("Win_complete - Finishing %d RMA calls", size);
+
+ unsigned int cpt=0;
+ MPI_Request req;
+ // start all requests that have been prepared by another process
+ xbt_dynar_foreach(reqqs, cpt, req){
+ if (req->flags & PREPARED) smpi_mpi_start(req);
+ }
+
+ MPI_Request* treqs = xbt_dynar_to_array(reqqs);
+ smpi_mpi_waitall(size,treqs,MPI_STATUSES_IGNORE);
+ xbt_free(treqs);
+ win->requests=xbt_dynar_new(sizeof(MPI_Request), NULL);
+ win->opened--; //we're opened for business !
+ return MPI_SUCCESS;
+}