src/smpi/smpi_rma.cpp

   1
   2 /* Copyright (c) 2007-2015. The SimGrid Team.
   3  * All rights reserved.                                                     */
   4
   5 /* This program is free software; you can redistribute it and/or modify it
   6  * under the terms of the license (GNU LGPL) which comes with this package. */
   7
   8 #include "private.h"
   9 #include <vector>
  10
  11 XBT_LOG_NEW_DEFAULT_SUBCATEGORY(smpi_rma, smpi, "Logging specific to SMPI (RMA operations)");
  12
  13 #define RMA_TAG -1234
  14
  15 xbt_bar_t creation_bar = nullptr;
  16
  17 typedef struct s_smpi_mpi_win{
  18   void* base;
  19   MPI_Aint size;
  20   int disp_unit;
  21   MPI_Comm comm;
  22   MPI_Info info;
  23   int assert;
  24   std::vector<MPI_Request> *requests;
  25   xbt_bar_t bar;
  26   MPI_Win* connected_wins;
  27   char* name;
  28   int opened;
  29   MPI_Group group;
  30 } s_smpi_mpi_win_t;
  31
  32
  33 MPI_Win smpi_mpi_win_create( void *base, MPI_Aint size, int disp_unit, MPI_Info info, MPI_Comm comm){
  34   MPI_Win win;
  35
  36   int comm_size = smpi_comm_size(comm);
  37   int rank=smpi_comm_rank(comm);
  38   XBT_DEBUG("Creating window");
  39
  40   win = xbt_new(s_smpi_mpi_win_t, 1);
  41   win->base = base;
  42   win->size = size;
  43   win->disp_unit = disp_unit;
  44   win->assert = 0;
  45   win->info = info;
  46   if(info!=MPI_INFO_NULL)
  47     info->refcount++;
  48   win->comm = comm;
  49   win->name = nullptr;
  50   win->opened = 0;
  51   win->group = MPI_GROUP_NULL;
  52   win->requests = new std::vector<MPI_Request>();
  53   win->connected_wins = xbt_new0(MPI_Win, comm_size);
  54   win->connected_wins[rank] = win;
  55
  56   if(rank==0){
  57     win->bar=xbt_barrier_init(comm_size);
  58   }
  59   mpi_coll_allgather_fun(&(win->connected_wins[rank]), sizeof(MPI_Win), MPI_BYTE, win->connected_wins, sizeof(MPI_Win),
  60                          MPI_BYTE, comm);
  61
  62   mpi_coll_bcast_fun( &(win->bar), sizeof(xbt_bar_t), MPI_BYTE, 0, comm);
  63
  64   mpi_coll_barrier_fun(comm);
  65
  66   return win;
  67 }
  68
  69 int smpi_mpi_win_free( MPI_Win* win){
  70   //As per the standard, perform a barrier to ensure every async comm is finished
  71   xbt_barrier_wait((*win)->bar);
  72   delete (*win)->requests;
  73   xbt_free((*win)->connected_wins);
  74   if ((*win)->name != nullptr){
  75     xbt_free((*win)->name);
  76   }
  77   if((*win)->info!=MPI_INFO_NULL){
  78     MPI_Info_free(&(*win)->info);
  79   }
  80
  81   mpi_coll_barrier_fun((*win)->comm);
  82   int rank=smpi_comm_rank((*win)->comm);
  83   if(rank == 0)
  84     xbt_barrier_destroy((*win)->bar);
  85   xbt_free(*win);
  86   *win = MPI_WIN_NULL;
  87   return MPI_SUCCESS;
  88 }
  89
  90 void smpi_mpi_win_get_name(MPI_Win win, char* name, int* length){
  91   if(win->name==nullptr){
  92     *length=0;
  93     name=nullptr;
  94     return;
  95   }
  96   *length = strlen(win->name);
  97   strncpy(name, win->name, *length+1);
  98 }
  99
 100 void smpi_mpi_win_get_group(MPI_Win win, MPI_Group* group){
 101   if(win->comm != MPI_COMM_NULL){
 102     *group = smpi_comm_group(win->comm);
 103   }
 104 }
 105
 106 void smpi_mpi_win_set_name(MPI_Win win, char* name){
 107   win->name = xbt_strdup(name);
 108 }
 109
 110 int smpi_mpi_win_fence( int assert,  MPI_Win win){
 111   XBT_DEBUG("Entering fence");
 112   if(win->opened==0)
 113     win->opened=1;
 114   if(assert != MPI_MODE_NOPRECEDE){
 115     xbt_barrier_wait(win->bar);
 116
 117     std::vector<MPI_Request> *reqs = win->requests;
 118     int size = static_cast<int>(reqs->size());
 119     // start all requests that have been prepared by another process
 120     for(auto req: *reqs){
 121       if (req->flags & PREPARED)
 122         smpi_mpi_start(req);
 123     }
 124
 125     MPI_Request* treqs = &(*reqs)[0];
 126     smpi_mpi_waitall(size,treqs,MPI_STATUSES_IGNORE);
 127   }
 128   win->assert = assert;
 129
 130   xbt_barrier_wait(win->bar);
 131   XBT_DEBUG("Leaving fence ");
 132
 133   return MPI_SUCCESS;
 134 }
 135
 136 int smpi_mpi_put( void *origin_addr, int origin_count, MPI_Datatype origin_datatype, int target_rank,
 137               MPI_Aint target_disp, int target_count, MPI_Datatype target_datatype, MPI_Win win)
 138 {
 139   if(win->opened==0)//check that post/start has been done
 140     return MPI_ERR_WIN;
 141   //get receiver pointer
 142   MPI_Win recv_win = win->connected_wins[target_rank];
 143
 144   void* recv_addr = static_cast<void*> ( static_cast<char*>(recv_win->base) + target_disp * recv_win->disp_unit);
 145   XBT_DEBUG("Entering MPI_Put to %d", target_rank);
 146
 147   if(target_rank != smpi_comm_rank(win->comm)){
 148     //prepare send_request
 149     MPI_Request sreq = smpi_rma_send_init(origin_addr, origin_count, origin_datatype, smpi_process_index(),
 150         smpi_group_index(smpi_comm_group(win->comm),target_rank), RMA_TAG+1, win->comm, MPI_OP_NULL);
 151
 152     //prepare receiver request
 153     MPI_Request rreq = smpi_rma_recv_init(recv_addr, target_count, target_datatype, smpi_process_index(),
 154         smpi_group_index(smpi_comm_group(win->comm),target_rank), RMA_TAG+1, recv_win->comm, MPI_OP_NULL);
 155
 156     //push request to receiver's win
 157     recv_win->requests->push_back(rreq);
 158
 159     //start send
 160     smpi_mpi_start(sreq);
 161
 162     //push request to sender's win
 163     win->requests->push_back(sreq);
 164   }else{
 165     smpi_datatype_copy(origin_addr, origin_count, origin_datatype, recv_addr, target_count, target_datatype);
 166   }
 167
 168   return MPI_SUCCESS;
 169 }
 170
 171 int smpi_mpi_get( void *origin_addr, int origin_count, MPI_Datatype origin_datatype, int target_rank,
 172               MPI_Aint target_disp, int target_count, MPI_Datatype target_datatype, MPI_Win win)
 173 {
 174   if(win->opened==0)//check that post/start has been done
 175     return MPI_ERR_WIN;
 176   //get sender pointer
 177   MPI_Win send_win = win->connected_wins[target_rank];
 178
 179   void* send_addr = static_cast<void*>(static_cast<char*>(send_win->base) + target_disp * send_win->disp_unit);
 180   XBT_DEBUG("Entering MPI_Get from %d", target_rank);
 181
 182   if(target_rank != smpi_comm_rank(win->comm)){
 183     //prepare send_request
 184     MPI_Request sreq = smpi_rma_send_init(send_addr, target_count, target_datatype,
 185         smpi_group_index(smpi_comm_group(win->comm),target_rank), smpi_process_index(), RMA_TAG+2, send_win->comm,
 186         MPI_OP_NULL);
 187
 188     //prepare receiver request
 189     MPI_Request rreq = smpi_rma_recv_init(origin_addr, origin_count, origin_datatype,
 190         smpi_group_index(smpi_comm_group(win->comm),target_rank), smpi_process_index(), RMA_TAG+2, win->comm,
 191         MPI_OP_NULL);
 192
 193     //start the send, with another process than us as sender.
 194     smpi_mpi_start(sreq);
 195
 196     //push request to receiver's win
 197     send_win->requests->push_back(sreq);
 198
 199     //start recv
 200     smpi_mpi_start(rreq);
 201
 202     //push request to sender's win
 203     win->requests->push_back(rreq);
 204   }else{
 205     smpi_datatype_copy(send_addr, target_count, target_datatype, origin_addr, origin_count, origin_datatype);
 206   }
 207
 208   return MPI_SUCCESS;
 209 }
 210
 211
 212 int smpi_mpi_accumulate( void *origin_addr, int origin_count, MPI_Datatype origin_datatype, int target_rank,
 213               MPI_Aint target_disp, int target_count, MPI_Datatype target_datatype, MPI_Op op, MPI_Win win)
 214 {
 215   if(win->opened==0)//check that post/start has been done
 216     return MPI_ERR_WIN;
 217   //FIXME: local version
 218   //get receiver pointer
 219   MPI_Win recv_win = win->connected_wins[target_rank];
 220
 221   void* recv_addr = static_cast<void*>(static_cast<char*>(recv_win->base) + target_disp * recv_win->disp_unit);
 222   XBT_DEBUG("Entering MPI_Accumulate to %d", target_rank);
 223
 224     //prepare send_request
 225     MPI_Request sreq = smpi_rma_send_init(origin_addr, origin_count, origin_datatype,
 226         smpi_process_index(), smpi_group_index(smpi_comm_group(win->comm),target_rank), RMA_TAG+3, win->comm, op);
 227
 228     //prepare receiver request
 229     MPI_Request rreq = smpi_rma_recv_init(recv_addr, target_count, target_datatype,
 230         smpi_process_index(), smpi_group_index(smpi_comm_group(win->comm),target_rank), RMA_TAG+3, recv_win->comm, op);
 231     //push request to receiver's win
 232     recv_win->requests->push_back(rreq);
 233     //start send
 234     smpi_mpi_start(sreq);
 235
 236     //push request to sender's win
 237     win->requests->push_back(sreq);
 238
 239   return MPI_SUCCESS;
 240 }
 241
 242 int smpi_mpi_win_start(MPI_Group group, int assert, MPI_Win win){
 243     /* From MPI forum advices
 244     The call to MPI_WIN_COMPLETE does not return until the put call has completed at the origin; and the target window
 245     will be accessed by the put operation only after the call to MPI_WIN_START has matched a call to MPI_WIN_POST by
 246     the target process. This still leaves much choice to implementors. The call to MPI_WIN_START can block until the
 247     matching call to MPI_WIN_POST occurs at all target processes. One can also have implementations where the call to
 248     MPI_WIN_START is nonblocking, but the call to MPI_PUT blocks until the matching call to MPI_WIN_POST occurred; or
 249     implementations where the first two calls are nonblocking, but the call to MPI_WIN_COMPLETE blocks until the call
 250     to MPI_WIN_POST occurred; or even implementations where all three calls can complete before any target process
 251     called MPI_WIN_POST --- the data put must be buffered, in this last case, so as to allow the put to complete at the
 252     origin ahead of its completion at the target. However, once the call to MPI_WIN_POST is issued, the sequence above
 253     must complete, without further dependencies.  */
 254
 255   //naive, blocking implementation.
 256   int i=0,j=0;
 257   int size = smpi_group_size(group);
 258   MPI_Request* reqs = xbt_new0(MPI_Request, size);
 259
 260   while(j!=size){
 261     int src=smpi_group_index(group,j);
 262     if(src!=smpi_process_index()){
 263       reqs[i]=smpi_irecv_init(nullptr, 0, MPI_CHAR, src,RMA_TAG+4, MPI_COMM_WORLD);
 264       i++;
 265     }
 266     j++;
 267   }
 268   size=i;
 269   smpi_mpi_startall(size, reqs);
 270   smpi_mpi_waitall(size, reqs, MPI_STATUSES_IGNORE);
 271   for(i=0;i<size;i++){
 272     smpi_mpi_request_free(&reqs[i]);
 273   }
 274   xbt_free(reqs);
 275   win->opened++; //we're open for business !
 276   win->group=group;
 277   smpi_group_use(group);
 278   return MPI_SUCCESS;
 279 }
 280
 281 int smpi_mpi_win_post(MPI_Group group, int assert, MPI_Win win){
 282   //let's make a synchronous send here
 283   int i=0,j=0;
 284   int size = smpi_group_size(group);
 285   MPI_Request* reqs = xbt_new0(MPI_Request, size);
 286
 287   while(j!=size){
 288     int dst=smpi_group_index(group,j);
 289     if(dst!=smpi_process_index()){
 290       reqs[i]=smpi_mpi_send_init(nullptr, 0, MPI_CHAR, dst, RMA_TAG+4, MPI_COMM_WORLD);
 291       i++;
 292     }
 293     j++;
 294   }
 295   size=i;
 296
 297   smpi_mpi_startall(size, reqs);
 298   smpi_mpi_waitall(size, reqs, MPI_STATUSES_IGNORE);
 299   for(i=0;i<size;i++){
 300     smpi_mpi_request_free(&reqs[i]);
 301   }
 302   xbt_free(reqs);
 303   win->opened++; //we're open for business !
 304   win->group=group;
 305   smpi_group_use(group);
 306   return MPI_SUCCESS;
 307 }
 308
 309 int smpi_mpi_win_complete(MPI_Win win){
 310   if(win->opened==0)
 311     xbt_die("Complete called on already opened MPI_Win");
 312
 313   XBT_DEBUG("Entering MPI_Win_Complete");
 314   int i=0,j=0;
 315   int size = smpi_group_size(win->group);
 316   MPI_Request* reqs = xbt_new0(MPI_Request, size);
 317
 318   while(j!=size){
 319     int dst=smpi_group_index(win->group,j);
 320     if(dst!=smpi_process_index()){
 321       reqs[i]=smpi_mpi_send_init(nullptr, 0, MPI_CHAR, dst, RMA_TAG+5, MPI_COMM_WORLD);
 322       i++;
 323     }
 324     j++;
 325   }
 326   size=i;
 327   XBT_DEBUG("Win_complete - Sending sync messages to %d processes", size);
 328   smpi_mpi_startall(size, reqs);
 329   smpi_mpi_waitall(size, reqs, MPI_STATUSES_IGNORE);
 330
 331   for(i=0;i<size;i++){
 332     smpi_mpi_request_free(&reqs[i]);
 333   }
 334   xbt_free(reqs);
 335
 336   //now we can finish RMA calls
 337
 338   std::vector<MPI_Request> *reqqs = win->requests;
 339   size = static_cast<int>(reqqs->size());
 340
 341   XBT_DEBUG("Win_complete - Finishing %d RMA calls", size);
 342   // start all requests that have been prepared by another process
 343   for (auto req: *reqqs){
 344     if (req->flags & PREPARED)
 345       smpi_mpi_start(req);
 346   }
 347
 348   MPI_Request* treqs = &(*reqqs)[0];
 349   smpi_mpi_waitall(size,treqs,MPI_STATUSES_IGNORE);
 350   delete reqqs;
 351   smpi_group_unuse(win->group);
 352   win->opened--; //we're closed for business !
 353   return MPI_SUCCESS;
 354 }
 355
 356 int smpi_mpi_win_wait(MPI_Win win){
 357   //naive, blocking implementation.
 358   XBT_DEBUG("Entering MPI_Win_Wait");
 359   int i=0,j=0;
 360   int size = smpi_group_size(win->group);
 361   MPI_Request* reqs = xbt_new0(MPI_Request, size);
 362
 363   while(j!=size){
 364     int src=smpi_group_index(win->group,j);
 365     if(src!=smpi_process_index()){
 366       reqs[i]=smpi_irecv_init(nullptr, 0, MPI_CHAR, src,RMA_TAG+5, MPI_COMM_WORLD);
 367       i++;
 368     }
 369     j++;
 370   }
 371   size=i;
 372   XBT_DEBUG("Win_wait - Receiving sync messages from %d processes", size);
 373   smpi_mpi_startall(size, reqs);
 374   smpi_mpi_waitall(size, reqs, MPI_STATUSES_IGNORE);
 375   for(i=0;i<size;i++){
 376     smpi_mpi_request_free(&reqs[i]);
 377   }
 378   xbt_free(reqs);
 379
 380   std::vector<MPI_Request> *reqqs = win->requests;
 381   size = static_cast<int>(reqqs->size());
 382
 383   XBT_DEBUG("Win_complete - Finishing %d RMA calls", size);
 384
 385   // start all requests that have been prepared by another process
 386   for(auto req: *reqqs){
 387     if (req->flags & PREPARED)
 388       smpi_mpi_start(req);
 389   }
 390
 391   MPI_Request* treqs = &(*reqqs)[0];
 392   smpi_mpi_waitall(size,treqs,MPI_STATUSES_IGNORE);
 393   delete reqqs;
 394   smpi_group_unuse(win->group);
 395   win->opened--; //we're opened for business !
 396   return MPI_SUCCESS;
 397 }