src/smpi/colls/barrier/barrier-ompi.cpp

   1 /* Copyright (c) 2013-2017. The SimGrid Team.
   2  * All rights reserved.                                                     */
   3
   4 /* This program is free software; you can redistribute it and/or modify it
   5  * under the terms of the license (GNU LGPL) which comes with this package. */
   6
   7 /*
   8  * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
   9  *                         University Research and Technology
  10  *                         Corporation.  All rights reserved.
  11  * Copyright (c) 2004-2006 The University of Tennessee and The University
  12  *                         of Tennessee Research Foundation.  All rights
  13  *                         reserved.
  14  * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
  15  *                         University of Stuttgart.  All rights reserved.
  16  * Copyright (c) 2004-2005 The Regents of the University of California.
  17  *                         All rights reserved.
  18  * Copyright (c) 2008      Sun Microsystems, Inc.  All rights reserved.
  19  *
  20  * Additional copyrights may follow
  21  */
  22
#include "../colls_private.h"
#include "../coll_tuned_topo.h"

#include <vector>
  25
  26
  27 /*
  28  * Barrier is meant to be a synchronous operation. As some BTLs can mark
  29  * a request done before it is passed to the NIC, and progress might not be
  30  * made elsewhere, we cannot allow a process to exit the barrier until its
  31  * last [round of] sends are completed.
  32  *
  33  * It is the last round of sends rather than the 'last' individual send, as each
  34  * pair of peers can use different channels/devices/BTLs, and the receiver of one
  35  * of these sends might be forced to wait if the sender leaves the collective and
  36  * does not make progress until the next MPI call.
  37  *
  38  */
  39
  40 /*
  41  * Simple double ring version of barrier
  42  *
  43  * The synchronous guarantee is provided by the last ring of sends being synchronous
  44  *
  45  */
  46 namespace simgrid{
  47 namespace smpi{
  48 int Coll_barrier_ompi_doublering::barrier(MPI_Comm comm
  49                                              )
  50 {
  51     int rank, size;
  52     int left, right;
  53
  54
  55     rank = comm->rank();
  56     size = comm->size();
  57
  58     XBT_DEBUG("ompi_coll_tuned_barrier_ompi_doublering rank %d", rank);
  59
  60     left = ((rank-1+size)%size);
  61     right = ((rank+1)%size);
  62
  63     if (rank > 0) { /* receive message from the left */
  64         Request::recv((void*)NULL, 0, MPI_BYTE, left,
  65                                 COLL_TAG_BARRIER, comm,
  66                                 MPI_STATUS_IGNORE);
  67     }
  68
  69     /* Send message to the right */
  70     Request::send((void*)NULL, 0, MPI_BYTE, right,
  71                             COLL_TAG_BARRIER,
  72                              comm);
  73
  74     /* root needs to receive from the last node */
  75     if (rank == 0) {
  76         Request::recv((void*)NULL, 0, MPI_BYTE, left,
  77                                 COLL_TAG_BARRIER, comm,
  78                                 MPI_STATUS_IGNORE);
  79     }
  80
  81     /* Allow nodes to exit */
  82     if (rank > 0) { /* post Receive from left */
  83         Request::recv((void*)NULL, 0, MPI_BYTE, left,
  84                                 COLL_TAG_BARRIER, comm,
  85                                 MPI_STATUS_IGNORE);
  86     }
  87
  88     /* send message to the right one */
  89     Request::send((void*)NULL, 0, MPI_BYTE, right,
  90                             COLL_TAG_BARRIER,
  91                              comm);
  92
  93     /* rank 0 post receive from the last node */
  94     if (rank == 0) {
  95         Request::recv((void*)NULL, 0, MPI_BYTE, left,
  96                                 COLL_TAG_BARRIER, comm,
  97                                 MPI_STATUS_IGNORE);
  98     }
  99
 100     return MPI_SUCCESS;
 101
 102 }
 103
 104
 105 /*
 106  * To make synchronous, uses sync sends and sync sendrecvs
 107  */
 108
 109 int Coll_barrier_ompi_recursivedoubling::barrier(MPI_Comm comm
 110                                                     )
 111 {
 112     int rank, size, adjsize;
 113     int mask, remote;
 114
 115     rank = comm->rank();
 116     size = comm->size();
 117     XBT_DEBUG(
 118                  "ompi_coll_tuned_barrier_ompi_recursivedoubling rank %d",
 119                  rank);
 120
 121     /* do nearest power of 2 less than size calc */
 122     for( adjsize = 1; adjsize <= size; adjsize <<= 1 );
 123     adjsize >>= 1;
 124
 125     /* if size is not exact power of two, perform an extra step */
 126     if (adjsize != size) {
 127         if (rank >= adjsize) {
 128             /* send message to lower ranked node */
 129             remote = rank - adjsize;
 130             Request::sendrecv(NULL, 0, MPI_BYTE, remote,
 131                                                   COLL_TAG_BARRIER,
 132                                                   NULL, 0, MPI_BYTE, remote,
 133                                                   COLL_TAG_BARRIER,
 134                                                   comm, MPI_STATUS_IGNORE);
 135
 136         } else if (rank < (size - adjsize)) {
 137
 138             /* receive message from high level rank */
 139             Request::recv((void*)NULL, 0, MPI_BYTE, rank+adjsize,
 140                                     COLL_TAG_BARRIER, comm,
 141                                     MPI_STATUS_IGNORE);
 142
 143         }
 144     }
 145
 146     /* exchange messages */
 147     if ( rank < adjsize ) {
 148         mask = 0x1;
 149         while ( mask < adjsize ) {
 150             remote = rank ^ mask;
 151             mask <<= 1;
 152             if (remote >= adjsize) continue;
 153
 154             /* post receive from the remote node */
 155             Request::sendrecv(NULL, 0, MPI_BYTE, remote,
 156                                                   COLL_TAG_BARRIER,
 157                                                   NULL, 0, MPI_BYTE, remote,
 158                                                   COLL_TAG_BARRIER,
 159                                                   comm, MPI_STATUS_IGNORE);
 160         }
 161     }
 162
 163     /* non-power of 2 case */
 164     if (adjsize != size) {
 165         if (rank < (size - adjsize)) {
 166             /* send enter message to higher ranked node */
 167             remote = rank + adjsize;
 168             Request::send((void*)NULL, 0, MPI_BYTE, remote,
 169                                     COLL_TAG_BARRIER,
 170                                      comm);
 171
 172         }
 173     }
 174
 175     return MPI_SUCCESS;
 176
 177 }
 178
 179
 180 /*
 181  * To make synchronous, uses sync sends and sync sendrecvs
 182  */
 183
 184 int Coll_barrier_ompi_bruck::barrier(MPI_Comm comm
 185                                         )
 186 {
 187     int rank, size;
 188     int distance, to, from;
 189
 190     rank = comm->rank();
 191     size = comm->size();
 192     XBT_DEBUG(
 193                  "ompi_coll_tuned_barrier_ompi_bruck rank %d", rank);
 194
 195     /* exchange data with rank-2^k and rank+2^k */
 196     for (distance = 1; distance < size; distance <<= 1) {
 197         from = (rank + size - distance) % size;
 198         to   = (rank + distance) % size;
 199
 200         /* send message to lower ranked node */
 201         Request::sendrecv(NULL, 0, MPI_BYTE, to,
 202                                               COLL_TAG_BARRIER,
 203                                               NULL, 0, MPI_BYTE, from,
 204                                               COLL_TAG_BARRIER,
 205                                               comm, MPI_STATUS_IGNORE);
 206     }
 207
 208     return MPI_SUCCESS;
 209
 210 }
 211
 212
 213 /*
 214  * To make synchronous, uses sync sends and sync sendrecvs
 215  */
 216 /* special case for two processes */
 217 int Coll_barrier_ompi_two_procs::barrier(MPI_Comm comm
 218                                             )
 219 {
 220     int remote;
 221
 222     remote = comm->rank();
 223     XBT_DEBUG(
 224                  "ompi_coll_tuned_barrier_ompi_two_procs rank %d", remote);
 225     remote = (remote + 1) & 0x1;
 226
 227     Request::sendrecv(NULL, 0, MPI_BYTE, remote,
 228                                           COLL_TAG_BARRIER,
 229                                           NULL, 0, MPI_BYTE, remote,
 230                                           COLL_TAG_BARRIER,
 231                                           comm, MPI_STATUS_IGNORE);
 232     return (MPI_SUCCESS);
 233 }
 234
 235
 236 /*
 237  * Linear functions are copied from the BASIC coll module
 238  * they do not segment the message and are simple implementations
 239  * but for some small number of nodes and/or small data sizes they
 240  * are just as fast as tuned/tree based segmenting operations
 241  * and as such may be selected by the decision functions
 242  * These are copied into this module due to the way we select modules
 243  * in V1. i.e. in V2 we will handle this differently and so will not
 244  * have to duplicate code.
 245  * GEF Oct05 after asking Jeff.
 246  */
 247
 248 /* copied function (with appropriate renaming) starts here */
 249
 250 int Coll_barrier_ompi_basic_linear::barrier(MPI_Comm comm)
 251 {
 252     int i;
 253     int size = comm->size();
 254     int rank = comm->rank();
 255
 256     /* All non-root send & receive zero-length message. */
 257
 258     if (rank > 0) {
 259         Request::send (NULL, 0, MPI_BYTE, 0,
 260                                  COLL_TAG_BARRIER,
 261                                   comm);
 262
 263         Request::recv (NULL, 0, MPI_BYTE, 0,
 264                                  COLL_TAG_BARRIER,
 265                                  comm, MPI_STATUS_IGNORE);
 266     }
 267
 268     /* The root collects and broadcasts the messages. */
 269
 270     else {
 271         MPI_Request* requests;
 272
 273         requests = (MPI_Request*)malloc( size * sizeof(MPI_Request) );
 274         for (i = 1; i < size; ++i) {
 275             requests[i] = Request::irecv(NULL, 0, MPI_BYTE, MPI_ANY_SOURCE,
 276                                      COLL_TAG_BARRIER, comm
 277                                      );
 278         }
 279         Request::waitall( size-1, requests+1, MPI_STATUSES_IGNORE );
 280
 281         for (i = 1; i < size; ++i) {
 282             requests[i] = Request::isend(NULL, 0, MPI_BYTE, i,
 283                                      COLL_TAG_BARRIER,
 284                                       comm
 285                                      );
 286         }
 287         Request::waitall( size-1, requests+1, MPI_STATUSES_IGNORE );
 288         free( requests );
 289     }
 290
 291     /* All done */
 292
 293     return MPI_SUCCESS;
 294
 295 }
 296 /* copied function (with appropriate renaming) ends here */
 297
 298 /*
 299  * Another recursive doubling type algorithm, but in this case
 300  * we go up the tree and back down the tree.
 301  */
 302 int Coll_barrier_ompi_tree::barrier(MPI_Comm comm)
 303 {
 304     int rank, size, depth;
 305     int jump, partner;
 306
 307     rank = comm->rank();
 308     size = comm->size();
 309     XBT_DEBUG(
 310                  "ompi_coll_tuned_barrier_ompi_tree %d",
 311                  rank);
 312
 313     /* Find the nearest power of 2 of the communicator size. */
 314     for(depth = 1; depth < size; depth <<= 1 );
 315
 316     for (jump=1; jump<depth; jump<<=1) {
 317         partner = rank ^ jump;
 318         if (!(partner & (jump-1)) && partner < size) {
 319             if (partner > rank) {
 320                 Request::recv (NULL, 0, MPI_BYTE, partner,
 321                                          COLL_TAG_BARRIER, comm,
 322                                          MPI_STATUS_IGNORE);
 323             } else if (partner < rank) {
 324                 Request::send (NULL, 0, MPI_BYTE, partner,
 325                                          COLL_TAG_BARRIER,
 326                                           comm);
 327             }
 328         }
 329     }
 330
 331     depth>>=1;
 332     for (jump = depth; jump>0; jump>>=1) {
 333         partner = rank ^ jump;
 334         if (!(partner & (jump-1)) && partner < size) {
 335             if (partner > rank) {
 336                 Request::send (NULL, 0, MPI_BYTE, partner,
 337                                          COLL_TAG_BARRIER,
 338                                           comm);
 339             } else if (partner < rank) {
 340                 Request::recv (NULL, 0, MPI_BYTE, partner,
 341                                          COLL_TAG_BARRIER, comm,
 342                                          MPI_STATUS_IGNORE);
 343             }
 344         }
 345     }
 346
 347     return MPI_SUCCESS;
 348 }
 349
 350 }
 351 }