/*
 * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation.  All rights reserved.
 * Copyright (c) 2004-2006 The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * Copyright (c) 2008      Sun Microsystems, Inc.  All rights reserved.
 *
 * Additional copyrights may follow
 */
#include "colls_private.h"
#include "coll_tuned_topo.h"
/*
 * Barrier is meant to be a synchronous operation: some BTLs can mark a
 * request as done before it is passed to the NIC, and since progress might
 * not be made elsewhere, we cannot allow a process to exit the barrier
 * until its last [round of] sends is completed.
 *
 * It is the last round of sends rather than the 'last' individual send,
 * because each pair of peers can use different channels/devices/BTLs, and
 * the receiver of one of these sends might be forced to wait as the sender
 * leaves the collective and does not make progress until the next MPI call.
 */
/*
 * Simple double-ring version of barrier.
 *
 * The synchronous guarantee is provided by making the last ring of sends
 * synchronous.
 */
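/*
 * For illustration, with size == 4 the messages below make two sequential
 * passes around the ring:
 *
 *   pass 1:  0 -> 1 -> 2 -> 3 -> 0   (every rank has entered the barrier)
 *   pass 2:  0 -> 1 -> 2 -> 3 -> 0   (every rank may now leave)
 *
 * Each rank forwards only after hearing from its left neighbour, so a rank
 * that receives the second-pass message knows all ranks have entered the
 * barrier. The price is a latency linear in the communicator size.
 */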
int smpi_coll_tuned_barrier_ompi_doublering(MPI_Comm comm)
{
    int rank, size;
    int left, right;

    rank = smpi_comm_rank(comm);
    size = smpi_comm_size(comm);

    XBT_DEBUG("ompi_coll_tuned_barrier_ompi_doublering rank %d", rank);

    left  = ((rank - 1 + size) % size);
    right = ((rank + 1) % size);

    if (rank > 0) { /* receive message from the left */
        smpi_mpi_recv((void*)NULL, 0, MPI_BYTE, left,
                      COLL_TAG_BARRIER, comm, MPI_STATUS_IGNORE);
    }

    /* Send message to the right */
    smpi_mpi_send((void*)NULL, 0, MPI_BYTE, right,
                  COLL_TAG_BARRIER, comm);

    /* root needs to receive from the last node */
    if (rank == 0) {
        smpi_mpi_recv((void*)NULL, 0, MPI_BYTE, left,
                      COLL_TAG_BARRIER, comm, MPI_STATUS_IGNORE);
    }

    /* Allow nodes to exit */
    if (rank > 0) { /* post receive from the left */
        smpi_mpi_recv((void*)NULL, 0, MPI_BYTE, left,
                      COLL_TAG_BARRIER, comm, MPI_STATUS_IGNORE);
    }

    /* send message to the right */
    smpi_mpi_send((void*)NULL, 0, MPI_BYTE, right,
                  COLL_TAG_BARRIER, comm);

    /* rank 0 posts a receive from the last node */
    if (rank == 0) {
        smpi_mpi_recv((void*)NULL, 0, MPI_BYTE, left,
                      COLL_TAG_BARRIER, comm, MPI_STATUS_IGNORE);
    }

    return MPI_SUCCESS;
}
/*
 * To make it synchronous, uses sync sends and sync sendrecvs.
 */
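/*
 * Sketch of the pattern implemented below: ranks pair up at distances
 * 1, 2, 4, ... When size is not a power of two, the adjsize lowest ranks
 * form the power-of-two core; each extra rank (rank >= adjsize) posts a
 * sendrecv to rank - adjsize, whose receive half completes only when the
 * core rank sends the release at the end. For example, with size == 6
 * (adjsize == 4):
 *
 *   pre:     4 -> 0,  5 -> 1         (extra ranks report in)
 *   round 1: 0 <-> 1, 2 <-> 3
 *   round 2: 0 <-> 2, 1 <-> 3
 *   post:    0 -> 4,  1 -> 5         (extra ranks released)
 *
 * giving log2(adjsize) exchange rounds among the core ranks.
 */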
int smpi_coll_tuned_barrier_ompi_recursivedoubling(MPI_Comm comm)
{
    int rank, size, adjsize;
    int mask, remote;

    rank = smpi_comm_rank(comm);
    size = smpi_comm_size(comm);
    XBT_DEBUG("ompi_coll_tuned_barrier_ompi_recursivedoubling rank %d", rank);

    /* compute the nearest power of 2 less than or equal to size */
    for (adjsize = 1; adjsize <= size; adjsize <<= 1);
    adjsize >>= 1;

    /* if size is not an exact power of two, perform an extra step */
    if (adjsize != size) {
        if (rank >= adjsize) {
            /* send message to lower ranked node */
            remote = rank - adjsize;
            smpi_mpi_sendrecv(NULL, 0, MPI_BYTE, remote, COLL_TAG_BARRIER,
                              NULL, 0, MPI_BYTE, remote, COLL_TAG_BARRIER,
                              comm, MPI_STATUS_IGNORE);
        } else if (rank < (size - adjsize)) {
            /* receive message from higher ranked node */
            smpi_mpi_recv((void*)NULL, 0, MPI_BYTE, rank + adjsize,
                          COLL_TAG_BARRIER, comm, MPI_STATUS_IGNORE);
        }
    }

    /* exchange messages */
    if (rank < adjsize) {
        mask = 0x1;
        while (mask < adjsize) {
            remote = rank ^ mask;
            mask <<= 1;
            if (remote >= adjsize) continue;

            /* post receive from the remote node */
            smpi_mpi_sendrecv(NULL, 0, MPI_BYTE, remote, COLL_TAG_BARRIER,
                              NULL, 0, MPI_BYTE, remote, COLL_TAG_BARRIER,
                              comm, MPI_STATUS_IGNORE);
        }
    }

    /* non-power-of-2 case */
    if (adjsize != size) {
        if (rank < (size - adjsize)) {
            /* send release message to higher ranked node */
            remote = rank + adjsize;
            smpi_mpi_send((void*)NULL, 0, MPI_BYTE, remote,
                          COLL_TAG_BARRIER, comm);
        }
    }

    return MPI_SUCCESS;
}
/*
 * To make it synchronous, uses sync sends and sync sendrecvs.
 */
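/*
 * The loop below is essentially a dissemination barrier: in round k every
 * rank sends a zero-byte message to (rank + 2^k) % size and receives one
 * from (rank - 2^k) % size. For illustration, with size == 5 the distances
 * are 1, 2, 4; after ceil(log2(5)) == 3 rounds each rank has heard,
 * directly or transitively, from every other rank, so all ranks must have
 * entered the barrier.
 */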
int smpi_coll_tuned_barrier_ompi_bruck(MPI_Comm comm)
{
    int rank, size;
    int distance, to, from;

    rank = smpi_comm_rank(comm);
    size = smpi_comm_size(comm);
    XBT_DEBUG("ompi_coll_tuned_barrier_ompi_bruck rank %d", rank);

    /* exchange data with rank-2^k and rank+2^k */
    for (distance = 1; distance < size; distance <<= 1) {
        from = (rank + size - distance) % size;
        to   = (rank + distance) % size;

        /* send a token ahead, collect one from behind */
        smpi_mpi_sendrecv(NULL, 0, MPI_BYTE, to, COLL_TAG_BARRIER,
                          NULL, 0, MPI_BYTE, from, COLL_TAG_BARRIER,
                          comm, MPI_STATUS_IGNORE);
    }

    return MPI_SUCCESS;
}
/*
 * To make it synchronous, uses sync sends and sync sendrecvs.
 */
/* special case for two processes */
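/*
 * With exactly two processes a single zero-byte sendrecv with the peer
 * (rank 0 with rank 1 and vice versa) is sufficient: neither side's
 * sendrecv can complete before the other has entered the barrier.
 */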
int smpi_coll_tuned_barrier_ompi_two_procs(MPI_Comm comm)
{
    int remote;

    remote = smpi_comm_rank(comm);
    XBT_DEBUG("ompi_coll_tuned_barrier_ompi_two_procs rank %d", remote);
    remote = (remote + 1) & 0x1; /* flip between 0 and 1 to get the peer */

    smpi_mpi_sendrecv(NULL, 0, MPI_BYTE, remote, COLL_TAG_BARRIER,
                      NULL, 0, MPI_BYTE, remote, COLL_TAG_BARRIER,
                      comm, MPI_STATUS_IGNORE);
    return (MPI_SUCCESS);
}
/*
 * Linear functions are copied from the BASIC coll module.
 * They do not segment the message and are simple implementations,
 * but for some small numbers of nodes and/or small data sizes they
 * are just as fast as tuned/tree-based segmenting operations,
 * and as such may be selected by the decision functions.
 * These are copied into this module due to the way we select modules
 * in V1, i.e. in V2 we will handle this differently and so will not
 * have to duplicate code.
 * GEF Oct05 after asking Jeff.
 */
/* copied function (with appropriate renaming) starts here */
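/*
 * The linear barrier below is a star centred on rank 0: every other rank
 * sends a zero-byte message to the root and then blocks on a receive; once
 * the root has collected all size - 1 notifications it sends each rank a
 * release message. Simple, but the root handles O(size) messages.
 */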
int smpi_coll_tuned_barrier_ompi_basic_linear(MPI_Comm comm)
{
    int i;
    int size = smpi_comm_size(comm);
    int rank = smpi_comm_rank(comm);

    /* All non-root ranks send & receive a zero-length message. */
    if (rank > 0) {
        smpi_mpi_send(NULL, 0, MPI_BYTE, 0,
                      COLL_TAG_BARRIER, comm);
        smpi_mpi_recv(NULL, 0, MPI_BYTE, 0,
                      COLL_TAG_BARRIER, comm, MPI_STATUS_IGNORE);
    }
    /* The root collects and broadcasts the messages. */
    else {
        MPI_Request* requests;

        requests = (MPI_Request*)malloc(size * sizeof(MPI_Request));
        for (i = 1; i < size; ++i) {
            requests[i] = smpi_mpi_irecv(NULL, 0, MPI_BYTE, MPI_ANY_SOURCE,
                                         COLL_TAG_BARRIER, comm);
        }
        smpi_mpi_waitall(size - 1, requests + 1, MPI_STATUSES_IGNORE);

        for (i = 1; i < size; ++i) {
            requests[i] = smpi_mpi_isend(NULL, 0, MPI_BYTE, i,
                                         COLL_TAG_BARRIER, comm);
        }
        smpi_mpi_waitall(size - 1, requests + 1, MPI_STATUSES_IGNORE);
        free(requests);
    }

    /* All done */
    return MPI_SUCCESS;
}
/* copied function (with appropriate renaming) ends here */
/*
 * Another recursive doubling type algorithm, but in this case
 * we go up the tree and back down the tree.
 */
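/*
 * For illustration, with size == 8 the binomial-tree sweeps below are:
 *
 *   up (gather):    1->0  3->2  5->4  7->6,  then 2->0  6->4,  then 4->0
 *   down (release): 0->4,  then 0->2  4->6,  then 0->1  2->3  4->5  6->7
 *
 * Only ranks whose low bits below `jump` are zero take part in a given
 * round, which is what the (partner & (jump-1)) test checks.
 */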
int smpi_coll_tuned_barrier_ompi_tree(MPI_Comm comm)
{
    int rank, size, depth;
    int jump, partner;

    rank = smpi_comm_rank(comm);
    size = smpi_comm_size(comm);
    XBT_DEBUG("ompi_coll_tuned_barrier_ompi_tree %d", rank);

    /* Find the nearest power of 2 of the communicator size. */
    for (depth = 1; depth < size; depth <<= 1);

    /* Gather phase: go up the tree. */
    for (jump = 1; jump < depth; jump <<= 1) {
        partner = rank ^ jump;
        if (!(partner & (jump - 1)) && partner < size) {
            if (partner > rank) {
                smpi_mpi_recv(NULL, 0, MPI_BYTE, partner,
                              COLL_TAG_BARRIER, comm, MPI_STATUS_IGNORE);
            } else if (partner < rank) {
                smpi_mpi_send(NULL, 0, MPI_BYTE, partner,
                              COLL_TAG_BARRIER, comm);
            }
        }
    }

    /* Release phase: go back down the tree. */
    depth >>= 1;
    for (jump = depth; jump > 0; jump >>= 1) {
        partner = rank ^ jump;
        if (!(partner & (jump - 1)) && partner < size) {
            if (partner > rank) {
                smpi_mpi_send(NULL, 0, MPI_BYTE, partner,
                              COLL_TAG_BARRIER, comm);
            } else if (partner < rank) {
                smpi_mpi_recv(NULL, 0, MPI_BYTE, partner,
                              COLL_TAG_BARRIER, comm, MPI_STATUS_IGNORE);
            }
        }
    }

    return MPI_SUCCESS;
}