2 * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
3 * University Research and Technology
4 * Corporation. All rights reserved.
5 * Copyright (c) 2004-2006 The University of Tennessee and The University
6 * of Tennessee Research Foundation. All rights
8 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
9 * University of Stuttgart. All rights reserved.
10 * Copyright (c) 2004-2005 The Regents of the University of California.
11 * All rights reserved.
12 * Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved.
15 * Additional copyrights may follow
20 #include "colls_private.h"
21 #include "coll_tuned_topo.h"
24 #define MCA_COLL_BASE_TAG_BARRIER 100
26 * Barrier is meant to be a synchronous operation, as some BTLs can mark
27 * a request done before it is passed to the NIC and progress might not be made
28 * elsewhere we cannot allow a process to exit the barrier until its last
29 * [round of] sends are completed.
31 * It is last round of sends rather than 'last' individual send as each pair of
32 * peers can use different channels/devices/btls and the receiver of one of
33 * these sends might be forced to wait as the sender
34 * leaves the collective and does not make progress until the next mpi call
39 * Simple double ring version of barrier
41 * synchronous guarantee made by the last ring of sends being synchronous
/*
 * Double-ring barrier: two sequential rings of zero-byte messages
 * (rank 0 -> 1 -> ... -> size-1 -> back to 0, done twice).  The first ring
 * lets every process learn that all lower ranks entered the barrier; the
 * second ring releases everyone.  Returns MPI_SUCCESS (return on a line not
 * visible in this truncated view).
 * NOTE(review): this listing is missing interior lines (declarations,
 * closing braces, trailing call arguments); only comments were added here,
 * the visible code is untouched.
 */
44 int smpi_coll_tuned_barrier_ompi_doublering(MPI_Comm comm
51 rank = smpi_comm_rank(comm);
52 size = smpi_comm_size(comm);
54 XBT_DEBUG("ompi_coll_tuned_barrier_ompi_doublering rank %d", rank);
/* NOTE(review): for rank 0, (rank-1)%size evaluates to -1 under C99
 * truncated division, which is not a valid rank; upstream Open MPI computes
 * left as ((size+rank-1)%size) — confirm against the full file. */
56 left = ((rank-1)%size);
57 right = ((rank+1)%size);
/* --- Ring 1: wait for the token from the left neighbour --- */
59 if (rank > 0) { /* receive message from the left */
60 smpi_mpi_recv((void*)NULL, 0, MPI_BYTE, left,
61 MCA_COLL_BASE_TAG_BARRIER, comm,
65 /* Send message to the right */
66 smpi_mpi_send((void*)NULL, 0, MPI_BYTE, right,
67 MCA_COLL_BASE_TAG_BARRIER,
/* Rank 0 closes ring 1 by receiving from rank size-1 (its left). */
70 /* root needs to receive from the last node */
72 smpi_mpi_recv((void*)NULL, 0, MPI_BYTE, left,
73 MCA_COLL_BASE_TAG_BARRIER, comm,
/* --- Ring 2: same pattern again; completing it releases each rank --- */
77 /* Allow nodes to exit */
78 if (rank > 0) { /* post Receive from left */
79 smpi_mpi_recv((void*)NULL, 0, MPI_BYTE, left,
80 MCA_COLL_BASE_TAG_BARRIER, comm,
84 /* send message to the right one */
85 smpi_mpi_send((void*)NULL, 0, MPI_BYTE, right,
86 MCA_COLL_BASE_TAG_BARRIER,
/* Rank 0 closes ring 2 the same way before leaving the barrier. */
89 /* rank 0 post receive from the last node */
91 smpi_mpi_recv((void*)NULL, 0, MPI_BYTE, left,
92 MCA_COLL_BASE_TAG_BARRIER, comm,
102 * To make synchronous, uses sync sends and sync sendrecvs
/*
 * Recursive-doubling barrier.  Works natively on a power-of-two number of
 * processes; for other sizes, the (size - adjsize) "extra" high ranks first
 * hand off to a low-ranked partner, sit out the exchange phase, and are
 * released by a final send.  Pairwise zero-byte sendrecvs give the
 * synchronous guarantee.
 * NOTE(review): listing is missing interior lines (e.g. `remote`/`mask`
 * declarations, closing braces); only comments added, code untouched.
 */
105 int smpi_coll_tuned_barrier_ompi_recursivedoubling(MPI_Comm comm
108 int rank, size, adjsize;
111 rank = smpi_comm_rank(comm);
112 size = smpi_comm_size(comm);
114 "ompi_coll_tuned_barrier_ompi_recursivedoubling rank %d",
117 /* do nearest power of 2 less than size calc */
/* NOTE(review): this loop exits with adjsize equal to the smallest power of
 * two STRICTLY GREATER than size; the halving step (adjsize >>= 1) that
 * makes it the largest power of two <= size is presumably on a line missing
 * from this view (original line ~119) — confirm against the full file. */
118 for( adjsize = 1; adjsize <= size; adjsize <<= 1 );
121 /* if size is not exact power of two, perform an extra step */
122 if (adjsize != size) {
123 if (rank >= adjsize) {
/* Extra (high) ranks pair with rank - adjsize, then wait to be released. */
124 /* send message to lower ranked node */
125 remote = rank - adjsize;
126 smpi_mpi_sendrecv(NULL, 0, MPI_BYTE, remote,
127 MCA_COLL_BASE_TAG_BARRIER,
128 NULL, 0, MPI_BYTE, remote,
129 MCA_COLL_BASE_TAG_BARRIER,
130 comm, MPI_STATUS_IGNORE);
/* Low ranks that have an extra partner absorb its "enter" message. */
132 } else if (rank < (size - adjsize)) {
134 /* receive message from high level rank */
135 smpi_mpi_recv((void*)NULL, 0, MPI_BYTE, rank+adjsize,
136 MCA_COLL_BASE_TAG_BARRIER, comm,
/* Core phase: log2(adjsize) rounds of pairwise exchange with rank ^ mask. */
142 /* exchange messages */
143 if ( rank < adjsize ) {
145 while ( mask < adjsize ) {
146 remote = rank ^ mask;
/* Skip partners outside the power-of-two group (mask doubling is on a
 * missing line — presumably mask <<= 1 before the continue path). */
148 if (remote >= adjsize) continue;
150 /* post receive from the remote node */
151 smpi_mpi_sendrecv(NULL, 0, MPI_BYTE, remote,
152 MCA_COLL_BASE_TAG_BARRIER,
153 NULL, 0, MPI_BYTE, remote,
154 MCA_COLL_BASE_TAG_BARRIER,
155 comm, MPI_STATUS_IGNORE);
/* Release phase: low ranks tell their extra high partner it may leave. */
159 /* non-power of 2 case */
160 if (adjsize != size) {
161 if (rank < (size - adjsize)) {
162 /* send enter message to higher ranked node */
163 remote = rank + adjsize;
164 smpi_mpi_send((void*)NULL, 0, MPI_BYTE, remote,
165 MCA_COLL_BASE_TAG_BARRIER,
177 * To make synchronous, uses sync sends and sync sendrecvs
/*
 * Bruck (dissemination) barrier: ceil(log2(size)) rounds; in round k each
 * rank does a zero-byte sendrecv with rank+2^k (send) and rank-2^k (recv),
 * both taken modulo size.  Works for any communicator size, including
 * non-powers of two.
 * NOTE(review): rank/size declarations and the closing brace / return are
 * on lines missing from this view; only comments added, code untouched.
 */
180 int smpi_coll_tuned_barrier_ompi_bruck(MPI_Comm comm
184 int distance, to, from;
186 rank = smpi_comm_rank(comm);
187 size = smpi_comm_size(comm);
189 "ompi_coll_tuned_barrier_ompi_bruck rank %d", rank);
191 /* exchange data with rank-2^k and rank+2^k */
192 for (distance = 1; distance < size; distance <<= 1) {
/* `+ size` before the modulo keeps the left partner non-negative. */
193 from = (rank + size - distance) % size;
194 to = (rank + distance) % size;
196 /* send message to lower ranked node */
197 smpi_mpi_sendrecv(NULL, 0, MPI_BYTE, to,
198 MCA_COLL_BASE_TAG_BARRIER,
199 NULL, 0, MPI_BYTE, from,
200 MCA_COLL_BASE_TAG_BARRIER,
201 comm, MPI_STATUS_IGNORE);
210 * To make synchronous, uses sync sends and sync sendrecvs
212 /* special case for two processes */
/*
 * Two-process barrier: a single zero-byte sendrecv with the other rank
 * (computed as (rank + 1) & 1).  Only valid on communicators of size 2.
 * NOTE(review): the closing paren of the parameter list and the `remote`
 * declaration are on lines missing from this view; code untouched.
 */
213 int smpi_coll_tuned_barrier_ompi_two_procs(MPI_Comm comm
/* `remote` first holds our own rank (for the debug message below)... */
218 remote = smpi_comm_rank(comm);
220 "ompi_coll_tuned_barrier_ompi_two_procs rank %d", remote);
/* ...then is flipped to the peer's rank: 0 <-> 1. */
221 remote = (remote + 1) & 0x1;
223 smpi_mpi_sendrecv(NULL, 0, MPI_BYTE, remote,
224 MCA_COLL_BASE_TAG_BARRIER,
225 NULL, 0, MPI_BYTE, remote,
226 MCA_COLL_BASE_TAG_BARRIER,
227 comm, MPI_STATUS_IGNORE);
228 return (MPI_SUCCESS);
233 * Linear functions are copied from the BASIC coll module
234 * they do not segment the message and are simple implementations
235 * but for some small number of nodes and/or small data sizes they
236 * are just as fast as tuned/tree based segmenting operations
237 * and as such may be selected by the decision functions
238 * These are copied into this module due to the way we select modules
239 * in V1. i.e. in V2 we will handle this differently and so will not
240 * have to duplicate code.
241 * GEF Oct05 after asking Jeff.
244 /* copied function (with appropriate renaming) starts here */
/*
 * Linear (fan-in/fan-out) barrier copied from the BASIC coll module:
 * every non-root rank sends a zero-byte message to rank 0 and then waits
 * for a zero-byte reply; rank 0 collects size-1 messages (any source) and
 * then notifies every rank.  O(size) messages at the root.
 * NOTE(review): the if/else split between the non-root and root paths, the
 * `int i` declaration, and (presumably) free(requests) are on lines missing
 * from this view — verify the free exists in the full file.  Code untouched.
 */
246 int smpi_coll_tuned_barrier_ompi_basic_linear(MPI_Comm comm)
249 int size = smpi_comm_size(comm);
250 int rank = smpi_comm_rank(comm);
/* --- Non-root path: notify root, then block until released. --- */
252 /* All non-root send & receive zero-length message. */
255 smpi_mpi_send (NULL, 0, MPI_BYTE, 0,
256 MCA_COLL_BASE_TAG_BARRIER,
259 smpi_mpi_recv (NULL, 0, MPI_BYTE, 0,
260 MCA_COLL_BASE_TAG_BARRIER,
261 comm, MPI_STATUS_IGNORE);
/* --- Root path: gather one message per rank, then fan out replies. --- */
264 /* The root collects and broadcasts the messages. */
267 MPI_Request* requests;
/* Slot 0 is unused; requests+1 is passed to waitall below. */
269 requests = (MPI_Request*)malloc( size * sizeof(MPI_Request) );
270 for (i = 1; i < size; ++i) {
/* MPI_ANY_SOURCE: arrival order of the size-1 "enter" messages is irrelevant. */
271 requests[i] = smpi_mpi_irecv(NULL, 0, MPI_BYTE, MPI_ANY_SOURCE,
272 MCA_COLL_BASE_TAG_BARRIER, comm
275 smpi_mpi_waitall( size-1, requests+1, MPI_STATUSES_IGNORE );
/* Everyone has entered: release each rank with a zero-byte send. */
277 for (i = 1; i < size; ++i) {
278 requests[i] = smpi_mpi_isend(NULL, 0, MPI_BYTE, i,
279 MCA_COLL_BASE_TAG_BARRIER,
283 smpi_mpi_waitall( size-1, requests+1, MPI_STATUSES_IGNORE );
292 /* copied function (with appropriate renaming) ends here */
295 * Another recursive doubling type algorithm, but in this case
296 * we go up the tree and back down the tree.
/*
 * Tree barrier: recursive-doubling-style fan-in up the tree (high partner
 * sends to low partner) followed by a fan-out back down (low partner sends
 * to high partner).  The (partner & (jump-1)) mask restricts each round to
 * ranks that are subtree roots at that level; `partner < size` skips
 * non-existent partners for non-power-of-two sizes.
 * NOTE(review): this function's tail (closing braces, return) lies beyond
 * the last line visible in this chunk, and `jump`/`partner` declarations
 * are on missing lines; only comments added, code untouched.
 */
298 int smpi_coll_tuned_barrier_ompi_tree(MPI_Comm comm)
300 int rank, size, depth;
303 rank = smpi_comm_rank(comm);
304 size = smpi_comm_size(comm);
306 "ompi_coll_tuned_barrier_ompi_tree %d",
/* depth ends as the smallest power of two >= size. */
309 /* Find the nearest power of 2 of the communicator size. */
310 for(depth = 1; depth < size; depth <<= 1 );
/* --- Fan-in: children (higher partner) report to parents (lower rank). --- */
312 for (jump=1; jump<depth; jump<<=1) {
313 partner = rank ^ jump;
314 if (!(partner & (jump-1)) && partner < size) {
315 if (partner > rank) {
316 smpi_mpi_recv (NULL, 0, MPI_BYTE, partner,
317 MCA_COLL_BASE_TAG_BARRIER, comm,
319 } else if (partner < rank) {
320 smpi_mpi_send (NULL, 0, MPI_BYTE, partner,
321 MCA_COLL_BASE_TAG_BARRIER,
/* --- Fan-out: reverse direction, releasing subtrees top-down. --- */
328 for (jump = depth; jump>0; jump>>=1) {
329 partner = rank ^ jump;
330 if (!(partner & (jump-1)) && partner < size) {
331 if (partner > rank) {
332 smpi_mpi_send (NULL, 0, MPI_BYTE, partner,
333 MCA_COLL_BASE_TAG_BARRIER,
335 } else if (partner < rank) {
336 smpi_mpi_recv (NULL, 0, MPI_BYTE, partner,
337 MCA_COLL_BASE_TAG_BARRIER, comm,