1 /* Copyright (c) 2013-2014. The SimGrid Team.
2 * All rights reserved. */
4 /* This program is free software; you can redistribute it and/or modify it
5 * under the terms of the license (GNU LGPL) which comes with this package. */
8 * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
9 * University Research and Technology
10 * Corporation. All rights reserved.
11 * Copyright (c) 2004-2006 The University of Tennessee and The University
12 * of Tennessee Research Foundation. All rights
14 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
15 * University of Stuttgart. All rights reserved.
16 * Copyright (c) 2004-2005 The Regents of the University of California.
17 * All rights reserved.
18 * Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved.
20 * Additional copyrights may follow
23 #include "colls_private.h"
24 #include "coll_tuned_topo.h"
28 * Barrier is meant to be a synchronous operation, as some BTLs can mark
29 * a request done before it's passed to the NIC and progress might not be made
30 * elsewhere we cannot allow a process to exit the barrier until its last
31 * [round of] sends are completed.
33 * It is last round of sends rather than 'last' individual send as each pair of
34 * peers can use different channels/devices/btls and the receiver of one of
35 * these sends might be forced to wait as the sender
36 * leaves the collective and does not make progress until the next mpi call
41 * Simple double ring version of barrier
43 * synchronous guarantee made by last ring of sends are synchronous
/*
 * Double-ring barrier: a zero-byte token travels twice around the logical
 * ring left (rank-1) -> rank -> right (rank+1).  The first lap proves every
 * rank has entered the barrier; the second lap releases them, so no rank can
 * exit before all ranks have entered.
 * NOTE(review): this chunk was extracted with interior lines elided
 * (declarations, trailing call arguments, closing braces are missing), so the
 * code below is kept byte-identical rather than restructured.
 */
46 int smpi_coll_tuned_barrier_ompi_doublering(MPI_Comm comm
53 rank = smpi_comm_rank(comm);
54 size = smpi_comm_size(comm);
56 XBT_DEBUG("ompi_coll_tuned_barrier_ompi_doublering rank %d", rank);
/* Neighbours on the logical ring (left wraps via +size to stay non-negative). */
58 left = ((rank-1+size)%size);
59 right = ((rank+1)%size);
/* --- First lap: token flows 0 -> 1 -> ... -> size-1 -> 0. --- */
61 if (rank > 0) { /* receive message from the left */
62 smpi_mpi_recv((void*)NULL, 0, MPI_BYTE, left,
63 COLL_TAG_BARRIER, comm,
67 /* Send message to the right */
68 smpi_mpi_send((void*)NULL, 0, MPI_BYTE, right,
72 /* root needs to receive from the last node */
74 smpi_mpi_recv((void*)NULL, 0, MPI_BYTE, left,
75 COLL_TAG_BARRIER, comm,
/* --- Second lap: same pattern again; receiving it means all ranks entered. --- */
79 /* Allow nodes to exit */
80 if (rank > 0) { /* post Receive from left */
81 smpi_mpi_recv((void*)NULL, 0, MPI_BYTE, left,
82 COLL_TAG_BARRIER, comm,
86 /* send message to the right one */
87 smpi_mpi_send((void*)NULL, 0, MPI_BYTE, right,
91 /* rank 0 post receive from the last node */
93 smpi_mpi_recv((void*)NULL, 0, MPI_BYTE, left,
94 COLL_TAG_BARRIER, comm,
104 * To make synchronous, uses sync sends and sync sendrecvs
/*
 * Recursive-doubling barrier: O(log2(size)) rounds of pairwise zero-byte
 * sendrecvs between ranks differing in one bit.  Non-power-of-two sizes are
 * handled by folding the extra ranks (rank >= adjsize) onto lower ranks
 * before the exchange and releasing them afterwards.
 * NOTE(review): interior lines are elided in this chunk (e.g. the
 * `adjsize >>= 1` step after the loop at 120 and the `mask` declaration are
 * presumably on the missing lines); code kept byte-identical.
 */
107 int smpi_coll_tuned_barrier_ompi_recursivedoubling(MPI_Comm comm
110 int rank, size, adjsize;
113 rank = smpi_comm_rank(comm);
114 size = smpi_comm_size(comm);
116 "ompi_coll_tuned_barrier_ompi_recursivedoubling rank %d",
119 /* do nearest power of 2 less than size calc */
120 for( adjsize = 1; adjsize <= size; adjsize <<= 1 );
123 /* if size is not exact power of two, perform an extra step */
124 if (adjsize != size) {
125 if (rank >= adjsize) {
126 /* send message to lower ranked node */
127 remote = rank - adjsize;
128 smpi_mpi_sendrecv(NULL, 0, MPI_BYTE, remote,
130 NULL, 0, MPI_BYTE, remote,
132 comm, MPI_STATUS_IGNORE);
134 } else if (rank < (size - adjsize)) {
136 /* receive message from high level rank */
137 smpi_mpi_recv((void*)NULL, 0, MPI_BYTE, rank+adjsize,
138 COLL_TAG_BARRIER, comm,
/* Main log2(adjsize) exchange among the power-of-two subset of ranks. */
144 /* exchange messages */
145 if ( rank < adjsize ) {
147 while ( mask < adjsize ) {
/* Partner differs from us in exactly the current bit. */
148 remote = rank ^ mask;
150 if (remote >= adjsize) continue;
152 /* post receive from the remote node */
153 smpi_mpi_sendrecv(NULL, 0, MPI_BYTE, remote,
155 NULL, 0, MPI_BYTE, remote,
157 comm, MPI_STATUS_IGNORE);
/* Release the folded-away high ranks so they may exit the barrier. */
161 /* non-power of 2 case */
162 if (adjsize != size) {
163 if (rank < (size - adjsize)) {
164 /* send enter message to higher ranked node */
165 remote = rank + adjsize;
166 smpi_mpi_send((void*)NULL, 0, MPI_BYTE, remote,
179 * To make synchronous, uses sync sends and sync sendrecvs
/*
 * Bruck (dissemination) barrier: ceil(log2(size)) rounds; in round k each
 * rank sendrecvs a zero-byte message with rank+2^k (send) and rank-2^k
 * (receive), modulo size.  Works for any communicator size, power of two or
 * not.
 * NOTE(review): interior lines (tag arguments, closing braces, return) are
 * elided in this chunk; code kept byte-identical.
 */
182 int smpi_coll_tuned_barrier_ompi_bruck(MPI_Comm comm
186 int distance, to, from;
188 rank = smpi_comm_rank(comm);
189 size = smpi_comm_size(comm);
191 "ompi_coll_tuned_barrier_ompi_bruck rank %d", rank);
193 /* exchange data with rank-2^k and rank+2^k */
194 for (distance = 1; distance < size; distance <<= 1) {
/* +size before the modulo keeps `from` non-negative. */
195 from = (rank + size - distance) % size;
196 to = (rank + distance) % size;
198 /* send message to lower ranked node */
199 smpi_mpi_sendrecv(NULL, 0, MPI_BYTE, to,
201 NULL, 0, MPI_BYTE, from,
203 comm, MPI_STATUS_IGNORE);
212 * To make synchronous, uses sync sends and sync sendrecvs
214 /* special case for two processes */
/*
 * Two-process barrier: a single zero-byte sendrecv with the other rank.
 * `remote = (rank + 1) & 0x1` flips rank 0 <-> rank 1, so this is only
 * correct for communicators of exactly two processes (as the name says).
 * NOTE(review): the tag arguments between lines 225/227 are on elided lines
 * in this chunk; code kept byte-identical.
 */
215 int smpi_coll_tuned_barrier_ompi_two_procs(MPI_Comm comm
220 remote = smpi_comm_rank(comm);
222 "ompi_coll_tuned_barrier_ompi_two_procs rank %d", remote);
/* Partner is the other one of the two ranks. */
223 remote = (remote + 1) & 0x1;
225 smpi_mpi_sendrecv(NULL, 0, MPI_BYTE, remote,
227 NULL, 0, MPI_BYTE, remote,
229 comm, MPI_STATUS_IGNORE);
230 return (MPI_SUCCESS);
235 * Linear functions are copied from the BASIC coll module
236 * they do not segment the message and are simple implementations
237 * but for some small number of nodes and/or small data sizes they
238 * are just as fast as tuned/tree based segmenting operations
239 * and as such may be selected by the decision functions
240 * These are copied into this module due to the way we select modules
241 * in V1. i.e. in V2 we will handle this differently and so will not
242 * have to duplicate code.
243 * GEF Oct05 after asking Jeff.
246 /* copied function (with appropriate renaming) starts here */
/*
 * Linear (fan-in/fan-out) barrier copied from the basic coll module:
 * every non-root rank sends a zero-byte message to rank 0 and then waits
 * for a zero-byte reply; rank 0 collects size-1 messages (any source, so
 * arrival order is irrelevant) and then answers each rank in turn.
 * NOTE(review): interior lines are elided in this chunk — in particular the
 * `free(requests)` that should follow line 285 is presumably on a missing
 * line; confirm against the full file before suspecting a leak.
 */
248 int smpi_coll_tuned_barrier_ompi_basic_linear(MPI_Comm comm)
251 int size = smpi_comm_size(comm);
252 int rank = smpi_comm_rank(comm);
253 /* Non-root path: one send to rank 0, one receive from rank 0. */
254 /* All non-root send & receive zero-length message. */
257 smpi_mpi_send (NULL, 0, MPI_BYTE, 0,
261 smpi_mpi_recv (NULL, 0, MPI_BYTE, 0,
263 comm, MPI_STATUS_IGNORE);
266 /* The root collects and broadcasts the messages. */
269 MPI_Request* requests;
/* requests[0] is unused; indices 1..size-1 mirror the peer ranks. */
271 requests = (MPI_Request*)malloc( size * sizeof(MPI_Request) );
272 for (i = 1; i < size; ++i) {
273 requests[i] = smpi_mpi_irecv(NULL, 0, MPI_BYTE, MPI_ANY_SOURCE,
274 COLL_TAG_BARRIER, comm
277 smpi_mpi_waitall( size-1, requests+1, MPI_STATUSES_IGNORE );
/* Fan-out: release every non-root rank. */
279 for (i = 1; i < size; ++i) {
280 requests[i] = smpi_mpi_isend(NULL, 0, MPI_BYTE, i,
285 smpi_mpi_waitall( size-1, requests+1, MPI_STATUSES_IGNORE );
294 /* copied function (with appropriate renaming) ends here */
297 * Another recursive doubling type algorithm, but in this case
298 * we go up the tree and back down the tree.
/*
 * Tree barrier: gather up a binomial-style tree, then release back down it.
 * In the up-phase (jump = 1,2,4,...) a rank that is a subtree root
 * (!(partner & (jump-1))) receives from its higher partner and sends to its
 * lower one; the down-phase (jump = depth,...,1) reverses the direction.
 * NOTE(review): interior lines (variable declarations for jump/partner,
 * closing braces, return) are elided in this chunk; code kept byte-identical.
 */
300 int smpi_coll_tuned_barrier_ompi_tree(MPI_Comm comm)
302 int rank, size, depth;
305 rank = smpi_comm_rank(comm);
306 size = smpi_comm_size(comm);
308 "ompi_coll_tuned_barrier_ompi_tree %d",
311 /* Find the nearest power of 2 of the communicator size. */
312 for(depth = 1; depth < size; depth <<= 1 );
/* Up-phase: children report in toward rank 0. */
314 for (jump=1; jump<depth; jump<<=1) {
315 partner = rank ^ jump;
/* Only subtree roots at this level participate; skip out-of-range partners. */
316 if (!(partner & (jump-1)) && partner < size) {
317 if (partner > rank) {
318 smpi_mpi_recv (NULL, 0, MPI_BYTE, partner,
319 COLL_TAG_BARRIER, comm,
321 } else if (partner < rank) {
322 smpi_mpi_send (NULL, 0, MPI_BYTE, partner,
/* Down-phase: release flows back out from rank 0, directions swapped. */
330 for (jump = depth; jump>0; jump>>=1) {
331 partner = rank ^ jump;
332 if (!(partner & (jump-1)) && partner < size) {
333 if (partner > rank) {
334 smpi_mpi_send (NULL, 0, MPI_BYTE, partner,
337 } else if (partner < rank) {
338 smpi_mpi_recv (NULL, 0, MPI_BYTE, partner,
339 COLL_TAG_BARRIER, comm,