src/smpi/colls/barrier/barrier-ompi.cpp

   1 /* Copyright (c) 2013-2021. The SimGrid Team.
   2  * All rights reserved.                                                     */
   3
   4 /* This program is free software; you can redistribute it and/or modify it
   5  * under the terms of the license (GNU LGPL) which comes with this package. */
   6
   7 /*
   8  * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
   9  *                         University Research and Technology
  10  *                         Corporation.  All rights reserved.
  11  * Copyright (c) 2004-2006 The University of Tennessee and The University
  12  *                         of Tennessee Research Foundation.  All rights
  13  *                         reserved.
  14  * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
  15  *                         University of Stuttgart.  All rights reserved.
  16  * Copyright (c) 2004-2005 The Regents of the University of California.
  17  *                         All rights reserved.
  18  * Copyright (c) 2008      Sun Microsystems, Inc.  All rights reserved.
  19  *
  20  * Additional copyrights may follow
  21  */
  22
   #include "../coll_tuned_topo.hpp"
   #include "../colls_private.hpp"

   #include <vector>
  25
   /*
    * Barrier is meant to be a synchronous operation. Some BTLs can mark a
    * request as done before it is passed to the NIC, and progress might not be
    * made elsewhere, so we cannot allow a process to exit the barrier until its
    * last [round of] sends is completed.
    *
    * It is the last round of sends rather than the 'last' individual send,
    * because each pair of peers can use different channels/devices/BTLs, and the
    * receiver of one of these sends might be forced to wait if the sender leaves
    * the collective and does not make progress until its next MPI call.
    *
    */
  38
   /*
    * Simple double ring version of barrier
    *
    * The synchronous guarantee relies on the last ring of sends being synchronous.
    *
    */
  45 namespace simgrid {
  46 namespace smpi {
  47 int barrier__ompi_doublering(MPI_Comm comm)
  48 {
  49     int rank, size;
  50     int left, right;
  51
  52
  53     rank = comm->rank();
  54     size = comm->size();
  55
  56     XBT_DEBUG("ompi_coll_tuned_barrier_ompi_doublering rank %d", rank);
  57
  58     left = ((rank-1+size)%size);
  59     right = ((rank+1)%size);
  60
  61     if (rank > 0) { /* receive message from the left */
  62       Request::recv(nullptr, 0, MPI_BYTE, left, COLL_TAG_BARRIER, comm, MPI_STATUS_IGNORE);
  63     }
  64
  65     /* Send message to the right */
  66     Request::send(nullptr, 0, MPI_BYTE, right, COLL_TAG_BARRIER, comm);
  67
  68     /* root needs to receive from the last node */
  69     if (rank == 0) {
  70       Request::recv(nullptr, 0, MPI_BYTE, left, COLL_TAG_BARRIER, comm, MPI_STATUS_IGNORE);
  71     }
  72
  73     /* Allow nodes to exit */
  74     if (rank > 0) { /* post Receive from left */
  75       Request::recv(nullptr, 0, MPI_BYTE, left, COLL_TAG_BARRIER, comm, MPI_STATUS_IGNORE);
  76     }
  77
  78     /* send message to the right one */
  79     Request::send(nullptr, 0, MPI_BYTE, right, COLL_TAG_BARRIER, comm);
  80
  81     /* rank 0 post receive from the last node */
  82     if (rank == 0) {
  83       Request::recv(nullptr, 0, MPI_BYTE, left, COLL_TAG_BARRIER, comm, MPI_STATUS_IGNORE);
  84     }
  85
  86     return MPI_SUCCESS;
  87
  88 }
  89
  90
  91 /*
  92  * To make synchronous, uses sync sends and sync sendrecvs
  93  */
  94
  95 int barrier__ompi_recursivedoubling(MPI_Comm comm)
  96 {
  97     int rank, size, adjsize;
  98     int mask, remote;
  99
 100     rank = comm->rank();
 101     size = comm->size();
 102     XBT_DEBUG(
 103                  "ompi_coll_tuned_barrier_ompi_recursivedoubling rank %d",
 104                  rank);
 105
 106     /* do nearest power of 2 less than size calc */
 107     for( adjsize = 1; adjsize <= size; adjsize <<= 1 );
 108     adjsize >>= 1;
 109
 110     /* if size is not exact power of two, perform an extra step */
 111     if (adjsize != size) {
 112         if (rank >= adjsize) {
 113             /* send message to lower ranked node */
 114             remote = rank - adjsize;
 115             Request::sendrecv(nullptr, 0, MPI_BYTE, remote, COLL_TAG_BARRIER, nullptr, 0, MPI_BYTE, remote,
 116                               COLL_TAG_BARRIER, comm, MPI_STATUS_IGNORE);
 117
 118         } else if (rank < (size - adjsize)) {
 119
 120             /* receive message from high level rank */
 121             Request::recv(nullptr, 0, MPI_BYTE, rank + adjsize, COLL_TAG_BARRIER, comm, MPI_STATUS_IGNORE);
 122         }
 123     }
 124
 125     /* exchange messages */
 126     if ( rank < adjsize ) {
 127         mask = 0x1;
 128         while ( mask < adjsize ) {
 129             remote = rank ^ mask;
 130             mask <<= 1;
 131             if (remote >= adjsize) continue;
 132
 133             /* post receive from the remote node */
 134             Request::sendrecv(nullptr, 0, MPI_BYTE, remote, COLL_TAG_BARRIER, nullptr, 0, MPI_BYTE, remote,
 135                               COLL_TAG_BARRIER, comm, MPI_STATUS_IGNORE);
 136         }
 137     }
 138
 139     /* non-power of 2 case */
 140     if (adjsize != size) {
 141         if (rank < (size - adjsize)) {
 142             /* send enter message to higher ranked node */
 143             remote = rank + adjsize;
 144             Request::send(nullptr, 0, MPI_BYTE, remote, COLL_TAG_BARRIER, comm);
 145         }
 146     }
 147
 148     return MPI_SUCCESS;
 149
 150 }
 151
 152
 153 /*
 154  * To make synchronous, uses sync sends and sync sendrecvs
 155  */
 156
 157 int barrier__ompi_bruck(MPI_Comm comm)
 158 {
 159     int rank, size;
 160     int distance, to, from;
 161
 162     rank = comm->rank();
 163     size = comm->size();
 164     XBT_DEBUG(
 165                  "ompi_coll_tuned_barrier_ompi_bruck rank %d", rank);
 166
 167     /* exchange data with rank-2^k and rank+2^k */
 168     for (distance = 1; distance < size; distance <<= 1) {
 169         from = (rank + size - distance) % size;
 170         to   = (rank + distance) % size;
 171
 172         /* send message to lower ranked node */
 173         Request::sendrecv(nullptr, 0, MPI_BYTE, to, COLL_TAG_BARRIER, nullptr, 0, MPI_BYTE, from, COLL_TAG_BARRIER,
 174                           comm, MPI_STATUS_IGNORE);
 175     }
 176
 177     return MPI_SUCCESS;
 178
 179 }
 180
 181
 182 /*
 183  * To make synchronous, uses sync sends and sync sendrecvs
 184  */
 185 /* special case for two processes */
 186 int barrier__ompi_two_procs(MPI_Comm comm)
 187 {
 188     int remote;
 189
 190     remote = comm->rank();
 191     XBT_DEBUG(
 192                  "ompi_coll_tuned_barrier_ompi_two_procs rank %d", remote);
 193     remote = (remote + 1) & 0x1;
 194
 195     Request::sendrecv(nullptr, 0, MPI_BYTE, remote, COLL_TAG_BARRIER, nullptr, 0, MPI_BYTE, remote, COLL_TAG_BARRIER,
 196                       comm, MPI_STATUS_IGNORE);
 197     return (MPI_SUCCESS);
 198 }
 199
 200
 201 /*
 202  * Linear functions are copied from the BASIC coll module
 203  * they do not segment the message and are simple implementations
 204  * but for some small number of nodes and/or small data sizes they
 205  * are just as fast as tuned/tree based segmenting operations
 206  * and as such may be selected by the decision functions
 207  * These are copied into this module due to the way we select modules
 208  * in V1. i.e. in V2 we will handle this differently and so will not
 209  * have to duplicate code.
 210  * GEF Oct05 after asking Jeff.
 211  */
 212
 213 /* copied function (with appropriate renaming) starts here */
 214
 215 int barrier__ompi_basic_linear(MPI_Comm comm)
 216 {
 217     int i;
 218     int size = comm->size();
 219     int rank = comm->rank();
 220
 221     /* All non-root send & receive zero-length message. */
 222
 223     if (rank > 0) {
 224       Request::send(nullptr, 0, MPI_BYTE, 0, COLL_TAG_BARRIER, comm);
 225
 226       Request::recv(nullptr, 0, MPI_BYTE, 0, COLL_TAG_BARRIER, comm, MPI_STATUS_IGNORE);
 227     }
 228
 229     /* The root collects and broadcasts the messages. */
 230
 231     else {
 232         MPI_Request* requests;
 233
 234         requests = new MPI_Request[size];
 235         for (i = 1; i < size; ++i) {
 236           requests[i] = Request::irecv(nullptr, 0, MPI_BYTE, i, COLL_TAG_BARRIER, comm);
 237         }
 238         Request::waitall( size-1, requests+1, MPI_STATUSES_IGNORE );
 239
 240         for (i = 1; i < size; ++i) {
 241           requests[i] = Request::isend(nullptr, 0, MPI_BYTE, i, COLL_TAG_BARRIER, comm);
 242         }
 243         Request::waitall( size-1, requests+1, MPI_STATUSES_IGNORE );
 244         delete[] requests;
 245     }
 246
 247     /* All done */
 248
 249     return MPI_SUCCESS;
 250
 251 }
 252 /* copied function (with appropriate renaming) ends here */
 253
 254 /*
 255  * Another recursive doubling type algorithm, but in this case
 256  * we go up the tree and back down the tree.
 257  */
 258 int barrier__ompi_tree(MPI_Comm comm)
 259 {
 260     int rank, size, depth;
 261     int jump, partner;
 262
 263     rank = comm->rank();
 264     size = comm->size();
 265     XBT_DEBUG(
 266                  "ompi_coll_tuned_barrier_ompi_tree %d",
 267                  rank);
 268
 269     /* Find the nearest power of 2 of the communicator size. */
 270     for(depth = 1; depth < size; depth <<= 1 );
 271
 272     for (jump=1; jump<depth; jump<<=1) {
 273         partner = rank ^ jump;
 274         if (!(partner & (jump-1)) && partner < size) {
 275             if (partner > rank) {
 276               Request::recv(nullptr, 0, MPI_BYTE, partner, COLL_TAG_BARRIER, comm, MPI_STATUS_IGNORE);
 277             } else if (partner < rank) {
 278               Request::send(nullptr, 0, MPI_BYTE, partner, COLL_TAG_BARRIER, comm);
 279             }
 280         }
 281     }
 282
 283     depth>>=1;
 284     for (jump = depth; jump>0; jump>>=1) {
 285         partner = rank ^ jump;
 286         if (!(partner & (jump-1)) && partner < size) {
 287             if (partner > rank) {
 288               Request::send(nullptr, 0, MPI_BYTE, partner, COLL_TAG_BARRIER, comm);
 289             } else if (partner < rank) {
 290               Request::recv(nullptr, 0, MPI_BYTE, partner, COLL_TAG_BARRIER, comm, MPI_STATUS_IGNORE);
 291             }
 292         }
 293     }
 294
 295     return MPI_SUCCESS;
 296 }
 297
 298 }
 299 }