src/smpi/colls/barrier/barrier-ompi.cpp

   1 /* Copyright (c) 2013-2023. The SimGrid Team.
   2  * All rights reserved.                                                     */
   3
   4 /* This program is free software; you can redistribute it and/or modify it
   5  * under the terms of the license (GNU LGPL) which comes with this package. */
   6
   7 /*
   8  * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
   9  *                         University Research and Technology
  10  *                         Corporation.  All rights reserved.
  11  * Copyright (c) 2004-2006 The University of Tennessee and The University
  12  *                         of Tennessee Research Foundation.  All rights
  13  *                         reserved.
  14  * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
  15  *                         University of Stuttgart.  All rights reserved.
  16  * Copyright (c) 2004-2005 The Regents of the University of California.
  17  *                         All rights reserved.
  18  * Copyright (c) 2008      Sun Microsystems, Inc.  All rights reserved.
  19  *
  20  * Additional copyrights may follow
  21  */
  22
#include "../coll_tuned_topo.hpp"
#include "../colls_private.hpp"
#include "smpi_actor.hpp"

#include <vector>
  26
/*
 * Barrier is meant to be a synchronous operation. Some BTLs can mark a
 * request done before it is passed to the NIC, and progress might not be made
 * elsewhere, so we cannot allow a process to exit the barrier until its last
 * [round of] sends is completed.
 *
 * It is the last round of sends rather than the 'last' individual send, as
 * each pair of peers can use different channels/devices/BTLs, and the receiver
 * of one of these sends might be forced to wait if the sender leaves the
 * collective and does not make progress until its next MPI call.
 *
 */
  39
/*
 * Simple double ring version of barrier
 *
 * The synchronous guarantee comes from the last ring of sends being
 * synchronous.
 *
 */
  46 namespace simgrid::smpi {
  47 int barrier__ompi_doublering(MPI_Comm comm)
  48 {
  49     int rank, size;
  50     int left, right;
  51
  52
  53     rank = comm->rank();
  54     size = comm->size();
  55     int tag = smpi_process()->finalizing() ? COLL_TAG_BARRIER-1: COLL_TAG_BARRIER;
  56     XBT_DEBUG("ompi_coll_tuned_barrier_ompi_doublering rank %d", rank);
  57
  58     left = ((rank-1+size)%size);
  59     right = ((rank+1)%size);
  60
  61     if (rank > 0) { /* receive message from the left */
  62       Request::recv(nullptr, 0, MPI_BYTE, left, tag, comm, MPI_STATUS_IGNORE);
  63     }
  64
  65     /* Send message to the right */
  66     Request::send(nullptr, 0, MPI_BYTE, right, tag, comm);
  67
  68     /* root needs to receive from the last node */
  69     if (rank == 0) {
  70       Request::recv(nullptr, 0, MPI_BYTE, left, tag, comm, MPI_STATUS_IGNORE);
  71     }
  72
  73     /* Allow nodes to exit */
  74     if (rank > 0) { /* post Receive from left */
  75       Request::recv(nullptr, 0, MPI_BYTE, left, tag, comm, MPI_STATUS_IGNORE);
  76     }
  77
  78     /* send message to the right one */
  79     Request::send(nullptr, 0, MPI_BYTE, right, tag, comm);
  80
  81     /* rank 0 post receive from the last node */
  82     if (rank == 0) {
  83       Request::recv(nullptr, 0, MPI_BYTE, left, tag, comm, MPI_STATUS_IGNORE);
  84     }
  85
  86     return MPI_SUCCESS;
  87
  88 }
  89
  90
  91 /*
  92  * To make synchronous, uses sync sends and sync sendrecvs
  93  */
  94
  95 int barrier__ompi_recursivedoubling(MPI_Comm comm)
  96 {
  97     int rank, size, adjsize;
  98     int mask, remote;
  99
 100     rank = comm->rank();
 101     size = comm->size();
 102     int tag = smpi_process()->finalizing() ? COLL_TAG_BARRIER-1: COLL_TAG_BARRIER;
 103     XBT_DEBUG(
 104                  "ompi_coll_tuned_barrier_ompi_recursivedoubling rank %d",
 105                  rank);
 106
 107     /* do nearest power of 2 less than size calc */
 108     for( adjsize = 1; adjsize <= size; adjsize <<= 1 );
 109     adjsize >>= 1;
 110
 111     /* if size is not exact power of two, perform an extra step */
 112     if (adjsize != size) {
 113         if (rank >= adjsize) {
 114             /* send message to lower ranked node */
 115             remote = rank - adjsize;
 116             Request::sendrecv(nullptr, 0, MPI_BYTE, remote, tag, nullptr, 0, MPI_BYTE, remote,
 117                               tag, comm, MPI_STATUS_IGNORE);
 118
 119         } else if (rank < (size - adjsize)) {
 120
 121             /* receive message from high level rank */
 122             Request::recv(nullptr, 0, MPI_BYTE, rank + adjsize, tag, comm, MPI_STATUS_IGNORE);
 123         }
 124     }
 125
 126     /* exchange messages */
 127     if ( rank < adjsize ) {
 128         mask = 0x1;
 129         while ( mask < adjsize ) {
 130             remote = rank ^ mask;
 131             mask <<= 1;
 132             if (remote >= adjsize) continue;
 133
 134             /* post receive from the remote node */
 135             Request::sendrecv(nullptr, 0, MPI_BYTE, remote, tag, nullptr, 0, MPI_BYTE, remote,
 136                               tag, comm, MPI_STATUS_IGNORE);
 137         }
 138     }
 139
 140     /* non-power of 2 case */
 141     if (adjsize != size) {
 142         if (rank < (size - adjsize)) {
 143             /* send enter message to higher ranked node */
 144             remote = rank + adjsize;
 145             Request::send(nullptr, 0, MPI_BYTE, remote, tag, comm);
 146         }
 147     }
 148
 149     return MPI_SUCCESS;
 150
 151 }
 152
 153
 154 /*
 155  * To make synchronous, uses sync sends and sync sendrecvs
 156  */
 157
 158 int barrier__ompi_bruck(MPI_Comm comm)
 159 {
 160     int rank, size;
 161     int distance, to, from;
 162
 163     rank = comm->rank();
 164     size = comm->size();
 165     int tag = smpi_process()->finalizing() ? COLL_TAG_BARRIER-1: COLL_TAG_BARRIER;
 166     XBT_DEBUG(
 167                  "ompi_coll_tuned_barrier_ompi_bruck rank %d", rank);
 168
 169     /* exchange data with rank-2^k and rank+2^k */
 170     for (distance = 1; distance < size; distance <<= 1) {
 171         from = (rank + size - distance) % size;
 172         to   = (rank + distance) % size;
 173
 174         /* send message to lower ranked node */
 175         Request::sendrecv(nullptr, 0, MPI_BYTE, to, tag, nullptr, 0, MPI_BYTE, from, tag,
 176                           comm, MPI_STATUS_IGNORE);
 177     }
 178
 179     return MPI_SUCCESS;
 180
 181 }
 182
 183
 184 /*
 185  * To make synchronous, uses sync sends and sync sendrecvs
 186  */
 187 /* special case for two processes */
 188 int barrier__ompi_two_procs(MPI_Comm comm)
 189 {
 190     int remote;
 191
 192     remote = comm->rank();
 193     int tag = smpi_process()->finalizing() ? COLL_TAG_BARRIER-1: COLL_TAG_BARRIER;
 194     XBT_DEBUG(
 195                  "ompi_coll_tuned_barrier_ompi_two_procs rank %d", remote);
 196     remote = (remote + 1) & 0x1;
 197
 198     Request::sendrecv(nullptr, 0, MPI_BYTE, remote, tag, nullptr, 0, MPI_BYTE, remote, tag,
 199                       comm, MPI_STATUS_IGNORE);
 200     return (MPI_SUCCESS);
 201 }
 202
 203
 204 /*
 205  * Linear functions are copied from the BASIC coll module
 206  * they do not segment the message and are simple implementations
 207  * but for some small number of nodes and/or small data sizes they
 208  * are just as fast as tuned/tree based segmenting operations
 209  * and as such may be selected by the decision functions
 210  * These are copied into this module due to the way we select modules
 211  * in V1. i.e. in V2 we will handle this differently and so will not
 212  * have to duplicate code.
 213  * GEF Oct05 after asking Jeff.
 214  */
 215
 216 /* copied function (with appropriate renaming) starts here */
 217
 218 int barrier__ompi_basic_linear(MPI_Comm comm)
 219 {
 220     int i;
 221     int size = comm->size();
 222     int rank = comm->rank();
 223
 224     int tag = smpi_process()->finalizing() ? COLL_TAG_BARRIER-1: COLL_TAG_BARRIER;
 225     /* All non-root send & receive zero-length message. */
 226
 227     if (rank > 0) {
 228       Request::send(nullptr, 0, MPI_BYTE, 0, tag, comm);
 229
 230       Request::recv(nullptr, 0, MPI_BYTE, 0, tag, comm, MPI_STATUS_IGNORE);
 231     }
 232
 233     /* The root collects and broadcasts the messages. */
 234
 235     else {
 236         MPI_Request* requests;
 237
 238         requests = new MPI_Request[size];
 239         for (i = 1; i < size; ++i) {
 240           requests[i] = Request::irecv(nullptr, 0, MPI_BYTE, i, tag, comm);
 241         }
 242         Request::waitall( size-1, requests+1, MPI_STATUSES_IGNORE );
 243
 244         for (i = 1; i < size; ++i) {
 245           requests[i] = Request::isend(nullptr, 0, MPI_BYTE, i, tag, comm);
 246         }
 247         Request::waitall( size-1, requests+1, MPI_STATUSES_IGNORE );
 248         delete[] requests;
 249     }
 250
 251     /* All done */
 252
 253     return MPI_SUCCESS;
 254
 255 }
 256 /* copied function (with appropriate renaming) ends here */
 257
 258 /*
 259  * Another recursive doubling type algorithm, but in this case
 260  * we go up the tree and back down the tree.
 261  */
 262 int barrier__ompi_tree(MPI_Comm comm)
 263 {
 264     int rank, size, depth;
 265     int jump, partner;
 266
 267     rank = comm->rank();
 268     size = comm->size();
 269     int tag = smpi_process()->finalizing() ? COLL_TAG_BARRIER-1: COLL_TAG_BARRIER;
 270     XBT_DEBUG(
 271                  "ompi_coll_tuned_barrier_ompi_tree %d",
 272                  rank);
 273
 274     /* Find the nearest power of 2 of the communicator size. */
 275     for(depth = 1; depth < size; depth <<= 1 );
 276
 277     for (jump=1; jump<depth; jump<<=1) {
 278         partner = rank ^ jump;
 279         if (!(partner & (jump-1)) && partner < size) {
 280             if (partner > rank) {
 281               Request::recv(nullptr, 0, MPI_BYTE, partner, tag, comm, MPI_STATUS_IGNORE);
 282             } else if (partner < rank) {
 283               Request::send(nullptr, 0, MPI_BYTE, partner, tag, comm);
 284             }
 285         }
 286     }
 287
 288     depth>>=1;
 289     for (jump = depth; jump>0; jump>>=1) {
 290         partner = rank ^ jump;
 291         if (!(partner & (jump-1)) && partner < size) {
 292             if (partner > rank) {
 293               Request::send(nullptr, 0, MPI_BYTE, partner, tag, comm);
 294             } else if (partner < rank) {
 295               Request::recv(nullptr, 0, MPI_BYTE, partner, tag, comm, MPI_STATUS_IGNORE);
 296             }
 297         }
 298     }
 299
 300     return MPI_SUCCESS;
 301 }
 302
 303 } // namespace simgrid::smpi