1 /* Copyright (c) 2013-2023. The SimGrid Team.
2 * All rights reserved. */
4 /* This program is free software; you can redistribute it and/or modify it
5 * under the terms of the license (GNU LGPL) which comes with this package. */
8 * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
9 * University Research and Technology
10 * Corporation. All rights reserved.
11 * Copyright (c) 2004-2006 The University of Tennessee and The University
12 * of Tennessee Research Foundation. All rights
14 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
15 * University of Stuttgart. All rights reserved.
16 * Copyright (c) 2004-2005 The Regents of the University of California.
17 * All rights reserved.
18 * Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved.
20 * Additional copyrights may follow
23 #include "../coll_tuned_topo.hpp"
24 #include "../colls_private.hpp"
25 #include "smpi_actor.hpp"
28 * Barrier is meant to be a synchronous operation, as some BTLs can mark
29 * a request done before it is passed to the NIC and progress might not be made
30 * elsewhere, we cannot allow a process to exit the barrier until its last
31 * [round of] sends are completed.
33 * It is the last round of sends rather than the 'last' individual send, as each pair of
34 * peers can use different channels/devices/btls and the receiver of one of
35 * these sends might be forced to wait as the sender
36 * leaves the collective and does not make progress until the next MPI call.
41 * Simple double ring version of barrier
43 * the synchronous guarantee comes from the last ring of sends being synchronous
46 namespace simgrid::smpi {
/*
 * Double-ring barrier: zero-byte messages travel around the ring of ranks twice.
 * Each rank talks to left = (rank-1) mod size and right = (rank+1) mod size.
 * Parameter comm is the communicator whose ranks are synchronized.
 * NOTE(review): the embedded line numbers jump (47 to 55, 66 to 68 to 70, ...),
 * so the declarations of rank/size/left/right, the closing braces, the guards
 * around the two rank-0 receives and the return statement are not visible in
 * this listing -- confirm them against the complete file.
 */
47 int barrier__ompi_doublering(MPI_Comm comm)
/* The tag is COLL_TAG_BARRIER-1 while the calling process is finalizing, COLL_TAG_BARRIER otherwise. */
55 int tag = smpi_process()->finalizing() ? COLL_TAG_BARRIER-1: COLL_TAG_BARRIER;
56 XBT_DEBUG("ompi_coll_tuned_barrier_ompi_doublering rank %d", rank);
/* Ring neighbors; adding size before the modulo keeps left non-negative for rank 0. */
58 left = ((rank-1+size)%size);
59 right = ((rank+1)%size);
/* First pass: every rank except 0 blocks on its left neighbor before forwarding to the right. */
61 if (rank > 0) { /* receive message from the left */
62 Request::recv(nullptr, 0, MPI_BYTE, left, tag, comm, MPI_STATUS_IGNORE);
65 /* Send message to the right */
66 Request::send(nullptr, 0, MPI_BYTE, right, tag, comm);
68 /* root needs to receive from the last node */
/* NOTE(review): the condition guarding this receive (presumably rank == 0) sits on a line missing from this listing. */
70 Request::recv(nullptr, 0, MPI_BYTE, left, tag, comm, MPI_STATUS_IGNORE);
73 /* Allow nodes to exit */
/* Second pass: same receive-then-forward pattern as the first pass. */
74 if (rank > 0) { /* post Receive from left */
75 Request::recv(nullptr, 0, MPI_BYTE, left, tag, comm, MPI_STATUS_IGNORE);
78 /* send message to the right one */
79 Request::send(nullptr, 0, MPI_BYTE, right, tag, comm);
81 /* rank 0 post receive from the last node */
/* NOTE(review): as above, the guard for this receive is not visible here -- confirm it is rank == 0. */
83 Request::recv(nullptr, 0, MPI_BYTE, left, tag, comm, MPI_STATUS_IGNORE);
92 * To make synchronous, uses sync sends and sync sendrecvs
/*
 * Recursive-doubling barrier: ranks below adjsize pair up with rank ^ mask and
 * exchange zero-byte messages; ranks at or above adjsize are folded onto a
 * lower partner (rank - adjsize) before and after that exchange.
 * Parameter comm is the communicator whose ranks are synchronized.
 * NOTE(review): the embedded line numbers jump (97 to 102, 108 to 111,
 * 130 to 132, ...), so the declarations of mask/remote, the initialisation of
 * rank/size/mask, the mask update and the return statement are not visible in
 * this listing -- confirm them against the complete file.
 */
95 int barrier__ompi_recursivedoubling(MPI_Comm comm)
97 int rank, size, adjsize;
/* The tag is COLL_TAG_BARRIER-1 while the calling process is finalizing, COLL_TAG_BARRIER otherwise. */
102 int tag = smpi_process()->finalizing() ? COLL_TAG_BARRIER-1: COLL_TAG_BARRIER;
104 "ompi_coll_tuned_barrier_ompi_recursivedoubling rank %d",
107 /* do nearest power of 2 less than size calc */
108 for( adjsize = 1; adjsize <= size; adjsize <<= 1 );
/* NOTE(review): this loop alone stops at the first power of 2 strictly greater than size;
 * the comment above only holds if adjsize is halved afterwards, presumably on one of the
 * lines (109-110) missing from this listing -- confirm. */
111 /* if size is not exact power of two, perform an extra step */
112 if (adjsize != size) {
113 if (rank >= adjsize) {
114 /* extra rank: exchange with its lower ranked partner; the send half pairs with the
 * partner's receive below, the receive half with the partner's final send, so this
 * call only completes once the partner has gone through the exchange phase */
115 remote = rank - adjsize;
116 Request::sendrecv(nullptr, 0, MPI_BYTE, remote, tag, nullptr, 0, MPI_BYTE, remote,
117 tag, comm, MPI_STATUS_IGNORE);
119 } else if (rank < (size - adjsize)) {
121 /* receive message from high level rank */
122 Request::recv(nullptr, 0, MPI_BYTE, rank + adjsize, tag, comm, MPI_STATUS_IGNORE);
126 /* exchange messages */
127 if ( rank < adjsize ) {
129 while ( mask < adjsize ) {
130 remote = rank ^ mask;
/* NOTE(review): the statement advancing mask is not visible (line 131 is missing from this
 * listing); it has to run before the continue below, otherwise the loop would never end. */
132 if (remote >= adjsize) continue;
134 /* exchange a zero-byte message with the remote node (combined send and receive) */
135 Request::sendrecv(nullptr, 0, MPI_BYTE, remote, tag, nullptr, 0, MPI_BYTE, remote,
136 tag, comm, MPI_STATUS_IGNORE);
140 /* non-power of 2 case */
141 if (adjsize != size) {
142 if (rank < (size - adjsize)) {
143 /* send the message that completes the pending sendrecv of the higher ranked partner */
144 remote = rank + adjsize;
145 Request::send(nullptr, 0, MPI_BYTE, remote, tag, comm);
155 * To make synchronous, uses sync sends and sync sendrecvs
/*
 * Bruck-style barrier: for distance = 1, 2, 4, ... while distance < size, each
 * rank sends a zero-byte message to (rank + distance) mod size and receives
 * one from (rank - distance) mod size.
 * Parameter comm is the communicator whose ranks are synchronized.
 * NOTE(review): the embedded line numbers jump (158 to 161 to 165, ...), so the
 * initialisation of rank/size, the closing braces and the return statement are
 * not visible in this listing -- confirm them against the complete file.
 */
158 int barrier__ompi_bruck(MPI_Comm comm)
161 int distance, to, from;
/* The tag is COLL_TAG_BARRIER-1 while the calling process is finalizing, COLL_TAG_BARRIER otherwise. */
165 int tag = smpi_process()->finalizing() ? COLL_TAG_BARRIER-1: COLL_TAG_BARRIER;
167 "ompi_coll_tuned_barrier_ompi_bruck rank %d", rank);
169 /* exchange data with rank-2^k and rank+2^k */
170 for (distance = 1; distance < size; distance <<= 1) {
/* Adding size before the modulo keeps the source rank non-negative. */
171 from = (rank + size - distance) % size;
172 to = (rank + distance) % size;
174 /* send to the peer distance ranks ahead (to) and receive from the peer distance ranks behind (from) */
175 Request::sendrecv(nullptr, 0, MPI_BYTE, to, tag, nullptr, 0, MPI_BYTE, from, tag,
176 comm, MPI_STATUS_IGNORE);
185 * To make synchronous, uses sync sends and sync sendrecvs
187 /* special case for two processes */
/*
 * Barrier for a two-process communicator: a single zero-byte sendrecv with the
 * other rank. Returns MPI_SUCCESS.
 * Parameter comm is the communicator whose two ranks are synchronized.
 * NOTE(review): the embedded line numbers jump (188 to 192), so the declaration
 * of remote and the braces are not visible in this listing.
 */
188 int barrier__ompi_two_procs(MPI_Comm comm)
/* remote first holds the rank of the caller (it is what the debug message prints). */
192 remote = comm->rank();
/* The tag is COLL_TAG_BARRIER-1 while the calling process is finalizing, COLL_TAG_BARRIER otherwise. */
193 int tag = smpi_process()->finalizing() ? COLL_TAG_BARRIER-1: COLL_TAG_BARRIER;
195 "ompi_coll_tuned_barrier_ompi_two_procs rank %d", remote);
/* Flip between 0 and 1: remote now designates the other process of the pair. */
196 remote = (remote + 1) & 0x1;
198 Request::sendrecv(nullptr, 0, MPI_BYTE, remote, tag, nullptr, 0, MPI_BYTE, remote, tag,
199 comm, MPI_STATUS_IGNORE);
200 return (MPI_SUCCESS);
205 * Linear functions are copied from the BASIC coll module
206 * they do not segment the message and are simple implementations
207 * but for some small number of nodes and/or small data sizes they
208 * are just as fast as tuned/tree based segmenting operations
209 * and as such may be selected by the decision functions
210 * These are copied into this module due to the way we select modules
211 * in V1. i.e. in V2 we will handle this differently and so will not
212 * have to duplicate code.
213 * GEF Oct05 after asking Jeff.
216 /* copied function (with appropriate renaming) starts here */
/*
 * Linear barrier centered on rank 0: the other ranks send a zero-byte message
 * to rank 0 and then wait for one back; rank 0 gathers one message per peer
 * and then answers every peer.
 * Parameter comm is the communicator whose ranks are synchronized.
 * NOTE(review): the embedded line numbers jump (218 to 221, 225 to 228 to 230,
 * 247 to 256, ...), so the declaration of i, the if/else separating root from
 * non-root ranks, the release of the requests array and the return statement
 * are not visible in this listing -- confirm them against the complete file.
 */
218 int barrier__ompi_basic_linear(MPI_Comm comm)
221 int size = comm->size();
222 int rank = comm->rank();
/* The tag is COLL_TAG_BARRIER-1 while the calling process is finalizing, COLL_TAG_BARRIER otherwise. */
224 int tag = smpi_process()->finalizing() ? COLL_TAG_BARRIER-1: COLL_TAG_BARRIER;
225 /* All non-root send & receive zero-length message. */
/* Notify rank 0, then block until rank 0 answers. */
228 Request::send(nullptr, 0, MPI_BYTE, 0, tag, comm);
230 Request::recv(nullptr, 0, MPI_BYTE, 0, tag, comm, MPI_STATUS_IGNORE);
233 /* The root collects and broadcasts the messages. */
236 MPI_Request* requests;
/* One slot per rank so that requests[i] belongs to peer i; slot 0 is never filled. */
238 requests = new MPI_Request[size];
/* NOTE(review): the matching delete[] is not visible in this listing -- confirm the array is released. */
239 for (i = 1; i < size; ++i) {
240 requests[i] = Request::irecv(nullptr, 0, MPI_BYTE, i, tag, comm);
/* Wait on the size-1 requests stored from index 1 onwards. */
242 Request::waitall( size-1, requests+1, MPI_STATUSES_IGNORE );
/* Every peer has checked in: reuse the same slots for the answers. */
244 for (i = 1; i < size; ++i) {
245 requests[i] = Request::isend(nullptr, 0, MPI_BYTE, i, tag, comm);
247 Request::waitall( size-1, requests+1, MPI_STATUSES_IGNORE );
256 /* copied function (with appropriate renaming) ends here */
259 * Another recursive doubling type algorithm, but in this case
260 * we go up the tree and back down the tree.
/*
 * Tree barrier: a first loop with growing jump makes the higher rank of each
 * (rank, rank ^ jump) pair send to the lower one, a second loop with shrinking
 * jump sends the messages back in the opposite direction.
 * Parameter comm is the communicator whose ranks are synchronized.
 * NOTE(review): the embedded line numbers jump (264 to 269, 283 to 289, ...),
 * so the declarations of jump/partner, the initialisation of rank/size, any
 * adjustment of depth between the two loops, the closing braces and the return
 * statement are not visible in this listing -- confirm against the complete file.
 */
262 int barrier__ompi_tree(MPI_Comm comm)
264 int rank, size, depth;
/* The tag is COLL_TAG_BARRIER-1 while the calling process is finalizing, COLL_TAG_BARRIER otherwise. */
269 int tag = smpi_process()->finalizing() ? COLL_TAG_BARRIER-1: COLL_TAG_BARRIER;
271 "ompi_coll_tuned_barrier_ompi_tree %d",
274 /* Find the smallest power of 2 that is greater than or equal to the communicator size. */
275 for(depth = 1; depth < size; depth <<= 1 );
/* Going up: jump takes the values 1, 2, 4, ... below depth. */
277 for (jump=1; jump<depth; jump<<=1) {
278 partner = rank ^ jump;
/* Only pairs whose bits below jump are all zero take part at this level, and the partner must be an existing rank. */
279 if (!(partner & (jump-1)) && partner < size) {
280 if (partner > rank) {
281 Request::recv(nullptr, 0, MPI_BYTE, partner, tag, comm, MPI_STATUS_IGNORE);
282 } else if (partner < rank) {
283 Request::send(nullptr, 0, MPI_BYTE, partner, tag, comm);
/* Going down: same pairing with jump halved at every step and the send/receive roles swapped. */
289 for (jump = depth; jump>0; jump>>=1) {
290 partner = rank ^ jump;
291 if (!(partner & (jump-1)) && partner < size) {
292 if (partner > rank) {
293 Request::send(nullptr, 0, MPI_BYTE, partner, tag, comm);
294 } else if (partner < rank) {
295 Request::recv(nullptr, 0, MPI_BYTE, partner, tag, comm, MPI_STATUS_IGNORE);
303 } // namespace simgrid::smpi