1 /* Copyright (c) 2013-2014. The SimGrid Team.
2 * All rights reserved. */
4 /* This program is free software; you can redistribute it and/or modify it
5 * under the terms of the license (GNU LGPL) which comes with this package. */
8 * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
9 * University Research and Technology
10 * Corporation. All rights reserved.
11 * Copyright (c) 2004-2006 The University of Tennessee and The University
12 * of Tennessee Research Foundation. All rights
14 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
15 * University of Stuttgart. All rights reserved.
16 * Copyright (c) 2004-2005 The Regents of the University of California.
17 * All rights reserved.
18 * Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved.
20 * Additional copyrights may follow
23 #include "colls_private.h"
24 #include "coll_tuned_topo.h"
28 * Barrier is meant to be a synchronous operation, as some BTLs can mark
29 * a request done before it's passed to the NIC and progress might not be made
30 * elsewhere we cannot allow a process to exit the barrier until its last
31 * [round of] sends are completed.
33 * It is last round of sends rather than 'last' individual send as each pair of
34 * peers can use different channels/devices/btls and the receiver of one of
35 * these sends might be forced to wait as the sender
36 * leaves the collective and does not make progress until the next mpi call
41 * Simple double ring version of barrier
43 * synchronous guarantee made by last ring of sends are synchronous
/*
 * Double-ring barrier: a zero-byte token travels twice around the logical
 * ring left (rank-1) -> rank -> right (rank+1).  The first lap proves every
 * rank has entered the barrier; the second lap releases them, so no rank can
 * exit before all ranks have entered.
 * NOTE(review): this chunk was extracted with interior lines elided
 * (declarations, trailing call arguments, closing braces are missing), so the
 * code below is kept byte-identical rather than restructured.
 */
46 int smpi_coll_tuned_barrier_ompi_doublering(MPI_Comm comm
53 rank = smpi_comm_rank(comm);
54 size = smpi_comm_size(comm);
56 XBT_DEBUG("ompi_coll_tuned_barrier_ompi_doublering rank %d", rank);
/* Neighbours on the logical ring (left wraps via +size to stay non-negative). */
58 left = ((rank-1+size)%size);
59 right = ((rank+1)%size);
/* --- First lap: token flows 0 -> 1 -> ... -> size-1 -> 0. --- */
61 if (rank > 0) { /* receive message from the left */
62 smpi_mpi_recv((void*)NULL, 0, MPI_BYTE, left,
63 COLL_TAG_BARRIER, comm,
67 /* Send message to the right */
68 smpi_mpi_send((void*)NULL, 0, MPI_BYTE, right,
72 /* root needs to receive from the last node */
74 smpi_mpi_recv((void*)NULL, 0, MPI_BYTE, left,
75 COLL_TAG_BARRIER, comm,
/* --- Second lap: same pattern again; receiving it means all ranks entered. --- */
79 /* Allow nodes to exit */
80 if (rank > 0) { /* post Receive from left */
81 smpi_mpi_recv((void*)NULL, 0, MPI_BYTE, left,
82 COLL_TAG_BARRIER, comm,
86 /* send message to the right one */
87 smpi_mpi_send((void*)NULL, 0, MPI_BYTE, right,
91 /* rank 0 post receive from the last node */
93 smpi_mpi_recv((void*)NULL, 0, MPI_BYTE, left,
94 COLL_TAG_BARRIER, comm,
104 * To make synchronous, uses sync sends and sync sendrecvs
/*
 * Recursive-doubling barrier: O(log2(size)) rounds of pairwise zero-byte
 * sendrecvs between ranks differing in one bit.  Non-power-of-two sizes are
 * handled by folding the extra ranks (rank >= adjsize) onto lower ranks
 * before the exchange and releasing them afterwards.
 * NOTE(review): interior lines are elided in this chunk (e.g. the
 * `adjsize >>= 1` step after the loop at 120 and the `mask` declaration are
 * presumably on the missing lines); code kept byte-identical.
 */
107 int smpi_coll_tuned_barrier_ompi_recursivedoubling(MPI_Comm comm
110 int rank, size, adjsize;
113 rank = smpi_comm_rank(comm);
114 size = smpi_comm_size(comm);
116 "ompi_coll_tuned_barrier_ompi_recursivedoubling rank %d",
119 /* do nearest power of 2 less than size calc */
120 for( adjsize = 1; adjsize <= size; adjsize <<= 1 );
123 /* if size is not exact power of two, perform an extra step */
124 if (adjsize != size) {
125 if (rank >= adjsize) {
126 /* send message to lower ranked node */
127 remote = rank - adjsize;
128 smpi_mpi_sendrecv(NULL, 0, MPI_BYTE, remote,
130 NULL, 0, MPI_BYTE, remote,
132 comm, MPI_STATUS_IGNORE);
134 } else if (rank < (size - adjsize)) {
136 /* receive message from high level rank */
137 smpi_mpi_recv((void*)NULL, 0, MPI_BYTE, rank+adjsize,
138 COLL_TAG_BARRIER, comm,
/* Main log2(adjsize) exchange among the power-of-two subset of ranks. */
144 /* exchange messages */
145 if ( rank < adjsize ) {
147 while ( mask < adjsize ) {
/* Partner differs from us in exactly the current bit. */
148 remote = rank ^ mask;
150 if (remote >= adjsize) continue;
152 /* post receive from the remote node */
153 smpi_mpi_sendrecv(NULL, 0, MPI_BYTE, remote,
155 NULL, 0, MPI_BYTE, remote,
157 comm, MPI_STATUS_IGNORE);
/* Release the folded-away high ranks so they may exit the barrier. */
161 /* non-power of 2 case */
162 if (adjsize != size) {
163 if (rank < (size - adjsize)) {
164 /* send enter message to higher ranked node */
165 remote = rank + adjsize;
166 smpi_mpi_send((void*)NULL, 0, MPI_BYTE, remote,
179 * To make synchronous, uses sync sends and sync sendrecvs
/*
 * Bruck (dissemination) barrier: ceil(log2(size)) rounds; in round k each
 * rank sendrecvs a zero-byte message with rank+2^k (send) and rank-2^k
 * (receive), modulo size.  Works for any communicator size, power of two or
 * not.
 * NOTE(review): interior lines (tag arguments, closing braces, return) are
 * elided in this chunk; code kept byte-identical.
 */
182 int smpi_coll_tuned_barrier_ompi_bruck(MPI_Comm comm
186 int distance, to, from;
188 rank = smpi_comm_rank(comm);
189 size = smpi_comm_size(comm);
191 "ompi_coll_tuned_barrier_ompi_bruck rank %d", rank);
193 /* exchange data with rank-2^k and rank+2^k */
194 for (distance = 1; distance < size; distance <<= 1) {
/* +size before the modulo keeps `from` non-negative. */
195 from = (rank + size - distance) % size;
196 to = (rank + distance) % size;
198 /* send message to lower ranked node */
199 smpi_mpi_sendrecv(NULL, 0, MPI_BYTE, to,
201 NULL, 0, MPI_BYTE, from,
203 comm, MPI_STATUS_IGNORE);
212 * To make synchronous, uses sync sends and sync sendrecvs
214 /* special case for two processes */
/*
 * Two-process barrier: a single zero-byte sendrecv with the other rank.
 * `remote = (rank + 1) & 0x1` flips rank 0 <-> rank 1, so this is only
 * correct for communicators of exactly two processes (as the name says).
 * NOTE(review): the tag arguments between lines 225/227 are on elided lines
 * in this chunk; code kept byte-identical.
 */
215 int smpi_coll_tuned_barrier_ompi_two_procs(MPI_Comm comm
220 remote = smpi_comm_rank(comm);
222 "ompi_coll_tuned_barrier_ompi_two_procs rank %d", remote);
/* Partner is the other one of the two ranks. */
223 remote = (remote + 1) & 0x1;
225 smpi_mpi_sendrecv(NULL, 0, MPI_BYTE, remote,
227 NULL, 0, MPI_BYTE, remote,
229 comm, MPI_STATUS_IGNORE);
230 return (MPI_SUCCESS);
235 * Linear functions are copied from the BASIC coll module
236 * they do not segment the message and are simple implementations
237 * but for some small number of nodes and/or small data sizes they
238 * are just as fast as tuned/tree based segmenting operations
239 * and as such may be selected by the decision functions
240 * These are copied into this module due to the way we select modules
241 * in V1. i.e. in V2 we will handle this differently and so will not
242 * have to duplicate code.
243 * GEF Oct05 after asking Jeff.
246 /* copied function (with appropriate renaming) starts here */
/*
 * Linear (fan-in/fan-out) barrier copied from the basic coll module:
 * every non-root rank sends a zero-byte message to rank 0 and then waits
 * for a zero-byte reply; rank 0 collects size-1 messages (any source, so
 * arrival order is irrelevant) and then answers each rank in turn.
 * NOTE(review): interior lines are elided in this chunk — in particular the
 * `free(requests)` that should follow line 285 is presumably on a missing
 * line; confirm against the full file before suspecting a leak.
 */
248 int smpi_coll_tuned_barrier_ompi_basic_linear(MPI_Comm comm)
251 int size = smpi_comm_size(comm);
252 int rank = smpi_comm_rank(comm);
253 /* Non-root path: one send to rank 0, one receive from rank 0. */
254 /* All non-root send & receive zero-length message. */
257 smpi_mpi_send (NULL, 0, MPI_BYTE, 0,
261 smpi_mpi_recv (NULL, 0, MPI_BYTE, 0,
263 comm, MPI_STATUS_IGNORE);
266 /* The root collects and broadcasts the messages. */
269 MPI_Request* requests;
/* requests[0] is unused; indices 1..size-1 mirror the peer ranks. */
271 requests = (MPI_Request*)malloc( size * sizeof(MPI_Request) );
272 for (i = 1; i < size; ++i) {
273 requests[i] = smpi_mpi_irecv(NULL, 0, MPI_BYTE, MPI_ANY_SOURCE,
274 COLL_TAG_BARRIER, comm
277 smpi_mpi_waitall( size-1, requests+1, MPI_STATUSES_IGNORE );
/* Fan-out: release every non-root rank. */
279 for (i = 1; i < size; ++i) {
280 requests[i] = smpi_mpi_isend(NULL, 0, MPI_BYTE, i,
285 smpi_mpi_waitall( size-1, requests+1, MPI_STATUSES_IGNORE );
294 /* copied function (with appropriate renaming) ends here */
297 * Another recursive doubling type algorithm, but in this case
298 * we go up the tree and back down the tree.
/*
 * Tree barrier: gather up a binomial-style tree, then release back down it.
 * In the up-phase (jump = 1,2,4,...) a rank that is a subtree root
 * (!(partner & (jump-1))) receives from its higher partner and sends to its
 * lower one; the down-phase (jump = depth,...,1) reverses the direction.
 * NOTE(review): interior lines (variable declarations for jump/partner,
 * closing braces, return) are elided in this chunk; code kept byte-identical.
 */
300 int smpi_coll_tuned_barrier_ompi_tree(MPI_Comm comm)
302 int rank, size, depth;
305 rank = smpi_comm_rank(comm);
306 size = smpi_comm_size(comm);
308 "ompi_coll_tuned_barrier_ompi_tree %d",
311 /* Find the nearest power of 2 of the communicator size. */
312 for(depth = 1; depth < size; depth <<= 1 );
/* Up-phase: children report in toward rank 0. */
314 for (jump=1; jump<depth; jump<<=1) {
315 partner = rank ^ jump;
/* Only subtree roots at this level participate; skip out-of-range partners. */
316 if (!(partner & (jump-1)) && partner < size) {
317 if (partner > rank) {
318 smpi_mpi_recv (NULL, 0, MPI_BYTE, partner,
319 COLL_TAG_BARRIER, comm,
321 } else if (partner < rank) {
322 smpi_mpi_send (NULL, 0, MPI_BYTE, partner,
/* Down-phase: release flows back out from rank 0, directions swapped. */
330 for (jump = depth; jump>0; jump>>=1) {
331 partner = rank ^ jump;
332 if (!(partner & (jump-1)) && partner < size) {
333 if (partner > rank) {
334 smpi_mpi_send (NULL, 0, MPI_BYTE, partner,
337 } else if (partner < rank) {
338 smpi_mpi_recv (NULL, 0, MPI_BYTE, partner,
339 COLL_TAG_BARRIER, comm,