2 * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
3 * University Research and Technology
4 * Corporation. All rights reserved.
5 * Copyright (c) 2004-2006 The University of Tennessee and The University
6 * of Tennessee Research Foundation. All rights
8 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
9 * University of Stuttgart. All rights reserved.
10 * Copyright (c) 2004-2005 The Regents of the University of California.
11 * All rights reserved.
12 * Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved.
15 * Additional copyrights may follow
20 #include "colls_private.h"
21 #include "coll_tuned_topo.h"
24 #define MCA_COLL_BASE_TAG_BARRIER 100
26 * Barrier is meant to be a synchronous operation, as some BTLs can mark
27 * a request done before it is passed to the NIC and progress might not be made
28 * elsewhere we cannot allow a process to exit the barrier until its last
29 * [round of] sends are completed.
31 * It is last round of sends rather than 'last' individual send as each pair of
32 * peers can use different channels/devices/btls and the receiver of one of
33 * these sends might be forced to wait as the sender
34 * leaves the collective and does not make progress until the next mpi call
39 * Simple double ring version of barrier
41 * synchronous guarantee made by the last ring of sends being synchronous
/*
 * Double-ring barrier: two sequential rings of zero-byte messages
 * (rank 0 -> 1 -> ... -> size-1 -> back to 0, done twice).  The first ring
 * lets every process learn that all lower ranks entered the barrier; the
 * second ring releases everyone.  Returns MPI_SUCCESS (return on a line not
 * visible in this truncated view).
 * NOTE(review): this listing is missing interior lines (declarations,
 * closing braces, trailing call arguments); only comments were added here,
 * the visible code is untouched.
 */
44 int smpi_coll_tuned_barrier_ompi_doublering(MPI_Comm comm
51 rank = smpi_comm_rank(comm);
52 size = smpi_comm_size(comm);
54 XBT_DEBUG("ompi_coll_tuned_barrier_ompi_doublering rank %d", rank);
/* NOTE(review): for rank 0, (rank-1)%size evaluates to -1 under C99
 * truncated division, which is not a valid rank; upstream Open MPI computes
 * left as ((size+rank-1)%size) — confirm against the full file. */
56 left = ((rank-1)%size);
57 right = ((rank+1)%size);
/* --- Ring 1: wait for the token from the left neighbour --- */
59 if (rank > 0) { /* receive message from the left */
60 smpi_mpi_recv((void*)NULL, 0, MPI_BYTE, left,
61 MCA_COLL_BASE_TAG_BARRIER, comm,
65 /* Send message to the right */
66 smpi_mpi_send((void*)NULL, 0, MPI_BYTE, right,
67 MCA_COLL_BASE_TAG_BARRIER,
/* Rank 0 closes ring 1 by receiving from rank size-1 (its left). */
70 /* root needs to receive from the last node */
72 smpi_mpi_recv((void*)NULL, 0, MPI_BYTE, left,
73 MCA_COLL_BASE_TAG_BARRIER, comm,
/* --- Ring 2: same pattern again; completing it releases each rank --- */
77 /* Allow nodes to exit */
78 if (rank > 0) { /* post Receive from left */
79 smpi_mpi_recv((void*)NULL, 0, MPI_BYTE, left,
80 MCA_COLL_BASE_TAG_BARRIER, comm,
84 /* send message to the right one */
85 smpi_mpi_send((void*)NULL, 0, MPI_BYTE, right,
86 MCA_COLL_BASE_TAG_BARRIER,
/* Rank 0 closes ring 2 the same way before leaving the barrier. */
89 /* rank 0 post receive from the last node */
91 smpi_mpi_recv((void*)NULL, 0, MPI_BYTE, left,
92 MCA_COLL_BASE_TAG_BARRIER, comm,
102 * To make synchronous, uses sync sends and sync sendrecvs
/*
 * Recursive-doubling barrier.  Works natively on a power-of-two number of
 * processes; for other sizes, the (size - adjsize) "extra" high ranks first
 * hand off to a low-ranked partner, sit out the exchange phase, and are
 * released by a final send.  Pairwise zero-byte sendrecvs give the
 * synchronous guarantee.
 * NOTE(review): listing is missing interior lines (e.g. `remote`/`mask`
 * declarations, closing braces); only comments added, code untouched.
 */
105 int smpi_coll_tuned_barrier_ompi_recursivedoubling(MPI_Comm comm
108 int rank, size, adjsize;
111 rank = smpi_comm_rank(comm);
112 size = smpi_comm_size(comm);
114 "ompi_coll_tuned_barrier_ompi_recursivedoubling rank %d",
117 /* do nearest power of 2 less than size calc */
/* NOTE(review): this loop exits with adjsize equal to the smallest power of
 * two STRICTLY GREATER than size; the halving step (adjsize >>= 1) that
 * makes it the largest power of two <= size is presumably on a line missing
 * from this view (original line ~119) — confirm against the full file. */
118 for( adjsize = 1; adjsize <= size; adjsize <<= 1 );
121 /* if size is not exact power of two, perform an extra step */
122 if (adjsize != size) {
123 if (rank >= adjsize) {
/* Extra (high) ranks pair with rank - adjsize, then wait to be released. */
124 /* send message to lower ranked node */
125 remote = rank - adjsize;
126 smpi_mpi_sendrecv(NULL, 0, MPI_BYTE, remote,
127 MCA_COLL_BASE_TAG_BARRIER,
128 NULL, 0, MPI_BYTE, remote,
129 MCA_COLL_BASE_TAG_BARRIER,
130 comm, MPI_STATUS_IGNORE);
/* Low ranks that have an extra partner absorb its "enter" message. */
132 } else if (rank < (size - adjsize)) {
134 /* receive message from high level rank */
135 smpi_mpi_recv((void*)NULL, 0, MPI_BYTE, rank+adjsize,
136 MCA_COLL_BASE_TAG_BARRIER, comm,
/* Core phase: log2(adjsize) rounds of pairwise exchange with rank ^ mask. */
142 /* exchange messages */
143 if ( rank < adjsize ) {
145 while ( mask < adjsize ) {
146 remote = rank ^ mask;
/* Skip partners outside the power-of-two group (mask doubling is on a
 * missing line — presumably mask <<= 1 before the continue path). */
148 if (remote >= adjsize) continue;
150 /* post receive from the remote node */
151 smpi_mpi_sendrecv(NULL, 0, MPI_BYTE, remote,
152 MCA_COLL_BASE_TAG_BARRIER,
153 NULL, 0, MPI_BYTE, remote,
154 MCA_COLL_BASE_TAG_BARRIER,
155 comm, MPI_STATUS_IGNORE);
/* Release phase: low ranks tell their extra high partner it may leave. */
159 /* non-power of 2 case */
160 if (adjsize != size) {
161 if (rank < (size - adjsize)) {
162 /* send enter message to higher ranked node */
163 remote = rank + adjsize;
164 smpi_mpi_send((void*)NULL, 0, MPI_BYTE, remote,
165 MCA_COLL_BASE_TAG_BARRIER,
177 * To make synchronous, uses sync sends and sync sendrecvs
/*
 * Bruck (dissemination) barrier: ceil(log2(size)) rounds; in round k each
 * rank does a zero-byte sendrecv with rank+2^k (send) and rank-2^k (recv),
 * both taken modulo size.  Works for any communicator size, including
 * non-powers of two.
 * NOTE(review): rank/size declarations and the closing brace / return are
 * on lines missing from this view; only comments added, code untouched.
 */
180 int smpi_coll_tuned_barrier_ompi_bruck(MPI_Comm comm
184 int distance, to, from;
186 rank = smpi_comm_rank(comm);
187 size = smpi_comm_size(comm);
189 "ompi_coll_tuned_barrier_ompi_bruck rank %d", rank);
191 /* exchange data with rank-2^k and rank+2^k */
192 for (distance = 1; distance < size; distance <<= 1) {
/* `+ size` before the modulo keeps the left partner non-negative. */
193 from = (rank + size - distance) % size;
194 to = (rank + distance) % size;
196 /* send message to lower ranked node */
197 smpi_mpi_sendrecv(NULL, 0, MPI_BYTE, to,
198 MCA_COLL_BASE_TAG_BARRIER,
199 NULL, 0, MPI_BYTE, from,
200 MCA_COLL_BASE_TAG_BARRIER,
201 comm, MPI_STATUS_IGNORE);
210 * To make synchronous, uses sync sends and sync sendrecvs
212 /* special case for two processes */
/*
 * Two-process barrier: a single zero-byte sendrecv with the other rank
 * (computed as (rank + 1) & 1).  Only valid on communicators of size 2.
 * NOTE(review): the closing paren of the parameter list and the `remote`
 * declaration are on lines missing from this view; code untouched.
 */
213 int smpi_coll_tuned_barrier_ompi_two_procs(MPI_Comm comm
/* `remote` first holds our own rank (for the debug message below)... */
218 remote = smpi_comm_rank(comm);
220 "ompi_coll_tuned_barrier_ompi_two_procs rank %d", remote);
/* ...then is flipped to the peer's rank: 0 <-> 1. */
221 remote = (remote + 1) & 0x1;
223 smpi_mpi_sendrecv(NULL, 0, MPI_BYTE, remote,
224 MCA_COLL_BASE_TAG_BARRIER,
225 NULL, 0, MPI_BYTE, remote,
226 MCA_COLL_BASE_TAG_BARRIER,
227 comm, MPI_STATUS_IGNORE);
228 return (MPI_SUCCESS);
233 * Linear functions are copied from the BASIC coll module
234 * they do not segment the message and are simple implementations
235 * but for some small number of nodes and/or small data sizes they
236 * are just as fast as tuned/tree based segmenting operations
237 * and as such may be selected by the decision functions
238 * These are copied into this module due to the way we select modules
239 * in V1. i.e. in V2 we will handle this differently and so will not
240 * have to duplicate code.
241 * GEF Oct05 after asking Jeff.
244 /* copied function (with appropriate renaming) starts here */
/*
 * Linear (fan-in/fan-out) barrier copied from the BASIC coll module:
 * every non-root rank sends a zero-byte message to rank 0 and then waits
 * for a zero-byte reply; rank 0 collects size-1 messages (any source) and
 * then notifies every rank.  O(size) messages at the root.
 * NOTE(review): the if/else split between the non-root and root paths, the
 * `int i` declaration, and (presumably) free(requests) are on lines missing
 * from this view — verify the free exists in the full file.  Code untouched.
 */
246 int smpi_coll_tuned_barrier_ompi_basic_linear(MPI_Comm comm)
249 int size = smpi_comm_size(comm);
250 int rank = smpi_comm_rank(comm);
/* --- Non-root path: notify root, then block until released. --- */
252 /* All non-root send & receive zero-length message. */
255 smpi_mpi_send (NULL, 0, MPI_BYTE, 0,
256 MCA_COLL_BASE_TAG_BARRIER,
259 smpi_mpi_recv (NULL, 0, MPI_BYTE, 0,
260 MCA_COLL_BASE_TAG_BARRIER,
261 comm, MPI_STATUS_IGNORE);
/* --- Root path: gather one message per rank, then fan out replies. --- */
264 /* The root collects and broadcasts the messages. */
267 MPI_Request* requests;
/* Slot 0 is unused; requests+1 is passed to waitall below. */
269 requests = (MPI_Request*)malloc( size * sizeof(MPI_Request) );
270 for (i = 1; i < size; ++i) {
/* MPI_ANY_SOURCE: arrival order of the size-1 "enter" messages is irrelevant. */
271 requests[i] = smpi_mpi_irecv(NULL, 0, MPI_BYTE, MPI_ANY_SOURCE,
272 MCA_COLL_BASE_TAG_BARRIER, comm
275 smpi_mpi_waitall( size-1, requests+1, MPI_STATUSES_IGNORE );
/* Everyone has entered: release each rank with a zero-byte send. */
277 for (i = 1; i < size; ++i) {
278 requests[i] = smpi_mpi_isend(NULL, 0, MPI_BYTE, i,
279 MCA_COLL_BASE_TAG_BARRIER,
283 smpi_mpi_waitall( size-1, requests+1, MPI_STATUSES_IGNORE );
292 /* copied function (with appropriate renaming) ends here */
295 * Another recursive doubling type algorithm, but in this case
296 * we go up the tree and back down the tree.
/*
 * Tree barrier: recursive-doubling-style fan-in up the tree (high partner
 * sends to low partner) followed by a fan-out back down (low partner sends
 * to high partner).  The (partner & (jump-1)) mask restricts each round to
 * ranks that are subtree roots at that level; `partner < size` skips
 * non-existent partners for non-power-of-two sizes.
 * NOTE(review): this function's tail (closing braces, return) lies beyond
 * the last line visible in this chunk, and `jump`/`partner` declarations
 * are on missing lines; only comments added, code untouched.
 */
298 int smpi_coll_tuned_barrier_ompi_tree(MPI_Comm comm)
300 int rank, size, depth;
303 rank = smpi_comm_rank(comm);
304 size = smpi_comm_size(comm);
306 "ompi_coll_tuned_barrier_ompi_tree %d",
/* depth ends as the smallest power of two >= size. */
309 /* Find the nearest power of 2 of the communicator size. */
310 for(depth = 1; depth < size; depth <<= 1 );
/* --- Fan-in: children (higher partner) report to parents (lower rank). --- */
312 for (jump=1; jump<depth; jump<<=1) {
313 partner = rank ^ jump;
314 if (!(partner & (jump-1)) && partner < size) {
315 if (partner > rank) {
316 smpi_mpi_recv (NULL, 0, MPI_BYTE, partner,
317 MCA_COLL_BASE_TAG_BARRIER, comm,
319 } else if (partner < rank) {
320 smpi_mpi_send (NULL, 0, MPI_BYTE, partner,
321 MCA_COLL_BASE_TAG_BARRIER,
/* --- Fan-out: reverse direction, releasing subtrees top-down. --- */
328 for (jump = depth; jump>0; jump>>=1) {
329 partner = rank ^ jump;
330 if (!(partner & (jump-1)) && partner < size) {
331 if (partner > rank) {
332 smpi_mpi_send (NULL, 0, MPI_BYTE, partner,
333 MCA_COLL_BASE_TAG_BARRIER,
335 } else if (partner < rank) {
336 smpi_mpi_recv (NULL, 0, MPI_BYTE, partner,
337 MCA_COLL_BASE_TAG_BARRIER, comm,