2 * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
3 * University Research and Technology
4 * Corporation. All rights reserved.
5 * Copyright (c) 2004-2009 The University of Tennessee and The University
6 * of Tennessee Research Foundation. All rights
8 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
9 * University of Stuttgart. All rights reserved.
10 * Copyright (c) 2004-2005 The Regents of the University of California.
11 * All rights reserved.
12 * Copyright (c) 2009 University of Houston. All rights reserved.
15 * Additional copyrights may follow
18 * Redistribution and use in source and binary forms, with or without
19 * modification, are permitted provided that the following conditions are
22 * - Redistributions of source code must retain the above copyright
23 * notice, this list of conditions and the following disclaimer.
25 * - Redistributions in binary form must reproduce the above copyright
26 * notice, this list of conditions and the following disclaimer listed
27 * in this license in the documentation and/or other materials
28 * provided with the distribution.
30 * - Neither the name of the copyright holders nor the names of its
31 * contributors may be used to endorse or promote products derived from
32 * this software without specific prior written permission.
34 * The copyright holders provide no reassurances that the source code
35 * provided does not infringe any patent, copyright, or any other
36 * intellectual property rights of third parties. The copyright holders
37 * disclaim any liability to any recipient for claims brought against
38 * recipient by any third party for infringement of that parties
39 * intellectual property rights.
41 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
42 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
43 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
44 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
45 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
46 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
47 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
48 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
49 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
50 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
51 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
55 #include "colls_private.h"
56 #define MAXTREEFANOUT 32
57 typedef struct ompi_coll_tree_t {
62 int32_t tree_next[MAXTREEFANOUT];
63 int32_t tree_nextsize;
67 ompi_coll_tuned_topo_build_tree( int fanout,
73 * Some static helpers.
75 static int pown( int fanout, int num )
78 if( num < 0 ) return 0;
79 if (1==num) return fanout;
84 for( j = 0; j < num; j++ ) { p*= fanout; }
89 static int calculate_level( int fanout, int rank )
92 if( rank < 0 ) return -1;
93 for( level = 0, num = 0; num <= rank; level++ ) {
94 num += pown(fanout, level);
99 static int calculate_num_nodes_up_to_level( int fanout, int level )
101 /* just use geometric progression formula for sum:
102 a^0+a^1+...a^(n-1) = (a^n-1)/(a-1) */
103 return ((pown(fanout,level) - 1)/(fanout - 1));
107 * And now the building functions.
109 * An example for fanout = 2, comm_size = 7
111 * 0 <-- delta = 1 (fanout^0)
113 * 1 2 <-- delta = 2 (fanout^1)
115 * 3 5 4 6 <-- delta = 4 (fanout^2)
119 ompi_coll_tuned_topo_build_tree( int fanout,
125 int level; /* location of my rank in the tree structure of size */
126 int delta; /* number of nodes on my level */
127 int slimit; /* total number of nodes on levels above me */
130 ompi_coll_tree_t* tree;
132 XBT_DEBUG( "coll:tuned:topo_build_tree Building fo %d rt %d", fanout, root);
135 XBT_DEBUG( "coll:tuned:topo_build_tree invalid fanout %d", fanout);
138 if (fanout>MAXTREEFANOUT) {
139 XBT_DEBUG("coll:tuned:topo_build_tree invalid fanout %d bigger than max %d", fanout, MAXTREEFANOUT);
144 * Get size and rank of the process in this communicator
146 size = smpi_comm_size(comm);
147 rank = smpi_comm_rank(comm);
149 tree = (ompi_coll_tree_t*)malloc(sizeof(ompi_coll_tree_t));
151 XBT_DEBUG("coll:tuned:topo_build_tree PANIC::out of memory");
155 tree->tree_root = MPI_UNDEFINED;
156 tree->tree_nextsize = MPI_UNDEFINED;
161 tree->tree_root = root;
166 tree->tree_fanout = fanout;
167 tree->tree_bmtree = 0;
168 tree->tree_root = root;
169 tree->tree_prev = -1;
170 tree->tree_nextsize = 0;
171 for( i = 0; i < fanout; i++ ) {
172 tree->tree_next[i] = -1;
175 /* return if we have less than 2 processes */
181 * Shift all ranks by root, so that the algorithm can be
182 * designed as if root would be always 0
183 * shiftedrank should be used in calculating distances
184 * and position in tree
186 shiftedrank = rank - root;
187 if( shiftedrank < 0 ) {
191 /* calculate my level */
192 level = calculate_level( fanout, shiftedrank );
193 delta = pown( fanout, level );
195 /* find my children */
196 for( i = 0; i < fanout; i++ ) {
197 schild = shiftedrank + delta * (i+1);
198 if( schild < size ) {
199 tree->tree_next[i] = (schild+root)%size;
200 tree->tree_nextsize = tree->tree_nextsize + 1;
207 slimit = calculate_num_nodes_up_to_level( fanout, level );
208 sparent = shiftedrank;
209 if( sparent < fanout ) {
212 while( sparent >= slimit ) {
213 sparent -= delta/fanout;
216 tree->tree_prev = (sparent+root)%size;
223 smpi_coll_tuned_bcast_ompi_split_bintree ( void* buffer,
225 MPI_Datatype datatype,
231 int segindex, i, lr, pair;
232 int segcount[2]; /* Number ompi_request_wait_allof elements sent with each segment */
234 int num_segments[2]; /* Number of segmenets */
235 int sendcount[2]; /* the same like segcount, except for the last segment */
236 size_t realsegsize[2];
239 ptrdiff_t type_extent;
242 MPI_Request base_req, new_req;
243 ompi_coll_tree_t *tree;
244 // mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
245 // mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
247 size = smpi_comm_size(comm);
248 rank = smpi_comm_rank(comm);
251 //compute again segsize
252 const size_t intermediate_message_size = 370728;
253 size_t message_size = smpi_datatype_size(datatype) * (unsigned long)count;
254 if(message_size < intermediate_message_size)
259 XBT_DEBUG("ompi_coll_tuned_bcast_intra_split_bintree rank %d root %d ss %5d", rank, root, segsize);
265 /* setup the binary tree topology. */
266 tree = ompi_coll_tuned_topo_build_tree(2,comm,root);
268 type_size = smpi_datatype_size( datatype );
270 /* Determine number of segments and number of elements per segment */
272 if (count % 2 != 0) counts[0]++;
273 counts[1] = count - counts[0];
275 /* Note that ompi_datatype_type_size() will never return a negative
276 value in typelng; it returns an int [vs. an unsigned type]
277 because of the MPI spec. */
278 if (segsize < ((uint32_t) type_size)) {
279 segsize = type_size; /* push segsize up to hold one type */
281 segcount[0] = segcount[1] = segsize / type_size;
282 num_segments[0] = counts[0]/segcount[0];
283 if ((counts[0] % segcount[0]) != 0) num_segments[0]++;
284 num_segments[1] = counts[1]/segcount[1];
285 if ((counts[1] % segcount[1]) != 0) num_segments[1]++;
287 segcount[0] = counts[0];
288 segcount[1] = counts[1];
289 num_segments[0] = num_segments[1] = 1;
292 /* if the message is too small to be split into segments */
293 if( (counts[0] == 0 || counts[1] == 0) ||
294 (segsize > counts[0] * type_size) ||
295 (segsize > counts[1] * type_size) ) {
296 /* call linear version here ! */
297 return (smpi_coll_tuned_bcast_SMP_linear ( buffer, count, datatype,
300 type_extent = smpi_datatype_get_extent(datatype);
303 /* Determine real segment size */
304 realsegsize[0] = segcount[0] * type_extent;
305 realsegsize[1] = segcount[1] * type_extent;
307 /* set the buffer pointers */
308 tmpbuf[0] = (char *) buffer;
309 tmpbuf[1] = (char *) buffer+counts[0] * type_extent;
312 Root splits the buffer in 2 and sends segmented message down the branches.
313 Left subtree of the tree receives first half of the buffer, while right
314 subtree receives the remaining message.
317 /* determine if I am left (0) or right (1), (root is right) */
318 lr = ((rank + size - root)%size + 1)%2;
322 /* determine segment count */
323 sendcount[0] = segcount[0];
324 sendcount[1] = segcount[1];
325 /* for each segment */
326 for (segindex = 0; segindex < num_segments[0]; segindex++) {
328 for( i = 0; i < tree->tree_nextsize && i < 2; i++ ) {
329 if (segindex >= num_segments[i]) { /* no more segments */
332 /* determine how many elements are being sent in this round */
333 if(segindex == (num_segments[i] - 1))
334 sendcount[i] = counts[i] - segindex*segcount[i];
336 smpi_mpi_send(tmpbuf[i], sendcount[i], datatype,
337 tree->tree_next[i], 777, comm);
338 /* update tmp buffer */
339 tmpbuf[i] += realsegsize[i];
344 /* intermediate nodes code */
345 else if( tree->tree_nextsize > 0 ) {
346 /* Intermediate nodes:
347 * It will receive segments only from one half of the data.
348 * Which one is determined by whether the node belongs to the "left" or "right"
349 * subtree. Topoloby building function builds binary tree such that
350 * odd "shifted ranks" ((rank + size - root)%size) are on the left subtree,
351 * and even on the right subtree.
353 * Create the pipeline. We first post the first receive, then in the loop we
354 * post the next receive and after that wait for the previous receive to complete
355 * and we disseminating the data to all children.
357 sendcount[lr] = segcount[lr];
358 base_req=smpi_mpi_irecv(tmpbuf[lr], sendcount[lr], datatype,
359 tree->tree_prev, 777,
362 for( segindex = 1; segindex < num_segments[lr]; segindex++ ) {
363 /* determine how many elements to expect in this round */
364 if( segindex == (num_segments[lr] - 1))
365 sendcount[lr] = counts[lr] - segindex*segcount[lr];
367 new_req = smpi_mpi_irecv( tmpbuf[lr] + realsegsize[lr], sendcount[lr],
368 datatype, tree->tree_prev, 777,
371 /* wait for and forward current segment */
372 smpi_mpi_waitall( 1, &base_req, MPI_STATUSES_IGNORE );
373 for( i = 0; i < tree->tree_nextsize; i++ ) { /* send data to children (segcount[lr]) */
374 smpi_mpi_send( tmpbuf[lr], segcount[lr], datatype,
375 tree->tree_next[i], 777,
377 } /* end of for each child */
379 /* upate the base request */
381 /* go to the next buffer (ie. the one corresponding to the next recv) */
382 tmpbuf[lr] += realsegsize[lr];
383 } /* end of for segindex */
385 /* wait for the last segment and forward current segment */
386 smpi_mpi_waitall( 1, &base_req, MPI_STATUSES_IGNORE );
387 for( i = 0; i < tree->tree_nextsize; i++ ) { /* send data to children */
388 smpi_mpi_send(tmpbuf[lr], sendcount[lr], datatype,
389 tree->tree_next[i], 777, comm);
390 } /* end of for each child */
395 /* Just consume segments as fast as possible */
396 sendcount[lr] = segcount[lr];
397 for (segindex = 0; segindex < num_segments[lr]; segindex++) {
398 /* determine how many elements to expect in this round */
399 if (segindex == (num_segments[lr] - 1)) sendcount[lr] = counts[lr] - segindex*segcount[lr];
400 /* receive segments */
401 smpi_mpi_recv(tmpbuf[lr], sendcount[lr], datatype,
402 tree->tree_prev, 777,
403 comm, MPI_STATUS_IGNORE);
404 /* update the initial pointer to the buffer */
405 tmpbuf[lr] += realsegsize[lr];
409 /* reset the buffer pointers */
410 tmpbuf[0] = (char *) buffer;
411 tmpbuf[1] = (char *) buffer+counts[0] * type_extent;
414 Find your immediate pair (identical node in opposite subtree) and SendRecv
415 data buffer with them.
416 The tree building function ensures that
418 if we are in the left subtree (lr == 0) our pair is (rank+1)%size.
419 if we are in the right subtree (lr == 1) our pair is (rank-1)%size
420 If we have even number of nodes the rank (size-1) will pair up with root.
423 pair = (rank+1)%size;
425 pair = (rank+size-1)%size;
428 if ( (size%2) != 0 && rank != root) {
430 smpi_mpi_sendrecv( tmpbuf[lr], counts[lr], datatype,
432 tmpbuf[(lr+1)%2], counts[(lr+1)%2], datatype,
434 comm, MPI_STATUS_IGNORE);
435 } else if ( (size%2) == 0 ) {
436 /* root sends right buffer to the last node */
438 smpi_mpi_send(tmpbuf[1], counts[1], datatype,
439 (root+size-1)%size, 777, comm);
442 /* last node receives right buffer from the root */
443 else if (rank == (root+size-1)%size) {
444 smpi_mpi_recv(tmpbuf[1], counts[1], datatype,
446 comm, MPI_STATUS_IGNORE);
448 /* everyone else exchanges buffers */
450 smpi_mpi_sendrecv( tmpbuf[lr], counts[lr], datatype,
452 tmpbuf[(lr+1)%2], counts[(lr+1)%2], datatype,
454 comm, MPI_STATUS_IGNORE);
457 return (MPI_SUCCESS);