1 /* Copyright (c) 2013-2014. The SimGrid Team.
2 * All rights reserved. */
4 /* This program is free software; you can redistribute it and/or modify it
5 * under the terms of the license (GNU LGPL) which comes with this package. */
8 * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
9 * University Research and Technology
10 * Corporation. All rights reserved.
11 * Copyright (c) 2004-2005 The University of Tennessee and The University
12 * of Tennessee Research Foundation. All rights
14 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
15 * University of Stuttgart. All rights reserved.
16 * Copyright (c) 2004-2005 The Regents of the University of California.
17 * All rights reserved.
19 * Additional copyrights may follow
22 #include "colls_private.h"
23 #include "coll_tuned_topo.h"
25 * Some static helpers.
27 static int pown( int fanout, int num )
30 if( num < 0 ) return 0;
31 if (1==num) return fanout;
36 for( j = 0; j < num; j++ ) { p*= fanout; }
41 static int calculate_level( int fanout, int rank )
44 if( rank < 0 ) return -1;
45 for( level = 0, num = 0; num <= rank; level++ ) {
46 num += pown(fanout, level);
51 static int calculate_num_nodes_up_to_level( int fanout, int level )
53 /* just use geometric progression formula for sum:
54 a^0+a^1+...a^(n-1) = (a^n-1)/(a-1) */
55 return ((pown(fanout,level) - 1)/(fanout - 1));
59 * And now the building functions.
61 * An example for fanout = 2, comm_size = 7
63 * 0 <-- delta = 1 (fanout^0)
65 * 1 2 <-- delta = 2 (fanout^1)
67 * 3 5 4 6 <-- delta = 4 (fanout^2)
71 ompi_coll_tuned_topo_build_tree( int fanout,
77 int level; /* location of my rank in the tree structure of size */
78 int delta; /* number of nodes on my level */
79 int slimit; /* total number of nodes on levels above me */
82 ompi_coll_tree_t* tree;
84 XBT_DEBUG("coll:tuned:topo_build_tree Building fo %d rt %d", fanout, root);
87 XBT_DEBUG("coll:tuned:topo_build_tree invalid fanout %d", fanout);
90 if (fanout>MAXTREEFANOUT) {
91 XBT_DEBUG("coll:tuned:topo_build_tree invalid fanout %d bigger than max %d", fanout, MAXTREEFANOUT);
96 * Get size and rank of the process in this communicator
98 size = smpi_comm_size(comm);
99 rank = smpi_comm_rank(comm);
101 tree = (ompi_coll_tree_t*)malloc(sizeof(ompi_coll_tree_t));
103 XBT_DEBUG("coll:tuned:topo_build_tree PANIC::out of memory");
107 tree->tree_root = MPI_UNDEFINED;
108 tree->tree_nextsize = MPI_UNDEFINED;
113 tree->tree_root = root;
118 tree->tree_fanout = fanout;
119 tree->tree_bmtree = 0;
120 tree->tree_root = root;
121 tree->tree_prev = -1;
122 tree->tree_nextsize = 0;
123 for( i = 0; i < fanout; i++ ) {
124 tree->tree_next[i] = -1;
127 /* return if we have less than 2 processes */
133 * Shift all ranks by root, so that the algorithm can be
134 * designed as if root would be always 0
135 * shiftedrank should be used in calculating distances
136 * and position in tree
138 shiftedrank = rank - root;
139 if( shiftedrank < 0 ) {
143 /* calculate my level */
144 level = calculate_level( fanout, shiftedrank );
145 delta = pown( fanout, level );
147 /* find my children */
148 for( i = 0; i < fanout; i++ ) {
149 schild = shiftedrank + delta * (i+1);
150 if( schild < size ) {
151 tree->tree_next[i] = (schild+root)%size;
152 tree->tree_nextsize = tree->tree_nextsize + 1;
159 slimit = calculate_num_nodes_up_to_level( fanout, level );
160 sparent = shiftedrank;
161 if( sparent < fanout ) {
164 while( sparent >= slimit ) {
165 sparent -= delta/fanout;
168 tree->tree_prev = (sparent+root)%size;
174 * Constructs in-order binary tree which can be used for non-commutative reduce
176 * Root of this tree is always rank (size-1) and fanout is 2.
177 * Here are some of the examples of this tree:
178 * size == 2 size == 3 size == 4 size == 9
188 ompi_coll_tuned_topo_build_in_order_bintree( MPI_Comm comm )
191 int myrank, rightsize, delta;
192 int parent, lchild, rchild;
193 ompi_coll_tree_t* tree;
196 * Get size and rank of the process in this communicator
198 size = smpi_comm_size(comm);
199 rank = smpi_comm_rank(comm);
201 tree = (ompi_coll_tree_t*)malloc(sizeof(ompi_coll_tree_t));
204 "coll:tuned:topo_build_tree PANIC::out of memory");
208 tree->tree_root = MPI_UNDEFINED;
209 tree->tree_nextsize = MPI_UNDEFINED;
214 tree->tree_fanout = 2;
215 tree->tree_bmtree = 0;
216 tree->tree_root = size - 1;
217 tree->tree_prev = -1;
218 tree->tree_nextsize = 0;
219 tree->tree_next[0] = -1;
220 tree->tree_next[1] = -1;
222 "coll:tuned:topo_build_in_order_tree Building fo %d rt %d",
223 tree->tree_fanout, tree->tree_root);
233 /* Compute the size of the right subtree */
234 rightsize = size >> 1;
236 /* Determine the left and right child of this parent */
242 rchild = rightsize - 1;
246 /* The following cases are possible: myrank can be
248 - belong to the left subtree, or
249 - belong to the right subtee
250 Each of the cases need to be handled differently.
253 if (myrank == parent) {
255 - compute real ranks of my children, and exit the loop. */
256 if (lchild >= 0) tree->tree_next[0] = lchild + delta;
257 if (rchild >= 0) tree->tree_next[1] = rchild + delta;
260 if (myrank > rchild) {
261 /* I belong to the left subtree:
262 - If I am the left child, compute real rank of my parent
263 - Iterate down through tree:
264 compute new size, shift ranks down, and update delta.
266 if (myrank == lchild) {
267 tree->tree_prev = parent + delta;
269 size = size - rightsize - 1;
270 delta = delta + rightsize;
271 myrank = myrank - rightsize;
275 /* I belong to the right subtree:
276 - If I am the right child, compute real rank of my parent
277 - Iterate down through tree:
278 compute new size and parent,
279 but the delta and rank do not need to change.
281 if (myrank == rchild) {
282 tree->tree_prev = parent + delta;
289 if (tree->tree_next[0] >= 0) { tree->tree_nextsize = 1; }
290 if (tree->tree_next[1] >= 0) { tree->tree_nextsize += 1; }
295 int ompi_coll_tuned_topo_destroy_tree( ompi_coll_tree_t** tree )
297 ompi_coll_tree_t *ptr;
299 if ((!tree)||(!*tree)) {
306 *tree = NULL; /* mark tree as gone */
313 * Here are some of the examples of this tree:
314 * size == 2 size = 4 size = 8
324 ompi_coll_tuned_topo_build_bmtree( MPI_Comm comm,
333 ompi_coll_tree_t *bmtree;
336 XBT_DEBUG("coll:tuned:topo:build_bmtree rt %d", root);
339 * Get size and rank of the process in this communicator
341 size = smpi_comm_size(comm);
342 rank = smpi_comm_rank(comm);
346 bmtree = (ompi_coll_tree_t*)malloc(sizeof(ompi_coll_tree_t));
348 XBT_DEBUG("coll:tuned:topo:build_bmtree PANIC out of memory");
352 bmtree->tree_bmtree = 1;
354 bmtree->tree_root = MPI_UNDEFINED;
355 bmtree->tree_nextsize = MPI_UNDEFINED;
356 for(i=0;i<MAXTREEFANOUT;i++) {
357 bmtree->tree_next[i] = -1;
360 if( index < 0 ) index += size;
362 while( mask <= index ) mask <<= 1;
364 /* Now I can compute my father rank */
366 bmtree->tree_prev = root;
368 remote = (index ^ (mask >> 1)) + root;
369 if( remote >= size ) remote -= size;
370 bmtree->tree_prev = remote;
372 /* And now let's fill my childs */
373 while( mask < size ) {
374 remote = (index ^ mask);
375 if( remote >= size ) break;
377 if( remote >= size ) remote -= size;
378 if (childs==MAXTREEFANOUT) {
379 XBT_DEBUG("coll:tuned:topo:build_bmtree max fanout incorrect %d needed %d", MAXTREEFANOUT, childs);
382 bmtree->tree_next[childs] = remote;
386 bmtree->tree_nextsize = childs;
387 bmtree->tree_root = root;
392 * Constructs in-order binomial tree which can be used for gather/scatter
395 * Here are some of the examples of this tree:
396 * size == 2 size = 4 size = 8
406 ompi_coll_tuned_topo_build_in_order_bmtree( MPI_Comm comm,
414 ompi_coll_tree_t *bmtree;
417 XBT_DEBUG("coll:tuned:topo:build_in_order_bmtree rt %d", root);
420 * Get size and rank of the process in this communicator
422 size = smpi_comm_size(comm);
423 rank = smpi_comm_rank(comm);
425 vrank = (rank - root + size) % size;
427 bmtree = (ompi_coll_tree_t*)xbt_malloc(sizeof(ompi_coll_tree_t));
429 XBT_DEBUG("coll:tuned:topo:build_bmtree PANIC out of memory");
433 bmtree->tree_bmtree = 1;
434 bmtree->tree_root = MPI_UNDEFINED;
435 bmtree->tree_nextsize = MPI_UNDEFINED;
436 for(i=0;i<MAXTREEFANOUT;i++) {
437 bmtree->tree_next[i] = -1;
441 bmtree->tree_prev = root;
444 while (mask < size) {
445 remote = vrank ^ mask;
446 if (remote < vrank) {
447 bmtree->tree_prev = (remote + root) % size;
449 } else if (remote < size) {
450 bmtree->tree_next[childs] = (remote + root) % size;
452 if (childs==MAXTREEFANOUT) {
454 "coll:tuned:topo:build_bmtree max fanout incorrect %d needed %d",
455 MAXTREEFANOUT, childs);
461 bmtree->tree_nextsize = childs;
462 bmtree->tree_root = root;
469 ompi_coll_tuned_topo_build_chain( int fanout,
474 int srank; /* shifted rank */
477 ompi_coll_tree_t *chain;
479 XBT_DEBUG("coll:tuned:topo:build_chain fo %d rt %d", fanout, root);
482 * Get size and rank of the process in this communicator
484 size = smpi_comm_size(comm);
485 rank = smpi_comm_rank(comm);
488 XBT_DEBUG("coll:tuned:topo:build_chain WARNING invalid fanout of ZERO, forcing to 1 (pipeline)!");
491 if (fanout>MAXTREEFANOUT) {
492 XBT_DEBUG("coll:tuned:topo:build_chain WARNING invalid fanout %d bigger than max %d, forcing to max!", fanout, MAXTREEFANOUT);
493 fanout = MAXTREEFANOUT;
497 * Allocate space for topology arrays if needed
499 chain = (ompi_coll_tree_t*)malloc( sizeof(ompi_coll_tree_t) );
501 XBT_DEBUG("coll:tuned:topo:build_chain PANIC out of memory");
505 chain->tree_root = MPI_UNDEFINED;
506 chain->tree_nextsize = -1;
507 for(i=0;i<fanout;i++) chain->tree_next[i] = -1;
510 * Set root & numchain
512 chain->tree_root = root;
513 if( (size - 1) < fanout ) {
514 chain->tree_nextsize = size-1;
517 chain->tree_nextsize = fanout;
524 if (srank < 0) srank += size;
527 * Special case - fanout == 1
530 if( srank == 0 ) chain->tree_prev = -1;
531 else chain->tree_prev = (srank-1+root)%size;
533 if( (srank + 1) >= size) {
534 chain->tree_next[0] = -1;
535 chain->tree_nextsize = 0;
537 chain->tree_next[0] = (srank+1+root)%size;
538 chain->tree_nextsize = 1;
543 /* Let's handle the case where there is just one node in the communicator */
545 chain->tree_next[0] = -1;
546 chain->tree_nextsize = 0;
547 chain->tree_prev = -1;
551 * Calculate maximum chain length
553 maxchainlen = (size-1) / fanout;
554 if( (size-1) % fanout != 0 ) {
556 mark = (size-1)%fanout;
562 * Find your own place in the list of shifted ranks
566 if( srank-1 < (mark * maxchainlen) ) {
567 column = (srank-1)/maxchainlen;
568 head = 1+column*maxchainlen;
571 column = mark + (srank-1-mark*maxchainlen)/(maxchainlen-1);
572 head = mark*maxchainlen+1+(column-mark)*(maxchainlen-1);
576 if( srank == head ) {
577 chain->tree_prev = 0; /*root*/
579 chain->tree_prev = srank-1; /* rank -1 */
581 if( srank == (head + len - 1) ) {
582 chain->tree_next[0] = -1;
583 chain->tree_nextsize = 0;
585 if( (srank + 1) < size ) {
586 chain->tree_next[0] = srank+1;
587 chain->tree_nextsize = 1;
589 chain->tree_next[0] = -1;
590 chain->tree_nextsize = 0;
599 chain->tree_prev = -1;
600 chain->tree_next[0] = (root+1)%size;
601 for( i = 1; i < fanout; i++ ) {
602 chain->tree_next[i] = chain->tree_next[i-1] + maxchainlen;
604 chain->tree_next[i]--;
606 chain->tree_next[i] %= size;
608 chain->tree_nextsize = fanout;
610 chain->tree_prev = (chain->tree_prev+root)%size;
611 if( chain->tree_next[0] != -1 ) {
612 chain->tree_next[0] = (chain->tree_next[0]+root)%size;
619 int ompi_coll_tuned_topo_dump_tree (ompi_coll_tree_t* tree, int rank)
623 XBT_DEBUG("coll:tuned:topo:topo_dump_tree %1d tree root %d"
624 " fanout %d BM %1d nextsize %d prev %d",
625 rank, tree->tree_root, tree->tree_bmtree, tree->tree_fanout,
626 tree->tree_nextsize, tree->tree_prev);
627 if( tree->tree_nextsize ) {
628 for( i = 0; i < tree->tree_nextsize; i++ )
629 XBT_DEBUG("[%1d] %d", i, tree->tree_next[i]);