/* selector for collective algorithms based on openmpi's default coll_tuned_decision_fixed selector */

/* Copyright (c) 2009, 2010. The SimGrid Team.
 * All rights reserved. */

/* This program is free software; you can redistribute it and/or modify it
 * under the terms of the license (GNU LGPL) which comes with this package. */

#include "colls_private.h"
12 int smpi_coll_tuned_allreduce_ompi(void *sbuf, void *rbuf, int count,
13 MPI_Datatype dtype, MPI_Op op, MPI_Comm comm)
15 size_t dsize, block_dsize;
16 int comm_size = smpi_comm_size(comm);
17 const size_t intermediate_message = 10000;
20 * Decision function based on MX results from the Grig cluster at UTK.
22 * Currently, linear, recursive doubling, and nonoverlapping algorithms
23 * can handle both commutative and non-commutative operations.
24 * Ring algorithm does not support non-commutative operations.
26 dsize = smpi_datatype_size(dtype);
27 block_dsize = dsize * count;
29 if (block_dsize < intermediate_message) {
30 return (smpi_coll_tuned_allreduce_rdb (sbuf, rbuf,
35 if( smpi_op_is_commute(op) && (count > comm_size) ) {
36 const size_t segment_size = 1 << 20; /* 1 MB */
37 if ((comm_size * segment_size >= block_dsize)) {
38 //FIXME: ok, these are not the right algorithms, try to find closer ones
39 // lr is a good match for allreduce_ring (difference is mainly the use of sendrecv)
40 return smpi_coll_tuned_allreduce_lr(sbuf, rbuf, count, dtype,
43 // return (smpi_coll_tuned_allreduce_intra_ring_segmented (sbuf, rbuf,
44 return (smpi_coll_tuned_allreduce_ompi_ring_segmented (sbuf, rbuf,
51 return (smpi_coll_tuned_allreduce_redbcast(sbuf, rbuf, count,
57 int smpi_coll_tuned_alltoall_ompi( void *sbuf, int scount,
59 void* rbuf, int rcount,
63 int communicator_size;
64 size_t dsize, block_dsize;
65 communicator_size = smpi_comm_size(comm);
67 /* Decision function based on measurement on Grig cluster at
68 the University of Tennessee (2GB MX) up to 64 nodes.
69 Has better performance for messages of intermediate sizes than the old one */
70 /* determine block size */
71 dsize = smpi_datatype_size(sdtype);
72 block_dsize = dsize * scount;
74 if ((block_dsize < 200) && (communicator_size > 12)) {
75 return smpi_coll_tuned_alltoall_bruck(sbuf, scount, sdtype,
79 } else if (block_dsize < 3000) {
80 return smpi_coll_tuned_alltoall_simple(sbuf, scount, sdtype,
85 return smpi_coll_tuned_alltoall_pair (sbuf, scount, sdtype,
90 int smpi_coll_tuned_alltoallv_ompi(void *sbuf, int *scounts, int *sdisps,
92 void *rbuf, int *rcounts, int *rdisps,
97 /* For starters, just keep the original algorithm. */
98 return smpi_coll_tuned_alltoallv_bruck(sbuf, scounts, sdisps, sdtype,
99 rbuf, rcounts, rdisps,rdtype,
104 void smpi_coll_tuned_barrier_ompi(MPI_Comm comm)
105 { int communicator_size = smpi_comm_size(comm);
107 if( 2 == communicator_size )
108 return smpi_coll_tuned_barrier_intra_two_procs(comm, module);
109 * Basic optimisation. If we have a power of 2 number of nodes
110 * the use the recursive doubling algorithm, otherwise
111 * bruck is the one we want.
113 bool has_one = false;
114 for( ; communicator_size > 0; communicator_size >>= 1 ) {
115 if( communicator_size & 0x1 ) {
117 return smpi_coll_tuned_barrier_intra_bruck(comm, module);
122 return smpi_coll_tuned_barrier_intra_recursivedoubling(comm, module);
125 int smpi_coll_tuned_bcast_ompi(void *buff, int count,
126 MPI_Datatype datatype, int root,
130 /* Decision function based on MX results for
131 messages up to 36MB and communicator sizes up to 64 nodes */
132 const size_t small_message_size = 2048;
133 const size_t intermediate_message_size = 370728;
134 //const double a_p16 = 3.2118e-6; /* [1 / byte] */
135 //const double b_p16 = 8.7936;
136 //const double a_p64 = 2.3679e-6; /* [1 / byte] */
137 //const double b_p64 = 1.1787;
138 //const double a_p128 = 1.6134e-6; /* [1 / byte] */
139 //const double b_p128 = 2.1102;
141 int communicator_size;
143 size_t message_size, dsize;
145 communicator_size = smpi_comm_size(comm);
147 /* else we need data size for decision function */
148 dsize = smpi_datatype_size(datatype);
149 message_size = dsize * (unsigned long)count; /* needed for decision */
151 /* Handle messages of small and intermediate size, and
152 single-element broadcasts */
153 if ((message_size < small_message_size) || (count <= 1)) {
154 /* Binomial without segmentation */
155 return smpi_coll_tuned_bcast_binomial_tree (buff, count, datatype,
158 } else if (message_size < intermediate_message_size) {
159 // SplittedBinary with 1KB segments
160 return smpi_coll_tuned_bcast_ompi_split_bintree(buff, count, datatype,
164 Handle large message sizes
165 else if (communicator_size < (a_p128 * message_size + b_p128)) {
166 Pipeline with 128KB segments
168 return smpi_coll_tuned_bcast_flattree_pipeline (buff, count, datatype,
172 }*/ else if (communicator_size < 13) {
173 // Split Binary with 8KB segments
174 return smpi_coll_tuned_bcast_ompi_split_bintree(buff, count, datatype,
177 } /*else if (communicator_size < (a_p64 * message_size + b_p64)) {
178 // Pipeline with 64KB segments
180 return smpi_coll_tuned_bcast_intra_pipeline (buff, count, datatype,
184 } else if (communicator_size < (a_p16 * message_size + b_p16)) {
185 Pipeline with 16KB segments
186 //segsize = 1024 << 4;
187 return smpi_coll_tuned_bcast_flattree_pipeline (buff, count, datatype,
193 /* Pipeline with 8KB segments */
194 //segsize = 1024 << 3;
195 return smpi_coll_tuned_bcast_flattree_pipeline (buff, count, datatype,
199 /* this is based on gige measurements */
201 if (communicator_size < 4) {
202 return smpi_coll_tuned_bcast_intra_basic_linear (buff, count, datatype, root, comm, module);
204 if (communicator_size == 4) {
205 if (message_size < 524288) segsize = 0;
206 else segsize = 16384;
207 return smpi_coll_tuned_bcast_intra_bintree (buff, count, datatype, root, comm, module, segsize);
209 if (communicator_size <= 8 && message_size < 4096) {
210 return smpi_coll_tuned_bcast_intra_basic_linear (buff, count, datatype, root, comm, module);
212 if (communicator_size > 8 && message_size >= 32768 && message_size < 524288) {
214 return smpi_coll_tuned_bcast_intra_bintree (buff, count, datatype, root, comm, module, segsize);
216 if (message_size >= 524288) {
218 return smpi_coll_tuned_bcast_intra_pipeline (buff, count, datatype, root, comm, module, segsize);
221 /* once tested can swap this back in */
222 /* return smpi_coll_tuned_bcast_intra_bmtree (buff, count, datatype, root, comm, segsize); */
223 return smpi_coll_tuned_bcast_intra_bintree (buff, count, datatype, root, comm, module, segsize);
227 int smpi_coll_tuned_reduce_ompi( void *sendbuf, void *recvbuf,
228 int count, MPI_Datatype datatype,
233 int communicator_size=0;
235 size_t message_size, dsize;
236 //const double a1 = 0.6016 / 1024.0; /* [1/B] */
237 //const double b1 = 1.3496;
238 //const double a2 = 0.0410 / 1024.0; /* [1/B] */
239 //const double b2 = 9.7128;
240 //const double a3 = 0.0422 / 1024.0; /* [1/B] */
241 //const double b3 = 1.1614;
242 //const double a4 = 0.0033 / 1024.0; /* [1/B] */
243 //const double b4 = 1.6761;
245 //const int max_requests = 0; /* no limit on # of outstanding requests */
247 communicator_size = smpi_comm_size(comm);
249 /* need data size for decision function */
250 dsize=smpi_datatype_size(datatype);
251 message_size = dsize * count; /* needed for decision */
254 * If the operation is non commutative we currently have choice of linear
255 * or in-order binary tree algorithm.
257 /* if( !ompi_op_is_commute(op) ) {
258 if ((communicator_size < 12) && (message_size < 2048)) {
259 return smpi_coll_tuned_reduce_intra_basic_linear (sendbuf, recvbuf, count, datatype, op, root, comm, module);
261 return smpi_coll_tuned_reduce_intra_in_order_binary (sendbuf, recvbuf, count, datatype, op, root, comm, module,
265 if ((communicator_size < 8) && (message_size < 512)){
267 return smpi_coll_tuned_reduce_flat_tree (sendbuf, recvbuf, count, datatype, op, root, comm);
268 } else if (((communicator_size < 8) && (message_size < 20480)) ||
269 (message_size < 2048) || (count <= 1)) {
272 return smpi_coll_tuned_reduce_binomial(sendbuf, recvbuf, count, datatype, op, root, comm/*, module,
273 segsize, max_requests*/);
274 } /*else if (communicator_size > (a1 * message_size + b1)) {
277 return smpi_coll_tuned_reduce_intra_binomial(sendbuf, recvbuf, count, datatype, op, root, comm, module,
278 segsize, max_requests);
279 } else if (communicator_size > (a2 * message_size + b2)) {
282 return smpi_coll_tuned_reduce_NTSL (sendbuf, recvbuf, count, datatype, op, root, comm, module,
283 segsize, max_requests);
284 } else if (communicator_size > (a3 * message_size + b3)) {
287 return smpi_coll_tuned_reduce_intra_binary( sendbuf, recvbuf, count, datatype, op, root,
288 comm, module, segsize, max_requests);
290 if (communicator_size > (a4 * message_size + b4)) {
297 return smpi_coll_tuned_reduce_NTSL (sendbuf, recvbuf, count, datatype, op, root, comm/*, module,
298 segsize, max_requests*/);
301 /* for small messages use linear algorithm */
302 if (message_size <= 4096) {
304 fanout = communicator_size - 1;
305 /* when linear implemented or taken from basic put here, right now using chain as a linear system */
306 /* it is implemented and I shouldn't be calling a chain with a fanout bigger than MAXTREEFANOUT from topo.h! */
307 return smpi_coll_tuned_reduce_intra_basic_linear (sendbuf, recvbuf, count, datatype, op, root, comm, module);
308 /* return smpi_coll_tuned_reduce_intra_chain (sendbuf, recvbuf, count, datatype, op, root, comm, segsize, fanout); */
310 if (message_size < 524288) {
311 if (message_size <= 65536 ) {
316 fanout = communicator_size/2;
318 /* later swap this for a binary tree */
320 return smpi_coll_tuned_reduce_intra_chain (sendbuf, recvbuf, count, datatype, op, root, comm, module,
321 segsize, fanout, max_requests);
324 return smpi_coll_tuned_reduce_intra_pipeline (sendbuf, recvbuf, count, datatype, op, root, comm, module,
325 segsize, max_requests);
329 /*int smpi_coll_tuned_reduce_scatter_ompi( void *sbuf, void *rbuf,
336 int comm_size, i, pow2;
337 size_t total_message_size, dsize;
338 const double a = 0.0012;
339 const double b = 8.0;
340 const size_t small_message_size = 12 * 1024;
341 const size_t large_message_size = 256 * 1024;
342 bool zerocounts = false;
344 OPAL_OUTPUT((smpi_coll_tuned_stream, "smpi_coll_tuned_reduce_scatter_ompi"));
346 comm_size = smpi_comm_size(comm);
347 // We need data size for decision function
348 ompi_datatype_type_size(dtype, &dsize);
349 total_message_size = 0;
350 for (i = 0; i < comm_size; i++) {
351 total_message_size += rcounts[i];
352 if (0 == rcounts[i]) {
357 if( !ompi_op_is_commute(op) || (zerocounts)) {
358 return smpi_coll_tuned_reduce_scatter_intra_nonoverlapping (sbuf, rbuf, rcounts,
363 total_message_size *= dsize;
365 // compute the nearest power of 2
366 for (pow2 = 1; pow2 < comm_size; pow2 <<= 1);
368 if ((total_message_size <= small_message_size) ||
369 ((total_message_size <= large_message_size) && (pow2 == comm_size)) ||
370 (comm_size >= a * total_message_size + b)) {
372 smpi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(sbuf, rbuf, rcounts,
376 return smpi_coll_tuned_reduce_scatter_intra_ring(sbuf, rbuf, rcounts,
381 return smpi_coll_tuned_reduce_scatter(sbuf, rbuf, rcounts,
387 int smpi_coll_tuned_allgather_ompi(void *sbuf, int scount,
389 void* rbuf, int rcount,
394 int communicator_size, pow2_size;
395 size_t dsize, total_dsize;
397 communicator_size = smpi_comm_size(comm);
399 /* Special case for 2 processes */
400 if (communicator_size == 2) {
401 return smpi_coll_tuned_allgather_pair (sbuf, scount, sdtype,
402 rbuf, rcount, rdtype,
406 /* Determine complete data size */
407 dsize=smpi_datatype_size(sdtype);
408 total_dsize = dsize * scount * communicator_size;
410 for (pow2_size = 1; pow2_size < communicator_size; pow2_size <<=1);
412 /* Decision based on MX 2Gb results from Grig cluster at
413 The University of Tennesse, Knoxville
414 - if total message size is less than 50KB use either bruck or
415 recursive doubling for non-power of two and power of two nodes,
417 - else use ring and neighbor exchange algorithms for odd and even
418 number of nodes, respectively.
420 if (total_dsize < 50000) {
421 if (pow2_size == communicator_size) {
422 return smpi_coll_tuned_allgather_rdb(sbuf, scount, sdtype,
423 rbuf, rcount, rdtype,
426 return smpi_coll_tuned_allgather_bruck(sbuf, scount, sdtype,
427 rbuf, rcount, rdtype,
431 //if (communicator_size % 2) {
432 return smpi_coll_tuned_allgather_ring(sbuf, scount, sdtype,
433 rbuf, rcount, rdtype,
436 return smpi_coll_tuned_allgather_intra_neighborexchange(sbuf, scount, sdtype,
437 rbuf, rcount, rdtype,
442 #if defined(USE_MPICH2_DECISION)
443 /* Decision as in MPICH-2
444 presented in Thakur et.al. "Optimization of Collective Communication
445 Operations in MPICH", International Journal of High Performance Computing
446 Applications, Vol. 19, No. 1, 49-66 (2005)
447 - for power-of-two processes and small and medium size messages
448 (up to 512KB) use recursive doubling
449 - for non-power-of-two processes and small messages (80KB) use bruck,
450 - for everything else use ring.
452 if ((pow2_size == communicator_size) && (total_dsize < 524288)) {
453 return smpi_coll_tuned_allgather_intra_recursivedoubling(sbuf, scount, sdtype,
454 rbuf, rcount, rdtype,
456 } else if (total_dsize <= 81920) {
457 return smpi_coll_tuned_allgather_intra_bruck(sbuf, scount, sdtype,
458 rbuf, rcount, rdtype,
461 return smpi_coll_tuned_allgather_intra_ring(sbuf, scount, sdtype,
462 rbuf, rcount, rdtype,
464 #endif /* defined(USE_MPICH2_DECISION) */
467 int smpi_coll_tuned_allgatherv_ompi(void *sbuf, int scount,
469 void* rbuf, int *rcounts,
476 int communicator_size;
477 size_t dsize, total_dsize;
479 communicator_size = smpi_comm_size(comm);
481 /* Special case for 2 processes */
482 if (communicator_size == 2) {
483 return smpi_coll_tuned_allgatherv_pair(sbuf, scount, sdtype,
484 rbuf, rcounts, rdispls, rdtype,
488 /* Determine complete data size */
489 dsize=smpi_datatype_size(sdtype);
491 for (i = 0; i < communicator_size; i++) {
492 total_dsize += dsize * rcounts[i];
495 /* Decision based on allgather decision. */
496 if (total_dsize < 50000) {
497 /* return smpi_coll_tuned_allgatherv_intra_bruck(sbuf, scount, sdtype,
498 rbuf, rcounts, rdispls, rdtype,
500 return smpi_coll_tuned_allgatherv_ring(sbuf, scount, sdtype,
501 rbuf, rcounts, rdispls, rdtype,
505 // if (communicator_size % 2) {
506 return smpi_coll_tuned_allgatherv_ring(sbuf, scount, sdtype,
507 rbuf, rcounts, rdispls, rdtype,
510 return smpi_coll_tuned_allgatherv_intra_neighborexchange(sbuf, scount, sdtype,
511 rbuf, rcounts, rdispls, rdtype,
517 int smpi_coll_tuned_gather_ompi(void *sbuf, int scount,
519 void* rbuf, int rcount,
525 const int large_segment_size = 32768;
526 const int small_segment_size = 1024;
528 const size_t large_block_size = 92160;
529 const size_t intermediate_block_size = 6000;
530 const size_t small_block_size = 1024;
532 const int large_communicator_size = 60;
533 const int small_communicator_size = 10;
535 int communicator_size, rank;
536 size_t dsize, block_size;
538 OPAL_OUTPUT((smpi_coll_tuned_stream,
539 "smpi_coll_tuned_gather_ompi"));
541 communicator_size = smpi_comm_size(comm);
542 rank = ompi_comm_rank(comm);
544 // Determine block size
546 ompi_datatype_type_size(rdtype, &dsize);
547 block_size = dsize * rcount;
549 ompi_datatype_type_size(sdtype, &dsize);
550 block_size = dsize * scount;
553 if (block_size > large_block_size) {
554 return smpi_coll_tuned_gather_intra_linear_sync (sbuf, scount, sdtype,
555 rbuf, rcount, rdtype,
559 } else if (block_size > intermediate_block_size) {
560 return smpi_coll_tuned_gather_intra_linear_sync (sbuf, scount, sdtype,
561 rbuf, rcount, rdtype,
565 } else if ((communicator_size > large_communicator_size) ||
566 ((communicator_size > small_communicator_size) &&
567 (block_size < small_block_size))) {
568 return smpi_coll_tuned_gather_intra_binomial (sbuf, scount, sdtype,
569 rbuf, rcount, rdtype,
573 // Otherwise, use basic linear
574 return smpi_coll_tuned_gather_intra_basic_linear (sbuf, scount, sdtype,
575 rbuf, rcount, rdtype,
579 int smpi_coll_tuned_scatter_ompi(void *sbuf, int scount,
581 void* rbuf, int rcount,
583 int root, MPI_Comm comm,
586 const size_t small_block_size = 300;
587 const int small_comm_size = 10;
588 int communicator_size, rank;
589 size_t dsize, block_size;
591 OPAL_OUTPUT((smpi_coll_tuned_stream,
592 "smpi_coll_tuned_scatter_ompi"));
594 communicator_size = smpi_comm_size(comm);
595 rank = ompi_comm_rank(comm);
596 // Determine block size
598 ompi_datatype_type_size(sdtype, &dsize);
599 block_size = dsize * scount;
601 ompi_datatype_type_size(rdtype, &dsize);
602 block_size = dsize * rcount;
605 if ((communicator_size > small_comm_size) &&
606 (block_size < small_block_size)) {
607 return smpi_coll_tuned_scatter_intra_binomial (sbuf, scount, sdtype,
608 rbuf, rcount, rdtype,
611 return smpi_coll_tuned_scatter_intra_basic_linear (sbuf, scount, sdtype,
612 rbuf, rcount, rdtype,