/* selector for collective algorithms based on openmpi's default coll_tuned_decision_fixed selector */

/* Copyright (c) 2009, 2010. The SimGrid Team.
 * All rights reserved. */

/* This program is free software; you can redistribute it and/or modify it
 * under the terms of the license (GNU LGPL) which comes with this package. */
#include "colls_private.h"
12 int smpi_coll_tuned_allreduce_ompi(void *sbuf, void *rbuf, int count,
13 MPI_Datatype dtype, MPI_Op op, MPI_Comm comm)
15 size_t dsize, block_dsize;
16 int comm_size = smpi_comm_size(comm);
17 const size_t intermediate_message = 10000;
20 * Decision function based on MX results from the Grig cluster at UTK.
22 * Currently, linear, recursive doubling, and nonoverlapping algorithms
23 * can handle both commutative and non-commutative operations.
24 * Ring algorithm does not support non-commutative operations.
26 dsize = smpi_datatype_size(dtype);
27 block_dsize = dsize * count;
29 if (block_dsize < intermediate_message) {
30 return (smpi_coll_tuned_allreduce_rdb (sbuf, rbuf,
35 if( smpi_op_is_commute(op) && (count > comm_size) ) {
36 const size_t segment_size = 1 << 20; /* 1 MB */
37 if ((comm_size * segment_size >= block_dsize)) {
38 //FIXME: ok, these are not the right algorithms, try to find closer ones
39 // lr is a good match for allreduce_ring (difference is mainly the use of sendrecv)
40 return smpi_coll_tuned_allreduce_lr(sbuf, rbuf, count, dtype,
43 return (smpi_coll_tuned_allreduce_ompi_ring_segmented (sbuf, rbuf,
50 return (smpi_coll_tuned_allreduce_redbcast(sbuf, rbuf, count,
56 int smpi_coll_tuned_alltoall_ompi( void *sbuf, int scount,
58 void* rbuf, int rcount,
62 int communicator_size;
63 size_t dsize, block_dsize;
64 communicator_size = smpi_comm_size(comm);
66 /* Decision function based on measurement on Grig cluster at
67 the University of Tennessee (2GB MX) up to 64 nodes.
68 Has better performance for messages of intermediate sizes than the old one */
69 /* determine block size */
70 dsize = smpi_datatype_size(sdtype);
71 block_dsize = dsize * scount;
73 if ((block_dsize < 200) && (communicator_size > 12)) {
74 return smpi_coll_tuned_alltoall_bruck(sbuf, scount, sdtype,
78 } else if (block_dsize < 3000) {
79 return smpi_coll_tuned_alltoall_simple(sbuf, scount, sdtype,
84 return smpi_coll_tuned_alltoall_pair (sbuf, scount, sdtype,
89 int smpi_coll_tuned_alltoallv_ompi(void *sbuf, int *scounts, int *sdisps,
91 void *rbuf, int *rcounts, int *rdisps,
96 /* For starters, just keep the original algorithm. */
97 return smpi_coll_tuned_alltoallv_bruck(sbuf, scounts, sdisps, sdtype,
98 rbuf, rcounts, rdisps,rdtype,
103 void smpi_coll_tuned_barrier_ompi(MPI_Comm comm)
104 { int communicator_size = smpi_comm_size(comm);
106 if( 2 == communicator_size )
107 return smpi_coll_tuned_barrier_intra_two_procs(comm, module);
108 * Basic optimisation. If we have a power of 2 number of nodes
109 * the use the recursive doubling algorithm, otherwise
110 * bruck is the one we want.
112 bool has_one = false;
113 for( ; communicator_size > 0; communicator_size >>= 1 ) {
114 if( communicator_size & 0x1 ) {
116 return smpi_coll_tuned_barrier_intra_bruck(comm, module);
121 return smpi_coll_tuned_barrier_intra_recursivedoubling(comm, module);
124 int smpi_coll_tuned_bcast_ompi(void *buff, int count,
125 MPI_Datatype datatype, int root,
129 /* Decision function based on MX results for
130 messages up to 36MB and communicator sizes up to 64 nodes */
131 const size_t small_message_size = 2048;
132 const size_t intermediate_message_size = 370728;
133 const double a_p16 = 3.2118e-6; /* [1 / byte] */
134 const double b_p16 = 8.7936;
135 const double a_p64 = 2.3679e-6; /* [1 / byte] */
136 const double b_p64 = 1.1787;
137 const double a_p128 = 1.6134e-6; /* [1 / byte] */
138 const double b_p128 = 2.1102;
140 int communicator_size;
142 size_t message_size, dsize;
144 communicator_size = smpi_comm_size(comm);
146 /* else we need data size for decision function */
147 dsize = smpi_datatype_size(datatype);
148 message_size = dsize * (unsigned long)count; /* needed for decision */
150 /* Handle messages of small and intermediate size, and
151 single-element broadcasts */
152 if ((message_size < small_message_size) || (count <= 1)) {
153 /* Binomial without segmentation */
154 return smpi_coll_tuned_bcast_binomial_tree (buff, count, datatype,
157 } else if (message_size < intermediate_message_size) {
158 // SplittedBinary with 1KB segments
159 return smpi_coll_tuned_bcast_ompi_split_bintree(buff, count, datatype,
163 //Handle large message sizes
164 else if (communicator_size < (a_p128 * message_size + b_p128)) {
165 //Pipeline with 128KB segments
166 //segsize = 1024 << 7;
167 return smpi_coll_tuned_bcast_ompi_pipeline (buff, count, datatype,
171 } else if (communicator_size < 13) {
172 // Split Binary with 8KB segments
173 return smpi_coll_tuned_bcast_ompi_split_bintree(buff, count, datatype,
176 } else if (communicator_size < (a_p64 * message_size + b_p64)) {
177 // Pipeline with 64KB segments
178 //segsize = 1024 << 6;
179 return smpi_coll_tuned_bcast_ompi_pipeline (buff, count, datatype,
183 } else if (communicator_size < (a_p16 * message_size + b_p16)) {
184 //Pipeline with 16KB segments
185 //segsize = 1024 << 4;
186 return smpi_coll_tuned_bcast_ompi_pipeline (buff, count, datatype,
191 /* Pipeline with 8KB segments */
192 //segsize = 1024 << 3;
193 return smpi_coll_tuned_bcast_flattree_pipeline (buff, count, datatype,
197 /* this is based on gige measurements */
199 if (communicator_size < 4) {
200 return smpi_coll_tuned_bcast_intra_basic_linear (buff, count, datatype, root, comm, module);
202 if (communicator_size == 4) {
203 if (message_size < 524288) segsize = 0;
204 else segsize = 16384;
205 return smpi_coll_tuned_bcast_intra_bintree (buff, count, datatype, root, comm, module, segsize);
207 if (communicator_size <= 8 && message_size < 4096) {
208 return smpi_coll_tuned_bcast_intra_basic_linear (buff, count, datatype, root, comm, module);
210 if (communicator_size > 8 && message_size >= 32768 && message_size < 524288) {
212 return smpi_coll_tuned_bcast_intra_bintree (buff, count, datatype, root, comm, module, segsize);
214 if (message_size >= 524288) {
216 return smpi_coll_tuned_bcast_intra_pipeline (buff, count, datatype, root, comm, module, segsize);
219 /* once tested can swap this back in */
220 /* return smpi_coll_tuned_bcast_intra_bmtree (buff, count, datatype, root, comm, segsize); */
221 return smpi_coll_tuned_bcast_intra_bintree (buff, count, datatype, root, comm, module, segsize);
225 int smpi_coll_tuned_reduce_ompi( void *sendbuf, void *recvbuf,
226 int count, MPI_Datatype datatype,
231 int communicator_size=0;
233 size_t message_size, dsize;
234 const double a1 = 0.6016 / 1024.0; /* [1/B] */
235 const double b1 = 1.3496;
236 const double a2 = 0.0410 / 1024.0; /* [1/B] */
237 const double b2 = 9.7128;
238 const double a3 = 0.0422 / 1024.0; /* [1/B] */
239 const double b3 = 1.1614;
240 //const double a4 = 0.0033 / 1024.0; /* [1/B] */
241 //const double b4 = 1.6761;
243 //const int max_requests = 0; /* no limit on # of outstanding requests */
245 communicator_size = smpi_comm_size(comm);
247 /* need data size for decision function */
248 dsize=smpi_datatype_size(datatype);
249 message_size = dsize * count; /* needed for decision */
252 * If the operation is non commutative we currently have choice of linear
253 * or in-order binary tree algorithm.
255 if( !smpi_op_is_commute(op) ) {
256 if ((communicator_size < 12) && (message_size < 2048)) {
257 return smpi_coll_tuned_reduce_ompi_basic_linear (sendbuf, recvbuf, count, datatype, op, root, comm/*, module*/);
259 return smpi_coll_tuned_reduce_ompi_in_order_binary (sendbuf, recvbuf, count, datatype, op, root, comm/*, module,
263 if ((communicator_size < 8) && (message_size < 512)){
265 return smpi_coll_tuned_reduce_ompi_basic_linear (sendbuf, recvbuf, count, datatype, op, root, comm);
266 } else if (((communicator_size < 8) && (message_size < 20480)) ||
267 (message_size < 2048) || (count <= 1)) {
270 return smpi_coll_tuned_reduce_ompi_binomial(sendbuf, recvbuf, count, datatype, op, root, comm/*, module,
271 segsize, max_requests*/);
272 } else if (communicator_size > (a1 * message_size + b1)) {
275 return smpi_coll_tuned_reduce_ompi_binomial(sendbuf, recvbuf, count, datatype, op, root, comm/*, module,
276 segsize, max_requests*/);
277 } else if (communicator_size > (a2 * message_size + b2)) {
280 return smpi_coll_tuned_reduce_ompi_pipeline (sendbuf, recvbuf, count, datatype, op, root, comm/*, module,
281 segsize, max_requests*/);
282 } else if (communicator_size > (a3 * message_size + b3)) {
285 return smpi_coll_tuned_reduce_ompi_binary( sendbuf, recvbuf, count, datatype, op, root,
286 comm/*, module, segsize, max_requests*/);
288 /*if (communicator_size > (a4 * message_size + b4)) {
295 return smpi_coll_tuned_reduce_ompi_pipeline (sendbuf, recvbuf, count, datatype, op, root, comm/*, module,
296 segsize, max_requests*/);
299 /* for small messages use linear algorithm */
300 if (message_size <= 4096) {
302 fanout = communicator_size - 1;
303 /* when linear implemented or taken from basic put here, right now using chain as a linear system */
304 /* it is implemented and I shouldn't be calling a chain with a fanout bigger than MAXTREEFANOUT from topo.h! */
305 return smpi_coll_tuned_reduce_intra_basic_linear (sendbuf, recvbuf, count, datatype, op, root, comm, module);
306 /* return smpi_coll_tuned_reduce_intra_chain (sendbuf, recvbuf, count, datatype, op, root, comm, segsize, fanout); */
308 if (message_size < 524288) {
309 if (message_size <= 65536 ) {
314 fanout = communicator_size/2;
316 /* later swap this for a binary tree */
318 return smpi_coll_tuned_reduce_intra_chain (sendbuf, recvbuf, count, datatype, op, root, comm, module,
319 segsize, fanout, max_requests);
322 return smpi_coll_tuned_reduce_intra_pipeline (sendbuf, recvbuf, count, datatype, op, root, comm, module,
323 segsize, max_requests);
/*int smpi_coll_tuned_reduce_scatter_ompi( void *sbuf, void *rbuf,
334 int comm_size, i, pow2;
335 size_t total_message_size, dsize;
336 const double a = 0.0012;
337 const double b = 8.0;
338 const size_t small_message_size = 12 * 1024;
339 const size_t large_message_size = 256 * 1024;
340 bool zerocounts = false;
342 OPAL_OUTPUT((smpi_coll_tuned_stream, "smpi_coll_tuned_reduce_scatter_ompi"));
344 comm_size = smpi_comm_size(comm);
345 // We need data size for decision function
346 ompi_datatype_type_size(dtype, &dsize);
347 total_message_size = 0;
348 for (i = 0; i < comm_size; i++) {
349 total_message_size += rcounts[i];
350 if (0 == rcounts[i]) {
355 if( !ompi_op_is_commute(op) || (zerocounts)) {
356 return smpi_coll_tuned_reduce_scatter_intra_nonoverlapping (sbuf, rbuf, rcounts,
361 total_message_size *= dsize;
363 // compute the nearest power of 2
364 for (pow2 = 1; pow2 < comm_size; pow2 <<= 1);
366 if ((total_message_size <= small_message_size) ||
367 ((total_message_size <= large_message_size) && (pow2 == comm_size)) ||
368 (comm_size >= a * total_message_size + b)) {
370 smpi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(sbuf, rbuf, rcounts,
374 return smpi_coll_tuned_reduce_scatter_intra_ring(sbuf, rbuf, rcounts,
379 return smpi_coll_tuned_reduce_scatter(sbuf, rbuf, rcounts,
385 int smpi_coll_tuned_allgather_ompi(void *sbuf, int scount,
387 void* rbuf, int rcount,
392 int communicator_size, pow2_size;
393 size_t dsize, total_dsize;
395 communicator_size = smpi_comm_size(comm);
397 /* Special case for 2 processes */
398 if (communicator_size == 2) {
399 return smpi_coll_tuned_allgather_pair (sbuf, scount, sdtype,
400 rbuf, rcount, rdtype,
404 /* Determine complete data size */
405 dsize=smpi_datatype_size(sdtype);
406 total_dsize = dsize * scount * communicator_size;
408 for (pow2_size = 1; pow2_size < communicator_size; pow2_size <<=1);
410 /* Decision based on MX 2Gb results from Grig cluster at
411 The University of Tennesse, Knoxville
412 - if total message size is less than 50KB use either bruck or
413 recursive doubling for non-power of two and power of two nodes,
415 - else use ring and neighbor exchange algorithms for odd and even
416 number of nodes, respectively.
418 if (total_dsize < 50000) {
419 if (pow2_size == communicator_size) {
420 return smpi_coll_tuned_allgather_rdb(sbuf, scount, sdtype,
421 rbuf, rcount, rdtype,
424 return smpi_coll_tuned_allgather_bruck(sbuf, scount, sdtype,
425 rbuf, rcount, rdtype,
429 if (communicator_size % 2) {
430 return smpi_coll_tuned_allgather_ring(sbuf, scount, sdtype,
431 rbuf, rcount, rdtype,
434 return smpi_coll_tuned_allgather_ompi_neighborexchange(sbuf, scount, sdtype,
435 rbuf, rcount, rdtype,
440 #if defined(USE_MPICH2_DECISION)
441 /* Decision as in MPICH-2
442 presented in Thakur et.al. "Optimization of Collective Communication
443 Operations in MPICH", International Journal of High Performance Computing
444 Applications, Vol. 19, No. 1, 49-66 (2005)
445 - for power-of-two processes and small and medium size messages
446 (up to 512KB) use recursive doubling
447 - for non-power-of-two processes and small messages (80KB) use bruck,
448 - for everything else use ring.
450 if ((pow2_size == communicator_size) && (total_dsize < 524288)) {
451 return smpi_coll_tuned_allgather_intra_recursivedoubling(sbuf, scount, sdtype,
452 rbuf, rcount, rdtype,
454 } else if (total_dsize <= 81920) {
455 return smpi_coll_tuned_allgather_intra_bruck(sbuf, scount, sdtype,
456 rbuf, rcount, rdtype,
459 return smpi_coll_tuned_allgather_intra_ring(sbuf, scount, sdtype,
460 rbuf, rcount, rdtype,
462 #endif /* defined(USE_MPICH2_DECISION) */
465 int smpi_coll_tuned_allgatherv_ompi(void *sbuf, int scount,
467 void* rbuf, int *rcounts,
474 int communicator_size;
475 size_t dsize, total_dsize;
477 communicator_size = smpi_comm_size(comm);
479 /* Special case for 2 processes */
480 if (communicator_size == 2) {
481 return smpi_coll_tuned_allgatherv_pair(sbuf, scount, sdtype,
482 rbuf, rcounts, rdispls, rdtype,
486 /* Determine complete data size */
487 dsize=smpi_datatype_size(sdtype);
489 for (i = 0; i < communicator_size; i++) {
490 total_dsize += dsize * rcounts[i];
493 /* Decision based on allgather decision. */
494 if (total_dsize < 50000) {
495 /* return smpi_coll_tuned_allgatherv_intra_bruck(sbuf, scount, sdtype,
496 rbuf, rcounts, rdispls, rdtype,
498 return smpi_coll_tuned_allgatherv_ring(sbuf, scount, sdtype,
499 rbuf, rcounts, rdispls, rdtype,
503 // if (communicator_size % 2) {
504 return smpi_coll_tuned_allgatherv_ring(sbuf, scount, sdtype,
505 rbuf, rcounts, rdispls, rdtype,
508 return smpi_coll_tuned_allgatherv_intra_neighborexchange(sbuf, scount, sdtype,
509 rbuf, rcounts, rdispls, rdtype,
515 int smpi_coll_tuned_gather_ompi(void *sbuf, int scount,
517 void* rbuf, int rcount,
523 const int large_segment_size = 32768;
524 const int small_segment_size = 1024;
526 const size_t large_block_size = 92160;
527 const size_t intermediate_block_size = 6000;
528 const size_t small_block_size = 1024;
530 const int large_communicator_size = 60;
531 const int small_communicator_size = 10;
533 int communicator_size, rank;
534 size_t dsize, block_size;
536 OPAL_OUTPUT((smpi_coll_tuned_stream,
537 "smpi_coll_tuned_gather_ompi"));
539 communicator_size = smpi_comm_size(comm);
540 rank = ompi_comm_rank(comm);
542 // Determine block size
544 ompi_datatype_type_size(rdtype, &dsize);
545 block_size = dsize * rcount;
547 ompi_datatype_type_size(sdtype, &dsize);
548 block_size = dsize * scount;
551 if (block_size > large_block_size) {
552 return smpi_coll_tuned_gather_intra_linear_sync (sbuf, scount, sdtype,
553 rbuf, rcount, rdtype,
557 } else if (block_size > intermediate_block_size) {
558 return smpi_coll_tuned_gather_intra_linear_sync (sbuf, scount, sdtype,
559 rbuf, rcount, rdtype,
563 } else if ((communicator_size > large_communicator_size) ||
564 ((communicator_size > small_communicator_size) &&
565 (block_size < small_block_size))) {
566 return smpi_coll_tuned_gather_intra_binomial (sbuf, scount, sdtype,
567 rbuf, rcount, rdtype,
571 // Otherwise, use basic linear
572 return smpi_coll_tuned_gather_intra_basic_linear (sbuf, scount, sdtype,
573 rbuf, rcount, rdtype,
577 int smpi_coll_tuned_scatter_ompi(void *sbuf, int scount,
579 void* rbuf, int rcount,
581 int root, MPI_Comm comm,
584 const size_t small_block_size = 300;
585 const int small_comm_size = 10;
586 int communicator_size, rank;
587 size_t dsize, block_size;
589 OPAL_OUTPUT((smpi_coll_tuned_stream,
590 "smpi_coll_tuned_scatter_ompi"));
592 communicator_size = smpi_comm_size(comm);
593 rank = ompi_comm_rank(comm);
594 // Determine block size
596 ompi_datatype_type_size(sdtype, &dsize);
597 block_size = dsize * scount;
599 ompi_datatype_type_size(rdtype, &dsize);
600 block_size = dsize * rcount;
603 if ((communicator_size > small_comm_size) &&
604 (block_size < small_block_size)) {
605 return smpi_coll_tuned_scatter_intra_binomial (sbuf, scount, sdtype,
606 rbuf, rcount, rdtype,
609 return smpi_coll_tuned_scatter_intra_basic_linear (sbuf, scount, sdtype,
610 rbuf, rcount, rdtype,