1 /* selector for collective algorithms based on openmpi's default coll_tuned_decision_fixed selector */
3 /* Copyright (c) 2009, 2010. The SimGrid Team.
4 * All rights reserved. */
6 /* This program is free software; you can redistribute it and/or modify it
7 * under the terms of the license (GNU LGPL) which comes with this package. */
9 #include "colls_private.h"
// Allreduce algorithm selector (OMPI "tuned" fixed decision): recursive
// doubling for payloads under ~10 KB; for larger payloads with count >
// comm_size, a ring-style algorithm (lr) when comm_size 1 MB segments cover
// the payload, rab2 otherwise; reduce+broadcast as the final fallback.
// NOTE(review): this extract is garbled -- original line numbers are fused
// into each line and interior lines (braces, trailing call arguments) are
// missing. Documenting in place only; verify against the original file.
12 int smpi_coll_tuned_allreduce_ompi(void *sbuf, void *rbuf, int count,
13 MPI_Datatype dtype, MPI_Op op, MPI_Comm comm)
15 size_t dsize, block_dsize;
16 int comm_size = smpi_comm_size(comm);
// Byte threshold below which recursive doubling is chosen.
17 const size_t intermediate_message = 10000;
20 * Decision function based on MX results from the Grig cluster at UTK.
22 * Currently, linear, recursive doubling, and nonoverlapping algorithms
23 * can handle both commutative and non-commutative operations.
24 * Ring algorithm does not support non-commutative operations.
// block_dsize = total payload in bytes; drives all decisions below.
26 dsize = smpi_datatype_size(dtype);
27 block_dsize = dsize * count;
29 if (block_dsize < intermediate_message) {
30 return (smpi_coll_tuned_allreduce_rdb (sbuf, rbuf,
// Commutativity test is disabled here, so this branch is taken for any op
// whenever count > comm_size -- see the warning about ring algorithms above.
35 if( /*smpi_op_is_commute(op) && */(count > comm_size) ) {
36 const size_t segment_size = 1 << 20; /* 1 MB */
37 if ((comm_size * segment_size >= block_dsize)) {
38 //FIXME: ok, these are not the right algorithms, try to find closer ones
39 // lr is a good match for allreduce_ring (difference is mainly the use of sendrecv)
40 return smpi_coll_tuned_allreduce_lr(sbuf, rbuf, count, dtype,
43 // return (smpi_coll_tuned_allreduce_intra_ring_segmented (sbuf, rbuf,
44 return (smpi_coll_tuned_allreduce_rab2 (sbuf, rbuf,
// Fallback: reduce to a root, then broadcast the result.
51 return (smpi_coll_tuned_allreduce_redbcast(sbuf, rbuf, count,
// Alltoall selector: bruck for tiny blocks (<200 B) on communicators larger
// than 12 ranks, "simple" (linear) for blocks under 3000 B, pairwise
// exchange otherwise. Block size is computed from the SEND count/datatype.
// NOTE(review): extract is missing interior lines (fused source line numbers
// skip); argument lists are truncated -- verify against the original file.
57 int smpi_coll_tuned_alltoall_ompi( void *sbuf, int scount,
59 void* rbuf, int rcount,
63 int communicator_size;
64 size_t dsize, block_dsize;
65 communicator_size = smpi_comm_size(comm);
67 /* Decision function based on measurement on Grig cluster at
68 the University of Tennessee (2GB MX) up to 64 nodes.
69 Has better performance for messages of intermediate sizes than the old one */
70 /* determine block size */
71 dsize = smpi_datatype_size(sdtype);
72 block_dsize = dsize * scount;
// Many small messages: bruck trades extra data movement for fewer rounds.
74 if ((block_dsize < 200) && (communicator_size > 12)) {
75 return smpi_coll_tuned_alltoall_bruck(sbuf, scount, sdtype,
79 } else if (block_dsize < 3000) {
80 return smpi_coll_tuned_alltoall_simple(sbuf, scount, sdtype,
// Large blocks: pairwise exchange.
85 return smpi_coll_tuned_alltoall_pair (sbuf, scount, sdtype,
// Alltoallv selector: no tuned decision yet -- unconditionally delegates to
// the bruck variant, per the comment below.
// NOTE(review): extract has fused source line numbers and dropped lines;
// verify signature and argument lists against the original file.
90 int smpi_coll_tuned_alltoallv_ompi(void *sbuf, int *scounts, int *sdisps,
92 void *rbuf, int *rcounts, int *rdisps,
97 /* For starters, just keep the original algorithm. */
98 return smpi_coll_tuned_alltoallv_bruck(sbuf, scounts, sdisps, sdtype,
99 rbuf, rcounts, rdisps,rdtype,
// Barrier selector: dedicated two-process barrier for size 2; otherwise
// recursive doubling for power-of-two communicators and bruck for the rest.
// NOTE(review): `module` is referenced but its declaration is not visible in
// this garbled extract (interior lines were dropped) -- presumably an OMPI
// leftover; confirm against the original file.
104 void smpi_coll_tuned_barrier_ompi(MPI_Comm comm)
105 { int communicator_size = smpi_comm_size(comm);
107 if( 2 == communicator_size )
108 return smpi_coll_tuned_barrier_intra_two_procs(comm, module);
109 * Basic optimisation. If we have a power of 2 number of nodes
110 * the use the recursive doubling algorithm, otherwise
111 * bruck is the one we want.
// Scan the bits of communicator_size: a second set bit means it is not a
// power of two, in which case bruck is selected inside the loop.
113 bool has_one = false;
114 for( ; communicator_size > 0; communicator_size >>= 1 ) {
115 if( communicator_size & 0x1 ) {
117 return smpi_coll_tuned_barrier_intra_bruck(comm, module);
// Exactly one bit set: power-of-two size, use recursive doubling.
122 return smpi_coll_tuned_barrier_intra_recursivedoubling(comm, module);
// Broadcast selector. Active logic: binomial tree for messages below
// ~370 KB or single-element broadcasts; the MX-measurement decision chain
// that follows is commented out; the final ("gige measurements") section
// picks linear / binary tree / pipeline by communicator size and message
// size, with a segment size chosen per branch.
// NOTE(review): this extract is garbled (fused source line numbers, dropped
// interior lines). The declaration of `segsize`, several assignments to it,
// and the terminator of the large commented-out region are not visible here
// -- verify the real boundaries against the original file before editing.
125 int smpi_coll_tuned_bcast_ompi(void *buff, int count,
126 MPI_Datatype datatype, int root,
130 /* Decision function based on MX results for
131 messages up to 36MB and communicator sizes up to 64 nodes */
132 //const size_t small_message_size = 2048;
133 const size_t intermediate_message_size = 370728;
// Disabled model constants from the original OMPI decision function.
134 //const double a_p16 = 3.2118e-6; /* [1 / byte] */
135 //const double b_p16 = 8.7936;
136 //const double a_p64 = 2.3679e-6; /* [1 / byte] */
137 //const double b_p64 = 1.1787;
138 //const double a_p128 = 1.6134e-6; /* [1 / byte] */
139 //const double b_p128 = 2.1102;
141 //int communicator_size;
143 size_t message_size, dsize;
145 //communicator_size = smpi_comm_size(comm);
147 /* else we need data size for decision function */
148 dsize = smpi_datatype_size(datatype);
149 message_size = dsize * (unsigned long)count; /* needed for decision */
151 /* Handle messages of small and intermediate size, and
152 single-element broadcasts */
153 if ((message_size < /*small_message_size*/intermediate_message_size) || (count <= 1)) {
154 /* Binomial without segmentation */
156 return smpi_coll_tuned_bcast_binomial_tree (buff, count, datatype,
// Everything from here to the "gige measurements" section below appears to
// be a single commented-out region in the original (its delimiters were
// dropped from this extract).
160 } /*else if (message_size < intermediate_message_size) {
161 // SplittedBinary with 1KB segments
163 return smpi_coll_tuned_bcast_split_bintree(buff, count, datatype,
168 Handle large message sizes
169 else if (communicator_size < (a_p128 * message_size + b_p128)) {
170 Pipeline with 128KB segments
172 return smpi_coll_tuned_bcast_flattree_pipeline (buff, count, datatype,
176 } else if (communicator_size < 13) {
177 // Split Binary with 8KB segments
179 return smpi_coll_tuned_bcast_intra_split_bintree(buff, count, datatype,
183 } else if (communicator_size < (a_p64 * message_size + b_p64)) {
184 // Pipeline with 64KB segments
186 return smpi_coll_tuned_bcast_intra_pipeline (buff, count, datatype,
190 } else if (communicator_size < (a_p16 * message_size + b_p16)) {
191 Pipeline with 16KB segments
192 //segsize = 1024 << 4;
193 return smpi_coll_tuned_bcast_flattree_pipeline (buff, count, datatype,
199 /* Pipeline with 8KB segments */
200 //segsize = 1024 << 3;
201 return smpi_coll_tuned_bcast_flattree_pipeline (buff, count, datatype,
// Active fallback decisions, tuned on gigabit-ethernet measurements.
205 /* this is based on gige measurements */
207 if (communicator_size < 4) {
208 return smpi_coll_tuned_bcast_intra_basic_linear (buff, count, datatype, root, comm, module);
210 if (communicator_size == 4) {
211 if (message_size < 524288) segsize = 0;
212 else segsize = 16384;
213 return smpi_coll_tuned_bcast_intra_bintree (buff, count, datatype, root, comm, module, segsize);
215 if (communicator_size <= 8 && message_size < 4096) {
216 return smpi_coll_tuned_bcast_intra_basic_linear (buff, count, datatype, root, comm, module);
218 if (communicator_size > 8 && message_size >= 32768 && message_size < 524288) {
// segsize is presumably assigned on a dropped line before this call -- TODO
// confirm against the original file.
220 return smpi_coll_tuned_bcast_intra_bintree (buff, count, datatype, root, comm, module, segsize);
222 if (message_size >= 524288) {
224 return smpi_coll_tuned_bcast_intra_pipeline (buff, count, datatype, root, comm, module, segsize);
227 /* once tested can swap this back in */
228 /* return smpi_coll_tuned_bcast_intra_bmtree (buff, count, datatype, root, comm, segsize); */
229 return smpi_coll_tuned_bcast_intra_bintree (buff, count, datatype, root, comm, module, segsize);
// Reduce selector. Active logic: flat tree for tiny cases (<8 ranks, <512 B),
// binomial for small/medium messages or count <= 1, NTSL for the remaining
// "large communicator" case, then size-based fallbacks: basic linear up to
// 4 KB, chain up to 512 KB, pipeline beyond. The non-commutative-op branch
// and the model-based (a1..b4) decision chain are commented out.
// NOTE(review): this extract is garbled -- fused source line numbers, and
// declarations/assignments for `segsize`, `fanout` and `max_requests` are on
// dropped lines. Verify against the original file before editing.
233 int smpi_coll_tuned_reduce_ompi( void *sendbuf, void *recvbuf,
234 int count, MPI_Datatype datatype,
239 int communicator_size=0;
241 size_t message_size, dsize;
// Disabled model constants from the original OMPI decision function.
242 //const double a1 = 0.6016 / 1024.0; /* [1/B] */
243 //const double b1 = 1.3496;
244 //const double a2 = 0.0410 / 1024.0; /* [1/B] */
245 //const double b2 = 9.7128;
246 //const double a3 = 0.0422 / 1024.0; /* [1/B] */
247 //const double b3 = 1.1614;
248 //const double a4 = 0.0033 / 1024.0; /* [1/B] */
249 //const double b4 = 1.6761;
251 //const int max_requests = 0; /* no limit on # of outstanding requests */
253 communicator_size = smpi_comm_size(comm);
255 /* need data size for decision function */
256 dsize=smpi_datatype_size(datatype);
257 message_size = dsize * count; /* needed for decision */
260 * If the operation is non commutative we currently have choice of linear
261 * or in-order binary tree algorithm.
// Non-commutative-op handling is disabled: the selectors below are used
// regardless of op commutativity.
263 /* if( !ompi_op_is_commute(op) ) {
264 if ((communicator_size < 12) && (message_size < 2048)) {
265 return smpi_coll_tuned_reduce_intra_basic_linear (sendbuf, recvbuf, count, datatype, op, root, comm, module);
267 return smpi_coll_tuned_reduce_intra_in_order_binary (sendbuf, recvbuf, count, datatype, op, root, comm, module,
271 if ((communicator_size < 8) && (message_size < 512)){
273 return smpi_coll_tuned_reduce_flat_tree (sendbuf, recvbuf, count, datatype, op, root, comm);
274 } else if (((communicator_size < 8) && (message_size < 20480)) ||
275 (message_size < 2048) || (count <= 1)) {
278 return smpi_coll_tuned_reduce_binomial(sendbuf, recvbuf, count, datatype, op, root, comm/*, module,
279 segsize, max_requests*/);
280 } /*else if (communicator_size > (a1 * message_size + b1)) {
283 return smpi_coll_tuned_reduce_intra_binomial(sendbuf, recvbuf, count, datatype, op, root, comm, module,
284 segsize, max_requests);
285 } else if (communicator_size > (a2 * message_size + b2)) {
288 return smpi_coll_tuned_reduce_NTSL (sendbuf, recvbuf, count, datatype, op, root, comm, module,
289 segsize, max_requests);
290 } else if (communicator_size > (a3 * message_size + b3)) {
293 return smpi_coll_tuned_reduce_intra_binary( sendbuf, recvbuf, count, datatype, op, root,
294 comm, module, segsize, max_requests);
296 if (communicator_size > (a4 * message_size + b4)) {
303 return smpi_coll_tuned_reduce_NTSL (sendbuf, recvbuf, count, datatype, op, root, comm/*, module,
304 segsize, max_requests*/);
307 /* for small messages use linear algorithm */
308 if (message_size <= 4096) {
310 fanout = communicator_size - 1;
311 /* when linear implemented or taken from basic put here, right now using chain as a linear system */
312 /* it is implemented and I shouldn't be calling a chain with a fanout bigger than MAXTREEFANOUT from topo.h! */
313 return smpi_coll_tuned_reduce_intra_basic_linear (sendbuf, recvbuf, count, datatype, op, root, comm, module);
314 /* return smpi_coll_tuned_reduce_intra_chain (sendbuf, recvbuf, count, datatype, op, root, comm, segsize, fanout); */
316 if (message_size < 524288) {
317 if (message_size <= 65536 ) {
322 fanout = communicator_size/2;
324 /* later swap this for a binary tree */
326 return smpi_coll_tuned_reduce_intra_chain (sendbuf, recvbuf, count, datatype, op, root, comm, module,
327 segsize, fanout, max_requests);
// >= 512 KB: pipelined reduce.
330 return smpi_coll_tuned_reduce_intra_pipeline (sendbuf, recvbuf, count, datatype, op, root, comm, module,
331 segsize, max_requests);
// Reduce_scatter selector, apparently COMMENTED OUT in its entirety: the
// block comment opened on the first line below has no visible terminator in
// this extract (dropped lines). Kept for reference: nonoverlapping for
// non-commutative ops or zero counts, recursive halving for small/medium
// totals, ring otherwise. Do not uncomment without restoring the missing
// lines from the original file.
335 /*int smpi_coll_tuned_reduce_scatter_ompi( void *sbuf, void *rbuf,
342 int comm_size, i, pow2;
343 size_t total_message_size, dsize;
344 const double a = 0.0012;
345 const double b = 8.0;
346 const size_t small_message_size = 12 * 1024;
347 const size_t large_message_size = 256 * 1024;
348 bool zerocounts = false;
350 OPAL_OUTPUT((smpi_coll_tuned_stream, "smpi_coll_tuned_reduce_scatter_ompi"));
352 comm_size = smpi_comm_size(comm);
353 // We need data size for decision function
354 ompi_datatype_type_size(dtype, &dsize);
355 total_message_size = 0;
356 for (i = 0; i < comm_size; i++) {
357 total_message_size += rcounts[i];
358 if (0 == rcounts[i]) {
363 if( !ompi_op_is_commute(op) || (zerocounts)) {
364 return smpi_coll_tuned_reduce_scatter_intra_nonoverlapping (sbuf, rbuf, rcounts,
369 total_message_size *= dsize;
371 // compute the nearest power of 2
372 for (pow2 = 1; pow2 < comm_size; pow2 <<= 1);
374 if ((total_message_size <= small_message_size) ||
375 ((total_message_size <= large_message_size) && (pow2 == comm_size)) ||
376 (comm_size >= a * total_message_size + b)) {
378 smpi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(sbuf, rbuf, rcounts,
382 return smpi_coll_tuned_reduce_scatter_intra_ring(sbuf, rbuf, rcounts,
// Live fallback (presumably outside the commented region in the original --
// TODO confirm): delegate to the default reduce_scatter.
387 return smpi_coll_tuned_reduce_scatter(sbuf, rbuf, rcounts,
// Allgather selector: pairwise exchange for exactly 2 processes; for totals
// under 50 KB, recursive doubling on power-of-two communicators and bruck
// otherwise; ring for everything else (the neighbor-exchange branch for even
// sizes is commented out). An alternative MPICH2-paper-based decision is
// compiled in when USE_MPICH2_DECISION is defined.
// NOTE(review): extract is garbled (fused source line numbers, dropped
// interior lines, truncated argument lists) -- verify against the original.
393 int smpi_coll_tuned_allgather_ompi(void *sbuf, int scount,
395 void* rbuf, int rcount,
400 int communicator_size, pow2_size;
401 size_t dsize, total_dsize;
403 communicator_size = smpi_comm_size(comm);
405 /* Special case for 2 processes */
406 if (communicator_size == 2) {
407 return smpi_coll_tuned_allgather_pair (sbuf, scount, sdtype,
408 rbuf, rcount, rdtype,
412 /* Determine complete data size */
413 dsize=smpi_datatype_size(sdtype);
414 total_dsize = dsize * scount * communicator_size;
// Smallest power of two >= communicator_size.
416 for (pow2_size = 1; pow2_size < communicator_size; pow2_size <<=1);
418 /* Decision based on MX 2Gb results from Grig cluster at
419 The University of Tennesse, Knoxville
420 - if total message size is less than 50KB use either bruck or
421 recursive doubling for non-power of two and power of two nodes,
423 - else use ring and neighbor exchange algorithms for odd and even
424 number of nodes, respectively.
426 if (total_dsize < 50000) {
427 if (pow2_size == communicator_size) {
428 return smpi_coll_tuned_allgather_rdb(sbuf, scount, sdtype,
429 rbuf, rcount, rdtype,
432 return smpi_coll_tuned_allgather_bruck(sbuf, scount, sdtype,
433 rbuf, rcount, rdtype,
// Odd/even split disabled: ring is used for all large totals.
437 //if (communicator_size % 2) {
438 return smpi_coll_tuned_allgather_ring(sbuf, scount, sdtype,
439 rbuf, rcount, rdtype,
442 return smpi_coll_tuned_allgather_intra_neighborexchange(sbuf, scount, sdtype,
443 rbuf, rcount, rdtype,
448 #if defined(USE_MPICH2_DECISION)
449 /* Decision as in MPICH-2
450 presented in Thakur et.al. "Optimization of Collective Communication
451 Operations in MPICH", International Journal of High Performance Computing
452 Applications, Vol. 19, No. 1, 49-66 (2005)
453 - for power-of-two processes and small and medium size messages
454 (up to 512KB) use recursive doubling
455 - for non-power-of-two processes and small messages (80KB) use bruck,
456 - for everything else use ring.
458 if ((pow2_size == communicator_size) && (total_dsize < 524288)) {
459 return smpi_coll_tuned_allgather_intra_recursivedoubling(sbuf, scount, sdtype,
460 rbuf, rcount, rdtype,
462 } else if (total_dsize <= 81920) {
463 return smpi_coll_tuned_allgather_intra_bruck(sbuf, scount, sdtype,
464 rbuf, rcount, rdtype,
467 return smpi_coll_tuned_allgather_intra_ring(sbuf, scount, sdtype,
468 rbuf, rcount, rdtype,
470 #endif /* defined(USE_MPICH2_DECISION) */
// Allgatherv selector: pairwise exchange for exactly 2 processes; otherwise
// total size is accumulated from rcounts[] and, mirroring the allgather
// decision, ring is used both under and over the 50 KB threshold (the bruck
// and neighbor-exchange alternatives are commented out / unreachable from
// what is visible here).
// NOTE(review): no zero-initialization of `total_dsize` is visible before the
// `+=` below -- it may be on one of the lines dropped from this extract;
// confirm against the original file before assuming a bug.
473 int smpi_coll_tuned_allgatherv_ompi(void *sbuf, int scount,
475 void* rbuf, int *rcounts,
482 int communicator_size;
483 size_t dsize, total_dsize;
485 communicator_size = smpi_comm_size(comm);
487 /* Special case for 2 processes */
488 if (communicator_size == 2) {
489 return smpi_coll_tuned_allgatherv_pair(sbuf, scount, sdtype,
490 rbuf, rcounts, rdispls, rdtype,
494 /* Determine complete data size */
495 dsize=smpi_datatype_size(sdtype);
497 for (i = 0; i < communicator_size; i++) {
498 total_dsize += dsize * rcounts[i];
501 /* Decision based on allgather decision. */
502 if (total_dsize < 50000) {
503 /* return smpi_coll_tuned_allgatherv_intra_bruck(sbuf, scount, sdtype,
504 rbuf, rcounts, rdispls, rdtype,
506 return smpi_coll_tuned_allgatherv_ring(sbuf, scount, sdtype,
507 rbuf, rcounts, rdispls, rdtype,
// Odd/even split disabled: ring is used for large totals as well.
511 // if (communicator_size % 2) {
512 return smpi_coll_tuned_allgatherv_ring(sbuf, scount, sdtype,
513 rbuf, rcounts, rdispls, rdtype,
516 return smpi_coll_tuned_allgatherv_intra_neighborexchange(sbuf, scount, sdtype,
517 rbuf, rcounts, rdispls, rdtype,
// Gather selector: synchronized linear for large (>90 KB) and intermediate
// (>6 KB) blocks, binomial for big communicators (>60) or medium ones (>10)
// with sub-1 KB blocks, basic linear otherwise. Block size is taken from the
// receive side on one path and the send side on the other (root vs non-root,
// presumably -- the branch structure sits on dropped lines; TODO confirm).
// NOTE(review): OPAL_OUTPUT / ompi_comm_rank / ompi_datatype_type_size look
// like unported Open MPI calls, and `rank` has no visible use in this
// extract -- verify against the original file.
523 int smpi_coll_tuned_gather_ompi(void *sbuf, int scount,
525 void* rbuf, int rcount,
// Segment sizes for the segmented variants (declared but no visible use in
// this extract -- likely consumed on dropped lines).
531 const int large_segment_size = 32768;
532 const int small_segment_size = 1024;
// Byte thresholds driving the decisions below.
534 const size_t large_block_size = 92160;
535 const size_t intermediate_block_size = 6000;
536 const size_t small_block_size = 1024;
538 const int large_communicator_size = 60;
539 const int small_communicator_size = 10;
541 int communicator_size, rank;
542 size_t dsize, block_size;
544 OPAL_OUTPUT((smpi_coll_tuned_stream,
545 "smpi_coll_tuned_gather_ompi"));
547 communicator_size = smpi_comm_size(comm);
548 rank = ompi_comm_rank(comm);
550 // Determine block size
552 ompi_datatype_type_size(rdtype, &dsize);
553 block_size = dsize * rcount;
555 ompi_datatype_type_size(sdtype, &dsize);
556 block_size = dsize * scount;
559 if (block_size > large_block_size) {
560 return smpi_coll_tuned_gather_intra_linear_sync (sbuf, scount, sdtype,
561 rbuf, rcount, rdtype,
565 } else if (block_size > intermediate_block_size) {
566 return smpi_coll_tuned_gather_intra_linear_sync (sbuf, scount, sdtype,
567 rbuf, rcount, rdtype,
571 } else if ((communicator_size > large_communicator_size) ||
572 ((communicator_size > small_communicator_size) &&
573 (block_size < small_block_size))) {
574 return smpi_coll_tuned_gather_intra_binomial (sbuf, scount, sdtype,
575 rbuf, rcount, rdtype,
579 // Otherwise, use basic linear
580 return smpi_coll_tuned_gather_intra_basic_linear (sbuf, scount, sdtype,
581 rbuf, rcount, rdtype,
585 int smpi_coll_tuned_scatter_ompi(void *sbuf, int scount,
587 void* rbuf, int rcount,
589 int root, MPI_Comm comm,
592 const size_t small_block_size = 300;
593 const int small_comm_size = 10;
594 int communicator_size, rank;
595 size_t dsize, block_size;
597 OPAL_OUTPUT((smpi_coll_tuned_stream,
598 "smpi_coll_tuned_scatter_ompi"));
600 communicator_size = smpi_comm_size(comm);
601 rank = ompi_comm_rank(comm);
602 // Determine block size
604 ompi_datatype_type_size(sdtype, &dsize);
605 block_size = dsize * scount;
607 ompi_datatype_type_size(rdtype, &dsize);
608 block_size = dsize * rcount;
611 if ((communicator_size > small_comm_size) &&
612 (block_size < small_block_size)) {
613 return smpi_coll_tuned_scatter_intra_binomial (sbuf, scount, sdtype,
614 rbuf, rcount, rdtype,
617 return smpi_coll_tuned_scatter_intra_basic_linear (sbuf, scount, sdtype,
618 rbuf, rcount, rdtype,