1 /* selector for collective algorithms based on openmpi's default coll_tuned_decision_fixed selector */
3 /* Copyright (c) 2009-2010, 2013-2017. The SimGrid Team.
4 * All rights reserved. */
6 /* This program is free software; you can redistribute it and/or modify it
7 * under the terms of the license (GNU LGPL) which comes with this package. */
9 #include "colls_private.h"
14 int Coll_allreduce_ompi::allreduce(void *sbuf, void *rbuf, int count,
15 MPI_Datatype dtype, MPI_Op op, MPI_Comm comm)
17 size_t dsize, block_dsize;
18 int comm_size = comm->size();
19 const size_t intermediate_message = 10000;
22 * Decision function based on MX results from the Grig cluster at UTK.
24 * Currently, linear, recursive doubling, and nonoverlapping algorithms
25 * can handle both commutative and non-commutative operations.
26 * Ring algorithm does not support non-commutative operations.
28 dsize = dtype->size();
29 block_dsize = dsize * count;
31 if (block_dsize < intermediate_message) {
32 return (Coll_allreduce_rdb::allreduce (sbuf, rbuf,
37 if( ((op==MPI_OP_NULL) || op->is_commutative()) && (count > comm_size) ) {
38 const size_t segment_size = 1 << 20; /* 1 MB */
39 if ((comm_size * segment_size >= block_dsize)) {
40 //FIXME: ok, these are not the right algorithms, try to find closer ones
41 // lr is a good match for allreduce_ring (difference is mainly the use of sendrecv)
42 return Coll_allreduce_lr::allreduce(sbuf, rbuf, count, dtype,
45 return (Coll_allreduce_ompi_ring_segmented::allreduce (sbuf, rbuf,
52 return (Coll_allreduce_redbcast::allreduce(sbuf, rbuf, count,
58 int Coll_alltoall_ompi::alltoall( void *sbuf, int scount,
60 void* rbuf, int rcount,
64 int communicator_size;
65 size_t dsize, block_dsize;
66 communicator_size = comm->size();
68 /* Decision function based on measurement on Grig cluster at
69 the University of Tennessee (2GB MX) up to 64 nodes.
70 Has better performance for messages of intermediate sizes than the old one */
71 /* determine block size */
72 dsize = sdtype->size();
73 block_dsize = dsize * scount;
75 if ((block_dsize < 200) && (communicator_size > 12)) {
76 return Coll_alltoall_bruck::alltoall(sbuf, scount, sdtype,
80 } else if (block_dsize < 3000) {
81 return Coll_alltoall_basic_linear::alltoall(sbuf, scount, sdtype,
86 return Coll_alltoall_ring::alltoall (sbuf, scount, sdtype,
91 int Coll_alltoallv_ompi::alltoallv(void *sbuf, int *scounts, int *sdisps,
93 void *rbuf, int *rcounts, int *rdisps,
98 /* For starters, just keep the original algorithm. */
99 return Coll_alltoallv_ompi_basic_linear::alltoallv(sbuf, scounts, sdisps, sdtype,
100 rbuf, rcounts, rdisps,rdtype,
105 int Coll_barrier_ompi::barrier(MPI_Comm comm)
106 { int communicator_size = comm->size();
108 if( 2 == communicator_size )
109 return Coll_barrier_ompi_two_procs::barrier(comm);
110 /* * Basic optimisation. If we have a power of 2 number of nodes*/
111 /* * the use the recursive doubling algorithm, otherwise*/
112 /* * bruck is the one we want.*/
115 for( ; communicator_size > 0; communicator_size >>= 1 ) {
116 if( communicator_size & 0x1 ) {
118 return Coll_barrier_ompi_bruck::barrier(comm);
123 return Coll_barrier_ompi_recursivedoubling::barrier(comm);
126 int Coll_bcast_ompi::bcast(void *buff, int count,
127 MPI_Datatype datatype, int root,
131 /* Decision function based on MX results for
132 messages up to 36MB and communicator sizes up to 64 nodes */
133 const size_t small_message_size = 2048;
134 const size_t intermediate_message_size = 370728;
135 const double a_p16 = 3.2118e-6; /* [1 / byte] */
136 const double b_p16 = 8.7936;
137 const double a_p64 = 2.3679e-6; /* [1 / byte] */
138 const double b_p64 = 1.1787;
139 const double a_p128 = 1.6134e-6; /* [1 / byte] */
140 const double b_p128 = 2.1102;
142 int communicator_size;
144 size_t message_size, dsize;
146 communicator_size = comm->size();
148 /* else we need data size for decision function */
149 dsize = datatype->size();
150 message_size = dsize * (unsigned long)count; /* needed for decision */
152 /* Handle messages of small and intermediate size, and
153 single-element broadcasts */
154 if ((message_size < small_message_size) || (count <= 1)) {
155 /* Binomial without segmentation */
156 return Coll_bcast_binomial_tree::bcast (buff, count, datatype,
159 } else if (message_size < intermediate_message_size) {
160 // SplittedBinary with 1KB segments
161 return Coll_bcast_ompi_split_bintree::bcast(buff, count, datatype,
165 //Handle large message sizes
166 else if (communicator_size < (a_p128 * message_size + b_p128)) {
167 //Pipeline with 128KB segments
168 //segsize = 1024 << 7;
169 return Coll_bcast_ompi_pipeline::bcast (buff, count, datatype,
173 } else if (communicator_size < 13) {
174 // Split Binary with 8KB segments
175 return Coll_bcast_ompi_split_bintree::bcast(buff, count, datatype,
178 } else if (communicator_size < (a_p64 * message_size + b_p64)) {
179 // Pipeline with 64KB segments
180 //segsize = 1024 << 6;
181 return Coll_bcast_ompi_pipeline::bcast (buff, count, datatype,
185 } else if (communicator_size < (a_p16 * message_size + b_p16)) {
186 //Pipeline with 16KB segments
187 //segsize = 1024 << 4;
188 return Coll_bcast_ompi_pipeline::bcast (buff, count, datatype,
193 /* Pipeline with 8KB segments */
194 //segsize = 1024 << 3;
195 return Coll_bcast_flattree_pipeline::bcast (buff, count, datatype,
199 /* this is based on gige measurements */
201 if (communicator_size < 4) {
202 return Coll_bcast_intra_basic_linear::bcast (buff, count, datatype, root, comm, module);
204 if (communicator_size == 4) {
205 if (message_size < 524288) segsize = 0;
206 else segsize = 16384;
207 return Coll_bcast_intra_bintree::bcast (buff, count, datatype, root, comm, module, segsize);
209 if (communicator_size <= 8 && message_size < 4096) {
210 return Coll_bcast_intra_basic_linear::bcast (buff, count, datatype, root, comm, module);
212 if (communicator_size > 8 && message_size >= 32768 && message_size < 524288) {
214 return Coll_bcast_intra_bintree::bcast (buff, count, datatype, root, comm, module, segsize);
216 if (message_size >= 524288) {
218 return Coll_bcast_intra_pipeline::bcast (buff, count, datatype, root, comm, module, segsize);
221 /* once tested can swap this back in */
222 /* return Coll_bcast_intra_bmtree::bcast (buff, count, datatype, root, comm, segsize); */
223 return Coll_bcast_intra_bintree::bcast (buff, count, datatype, root, comm, module, segsize);
227 int Coll_reduce_ompi::reduce( void *sendbuf, void *recvbuf,
228 int count, MPI_Datatype datatype,
233 int communicator_size=0;
235 size_t message_size, dsize;
236 const double a1 = 0.6016 / 1024.0; /* [1/B] */
237 const double b1 = 1.3496;
238 const double a2 = 0.0410 / 1024.0; /* [1/B] */
239 const double b2 = 9.7128;
240 const double a3 = 0.0422 / 1024.0; /* [1/B] */
241 const double b3 = 1.1614;
242 //const double a4 = 0.0033 / 1024.0; [1/B]
243 //const double b4 = 1.6761;
245 /* no limit on # of outstanding requests */
246 //const int max_requests = 0;
248 communicator_size = comm->size();
250 /* need data size for decision function */
251 dsize=datatype->size();
252 message_size = dsize * count; /* needed for decision */
255 * If the operation is non commutative we currently have choice of linear
256 * or in-order binary tree algorithm.
258 if( (op!=MPI_OP_NULL) && !op->is_commutative() ) {
259 if ((communicator_size < 12) && (message_size < 2048)) {
260 return Coll_reduce_ompi_basic_linear::reduce (sendbuf, recvbuf, count, datatype, op, root, comm/*, module*/);
262 return Coll_reduce_ompi_in_order_binary::reduce (sendbuf, recvbuf, count, datatype, op, root, comm/*, module,
266 if ((communicator_size < 8) && (message_size < 512)){
268 return Coll_reduce_ompi_basic_linear::reduce (sendbuf, recvbuf, count, datatype, op, root, comm);
269 } else if (((communicator_size < 8) && (message_size < 20480)) ||
270 (message_size < 2048) || (count <= 1)) {
273 return Coll_reduce_ompi_binomial::reduce(sendbuf, recvbuf, count, datatype, op, root, comm/*, module,
274 segsize, max_requests*/);
275 } else if (communicator_size > (a1 * message_size + b1)) {
278 return Coll_reduce_ompi_binomial::reduce(sendbuf, recvbuf, count, datatype, op, root, comm/*, module,
279 segsize, max_requests*/);
280 } else if (communicator_size > (a2 * message_size + b2)) {
283 return Coll_reduce_ompi_pipeline::reduce (sendbuf, recvbuf, count, datatype, op, root, comm/*, module,
284 segsize, max_requests*/);
285 } else if (communicator_size > (a3 * message_size + b3)) {
288 return Coll_reduce_ompi_binary::reduce( sendbuf, recvbuf, count, datatype, op, root,
289 comm/*, module, segsize, max_requests*/);
291 // if (communicator_size > (a4 * message_size + b4)) {
293 // segsize = 32*1024;
296 // segsize = 64*1024;
298 return Coll_reduce_ompi_pipeline::reduce (sendbuf, recvbuf, count, datatype, op, root, comm/*, module,
299 segsize, max_requests*/);
302 /* for small messages use linear algorithm */
303 if (message_size <= 4096) {
305 fanout = communicator_size - 1;
306 /* when linear implemented or taken from basic put here, right now using chain as a linear system */
307 /* it is implemented and I shouldn't be calling a chain with a fanout bigger than MAXTREEFANOUT from topo.h! */
308 return Coll_reduce_intra_basic_linear::reduce (sendbuf, recvbuf, count, datatype, op, root, comm, module);
309 /* return Coll_reduce_intra_chain::reduce (sendbuf, recvbuf, count, datatype, op, root, comm, segsize, fanout); */
311 if (message_size < 524288) {
312 if (message_size <= 65536 ) {
317 fanout = communicator_size/2;
319 /* later swap this for a binary tree */
321 return Coll_reduce_intra_chain::reduce (sendbuf, recvbuf, count, datatype, op, root, comm, module,
322 segsize, fanout, max_requests);
325 return Coll_reduce_intra_pipeline::reduce (sendbuf, recvbuf, count, datatype, op, root, comm, module,
326 segsize, max_requests);
330 int Coll_reduce_scatter_ompi::reduce_scatter( void *sbuf, void *rbuf,
337 int comm_size, i, pow2;
338 size_t total_message_size, dsize;
339 const double a = 0.0012;
340 const double b = 8.0;
341 const size_t small_message_size = 12 * 1024;
342 const size_t large_message_size = 256 * 1024;
345 XBT_DEBUG("Coll_reduce_scatter_ompi::reduce_scatter");
347 comm_size = comm->size();
348 // We need data size for decision function
350 total_message_size = 0;
351 for (i = 0; i < comm_size; i++) {
352 total_message_size += rcounts[i];
353 if (0 == rcounts[i]) {
358 if( ((op!=MPI_OP_NULL) && !op->is_commutative()) || (zerocounts)) {
359 Coll_reduce_scatter_default::reduce_scatter (sbuf, rbuf, rcounts,
365 total_message_size *= dsize;
367 // compute the nearest power of 2
368 for (pow2 = 1; pow2 < comm_size; pow2 <<= 1);
370 if ((total_message_size <= small_message_size) ||
371 ((total_message_size <= large_message_size) && (pow2 == comm_size)) ||
372 (comm_size >= a * total_message_size + b)) {
374 Coll_reduce_scatter_ompi_basic_recursivehalving::reduce_scatter(sbuf, rbuf, rcounts,
378 return Coll_reduce_scatter_ompi_ring::reduce_scatter(sbuf, rbuf, rcounts,
386 int Coll_allgather_ompi::allgather(void *sbuf, int scount,
388 void* rbuf, int rcount,
393 int communicator_size, pow2_size;
394 size_t dsize, total_dsize;
396 communicator_size = comm->size();
398 /* Special case for 2 processes */
399 if (communicator_size == 2) {
400 return Coll_allgather_pair::allgather (sbuf, scount, sdtype,
401 rbuf, rcount, rdtype,
405 /* Determine complete data size */
406 dsize=sdtype->size();
407 total_dsize = dsize * scount * communicator_size;
409 for (pow2_size = 1; pow2_size < communicator_size; pow2_size <<=1);
411 /* Decision based on MX 2Gb results from Grig cluster at
412 The University of Tennesse, Knoxville
413 - if total message size is less than 50KB use either bruck or
414 recursive doubling for non-power of two and power of two nodes,
416 - else use ring and neighbor exchange algorithms for odd and even
417 number of nodes, respectively.
419 if (total_dsize < 50000) {
420 if (pow2_size == communicator_size) {
421 return Coll_allgather_rdb::allgather(sbuf, scount, sdtype,
422 rbuf, rcount, rdtype,
425 return Coll_allgather_bruck::allgather(sbuf, scount, sdtype,
426 rbuf, rcount, rdtype,
430 if (communicator_size % 2) {
431 return Coll_allgather_ring::allgather(sbuf, scount, sdtype,
432 rbuf, rcount, rdtype,
435 return Coll_allgather_ompi_neighborexchange::allgather(sbuf, scount, sdtype,
436 rbuf, rcount, rdtype,
441 #if defined(USE_MPICH2_DECISION)
442 /* Decision as in MPICH-2
443 presented in Thakur et.al. "Optimization of Collective Communication
444 Operations in MPICH", International Journal of High Performance Computing
445 Applications, Vol. 19, No. 1, 49-66 (2005)
446 - for power-of-two processes and small and medium size messages
447 (up to 512KB) use recursive doubling
448 - for non-power-of-two processes and small messages (80KB) use bruck,
449 - for everything else use ring.
451 if ((pow2_size == communicator_size) && (total_dsize < 524288)) {
452 return Coll_allgather_rdb::allgather(sbuf, scount, sdtype,
453 rbuf, rcount, rdtype,
455 } else if (total_dsize <= 81920) {
456 return Coll_allgather_bruck::allgather(sbuf, scount, sdtype,
457 rbuf, rcount, rdtype,
460 return Coll_allgather_ring::allgather(sbuf, scount, sdtype,
461 rbuf, rcount, rdtype,
463 #endif /* defined(USE_MPICH2_DECISION) */
466 int Coll_allgatherv_ompi::allgatherv(void *sbuf, int scount,
468 void* rbuf, int *rcounts,
475 int communicator_size;
476 size_t dsize, total_dsize;
478 communicator_size = comm->size();
480 /* Special case for 2 processes */
481 if (communicator_size == 2) {
482 return Coll_allgatherv_pair::allgatherv(sbuf, scount, sdtype,
483 rbuf, rcounts, rdispls, rdtype,
487 /* Determine complete data size */
488 dsize=sdtype->size();
490 for (i = 0; i < communicator_size; i++) {
491 total_dsize += dsize * rcounts[i];
494 /* Decision based on allgather decision. */
495 if (total_dsize < 50000) {
496 /* return Coll_allgatherv_intra_bruck::allgatherv(sbuf, scount, sdtype,
497 rbuf, rcounts, rdispls, rdtype,
499 return Coll_allgatherv_ring::allgatherv(sbuf, scount, sdtype,
500 rbuf, rcounts, rdispls, rdtype,
504 if (communicator_size % 2) {
505 return Coll_allgatherv_ring::allgatherv(sbuf, scount, sdtype,
506 rbuf, rcounts, rdispls, rdtype,
509 return Coll_allgatherv_ompi_neighborexchange::allgatherv(sbuf, scount, sdtype,
510 rbuf, rcounts, rdispls, rdtype,
516 int Coll_gather_ompi::gather(void *sbuf, int scount,
518 void* rbuf, int rcount,
524 //const int large_segment_size = 32768;
525 //const int small_segment_size = 1024;
527 //const size_t large_block_size = 92160;
528 const size_t intermediate_block_size = 6000;
529 const size_t small_block_size = 1024;
531 const int large_communicator_size = 60;
532 const int small_communicator_size = 10;
534 int communicator_size, rank;
535 size_t dsize, block_size;
537 XBT_DEBUG("smpi_coll_tuned_gather_ompi");
539 communicator_size = comm->size();
542 // Determine block size
544 dsize = rdtype->size();
545 block_size = dsize * rcount;
547 dsize = sdtype->size();
548 block_size = dsize * scount;
551 /* if (block_size > large_block_size) {*/
552 /* return smpi_coll_tuned_gather_ompi_linear_sync (sbuf, scount, sdtype, */
553 /* rbuf, rcount, rdtype, */
556 /* } else*/ if (block_size > intermediate_block_size) {
557 return Coll_gather_ompi_linear_sync::gather (sbuf, scount, sdtype,
558 rbuf, rcount, rdtype,
561 } else if ((communicator_size > large_communicator_size) ||
562 ((communicator_size > small_communicator_size) &&
563 (block_size < small_block_size))) {
564 return Coll_gather_ompi_binomial::gather (sbuf, scount, sdtype,
565 rbuf, rcount, rdtype,
569 // Otherwise, use basic linear
570 return Coll_gather_ompi_basic_linear::gather (sbuf, scount, sdtype,
571 rbuf, rcount, rdtype,
576 int Coll_scatter_ompi::scatter(void *sbuf, int scount,
578 void* rbuf, int rcount,
580 int root, MPI_Comm comm
583 const size_t small_block_size = 300;
584 const int small_comm_size = 10;
585 int communicator_size, rank;
586 size_t dsize, block_size;
588 XBT_DEBUG("Coll_scatter_ompi::scatter");
590 communicator_size = comm->size();
592 // Determine block size
594 dsize=sdtype->size();
595 block_size = dsize * scount;
597 dsize=rdtype->size();
598 block_size = dsize * rcount;
601 if ((communicator_size > small_comm_size) &&
602 (block_size < small_block_size)) {
604 sbuf=xbt_malloc(rcount*rdtype->get_extent());
608 int ret=Coll_scatter_ompi_binomial::scatter (sbuf, scount, sdtype,
609 rbuf, rcount, rdtype,
616 return Coll_scatter_ompi_basic_linear::scatter (sbuf, scount, sdtype,
617 rbuf, rcount, rdtype,