/* selector for collective algorithms based on mpich decision logic */

/* Copyright (c) 2009-2019. The SimGrid Team.
 * All rights reserved. */

/* This program is free software; you can redistribute it and/or modify it
 * under the terms of the license (GNU LGPL) which comes with this package. */

#include "colls_private.hpp"
/* This is the default implementation of allreduce. The algorithm is:

   Algorithm: MPI_Allreduce

   For the heterogeneous case, we call MPI_Reduce followed by MPI_Bcast
   in order to meet the requirement that all processes must have the
   same result. For the homogeneous case, we use the following algorithms.

   For long messages and for builtin ops and if count >= pof2 (where
   pof2 is the nearest power-of-two less than or equal to the number
   of processes), we use Rabenseifner's algorithm (see
   http://www.hlrs.de/mpi/myreduce.html).
   This algorithm implements the allreduce in two steps: first a
   reduce-scatter, followed by an allgather. A recursive-halving
   algorithm (beginning with processes that are distance 1 apart) is
   used for the reduce-scatter, and a recursive doubling
   algorithm is used for the allgather. The non-power-of-two case is
   handled by dropping to the nearest lower power-of-two: the first
   few even-numbered processes send their data to their right neighbors
   (rank+1), and the reduce-scatter and allgather happen among the remaining
   power-of-two processes. At the end, the first few even-numbered
   processes get the result from their right neighbors.

   For the power-of-two case, the cost for the reduce-scatter is
   lgp.alpha + n.((p-1)/p).beta + n.((p-1)/p).gamma. The cost for the
   allgather is lgp.alpha + n.((p-1)/p).beta. Therefore, the total cost is:

   Cost = 2.lgp.alpha + 2.n.((p-1)/p).beta + n.((p-1)/p).gamma

   For the non-power-of-two case,

   Cost = (2.floor(lgp)+2).alpha + (2.((p-1)/p) + 2).n.beta + n.(1+(p-1)/p).gamma

   For short messages, for user-defined ops, and for count < pof2
   we use a recursive doubling algorithm (similar to the one in
   MPI_Allgather). We use this algorithm in the case of user-defined ops
   because in this case derived datatypes are allowed, and the user
   could pass basic datatypes on one process and derived on another as
   long as the type maps are the same. Breaking up derived datatypes
   to do the reduce-scatter is tricky.

   Cost = lgp.alpha + n.lgp.beta + n.lgp.gamma

   Possible improvements:

   End Algorithm: MPI_Allreduce
*/
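/* Illustrative sketch, not called by the selector: the nearest lower
 * power-of-two computation that the decision code below (and several other
 * selectors in this file) relies on, written out once for clarity. The helper
 * name is ours, for exposition only. */
static inline int nearest_lower_pof2(int comm_size)
{
  int pof2 = 1;
  while (pof2 <= comm_size)
    pof2 <<= 1;     // first power of two strictly greater than comm_size
  return pof2 >> 1; // back off: greatest power of two <= comm_size
}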
int Coll_allreduce_mpich::allreduce(const void *sbuf, void *rbuf, int count,
                                    MPI_Datatype dtype, MPI_Op op, MPI_Comm comm)
{
  size_t dsize, block_dsize;
  int comm_size = comm->size();
  const size_t large_message = 2048; //MPIR_PARAM_ALLREDUCE_SHORT_MSG_SIZE

  dsize = dtype->size();
  block_dsize = dsize * count;

  /* MPICH uses SMP algorithms for all commutative ops now */
  if (!comm->is_smp_comm()) {
    if (comm->get_leaders_comm() == MPI_COMM_NULL) {
      comm->init_smp();
    }
    if (op->is_commutative())
      return Coll_allreduce_mvapich2_two_level::allreduce(sbuf, rbuf, count, dtype, op, comm);
  }

  /* find nearest power-of-two less than or equal to comm_size */
  int pof2 = 1;
  while (pof2 <= comm_size)
    pof2 <<= 1;
  pof2 >>= 1;

  if (block_dsize > large_message && count >= pof2 && (op == MPI_OP_NULL || op->is_commutative())) {
    // long messages: Rabenseifner's reduce-scatter + allgather
    return Coll_allreduce_rab_rdb::allreduce(sbuf, rbuf, count, dtype, op, comm);
  }
  // short messages and count < pof2: recursive doubling
  return Coll_allreduce_rdb::allreduce(sbuf, rbuf, count, dtype, op, comm);
}
/* This is the default implementation of alltoall. The algorithm is:

   Algorithm: MPI_Alltoall

   We use four algorithms for alltoall. For short messages and
   (comm_size >= 8), we use the algorithm by Jehoshua Bruck et al,
   IEEE TPDS, Nov. 1997. It is a store-and-forward algorithm that
   takes lgp steps. Because of the extra communication, the bandwidth
   requirement is (n/2).lgp.beta.

   Cost = lgp.alpha + (n/2).lgp.beta

   where n is the total amount of data a process needs to send to all
   other processes.

   For medium size messages and (short messages for comm_size < 8), we
   use an algorithm that posts all irecvs and isends and then does a
   waitall. We scatter the order of sources and destinations among the
   processes, so that all processes don't try to send/recv to/from the
   same process at the same time.

   *** Modification: We post only a small number of isends and irecvs
   at a time and wait on them as suggested by Tony Ladd. ***
   *** See comments below about an additional modification that
   we may want to consider ***

   For long messages and power-of-two number of processes, we use a
   pairwise exchange algorithm, which takes p-1 steps. We
   calculate the pairs by using an exclusive-or algorithm:
           for (i=1; i<comm_size; i++)
               dest = rank ^ i;
   This algorithm doesn't work if the number of processes is not a power of
   two. For a non-power-of-two number of processes, we use an
   algorithm in which, in step i, each process receives from (rank-i)
   and sends to (rank+i).

   Cost = (p-1).alpha + n.beta

   where n is the total amount of data a process needs to send to all
   other processes.

   Possible improvements:

   End Algorithm: MPI_Alltoall
*/
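/* Illustrative sketch, not called here: the exclusive-or pairing used by the
 * pairwise-exchange alltoall when the process count is a power of two. In
 * step i every rank sends to and receives from the same partner, so each step
 * forms disjoint pairs. The helper name is ours, for exposition only. */
static inline int pairwise_xor_partner(int rank, int step)
{
  return rank ^ step; // symmetric: the partner's partner is rank itself
}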
int Coll_alltoall_mpich::alltoall(const void *sbuf, int scount,
                                  MPI_Datatype sdtype,
                                  void* rbuf, int rcount,
                                  MPI_Datatype rdtype,
                                  MPI_Comm comm)
{
  int communicator_size;
  size_t dsize, block_dsize;
  communicator_size = comm->size();

  unsigned int short_size  = 256;
  unsigned int medium_size = 32768;
  // short messages and comm_size >= 8 -> bruck
  // medium size messages and (short messages for comm_size < 8):
  // use an algorithm that posts all irecvs and isends and then does a waitall
  // long messages and power-of-two number of processes: pairwise exchange
  // non-power-of-two number of processes: in step i, each process
  // receives from (rank-i) and sends to (rank+i)

  dsize = sdtype->size();
  block_dsize = dsize * scount;

  if ((block_dsize < short_size) && (communicator_size >= 8)) {
    return Coll_alltoall_bruck::alltoall(sbuf, scount, sdtype,
                                         rbuf, rcount, rdtype, comm);
  } else if (block_dsize < medium_size) {
    return Coll_alltoall_mvapich2_scatter_dest::alltoall(sbuf, scount, sdtype,
                                                         rbuf, rcount, rdtype, comm);
  } else if (communicator_size % 2) {
    return Coll_alltoall_pair::alltoall(sbuf, scount, sdtype,
                                        rbuf, rcount, rdtype, comm);
  }
  return Coll_alltoall_ring::alltoall(sbuf, scount, sdtype,
                                      rbuf, rcount, rdtype, comm);
}
int Coll_alltoallv_mpich::alltoallv(const void *sbuf, const int *scounts, const int *sdisps,
                                    MPI_Datatype sdtype,
                                    void *rbuf, const int *rcounts, const int *rdisps,
                                    MPI_Datatype rdtype, MPI_Comm comm)
{
  /* For starters, just keep the original algorithm. */
  return Coll_alltoallv_bruck::alltoallv(sbuf, scounts, sdisps, sdtype,
                                         rbuf, rcounts, rdisps, rdtype, comm);
}

int Coll_barrier_mpich::barrier(MPI_Comm comm)
{
  return Coll_barrier_ompi_bruck::barrier(comm);
}
/* This is the default implementation of broadcast. The algorithm is:

   Algorithm: MPI_Bcast

   For short messages, we use a binomial tree algorithm.
   Cost = lgp.alpha + n.lgp.beta

   For long messages, we do a scatter followed by an allgather.
   We first scatter the buffer using a binomial tree algorithm. This costs
   lgp.alpha + n.((p-1)/p).beta
   If the datatype is contiguous and the communicator is homogeneous,
   we treat the data as bytes and divide (scatter) it among processes
   by using ceiling division. For the noncontiguous or heterogeneous
   cases, we first pack the data into a temporary buffer by using
   MPI_Pack, scatter it as bytes, and unpack it after the allgather.

   For the allgather, we use a recursive doubling algorithm for
   medium-size messages and power-of-two number of processes. This
   takes lgp steps. In each step pairs of processes exchange all the
   data they have (we take care of non-power-of-two situations). This
   costs approximately lgp.alpha + n.((p-1)/p).beta. (Approximately
   because it may be slightly more in the non-power-of-two case, but
   it's still a logarithmic algorithm.) Therefore, for long messages

   Total Cost = 2.lgp.alpha + 2.n.((p-1)/p).beta

   Note that this algorithm has twice the latency of the tree algorithm
   we use for short messages, but requires lower bandwidth: 2.n.beta
   versus n.lgp.beta. Therefore, for long messages and when lgp > 2,
   this algorithm will perform better.

   For long messages and for medium-size messages and non-power-of-two
   processes, we use a ring algorithm for the allgather, which
   takes p-1 steps, because it performs better than recursive doubling.

   Total Cost = (lgp+p-1).alpha + 2.n.((p-1)/p).beta

   Possible improvements:
   For clusters of SMPs, we may want to do something differently to
   take advantage of shared memory on each node.

   End Algorithm: MPI_Bcast
*/
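/* Illustrative sketch, not called here: the latency/bandwidth trade-off
 * described above, written as a tiny cost model. alpha is the per-message
 * cost, beta the per-byte cost, n the message size in bytes; the helper names
 * are ours, for exposition only. */
static inline int ceil_log2(int p)
{
  int lg = 0;
  while ((1 << lg) < p)
    lg++;
  return lg;
}
static inline double bcast_binomial_cost(double alpha, double beta, double n, int p)
{
  return ceil_log2(p) * (alpha + n * beta); // lgp.alpha + n.lgp.beta
}
static inline double bcast_scatter_allgather_cost(double alpha, double beta, double n, int p)
{
  // 2.lgp.alpha + 2.n.((p-1)/p).beta: twice the latency, much less bandwidth
  return 2.0 * ceil_log2(p) * alpha + 2.0 * n * ((p - 1.0) / p) * beta;
}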
int Coll_bcast_mpich::bcast(void *buff, int count,
                            MPI_Datatype datatype, int root,
                            MPI_Comm comm)
{
  /* Decision function based on MX results for
     messages up to 36MB and communicator sizes up to 64 nodes */
  const size_t small_message_size = 12288;
  const size_t intermediate_message_size = 524288;

  int communicator_size;
  size_t message_size, dsize;

  if (!comm->is_smp_comm()) {
    if (comm->get_leaders_comm() == MPI_COMM_NULL) {
      comm->init_smp();
    }
    if (comm->is_uniform())
      return Coll_bcast_SMP_binomial::bcast(buff, count, datatype, root, comm);
  }

  communicator_size = comm->size();

  /* else we need data size for decision function */
  dsize = datatype->size();
  message_size = dsize * (unsigned long)count; /* needed for decision */

  /* Handle messages of small and intermediate size, and
     single-element broadcasts */
  if ((message_size < small_message_size) || (communicator_size <= 8)) {
    /* Binomial without segmentation */
    return Coll_bcast_binomial_tree::bcast(buff, count, datatype, root, comm);
  } else if (message_size < intermediate_message_size && !(communicator_size % 2)) {
    // SplittedBinary with 1KB segments
    return Coll_bcast_scatter_rdb_allgather::bcast(buff, count, datatype, root, comm);
  }
  // Handle large message sizes
  return Coll_bcast_scatter_LR_allgather::bcast(buff, count, datatype, root, comm);
}
/* This is the default implementation of reduce. The algorithm is:

   Algorithm: MPI_Reduce

   For long messages and for builtin ops and if count >= pof2 (where
   pof2 is the nearest power-of-two less than or equal to the number
   of processes), we use Rabenseifner's algorithm (see
   http://www.hlrs.de/organization/par/services/models/mpi/myreduce.html).
   This algorithm implements the reduce in two steps: first a
   reduce-scatter, followed by a gather to the root. A
   recursive-halving algorithm (beginning with processes that are
   distance 1 apart) is used for the reduce-scatter, and a binomial tree
   algorithm is used for the gather. The non-power-of-two case is
   handled by dropping to the nearest lower power-of-two: the first
   few odd-numbered processes send their data to their left neighbors
   (rank-1), and the reduce-scatter happens among the remaining
   power-of-two processes. If the root is one of the excluded
   processes, then after the reduce-scatter, rank 0 sends its result to
   the root and exits; the root now acts as rank 0 in the binomial tree
   algorithm for gather.

   For the power-of-two case, the cost for the reduce-scatter is
   lgp.alpha + n.((p-1)/p).beta + n.((p-1)/p).gamma. The cost for the
   gather to root is lgp.alpha + n.((p-1)/p).beta. Therefore, the total cost is:

   Cost = 2.lgp.alpha + 2.n.((p-1)/p).beta + n.((p-1)/p).gamma

   For the non-power-of-two case, assuming the root is not one of the
   odd-numbered processes that get excluded in the reduce-scatter,

   Cost = (2.floor(lgp)+1).alpha + (2.((p-1)/p) + 1).n.beta +
          n.(1+(p-1)/p).gamma

   For short messages, user-defined ops, and count < pof2, we use a
   binomial tree algorithm for both short and long messages.

   Cost = lgp.alpha + n.lgp.beta + n.lgp.gamma

   We use the binomial tree algorithm in the case of user-defined ops
   because in this case derived datatypes are allowed, and the user
   could pass basic datatypes on one process and derived on another as
   long as the type maps are the same. Breaking up derived datatypes
   to do the reduce-scatter is tricky.

   FIXME: Per the MPI-2.1 standard this case is not possible. We
   should be able to use the reduce-scatter/gather approach as long as
   count >= pof2. [goodell@ 2009-01-21]

   Possible improvements:

   End Algorithm: MPI_Reduce
*/
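/* Illustrative sketch, not called here: the rank remapping for the
 * non-power-of-two case described above. With rem = p - pof2, the first 2*rem
 * ranks pair up: odd ranks send their data to rank-1 and drop out of the
 * reduce-scatter, while their even neighbors survive. The helper name and the
 * -1 convention for excluded ranks are ours, for exposition only. */
static inline int reduce_surviving_rank(int rank, int comm_size, int pof2)
{
  int rem = comm_size - pof2;
  if (rank < 2 * rem)
    return (rank % 2) ? -1 : rank / 2; // odd ranks are excluded after sending
  return rank - rem;                   // remaining ranks shift down into [0, pof2)
}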
int Coll_reduce_mpich::reduce(const void *sendbuf, void *recvbuf,
                              int count, MPI_Datatype datatype,
                              MPI_Op op, int root, MPI_Comm comm)
{
  int communicator_size = 0;
  size_t message_size, dsize;

  if (!comm->is_smp_comm()) {
    if (comm->get_leaders_comm() == MPI_COMM_NULL) {
      comm->init_smp();
    }
    if (op->is_commutative() == 1)
      return Coll_reduce_mvapich2_two_level::reduce(sendbuf, recvbuf, count, datatype, op, root, comm);
  }

  communicator_size = comm->size();

  /* need data size for decision function */
  dsize = datatype->size();
  message_size = dsize * count; /* needed for decision */

  int pof2 = 1;
  while (pof2 <= communicator_size)
    pof2 <<= 1;
  pof2 >>= 1;

  if ((count < pof2) || (message_size < 2048) || (op != MPI_OP_NULL && not op->is_commutative())) {
    return Coll_reduce_binomial::reduce(sendbuf, recvbuf, count, datatype, op, root, comm);
  }
  return Coll_reduce_scatter_gather::reduce(sendbuf, recvbuf, count, datatype, op, root, comm);
}
/* This is the default implementation of reduce_scatter. The algorithm is:

   Algorithm: MPI_Reduce_scatter

   If the operation is commutative, for short and medium-size
   messages, we use a recursive-halving
   algorithm in which the first p/2 processes send the second n/2 data
   to their counterparts in the other half and receive the first n/2
   data from them. This procedure continues recursively, halving the
   data communicated at each step, for a total of lgp steps. If the
   number of processes is not a power-of-two, we convert it to the
   nearest lower power-of-two by having the first few even-numbered
   processes send their data to the neighboring odd-numbered process
   at (rank+1). Those odd-numbered processes compute the result for
   their left neighbor as well in the recursive halving algorithm, and
   then at the end send the result back to the processes that didn't
   participate.
   Therefore, if p is a power-of-two,

   Cost = lgp.alpha + n.((p-1)/p).beta + n.((p-1)/p).gamma

   If p is not a power-of-two,

   Cost = (floor(lgp)+2).alpha + n.(1+(p-1+n)/p).beta + n.(1+(p-1)/p).gamma

   The above cost in the non power-of-two case is approximate because
   there is some imbalance in the amount of work each process does
   because some processes do the work of their neighbors as well.

   For commutative operations and very long messages we use
   a pairwise exchange algorithm similar to
   the one used in MPI_Alltoall. At step i, each process sends n/p
   amount of data to (rank+i) and receives n/p amount of data from
   (rank-i).

   Cost = (p-1).alpha + n.((p-1)/p).beta + n.((p-1)/p).gamma

   If the operation is not commutative, we do the following:

   We use a recursive doubling algorithm, which
   takes lgp steps. At step 1, processes exchange (n-n/p) amount of
   data; at step 2, (n-2n/p) amount of data; at step 3, (n-4n/p)
   amount of data, and so forth.

   Cost = lgp.alpha + n.(lgp-(p-1)/p).beta + n.(lgp-(p-1)/p).gamma

   Possible improvements:

   End Algorithm: MPI_Reduce_scatter
*/
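/* Illustrative sketch, not called here: the data volume moved by the
 * recursive-halving reduce-scatter described above. At each of the lgp steps
 * the buffer of interest halves and that half is exchanged with the hypercube
 * neighbor, summing to n.((p-1)/p). The helper name is ours. */
static inline size_t recursive_halving_volume(size_t n, int pof2)
{
  size_t total = 0;
  for (int distance = pof2 / 2; distance > 0; distance /= 2) {
    n /= 2;     // each level is responsible for half of the previous buffer
    total += n; // ...and exchanges that half with its neighbor at `distance`
  }
  return total; // equals n_original * (pof2 - 1) / pof2, up to integer rounding
}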
int Coll_reduce_scatter_mpich::reduce_scatter(const void *sbuf, void *rbuf,
                                              const int *rcounts, MPI_Datatype dtype,
                                              MPI_Op op, MPI_Comm comm)
{
  int comm_size, i;
  size_t total_message_size;

  if (sbuf == rbuf)
    sbuf = MPI_IN_PLACE; // restore MPI_IN_PLACE as these algorithms handle it

  XBT_DEBUG("Coll_reduce_scatter_mpich::reduce");

  comm_size = comm->size();
  // We need data size for decision function
  total_message_size = 0;
  for (i = 0; i < comm_size; i++) {
    total_message_size += rcounts[i];
  }

  if ((op == MPI_OP_NULL || op->is_commutative()) && total_message_size > 524288) {
    return Coll_reduce_scatter_mpich_pair::reduce_scatter(sbuf, rbuf, rcounts, dtype, op, comm);
  } else if (op != MPI_OP_NULL && not op->is_commutative()) {
    int is_block_regular = 1;
    for (i = 0; i < (comm_size - 1); ++i) {
      if (rcounts[i] != rcounts[i + 1]) {
        is_block_regular = 0;
        break;
      }
    }

    /* slightly retask pof2 to mean pof2 equal or greater, not always greater as it is above */
    int pof2 = 1;
    while (pof2 < comm_size)
      pof2 <<= 1;

    if (pof2 == comm_size && is_block_regular) {
      /* noncommutative, pof2 size, and block regular */
      return Coll_reduce_scatter_mpich_noncomm::reduce_scatter(sbuf, rbuf, rcounts, dtype, op, comm);
    }

    return Coll_reduce_scatter_mpich_rdb::reduce_scatter(sbuf, rbuf, rcounts, dtype, op, comm);
  }
  return Coll_reduce_scatter_mpich_rdb::reduce_scatter(sbuf, rbuf, rcounts, dtype, op, comm);
}
/* This is the default implementation of allgather. The algorithm is:

   Algorithm: MPI_Allgather

   For short messages and non-power-of-two no. of processes, we use
   the algorithm from the Jehoshua Bruck et al IEEE TPDS Nov 97
   paper. It is a variant of the dissemination algorithm for
   barrier. It takes ceiling(lg p) steps.

   Cost = lgp.alpha + n.((p-1)/p).beta
   where n is total size of data gathered on each process.

   For short or medium-size messages and power-of-two no. of
   processes, we use the recursive doubling algorithm.

   Cost = lgp.alpha + n.((p-1)/p).beta

   TODO: On TCP, we may want to use recursive doubling instead of the Bruck
   algorithm in all cases because of the pairwise-exchange property of
   recursive doubling (see the Benson et al paper in Euro PVM/MPI).

   It is interesting to note that either of the above algorithms for
   MPI_Allgather has the same cost as the tree algorithm for MPI_Gather!

   For long messages or medium-size messages and non-power-of-two
   no. of processes, we use a ring algorithm. In the first step, each
   process i sends its contribution to process i+1 and receives
   the contribution from process i-1 (with wrap-around). From the
   second step onwards, each process i forwards to process i+1 the
   data it received from process i-1 in the previous step. This takes
   a total of p-1 steps.

   Cost = (p-1).alpha + n.((p-1)/p).beta

   We use this algorithm instead of recursive doubling for long
   messages because we find that this communication pattern (nearest
   neighbor) performs twice as fast as recursive doubling for long
   messages (on Myrinet and IBM SP).

   Possible improvements:

   End Algorithm: MPI_Allgather
*/
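/* Illustrative sketch, not called here: the ring schedule described above. In
 * step s (0-based), rank r sends the block that originated at rank (r-s) mod p
 * to its right neighbor and receives block (r-s-1) mod p from its left
 * neighbor; after p-1 steps every rank holds all p blocks. Helper name ours. */
static inline void ring_allgather_blocks(int rank, int p, int step, int* send_block, int* recv_block)
{
  *send_block = ((rank - step) % p + p) % p;     // block forwarded in this step
  *recv_block = ((rank - step - 1) % p + p) % p; // block arriving from the left
}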
int Coll_allgather_mpich::allgather(const void *sbuf, int scount,
                                    MPI_Datatype sdtype,
                                    void* rbuf, int rcount,
                                    MPI_Datatype rdtype,
                                    MPI_Comm comm)
{
  int communicator_size, pow2_size;
  size_t dsize, total_dsize;

  communicator_size = comm->size();

  /* Determine complete data size */
  dsize = sdtype->size();
  total_dsize = dsize * scount * communicator_size;

  for (pow2_size = 1; pow2_size < communicator_size; pow2_size <<= 1)
    ; /* empty body: compute smallest power of two >= communicator_size */

  /* Decision as in MPICH-2,
     presented in Thakur et al., "Optimization of Collective Communication
     Operations in MPICH", International Journal of High Performance Computing
     Applications, Vol. 19, No. 1, 49-66 (2005):
     - for power-of-two processes and small and medium size messages
       (up to 512KB) use recursive doubling,
     - for non-power-of-two processes and small messages (80KB) use bruck,
     - for everything else use ring. */
  if ((pow2_size == communicator_size) && (total_dsize < 524288)) {
    return Coll_allgather_rdb::allgather(sbuf, scount, sdtype,
                                         rbuf, rcount, rdtype, comm);
  } else if (total_dsize <= 81920) {
    return Coll_allgather_bruck::allgather(sbuf, scount, sdtype,
                                           rbuf, rcount, rdtype, comm);
  }
  return Coll_allgather_ring::allgather(sbuf, scount, sdtype,
                                        rbuf, rcount, rdtype, comm);
}
/* This is the default implementation of allgatherv. The algorithm is:

   Algorithm: MPI_Allgatherv

   For short messages and non-power-of-two no. of processes, we use
   the algorithm from the Jehoshua Bruck et al IEEE TPDS Nov 97
   paper. It is a variant of the dissemination algorithm for
   barrier. It takes ceiling(lg p) steps.

   Cost = lgp.alpha + n.((p-1)/p).beta
   where n is total size of data gathered on each process.

   For short or medium-size messages and power-of-two no. of
   processes, we use the recursive doubling algorithm.

   Cost = lgp.alpha + n.((p-1)/p).beta

   TODO: On TCP, we may want to use recursive doubling instead of the Bruck
   algorithm in all cases because of the pairwise-exchange property of
   recursive doubling (see the Benson et al paper in Euro PVM/MPI).

   For long messages or medium-size messages and non-power-of-two
   no. of processes, we use a ring algorithm. In the first step, each
   process i sends its contribution to process i+1 and receives
   the contribution from process i-1 (with wrap-around). From the
   second step onwards, each process i forwards to process i+1 the
   data it received from process i-1 in the previous step. This takes
   a total of p-1 steps.

   Cost = (p-1).alpha + n.((p-1)/p).beta

   Possible improvements:

   End Algorithm: MPI_Allgatherv
*/
int Coll_allgatherv_mpich::allgatherv(const void *sbuf, int scount,
                                      MPI_Datatype sdtype,
                                      void* rbuf, const int *rcounts,
                                      const int *rdispls, MPI_Datatype rdtype,
                                      MPI_Comm comm)
{
  int communicator_size, pow2_size, i;
  size_t total_dsize = 0;

  communicator_size = comm->size();

  /* Determine complete data size */
  for (i = 0; i < communicator_size; i++)
    total_dsize += rcounts[i];
  if (total_dsize == 0)
    return MPI_SUCCESS;

  for (pow2_size = 1; pow2_size < communicator_size; pow2_size <<= 1)
    ; /* empty body: compute smallest power of two >= communicator_size */

  if ((pow2_size == communicator_size) && (total_dsize < 524288)) {
    return Coll_allgatherv_mpich_rdb::allgatherv(sbuf, scount, sdtype,
                                                 rbuf, rcounts, rdispls, rdtype, comm);
  } else if (total_dsize <= 81920) {
    return Coll_allgatherv_ompi_bruck::allgatherv(sbuf, scount, sdtype,
                                                  rbuf, rcounts, rdispls, rdtype, comm);
  }
  return Coll_allgatherv_mpich_ring::allgatherv(sbuf, scount, sdtype,
                                                rbuf, rcounts, rdispls, rdtype, comm);
}
/* This is the default implementation of gather. The algorithm is:

   Algorithm: MPI_Gather

   We use a binomial tree algorithm for both short and long
   messages. At nodes other than leaf nodes we need to allocate a
   temporary buffer to store the incoming message. If the root is not
   rank 0, for very small messages, we pack the data into a temporary
   contiguous buffer and reorder it to be placed in the right
   order. For small (but not very small) messages, we use a derived
   datatype to unpack the incoming data into non-contiguous buffers in
   the right order. In the heterogeneous case we first pack the
   buffers by using MPI_Pack and then do the gather.

   Cost = lgp.alpha + n.((p-1)/p).beta
   where n is the total size of the data gathered at the root.

   Possible improvements:

   End Algorithm: MPI_Gather
*/
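/* Illustrative sketch, not called here: the binomial-tree shape used for
 * gather (and for scatter below). With ranks taken relative to the root, a
 * rank's parent is obtained by clearing its lowest set bit, which bounds the
 * tree depth by lgp. The helper name is ours, for exposition only. */
static inline int binomial_parent(int relative_rank)
{
  if (relative_rank == 0)
    return -1; // the root has no parent
  return relative_rank & (relative_rank - 1); // clear the lowest set bit
}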
int Coll_gather_mpich::gather(const void *sbuf, int scount, MPI_Datatype sdtype,
                              void* rbuf, int rcount, MPI_Datatype rdtype,
                              int root, MPI_Comm comm)
{
  return Coll_gather_ompi_binomial::gather(sbuf, scount, sdtype, rbuf, rcount, rdtype, root, comm);
}
/* This is the default implementation of scatter. The algorithm is:

   Algorithm: MPI_Scatter

   We use a binomial tree algorithm for both short and
   long messages. At nodes other than leaf nodes we need to allocate
   a temporary buffer to store the incoming message. If the root is
   not rank 0, we reorder the sendbuf in order of relative ranks by
   copying it into a temporary buffer, so that all the sends from the
   root are contiguous and in the right order. In the heterogeneous
   case, we first pack the buffer by using MPI_Pack and then do the
   scatter.

   Cost = lgp.alpha + n.((p-1)/p).beta
   where n is the total size of the data to be scattered from the root.

   Possible improvements:

   End Algorithm: MPI_Scatter
*/
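/* Illustrative sketch, not called here: the relative-rank renumbering
 * mentioned above. Rotating ranks so that the root becomes 0 makes the root's
 * sends contiguous and in the right order. The helper name is ours. */
static inline int relative_rank(int rank, int root, int comm_size)
{
  return (rank - root + comm_size) % comm_size;
}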
int Coll_scatter_mpich::scatter(const void *sbuf, int scount, MPI_Datatype sdtype,
                                void* rbuf, int rcount, MPI_Datatype rdtype,
                                int root, MPI_Comm comm)
{
  std::unique_ptr<unsigned char[]> tmp_buf;
  if (comm->rank() != root) {
    // non-root ranks may pass an invalid send buffer: substitute a dummy one
    tmp_buf.reset(new unsigned char[rcount * rdtype->get_extent()]);
    sbuf   = tmp_buf.get();
    scount = rcount;
    sdtype = rdtype;
  }
  return Coll_scatter_ompi_binomial::scatter(sbuf, scount, sdtype, rbuf, rcount, rdtype, root, comm);
}