src/smpi/colls/smpi_openmpi_selector.c
/* Selector for collective algorithms, based on Open MPI's default
 * coll_tuned_decision_fixed selector. */

/* Copyright (c) 2009, 2010. The SimGrid Team.
 * All rights reserved.                                                     */

/* This program is free software; you can redistribute it and/or modify it
 * under the terms of the license (GNU LGPL) which comes with this package. */

#include "colls_private.h"

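/* Each decision function below follows the same pattern: compute the
 * per-rank payload in bytes (smpi_datatype_size() times the element count),
 * then compare it, together with the communicator size, against thresholds
 * that Open MPI derived from measurements on its reference clusters.  For
 * instance, assuming an 8-byte MPI_DOUBLE, an allreduce of 1000 elements has
 * a block size of 8000 bytes. */
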
int smpi_coll_tuned_allreduce_ompi(void *sbuf, void *rbuf, int count,
                        MPI_Datatype dtype, MPI_Op op, MPI_Comm comm)
{
    size_t dsize, block_dsize;
    int comm_size = smpi_comm_size(comm);
    const size_t intermediate_message = 10000;

    /**
     * Decision function based on MX results from the Grig cluster at UTK.
     *
     * Currently, the linear, recursive doubling, and nonoverlapping algorithms
     * can handle both commutative and non-commutative operations.
     * The ring algorithm does not support non-commutative operations.
     */
    dsize = smpi_datatype_size(dtype);
    block_dsize = dsize * count;

    if (block_dsize < intermediate_message) {
        return (smpi_coll_tuned_allreduce_rdb (sbuf, rbuf,
                                               count, dtype,
                                               op, comm));
    }

    if( smpi_op_is_commute(op) && (count > comm_size) ) {
        const size_t segment_size = 1 << 20; /* 1 MB */
        if (comm_size * segment_size >= block_dsize) {
            //FIXME: ok, these are not the right algorithms, try to find closer ones
            // lr is a good match for allreduce_ring (the difference is mainly the use of sendrecv)
            return smpi_coll_tuned_allreduce_lr(sbuf, rbuf, count, dtype,
                                                op, comm);
        } else {
           // return (smpi_coll_tuned_allreduce_intra_ring_segmented (sbuf, rbuf,
           return (smpi_coll_tuned_allreduce_ompi_ring_segmented (sbuf, rbuf,
                                                                  count, dtype,
                                                                  op, comm
                                                                  /*segment_size*/));
        }
    }

    return (smpi_coll_tuned_allreduce_redbcast(sbuf, rbuf, count,
                                               dtype, op, comm));
}
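
/* Illustrative usage sketch (hypothetical caller): an allreduce of 1024
 * doubles could be dispatched through the selector as
 *
 *     double in[1024], out[1024];
 *     smpi_coll_tuned_allreduce_ompi(in, out, 1024, MPI_DOUBLE,
 *                                    MPI_SUM, MPI_COMM_WORLD);
 *
 * Assuming an 8-byte MPI_DOUBLE, block_dsize = 8 * 1024 = 8192 bytes, which
 * is below the 10000-byte threshold, so the recursive-doubling variant
 * (smpi_coll_tuned_allreduce_rdb) is selected regardless of the communicator
 * size. */
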
int smpi_coll_tuned_alltoall_ompi( void *sbuf, int scount,
                                   MPI_Datatype sdtype,
                                   void* rbuf, int rcount,
                                   MPI_Datatype rdtype,
                                   MPI_Comm comm)
{
    int communicator_size;
    size_t dsize, block_dsize;
    communicator_size = smpi_comm_size(comm);

    /* Decision function based on measurements on the Grig cluster at
       the University of Tennessee (2GB MX), up to 64 nodes.
       It has better performance for intermediate-size messages than the old one. */
    /* determine block size */
    dsize = smpi_datatype_size(sdtype);
    block_dsize = dsize * scount;

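    /* Worked example (illustrative, assuming a 4-byte MPI_INT): with
     * scount = 40 the block is 160 bytes, so on more than 12 ranks the
     * Bruck algorithm is used; with scount = 500 the block is 2000 bytes
     * and the simple (basic linear) algorithm is used; anything of
     * 3000 bytes or more falls through to the pairwise exchange below. */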
    if ((block_dsize < 200) && (communicator_size > 12)) {
        return smpi_coll_tuned_alltoall_bruck(sbuf, scount, sdtype,
                                              rbuf, rcount, rdtype,
                                              comm);

    } else if (block_dsize < 3000) {
        return smpi_coll_tuned_alltoall_simple(sbuf, scount, sdtype,
                                               rbuf, rcount, rdtype,
                                               comm);
    }

    return smpi_coll_tuned_alltoall_pair (sbuf, scount, sdtype,
                                          rbuf, rcount, rdtype,
                                          comm);
}

int smpi_coll_tuned_alltoallv_ompi(void *sbuf, int *scounts, int *sdisps,
                                   MPI_Datatype sdtype,
                                   void *rbuf, int *rcounts, int *rdisps,
                                   MPI_Datatype rdtype,
                                   MPI_Comm  comm
                                   )
{
    /* For starters, just keep the original algorithm. */
    return smpi_coll_tuned_alltoallv_bruck(sbuf, scounts, sdisps, sdtype,
                                           rbuf, rcounts, rdisps, rdtype,
                                           comm);
}

/*
void smpi_coll_tuned_barrier_ompi(MPI_Comm  comm)
{
    int communicator_size = smpi_comm_size(comm);

    if( 2 == communicator_size )
        return smpi_coll_tuned_barrier_intra_two_procs(comm, module);

     * Basic optimisation: if we have a power-of-2 number of nodes,
     * then use the recursive doubling algorithm; otherwise
     * Bruck is the one we want.

    {
        bool has_one = false;
        for( ; communicator_size > 0; communicator_size >>= 1 ) {
            if( communicator_size & 0x1 ) {
                if( has_one )
                    return smpi_coll_tuned_barrier_intra_bruck(comm, module);
                has_one = true;
            }
        }
    }
    return smpi_coll_tuned_barrier_intra_recursivedoubling(comm, module);
}*/
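/* The loop in the disabled barrier above counts the set bits of
 * communicator_size: a second set bit means the size is not a power of two,
 * so Bruck is chosen.  For example (illustrative), 12 = 0b1100 has two set
 * bits and would use Bruck, while 16 = 0b10000 has a single set bit and
 * would use recursive doubling. */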

int smpi_coll_tuned_bcast_ompi(void *buff, int count,
                               MPI_Datatype datatype, int root,
                               MPI_Comm  comm
                               )
{
    /* Decision function based on MX results for
       messages up to 36MB and communicator sizes up to 64 nodes */
    const size_t small_message_size = 2048;
    const size_t intermediate_message_size = 370728;
    //const double a_p16  = 3.2118e-6; /* [1 / byte] */
    //const double b_p16  = 8.7936;
    //const double a_p64  = 2.3679e-6; /* [1 / byte] */
    //const double b_p64  = 1.1787;
    //const double a_p128 = 1.6134e-6; /* [1 / byte] */
    //const double b_p128 = 2.1102;

    int communicator_size;
    //int segsize = 0;
    size_t message_size, dsize;

    communicator_size = smpi_comm_size(comm);

    /* we need the data size for the decision function */
    dsize = smpi_datatype_size(datatype);
    message_size = dsize * (unsigned long)count;   /* needed for decision */

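    /* Worked example (illustrative, assuming an 8-byte MPI_DOUBLE):
     * broadcasting 100 elements gives message_size = 800 bytes < 2048, so
     * the binomial tree is used; 10000 elements give 80000 bytes, which is
     * below 370728, so the split binary tree with 1KB segments is used;
     * larger messages use the split binary tree only on fewer than 13 ranks
     * and otherwise fall back to the flat-tree pipeline. */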
    /* Handle messages of small and intermediate size, and
       single-element broadcasts */
    if ((message_size < small_message_size) || (count <= 1)) {
        /* Binomial without segmentation */
        return  smpi_coll_tuned_bcast_binomial_tree (buff, count, datatype,
                                                     root, comm);

    } else if (message_size < intermediate_message_size) {
        // Split binary with 1KB segments
        return smpi_coll_tuned_bcast_ompi_split_bintree(buff, count, datatype,
                                                        root, comm);

    } /*
     Handle large message sizes
    else if (communicator_size < (a_p128 * message_size + b_p128)) {
         Pipeline with 128KB segments
        segsize = 1024  << 7;
        return smpi_coll_tuned_bcast_flattree_pipeline (buff, count, datatype,
                                                     root, comm, module,
                                                     segsize);

    }*/ else if (communicator_size < 13) {
        // Split binary with 8KB segments
        return smpi_coll_tuned_bcast_ompi_split_bintree(buff, count, datatype,
                                                        root, comm);

    } /*else if (communicator_size < (a_p64 * message_size + b_p64)) {
        // Pipeline with 64KB segments
        segsize = 1024 << 6;
        return smpi_coll_tuned_bcast_intra_pipeline (buff, count, datatype,
                                                     root, comm, module,
                                                     segsize);

    } else if (communicator_size < (a_p16 * message_size + b_p16)) {
         Pipeline with 16KB segments
        //segsize = 1024 << 4;
        return smpi_coll_tuned_bcast_flattree_pipeline (buff, count, datatype,
                                                     root, comm, module,
                                                     segsize);

    }*/

    /* Pipeline with 8KB segments */
    //segsize = 1024 << 3;
    return smpi_coll_tuned_bcast_flattree_pipeline (buff, count, datatype,
                                                    root, comm
                                                    /*segsize*/);
#if 0
    /* this is based on gige measurements */

    if (communicator_size  < 4) {
        return smpi_coll_tuned_bcast_intra_basic_linear (buff, count, datatype, root, comm, module);
    }
    if (communicator_size == 4) {
        if (message_size < 524288) segsize = 0;
        else segsize = 16384;
        return smpi_coll_tuned_bcast_intra_bintree (buff, count, datatype, root, comm, module, segsize);
    }
    if (communicator_size <= 8 && message_size < 4096) {
        return smpi_coll_tuned_bcast_intra_basic_linear (buff, count, datatype, root, comm, module);
    }
    if (communicator_size > 8 && message_size >= 32768 && message_size < 524288) {
        segsize = 16384;
        return  smpi_coll_tuned_bcast_intra_bintree (buff, count, datatype, root, comm, module, segsize);
    }
    if (message_size >= 524288) {
        segsize = 16384;
        return smpi_coll_tuned_bcast_intra_pipeline (buff, count, datatype, root, comm, module, segsize);
    }
    segsize = 0;
    /* once tested can swap this back in */
    /* return smpi_coll_tuned_bcast_intra_bmtree (buff, count, datatype, root, comm, segsize); */
    return smpi_coll_tuned_bcast_intra_bintree (buff, count, datatype, root, comm, module, segsize);
#endif  /* 0 */
}

int smpi_coll_tuned_reduce_ompi( void *sendbuf, void *recvbuf,
                                 int count, MPI_Datatype  datatype,
                                 MPI_Op   op, int root,
                                 MPI_Comm   comm
                                 )
{
    int communicator_size = 0;
    //int segsize = 0;
    size_t message_size, dsize;
    //const double a1 =  0.6016 / 1024.0; /* [1/B] */
    //const double b1 =  1.3496;
    //const double a2 =  0.0410 / 1024.0; /* [1/B] */
    //const double b2 =  9.7128;
    //const double a3 =  0.0422 / 1024.0; /* [1/B] */
    //const double b3 =  1.1614;
    //const double a4 =  0.0033 / 1024.0; /* [1/B] */
    //const double b4 =  1.6761;

    //const int max_requests = 0; /* no limit on # of outstanding requests */

    communicator_size = smpi_comm_size(comm);

    /* we need the data size for the decision function */
    dsize = smpi_datatype_size(datatype);
    message_size = dsize * count;   /* needed for decision */

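    /* Worked example (illustrative, assuming a 4-byte MPI_FLOAT): on 4 ranks,
     * reducing 100 elements gives message_size = 400 bytes < 512, so the flat
     * tree is used; reducing 1000 elements gives 4000 bytes, which matches
     * the (communicator_size < 8 && message_size < 20480) case, so the
     * binomial tree is used; once neither small-message case applies, the
     * call falls through to the pipelined NTSL algorithm below. */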
    /**
     * If the operation is non-commutative we currently have a choice of the
     * linear or the in-order binary tree algorithm.
     */
/*    if( !ompi_op_is_commute(op) ) {
        if ((communicator_size < 12) && (message_size < 2048)) {
            return smpi_coll_tuned_reduce_intra_basic_linear (sendbuf, recvbuf, count, datatype, op, root, comm, module);
        }
        return smpi_coll_tuned_reduce_intra_in_order_binary (sendbuf, recvbuf, count, datatype, op, root, comm, module,
                                                             0, max_requests);
    }*/

    if ((communicator_size < 8) && (message_size < 512)){
        /* Linear_0K */
        return smpi_coll_tuned_reduce_flat_tree (sendbuf, recvbuf, count, datatype, op, root, comm);
    } else if (((communicator_size < 8) && (message_size < 20480)) ||
               (message_size < 2048) || (count <= 1)) {
        /* Binomial_0K */
        //segsize = 0;
        return smpi_coll_tuned_reduce_binomial(sendbuf, recvbuf, count, datatype, op, root, comm/*, module,
                                                     segsize, max_requests*/);
    } /*else if (communicator_size > (a1 * message_size + b1)) {
        // Binomial_1K
        segsize = 1024;
        return smpi_coll_tuned_reduce_intra_binomial(sendbuf, recvbuf, count, datatype, op, root, comm, module,
                                                     segsize, max_requests);
    } else if (communicator_size > (a2 * message_size + b2)) {
        // Pipeline_1K
        segsize = 1024;
        return smpi_coll_tuned_reduce_NTSL (sendbuf, recvbuf, count, datatype, op, root, comm, module,
                                                      segsize, max_requests);
    } else if (communicator_size > (a3 * message_size + b3)) {
        // Binary_32K
        segsize = 32*1024;
        return smpi_coll_tuned_reduce_intra_binary( sendbuf, recvbuf, count, datatype, op, root,
                                                    comm, module, segsize, max_requests);
    }
    if (communicator_size > (a4 * message_size + b4)) {
        // Pipeline_32K
        segsize = 32*1024;
    } else {
        // Pipeline_64K
        segsize = 64*1024;
    }*/
    return smpi_coll_tuned_reduce_NTSL (sendbuf, recvbuf, count, datatype, op, root, comm/*, module,
                                                  segsize, max_requests*/);

#if 0
    /* for small messages use linear algorithm */
    if (message_size <= 4096) {
        segsize = 0;
        fanout = communicator_size - 1;
        /* when linear implemented or taken from basic put here, right now using chain as a linear system */
        /* it is implemented and I shouldn't be calling a chain with a fanout bigger than MAXTREEFANOUT from topo.h! */
        return smpi_coll_tuned_reduce_intra_basic_linear (sendbuf, recvbuf, count, datatype, op, root, comm, module);
        /*        return smpi_coll_tuned_reduce_intra_chain (sendbuf, recvbuf, count, datatype, op, root, comm, segsize, fanout); */
    }
    if (message_size < 524288) {
        if (message_size <= 65536 ) {
            segsize = 32768;
            fanout = 8;
        } else {
            segsize = 1024;
            fanout = communicator_size/2;
        }
        /* later swap this for a binary tree */
        /*         fanout = 2; */
        return smpi_coll_tuned_reduce_intra_chain (sendbuf, recvbuf, count, datatype, op, root, comm, module,
                                                   segsize, fanout, max_requests);
    }
    segsize = 1024;
    return smpi_coll_tuned_reduce_intra_pipeline (sendbuf, recvbuf, count, datatype, op, root, comm, module,
                                                  segsize, max_requests);
#endif  /* 0 */
}

/*int smpi_coll_tuned_reduce_scatter_ompi( void *sbuf, void *rbuf,
                                                    int *rcounts,
                                                    MPI_Datatype dtype,
                                                    MPI_Op  op,
                                                    MPI_Comm  comm,
                                                    )
{
    int comm_size, i, pow2;
    size_t total_message_size, dsize;
    const double a = 0.0012;
    const double b = 8.0;
    const size_t small_message_size = 12 * 1024;
    const size_t large_message_size = 256 * 1024;
    bool zerocounts = false;

    OPAL_OUTPUT((smpi_coll_tuned_stream, "smpi_coll_tuned_reduce_scatter_ompi"));

    comm_size = smpi_comm_size(comm);
    // We need the data size for the decision function
    ompi_datatype_type_size(dtype, &dsize);
    total_message_size = 0;
    for (i = 0; i < comm_size; i++) {
        total_message_size += rcounts[i];
        if (0 == rcounts[i]) {
            zerocounts = true;
        }
    }

    if( !ompi_op_is_commute(op) || (zerocounts)) {
        return smpi_coll_tuned_reduce_scatter_intra_nonoverlapping (sbuf, rbuf, rcounts,
                                                                    dtype, op,
                                                                    comm, module);
    }

    total_message_size *= dsize;

    // compute the nearest power of 2
    for (pow2 = 1; pow2 < comm_size; pow2 <<= 1);

    if ((total_message_size <= small_message_size) ||
        ((total_message_size <= large_message_size) && (pow2 == comm_size)) ||
        (comm_size >= a * total_message_size + b)) {
        return
            smpi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(sbuf, rbuf, rcounts,
                                                                        dtype, op,
                                                                        comm, module);
    }
    return smpi_coll_tuned_reduce_scatter_intra_ring(sbuf, rbuf, rcounts,
                                                     dtype, op,
                                                     comm, module);


    return smpi_coll_tuned_reduce_scatter(sbuf, rbuf, rcounts,
                                                     dtype, op,
                                                     comm);

}*/
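/* In the disabled decision above (illustrative numbers): with 16 ranks each
 * receiving 256 elements of an 8-byte type, total_message_size is
 * 16 * 256 * 8 = 32768 bytes, which exceeds the 12KB small-message limit but
 * stays under 256KB with a power-of-two communicator, so recursive halving
 * would be chosen; non-commutative operations or zero counts always fall
 * back to the nonoverlapping variant. */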

int smpi_coll_tuned_allgather_ompi(void *sbuf, int scount,
                                   MPI_Datatype sdtype,
                                   void* rbuf, int rcount,
                                   MPI_Datatype rdtype,
                                   MPI_Comm  comm
                                   )
{
    int communicator_size, pow2_size;
    size_t dsize, total_dsize;

    communicator_size = smpi_comm_size(comm);

    /* Special case for 2 processes */
    if (communicator_size == 2) {
        return smpi_coll_tuned_allgather_pair (sbuf, scount, sdtype,
                                               rbuf, rcount, rdtype,
                                               comm/*, module*/);
    }

    /* Determine complete data size */
    dsize = smpi_datatype_size(sdtype);
    total_dsize = dsize * scount * communicator_size;

    for (pow2_size  = 1; pow2_size < communicator_size; pow2_size <<= 1);

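    /* Worked example (illustrative, assuming a 4-byte MPI_INT): with
     * scount = 100 on 16 ranks, total_dsize = 4 * 100 * 16 = 6400 bytes,
     * which is under 50000, and 16 is a power of two, so recursive doubling
     * is chosen; on 12 ranks Bruck would be chosen instead, and any total
     * of 50000 bytes or more uses the ring algorithm. */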
    /* Decision based on MX 2Gb results from the Grig cluster at
       the University of Tennessee, Knoxville:
       - if the total message size is less than 50KB, use either bruck or
       recursive doubling for non-power-of-two and power-of-two node counts,
       respectively.
       - else use the ring and neighbor exchange algorithms for odd and even
       numbers of nodes, respectively.
    */
    if (total_dsize < 50000) {
        if (pow2_size == communicator_size) {
            return smpi_coll_tuned_allgather_rdb(sbuf, scount, sdtype,
                                                 rbuf, rcount, rdtype,
                                                 comm);
        } else {
            return smpi_coll_tuned_allgather_bruck(sbuf, scount, sdtype,
                                                   rbuf, rcount, rdtype,
                                                   comm);
        }
    } else {
        //if (communicator_size % 2) {
            return smpi_coll_tuned_allgather_ring(sbuf, scount, sdtype,
                                                  rbuf, rcount, rdtype,
                                                  comm);
        /*} else {
            return  smpi_coll_tuned_allgather_intra_neighborexchange(sbuf, scount, sdtype,
                                                                     rbuf, rcount, rdtype,
                                                                     comm, module);
        }*/
    }

#if defined(USE_MPICH2_DECISION)
    /* Decision as in MPICH-2,
       presented in Thakur et al., "Optimization of Collective Communication
       Operations in MPICH", International Journal of High Performance Computing
       Applications, Vol. 19, No. 1, 49-66 (2005):
       - for power-of-two process counts and small and medium size messages
       (up to 512KB), use recursive doubling;
       - for non-power-of-two process counts and small messages (up to 80KB), use bruck;
       - for everything else, use ring.
    */
    if ((pow2_size == communicator_size) && (total_dsize < 524288)) {
        return smpi_coll_tuned_allgather_intra_recursivedoubling(sbuf, scount, sdtype,
                                                                 rbuf, rcount, rdtype,
                                                                 comm, module);
    } else if (total_dsize <= 81920) {
        return smpi_coll_tuned_allgather_intra_bruck(sbuf, scount, sdtype,
                                                     rbuf, rcount, rdtype,
                                                     comm, module);
    }
    return smpi_coll_tuned_allgather_intra_ring(sbuf, scount, sdtype,
                                                rbuf, rcount, rdtype,
                                                comm, module);
#endif  /* defined(USE_MPICH2_DECISION) */
}

int smpi_coll_tuned_allgatherv_ompi(void *sbuf, int scount,
                                    MPI_Datatype sdtype,
                                    void* rbuf, int *rcounts,
                                    int *rdispls,
                                    MPI_Datatype rdtype,
                                    MPI_Comm  comm
                                    )
{
    int i;
    int communicator_size;
    size_t dsize, total_dsize;

    communicator_size = smpi_comm_size(comm);

    /* Special case for 2 processes */
    if (communicator_size == 2) {
        return smpi_coll_tuned_allgatherv_pair(sbuf, scount, sdtype,
                                               rbuf, rcounts, rdispls, rdtype,
                                               comm);
    }

    /* Determine complete data size */
    dsize = smpi_datatype_size(sdtype);
    total_dsize = 0;
    for (i = 0; i < communicator_size; i++) {
        total_dsize += dsize * rcounts[i];
    }

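    /* Note: because the bruck and neighbor-exchange variants are commented
     * out below, both the small- and large-message branches of the decision
     * currently select the ring algorithm. */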
    /* Decision based on the allgather decision. */
    if (total_dsize < 50000) {
/*        return smpi_coll_tuned_allgatherv_intra_bruck(sbuf, scount, sdtype,
                                                      rbuf, rcounts, rdispls, rdtype,
                                                      comm, module);*/
        return smpi_coll_tuned_allgatherv_ring(sbuf, scount, sdtype,
                                               rbuf, rcounts, rdispls, rdtype,
                                               comm);

    } else {
//        if (communicator_size % 2) {
            return smpi_coll_tuned_allgatherv_ring(sbuf, scount, sdtype,
                                                   rbuf, rcounts, rdispls, rdtype,
                                                   comm);
/*        } else {
            return  smpi_coll_tuned_allgatherv_intra_neighborexchange(sbuf, scount, sdtype,
                                                                      rbuf, rcounts, rdispls, rdtype,
                                                                      comm, module);
        }*/
    }
}
/*
int smpi_coll_tuned_gather_ompi(void *sbuf, int scount,
                                           MPI_Datatype sdtype,
                                           void* rbuf, int rcount,
                                           MPI_Datatype rdtype,
                                           int root,
                                           MPI_Comm  comm,
                                           )
{
    const int large_segment_size = 32768;
    const int small_segment_size = 1024;

    const size_t large_block_size = 92160;
    const size_t intermediate_block_size = 6000;
    const size_t small_block_size = 1024;

    const int large_communicator_size = 60;
    const int small_communicator_size = 10;

    int communicator_size, rank;
    size_t dsize, block_size;

    OPAL_OUTPUT((smpi_coll_tuned_stream,
                 "smpi_coll_tuned_gather_ompi"));

    communicator_size = smpi_comm_size(comm);
    rank = ompi_comm_rank(comm);

    // Determine block size
    if (rank == root) {
        ompi_datatype_type_size(rdtype, &dsize);
        block_size = dsize * rcount;
    } else {
        ompi_datatype_type_size(sdtype, &dsize);
        block_size = dsize * scount;
    }

    if (block_size > large_block_size) {
        return smpi_coll_tuned_gather_intra_linear_sync (sbuf, scount, sdtype,
                                                         rbuf, rcount, rdtype,
                                                         root, comm, module,
                                                         large_segment_size);

    } else if (block_size > intermediate_block_size) {
        return smpi_coll_tuned_gather_intra_linear_sync (sbuf, scount, sdtype,
                                                         rbuf, rcount, rdtype,
                                                         root, comm, module,
                                                         small_segment_size);

    } else if ((communicator_size > large_communicator_size) ||
               ((communicator_size > small_communicator_size) &&
                (block_size < small_block_size))) {
        return smpi_coll_tuned_gather_intra_binomial (sbuf, scount, sdtype,
                                                      rbuf, rcount, rdtype,
                                                      root, comm, module);

    }
    // Otherwise, use basic linear
    return smpi_coll_tuned_gather_intra_basic_linear (sbuf, scount, sdtype,
                                                      rbuf, rcount, rdtype,
                                                      root, comm, module);
}*/
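/* In the disabled gather decision above (illustrative, assuming a 4-byte
 * MPI_INT at the root): rcount = 30000 gives a 120000-byte block, above the
 * 92160-byte limit, so synchronized linear with 32KB segments would be used;
 * rcount = 2000 gives 8000 bytes, above 6000, so synchronized linear with
 * 1KB segments; rcount = 100 (400 bytes) on more than 10 ranks would use the
 * binomial tree, and everything else the basic linear gather. */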
/*
int smpi_coll_tuned_scatter_ompi(void *sbuf, int scount,
                                            MPI_Datatype sdtype,
                                            void* rbuf, int rcount,
                                            MPI_Datatype rdtype,
                                            int root, MPI_Comm  comm,
                                            )
{
    const size_t small_block_size = 300;
    const int small_comm_size = 10;
    int communicator_size, rank;
    size_t dsize, block_size;

    OPAL_OUTPUT((smpi_coll_tuned_stream,
                 "smpi_coll_tuned_scatter_ompi"));

    communicator_size = smpi_comm_size(comm);
    rank = ompi_comm_rank(comm);
    // Determine block size
    if (root == rank) {
        ompi_datatype_type_size(sdtype, &dsize);
        block_size = dsize * scount;
    } else {
        ompi_datatype_type_size(rdtype, &dsize);
        block_size = dsize * rcount;
    }

    if ((communicator_size > small_comm_size) &&
        (block_size < small_block_size)) {
        return smpi_coll_tuned_scatter_intra_binomial (sbuf, scount, sdtype,
                                                       rbuf, rcount, rdtype,
                                                       root, comm, module);
    }
    return smpi_coll_tuned_scatter_intra_basic_linear (sbuf, scount, sdtype,
                                                       rbuf, rcount, rdtype,
                                                       root, comm, module);
}*/
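/* In the disabled scatter decision above (illustrative, assuming a 4-byte
 * MPI_INT): scattering scount = 50 elements (200 bytes) to more than 10
 * ranks would use the binomial tree; larger blocks, or communicators of 10
 * ranks or fewer, would use the basic linear scatter. */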