Logo AND Algorithmique Numérique Distribuée

Public GIT Repository
remove warnings in smpi selector
[simgrid.git] / src / smpi / colls / smpi_openmpi_selector.c
1 /* selector for collective algorithms based on openmpi's default coll_tuned_decision_fixed selector */
2
3 /* Copyright (c) 2009, 2010. The SimGrid Team.
4  * All rights reserved.                                                     */
5
6 /* This program is free software; you can redistribute it and/or modify it
7  * under the terms of the license (GNU LGPL) which comes with this package. */
8
9 #include "colls_private.h"
10
11
12 int smpi_coll_tuned_allreduce_ompi(void *sbuf, void *rbuf, int count,
13                         MPI_Datatype dtype, MPI_Op op, MPI_Comm comm)
14 {
15     size_t dsize, block_dsize;
16     int comm_size = smpi_comm_size(comm);
17     const size_t intermediate_message = 10000;
18
19     /**
20      * Decision function based on MX results from the Grig cluster at UTK.
21      * 
22      * Currently, linear, recursive doubling, and nonoverlapping algorithms 
23      * can handle both commutative and non-commutative operations.
24      * Ring algorithm does not support non-commutative operations.
25      */
26     dsize = smpi_datatype_size(dtype);
27     block_dsize = dsize * count;
28
29     if (block_dsize < intermediate_message) {
30         return (smpi_coll_tuned_allreduce_rdb (sbuf, rbuf, 
31                                                                    count, dtype,
32                                                                    op, comm));
33     } 
34
35     if( /*smpi_op_is_commute(op) && */(count > comm_size) ) {
36         const size_t segment_size = 1 << 20; /* 1 MB */
37         if ((comm_size * segment_size >= block_dsize)) {
38             //FIXME: ok, these are not the right algorithms, try to find closer ones
39             // lr is a good match for allreduce_ring (difference is mainly the use of sendrecv)
40             return smpi_coll_tuned_allreduce_lr(sbuf, rbuf, count, dtype,
41                                               op, comm);
42         } else {
43            // return (smpi_coll_tuned_allreduce_intra_ring_segmented (sbuf, rbuf, 
44            return (smpi_coll_tuned_allreduce_rab2 (sbuf, rbuf,
45                                                                     count, dtype, 
46                                                                     op, comm 
47                                                                     /*segment_size*/));
48         }
49     }
50
51     return (smpi_coll_tuned_allreduce_redbcast(sbuf, rbuf, count, 
52                                                             dtype, op, comm));
53 }
54
55
56
57 int smpi_coll_tuned_alltoall_ompi( void *sbuf, int scount, 
58                                              MPI_Datatype sdtype,
59                                              void* rbuf, int rcount, 
60                                              MPI_Datatype rdtype, 
61                                              MPI_Comm comm)
62 {
63     int communicator_size;
64     size_t dsize, block_dsize;
65     communicator_size = smpi_comm_size(comm);
66
67     /* Decision function based on measurement on Grig cluster at 
68        the University of Tennessee (2GB MX) up to 64 nodes.
69        Has better performance for messages of intermediate sizes than the old one */
70     /* determine block size */
71     dsize = smpi_datatype_size(sdtype);
72     block_dsize = dsize * scount;
73
74     if ((block_dsize < 200) && (communicator_size > 12)) {
75         return smpi_coll_tuned_alltoall_bruck(sbuf, scount, sdtype, 
76                                                     rbuf, rcount, rdtype,
77                                                     comm);
78
79     } else if (block_dsize < 3000) {
80         return smpi_coll_tuned_alltoall_simple(sbuf, scount, sdtype, 
81                                                            rbuf, rcount, rdtype, 
82                                                            comm);
83     }
84
85     return smpi_coll_tuned_alltoall_pair (sbuf, scount, sdtype, 
86                                                     rbuf, rcount, rdtype,
87                                                     comm);
88 }
89
90 int smpi_coll_tuned_alltoallv_ompi(void *sbuf, int *scounts, int *sdisps,
91                                               MPI_Datatype sdtype,
92                                               void *rbuf, int *rcounts, int *rdisps,
93                                               MPI_Datatype rdtype,
94                                               MPI_Comm  comm
95                                               )
96 {
97     /* For starters, just keep the original algorithm. */
98     return smpi_coll_tuned_alltoallv_bruck(sbuf, scounts, sdisps, sdtype, 
99                                                         rbuf, rcounts, rdisps,rdtype,
100                                                         comm);
101 }
102
103 /*
104 void smpi_coll_tuned_barrier_ompi(MPI_Comm  comm)
105 {    int communicator_size = smpi_comm_size(comm);
106
107     if( 2 == communicator_size )
108         return smpi_coll_tuned_barrier_intra_two_procs(comm, module);
109      * Basic optimisation. If we have a power of 2 number of nodes
110      * then use the recursive doubling algorithm, otherwise
111      * bruck is the one we want.
112     {
113         bool has_one = false;
114         for( ; communicator_size > 0; communicator_size >>= 1 ) {
115             if( communicator_size & 0x1 ) {
116                 if( has_one )
117                     return smpi_coll_tuned_barrier_intra_bruck(comm, module);
118                 has_one = true;
119             }
120         }
121     }
122     return smpi_coll_tuned_barrier_intra_recursivedoubling(comm, module);
123 }*/
124
125 int smpi_coll_tuned_bcast_ompi(void *buff, int count,
126                                           MPI_Datatype datatype, int root,
127                                           MPI_Comm  comm
128                                           )
129 {
130     /* Decision function based on MX results for 
131        messages up to 36MB and communicator sizes up to 64 nodes */
132     //const size_t small_message_size = 2048;
133     const size_t intermediate_message_size = 370728;
134     //const double a_p16  = 3.2118e-6; /* [1 / byte] */
135     //const double b_p16  = 8.7936;   
136     //const double a_p64  = 2.3679e-6; /* [1 / byte] */
137     //const double b_p64  = 1.1787;     
138     //const double a_p128 = 1.6134e-6; /* [1 / byte] */
139     //const double b_p128 = 2.1102;
140
141     //int communicator_size;
142     //int segsize = 0;
143     size_t message_size, dsize;
144
145     //communicator_size = smpi_comm_size(comm);
146
147     /* else we need data size for decision function */
148     dsize = smpi_datatype_size(datatype);
149     message_size = dsize * (unsigned long)count;   /* needed for decision */
150
151     /* Handle messages of small and intermediate size, and 
152        single-element broadcasts */
153     if ((message_size < /*small_message_size*/intermediate_message_size) || (count <= 1)) {
154         /* Binomial without segmentation */
155         //segsize = 0;
156         return  smpi_coll_tuned_bcast_binomial_tree (buff, count, datatype, 
157                                                       root, comm/*
158                                                       segsize*/);
159
160     } /*else if (message_size < intermediate_message_size) {
161         // SplittedBinary with 1KB segments
162         segsize = 1024;
163         return smpi_coll_tuned_bcast_split_bintree(buff, count, datatype, 
164                                                          root, comm
165                                                          segsize);
166
167     } 
168      Handle large message sizes 
169     else if (communicator_size < (a_p128 * message_size + b_p128)) {
170          Pipeline with 128KB segments 
171         segsize = 1024  << 7;
172         return smpi_coll_tuned_bcast_flattree_pipeline (buff, count, datatype, 
173                                                      root, comm, module,
174                                                      segsize);
175
176     } else if (communicator_size < 13) {
177         // Split Binary with 8KB segments 
178         segsize = 1024 << 3;
179         return smpi_coll_tuned_bcast_intra_split_bintree(buff, count, datatype, 
180                                                          root, comm, module,
181                                                          segsize);
182        
183     } else if (communicator_size < (a_p64 * message_size + b_p64)) {
184         // Pipeline with 64KB segments 
185         segsize = 1024 << 6;
186         return smpi_coll_tuned_bcast_intra_pipeline (buff, count, datatype, 
187                                                      root, comm, module,
188                                                      segsize);
189
190     } else if (communicator_size < (a_p16 * message_size + b_p16)) {
191          Pipeline with 16KB segments 
192         //segsize = 1024 << 4;
193         return smpi_coll_tuned_bcast_flattree_pipeline (buff, count, datatype, 
194                                                      root, comm, module,
195                                                      segsize);
196
197     }*/
198
199     /* Pipeline with 8KB segments */
200     //segsize = 1024 << 3;
201     return smpi_coll_tuned_bcast_flattree_pipeline (buff, count, datatype, 
202                                                  root, comm
203                                                  /*segsize*/);
204 #if 0
205     /* this is based on gige measurements */
206
207     if (communicator_size  < 4) {
208         return smpi_coll_tuned_bcast_intra_basic_linear (buff, count, datatype, root, comm, module);
209     }
210     if (communicator_size == 4) {
211         if (message_size < 524288) segsize = 0;
212         else segsize = 16384;
213         return smpi_coll_tuned_bcast_intra_bintree (buff, count, datatype, root, comm, module, segsize);
214     }
215     if (communicator_size <= 8 && message_size < 4096) {
216         return smpi_coll_tuned_bcast_intra_basic_linear (buff, count, datatype, root, comm, module);
217     }
218     if (communicator_size > 8 && message_size >= 32768 && message_size < 524288) {
219         segsize = 16384;
220         return  smpi_coll_tuned_bcast_intra_bintree (buff, count, datatype, root, comm, module, segsize);
221     }
222     if (message_size >= 524288) {
223         segsize = 16384;
224         return smpi_coll_tuned_bcast_intra_pipeline (buff, count, datatype, root, comm, module, segsize);
225     }
226     segsize = 0;
227     /* once tested can swap this back in */
228     /* return smpi_coll_tuned_bcast_intra_bmtree (buff, count, datatype, root, comm, segsize); */
229     return smpi_coll_tuned_bcast_intra_bintree (buff, count, datatype, root, comm, module, segsize);
230 #endif  /* 0 */
231 }
232
233 int smpi_coll_tuned_reduce_ompi( void *sendbuf, void *recvbuf,
234                                             int count, MPI_Datatype  datatype,
235                                             MPI_Op   op, int root,
236                                             MPI_Comm   comm
237                                             )
238 {
239     int communicator_size=0;
240     //int segsize = 0;
241     size_t message_size, dsize;
242     //const double a1 =  0.6016 / 1024.0; /* [1/B] */
243     //const double b1 =  1.3496;
244     //const double a2 =  0.0410 / 1024.0; /* [1/B] */
245     //const double b2 =  9.7128;
246     //const double a3 =  0.0422 / 1024.0; /* [1/B] */
247     //const double b3 =  1.1614;
248     //const double a4 =  0.0033 / 1024.0; /* [1/B] */
249     //const double b4 =  1.6761;
250
251     //const int max_requests = 0; /* no limit on # of outstanding requests */
252
253     communicator_size = smpi_comm_size(comm);
254
255     /* need data size for decision function */
256     dsize=smpi_datatype_size(datatype);
257     message_size = dsize * count;   /* needed for decision */
258
259     /**
260      * If the operation is non commutative we currently have choice of linear 
261      * or in-order binary tree algorithm.
262      */
263 /*    if( !ompi_op_is_commute(op) ) {
264         if ((communicator_size < 12) && (message_size < 2048)) {
265             return smpi_coll_tuned_reduce_intra_basic_linear (sendbuf, recvbuf, count, datatype, op, root, comm, module); 
266         } 
267         return smpi_coll_tuned_reduce_intra_in_order_binary (sendbuf, recvbuf, count, datatype, op, root, comm, module,
268                                                              0, max_requests); 
269     }*/
270
271     if ((communicator_size < 8) && (message_size < 512)){
272         /* Linear_0K */
273         return smpi_coll_tuned_reduce_flat_tree (sendbuf, recvbuf, count, datatype, op, root, comm); 
274     } else if (((communicator_size < 8) && (message_size < 20480)) ||
275                (message_size < 2048) || (count <= 1)) {
276         /* Binomial_0K */
277         //segsize = 0;
278         return smpi_coll_tuned_reduce_binomial(sendbuf, recvbuf, count, datatype, op, root, comm/*, module,
279                                                      segsize, max_requests*/);
280     } /*else if (communicator_size > (a1 * message_size + b1)) {
281         // Binomial_1K 
282         segsize = 1024;
283         return smpi_coll_tuned_reduce_intra_binomial(sendbuf, recvbuf, count, datatype, op, root, comm, module,
284                                                      segsize, max_requests);
285     } else if (communicator_size > (a2 * message_size + b2)) {
286         // Pipeline_1K 
287         segsize = 1024;
288         return smpi_coll_tuned_reduce_NTSL (sendbuf, recvbuf, count, datatype, op, root, comm, module, 
289                                                       segsize, max_requests);
290     } else if (communicator_size > (a3 * message_size + b3)) {
291         // Binary_32K 
292         segsize = 32*1024;
293         return smpi_coll_tuned_reduce_intra_binary( sendbuf, recvbuf, count, datatype, op, root,
294                                                     comm, module, segsize, max_requests);
295     }
296     if (communicator_size > (a4 * message_size + b4)) {
297         // Pipeline_32K 
298         segsize = 32*1024;
299     } else {
300         // Pipeline_64K 
301         segsize = 64*1024;
302     }*/
303     return smpi_coll_tuned_reduce_NTSL (sendbuf, recvbuf, count, datatype, op, root, comm/*, module, 
304                                                   segsize, max_requests*/);
305
306 #if 0
307     /* for small messages use linear algorithm */
308     if (message_size <= 4096) {
309         segsize = 0;
310         fanout = communicator_size - 1;
311         /* when linear implemented or taken from basic put here, right now using chain as a linear system */
312         /* it is implemented and I shouldn't be calling a chain with a fanout bigger than MAXTREEFANOUT from topo.h! */
313         return smpi_coll_tuned_reduce_intra_basic_linear (sendbuf, recvbuf, count, datatype, op, root, comm, module); 
314         /*        return smpi_coll_tuned_reduce_intra_chain (sendbuf, recvbuf, count, datatype, op, root, comm, segsize, fanout); */
315     }
316     if (message_size < 524288) {
317         if (message_size <= 65536 ) {
318             segsize = 32768;
319             fanout = 8;
320         } else {
321             segsize = 1024;
322             fanout = communicator_size/2;
323         }
324         /* later swap this for a binary tree */
325         /*         fanout = 2; */
326         return smpi_coll_tuned_reduce_intra_chain (sendbuf, recvbuf, count, datatype, op, root, comm, module,
327                                                    segsize, fanout, max_requests);
328     }
329     segsize = 1024;
330     return smpi_coll_tuned_reduce_intra_pipeline (sendbuf, recvbuf, count, datatype, op, root, comm, module,
331                                                   segsize, max_requests);
332 #endif  /* 0 */
333 }
334
335 /*int smpi_coll_tuned_reduce_scatter_ompi( void *sbuf, void *rbuf,
336                                                     int *rcounts,
337                                                     MPI_Datatype dtype,
338                                                     MPI_Op  op,
339                                                     MPI_Comm  comm,
340                                                     )
341 {
342     int comm_size, i, pow2;
343     size_t total_message_size, dsize;
344     const double a = 0.0012;
345     const double b = 8.0;
346     const size_t small_message_size = 12 * 1024;
347     const size_t large_message_size = 256 * 1024;
348     bool zerocounts = false;
349
350     OPAL_OUTPUT((smpi_coll_tuned_stream, "smpi_coll_tuned_reduce_scatter_ompi"));
351
352     comm_size = smpi_comm_size(comm);
353     // We need data size for decision function 
354     ompi_datatype_type_size(dtype, &dsize);
355     total_message_size = 0;
356     for (i = 0; i < comm_size; i++) { 
357         total_message_size += rcounts[i];
358         if (0 == rcounts[i]) {
359             zerocounts = true;
360         }
361     }
362
363     if( !ompi_op_is_commute(op) || (zerocounts)) {
364         return smpi_coll_tuned_reduce_scatter_intra_nonoverlapping (sbuf, rbuf, rcounts, 
365                                                                     dtype, op, 
366                                                                     comm, module); 
367     }
368    
369     total_message_size *= dsize;
370
371     // compute the nearest power of 2 
372     for (pow2 = 1; pow2 < comm_size; pow2 <<= 1);
373
374     if ((total_message_size <= small_message_size) ||
375         ((total_message_size <= large_message_size) && (pow2 == comm_size)) ||
376         (comm_size >= a * total_message_size + b)) {
377         return 
378             smpi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(sbuf, rbuf, rcounts,
379                                                                         dtype, op,
380                                                                         comm, module);
381     } 
382     return smpi_coll_tuned_reduce_scatter_intra_ring(sbuf, rbuf, rcounts,
383                                                      dtype, op,
384                                                      comm, module);
385
386   
387     return smpi_coll_tuned_reduce_scatter(sbuf, rbuf, rcounts,
388                                                      dtype, op,
389                                                      comm;
390
391 }*/
392
393 int smpi_coll_tuned_allgather_ompi(void *sbuf, int scount, 
394                                               MPI_Datatype sdtype,
395                                               void* rbuf, int rcount, 
396                                               MPI_Datatype rdtype, 
397                                               MPI_Comm  comm
398                                               )
399 {
400     int communicator_size, pow2_size;
401     size_t dsize, total_dsize;
402
403     communicator_size = smpi_comm_size(comm);
404
405     /* Special case for 2 processes */
406     if (communicator_size == 2) {
407         return smpi_coll_tuned_allgather_pair (sbuf, scount, sdtype, 
408                                                           rbuf, rcount, rdtype, 
409                                                           comm/*, module*/);
410     }
411
412     /* Determine complete data size */
413     dsize=smpi_datatype_size(sdtype);
414     total_dsize = dsize * scount * communicator_size;   
415    
416     for (pow2_size  = 1; pow2_size < communicator_size; pow2_size <<=1); 
417
418     /* Decision based on MX 2Gb results from Grig cluster at 
419        The University of Tennesse, Knoxville 
420        - if total message size is less than 50KB use either bruck or 
421        recursive doubling for non-power of two and power of two nodes, 
422        respectively.
423        - else use ring and neighbor exchange algorithms for odd and even 
424        number of nodes, respectively.
425     */
426     if (total_dsize < 50000) {
427         if (pow2_size == communicator_size) {
428             return smpi_coll_tuned_allgather_rdb(sbuf, scount, sdtype, 
429                                                                      rbuf, rcount, rdtype,
430                                                                      comm);
431         } else {
432             return smpi_coll_tuned_allgather_bruck(sbuf, scount, sdtype, 
433                                                          rbuf, rcount, rdtype, 
434                                                          comm);
435         }
436     } else {
437         //if (communicator_size % 2) {
438             return smpi_coll_tuned_allgather_ring(sbuf, scount, sdtype, 
439                                                         rbuf, rcount, rdtype, 
440                                                         comm);
441         /*} else {
442             return  smpi_coll_tuned_allgather_intra_neighborexchange(sbuf, scount, sdtype,
443                                                                      rbuf, rcount, rdtype,
444                                                                      comm, module);
445         }*/
446     }
447    
448 #if defined(USE_MPICH2_DECISION)
449     /* Decision as in MPICH-2 
450        presented in Thakur et.al. "Optimization of Collective Communication 
451        Operations in MPICH", International Journal of High Performance Computing 
452        Applications, Vol. 19, No. 1, 49-66 (2005)
453        - for power-of-two processes and small and medium size messages 
454        (up to 512KB) use recursive doubling
455        - for non-power-of-two processes and small messages (80KB) use bruck,
456        - for everything else use ring.
457     */
458     if ((pow2_size == communicator_size) && (total_dsize < 524288)) {
459         return smpi_coll_tuned_allgather_intra_recursivedoubling(sbuf, scount, sdtype, 
460                                                                  rbuf, rcount, rdtype, 
461                                                                  comm, module);
462     } else if (total_dsize <= 81920) { 
463         return smpi_coll_tuned_allgather_intra_bruck(sbuf, scount, sdtype, 
464                                                      rbuf, rcount, rdtype,
465                                                      comm, module);
466     } 
467     return smpi_coll_tuned_allgather_intra_ring(sbuf, scount, sdtype, 
468                                                 rbuf, rcount, rdtype,
469                                                 comm, module);
470 #endif  /* defined(USE_MPICH2_DECISION) */
471 }
472
473 int smpi_coll_tuned_allgatherv_ompi(void *sbuf, int scount, 
474                                                MPI_Datatype sdtype,
475                                                void* rbuf, int *rcounts, 
476                                                int *rdispls,
477                                                MPI_Datatype rdtype, 
478                                                MPI_Comm  comm
479                                                )
480 {
481     int i;
482     int communicator_size;
483     size_t dsize, total_dsize;
484     
485     communicator_size = smpi_comm_size(comm);
486     
487     /* Special case for 2 processes */
488     if (communicator_size == 2) {
489         return smpi_coll_tuned_allgatherv_pair(sbuf, scount, sdtype,
490                                                            rbuf, rcounts, rdispls, rdtype, 
491                                                            comm);
492     }
493     
494     /* Determine complete data size */
495     dsize=smpi_datatype_size(sdtype);
496     total_dsize = 0;
497     for (i = 0; i < communicator_size; i++) {
498         total_dsize += dsize * rcounts[i];
499     }
500     
501     /* Decision based on allgather decision.   */
502     if (total_dsize < 50000) {
503 /*        return smpi_coll_tuned_allgatherv_intra_bruck(sbuf, scount, sdtype, 
504                                                       rbuf, rcounts, rdispls, rdtype, 
505                                                       comm, module);*/
506     return smpi_coll_tuned_allgatherv_ring(sbuf, scount, sdtype, 
507                                                       rbuf, rcounts, rdispls, rdtype, 
508                                                       comm);
509
510     } else {
511 //        if (communicator_size % 2) {
512             return smpi_coll_tuned_allgatherv_ring(sbuf, scount, sdtype, 
513                                                          rbuf, rcounts, rdispls, rdtype, 
514                                                          comm);
515 /*        } else {
516             return  smpi_coll_tuned_allgatherv_intra_neighborexchange(sbuf, scount, sdtype,
517                                                                       rbuf, rcounts, rdispls, rdtype, 
518                                                                       comm, module);
519         }*/
520     }
521 }
522 /*
523 int smpi_coll_tuned_gather_ompi(void *sbuf, int scount, 
524                                            MPI_Datatype sdtype,
525                                            void* rbuf, int rcount, 
526                                            MPI_Datatype rdtype, 
527                                            int root,
528                                            MPI_Comm  comm,
529                                            )
530 {
531     const int large_segment_size = 32768;
532     const int small_segment_size = 1024;
533
534     const size_t large_block_size = 92160;
535     const size_t intermediate_block_size = 6000;
536     const size_t small_block_size = 1024;
537
538     const int large_communicator_size = 60;
539     const int small_communicator_size = 10;
540
541     int communicator_size, rank;
542     size_t dsize, block_size;
543
544     OPAL_OUTPUT((smpi_coll_tuned_stream, 
545                  "smpi_coll_tuned_gather_ompi"));
546
547     communicator_size = smpi_comm_size(comm);
548     rank = ompi_comm_rank(comm);
549
550     // Determine block size 
551     if (rank == root) {
552         ompi_datatype_type_size(rdtype, &dsize);
553         block_size = dsize * rcount;
554     } else {
555         ompi_datatype_type_size(sdtype, &dsize);
556         block_size = dsize * scount;
557     }
558
559     if (block_size > large_block_size) {
560         return smpi_coll_tuned_gather_intra_linear_sync (sbuf, scount, sdtype, 
561                                                          rbuf, rcount, rdtype, 
562                                                          root, comm, module,
563                                                          large_segment_size);
564
565     } else if (block_size > intermediate_block_size) {
566         return smpi_coll_tuned_gather_intra_linear_sync (sbuf, scount, sdtype, 
567                                                          rbuf, rcount, rdtype, 
568                                                          root, comm, module,
569                                                          small_segment_size);
570
571     } else if ((communicator_size > large_communicator_size) ||
572                ((communicator_size > small_communicator_size) &&
573                 (block_size < small_block_size))) {
574         return smpi_coll_tuned_gather_intra_binomial (sbuf, scount, sdtype, 
575                                                       rbuf, rcount, rdtype, 
576                                                       root, comm, module);
577
578     }
579     // Otherwise, use basic linear 
580     return smpi_coll_tuned_gather_intra_basic_linear (sbuf, scount, sdtype, 
581                                                       rbuf, rcount, rdtype, 
582                                                       root, comm, module);
583 }*/
584 /*
585 int smpi_coll_tuned_scatter_ompi(void *sbuf, int scount, 
586                                             MPI_Datatype sdtype,
587                                             void* rbuf, int rcount, 
588                                             MPI_Datatype rdtype, 
589                                             int root, MPI_Comm  comm,
590                                             )
591 {
592     const size_t small_block_size = 300;
593     const int small_comm_size = 10;
594     int communicator_size, rank;
595     size_t dsize, block_size;
596
597     OPAL_OUTPUT((smpi_coll_tuned_stream, 
598                  "smpi_coll_tuned_scatter_ompi"));
599
600     communicator_size = smpi_comm_size(comm);
601     rank = ompi_comm_rank(comm);
602     // Determine block size 
603     if (root == rank) {
604         ompi_datatype_type_size(sdtype, &dsize);
605         block_size = dsize * scount;
606     } else {
607         ompi_datatype_type_size(rdtype, &dsize);
608         block_size = dsize * rcount;
609     } 
610
611     if ((communicator_size > small_comm_size) &&
612         (block_size < small_block_size)) {
613         return smpi_coll_tuned_scatter_intra_binomial (sbuf, scount, sdtype, 
614                                                        rbuf, rcount, rdtype, 
615                                                        root, comm, module);
616     }
617     return smpi_coll_tuned_scatter_intra_basic_linear (sbuf, scount, sdtype, 
618                                                        rbuf, rcount, rdtype, 
619                                                        root, comm, module);
620 }*/
621