Logo AND Algorithmique Numérique Distribuée

Public GIT Repository
819242fb32023fd3ec940d25e5a8ee6fae8e4db4
[simgrid.git] / src / smpi / colls / smpi_openmpi_selector.c
1 /* selector for collective algorithms based on openmpi's default coll_tuned_decision_fixed selector */
2
3 /* Copyright (c) 2009, 2010. The SimGrid Team.
4  * All rights reserved.                                                     */
5
6 /* This program is free software; you can redistribute it and/or modify it
7  * under the terms of the license (GNU LGPL) which comes with this package. */
8
9 #include "colls_private.h"
10
11
12 int smpi_coll_tuned_allreduce_ompi(void *sbuf, void *rbuf, int count,
13                         MPI_Datatype dtype, MPI_Op op, MPI_Comm comm)
14 {
15     size_t dsize, block_dsize;
16     int comm_size = smpi_comm_size(comm);
17     const size_t intermediate_message = 10000;
18
19     /**
20      * Decision function based on MX results from the Grig cluster at UTK.
21      * 
22      * Currently, linear, recursive doubling, and nonoverlapping algorithms 
23      * can handle both commutative and non-commutative operations.
24      * Ring algorithm does not support non-commutative operations.
25      */
26     dsize = smpi_datatype_size(dtype);
27     block_dsize = dsize * count;
28
29     if (block_dsize < intermediate_message) {
30         return (smpi_coll_tuned_allreduce_rdb (sbuf, rbuf, 
31                                                                    count, dtype,
32                                                                    op, comm));
33     } 
34
35     if( smpi_op_is_commute(op) && (count > comm_size) ) {
36         const size_t segment_size = 1 << 20; /* 1 MB */
37         if ((comm_size * segment_size >= block_dsize)) {
38             //FIXME: ok, these are not the right algorithms, try to find closer ones
39             // lr is a good match for allreduce_ring (difference is mainly the use of sendrecv)
40             return smpi_coll_tuned_allreduce_lr(sbuf, rbuf, count, dtype,
41                                               op, comm);
42         } else {
43            return (smpi_coll_tuned_allreduce_ompi_ring_segmented (sbuf, rbuf,
44                                                                     count, dtype, 
45                                                                     op, comm 
46                                                                     /*segment_size*/));
47         }
48     }
49
50     return (smpi_coll_tuned_allreduce_redbcast(sbuf, rbuf, count, 
51                                                             dtype, op, comm));
52 }
53
54
55
56 int smpi_coll_tuned_alltoall_ompi( void *sbuf, int scount, 
57                                              MPI_Datatype sdtype,
58                                              void* rbuf, int rcount, 
59                                              MPI_Datatype rdtype, 
60                                              MPI_Comm comm)
61 {
62     int communicator_size;
63     size_t dsize, block_dsize;
64     communicator_size = smpi_comm_size(comm);
65
66     /* Decision function based on measurement on Grig cluster at 
67        the University of Tennessee (2GB MX) up to 64 nodes.
68        Has better performance for messages of intermediate sizes than the old one */
69     /* determine block size */
70     dsize = smpi_datatype_size(sdtype);
71     block_dsize = dsize * scount;
72
73     if ((block_dsize < 200) && (communicator_size > 12)) {
74         return smpi_coll_tuned_alltoall_bruck(sbuf, scount, sdtype, 
75                                                     rbuf, rcount, rdtype,
76                                                     comm);
77
78     } else if (block_dsize < 3000) {
79         return smpi_coll_tuned_alltoall_simple(sbuf, scount, sdtype, 
80                                                            rbuf, rcount, rdtype, 
81                                                            comm);
82     }
83
84     return smpi_coll_tuned_alltoall_pair (sbuf, scount, sdtype, 
85                                                     rbuf, rcount, rdtype,
86                                                     comm);
87 }
88
89 int smpi_coll_tuned_alltoallv_ompi(void *sbuf, int *scounts, int *sdisps,
90                                               MPI_Datatype sdtype,
91                                               void *rbuf, int *rcounts, int *rdisps,
92                                               MPI_Datatype rdtype,
93                                               MPI_Comm  comm
94                                               )
95 {
96     /* For starters, just keep the original algorithm. */
97     return smpi_coll_tuned_alltoallv_bruck(sbuf, scounts, sdisps, sdtype, 
98                                                         rbuf, rcounts, rdisps,rdtype,
99                                                         comm);
100 }
101
102
103 int smpi_coll_tuned_barrier_ompi(MPI_Comm  comm)
104 {    int communicator_size = smpi_comm_size(comm);
105
106     if( 2 == communicator_size )
107         return smpi_coll_tuned_barrier_ompi_two_procs(comm);
108 /*     * Basic optimisation. If we have a power of 2 number of nodes*/
109 /*     * the use the recursive doubling algorithm, otherwise*/
110 /*     * bruck is the one we want.*/
111     {
112         int has_one = 0;
113         for( ; communicator_size > 0; communicator_size >>= 1 ) {
114             if( communicator_size & 0x1 ) {
115                 if( has_one )
116                     return smpi_coll_tuned_barrier_ompi_bruck(comm);
117                 has_one = 1;
118             }
119         }
120     }
121     return smpi_coll_tuned_barrier_ompi_recursivedoubling(comm);
122 }
123
124 int smpi_coll_tuned_bcast_ompi(void *buff, int count,
125                                           MPI_Datatype datatype, int root,
126                                           MPI_Comm  comm
127                                           )
128 {
129     /* Decision function based on MX results for 
130        messages up to 36MB and communicator sizes up to 64 nodes */
131     const size_t small_message_size = 2048;
132     const size_t intermediate_message_size = 370728;
133     const double a_p16  = 3.2118e-6; /* [1 / byte] */
134     const double b_p16  = 8.7936;   
135     const double a_p64  = 2.3679e-6; /* [1 / byte] */
136     const double b_p64  = 1.1787;     
137     const double a_p128 = 1.6134e-6; /* [1 / byte] */
138     const double b_p128 = 2.1102;
139
140     int communicator_size;
141     //int segsize = 0;
142     size_t message_size, dsize;
143
144     communicator_size = smpi_comm_size(comm);
145
146     /* else we need data size for decision function */
147     dsize = smpi_datatype_size(datatype);
148     message_size = dsize * (unsigned long)count;   /* needed for decision */
149
150     /* Handle messages of small and intermediate size, and 
151        single-element broadcasts */
152     if ((message_size < small_message_size) || (count <= 1)) {
153         /* Binomial without segmentation */
154         return  smpi_coll_tuned_bcast_binomial_tree (buff, count, datatype, 
155                                                       root, comm);
156
157     } else if (message_size < intermediate_message_size) {
158         // SplittedBinary with 1KB segments
159         return smpi_coll_tuned_bcast_ompi_split_bintree(buff, count, datatype, 
160                                                          root, comm);
161
162     }
163      //Handle large message sizes 
164     else if (communicator_size < (a_p128 * message_size + b_p128)) {
165         //Pipeline with 128KB segments 
166         //segsize = 1024  << 7;
167         return smpi_coll_tuned_bcast_ompi_pipeline (buff, count, datatype, 
168                                                      root, comm);
169                                                      
170
171     } else if (communicator_size < 13) {
172         // Split Binary with 8KB segments 
173         return smpi_coll_tuned_bcast_ompi_split_bintree(buff, count, datatype, 
174                                                          root, comm);
175        
176     } else if (communicator_size < (a_p64 * message_size + b_p64)) {
177         // Pipeline with 64KB segments 
178         //segsize = 1024 << 6;
179         return smpi_coll_tuned_bcast_ompi_pipeline (buff, count, datatype, 
180                                                      root, comm);
181                                                      
182
183     } else if (communicator_size < (a_p16 * message_size + b_p16)) {
184         //Pipeline with 16KB segments 
185         //segsize = 1024 << 4;
186         return smpi_coll_tuned_bcast_ompi_pipeline (buff, count, datatype, 
187                                                      root, comm);
188                                                      
189
190     }
191     /* Pipeline with 8KB segments */
192     //segsize = 1024 << 3;
193     return smpi_coll_tuned_bcast_flattree_pipeline (buff, count, datatype, 
194                                                  root, comm
195                                                  /*segsize*/);
196 #if 0
197     /* this is based on gige measurements */
198
199     if (communicator_size  < 4) {
200         return smpi_coll_tuned_bcast_intra_basic_linear (buff, count, datatype, root, comm, module);
201     }
202     if (communicator_size == 4) {
203         if (message_size < 524288) segsize = 0;
204         else segsize = 16384;
205         return smpi_coll_tuned_bcast_intra_bintree (buff, count, datatype, root, comm, module, segsize);
206     }
207     if (communicator_size <= 8 && message_size < 4096) {
208         return smpi_coll_tuned_bcast_intra_basic_linear (buff, count, datatype, root, comm, module);
209     }
210     if (communicator_size > 8 && message_size >= 32768 && message_size < 524288) {
211         segsize = 16384;
212         return  smpi_coll_tuned_bcast_intra_bintree (buff, count, datatype, root, comm, module, segsize);
213     }
214     if (message_size >= 524288) {
215         segsize = 16384;
216         return smpi_coll_tuned_bcast_intra_pipeline (buff, count, datatype, root, comm, module, segsize);
217     }
218     segsize = 0;
219     /* once tested can swap this back in */
220     /* return smpi_coll_tuned_bcast_intra_bmtree (buff, count, datatype, root, comm, segsize); */
221     return smpi_coll_tuned_bcast_intra_bintree (buff, count, datatype, root, comm, module, segsize);
222 #endif  /* 0 */
223 }
224
225 int smpi_coll_tuned_reduce_ompi( void *sendbuf, void *recvbuf,
226                                             int count, MPI_Datatype  datatype,
227                                             MPI_Op   op, int root,
228                                             MPI_Comm   comm
229                                             )
230 {
231     int communicator_size=0;
232     //int segsize = 0;
233     size_t message_size, dsize;
234     const double a1 =  0.6016 / 1024.0; /* [1/B] */
235     const double b1 =  1.3496;
236     const double a2 =  0.0410 / 1024.0; /* [1/B] */
237     const double b2 =  9.7128;
238     const double a3 =  0.0422 / 1024.0; /* [1/B] */
239     const double b3 =  1.1614;
240     //const double a4 =  0.0033 / 1024.0; /* [1/B] */
241     //const double b4 =  1.6761;
242
243     //const int max_requests = 0; /* no limit on # of outstanding requests */
244
245     communicator_size = smpi_comm_size(comm);
246
247     /* need data size for decision function */
248     dsize=smpi_datatype_size(datatype);
249     message_size = dsize * count;   /* needed for decision */
250
251     /**
252      * If the operation is non commutative we currently have choice of linear 
253      * or in-order binary tree algorithm.
254      */
255     if( !smpi_op_is_commute(op) ) {
256         if ((communicator_size < 12) && (message_size < 2048)) {
257             return smpi_coll_tuned_reduce_ompi_basic_linear (sendbuf, recvbuf, count, datatype, op, root, comm/*, module*/); 
258         } 
259         return smpi_coll_tuned_reduce_ompi_in_order_binary (sendbuf, recvbuf, count, datatype, op, root, comm/*, module,
260                                                              0, max_requests*/); 
261     }
262
263     if ((communicator_size < 8) && (message_size < 512)){
264         /* Linear_0K */
265         return smpi_coll_tuned_reduce_ompi_basic_linear (sendbuf, recvbuf, count, datatype, op, root, comm); 
266     } else if (((communicator_size < 8) && (message_size < 20480)) ||
267                (message_size < 2048) || (count <= 1)) {
268         /* Binomial_0K */
269         //segsize = 0;
270         return smpi_coll_tuned_reduce_ompi_binomial(sendbuf, recvbuf, count, datatype, op, root, comm/*, module,
271                                                      segsize, max_requests*/);
272     } else if (communicator_size > (a1 * message_size + b1)) {
273         // Binomial_1K 
274         //segsize = 1024;
275         return smpi_coll_tuned_reduce_ompi_binomial(sendbuf, recvbuf, count, datatype, op, root, comm/*, module,
276                                                      segsize, max_requests*/);
277     } else if (communicator_size > (a2 * message_size + b2)) {
278         // Pipeline_1K 
279         //segsize = 1024;
280         return smpi_coll_tuned_reduce_ompi_pipeline (sendbuf, recvbuf, count, datatype, op, root, comm/*, module, 
281                                                       segsize, max_requests*/);
282     } else if (communicator_size > (a3 * message_size + b3)) {
283         // Binary_32K 
284         //segsize = 32*1024;
285         return smpi_coll_tuned_reduce_ompi_binary( sendbuf, recvbuf, count, datatype, op, root,
286                                                     comm/*, module, segsize, max_requests*/);
287     }
288     /*if (communicator_size > (a4 * message_size + b4)) {
289         // Pipeline_32K 
290         segsize = 32*1024;
291     } else {
292         // Pipeline_64K 
293         segsize = 64*1024;
294     }*/
295     return smpi_coll_tuned_reduce_ompi_pipeline (sendbuf, recvbuf, count, datatype, op, root, comm/*, module, 
296                                                   segsize, max_requests*/);
297
298 #if 0
299     /* for small messages use linear algorithm */
300     if (message_size <= 4096) {
301         segsize = 0;
302         fanout = communicator_size - 1;
303         /* when linear implemented or taken from basic put here, right now using chain as a linear system */
304         /* it is implemented and I shouldn't be calling a chain with a fanout bigger than MAXTREEFANOUT from topo.h! */
305         return smpi_coll_tuned_reduce_intra_basic_linear (sendbuf, recvbuf, count, datatype, op, root, comm, module); 
306         /*        return smpi_coll_tuned_reduce_intra_chain (sendbuf, recvbuf, count, datatype, op, root, comm, segsize, fanout); */
307     }
308     if (message_size < 524288) {
309         if (message_size <= 65536 ) {
310             segsize = 32768;
311             fanout = 8;
312         } else {
313             segsize = 1024;
314             fanout = communicator_size/2;
315         }
316         /* later swap this for a binary tree */
317         /*         fanout = 2; */
318         return smpi_coll_tuned_reduce_intra_chain (sendbuf, recvbuf, count, datatype, op, root, comm, module,
319                                                    segsize, fanout, max_requests);
320     }
321     segsize = 1024;
322     return smpi_coll_tuned_reduce_intra_pipeline (sendbuf, recvbuf, count, datatype, op, root, comm, module,
323                                                   segsize, max_requests);
324 #endif  /* 0 */
325 }
326
327 int smpi_coll_tuned_reduce_scatter_ompi( void *sbuf, void *rbuf,
328                                                     int *rcounts,
329                                                     MPI_Datatype dtype,
330                                                     MPI_Op  op,
331                                                     MPI_Comm  comm
332                                                     )
333 {
334     int comm_size, i, pow2;
335     size_t total_message_size, dsize;
336     const double a = 0.0012;
337     const double b = 8.0;
338     const size_t small_message_size = 12 * 1024;
339     const size_t large_message_size = 256 * 1024;
340     int zerocounts = 0;
341
342     XBT_DEBUG("smpi_coll_tuned_reduce_scatter_ompi");
343     
344     comm_size = smpi_comm_size(comm);
345     // We need data size for decision function 
346     dsize=smpi_datatype_size(dtype);
347     total_message_size = 0;
348     for (i = 0; i < comm_size; i++) { 
349         total_message_size += rcounts[i];
350         if (0 == rcounts[i]) {
351             zerocounts = 1;
352         }
353     }
354
355     if( !smpi_op_is_commute(op) || (zerocounts)) {
356         smpi_mpi_reduce_scatter (sbuf, rbuf, rcounts, 
357                                                                     dtype, op, 
358                                                                     comm); 
359         return MPI_SUCCESS;
360     }
361    
362     total_message_size *= dsize;
363
364     // compute the nearest power of 2 
365     for (pow2 = 1; pow2 < comm_size; pow2 <<= 1);
366
367     if ((total_message_size <= small_message_size) ||
368         ((total_message_size <= large_message_size) && (pow2 == comm_size)) ||
369         (comm_size >= a * total_message_size + b)) {
370         return 
371             smpi_coll_tuned_reduce_scatter_ompi_basic_recursivehalving(sbuf, rbuf, rcounts,
372                                                                         dtype, op,
373                                                                         comm);
374     } 
375     return smpi_coll_tuned_reduce_scatter_ompi_ring(sbuf, rbuf, rcounts,
376                                                      dtype, op,
377                                                      comm);
378
379
380
381 }
382
383 int smpi_coll_tuned_allgather_ompi(void *sbuf, int scount, 
384                                               MPI_Datatype sdtype,
385                                               void* rbuf, int rcount, 
386                                               MPI_Datatype rdtype, 
387                                               MPI_Comm  comm
388                                               )
389 {
390     int communicator_size, pow2_size;
391     size_t dsize, total_dsize;
392
393     communicator_size = smpi_comm_size(comm);
394
395     /* Special case for 2 processes */
396     if (communicator_size == 2) {
397         return smpi_coll_tuned_allgather_pair (sbuf, scount, sdtype, 
398                                                           rbuf, rcount, rdtype, 
399                                                           comm/*, module*/);
400     }
401
402     /* Determine complete data size */
403     dsize=smpi_datatype_size(sdtype);
404     total_dsize = dsize * scount * communicator_size;   
405    
406     for (pow2_size  = 1; pow2_size < communicator_size; pow2_size <<=1); 
407
408     /* Decision based on MX 2Gb results from Grig cluster at 
409        The University of Tennesse, Knoxville 
410        - if total message size is less than 50KB use either bruck or 
411        recursive doubling for non-power of two and power of two nodes, 
412        respectively.
413        - else use ring and neighbor exchange algorithms for odd and even 
414        number of nodes, respectively.
415     */
416     if (total_dsize < 50000) {
417         if (pow2_size == communicator_size) {
418             return smpi_coll_tuned_allgather_rdb(sbuf, scount, sdtype, 
419                                                                      rbuf, rcount, rdtype,
420                                                                      comm);
421         } else {
422             return smpi_coll_tuned_allgather_bruck(sbuf, scount, sdtype, 
423                                                          rbuf, rcount, rdtype, 
424                                                          comm);
425         }
426     } else {
427         if (communicator_size % 2) {
428             return smpi_coll_tuned_allgather_ring(sbuf, scount, sdtype, 
429                                                         rbuf, rcount, rdtype, 
430                                                         comm);
431         } else {
432             return  smpi_coll_tuned_allgather_ompi_neighborexchange(sbuf, scount, sdtype,
433                                                                      rbuf, rcount, rdtype,
434                                                                      comm);
435         }
436     }
437    
438 #if defined(USE_MPICH2_DECISION)
439     /* Decision as in MPICH-2 
440        presented in Thakur et.al. "Optimization of Collective Communication 
441        Operations in MPICH", International Journal of High Performance Computing 
442        Applications, Vol. 19, No. 1, 49-66 (2005)
443        - for power-of-two processes and small and medium size messages 
444        (up to 512KB) use recursive doubling
445        - for non-power-of-two processes and small messages (80KB) use bruck,
446        - for everything else use ring.
447     */
448     if ((pow2_size == communicator_size) && (total_dsize < 524288)) {
449         return smpi_coll_tuned_allgather_rdb(sbuf, scount, sdtype, 
450                                                                  rbuf, rcount, rdtype, 
451                                                                  comm);
452     } else if (total_dsize <= 81920) { 
453         return smpi_coll_tuned_allgather_bruck(sbuf, scount, sdtype, 
454                                                      rbuf, rcount, rdtype,
455                                                      comm);
456     } 
457     return smpi_coll_tuned_allgather_ring(sbuf, scount, sdtype, 
458                                                 rbuf, rcount, rdtype,
459                                                 comm);
460 #endif  /* defined(USE_MPICH2_DECISION) */
461 }
462
463 int smpi_coll_tuned_allgatherv_ompi(void *sbuf, int scount, 
464                                                MPI_Datatype sdtype,
465                                                void* rbuf, int *rcounts, 
466                                                int *rdispls,
467                                                MPI_Datatype rdtype, 
468                                                MPI_Comm  comm
469                                                )
470 {
471     int i;
472     int communicator_size;
473     size_t dsize, total_dsize;
474     
475     communicator_size = smpi_comm_size(comm);
476     
477     /* Special case for 2 processes */
478     if (communicator_size == 2) {
479         return smpi_coll_tuned_allgatherv_pair(sbuf, scount, sdtype,
480                                                            rbuf, rcounts, rdispls, rdtype, 
481                                                            comm);
482     }
483     
484     /* Determine complete data size */
485     dsize=smpi_datatype_size(sdtype);
486     total_dsize = 0;
487     for (i = 0; i < communicator_size; i++) {
488         total_dsize += dsize * rcounts[i];
489     }
490     
491     /* Decision based on allgather decision.   */
492     if (total_dsize < 50000) {
493 /*        return smpi_coll_tuned_allgatherv_intra_bruck(sbuf, scount, sdtype, 
494                                                       rbuf, rcounts, rdispls, rdtype, 
495                                                       comm, module);*/
496     return smpi_coll_tuned_allgatherv_ring(sbuf, scount, sdtype, 
497                                                       rbuf, rcounts, rdispls, rdtype, 
498                                                       comm);
499
500     } else {
501         if (communicator_size % 2) {
502             return smpi_coll_tuned_allgatherv_ring(sbuf, scount, sdtype, 
503                                                          rbuf, rcounts, rdispls, rdtype, 
504                                                          comm);
505         } else {
506             return  smpi_coll_tuned_allgatherv_ompi_neighborexchange(sbuf, scount, sdtype,
507                                                                       rbuf, rcounts, rdispls, rdtype, 
508                                                                       comm);
509         }
510     }
511 }
512
513 int smpi_coll_tuned_gather_ompi(void *sbuf, int scount, 
514                                            MPI_Datatype sdtype,
515                                            void* rbuf, int rcount, 
516                                            MPI_Datatype rdtype, 
517                                            int root,
518                                            MPI_Comm  comm
519                                            )
520 {
521     //const int large_segment_size = 32768;
522     //const int small_segment_size = 1024;
523
524     //const size_t large_block_size = 92160;
525     const size_t intermediate_block_size = 6000;
526     const size_t small_block_size = 1024;
527
528     const int large_communicator_size = 60;
529     const int small_communicator_size = 10;
530
531     int communicator_size, rank;
532     size_t dsize, block_size;
533
534     XBT_DEBUG("smpi_coll_tuned_gather_ompi");
535
536     communicator_size = smpi_comm_size(comm);
537     rank = smpi_comm_rank(comm);
538
539     // Determine block size 
540     if (rank == root) {
541         dsize = smpi_datatype_size(rdtype);
542         block_size = dsize * rcount;
543     } else {
544         dsize = smpi_datatype_size(sdtype);
545         block_size = dsize * scount;
546     }
547
548 /*    if (block_size > large_block_size) {*/
549 /*        return smpi_coll_tuned_gather_ompi_linear_sync (sbuf, scount, sdtype, */
550 /*                                                         rbuf, rcount, rdtype, */
551 /*                                                         root, comm);*/
552
553 /*    } else*/ if (block_size > intermediate_block_size) {
554         return smpi_coll_tuned_gather_ompi_linear_sync (sbuf, scount, sdtype, 
555                                                          rbuf, rcount, rdtype, 
556                                                          root, comm);
557
558     } else if ((communicator_size > large_communicator_size) ||
559                ((communicator_size > small_communicator_size) &&
560                 (block_size < small_block_size))) {
561         return smpi_coll_tuned_gather_ompi_binomial (sbuf, scount, sdtype, 
562                                                       rbuf, rcount, rdtype, 
563                                                       root, comm);
564
565     }
566     // Otherwise, use basic linear 
567     return smpi_coll_tuned_gather_ompi_basic_linear (sbuf, scount, sdtype, 
568                                                       rbuf, rcount, rdtype, 
569                                                       root, comm);
570 }
571
572 int smpi_coll_tuned_scatter_ompi(void *sbuf, int scount, 
573                                             MPI_Datatype sdtype,
574                                             void* rbuf, int rcount, 
575                                             MPI_Datatype rdtype, 
576                                             int root, MPI_Comm  comm
577                                             )
578 {
579     const size_t small_block_size = 300;
580     const int small_comm_size = 10;
581     int communicator_size, rank;
582     size_t dsize, block_size;
583
584     XBT_DEBUG("smpi_coll_tuned_scatter_ompi");
585
586     communicator_size = smpi_comm_size(comm);
587     rank = smpi_comm_rank(comm);
588     // Determine block size 
589     if (root == rank) {
590         dsize=smpi_datatype_size(sdtype);
591         block_size = dsize * scount;
592     } else {
593         dsize=smpi_datatype_size(rdtype);
594         block_size = dsize * rcount;
595     } 
596
597     if ((communicator_size > small_comm_size) &&
598         (block_size < small_block_size)) {
599         return smpi_coll_tuned_scatter_ompi_binomial (sbuf, scount, sdtype, 
600                                                        rbuf, rcount, rdtype, 
601                                                        root, comm);
602     }
603     return smpi_coll_tuned_scatter_ompi_basic_linear (sbuf, scount, sdtype, 
604                                                        rbuf, rcount, rdtype, 
605                                                        root, comm);
606 }
607