Add Intel MPI (impi) selector.
[simgrid.git] / src / smpi / colls / smpi_intel_mpi_selector.c
/* selector for collective algorithms, mimicking the default decisions of Intel MPI */

/* Copyright (c) 2009-2010, 2013-2014. The SimGrid Team.
 * All rights reserved.                                                     */

/* This program is free software; you can redistribute it and/or modify it
 * under the terms of the license (GNU LGPL) which comes with this package. */

#include "colls_private.h"


/* This selector is based on information gathered on the Stampede cluster, with Intel MPI
 * 4.1.3.049, and from the Intel reference manual. The data was gathered launching one
 * process per node. With other settings, the selection would differ (it would use more
 * SMP-aware algorithms, for instance). */


#define INTEL_MAX_NB_THRESHOLDS  32
typedef struct {
  int max_size;
  int algo;
} intel_tuning_table_element_element;

typedef struct {
  int max_num_proc;
  int num_elems;
  intel_tuning_table_element_element elems[INTEL_MAX_NB_THRESHOLDS];
} intel_tuning_table_element;

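/* Each collective below gets an array of intel_tuning_table_element. The selection macro at
 * the end of this file picks the first entry whose max_num_proc is strictly greater than the
 * communicator size, then, inside it, the first elems[] threshold whose max_size is strictly
 * greater than the computed message size; the algo field is a 1-based index into the matching
 * *_functions_table. Every table ends with a 2147483647 (INT_MAX) sentinel. */
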
/*
I_MPI_ADJUST_ALLREDUCE

MPI_Allreduce

1 - Recursive doubling algorithm
2 - Rabenseifner's algorithm
3 - Reduce + Bcast algorithm
4 - Topology aware Reduce + Bcast algorithm
5 - Binomial gather + scatter algorithm
6 - Topology aware binomial gather + scatter algorithm
7 - Shumilin's ring algorithm
8 - Ring algorithm

As Shumilin's ring algorithm is unknown, default to the ring algorithm.
*/

int (*intel_allreduce_functions_table[])(void *sendbuf,
      void *recvbuf,
      int count,
      MPI_Datatype datatype,
      MPI_Op op, MPI_Comm comm) = {
      smpi_coll_tuned_allreduce_rdb,
      smpi_coll_tuned_allreduce_rab1,
      smpi_coll_tuned_allreduce_redbcast,
      smpi_coll_tuned_allreduce_redbcast,
      smpi_coll_tuned_allreduce_smp_binomial,
      smpi_coll_tuned_allreduce_smp_binomial,
      smpi_coll_tuned_allreduce_ompi_ring_segmented,
      smpi_coll_tuned_allreduce_ompi_ring_segmented
};

intel_tuning_table_element intel_allreduce_table[] =
{
  { 2,9,{
    {6,7},
    {85,1},
    {192,7},
    {853,1},
    {1279,7},
    {16684,1},
    {34279,8},
    {1681224,3},
    {2147483647,7}
    }
  },
  { 4,8,{
    {16,7},
    {47,1},
    {2062,7},
    {16699,1},
    {33627,7},
    {70732,8},
    {1300705,3},
    {2147483647,8}
    }
  },
  { 8,8,{
    {118,1},
    {146,4},
    {16760,1},
    {36364,6},
    {136239,8},
    {315710,7},
    {3220366,3},
    {2147483647,8}
    }
  },
  { 16,7,{
    {934,1},
    {1160,6},
    {15505,1},
    {52730,2},
    {300705,8},
    {563680,7},
    {2147483647,3}
    }
  },
  { 2147483647,11,{
    {5,6},
    {11,4},
    {182,1},
    {700,6},
    {1450,4},
    {11146,1},
    {25539,6},
    {37634,4},
    {93784,6},
    {817658,2},
    {2147483647,3}
    }
  }
};
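
/* For instance, an MPI_Allreduce on 6 processes with a 20000-byte payload stops at the
 * { 8, ... } entry (first max_num_proc greater than 6) and at the {36364,6} threshold
 * (first max_size greater than 20000), so algorithm 6 is selected, i.e.
 * intel_allreduce_functions_table[5] = smpi_coll_tuned_allreduce_smp_binomial. */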


/* I_MPI_ADJUST_ALLTOALL

MPI_Alltoall

1. Bruck's algorithm
2. Isend/Irecv + waitall algorithm
3. Pairwise exchange algorithm
4. Plum's algorithm

*/

intel_tuning_table_element intel_alltoall_table[] =
{
    { 2,1,
        {
        {2147483647,3}
        }
    },
    { 4,2,
        {
        {0,4},
        {2147483647,2}
        }
    },
    {8,1,
        {
        {2147483647,2}
        }
    },
    {16,5,
        {
        {0,3},
        {84645,2},
        {167570,3},
        {413152,4},
        {2147483647,2}
        }
    },
    {32,6,
        {
        {61,1},
        {164,2},
        {696,1},
        {143254,2},
        {387024,3},
        {2147483647,2}
        }
    },
    {64,4,
        {
        {523,1},
        {146088,2},
        {488989,4},
        {2147483647,2}
        }
    },
    {2147483647,3,
        {
        {270,1},
        {628,4},
        {2147483647,2}
        }
    }
};

int (*intel_alltoall_functions_table[])(void *sbuf, int scount,
                                             MPI_Datatype sdtype,
                                             void* rbuf, int rcount,
                                             MPI_Datatype rdtype,
                                             MPI_Comm comm) = {
      smpi_coll_tuned_alltoall_bruck,
      smpi_coll_tuned_alltoall_mvapich2_scatter_dest,
      smpi_coll_tuned_alltoall_pair,
      smpi_coll_tuned_alltoall_pair // Plum's algorithm is proprietary(?), fall back to pairwise
};

/* I_MPI_ADJUST_BARRIER

MPI_Barrier

1. Dissemination algorithm
2. Recursive doubling algorithm
3. Topology aware dissemination algorithm
4. Topology aware recursive doubling algorithm
5. Binomial gather + scatter algorithm
6. Topology aware binomial gather + scatter algorithm

*/
static int intel_barrier_gather_scatter(MPI_Comm comm){
    // our default barrier performs an antibcast/bcast
    smpi_mpi_barrier(comm);
    return MPI_SUCCESS;
}

int (*intel_barrier_functions_table[])(MPI_Comm comm) = {
      smpi_coll_tuned_barrier_ompi_basic_linear,
      smpi_coll_tuned_barrier_ompi_recursivedoubling,
      smpi_coll_tuned_barrier_ompi_basic_linear,
      smpi_coll_tuned_barrier_ompi_recursivedoubling,
      intel_barrier_gather_scatter,
      intel_barrier_gather_scatter
};

intel_tuning_table_element intel_barrier_table[] =
{
    {2,1,
        {
        {2147483647,2}
        }
    },
    {4,1,
        {
        {2147483647,6}
        }
    },
    {8,1,
        {
        {2147483647,1}
        }
    },
    {64,1,
        {
        {2147483647,2}
        }
    },
    {2147483647,1,
        {
        {2147483647,6}
        }
    }
};


/* I_MPI_ADJUST_BCAST

MPI_Bcast

1. Binomial algorithm
2. Recursive doubling algorithm
3. Ring algorithm
4. Topology aware binomial algorithm
5. Topology aware recursive doubling algorithm
6. Topology aware ring algorithm
7. Shumilin's bcast algorithm
*/

int (*intel_bcast_functions_table[])(void *buff, int count,
                                          MPI_Datatype datatype, int root,
                                          MPI_Comm  comm) = {
      smpi_coll_tuned_bcast_binomial_tree,
      //smpi_coll_tuned_bcast_scatter_rdb_allgather,
      smpi_coll_tuned_bcast_NTSL,
      smpi_coll_tuned_bcast_NTSL,
      smpi_coll_tuned_bcast_SMP_binomial,
      //smpi_coll_tuned_bcast_scatter_rdb_allgather,
      smpi_coll_tuned_bcast_NTSL,
      smpi_coll_tuned_bcast_SMP_linear,
      smpi_coll_tuned_bcast_mvapich2 // Shumilin's algorithm is unknown, use mvapich2's bcast instead
};

intel_tuning_table_element intel_bcast_table[] =
{
    {2,9,
        {
        {1,2},
        {402,7},
        {682,5},
        {1433,4},
        {5734,7},
        {21845,1},
        {95963,6},
        {409897,5},
        {2147483647,1}
        }
    },
    {4,1,
        {
        {2147483647,7}
        }
    },
    {8,11,
        {
        {3,6},
        {4,7},
        {25,6},
        {256,1},
        {682,6},
        {1264,1},
        {2234,6},
        {6655,5},
        {16336,1},
        {3998434,7},
        {2147483647,6}
        }
    },
    {2147483647,1,
        {
        {2147483647,7}
        }
    }
};


/* I_MPI_ADJUST_REDUCE

MPI_Reduce

1. Shumilin's algorithm
2. Binomial algorithm
3. Topology aware Shumilin's algorithm
4. Topology aware binomial algorithm
5. Rabenseifner's algorithm
6. Topology aware Rabenseifner's algorithm

*/

int (*intel_reduce_functions_table[])(void *sendbuf, void *recvbuf,
                                            int count, MPI_Datatype datatype,
                                            MPI_Op op, int root,
                                            MPI_Comm comm) = {
      smpi_coll_tuned_reduce_mvapich2,
      smpi_coll_tuned_reduce_binomial,
      smpi_coll_tuned_reduce_mvapich2,
      smpi_coll_tuned_reduce_binomial,
      smpi_coll_tuned_reduce_rab,
      smpi_coll_tuned_reduce_rab
};

intel_tuning_table_element intel_reduce_table[] =
{
    {2147483647,1,
        {
        {2147483647,1}
        }
    }
};

/* I_MPI_ADJUST_REDUCE_SCATTER

MPI_Reduce_scatter

1. Recursive halving algorithm
2. Pairwise exchange algorithm
3. Recursive doubling algorithm
4. Reduce + Scatterv algorithm
5. Topology aware Reduce + Scatterv algorithm

*/
static int intel_reduce_scatter_reduce_scatterv(void *sbuf, void *rbuf,
                                                    int *rcounts,
                                                    MPI_Datatype dtype,
                                                    MPI_Op op,
                                                    MPI_Comm comm)
{
  smpi_mpi_reduce_scatter(sbuf, rbuf, rcounts, dtype, op, comm);
  return MPI_SUCCESS;
}

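/* Recursive halving combines partial results out of rank order, so it is presumably only
 * safe for commutative operations; fall back to the mvapich2 implementation otherwise. */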
static int intel_reduce_scatter_recursivehalving(void *sbuf, void *rbuf,
                                                    int *rcounts,
                                                    MPI_Datatype dtype,
                                                    MPI_Op op,
                                                    MPI_Comm comm)
{
  if(smpi_op_is_commute(op))
    return smpi_coll_tuned_reduce_scatter_ompi_basic_recursivehalving(sbuf, rbuf, rcounts, dtype, op, comm);
  else
    return smpi_coll_tuned_reduce_scatter_mvapich2(sbuf, rbuf, rcounts, dtype, op, comm);
}

int (*intel_reduce_scatter_functions_table[])( void *sbuf, void *rbuf,
                                                    int *rcounts,
                                                    MPI_Datatype dtype,
                                                    MPI_Op op,
                                                    MPI_Comm comm
                                                    ) = {
      intel_reduce_scatter_recursivehalving,
      smpi_coll_tuned_reduce_scatter_mpich_pair,
      smpi_coll_tuned_reduce_scatter_mpich_rdb,
      intel_reduce_scatter_reduce_scatterv,
      intel_reduce_scatter_reduce_scatterv
};

intel_tuning_table_element intel_reduce_scatter_table[] =
{
    {2,5,
        {
        {5,4},
        {522429,2},
        {1375877,5},
        {2932736,2},
        {2147483647,5}
        }
    },
    {4,9,
        {
        {4,4},
        {15,1},
        {120,3},
        {651,1},
        {12188,3},
        {33890,1},
        {572117,2},
        {1410202,5},
        {2147483647,2}
        }
    },
    {8,7,
        {
        {4,4},
        {2263,1},
        {25007,3},
        {34861,1},
        {169625,2},
        {2734000,4},
        {2147483647,2}
        }
    },
    {16,5,
        {
        {4,4},
        {14228,1},
        {46084,3},
        {522139,2},
        {2147483647,5}
        }
    },
    {32,5,
        {
        {4,4},
        {27516,1},
        {61693,3},
        {2483469,2},
        {2147483647,5}
        }
    },
    {64,4,
        {
        {0,3},
        {4,4},
        {100396,1},
        {2147483647,2}
        }
    },
    {2147483647,6,
        {
        {0,3},
        {4,4},
        {186926,1},
        {278259,3},
        {1500100,2},
        {2147483647,5}
        }
    }
};

/* I_MPI_ADJUST_ALLGATHER

MPI_Allgather

1. Recursive doubling algorithm
2. Bruck's algorithm
3. Ring algorithm
4. Topology aware Gatherv + Bcast algorithm

*/

int (*intel_allgather_functions_table[])(void *sbuf, int scount,
                                              MPI_Datatype sdtype,
                                              void* rbuf, int rcount,
                                              MPI_Datatype rdtype,
                                              MPI_Comm comm
                                                    ) = {
      smpi_coll_tuned_allgather_rdb,
      smpi_coll_tuned_allgather_bruck,
      smpi_coll_tuned_allgather_ring,
      smpi_coll_tuned_allgather_GB
};

intel_tuning_table_element intel_allgather_table[] =
{
    {4,11,
        {
        {1,4},
        {384,1},
        {1533,4},
        {3296,1},
        {10763,4},
        {31816,3},
        {193343,4},
        {405857,3},
        {597626,4},
        {1844323,3},
        {2147483647,4}
        }
    },
    {8,10,
        {
        {12,4},
        {46,1},
        {205,4},
        {3422,2},
        {4200,4},
        {8748,1},
        {24080,3},
        {33244,4},
        {371159,1},
        {2147483647,3}
        }
    },
    {16,8,
        {
        {3,4},
        {53,1},
        {100,4},
        {170,1},
        {6077,4},
        {127644,1},
        {143741,4},
        {2147483647,3}
        }
    },
    {2147483647,10,
        {
        {184,1},
        {320,4},
        {759,1},
        {1219,4},
        {2633,1},
        {8259,4},
        {123678,1},
        {160801,4},
        {284341,1},
        {2147483647,4}
        }
    }
};

/* I_MPI_ADJUST_ALLGATHERV

MPI_Allgatherv

1. Recursive doubling algorithm
2. Bruck's algorithm
3. Ring algorithm
4. Topology aware Gatherv + Bcast algorithm

*/

int (*intel_allgatherv_functions_table[])(void *sbuf, int scount,
                                               MPI_Datatype sdtype,
                                               void* rbuf, int *rcounts,
                                               int *rdispls,
                                               MPI_Datatype rdtype,
                                               MPI_Comm comm
                                                    ) = {
      smpi_coll_tuned_allgatherv_mpich_rdb,
      smpi_coll_tuned_allgatherv_ompi_bruck,
      smpi_coll_tuned_allgatherv_ring,
      smpi_coll_tuned_allgatherv_GB
};

intel_tuning_table_element intel_allgatherv_table[] =
{
    {2,3,
        {
        {259668,3},
        {635750,4},
        {2147483647,3}
        }
    },
    {4,7,
        {
        {1,1},
        {5,4},
        {46,1},
        {2590,2},
        {1177259,3},
        {2767234,4},
        {2147483647,3}
        }
    },
    {8,6,
        {
        {99,2},
        {143,1},
        {4646,2},
        {63522,3},
        {2187806,4},
        {2147483647,3}
        }
    },
    {2147483647,7,
        {
        {1,1},
        {5,4},
        {46,1},
        {2590,2},
        {1177259,3},
        {2767234,4},
        {2147483647,3}
        }
    }
};


/* I_MPI_ADJUST_GATHER

MPI_Gather

1. Binomial algorithm
2. Topology aware binomial algorithm
3. Shumilin's algorithm

*/

int (*intel_gather_functions_table[])(void *sbuf, int scount,
                                           MPI_Datatype sdtype,
                                           void* rbuf, int rcount,
                                           MPI_Datatype rdtype,
                                           int root,
                                           MPI_Comm comm
                                                    ) = {
      smpi_coll_tuned_gather_ompi_binomial,
      smpi_coll_tuned_gather_ompi_binomial,
      smpi_coll_tuned_gather_mvapich2
};

intel_tuning_table_element intel_gather_table[] =
{
    {8,3,
        {
        {17561,3},
        {44791,2},
        {2147483647,3}
        }
    },
    {16,7,
        {
        {16932,3},
        {84425,2},
        {158363,3},
        {702801,2},
        {1341444,3},
        {2413569,2},
        {2147483647,3}
        }
    },
    {2147483647,4,
        {
        {47187,3},
        {349696,2},
        {2147483647,3},
        {2147483647,1}
        }
    }
};


/* I_MPI_ADJUST_SCATTER

MPI_Scatter

1. Binomial algorithm
2. Topology aware binomial algorithm
3. Shumilin's algorithm

*/

int (*intel_scatter_functions_table[])(void *sbuf, int scount,
                                            MPI_Datatype sdtype,
                                            void* rbuf, int rcount,
                                            MPI_Datatype rdtype,
                                            int root, MPI_Comm comm
                                                    ) = {
      smpi_coll_tuned_scatter_ompi_binomial,
      smpi_coll_tuned_scatter_ompi_binomial,
      smpi_coll_tuned_scatter_mvapich2
};

intel_tuning_table_element intel_scatter_table[] =
{
    {2,2,
        {
        {16391,1},
        {2147483647,3}
        }
    },
    {4,6,
        {
        {16723,3},
        {153541,2},
        {425631,3},
        {794142,2},
        {1257027,3},
        {2147483647,2}
        }
    },
    {8,7,
        {
        {2633,3},
        {6144,2},
        {14043,3},
        {24576,2},
        {107995,3},
        {1752729,2},
        {2147483647,3}
        }
    },
    {16,7,
        {
        {2043,3},
        {2252,2},
        {17749,3},
        {106020,2},
        {628654,3},
        {3751354,2},
        {2147483647,3}
        }
    },
    {2147483647,4,
        {
        {65907,3},
        {245132,2},
        {1042439,3},
        {2147483647,2},
        {2147483647,1}
        }
    }
};


/* I_MPI_ADJUST_ALLTOALLV

MPI_Alltoallv

1. Isend/Irecv + waitall algorithm
2. Plum's algorithm

*/

int (*intel_alltoallv_functions_table[])(void *sbuf, int *scounts, int *sdisps,
                                              MPI_Datatype sdtype,
                                              void *rbuf, int *rcounts, int *rdisps,
                                              MPI_Datatype rdtype,
                                              MPI_Comm comm
                                                    ) = {
      smpi_coll_tuned_alltoallv_ompi_basic_linear,
      smpi_coll_tuned_alltoallv_bruck
};

intel_tuning_table_element intel_alltoallv_table[] =
{
    {2147483647,1,
        {
        {2147483647,1}
        }
    }
};


// The following message-size computations are taken from table 3.5-2 of the Intel MPI Reference Manual

#define SIZECOMP_reduce_scatter\
    int total_message_size = 0;\
    for (i = 0; i < comm_size; i++) { \
        total_message_size += rcounts[i];\
    }\
    size_t block_dsize = total_message_size*smpi_datatype_size(dtype);

#define SIZECOMP_allreduce\
  size_t block_dsize = rcount * smpi_datatype_size(dtype);

#define SIZECOMP_alltoall\
  size_t block_dsize = send_count * smpi_datatype_size(send_type);

#define SIZECOMP_bcast\
  size_t block_dsize = count * smpi_datatype_size(datatype);

#define SIZECOMP_reduce\
  size_t block_dsize = count * smpi_datatype_size(datatype);

#define SIZECOMP_barrier\
  size_t block_dsize = 1;

#define SIZECOMP_allgather\
  size_t block_dsize = recv_count * smpi_datatype_size(recv_type);

#define SIZECOMP_allgatherv\
    int total_message_size = 0;\
    for (i = 0; i < comm_size; i++) { \
        total_message_size += recv_count[i];\
    }\
    size_t block_dsize = total_message_size*smpi_datatype_size(recv_type);

#define SIZECOMP_gather\
  int rank = smpi_comm_rank(comm);\
  size_t block_dsize = (send_buff == MPI_IN_PLACE || rank == root) ?\
                recv_count * smpi_datatype_size(recv_type) :\
                send_count * smpi_datatype_size(send_type);

#define SIZECOMP_scatter\
  int rank = smpi_comm_rank(comm);\
  size_t block_dsize = (sendbuf == MPI_IN_PLACE || rank != root) ?\
                recvcount * smpi_datatype_size(recvtype) :\
                sendcount * smpi_datatype_size(sendtype);

#define SIZECOMP_alltoallv\
  size_t block_dsize = 1;

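/* IMPI_COLL_SELECT generates one smpi_coll_tuned_<collective>_impi function per collective:
 * it computes the relevant message size with the matching SIZECOMP_ macro, scans the tuning
 * table for the entry matching the communicator size and the threshold matching that size,
 * and dispatches to the corresponding entry of the functions table (algo is 1-based, hence
 * the -1). */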
#define IMPI_COLL_SELECT(cat, ret, args, args2)\
ret smpi_coll_tuned_ ## cat ## _impi (COLL_UNPAREN args)\
{\
    int comm_size = smpi_comm_size(comm);\
    int i = 0;\
    SIZECOMP_ ## cat\
    i = 0;\
    int j = 0;\
    while(comm_size >= intel_ ## cat ## _table[i].max_num_proc\
        && i < INTEL_MAX_NB_THRESHOLDS)\
      i++;\
    while(block_dsize >= intel_ ## cat ## _table[i].elems[j].max_size\
         && j < intel_ ## cat ## _table[i].num_elems)\
      j++;\
    return (intel_ ## cat ## _functions_table[intel_ ## cat ## _table[i].elems[j].algo-1]\
    args2);\
}

COLL_APPLY(IMPI_COLL_SELECT, COLL_ALLGATHERV_SIG, (send_buff, send_count, send_type, recv_buff, recv_count, recv_disps, recv_type, comm));
COLL_APPLY(IMPI_COLL_SELECT, COLL_ALLREDUCE_SIG, (sbuf, rbuf, rcount, dtype, op, comm));
COLL_APPLY(IMPI_COLL_SELECT, COLL_GATHER_SIG, (send_buff, send_count, send_type, recv_buff, recv_count, recv_type, root, comm));
COLL_APPLY(IMPI_COLL_SELECT, COLL_ALLGATHER_SIG, (send_buff, send_count, send_type, recv_buff, recv_count, recv_type, comm));
COLL_APPLY(IMPI_COLL_SELECT, COLL_ALLTOALL_SIG, (send_buff, send_count, send_type, recv_buff, recv_count, recv_type, comm));
COLL_APPLY(IMPI_COLL_SELECT, COLL_ALLTOALLV_SIG, (send_buff, send_counts, send_disps, send_type, recv_buff, recv_counts, recv_disps, recv_type, comm));
COLL_APPLY(IMPI_COLL_SELECT, COLL_BCAST_SIG, (buf, count, datatype, root, comm));
COLL_APPLY(IMPI_COLL_SELECT, COLL_REDUCE_SIG, (buf, rbuf, count, datatype, op, root, comm));
COLL_APPLY(IMPI_COLL_SELECT, COLL_REDUCE_SCATTER_SIG, (sbuf, rbuf, rcounts, dtype, op, comm));
COLL_APPLY(IMPI_COLL_SELECT, COLL_SCATTER_SIG, (sendbuf, sendcount, sendtype, recvbuf, recvcount, recvtype, root, comm));
COLL_APPLY(IMPI_COLL_SELECT, COLL_BARRIER_SIG, (comm));
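
/* The COLL_APPLY invocations above instantiate the selector for every collective, producing
 * functions named smpi_coll_tuned_allreduce_impi, smpi_coll_tuned_bcast_impi, and so on.
 * They are meant to be picked up through SimGrid's collective-selector configuration
 * (presumably something like --cfg=smpi/coll_selector:impi; the exact option name is an
 * assumption, not shown in this file). */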