Logo AND Algorithmique Numérique Distribuée

Public GIT Repository
indent to avoid GCC6 warning madness
[simgrid.git] / src / smpi / colls / smpi_mvapich2_selector_stampede.h
1 /* selector for collective algorithms based on mvapich decision logic, with calibration from Stampede cluster at TACC*/
2
3 /* Copyright (c) 2009-2010, 2013-2014. The SimGrid Team.
4  * All rights reserved.                                                     */
5
6 /* This is the tuning used by MVAPICH for Stampede platform based on (MV2_ARCH_INTEL_XEON_E5_2680_16, MV2_HCA_MLX_CX_FDR) */
7
8
9
10 /************ Alltoall variables and initializers                        */
11
12 #define MV2_MAX_NB_THRESHOLDS  32
13
14
15 typedef struct {
16   int min;
17   int max;
18   int (*MV2_pt_Alltoall_function) (void *sendbuf, int sendcount, MPI_Datatype sendtype,
19       void *recvbuf, int recvcount, MPI_Datatype recvtype,
20       MPI_Comm comm_ptr );
21 } mv2_alltoall_tuning_element;
22
23 typedef struct {
24   int numproc;
25   int size_table;
26   mv2_alltoall_tuning_element algo_table[MV2_MAX_NB_THRESHOLDS];
27   mv2_alltoall_tuning_element in_place_algo_table[MV2_MAX_NB_THRESHOLDS];
28 } mv2_alltoall_tuning_table;
29
30 int (*MV2_Alltoall_function) (void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype, MPI_Comm comm_ptr)=NULL;
31
32 /* Indicates number of processes per node */
33 int *mv2_alltoall_table_ppn_conf = NULL;
34 /* Indicates total number of configurations */
35 int mv2_alltoall_num_ppn_conf = 1;
36 int *mv2_size_alltoall_tuning_table = NULL;
37 mv2_alltoall_tuning_table **mv2_alltoall_thresholds_table = NULL;
38
39
40 #define MPIR_Alltoall_bruck_MV2 smpi_coll_tuned_alltoall_bruck
41 #define MPIR_Alltoall_RD_MV2 smpi_coll_tuned_alltoall_rdb
42 #define MPIR_Alltoall_Scatter_dest_MV2 smpi_coll_tuned_alltoall_mvapich2_scatter_dest
43 #define MPIR_Alltoall_pairwise_MV2 smpi_coll_tuned_alltoall_pair
44 #define MPIR_Alltoall_inplace_MV2 smpi_coll_tuned_alltoall_ring 
45
46
47 static void init_mv2_alltoall_tables_stampede(){
48   int i;
49   int agg_table_sum = 0;
50   mv2_alltoall_tuning_table **table_ptrs = NULL;
51   mv2_alltoall_num_ppn_conf = 3;
52   if(smpi_coll_cleanup_callback==NULL)
53     smpi_coll_cleanup_callback=&smpi_coll_cleanup_mvapich2;
54   mv2_alltoall_thresholds_table = xbt_malloc(sizeof(mv2_alltoall_tuning_table *)
55       * mv2_alltoall_num_ppn_conf);
56   table_ptrs = xbt_malloc(sizeof(mv2_alltoall_tuning_table *)
57       * mv2_alltoall_num_ppn_conf);
58   mv2_size_alltoall_tuning_table = xbt_malloc(sizeof(int) *
59       mv2_alltoall_num_ppn_conf);
60   mv2_alltoall_table_ppn_conf = xbt_malloc(mv2_alltoall_num_ppn_conf * sizeof(int));
61   mv2_alltoall_table_ppn_conf[0] = 1;
62   mv2_size_alltoall_tuning_table[0] = 6;
63   mv2_alltoall_tuning_table mv2_tmp_alltoall_thresholds_table_1ppn[] = {
64       {2,
65           1,
66           {{0, -1, &MPIR_Alltoall_pairwise_MV2},
67           },
68
69           {{0, -1, &MPIR_Alltoall_inplace_MV2},
70           },
71       },
72
73       {4,
74           2,
75           {{0, 262144, &MPIR_Alltoall_Scatter_dest_MV2},
76               {262144, -1, &MPIR_Alltoall_pairwise_MV2},
77           },
78
79           {{0, -1, &MPIR_Alltoall_inplace_MV2},
80           },
81       },
82
83       {8,
84           2,
85           {{0, 8, &MPIR_Alltoall_RD_MV2},
86               {8, -1, &MPIR_Alltoall_Scatter_dest_MV2},
87           },
88
89           {{0, -1, &MPIR_Alltoall_inplace_MV2},
90           },
91       },
92
93       {16,
94           3,
95           {{0, 64, &MPIR_Alltoall_RD_MV2},
96               {64, 512, &MPIR_Alltoall_bruck_MV2},
97               {512, -1, &MPIR_Alltoall_Scatter_dest_MV2},
98           },
99
100           {{0,-1, &MPIR_Alltoall_inplace_MV2},
101           },
102       },
103
104       {32,
105           3,
106           {{0, 32, &MPIR_Alltoall_RD_MV2},
107               {32, 2048, &MPIR_Alltoall_bruck_MV2},
108               {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
109           },
110
111           {{0, -1, &MPIR_Alltoall_inplace_MV2},
112           },
113       },
114
115       {64,
116           3,
117           {{0, 8, &MPIR_Alltoall_RD_MV2},
118               {8, 1024, &MPIR_Alltoall_bruck_MV2},
119               {1024, -1, &MPIR_Alltoall_Scatter_dest_MV2},
120           },
121
122           {{0, -1, &MPIR_Alltoall_inplace_MV2},
123           },
124       },
125   };
126   table_ptrs[0] = mv2_tmp_alltoall_thresholds_table_1ppn;
127   mv2_alltoall_table_ppn_conf[1] = 2;
128   mv2_size_alltoall_tuning_table[1] = 6;
129   mv2_alltoall_tuning_table mv2_tmp_alltoall_thresholds_table_2ppn[] = {
130       {4,
131           2,
132           {{0, 32, &MPIR_Alltoall_RD_MV2},
133               {32, -1, &MPIR_Alltoall_Scatter_dest_MV2},
134           },
135
136           {{0, -1, &MPIR_Alltoall_inplace_MV2},
137           },
138       },
139
140       {8,
141           2,
142           {{0, 64, &MPIR_Alltoall_RD_MV2},
143               {64, -1, &MPIR_Alltoall_Scatter_dest_MV2},
144           },
145
146           {{0, -1, &MPIR_Alltoall_inplace_MV2},
147           },
148       },
149
150       {16,
151           3,
152           {{0, 64, &MPIR_Alltoall_RD_MV2},
153               {64, 2048, &MPIR_Alltoall_bruck_MV2},
154               {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
155           },
156
157           {{0,-1, &MPIR_Alltoall_inplace_MV2},
158           },
159       },
160
161       {32,
162           3,
163           {{0, 16, &MPIR_Alltoall_RD_MV2},
164               {16, 2048, &MPIR_Alltoall_bruck_MV2},
165               {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
166           },
167
168           {{0, -1, &MPIR_Alltoall_inplace_MV2},
169           },
170       },
171
172       {64,
173           3,
174           {{0, 8, &MPIR_Alltoall_RD_MV2},
175               {8, 1024, &MPIR_Alltoall_bruck_MV2},
176               {1024, -1, &MPIR_Alltoall_Scatter_dest_MV2},
177           },
178
179           {{0, -1, &MPIR_Alltoall_inplace_MV2},
180           },
181       },
182
183       {128,
184           3,
185           {{0, 4, &MPIR_Alltoall_RD_MV2},
186               {4, 2048, &MPIR_Alltoall_bruck_MV2},
187               {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
188           },
189
190           {{0, -1, &MPIR_Alltoall_inplace_MV2},
191           },
192       },
193   };
194   table_ptrs[1] = mv2_tmp_alltoall_thresholds_table_2ppn;
195   mv2_alltoall_table_ppn_conf[2] = 16;
196   mv2_size_alltoall_tuning_table[2] = 7;
197   mv2_alltoall_tuning_table mv2_tmp_alltoall_thresholds_table_16ppn[] = {
198       {16,
199           2,
200           {{0, 2048, &MPIR_Alltoall_bruck_MV2},
201               {2048, -1,  &MPIR_Alltoall_Scatter_dest_MV2},
202           },
203
204           {{32768, -1, &MPIR_Alltoall_inplace_MV2},
205           },
206       },
207
208       {32,
209           2,
210           {{0, 2048, &MPIR_Alltoall_bruck_MV2},
211               {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
212           },
213
214           {{16384, -1, &MPIR_Alltoall_inplace_MV2},
215           },
216       },
217
218       {64,
219           3,
220           {{0, 2048, &MPIR_Alltoall_bruck_MV2},
221               {2048, 16384, &MPIR_Alltoall_Scatter_dest_MV2},
222               {16384, -1, &MPIR_Alltoall_pairwise_MV2},
223           },
224
225           {{32768, 131072, &MPIR_Alltoall_inplace_MV2},
226           },
227       },
228
229       {128,
230           2,
231           {{0, 2048, &MPIR_Alltoall_bruck_MV2},
232               {2048, -1, &MPIR_Alltoall_pairwise_MV2},
233           },
234
235           {{16384,65536, &MPIR_Alltoall_inplace_MV2},
236           },
237       },
238
239       {256,
240           2,
241           {{0, 1024, &MPIR_Alltoall_bruck_MV2},
242               {1024, -1, &MPIR_Alltoall_pairwise_MV2},
243           },
244
245           {{16384, 65536, &MPIR_Alltoall_inplace_MV2},
246           },
247       },
248
249       {512,
250           2,
251           {{0, 1024, &MPIR_Alltoall_bruck_MV2},
252               {1024, -1, &MPIR_Alltoall_pairwise_MV2},
253           },
254
255           {{16384, 65536, &MPIR_Alltoall_inplace_MV2},
256           },
257       },
258       {1024,
259           2,
260           {{0, 1024, &MPIR_Alltoall_bruck_MV2},
261               {1024, -1, &MPIR_Alltoall_pairwise_MV2},
262           },
263
264           {{16384, 65536, &MPIR_Alltoall_inplace_MV2},
265           },
266       },
267
268   };
269   table_ptrs[2] = mv2_tmp_alltoall_thresholds_table_16ppn;
270   agg_table_sum = 0;
271   for (i = 0; i < mv2_alltoall_num_ppn_conf; i++) {
272       agg_table_sum += mv2_size_alltoall_tuning_table[i];
273   }
274   mv2_alltoall_thresholds_table[0] =
275       xbt_malloc(agg_table_sum * sizeof (mv2_alltoall_tuning_table));
276   memcpy(mv2_alltoall_thresholds_table[0], table_ptrs[0],
277       (sizeof(mv2_alltoall_tuning_table)
278           * mv2_size_alltoall_tuning_table[0]));
279   for (i = 1; i < mv2_alltoall_num_ppn_conf; i++) {
280       mv2_alltoall_thresholds_table[i] =
281           mv2_alltoall_thresholds_table[i - 1]
282                                         + mv2_size_alltoall_tuning_table[i - 1];
283       memcpy(mv2_alltoall_thresholds_table[i], table_ptrs[i],
284           (sizeof(mv2_alltoall_tuning_table)
285               * mv2_size_alltoall_tuning_table[i]));
286   }
287   xbt_free(table_ptrs);
288
289
290 }
291
292
293 /************ Allgather variables and initializers                        */
294
295 typedef struct {
296   int min;
297   int max;
298   int (*MV2_pt_Allgather_function)(void *sendbuf,
299       int sendcount,
300       MPI_Datatype sendtype,
301       void *recvbuf,
302       int recvcount,
303       MPI_Datatype recvtype, MPI_Comm comm_ptr);
304 } mv2_allgather_tuning_element;
305
306 typedef struct {
307   int numproc;
308   int two_level[MV2_MAX_NB_THRESHOLDS];
309   int size_inter_table;
310   mv2_allgather_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
311 } mv2_allgather_tuning_table;
312
313 int (*MV2_Allgather_function)(void *sendbuf,
314     int sendcount,
315     MPI_Datatype sendtype,
316     void *recvbuf,
317     int recvcount,
318     MPI_Datatype recvtype, MPI_Comm comm);
319
320 int *mv2_allgather_table_ppn_conf = NULL;
321 int mv2_allgather_num_ppn_conf = 1;
322 int *mv2_size_allgather_tuning_table = NULL;
323 mv2_allgather_tuning_table **mv2_allgather_thresholds_table = NULL;
324
325 static int MPIR_Allgather_RD_Allgather_Comm_MV2( void *sendbuf,
326                                  int sendcount,
327                                  MPI_Datatype sendtype,
328                                  void *recvbuf,
329                                  int recvcount,
330                                  MPI_Datatype recvtype, MPI_Comm comm_ptr)
331 {
332     return 0;
333 }
334
/* MVAPICH2-style routine names mapped onto the SMPI implementations. */
#define MPIR_Allgather_Bruck_MV2 smpi_coll_tuned_allgather_bruck
#define MPIR_Allgather_RD_MV2    smpi_coll_tuned_allgather_rdb
#define MPIR_Allgather_Ring_MV2  smpi_coll_tuned_allgather_ring
#define MPIR_2lvl_Allgather_MV2  smpi_coll_tuned_allgather_mvapich2_smp
339
340 static void init_mv2_allgather_tables_stampede(){
341   int i;
342   int agg_table_sum = 0;
343   mv2_allgather_tuning_table **table_ptrs = NULL;
344   mv2_allgather_num_ppn_conf = 3;
345   mv2_allgather_thresholds_table
346   = xbt_malloc(sizeof(mv2_allgather_tuning_table *)
347       * mv2_allgather_num_ppn_conf);
348   table_ptrs = xbt_malloc(sizeof(mv2_allgather_tuning_table *)
349       * mv2_allgather_num_ppn_conf);
350   mv2_size_allgather_tuning_table = xbt_malloc(sizeof(int) *
351       mv2_allgather_num_ppn_conf);
352   mv2_allgather_table_ppn_conf
353   = xbt_malloc(mv2_allgather_num_ppn_conf * sizeof(int));
354   mv2_allgather_table_ppn_conf[0] = 1;
355   mv2_size_allgather_tuning_table[0] = 6;
356   mv2_allgather_tuning_table mv2_tmp_allgather_thresholds_table_1ppn[] = {
357       {
358           2,
359           {0},
360           1,
361           {
362               {0, -1, &MPIR_Allgather_Ring_MV2},
363           },
364       },
365       {
366           4,
367           {0,0},
368           2,
369           {
370               {0, 262144, &MPIR_Allgather_RD_MV2},
371               {262144, -1, &MPIR_Allgather_Ring_MV2},
372           },
373       },
374       {
375           8,
376           {0,0},
377           2,
378           {
379               {0, 131072, &MPIR_Allgather_RD_MV2},
380               {131072, -1, &MPIR_Allgather_Ring_MV2},
381           },
382       },
383       {
384           16,
385           {0,0},
386           2,
387           {
388               {0, 131072, &MPIR_Allgather_RD_MV2},
389               {131072, -1, &MPIR_Allgather_Ring_MV2},
390           },
391       },
392       {
393           32,
394           {0,0},
395           2,
396           {
397               {0, 65536, &MPIR_Allgather_RD_MV2},
398               {65536, -1, &MPIR_Allgather_Ring_MV2},
399           },
400       },
401       {
402           64,
403           {0,0},
404           2,
405           {
406               {0, 32768, &MPIR_Allgather_RD_MV2},
407               {32768, -1, &MPIR_Allgather_Ring_MV2},
408           },
409       },
410   };
411   table_ptrs[0] = mv2_tmp_allgather_thresholds_table_1ppn;
412   mv2_allgather_table_ppn_conf[1] = 2;
413   mv2_size_allgather_tuning_table[1] = 6;
414   mv2_allgather_tuning_table mv2_tmp_allgather_thresholds_table_2ppn[] = {
415       {
416           4,
417           {0,0},
418           2,
419           {
420               {0, 524288, &MPIR_Allgather_RD_MV2},
421               {524288, -1, &MPIR_Allgather_Ring_MV2},
422           },
423       },
424       {
425           8,
426           {0,1,0},
427           2,
428           {
429               {0, 32768, &MPIR_Allgather_RD_MV2},
430               {32768, 524288, &MPIR_Allgather_Ring_MV2},
431               {524288, -1, &MPIR_Allgather_Ring_MV2},
432           },
433       },
434       {
435           16,
436           {0,1,0},
437           2,
438           {
439               {0, 16384, &MPIR_Allgather_RD_MV2},
440               {16384, 524288, &MPIR_Allgather_Ring_MV2},
441               {524288, -1, &MPIR_Allgather_Ring_MV2},
442           },
443       },
444       {
445           32,
446           {1,1,0},
447           2,
448           {
449               {0, 65536, &MPIR_Allgather_RD_MV2},
450               {65536, 524288, &MPIR_Allgather_Ring_MV2},
451               {524288, -1, &MPIR_Allgather_Ring_MV2},
452           },
453       },
454       {
455           64,
456           {1,1,0},
457           2,
458           {
459               {0, 32768, &MPIR_Allgather_RD_MV2},
460               {32768, 524288, &MPIR_Allgather_Ring_MV2},
461               {524288, -1, &MPIR_Allgather_Ring_MV2},
462           },
463       },
464       {
465           128,
466           {1,1,0},
467           2,
468           {
469               {0, 65536, &MPIR_Allgather_RD_MV2},
470               {65536, 524288, &MPIR_Allgather_Ring_MV2},
471               {524288, -1, &MPIR_Allgather_Ring_MV2},
472           },
473       },
474   };
475   table_ptrs[1] = mv2_tmp_allgather_thresholds_table_2ppn;
476   mv2_allgather_table_ppn_conf[2] = 16;
477   mv2_size_allgather_tuning_table[2] = 6;
478   mv2_allgather_tuning_table mv2_tmp_allgather_thresholds_table_16ppn[] = {
479       {
480           16,
481           {0,0},
482           2,
483           {
484               {0, 1024, &MPIR_Allgather_RD_MV2},
485               {1024, -1, &MPIR_Allgather_Ring_MV2},
486           },
487       },
488       {
489           32,
490           {0,0},
491           2,
492           {
493               {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
494               {1024, -1, &MPIR_Allgather_Ring_MV2},
495           },
496       },
497       {
498           64,
499           {0,0},
500           2,
501           {
502               {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
503               {1024, -1, &MPIR_Allgather_Ring_MV2},
504           },
505       },
506       {
507           128,
508           {0,0},
509           2,
510           {
511               {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
512               {1024, -1, &MPIR_Allgather_Ring_MV2},
513           },
514       },
515       {
516           256,
517           {0,0},
518           2,
519           {
520               {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
521               {1024, -1, &MPIR_Allgather_Ring_MV2},
522           },
523       },
524       {
525           512,
526           {0,0},
527           2,
528           {
529               {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
530               {1024, -1, &MPIR_Allgather_Ring_MV2},
531           },
532       },
533
534   };
535   table_ptrs[2] = mv2_tmp_allgather_thresholds_table_16ppn;
536   agg_table_sum = 0;
537   for (i = 0; i < mv2_allgather_num_ppn_conf; i++) {
538       agg_table_sum += mv2_size_allgather_tuning_table[i];
539   }
540   mv2_allgather_thresholds_table[0] =
541       xbt_malloc(agg_table_sum * sizeof (mv2_allgather_tuning_table));
542   memcpy(mv2_allgather_thresholds_table[0], table_ptrs[0],
543       (sizeof(mv2_allgather_tuning_table)
544           * mv2_size_allgather_tuning_table[0]));
545   for (i = 1; i < mv2_allgather_num_ppn_conf; i++) {
546       mv2_allgather_thresholds_table[i] =
547           mv2_allgather_thresholds_table[i - 1]
548                                          + mv2_size_allgather_tuning_table[i - 1];
549       memcpy(mv2_allgather_thresholds_table[i], table_ptrs[i],
550           (sizeof(mv2_allgather_tuning_table)
551               * mv2_size_allgather_tuning_table[i]));
552   }
553   xbt_free(table_ptrs);
554 }
555
556
557 /************ Gather variables and initializers                        */
558
559 typedef struct {
560   int min;
561   int max;
562   int (*MV2_pt_Gather_function)(void *sendbuf, int sendcnt,
563       MPI_Datatype sendtype, void *recvbuf, int recvcnt,
564       MPI_Datatype recvtype, int root, MPI_Comm  comm_ptr);
565 } mv2_gather_tuning_element;
566
567
568 typedef struct {
569   int numproc;
570   int size_inter_table;
571   mv2_gather_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
572   int size_intra_table;
573   mv2_gather_tuning_element intra_node[MV2_MAX_NB_THRESHOLDS];
574 } mv2_gather_tuning_table;
575
576 int mv2_size_gather_tuning_table=7;
577 mv2_gather_tuning_table * mv2_gather_thresholds_table=NULL; 
578
579 typedef int (*MV2_Gather_function_ptr) (void *sendbuf,
580     int sendcnt,
581     MPI_Datatype sendtype,
582     void *recvbuf,
583     int recvcnt,
584     MPI_Datatype recvtype,
585     int root, MPI_Comm comm);
586
587 MV2_Gather_function_ptr MV2_Gather_inter_leader_function = NULL;
588 MV2_Gather_function_ptr MV2_Gather_intra_node_function = NULL;
589
590
591 #define MPIR_Gather_MV2_Direct smpi_coll_tuned_gather_ompi_basic_linear
592 #define MPIR_Gather_MV2_two_level_Direct smpi_coll_tuned_gather_mvapich2_two_level
593 #define MPIR_Gather_intra smpi_coll_tuned_gather_mpich
594
595
596 static void init_mv2_gather_tables_stampede(){
597
598   if(smpi_coll_cleanup_callback==NULL)
599     smpi_coll_cleanup_callback=&smpi_coll_cleanup_mvapich2;
600   mv2_size_gather_tuning_table=7;
601   mv2_gather_thresholds_table = xbt_malloc(mv2_size_gather_tuning_table*
602       sizeof (mv2_gather_tuning_table));
603   mv2_gather_tuning_table mv2_tmp_gather_thresholds_table[]={
604       {16,
605           2,{{0, 524288, &MPIR_Gather_MV2_Direct},
606               {524288, -1, &MPIR_Gather_intra}},
607               1,{{0, -1, &MPIR_Gather_MV2_Direct}}},
608               {32,
609                   3,{{0, 16384, &MPIR_Gather_MV2_Direct},
610                       {16384, 131072, &MPIR_Gather_intra},
611                       {131072, -1, &MPIR_Gather_MV2_two_level_Direct}},
612                       1,{{0, -1, &MPIR_Gather_intra}}},
613                       {64,
614                           3,{{0, 256, &MPIR_Gather_MV2_two_level_Direct},
615                               {256, 16384, &MPIR_Gather_MV2_Direct},
616                               {256, -1, &MPIR_Gather_MV2_two_level_Direct}},
617                               1,{{0, -1, &MPIR_Gather_intra}}},
618                               {128,
619                                   3,{{0, 512, &MPIR_Gather_MV2_two_level_Direct},
620                                       {512, 16384, &MPIR_Gather_MV2_Direct},
621                                       {16384, -1, &MPIR_Gather_MV2_two_level_Direct}},
622                                       1,{{0, -1, &MPIR_Gather_intra}}},
623                                       {256,
624                                           3,{{0, 512, &MPIR_Gather_MV2_two_level_Direct},
625                                               {512, 16384, &MPIR_Gather_MV2_Direct},
626                                               {16384, -1, &MPIR_Gather_MV2_two_level_Direct}},
627                                               1,{{0, -1, &MPIR_Gather_intra}}},
628                                               {512,
629                                                   3,{{0, 512, &MPIR_Gather_MV2_two_level_Direct},
630                                                       {512, 16384, &MPIR_Gather_MV2_Direct},
631                                                       {8196, -1, &MPIR_Gather_MV2_two_level_Direct}},
632                                                       1,{{0, -1, &MPIR_Gather_intra}}},
633                                                       {1024,
634                                                           3,{{0, 512, &MPIR_Gather_MV2_two_level_Direct},
635                                                               {512, 16384, &MPIR_Gather_MV2_Direct},
636                                                               {8196, -1, &MPIR_Gather_MV2_two_level_Direct}},
637                                                               1,{{0, -1, &MPIR_Gather_intra}}},
638   };
639
640   memcpy(mv2_gather_thresholds_table, mv2_tmp_gather_thresholds_table,
641       mv2_size_gather_tuning_table * sizeof (mv2_gather_tuning_table));
642
643 }
644
645
646 /************ Allgatherv variables and initializers                        */
647
648 typedef struct {
649   int min;
650   int max;
651   int (*MV2_pt_Allgatherv_function)(void *sendbuf,
652       int sendcount,
653       MPI_Datatype sendtype,
654       void *recvbuf,
655       int *recvcounts,
656       int *displs,
657       MPI_Datatype recvtype,
658       MPI_Comm commg);
659 } mv2_allgatherv_tuning_element;
660
661 typedef struct {
662   int numproc;
663   int size_inter_table;
664   mv2_allgatherv_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
665 } mv2_allgatherv_tuning_table;
666
667 int (*MV2_Allgatherv_function)(void *sendbuf,
668     int sendcount,
669     MPI_Datatype sendtype,
670     void *recvbuf,
671     int *recvcounts,
672     int *displs,
673     MPI_Datatype recvtype,
674     MPI_Comm comm);
675
676 int mv2_size_allgatherv_tuning_table = 0;
677 mv2_allgatherv_tuning_table *mv2_allgatherv_thresholds_table = NULL;
678
679 #define MPIR_Allgatherv_Rec_Doubling_MV2 smpi_coll_tuned_allgatherv_mpich_rdb
680 #define MPIR_Allgatherv_Bruck_MV2 smpi_coll_tuned_allgatherv_ompi_bruck
681 #define MPIR_Allgatherv_Ring_MV2 smpi_coll_tuned_allgatherv_mpich_ring
682
683
684 static void init_mv2_allgatherv_tables_stampede(){
685   if(smpi_coll_cleanup_callback==NULL)
686     smpi_coll_cleanup_callback=&smpi_coll_cleanup_mvapich2;
687   mv2_size_allgatherv_tuning_table = 6;
688   mv2_allgatherv_thresholds_table = xbt_malloc(mv2_size_allgatherv_tuning_table *
689       sizeof (mv2_allgatherv_tuning_table));
690   mv2_allgatherv_tuning_table mv2_tmp_allgatherv_thresholds_table[] = {
691       {
692           16,
693           2,
694           {
695               {0, 512, &MPIR_Allgatherv_Rec_Doubling_MV2},
696               {512, -1, &MPIR_Allgatherv_Ring_MV2},
697           },
698       },
699       {
700           32,
701           2,
702           {
703               {0, 512, &MPIR_Allgatherv_Rec_Doubling_MV2},
704               {512, -1, &MPIR_Allgatherv_Ring_MV2},
705           },
706       },
707       {
708           64,
709           2,
710           {
711               {0, 256, &MPIR_Allgatherv_Rec_Doubling_MV2},
712               {256, -1, &MPIR_Allgatherv_Ring_MV2},
713           },
714       },
715       {
716           128,
717           2,
718           {
719               {0, 256, &MPIR_Allgatherv_Rec_Doubling_MV2},
720               {256, -1, &MPIR_Allgatherv_Ring_MV2},
721           },
722       },
723       {
724           256,
725           2,
726           {
727               {0, 256, &MPIR_Allgatherv_Rec_Doubling_MV2},
728               {256, -1, &MPIR_Allgatherv_Ring_MV2},
729           },
730       },
731       {
732           512,
733           2,
734           {
735               {0, 256, &MPIR_Allgatherv_Rec_Doubling_MV2},
736               {256, -1, &MPIR_Allgatherv_Ring_MV2},
737           },
738       },
739
740   };
741   memcpy(mv2_allgatherv_thresholds_table, mv2_tmp_allgatherv_thresholds_table,
742       mv2_size_allgatherv_tuning_table * sizeof (mv2_allgatherv_tuning_table));
743 }
744
745
746 /************ Allreduce variables and initializers                        */
747
748 typedef struct {
749   int min;
750   int max;
751   int (*MV2_pt_Allreduce_function)(void *sendbuf,
752       void *recvbuf,
753       int count,
754       MPI_Datatype datatype,
755       MPI_Op op, MPI_Comm comm);
756 } mv2_allreduce_tuning_element;
757
758 typedef struct {
759   int numproc;
760   int mcast_enabled;
761   int is_two_level_allreduce[MV2_MAX_NB_THRESHOLDS];
762   int size_inter_table;
763   mv2_allreduce_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
764   int size_intra_table;
765   mv2_allreduce_tuning_element intra_node[MV2_MAX_NB_THRESHOLDS];
766 } mv2_allreduce_tuning_table;
767
768
769 int (*MV2_Allreduce_function)(void *sendbuf,
770     void *recvbuf,
771     int count,
772     MPI_Datatype datatype,
773     MPI_Op op, MPI_Comm comm)=NULL;
774
775
776 int (*MV2_Allreduce_intra_function)( void *sendbuf,
777     void *recvbuf,
778     int count,
779     MPI_Datatype datatype,
780     MPI_Op op, MPI_Comm comm)=NULL;
781
782 int mv2_size_allreduce_tuning_table = 0;
783 mv2_allreduce_tuning_table *mv2_allreduce_thresholds_table = NULL;
784
785
786
787
788
789 static int MPIR_Allreduce_mcst_reduce_two_level_helper_MV2( void *sendbuf,
790     void *recvbuf,
791     int count,
792     MPI_Datatype datatype,
793     MPI_Op op, MPI_Comm comm)
794
795   return 0;
796 }
797
798 static  int MPIR_Allreduce_mcst_reduce_redscat_gather_MV2( void *sendbuf,
799     void *recvbuf,
800     int count,
801     MPI_Datatype datatype,
802     MPI_Op op, MPI_Comm  comm)
803 {
804   return 0;
805 }
806
807 static  int MPIR_Allreduce_reduce_p2p_MV2( void *sendbuf,
808     void *recvbuf,
809     int count,
810     MPI_Datatype datatype,
811     MPI_Op op, MPI_Comm  comm)
812 {
813   mpi_coll_reduce_fun(sendbuf,recvbuf,count,datatype,op,0,comm);
814   return MPI_SUCCESS;
815 }
816
817 static  int MPIR_Allreduce_reduce_shmem_MV2( void *sendbuf,
818     void *recvbuf,
819     int count,
820     MPI_Datatype datatype,
821     MPI_Op op, MPI_Comm  comm)
822 {
823   mpi_coll_reduce_fun(sendbuf,recvbuf,count,datatype,op,0,comm);
824   return MPI_SUCCESS;
825 }
826
/* MVAPICH2-style routine names mapped onto the SMPI implementations. */
#define MPIR_Allreduce_pt2pt_rd_MV2  smpi_coll_tuned_allreduce_rdb
#define MPIR_Allreduce_pt2pt_rs_MV2  smpi_coll_tuned_allreduce_mvapich2_rs
#define MPIR_Allreduce_two_level_MV2 smpi_coll_tuned_allreduce_mvapich2_two_level
830
831
832 static void init_mv2_allreduce_tables_stampede(){
833   if(smpi_coll_cleanup_callback==NULL)
834     smpi_coll_cleanup_callback=&smpi_coll_cleanup_mvapich2;
835   mv2_size_allreduce_tuning_table = 8;
836   mv2_allreduce_thresholds_table = xbt_malloc(mv2_size_allreduce_tuning_table *
837       sizeof (mv2_allreduce_tuning_table));
838   mv2_allreduce_tuning_table mv2_tmp_allreduce_thresholds_table[] = {
839       {
840           16,
841           0,
842           {1, 0},
843           2,
844           {
845               {0, 1024, &MPIR_Allreduce_pt2pt_rd_MV2},
846               {1024, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
847           },
848           2,
849           {
850               {0, 1024, &MPIR_Allreduce_reduce_shmem_MV2},
851               {1024, -1, &MPIR_Allreduce_reduce_p2p_MV2},
852           },
853       },
854       {
855           32,
856           0,
857           {1, 1, 0},
858           3,
859           {
860               {0, 1024, &MPIR_Allreduce_pt2pt_rd_MV2},
861               {1024, 16384, &MPIR_Allreduce_pt2pt_rd_MV2},
862               {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
863           },
864           2,
865           {
866               {0, 1024, &MPIR_Allreduce_reduce_shmem_MV2},
867               {1024, 16384, &MPIR_Allreduce_reduce_p2p_MV2},
868           },
869       },
870       {
871           64,
872           0,
873           {1, 1, 0},
874           3,
875           {
876               {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2},
877               {512, 16384, &MPIR_Allreduce_pt2pt_rd_MV2},
878               {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
879           },
880           2,
881           {
882               {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
883               {512, 16384, &MPIR_Allreduce_reduce_p2p_MV2},
884           },
885       },
886       {
887           128,
888           0,
889           {1, 1, 0},
890           3,
891           {
892               {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2},
893               {512, 16384, &MPIR_Allreduce_pt2pt_rd_MV2},
894               {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
895           },
896           2,
897           {
898               {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
899               {512, 16384, &MPIR_Allreduce_reduce_p2p_MV2},
900           },
901       },
902       {
903           256,
904           0,
905           {1, 1, 0},
906           3,
907           {
908               {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2},
909               {512, 16384, &MPIR_Allreduce_pt2pt_rd_MV2},
910               {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
911           },
912           2,
913           {
914               {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
915               {512, -1, &MPIR_Allreduce_reduce_p2p_MV2},
916           },
917       },
918       {
919           512,
920           0,
921           {1, 1, 0},
922           3,
923           {
924               {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2},
925               {512, 16384, &MPIR_Allreduce_pt2pt_rd_MV2},
926               {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
927           },
928           2,
929           {
930               {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
931               {512, 16384, &MPIR_Allreduce_reduce_p2p_MV2},
932           },
933       },
934       {
935           1024,
936           0,
937           {1, 1, 1, 0},
938           4,
939           {
940               {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2},
941               {512, 8192, &MPIR_Allreduce_pt2pt_rd_MV2},
942               {8192, 65536, &MPIR_Allreduce_pt2pt_rs_MV2},
943               {65536, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
944           },
945           2,
946           {
947               {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
948               {512, -1, &MPIR_Allreduce_reduce_p2p_MV2},
949           },
950       },
951       {
952           2048,
953           0,
954           {1, 1, 1, 0},
955           4,
956           {
957               {0, 64, &MPIR_Allreduce_pt2pt_rd_MV2},
958               {64, 512, &MPIR_Allreduce_reduce_p2p_MV2},
959               {512, 4096, &MPIR_Allreduce_mcst_reduce_two_level_helper_MV2},
960               {4096, 16384, &MPIR_Allreduce_pt2pt_rs_MV2},
961               {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
962           },
963           2,
964           {
965               {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
966               {512, -1, &MPIR_Allreduce_reduce_p2p_MV2},
967           },
968       },
969
970   };
971   memcpy(mv2_allreduce_thresholds_table, mv2_tmp_allreduce_thresholds_table,
972       mv2_size_allreduce_tuning_table * sizeof (mv2_allreduce_tuning_table));
973 }
974
975
976
977
978 typedef struct {
979     int min;
980     int max;
981     int (*MV2_pt_Bcast_function) (void *buf, int count, MPI_Datatype datatype,
982                                   int root, MPI_Comm comm_ptr);
983     int zcpy_pipelined_knomial_factor;
984 } mv2_bcast_tuning_element;
985
986 typedef struct {
987     int numproc;
988     int bcast_segment_size;
989     int intra_node_knomial_factor;
990     int inter_node_knomial_factor;
991     int is_two_level_bcast[MV2_MAX_NB_THRESHOLDS];
992     int size_inter_table;
993     mv2_bcast_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
994     int size_intra_table;
995     mv2_bcast_tuning_element intra_node[MV2_MAX_NB_THRESHOLDS];
996 } mv2_bcast_tuning_table;
997
998 int mv2_size_bcast_tuning_table = 0;
999 mv2_bcast_tuning_table *mv2_bcast_thresholds_table = NULL;
1000
1001
1002 int (*MV2_Bcast_function) (void *buffer, int count, MPI_Datatype datatype,
1003                            int root, MPI_Comm comm_ptr) = NULL;
1004
1005 int (*MV2_Bcast_intra_node_function) (void *buffer, int count, MPI_Datatype datatype,
1006                                       int root, MPI_Comm comm_ptr) = NULL;
1007
1008 int zcpy_knomial_factor = 2;
1009 int mv2_pipelined_zcpy_knomial_factor = -1;
1010 int bcast_segment_size = 8192;
1011 int mv2_inter_node_knomial_factor = 4;
1012 int mv2_intra_node_knomial_factor = 4;
/*
 * Broadcast thresholds.
 * NOTE(review): the code that compares against these values is outside this
 * section; confirm the units (bytes vs. element count) there.
 */
#define mv2_bcast_two_level_system_size  64
#define mv2_bcast_short_msg              16384
/*
 * Parenthesized so that the product stays a single operand wherever the macro
 * is expanded (e.g. "x / mv2_bcast_large_msg" or "x % mv2_bcast_large_msg").
 */
#define mv2_bcast_large_msg              (512*1024)

#define INTRA_NODE_ROOT 0
1018
/*
 * MVAPICH2 broadcast algorithm names used by the tuning tables, mapped onto
 * the smpi_coll_tuned_* implementations. Several MVAPICH2 names share one
 * implementation (e.g. the pipelined and shmem variants all resolve to
 * smpi_coll_tuned_bcast_mpich).
 */
#define MPIR_Pipelined_Bcast_Zcpy_MV2              smpi_coll_tuned_bcast_mpich
#define MPIR_Pipelined_Bcast_MV2                   smpi_coll_tuned_bcast_mpich
#define MPIR_Bcast_binomial_MV2                    smpi_coll_tuned_bcast_binomial_tree
#define MPIR_Bcast_scatter_ring_allgather_shm_MV2  smpi_coll_tuned_bcast_scatter_LR_allgather
#define MPIR_Bcast_scatter_doubling_allgather_MV2  smpi_coll_tuned_bcast_scatter_rdb_allgather
#define MPIR_Bcast_scatter_ring_allgather_MV2      smpi_coll_tuned_bcast_scatter_LR_allgather
#define MPIR_Shmem_Bcast_MV2                       smpi_coll_tuned_bcast_mpich
#define MPIR_Bcast_tune_inter_node_helper_MV2      smpi_coll_tuned_bcast_mvapich2_inter_node
#define MPIR_Bcast_inter_node_helper_MV2           smpi_coll_tuned_bcast_mvapich2_inter_node
#define MPIR_Knomial_Bcast_intra_node_MV2          smpi_coll_tuned_bcast_mvapich2_knomial_intra_node
#define MPIR_Bcast_intra_MV2                       smpi_coll_tuned_bcast_mvapich2_intra_node
1030
1031 static void init_mv2_bcast_tables_stampede(){
1032  //Stampede,
1033   if(smpi_coll_cleanup_callback==NULL)
1034     smpi_coll_cleanup_callback=&smpi_coll_cleanup_mvapich2;
1035   mv2_size_bcast_tuning_table=8;
1036   mv2_bcast_thresholds_table = xbt_malloc(mv2_size_bcast_tuning_table *
1037   sizeof (mv2_bcast_tuning_table));
1038
1039   mv2_bcast_tuning_table mv2_tmp_bcast_thresholds_table[]={
1040     {
1041             16,
1042             8192, 4, 4,
1043             {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
1044             11,
1045             {
1046               {0, 8, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1047               {8, 16, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1048               {16, 1024, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1049               {1024, 8192, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1050               {8192, 16384, &MPIR_Bcast_binomial_MV2, -1},
1051               {16384, 32768, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1052               {32768, 65536, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1053               {65536, 131072, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1},
1054               {131072, 262144, &MPIR_Bcast_scatter_ring_allgather_MV2, -1},
1055               {262144, 524288, &MPIR_Bcast_scatter_doubling_allgather_MV2, -1},
1056               {524288, -1, &MPIR_Bcast_scatter_ring_allgather_MV2, -1}
1057             },
1058             11,
1059             {
1060               {0, 8, &MPIR_Shmem_Bcast_MV2, 2},
1061               {8, 16, &MPIR_Shmem_Bcast_MV2, 4},
1062               {16, 1024, &MPIR_Shmem_Bcast_MV2, 2},
1063               {1024, 8192, &MPIR_Shmem_Bcast_MV2, 4},
1064               {8192, 16384, &MPIR_Shmem_Bcast_MV2, -1},
1065               {16384, 32768, &MPIR_Shmem_Bcast_MV2, 4},
1066               {32768, 65536, &MPIR_Shmem_Bcast_MV2, 2},
1067               {65536, 131072, &MPIR_Shmem_Bcast_MV2, -1},
1068               {131072, 262144, &MPIR_Shmem_Bcast_MV2, -1},
1069               {262144, 524288, &MPIR_Shmem_Bcast_MV2, -1},
1070               {524288, -1, &MPIR_Shmem_Bcast_MV2, -1}
1071             }
1072     },
1073     {
1074             32,
1075             8192, 4, 4,
1076             {1, 1, 1, 1, 1, 1, 1, 1},
1077             8,
1078             {
1079               {0, 128, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1080               {128, 256, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1081               {256, 32768, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1082               {32768, 65536, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1083               {65536, 131072, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1084               {131072, 262144, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1085               {262144, 524288, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1086               {524288, -1, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8}
1087             },
1088             8,
1089             {
1090               {0, 128, &MPIR_Shmem_Bcast_MV2, 2},
1091               {128, 256, &MPIR_Shmem_Bcast_MV2, 4},
1092               {256, 32768, &MPIR_Shmem_Bcast_MV2, 2},
1093               {32768, 65536, &MPIR_Shmem_Bcast_MV2, 4},
1094               {65536, 131072, &MPIR_Shmem_Bcast_MV2, 2},
1095               {131072, 262144, &MPIR_Shmem_Bcast_MV2, 8},
1096               {262144, 524288, &MPIR_Shmem_Bcast_MV2, 2},
1097               {524288, -1, &MPIR_Shmem_Bcast_MV2, 8}
1098             }
1099     },
1100     {
1101             64,
1102             8192, 4, 4,
1103             {1, 1, 1, 1, 1, 1, 1, 1, 1},
1104             9,
1105             {
1106               {0, 2, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1107               {2, 4, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1108               {4, 16, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1109               {16, 32, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1110               {32, 128, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1111               {128, 256, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1112               {256, 4096, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1113               {4096, 32768, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1114               {32768, -1, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2}
1115             },
1116             9,
1117             {
1118               {0, 2, &MPIR_Shmem_Bcast_MV2, 4},
1119               {2, 4, &MPIR_Shmem_Bcast_MV2, 8},
1120               {4, 16, &MPIR_Shmem_Bcast_MV2, 4},
1121               {16, 32, &MPIR_Shmem_Bcast_MV2, 8},
1122               {32, 128, &MPIR_Shmem_Bcast_MV2, 4},
1123               {128, 256, &MPIR_Shmem_Bcast_MV2, 8},
1124               {256, 4096, &MPIR_Shmem_Bcast_MV2, 4},
1125               {4096, 32768, &MPIR_Shmem_Bcast_MV2, 8},
1126               {32768, -1, &MPIR_Shmem_Bcast_MV2, 2}
1127             }
1128     },
1129     {
1130             128,
1131             8192, 4, 4,
1132             {1, 1, 1, 0},
1133             4,
1134             {
1135               {0, 8192, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1136               {8192, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1137               {16384, 524288, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1138               {524288, -1, &MPIR_Bcast_scatter_ring_allgather_MV2, -1}
1139             },
1140             4,
1141             {
1142               {0, 8192, &MPIR_Shmem_Bcast_MV2, 8},
1143               {8192, 16384, &MPIR_Shmem_Bcast_MV2, 4},
1144               {16384, 524288, &MPIR_Shmem_Bcast_MV2, 2},
1145               {524288, -1, NULL, -1}
1146             }
1147     },
1148     {
1149             256,
1150             8192, 4, 4,
1151             {1, 1, 1, 1, 1},
1152             5,
1153             {
1154               {0, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1155               {16384, 131072, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1156               {131072, 262144, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1},
1157               {262144, 524288, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1158               {524288, -1, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1}
1159             },
1160             5,
1161             {
1162               {0, 16384, &MPIR_Shmem_Bcast_MV2, 4},
1163               {16384, 131072, &MPIR_Shmem_Bcast_MV2, 2},
1164               {131072, 262144, &MPIR_Shmem_Bcast_MV2, -1},
1165               {262144, 524288, &MPIR_Shmem_Bcast_MV2, 2},
1166               {524288, -1, &MPIR_Shmem_Bcast_MV2, -1}
1167             }
1168     },
1169     {
1170             512,
1171             8192, 4, 4,
1172             {1, 1, 1, 1, 1},
1173             5,
1174             {
1175               {0, 4096, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1176               {4096, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1177               {16384, 131072, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1178               {131072, 262144, &MPIR_Pipelined_Bcast_MV2, -1},
1179               {262144, -1, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1}
1180             },
1181             5,
1182             {
1183               {0, 4096, &MPIR_Shmem_Bcast_MV2, 8},
1184               {4096, 16384, &MPIR_Shmem_Bcast_MV2, 4},
1185               {16384, 131072, &MPIR_Shmem_Bcast_MV2, 2},
1186               {131072, 262144, &MPIR_Shmem_Bcast_MV2, -1},
1187               {262144, -1, &MPIR_Shmem_Bcast_MV2, -1}
1188             }
1189     },
1190     {
1191             1024,
1192             8192, 4, 4,
1193             {1, 1, 1, 1, 1},
1194             5,
1195             {
1196               {0, 8192, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1197               {8192, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1198               {16384, 65536, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1199               {65536, 524288, &MPIR_Pipelined_Bcast_MV2, -1},
1200               {524288, -1, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1}
1201             },
1202             5,
1203             {
1204               {0, 8192, &MPIR_Shmem_Bcast_MV2, 8},
1205               {8192, 16384, &MPIR_Shmem_Bcast_MV2, 4},
1206               {16384, 65536, &MPIR_Shmem_Bcast_MV2, 2},
1207               {65536, 524288, &MPIR_Shmem_Bcast_MV2, -1},
1208               {524288, -1, &MPIR_Shmem_Bcast_MV2, -1}
1209             }
1210     },
1211     {
1212             2048,
1213             8192, 4, 4,
1214             {1, 1, 1, 1, 1, 1, 1},
1215             7,
1216             {
1217               {0, 16, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1218               {16, 32, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1219               {32, 4096, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1220               {4096, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1221               {16384, 32768, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1222               {32768, 524288, &MPIR_Pipelined_Bcast_MV2, -1},
1223               {524288, -1, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1}
1224             },
1225             7,
1226             {
1227               {0, 16, &MPIR_Shmem_Bcast_MV2, 8},
1228               {16, 32, &MPIR_Shmem_Bcast_MV2, 4},
1229               {32, 4096, &MPIR_Shmem_Bcast_MV2, 8},
1230               {4096, 16384, &MPIR_Shmem_Bcast_MV2, 4},
1231               {16384, 32768, &MPIR_Shmem_Bcast_MV2, 2},
1232               {32768, 524288, &MPIR_Shmem_Bcast_MV2, -1},
1233               {524288, -1, &MPIR_Shmem_Bcast_MV2, -1}
1234             }
1235     }
1236   };
1237
1238         memcpy(mv2_bcast_thresholds_table, mv2_tmp_bcast_thresholds_table,
1239                     mv2_size_bcast_tuning_table * sizeof (mv2_bcast_tuning_table));
1240 }
1241
1242
1243 /************ Reduce variables and initializers                        */
1244
1245 typedef struct {
1246   int min;
1247   int max;
1248   int (*MV2_pt_Reduce_function)(void *sendbuf,
1249       void *recvbuf,
1250       int count,
1251       MPI_Datatype datatype,
1252       MPI_Op op,
1253       int root,
1254       MPI_Comm  comm_ptr);
1255 } mv2_reduce_tuning_element;
1256
1257 typedef struct {
1258   int numproc;
1259   int inter_k_degree;
1260   int intra_k_degree;
1261   int is_two_level_reduce[MV2_MAX_NB_THRESHOLDS];
1262   int size_inter_table;
1263   mv2_reduce_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
1264   int size_intra_table;
1265   mv2_reduce_tuning_element intra_node[MV2_MAX_NB_THRESHOLDS];
1266 } mv2_reduce_tuning_table;
1267
1268 int mv2_size_reduce_tuning_table = 0;
1269 mv2_reduce_tuning_table *mv2_reduce_thresholds_table = NULL;
1270
1271
1272 int mv2_reduce_intra_knomial_factor = -1;
1273 int mv2_reduce_inter_knomial_factor = -1;
1274
1275 int (*MV2_Reduce_function)( void *sendbuf,
1276     void *recvbuf,
1277     int count,
1278     MPI_Datatype datatype,
1279     MPI_Op op,
1280     int root,
1281     MPI_Comm  comm_ptr)=NULL;
1282
1283 int (*MV2_Reduce_intra_function)( void *sendbuf,
1284     void *recvbuf,
1285     int count,
1286     MPI_Datatype datatype,
1287     MPI_Op op,
1288     int root,
1289     MPI_Comm  comm_ptr)=NULL;
1290
1291
/*
 * MVAPICH2 reduce algorithm names used by the tuning tables, mapped onto the
 * smpi_coll_tuned_* implementations. The inter- and intra-node k-nomial
 * wrappers resolve to the same function.
 */
#define MPIR_Reduce_inter_knomial_wrapper_MV2  smpi_coll_tuned_reduce_mvapich2_knomial
#define MPIR_Reduce_intra_knomial_wrapper_MV2  smpi_coll_tuned_reduce_mvapich2_knomial
#define MPIR_Reduce_binomial_MV2               smpi_coll_tuned_reduce_binomial
#define MPIR_Reduce_redscat_gather_MV2         smpi_coll_tuned_reduce_scatter_gather
#define MPIR_Reduce_shmem_MV2                  smpi_coll_tuned_reduce_ompi_basic_linear
#define MPIR_Reduce_two_level_helper_MV2       smpi_coll_tuned_reduce_mvapich2_two_level
1298
1299
1300 static void init_mv2_reduce_tables_stampede(){
1301   if(smpi_coll_cleanup_callback==NULL)
1302     smpi_coll_cleanup_callback=&smpi_coll_cleanup_mvapich2;
1303   /*Stampede*/
1304   mv2_size_reduce_tuning_table = 8;
1305   mv2_reduce_thresholds_table = xbt_malloc(mv2_size_reduce_tuning_table *
1306       sizeof (mv2_reduce_tuning_table));
1307   mv2_reduce_tuning_table mv2_tmp_reduce_thresholds_table[] = {
1308       {
1309           16,
1310           4,
1311           4,
1312           {1, 0, 0},
1313           3,
1314           {
1315               {0, 262144, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1316               {262144, 1048576, &MPIR_Reduce_binomial_MV2},
1317               {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
1318           },
1319           2,
1320           {
1321               {0, 65536, &MPIR_Reduce_shmem_MV2},
1322               {65536,-1,  &MPIR_Reduce_binomial_MV2},
1323           },
1324       },
1325       {
1326           32,
1327           4,
1328           4,
1329           {1, 1, 1, 1, 0, 0, 0},
1330           7,
1331           {
1332               {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1333               {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1334               {16384, 32768, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1335               {32768, 65536, &MPIR_Reduce_binomial_MV2},
1336               {65536, 262144, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1337               {262144, 1048576, &MPIR_Reduce_binomial_MV2},
1338               {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
1339           },
1340           6,
1341           {
1342               {0, 8192, &MPIR_Reduce_shmem_MV2},
1343               {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1344               {16384, 32768, &MPIR_Reduce_shmem_MV2},
1345               {32768, 65536, &MPIR_Reduce_shmem_MV2},
1346               {65536, 262144, &MPIR_Reduce_shmem_MV2},
1347               {262144,-1,  &MPIR_Reduce_binomial_MV2},
1348           },
1349       },
1350       {
1351           64,
1352           4,
1353           4,
1354           {1, 1, 1, 1, 0},
1355           5,
1356           {
1357               {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1358               {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1359               {16384, 65536, &MPIR_Reduce_binomial_MV2},
1360               {65536, 262144, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1361               {262144, -1, &MPIR_Reduce_redscat_gather_MV2},
1362           },
1363           5,
1364           {
1365               {0, 8192, &MPIR_Reduce_shmem_MV2},
1366               {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1367               {16384, 65536, &MPIR_Reduce_shmem_MV2},
1368               {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1369               {262144, -1, &MPIR_Reduce_binomial_MV2},
1370           },
1371       },
1372       {
1373           128,
1374           4,
1375           4,
1376           {1, 0, 1, 0, 1, 0},
1377           6,
1378           {
1379               {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1380               {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1381               {16384, 65536, &MPIR_Reduce_binomial_MV2},
1382               {65536, 262144, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1383               {262144, 1048576, &MPIR_Reduce_binomial_MV2},
1384               {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
1385           },
1386           5,
1387           {
1388               {0, 8192, &MPIR_Reduce_shmem_MV2},
1389               {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1390               {16384, 65536, &MPIR_Reduce_shmem_MV2},
1391               {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1392               {262144, -1, &MPIR_Reduce_binomial_MV2},
1393           },
1394       },
1395       {
1396           256,
1397           4,
1398           4,
1399           {1, 1, 1, 0, 1, 1, 0},
1400           7,
1401           {
1402               {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1403               {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1404               {16384, 32768, &MPIR_Reduce_binomial_MV2},
1405               {32768, 65536, &MPIR_Reduce_binomial_MV2},
1406               {65536, 262144, &MPIR_Reduce_binomial_MV2},
1407               {262144, 1048576, &MPIR_Reduce_binomial_MV2},
1408               {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
1409           },
1410           6,
1411           {
1412               {0, 8192, &MPIR_Reduce_shmem_MV2},
1413               {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1414               {16384, 32768, &MPIR_Reduce_shmem_MV2},
1415               {32768, 65536, &MPIR_Reduce_shmem_MV2},
1416               {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1417               {262144, -1, &MPIR_Reduce_binomial_MV2},
1418           },
1419       },
1420       {
1421           512,
1422           4,
1423           4,
1424           {1, 0, 1, 1, 1, 0},
1425           6,
1426           {
1427               {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1428               {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1429               {16384, 65536, &MPIR_Reduce_binomial_MV2},
1430               {65536, 262144, &MPIR_Reduce_binomial_MV2},
1431               {262144, 1048576, &MPIR_Reduce_binomial_MV2},
1432               {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
1433           },
1434           5,
1435           {
1436               {0, 8192, &MPIR_Reduce_shmem_MV2},
1437               {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1438               {16384, 65536, &MPIR_Reduce_shmem_MV2},
1439               {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1440               {262144, -1, &MPIR_Reduce_binomial_MV2},
1441           },
1442       },
1443       {
1444           1024,
1445           4,
1446           4,
1447           {1, 0, 1, 1, 1},
1448           5,
1449           {
1450               {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1451               {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1452               {16384, 65536, &MPIR_Reduce_binomial_MV2},
1453               {65536, 262144, &MPIR_Reduce_binomial_MV2},
1454               {262144, -1, &MPIR_Reduce_binomial_MV2},
1455           },
1456           5,
1457           {
1458               {0, 8192, &MPIR_Reduce_shmem_MV2},
1459               {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1460               {16384, 65536, &MPIR_Reduce_shmem_MV2},
1461               {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1462               {262144, -1, &MPIR_Reduce_binomial_MV2},
1463           },
1464       },
1465       {
1466           2048,
1467           4,
1468           4,
1469           {1, 0, 1, 1, 1,1},
1470           6,
1471           {
1472               {0, 2048, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1473               {2048, 4096, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1474               {4096, 16384, &MPIR_Reduce_binomial_MV2},
1475               {16384, 65536, &MPIR_Reduce_binomial_MV2},
1476               {65536, 131072, &MPIR_Reduce_binomial_MV2},
1477               {131072, -1, &MPIR_Reduce_binomial_MV2},
1478           },
1479           6,
1480           {
1481               {0, 2048, &MPIR_Reduce_shmem_MV2},
1482               {2048, 4096, &MPIR_Reduce_shmem_MV2},
1483               {4096, 16384, &MPIR_Reduce_shmem_MV2},
1484               {16384, 65536, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1485               {65536, 131072, &MPIR_Reduce_binomial_MV2},
1486               {131072, -1, &MPIR_Reduce_shmem_MV2},
1487           },
1488       },
1489
1490   };
1491   memcpy(mv2_reduce_thresholds_table, mv2_tmp_reduce_thresholds_table,
1492       mv2_size_reduce_tuning_table * sizeof (mv2_reduce_tuning_table));
1493 }
1494
1495 /************ Reduce scatter variables and initializers                        */
1496
1497 typedef struct {
1498   int min;
1499   int max;
1500   int (*MV2_pt_Red_scat_function)(void *sendbuf,
1501       void *recvbuf,
1502       int *recvcnts,
1503       MPI_Datatype datatype,
1504       MPI_Op op,
1505       MPI_Comm comm_ptr);
1506 } mv2_red_scat_tuning_element;
1507
1508 typedef struct {
1509   int numproc;
1510   int size_inter_table;
1511   mv2_red_scat_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
1512 } mv2_red_scat_tuning_table;
1513
1514 int mv2_size_red_scat_tuning_table = 0;
1515 mv2_red_scat_tuning_table *mv2_red_scat_thresholds_table = NULL;
1516
1517
1518 int (*MV2_Red_scat_function)(void *sendbuf,
1519     void *recvbuf,
1520     int *recvcnts,
1521     MPI_Datatype datatype,
1522     MPI_Op op,
1523     MPI_Comm comm_ptr);
1524
1525
1526
1527 static  int MPIR_Reduce_Scatter_Basic_MV2(void *sendbuf,
1528     void *recvbuf,
1529     int *recvcnts,
1530     MPI_Datatype datatype,
1531     MPI_Op op,
1532     MPI_Comm comm)
1533 {
1534   smpi_mpi_reduce_scatter(sendbuf,recvbuf,recvcnts,datatype,op,comm);
1535   return MPI_SUCCESS;
1536 }
/*
 * MVAPICH2 reduce-scatter algorithm names mapped onto the smpi_coll_tuned_*
 * implementations.
 */
#define MPIR_Reduce_scatter_non_comm_MV2     smpi_coll_tuned_reduce_scatter_mpich_noncomm
#define MPIR_Reduce_scatter_Rec_Halving_MV2  smpi_coll_tuned_reduce_scatter_ompi_basic_recursivehalving
#define MPIR_Reduce_scatter_Pair_Wise_MV2    smpi_coll_tuned_reduce_scatter_mpich_pair
1540
1541
1542
1543
1544 static void init_mv2_reduce_scatter_tables_stampede(){
1545   if(smpi_coll_cleanup_callback==NULL)
1546     smpi_coll_cleanup_callback=&smpi_coll_cleanup_mvapich2;
1547   mv2_size_red_scat_tuning_table = 6;
1548   mv2_red_scat_thresholds_table = xbt_malloc(mv2_size_red_scat_tuning_table *
1549       sizeof (mv2_red_scat_tuning_table));
1550   mv2_red_scat_tuning_table mv2_tmp_red_scat_thresholds_table[] = {
1551       {
1552           16,
1553           3,
1554           {
1555               {0, 64, &MPIR_Reduce_Scatter_Basic_MV2},
1556               {64, 65536, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1557               {65536, -1, &MPIR_Reduce_scatter_Pair_Wise_MV2},
1558           },
1559       },
1560       {
1561           32,
1562           3,
1563           {
1564               {0, 64, &MPIR_Reduce_Scatter_Basic_MV2},
1565               {64, 131072, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1566               {131072, -1, &MPIR_Reduce_scatter_Pair_Wise_MV2},
1567           },
1568       },
1569       {
1570           64,
1571           3,
1572           {
1573               {0, 1024, &MPIR_Reduce_Scatter_Basic_MV2},
1574               {1024, 262144, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1575               {262144, -1, &MPIR_Reduce_scatter_Pair_Wise_MV2},
1576           },
1577       },
1578       {
1579           128,
1580           2,
1581           {
1582               {0, 128, &MPIR_Reduce_Scatter_Basic_MV2},
1583               {128, -1, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1584           },
1585       },
1586       {
1587           256,
1588           2,
1589           {
1590               {0, 128, &MPIR_Reduce_Scatter_Basic_MV2},
1591               {128, -1, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1592           },
1593       },
1594       {
1595           512,
1596           2,
1597           {
1598               {0, 256, &MPIR_Reduce_Scatter_Basic_MV2},
1599               {256, -1, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1600           },
1601       },
1602
1603   };
1604   memcpy(mv2_red_scat_thresholds_table, mv2_tmp_red_scat_thresholds_table,
1605       mv2_size_red_scat_tuning_table * sizeof (mv2_red_scat_tuning_table));
1606 }
1607
1608 /************ Scatter variables and initializers                        */
1609
1610 typedef struct {
1611   int min;
1612   int max;
1613   int (*MV2_pt_Scatter_function)(void *sendbuf,
1614       int sendcnt,
1615       MPI_Datatype sendtype,
1616       void *recvbuf,
1617       int recvcnt,
1618       MPI_Datatype recvtype,
1619       int root, MPI_Comm comm);
1620 } mv2_scatter_tuning_element;
1621
1622 typedef struct {
1623   int numproc;
1624   int size_inter_table;
1625   mv2_scatter_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
1626   int size_intra_table;
1627   mv2_scatter_tuning_element intra_node[MV2_MAX_NB_THRESHOLDS];
1628 } mv2_scatter_tuning_table;
1629
1630
1631 int *mv2_scatter_table_ppn_conf = NULL;
1632 int mv2_scatter_num_ppn_conf = 1;
1633 int *mv2_size_scatter_tuning_table = NULL;
1634 mv2_scatter_tuning_table **mv2_scatter_thresholds_table = NULL;
1635
1636 int (*MV2_Scatter_function) (void *sendbuf, int sendcount, MPI_Datatype sendtype,
1637     void *recvbuf, int recvcount, MPI_Datatype recvtype,
1638     int root, MPI_Comm comm)=NULL;
1639
1640 int (*MV2_Scatter_intra_function) (void *sendbuf, int sendcount, MPI_Datatype sendtype,
1641     void *recvbuf, int recvcount, MPI_Datatype recvtype,
1642     int root, MPI_Comm comm)=NULL;
1643 int MPIR_Scatter_mcst_wrap_MV2(void *sendbuf,
1644     int sendcnt,
1645     MPI_Datatype sendtype,
1646     void *recvbuf,
1647     int recvcnt,
1648     MPI_Datatype recvtype,
1649     int root, MPI_Comm comm_ptr);
1650
1651 int MPIR_Scatter_mcst_wrap_MV2(void *sendbuf,
1652     int sendcnt,
1653     MPI_Datatype sendtype,
1654     void *recvbuf,
1655     int recvcnt,
1656     MPI_Datatype recvtype,
1657     int root, MPI_Comm comm_ptr)
1658 {
1659   return 0;
1660 }
1661
/*
 * MVAPICH2 scatter algorithm names used by the tuning tables, mapped onto the
 * smpi_coll_tuned_* implementations.
 */
#define MPIR_Scatter_MV2_Binomial            smpi_coll_tuned_scatter_ompi_binomial
#define MPIR_Scatter_MV2_Direct              smpi_coll_tuned_scatter_ompi_basic_linear
#define MPIR_Scatter_MV2_two_level_Binomial  smpi_coll_tuned_scatter_mvapich2_two_level_binomial
#define MPIR_Scatter_MV2_two_level_Direct    smpi_coll_tuned_scatter_mvapich2_two_level_direct
1666
1667
1668
1669
1670 static void init_mv2_scatter_tables_stampede(){
1671     if(smpi_coll_cleanup_callback==NULL)
1672       smpi_coll_cleanup_callback=&smpi_coll_cleanup_mvapich2;
1673
1674     int agg_table_sum = 0;
1675     int i;
1676     mv2_scatter_tuning_table **table_ptrs = NULL;
1677     mv2_scatter_num_ppn_conf = 3;
1678     mv2_scatter_thresholds_table
1679     = xbt_malloc(sizeof(mv2_scatter_tuning_table *)
1680         * mv2_scatter_num_ppn_conf);
1681     table_ptrs = xbt_malloc(sizeof(mv2_scatter_tuning_table *)
1682         * mv2_scatter_num_ppn_conf);
1683     mv2_size_scatter_tuning_table = xbt_malloc(sizeof(int) *
1684         mv2_scatter_num_ppn_conf);
1685     mv2_scatter_table_ppn_conf
1686     = xbt_malloc(mv2_scatter_num_ppn_conf * sizeof(int));
1687     mv2_scatter_table_ppn_conf[0] = 1;
1688     mv2_size_scatter_tuning_table[0] = 6;
1689     mv2_scatter_tuning_table mv2_tmp_scatter_thresholds_table_1ppn[] = {
1690         {2,
1691             1,
1692             {
1693                 {0, -1, &MPIR_Scatter_MV2_Binomial},
1694             },
1695             1,
1696             {
1697                 {0, -1, &MPIR_Scatter_MV2_Binomial},
1698             },
1699         },
1700
1701         {4,
1702             1,
1703             {
1704                 {0, -1, &MPIR_Scatter_MV2_Direct},
1705             },
1706             1,
1707             {
1708                 {0, -1, &MPIR_Scatter_MV2_Direct},
1709             },
1710         },
1711
1712         {8,
1713             1,
1714             {
1715                 {0, -1, &MPIR_Scatter_MV2_Direct},
1716             },
1717             1,
1718             {
1719                 {0, -1, &MPIR_Scatter_MV2_Direct},
1720             },
1721         },
1722
1723         {16,
1724             1,
1725             {
1726                 {0, -1, &MPIR_Scatter_MV2_Direct},
1727             },
1728             1,
1729             {
1730                 {0, -1, &MPIR_Scatter_MV2_Direct},
1731             },
1732         },
1733
1734         {32,
1735             1,
1736             {
1737                 {0, -1, &MPIR_Scatter_MV2_Direct},
1738             },
1739             1,
1740             {
1741                 {0, -1, &MPIR_Scatter_MV2_Direct},
1742             },
1743         },
1744
1745         {64,
1746             2,
1747             {
1748                 {0, 32, &MPIR_Scatter_MV2_Binomial},
1749                 {32, -1, &MPIR_Scatter_MV2_Direct},
1750             },
1751             1,
1752             {
1753                 {0, -1, &MPIR_Scatter_MV2_Binomial},
1754             },
1755         },
1756     };
1757     table_ptrs[0] = mv2_tmp_scatter_thresholds_table_1ppn;
1758     mv2_scatter_table_ppn_conf[1] = 2;
1759     mv2_size_scatter_tuning_table[1] = 6;
1760     mv2_scatter_tuning_table mv2_tmp_scatter_thresholds_table_2ppn[] = {
1761         {4,
1762             2,
1763             {
1764                 {0, 4096, &MPIR_Scatter_MV2_Binomial},
1765                 {4096, -1, &MPIR_Scatter_MV2_Direct},
1766             },
1767             1,
1768             {
1769                 {0, -1, &MPIR_Scatter_MV2_Direct},
1770             },
1771         },
1772
1773         {8,
1774             2,
1775             {
1776                 {0, 512, &MPIR_Scatter_MV2_two_level_Direct},
1777                 {512, -1, &MPIR_Scatter_MV2_Direct},
1778             },
1779             1,
1780             {
1781                 {0, -1, &MPIR_Scatter_MV2_Binomial},
1782             },
1783         },
1784
1785         {16,
1786             2,
1787             {
1788                 {0, 2048, &MPIR_Scatter_MV2_two_level_Direct},
1789                 {2048, -1, &MPIR_Scatter_MV2_Direct},
1790             },
1791             1,
1792             {
1793                 {0, -1, &MPIR_Scatter_MV2_Binomial},
1794             },
1795         },
1796
1797         {32,
1798             2,
1799             {
1800                 {0, 2048, &MPIR_Scatter_MV2_two_level_Direct},
1801                 {2048, -1, &MPIR_Scatter_MV2_Direct},
1802             },
1803             1,
1804             {
1805                 {0, -1, &MPIR_Scatter_MV2_Binomial},
1806             },
1807         },
1808
1809         {64,
1810             2,
1811             {
1812                 {0, 8192, &MPIR_Scatter_MV2_two_level_Direct},
1813                 {8192, -1, &MPIR_Scatter_MV2_Direct},
1814             },
1815             1,
1816             {
1817                 {0, -1, &MPIR_Scatter_MV2_Binomial},
1818             },
1819         },
1820
1821         {128,
1822             4,
1823             {
1824                 {0, 16, &MPIR_Scatter_MV2_Binomial},
1825                 {16, 128, &MPIR_Scatter_MV2_two_level_Binomial},
1826                 {128, 16384, &MPIR_Scatter_MV2_two_level_Direct},
1827                 {16384, -1, &MPIR_Scatter_MV2_Direct},
1828             },
1829             1,
1830             {
1831                 {0, 128, &MPIR_Scatter_MV2_Direct},
1832                 {128, -1, &MPIR_Scatter_MV2_Binomial},
1833             },
1834         },
1835     };
1836     table_ptrs[1] = mv2_tmp_scatter_thresholds_table_2ppn;
1837     mv2_scatter_table_ppn_conf[2] = 16;
1838     mv2_size_scatter_tuning_table[2] = 8;
1839     mv2_scatter_tuning_table mv2_tmp_scatter_thresholds_table_16ppn[] = {
1840         {
1841             16,
1842             2,
1843             {
1844                 {0, 256, &MPIR_Scatter_MV2_Binomial},
1845                 {256, -1, &MPIR_Scatter_MV2_Direct},
1846             },
1847             1,
1848             {
1849                 { 0, -1, &MPIR_Scatter_MV2_Direct},
1850             },
1851         },
1852
1853         {
1854             32,
1855             2,
1856             {
1857                 {0, 512, &MPIR_Scatter_MV2_Binomial},
1858                 {512, -1, &MPIR_Scatter_MV2_Direct},
1859             },
1860             1,
1861             {
1862                 { 0, -1, &MPIR_Scatter_MV2_Direct},
1863             },
1864         },
1865
1866         {
1867             64,
1868             2,
1869             {
1870                 {0, 1024, &MPIR_Scatter_MV2_two_level_Direct},
1871                 {1024, -1, &MPIR_Scatter_MV2_Direct},
1872             },
1873             1,
1874             {
1875                 { 0, -1, &MPIR_Scatter_MV2_Direct},
1876             },
1877         },
1878
1879         {
1880             128,
1881             4,
1882             {
1883                 {0, 16, &MPIR_Scatter_mcst_wrap_MV2},
1884                 {0, 16, &MPIR_Scatter_MV2_two_level_Direct},
1885                 {16, 2048, &MPIR_Scatter_MV2_two_level_Direct},
1886                 {2048, -1, &MPIR_Scatter_MV2_Direct},
1887             },
1888             1,
1889             {
1890                 { 0, -1, &MPIR_Scatter_MV2_Direct},
1891             },
1892         },
1893
1894         {
1895             256,
1896             4,
1897             {
1898                 {0, 16, &MPIR_Scatter_mcst_wrap_MV2},
1899                 {0, 16, &MPIR_Scatter_MV2_two_level_Direct},
1900                 {16, 2048, &MPIR_Scatter_MV2_two_level_Direct},
1901                 {2048, -1,  &MPIR_Scatter_MV2_Direct},
1902             },
1903             1,
1904             {
1905                 { 0, -1, &MPIR_Scatter_MV2_Direct},
1906             },
1907         },
1908
1909         {
1910             512,
1911             4,
1912             {
1913                 {0, 16, &MPIR_Scatter_mcst_wrap_MV2},
1914                 {16, 16, &MPIR_Scatter_MV2_two_level_Direct},
1915                 {16, 4096, &MPIR_Scatter_MV2_two_level_Direct},
1916                 {4096, -1, &MPIR_Scatter_MV2_Direct},
1917             },
1918             1,
1919             {
1920                 { 0, -1, &MPIR_Scatter_MV2_Binomial},
1921             },
1922         },
1923         {
1924             1024,
1925             5,
1926             {
1927                 {0, 16, &MPIR_Scatter_mcst_wrap_MV2},
1928                 {0, 16,  &MPIR_Scatter_MV2_Binomial},
1929                 {16, 32, &MPIR_Scatter_MV2_Binomial},
1930                 {32, 4096, &MPIR_Scatter_MV2_two_level_Direct},
1931                 {4096, -1, &MPIR_Scatter_MV2_Direct},
1932             },
1933             1,
1934             {
1935                 { 0, -1, &MPIR_Scatter_MV2_Binomial},
1936             },
1937         },
1938         {
1939             2048,
1940             7,
1941             {
1942                 {0, 16, &MPIR_Scatter_mcst_wrap_MV2},
1943                 {0, 16,  &MPIR_Scatter_MV2_two_level_Binomial},
1944                 {16, 128, &MPIR_Scatter_MV2_two_level_Binomial},
1945                 {128, 1024, &MPIR_Scatter_MV2_two_level_Direct},
1946                 {1024, 16384, &MPIR_Scatter_MV2_two_level_Direct},
1947                 {16384, 65536, &MPIR_Scatter_MV2_Direct},
1948                 {65536, -1, &MPIR_Scatter_MV2_two_level_Direct},
1949             },
1950             6,
1951             {
1952                 {0, 16, &MPIR_Scatter_MV2_Binomial},
1953                 {16, 128, &MPIR_Scatter_MV2_Binomial},
1954                 {128, 1024, &MPIR_Scatter_MV2_Binomial},
1955                 {1024, 16384, &MPIR_Scatter_MV2_Direct},
1956                 {16384, 65536, &MPIR_Scatter_MV2_Direct},
1957                 {65536, -1, &MPIR_Scatter_MV2_Direct},
1958             },
1959         },
1960     };
1961     table_ptrs[2] = mv2_tmp_scatter_thresholds_table_16ppn;
1962     agg_table_sum = 0;
1963     for (i = 0; i < mv2_scatter_num_ppn_conf; i++) {
1964         agg_table_sum += mv2_size_scatter_tuning_table[i];
1965     }
1966     mv2_scatter_thresholds_table[0] =
1967         xbt_malloc(agg_table_sum * sizeof (mv2_scatter_tuning_table));
1968     memcpy(mv2_scatter_thresholds_table[0], table_ptrs[0],
1969         (sizeof(mv2_scatter_tuning_table)
1970             * mv2_size_scatter_tuning_table[0]));
1971     for (i = 1; i < mv2_scatter_num_ppn_conf; i++) {
1972         mv2_scatter_thresholds_table[i] =
1973             mv2_scatter_thresholds_table[i - 1]
1974                                          + mv2_size_scatter_tuning_table[i - 1];
1975         memcpy(mv2_scatter_thresholds_table[i], table_ptrs[i],
1976             (sizeof(mv2_scatter_tuning_table)
1977                 * mv2_size_scatter_tuning_table[i]));
1978     }
1979     xbt_free(table_ptrs);
1980   
1981 }
1982