Logo AND Algorithmique Numérique Distribuée

Public GIT Repository
bf44fb75151925b71fd3af5fa84ef107431641c1
[simgrid.git] / src / smpi / colls / smpi_mvapich2_selector_stampede.h
1 /* selector for collective algorithms based on mvapich decision logic, with calibration from Stampede cluster at TACC*/
2
3 /* Copyright (c) 2009-2010, 2013-2014. The SimGrid Team.
4  * All rights reserved.                                                     */
5
6 /* This is the tuning used by MVAPICH for Stampede platform based on (MV2_ARCH_INTEL_XEON_E5_2680_16, MV2_HCA_MLX_CX_FDR) */
7
8
9
10 /************ Alltoall variables and initializers                        */
11
12 #define MV2_MAX_NB_THRESHOLDS  32
13
14
15 typedef struct {
16   int min;
17   int max;
18   int (*MV2_pt_Alltoall_function) (void *sendbuf, int sendcount, MPI_Datatype sendtype,
19       void *recvbuf, int recvcount, MPI_Datatype recvtype,
20       MPI_Comm comm_ptr );
21 } mv2_alltoall_tuning_element;
22
23 typedef struct {
24   int numproc;
25   int size_table;
26   mv2_alltoall_tuning_element algo_table[MV2_MAX_NB_THRESHOLDS];
27   mv2_alltoall_tuning_element in_place_algo_table[MV2_MAX_NB_THRESHOLDS];
28 } mv2_alltoall_tuning_table;
29
30 int (*MV2_Alltoall_function) (void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype, MPI_Comm comm_ptr)=NULL;
31
32 /* Indicates number of processes per node */
33 int *mv2_alltoall_table_ppn_conf = NULL;
34 /* Indicates total number of configurations */
35 int mv2_alltoall_num_ppn_conf = 1;
36 int *mv2_size_alltoall_tuning_table = NULL;
37 mv2_alltoall_tuning_table **mv2_alltoall_thresholds_table = NULL;
38
39
40 #define MPIR_Alltoall_bruck_MV2 smpi_coll_tuned_alltoall_bruck
41 #define MPIR_Alltoall_RD_MV2 smpi_coll_tuned_alltoall_rdb
42 #define MPIR_Alltoall_Scatter_dest_MV2 smpi_coll_tuned_alltoall_mvapich2_scatter_dest
43 #define MPIR_Alltoall_pairwise_MV2 smpi_coll_tuned_alltoall_pair
44 #define MPIR_Alltoall_inplace_MV2 smpi_coll_tuned_alltoall_ring 
45
46
47 static void init_mv2_alltoall_tables_stampede(){
48   int i;
49   int agg_table_sum = 0;
50   mv2_alltoall_tuning_table **table_ptrs = NULL;
51   mv2_alltoall_num_ppn_conf = 3;
52   mv2_alltoall_thresholds_table = xbt_malloc(sizeof(mv2_alltoall_tuning_table *)
53       * mv2_alltoall_num_ppn_conf);
54   table_ptrs = xbt_malloc(sizeof(mv2_alltoall_tuning_table *)
55       * mv2_alltoall_num_ppn_conf);
56   mv2_size_alltoall_tuning_table = xbt_malloc(sizeof(int) *
57       mv2_alltoall_num_ppn_conf);
58   mv2_alltoall_table_ppn_conf = xbt_malloc(mv2_alltoall_num_ppn_conf * sizeof(int));
59   mv2_alltoall_table_ppn_conf[0] = 1;
60   mv2_size_alltoall_tuning_table[0] = 6;
61   mv2_alltoall_tuning_table mv2_tmp_alltoall_thresholds_table_1ppn[] = {
62       {2,
63           1,
64           {{0, -1, &MPIR_Alltoall_pairwise_MV2},
65           },
66
67           {{0, -1, &MPIR_Alltoall_inplace_MV2},
68           },
69       },
70
71       {4,
72           2,
73           {{0, 262144, &MPIR_Alltoall_Scatter_dest_MV2},
74               {262144, -1, &MPIR_Alltoall_pairwise_MV2},
75           },
76
77           {{0, -1, &MPIR_Alltoall_inplace_MV2},
78           },
79       },
80
81       {8,
82           2,
83           {{0, 8, &MPIR_Alltoall_RD_MV2},
84               {8, -1, &MPIR_Alltoall_Scatter_dest_MV2},
85           },
86
87           {{0, -1, &MPIR_Alltoall_inplace_MV2},
88           },
89       },
90
91       {16,
92           3,
93           {{0, 64, &MPIR_Alltoall_RD_MV2},
94               {64, 512, &MPIR_Alltoall_bruck_MV2},
95               {512, -1, &MPIR_Alltoall_Scatter_dest_MV2},
96           },
97
98           {{0,-1, &MPIR_Alltoall_inplace_MV2},
99           },
100       },
101
102       {32,
103           3,
104           {{0, 32, &MPIR_Alltoall_RD_MV2},
105               {32, 2048, &MPIR_Alltoall_bruck_MV2},
106               {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
107           },
108
109           {{0, -1, &MPIR_Alltoall_inplace_MV2},
110           },
111       },
112
113       {64,
114           3,
115           {{0, 8, &MPIR_Alltoall_RD_MV2},
116               {8, 1024, &MPIR_Alltoall_bruck_MV2},
117               {1024, -1, &MPIR_Alltoall_Scatter_dest_MV2},
118           },
119
120           {{0, -1, &MPIR_Alltoall_inplace_MV2},
121           },
122       },
123   };
124   table_ptrs[0] = mv2_tmp_alltoall_thresholds_table_1ppn;
125   mv2_alltoall_table_ppn_conf[1] = 2;
126   mv2_size_alltoall_tuning_table[1] = 6;
127   mv2_alltoall_tuning_table mv2_tmp_alltoall_thresholds_table_2ppn[] = {
128       {4,
129           2,
130           {{0, 32, &MPIR_Alltoall_RD_MV2},
131               {32, -1, &MPIR_Alltoall_Scatter_dest_MV2},
132           },
133
134           {{0, -1, &MPIR_Alltoall_inplace_MV2},
135           },
136       },
137
138       {8,
139           2,
140           {{0, 64, &MPIR_Alltoall_RD_MV2},
141               {64, -1, &MPIR_Alltoall_Scatter_dest_MV2},
142           },
143
144           {{0, -1, &MPIR_Alltoall_inplace_MV2},
145           },
146       },
147
148       {16,
149           3,
150           {{0, 64, &MPIR_Alltoall_RD_MV2},
151               {64, 2048, &MPIR_Alltoall_bruck_MV2},
152               {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
153           },
154
155           {{0,-1, &MPIR_Alltoall_inplace_MV2},
156           },
157       },
158
159       {32,
160           3,
161           {{0, 16, &MPIR_Alltoall_RD_MV2},
162               {16, 2048, &MPIR_Alltoall_bruck_MV2},
163               {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
164           },
165
166           {{0, -1, &MPIR_Alltoall_inplace_MV2},
167           },
168       },
169
170       {64,
171           3,
172           {{0, 8, &MPIR_Alltoall_RD_MV2},
173               {8, 1024, &MPIR_Alltoall_bruck_MV2},
174               {1024, -1, &MPIR_Alltoall_Scatter_dest_MV2},
175           },
176
177           {{0, -1, &MPIR_Alltoall_inplace_MV2},
178           },
179       },
180
181       {128,
182           3,
183           {{0, 4, &MPIR_Alltoall_RD_MV2},
184               {4, 2048, &MPIR_Alltoall_bruck_MV2},
185               {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
186           },
187
188           {{0, -1, &MPIR_Alltoall_inplace_MV2},
189           },
190       },
191   };
192   table_ptrs[1] = mv2_tmp_alltoall_thresholds_table_2ppn;
193   mv2_alltoall_table_ppn_conf[2] = 16;
194   mv2_size_alltoall_tuning_table[2] = 7;
195   mv2_alltoall_tuning_table mv2_tmp_alltoall_thresholds_table_16ppn[] = {
196       {16,
197           2,
198           {{0, 2048, &MPIR_Alltoall_bruck_MV2},
199               {2048, -1,  &MPIR_Alltoall_Scatter_dest_MV2},
200           },
201
202           {{32768, -1, &MPIR_Alltoall_inplace_MV2},
203           },
204       },
205
206       {32,
207           2,
208           {{0, 2048, &MPIR_Alltoall_bruck_MV2},
209               {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
210           },
211
212           {{16384, -1, &MPIR_Alltoall_inplace_MV2},
213           },
214       },
215
216       {64,
217           3,
218           {{0, 2048, &MPIR_Alltoall_bruck_MV2},
219               {2048, 16384, &MPIR_Alltoall_Scatter_dest_MV2},
220               {16384, -1, &MPIR_Alltoall_pairwise_MV2},
221           },
222
223           {{32768, 131072, &MPIR_Alltoall_inplace_MV2},
224           },
225       },
226
227       {128,
228           2,
229           {{0, 2048, &MPIR_Alltoall_bruck_MV2},
230               {2048, -1, &MPIR_Alltoall_pairwise_MV2},
231           },
232
233           {{16384,65536, &MPIR_Alltoall_inplace_MV2},
234           },
235       },
236
237       {256,
238           2,
239           {{0, 1024, &MPIR_Alltoall_bruck_MV2},
240               {1024, -1, &MPIR_Alltoall_pairwise_MV2},
241           },
242
243           {{16384, 65536, &MPIR_Alltoall_inplace_MV2},
244           },
245       },
246
247       {512,
248           2,
249           {{0, 1024, &MPIR_Alltoall_bruck_MV2},
250               {1024, -1, &MPIR_Alltoall_pairwise_MV2},
251           },
252
253           {{16384, 65536, &MPIR_Alltoall_inplace_MV2},
254           },
255       },
256       {1024,
257           2,
258           {{0, 1024, &MPIR_Alltoall_bruck_MV2},
259               {1024, -1, &MPIR_Alltoall_pairwise_MV2},
260           },
261
262           {{16384, 65536, &MPIR_Alltoall_inplace_MV2},
263           },
264       },
265
266   };
267   table_ptrs[2] = mv2_tmp_alltoall_thresholds_table_16ppn;
268   agg_table_sum = 0;
269   for (i = 0; i < mv2_alltoall_num_ppn_conf; i++) {
270       agg_table_sum += mv2_size_alltoall_tuning_table[i];
271   }
272   mv2_alltoall_thresholds_table[0] =
273       xbt_malloc(agg_table_sum * sizeof (mv2_alltoall_tuning_table));
274   memcpy(mv2_alltoall_thresholds_table[0], table_ptrs[0],
275       (sizeof(mv2_alltoall_tuning_table)
276           * mv2_size_alltoall_tuning_table[0]));
277   for (i = 1; i < mv2_alltoall_num_ppn_conf; i++) {
278       mv2_alltoall_thresholds_table[i] =
279           mv2_alltoall_thresholds_table[i - 1]
280                                         + mv2_size_alltoall_tuning_table[i - 1];
281       memcpy(mv2_alltoall_thresholds_table[i], table_ptrs[i],
282           (sizeof(mv2_alltoall_tuning_table)
283               * mv2_size_alltoall_tuning_table[i]));
284   }
285   xbt_free(table_ptrs);
286
287
288 }
289
290
291 /************ Allgather variables and initializers                        */
292
293 typedef struct {
294   int min;
295   int max;
296   int (*MV2_pt_Allgather_function)(void *sendbuf,
297       int sendcount,
298       MPI_Datatype sendtype,
299       void *recvbuf,
300       int recvcount,
301       MPI_Datatype recvtype, MPI_Comm comm_ptr);
302 } mv2_allgather_tuning_element;
303
304 typedef struct {
305   int numproc;
306   int two_level[MV2_MAX_NB_THRESHOLDS];
307   int size_inter_table;
308   mv2_allgather_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
309 } mv2_allgather_tuning_table;
310
311 int (*MV2_Allgather_function)(void *sendbuf,
312     int sendcount,
313     MPI_Datatype sendtype,
314     void *recvbuf,
315     int recvcount,
316     MPI_Datatype recvtype, MPI_Comm comm);
317
318 int *mv2_allgather_table_ppn_conf = NULL;
319 int mv2_allgather_num_ppn_conf = 1;
320 int *mv2_size_allgather_tuning_table = NULL;
321 mv2_allgather_tuning_table **mv2_allgather_thresholds_table = NULL;
322
323 static int MPIR_Allgather_RD_Allgather_Comm_MV2( void *sendbuf,
324                                  int sendcount,
325                                  MPI_Datatype sendtype,
326                                  void *recvbuf,
327                                  int recvcount,
328                                  MPI_Datatype recvtype, MPI_Comm comm_ptr)
329 {
330     return 0;
331 }
332
/* MVAPICH2 allgather routine names mapped onto SMPI implementations. */
#define MPIR_Allgather_Bruck_MV2  smpi_coll_tuned_allgather_bruck
#define MPIR_Allgather_RD_MV2     smpi_coll_tuned_allgather_rdb
#define MPIR_Allgather_Ring_MV2   smpi_coll_tuned_allgather_ring
#define MPIR_2lvl_Allgather_MV2   smpi_coll_tuned_allgather_mvapich2_smp
337
338 static void init_mv2_allgather_tables_stampede(){
339   int i;
340   int agg_table_sum = 0;
341   mv2_allgather_tuning_table **table_ptrs = NULL;
342   mv2_allgather_num_ppn_conf = 3;
343   mv2_allgather_thresholds_table
344   = xbt_malloc(sizeof(mv2_allgather_tuning_table *)
345       * mv2_allgather_num_ppn_conf);
346   table_ptrs = xbt_malloc(sizeof(mv2_allgather_tuning_table *)
347       * mv2_allgather_num_ppn_conf);
348   mv2_size_allgather_tuning_table = xbt_malloc(sizeof(int) *
349       mv2_allgather_num_ppn_conf);
350   mv2_allgather_table_ppn_conf
351   = xbt_malloc(mv2_allgather_num_ppn_conf * sizeof(int));
352   mv2_allgather_table_ppn_conf[0] = 1;
353   mv2_size_allgather_tuning_table[0] = 6;
354   mv2_allgather_tuning_table mv2_tmp_allgather_thresholds_table_1ppn[] = {
355       {
356           2,
357           {0},
358           1,
359           {
360               {0, -1, &MPIR_Allgather_Ring_MV2},
361           },
362       },
363       {
364           4,
365           {0,0},
366           2,
367           {
368               {0, 262144, &MPIR_Allgather_RD_MV2},
369               {262144, -1, &MPIR_Allgather_Ring_MV2},
370           },
371       },
372       {
373           8,
374           {0,0},
375           2,
376           {
377               {0, 131072, &MPIR_Allgather_RD_MV2},
378               {131072, -1, &MPIR_Allgather_Ring_MV2},
379           },
380       },
381       {
382           16,
383           {0,0},
384           2,
385           {
386               {0, 131072, &MPIR_Allgather_RD_MV2},
387               {131072, -1, &MPIR_Allgather_Ring_MV2},
388           },
389       },
390       {
391           32,
392           {0,0},
393           2,
394           {
395               {0, 65536, &MPIR_Allgather_RD_MV2},
396               {65536, -1, &MPIR_Allgather_Ring_MV2},
397           },
398       },
399       {
400           64,
401           {0,0},
402           2,
403           {
404               {0, 32768, &MPIR_Allgather_RD_MV2},
405               {32768, -1, &MPIR_Allgather_Ring_MV2},
406           },
407       },
408   };
409   table_ptrs[0] = mv2_tmp_allgather_thresholds_table_1ppn;
410   mv2_allgather_table_ppn_conf[1] = 2;
411   mv2_size_allgather_tuning_table[1] = 6;
412   mv2_allgather_tuning_table mv2_tmp_allgather_thresholds_table_2ppn[] = {
413       {
414           4,
415           {0,0},
416           2,
417           {
418               {0, 524288, &MPIR_Allgather_RD_MV2},
419               {524288, -1, &MPIR_Allgather_Ring_MV2},
420           },
421       },
422       {
423           8,
424           {0,1,0},
425           2,
426           {
427               {0, 32768, &MPIR_Allgather_RD_MV2},
428               {32768, 524288, &MPIR_Allgather_Ring_MV2},
429               {524288, -1, &MPIR_Allgather_Ring_MV2},
430           },
431       },
432       {
433           16,
434           {0,1,0},
435           2,
436           {
437               {0, 16384, &MPIR_Allgather_RD_MV2},
438               {16384, 524288, &MPIR_Allgather_Ring_MV2},
439               {524288, -1, &MPIR_Allgather_Ring_MV2},
440           },
441       },
442       {
443           32,
444           {1,1,0},
445           2,
446           {
447               {0, 65536, &MPIR_Allgather_RD_MV2},
448               {65536, 524288, &MPIR_Allgather_Ring_MV2},
449               {524288, -1, &MPIR_Allgather_Ring_MV2},
450           },
451       },
452       {
453           64,
454           {1,1,0},
455           2,
456           {
457               {0, 32768, &MPIR_Allgather_RD_MV2},
458               {32768, 524288, &MPIR_Allgather_Ring_MV2},
459               {524288, -1, &MPIR_Allgather_Ring_MV2},
460           },
461       },
462       {
463           128,
464           {1,1,0},
465           2,
466           {
467               {0, 65536, &MPIR_Allgather_RD_MV2},
468               {65536, 524288, &MPIR_Allgather_Ring_MV2},
469               {524288, -1, &MPIR_Allgather_Ring_MV2},
470           },
471       },
472   };
473   table_ptrs[1] = mv2_tmp_allgather_thresholds_table_2ppn;
474   mv2_allgather_table_ppn_conf[2] = 16;
475   mv2_size_allgather_tuning_table[2] = 6;
476   mv2_allgather_tuning_table mv2_tmp_allgather_thresholds_table_16ppn[] = {
477       {
478           16,
479           {0,0},
480           2,
481           {
482               {0, 1024, &MPIR_Allgather_RD_MV2},
483               {1024, -1, &MPIR_Allgather_Ring_MV2},
484           },
485       },
486       {
487           32,
488           {0,0},
489           2,
490           {
491               {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
492               {1024, -1, &MPIR_Allgather_Ring_MV2},
493           },
494       },
495       {
496           64,
497           {0,0},
498           2,
499           {
500               {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
501               {1024, -1, &MPIR_Allgather_Ring_MV2},
502           },
503       },
504       {
505           128,
506           {0,0},
507           2,
508           {
509               {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
510               {1024, -1, &MPIR_Allgather_Ring_MV2},
511           },
512       },
513       {
514           256,
515           {0,0},
516           2,
517           {
518               {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
519               {1024, -1, &MPIR_Allgather_Ring_MV2},
520           },
521       },
522       {
523           512,
524           {0,0},
525           2,
526           {
527               {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
528               {1024, -1, &MPIR_Allgather_Ring_MV2},
529           },
530       },
531
532   };
533   table_ptrs[2] = mv2_tmp_allgather_thresholds_table_16ppn;
534   agg_table_sum = 0;
535   for (i = 0; i < mv2_allgather_num_ppn_conf; i++) {
536       agg_table_sum += mv2_size_allgather_tuning_table[i];
537   }
538   mv2_allgather_thresholds_table[0] =
539       xbt_malloc(agg_table_sum * sizeof (mv2_allgather_tuning_table));
540   memcpy(mv2_allgather_thresholds_table[0], table_ptrs[0],
541       (sizeof(mv2_allgather_tuning_table)
542           * mv2_size_allgather_tuning_table[0]));
543   for (i = 1; i < mv2_allgather_num_ppn_conf; i++) {
544       mv2_allgather_thresholds_table[i] =
545           mv2_allgather_thresholds_table[i - 1]
546                                          + mv2_size_allgather_tuning_table[i - 1];
547       memcpy(mv2_allgather_thresholds_table[i], table_ptrs[i],
548           (sizeof(mv2_allgather_tuning_table)
549               * mv2_size_allgather_tuning_table[i]));
550   }
551   xbt_free(table_ptrs);
552 }
553
554
555 /************ Gather variables and initializers                        */
556
557 typedef struct {
558   int min;
559   int max;
560   int (*MV2_pt_Gather_function)(void *sendbuf, int sendcnt,
561       MPI_Datatype sendtype, void *recvbuf, int recvcnt,
562       MPI_Datatype recvtype, int root, MPI_Comm  comm_ptr);
563 } mv2_gather_tuning_element;
564
565
566 typedef struct {
567   int numproc;
568   int size_inter_table;
569   mv2_gather_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
570   int size_intra_table;
571   mv2_gather_tuning_element intra_node[MV2_MAX_NB_THRESHOLDS];
572 } mv2_gather_tuning_table;
573
574 int mv2_size_gather_tuning_table=7;
575 mv2_gather_tuning_table * mv2_gather_thresholds_table=NULL; 
576
577 typedef int (*MV2_Gather_function_ptr) (void *sendbuf,
578     int sendcnt,
579     MPI_Datatype sendtype,
580     void *recvbuf,
581     int recvcnt,
582     MPI_Datatype recvtype,
583     int root, MPI_Comm comm);
584
585 MV2_Gather_function_ptr MV2_Gather_inter_leader_function = NULL;
586 MV2_Gather_function_ptr MV2_Gather_intra_node_function = NULL;
587
588
589 #define MPIR_Gather_MV2_Direct smpi_coll_tuned_gather_ompi_basic_linear
590 #define MPIR_Gather_MV2_two_level_Direct smpi_coll_tuned_gather_mvapich2_two_level
591 #define MPIR_Gather_intra smpi_coll_tuned_gather_mpich
592
593
594 static void init_mv2_gather_tables_stampede(){
595
596   mv2_size_gather_tuning_table=7;
597   mv2_gather_thresholds_table = xbt_malloc(mv2_size_gather_tuning_table*
598       sizeof (mv2_gather_tuning_table));
599   mv2_gather_tuning_table mv2_tmp_gather_thresholds_table[]={
600       {16,
601           2,{{0, 524288, &MPIR_Gather_MV2_Direct},
602               {524288, -1, &MPIR_Gather_intra}},
603               1,{{0, -1, &MPIR_Gather_MV2_Direct}}},
604               {32,
605                   3,{{0, 16384, &MPIR_Gather_MV2_Direct},
606                       {16384, 131072, &MPIR_Gather_intra},
607                       {131072, -1, &MPIR_Gather_MV2_two_level_Direct}},
608                       1,{{0, -1, &MPIR_Gather_intra}}},
609                       {64,
610                           3,{{0, 256, &MPIR_Gather_MV2_two_level_Direct},
611                               {256, 16384, &MPIR_Gather_MV2_Direct},
612                               {256, -1, &MPIR_Gather_MV2_two_level_Direct}},
613                               1,{{0, -1, &MPIR_Gather_intra}}},
614                               {128,
615                                   3,{{0, 512, &MPIR_Gather_MV2_two_level_Direct},
616                                       {512, 16384, &MPIR_Gather_MV2_Direct},
617                                       {16384, -1, &MPIR_Gather_MV2_two_level_Direct}},
618                                       1,{{0, -1, &MPIR_Gather_intra}}},
619                                       {256,
620                                           3,{{0, 512, &MPIR_Gather_MV2_two_level_Direct},
621                                               {512, 16384, &MPIR_Gather_MV2_Direct},
622                                               {16384, -1, &MPIR_Gather_MV2_two_level_Direct}},
623                                               1,{{0, -1, &MPIR_Gather_intra}}},
624                                               {512,
625                                                   3,{{0, 512, &MPIR_Gather_MV2_two_level_Direct},
626                                                       {512, 16384, &MPIR_Gather_MV2_Direct},
627                                                       {8196, -1, &MPIR_Gather_MV2_two_level_Direct}},
628                                                       1,{{0, -1, &MPIR_Gather_intra}}},
629                                                       {1024,
630                                                           3,{{0, 512, &MPIR_Gather_MV2_two_level_Direct},
631                                                               {512, 16384, &MPIR_Gather_MV2_Direct},
632                                                               {8196, -1, &MPIR_Gather_MV2_two_level_Direct}},
633                                                               1,{{0, -1, &MPIR_Gather_intra}}},
634   };
635
636   memcpy(mv2_gather_thresholds_table, mv2_tmp_gather_thresholds_table,
637       mv2_size_gather_tuning_table * sizeof (mv2_gather_tuning_table));
638
639 }
640
641
642 /************ Allgatherv variables and initializers                        */
643
644 typedef struct {
645   int min;
646   int max;
647   int (*MV2_pt_Allgatherv_function)(void *sendbuf,
648       int sendcount,
649       MPI_Datatype sendtype,
650       void *recvbuf,
651       int *recvcounts,
652       int *displs,
653       MPI_Datatype recvtype,
654       MPI_Comm commg);
655 } mv2_allgatherv_tuning_element;
656
657 typedef struct {
658   int numproc;
659   int size_inter_table;
660   mv2_allgatherv_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
661 } mv2_allgatherv_tuning_table;
662
663 int (*MV2_Allgatherv_function)(void *sendbuf,
664     int sendcount,
665     MPI_Datatype sendtype,
666     void *recvbuf,
667     int *recvcounts,
668     int *displs,
669     MPI_Datatype recvtype,
670     MPI_Comm comm);
671
672 int mv2_size_allgatherv_tuning_table = 0;
673 mv2_allgatherv_tuning_table *mv2_allgatherv_thresholds_table = NULL;
674
675 #define MPIR_Allgatherv_Rec_Doubling_MV2 smpi_coll_tuned_allgatherv_mpich_rdb
676 #define MPIR_Allgatherv_Bruck_MV2 smpi_coll_tuned_allgatherv_ompi_bruck
677 #define MPIR_Allgatherv_Ring_MV2 smpi_coll_tuned_allgatherv_mpich_ring
678
679
680 static void init_mv2_allgatherv_tables_stampede(){
681   mv2_size_allgatherv_tuning_table = 6;
682   mv2_allgatherv_thresholds_table = xbt_malloc(mv2_size_allgatherv_tuning_table *
683       sizeof (mv2_allgatherv_tuning_table));
684   mv2_allgatherv_tuning_table mv2_tmp_allgatherv_thresholds_table[] = {
685       {
686           16,
687           2,
688           {
689               {0, 512, &MPIR_Allgatherv_Rec_Doubling_MV2},
690               {512, -1, &MPIR_Allgatherv_Ring_MV2},
691           },
692       },
693       {
694           32,
695           2,
696           {
697               {0, 512, &MPIR_Allgatherv_Rec_Doubling_MV2},
698               {512, -1, &MPIR_Allgatherv_Ring_MV2},
699           },
700       },
701       {
702           64,
703           2,
704           {
705               {0, 256, &MPIR_Allgatherv_Rec_Doubling_MV2},
706               {256, -1, &MPIR_Allgatherv_Ring_MV2},
707           },
708       },
709       {
710           128,
711           2,
712           {
713               {0, 256, &MPIR_Allgatherv_Rec_Doubling_MV2},
714               {256, -1, &MPIR_Allgatherv_Ring_MV2},
715           },
716       },
717       {
718           256,
719           2,
720           {
721               {0, 256, &MPIR_Allgatherv_Rec_Doubling_MV2},
722               {256, -1, &MPIR_Allgatherv_Ring_MV2},
723           },
724       },
725       {
726           512,
727           2,
728           {
729               {0, 256, &MPIR_Allgatherv_Rec_Doubling_MV2},
730               {256, -1, &MPIR_Allgatherv_Ring_MV2},
731           },
732       },
733
734   };
735   memcpy(mv2_allgatherv_thresholds_table, mv2_tmp_allgatherv_thresholds_table,
736       mv2_size_allgatherv_tuning_table * sizeof (mv2_allgatherv_tuning_table));
737 }
738
739
740 /************ Allreduce variables and initializers                        */
741
742 typedef struct {
743   int min;
744   int max;
745   int (*MV2_pt_Allreduce_function)(void *sendbuf,
746       void *recvbuf,
747       int count,
748       MPI_Datatype datatype,
749       MPI_Op op, MPI_Comm comm);
750 } mv2_allreduce_tuning_element;
751
752 typedef struct {
753   int numproc;
754   int mcast_enabled;
755   int is_two_level_allreduce[MV2_MAX_NB_THRESHOLDS];
756   int size_inter_table;
757   mv2_allreduce_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
758   int size_intra_table;
759   mv2_allreduce_tuning_element intra_node[MV2_MAX_NB_THRESHOLDS];
760 } mv2_allreduce_tuning_table;
761
762
763 int (*MV2_Allreduce_function)(void *sendbuf,
764     void *recvbuf,
765     int count,
766     MPI_Datatype datatype,
767     MPI_Op op, MPI_Comm comm)=NULL;
768
769
770 int (*MV2_Allreduce_intra_function)( void *sendbuf,
771     void *recvbuf,
772     int count,
773     MPI_Datatype datatype,
774     MPI_Op op, MPI_Comm comm)=NULL;
775
776 int mv2_size_allreduce_tuning_table = 0;
777 mv2_allreduce_tuning_table *mv2_allreduce_thresholds_table = NULL;
778
779
780
781
782
783 static int MPIR_Allreduce_mcst_reduce_two_level_helper_MV2( void *sendbuf,
784     void *recvbuf,
785     int count,
786     MPI_Datatype datatype,
787     MPI_Op op, MPI_Comm comm)
788
789   return 0;
790 }
791
792 static  int MPIR_Allreduce_mcst_reduce_redscat_gather_MV2( void *sendbuf,
793     void *recvbuf,
794     int count,
795     MPI_Datatype datatype,
796     MPI_Op op, MPI_Comm  comm)
797 {
798   return 0;
799 }
800
801 static  int MPIR_Allreduce_reduce_p2p_MV2( void *sendbuf,
802     void *recvbuf,
803     int count,
804     MPI_Datatype datatype,
805     MPI_Op op, MPI_Comm  comm)
806 {
807   mpi_coll_reduce_fun(sendbuf,recvbuf,count,datatype,op,0,comm);
808   return MPI_SUCCESS;
809 }
810
811 static  int MPIR_Allreduce_reduce_shmem_MV2( void *sendbuf,
812     void *recvbuf,
813     int count,
814     MPI_Datatype datatype,
815     MPI_Op op, MPI_Comm  comm)
816 {
817   mpi_coll_reduce_fun(sendbuf,recvbuf,count,datatype,op,0,comm);
818   return MPI_SUCCESS;
819 }
820
/* MVAPICH2 allreduce routine names mapped onto SMPI implementations. */
#define MPIR_Allreduce_pt2pt_rd_MV2   smpi_coll_tuned_allreduce_rdb
#define MPIR_Allreduce_pt2pt_rs_MV2   smpi_coll_tuned_allreduce_mvapich2_rs
#define MPIR_Allreduce_two_level_MV2  smpi_coll_tuned_allreduce_mvapich2_two_level
824
825
826 static void init_mv2_allreduce_tables_stampede(){
827   mv2_size_allreduce_tuning_table = 8;
828   mv2_allreduce_thresholds_table = xbt_malloc(mv2_size_allreduce_tuning_table *
829       sizeof (mv2_allreduce_tuning_table));
830   mv2_allreduce_tuning_table mv2_tmp_allreduce_thresholds_table[] = {
831       {
832           16,
833           0,
834           {1, 0},
835           2,
836           {
837               {0, 1024, &MPIR_Allreduce_pt2pt_rd_MV2},
838               {1024, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
839           },
840           2,
841           {
842               {0, 1024, &MPIR_Allreduce_reduce_shmem_MV2},
843               {1024, -1, &MPIR_Allreduce_reduce_p2p_MV2},
844           },
845       },
846       {
847           32,
848           0,
849           {1, 1, 0},
850           3,
851           {
852               {0, 1024, &MPIR_Allreduce_pt2pt_rd_MV2},
853               {1024, 16384, &MPIR_Allreduce_pt2pt_rd_MV2},
854               {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
855           },
856           2,
857           {
858               {0, 1024, &MPIR_Allreduce_reduce_shmem_MV2},
859               {1024, 16384, &MPIR_Allreduce_reduce_p2p_MV2},
860           },
861       },
862       {
863           64,
864           0,
865           {1, 1, 0},
866           3,
867           {
868               {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2},
869               {512, 16384, &MPIR_Allreduce_pt2pt_rd_MV2},
870               {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
871           },
872           2,
873           {
874               {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
875               {512, 16384, &MPIR_Allreduce_reduce_p2p_MV2},
876           },
877       },
878       {
879           128,
880           0,
881           {1, 1, 0},
882           3,
883           {
884               {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2},
885               {512, 16384, &MPIR_Allreduce_pt2pt_rd_MV2},
886               {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
887           },
888           2,
889           {
890               {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
891               {512, 16384, &MPIR_Allreduce_reduce_p2p_MV2},
892           },
893       },
894       {
895           256,
896           0,
897           {1, 1, 0},
898           3,
899           {
900               {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2},
901               {512, 16384, &MPIR_Allreduce_pt2pt_rd_MV2},
902               {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
903           },
904           2,
905           {
906               {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
907               {512, -1, &MPIR_Allreduce_reduce_p2p_MV2},
908           },
909       },
910       {
911           512,
912           0,
913           {1, 1, 0},
914           3,
915           {
916               {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2},
917               {512, 16384, &MPIR_Allreduce_pt2pt_rd_MV2},
918               {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
919           },
920           2,
921           {
922               {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
923               {512, 16384, &MPIR_Allreduce_reduce_p2p_MV2},
924           },
925       },
926       {
927           1024,
928           0,
929           {1, 1, 1, 0},
930           4,
931           {
932               {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2},
933               {512, 8192, &MPIR_Allreduce_pt2pt_rd_MV2},
934               {8192, 65536, &MPIR_Allreduce_pt2pt_rs_MV2},
935               {65536, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
936           },
937           2,
938           {
939               {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
940               {512, -1, &MPIR_Allreduce_reduce_p2p_MV2},
941           },
942       },
943       {
944           2048,
945           0,
946           {1, 1, 1, 0},
947           4,
948           {
949               {0, 64, &MPIR_Allreduce_pt2pt_rd_MV2},
950               {64, 512, &MPIR_Allreduce_reduce_p2p_MV2},
951               {512, 4096, &MPIR_Allreduce_mcst_reduce_two_level_helper_MV2},
952               {4096, 16384, &MPIR_Allreduce_pt2pt_rs_MV2},
953               {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
954           },
955           2,
956           {
957               {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
958               {512, -1, &MPIR_Allreduce_reduce_p2p_MV2},
959           },
960       },
961
962   };
963   memcpy(mv2_allreduce_thresholds_table, mv2_tmp_allreduce_thresholds_table,
964       mv2_size_allreduce_tuning_table * sizeof (mv2_allreduce_tuning_table));
965 }
966
967
968
969
/* One entry of a broadcast threshold list: a range delimited by min and max,
 * the broadcast routine associated with that range, and a k-nomial factor.
 * In the broadcast tables initialised later in this file, consecutive entries
 * chain (each min equals the previous max) and the last entry of every list
 * has max == -1.
 * NOTE(review): the bounds are presumably message sizes in bytes and -1
 * presumably means "no upper bound"; the lookup code is not visible here,
 * so confirm against the selector. */
970 typedef struct {
971     int min;   /* lower bound of the range */
972     int max;   /* upper bound of the range; -1 on the final entry of each list */
      /* routine recorded for this range (same signature as MV2_Bcast_function) */
973     int (*MV2_pt_Bcast_function) (void *buf, int count, MPI_Datatype datatype,
974                                   int root, MPI_Comm comm_ptr);
      /* The tables in this file store 2, 4, 8 or -1 here; every inter-leader
       * entry that names MPIR_Pipelined_Bcast_Zcpy_MV2 has a positive value and
       * every other inter-leader entry has -1. */
975     int zcpy_pipelined_knomial_factor;
976 } mv2_bcast_tuning_element;
977
/* One row of the broadcast tuning table, keyed by a process count.
 * The three arrays are fixed-size (MV2_MAX_NB_THRESHOLDS slots each); the
 * size_* fields record how many leading slots the initialiser actually fills.
 * NOTE(review): is_two_level_bcast[i] presumably applies to the same range as
 * inter_leader[i] (the initialisers below always give it size_inter_table
 * flags) -- confirm against the selector. */
978 typedef struct {
979     int numproc;                    /* process count this row is written for */
980     int bcast_segment_size;         /* 8192 in every row initialised below */
981     int intra_node_knomial_factor;  /* 4 in every row initialised below */
982     int inter_node_knomial_factor;  /* 4 in every row initialised below */
983     int is_two_level_bcast[MV2_MAX_NB_THRESHOLDS];  /* 0/1 flag per range */
984     int size_inter_table;           /* number of filled inter_leader entries */
985     mv2_bcast_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
986     int size_intra_table;           /* number of filled intra_node entries */
987     mv2_bcast_tuning_element intra_node[MV2_MAX_NB_THRESHOLDS];
988 } mv2_bcast_tuning_table;
989
/* Broadcast tuning table and its row count.  Both start empty (0 / NULL) and
 * are filled by init_mv2_bcast_tables_stampede(). */
990 int mv2_size_bcast_tuning_table = 0;
991 mv2_bcast_tuning_table *mv2_bcast_thresholds_table = NULL;
992
993
/* Function-pointer slots with the broadcast signature, initially NULL.
 * NOTE(review): presumably assigned by the selector from the table entries;
 * nothing in this section writes to them. */
994 int (*MV2_Bcast_function) (void *buffer, int count, MPI_Datatype datatype,
995                            int root, MPI_Comm comm_ptr) = NULL;
996
997 int (*MV2_Bcast_intra_node_function) (void *buffer, int count, MPI_Datatype datatype,
998                                       int root, MPI_Comm comm_ptr) = NULL;
999
/* Default values of the broadcast parameters.  bcast_segment_size and the two
 * *_node_knomial_factor defaults equal the values stored in every table row
 * below (8192, 4, 4). */
1000 int zcpy_knomial_factor = 2;
1001 int mv2_pipelined_zcpy_knomial_factor = -1;
1002 int bcast_segment_size = 8192;
1003 int mv2_inter_node_knomial_factor = 4;
1004 int mv2_intra_node_knomial_factor = 4;
1005 #define INTRA_NODE_ROOT 0
1006
/* MVAPICH2 broadcast algorithm names used by the tuning table.  Every one of
 * them expands to the same SMPI routine, smpi_coll_tuned_bcast_mpich, so as
 * far as this header is concerned the table's choice between these names
 * selects the same implementation. */
1007 #define MPIR_Pipelined_Bcast_Zcpy_MV2 smpi_coll_tuned_bcast_mpich
1008 #define MPIR_Pipelined_Bcast_MV2 smpi_coll_tuned_bcast_mpich
1009 #define MPIR_Bcast_binomial_MV2 smpi_coll_tuned_bcast_mpich
1010 #define MPIR_Bcast_scatter_ring_allgather_shm_MV2 smpi_coll_tuned_bcast_mpich
1011 #define MPIR_Bcast_scatter_doubling_allgather_MV2 smpi_coll_tuned_bcast_mpich
1012 #define MPIR_Bcast_scatter_ring_allgather_MV2 smpi_coll_tuned_bcast_mpich
1013 #define MPIR_Shmem_Bcast_MV2 smpi_coll_tuned_bcast_mpich
1014 #define MPIR_Bcast_tune_inter_node_helper_MV2 smpi_coll_tuned_bcast_mpich
1015 #define MPIR_Knomial_Bcast_intra_node_MV2 smpi_coll_tuned_bcast_mpich
1016 #define MPIR_Bcast_intra_MV2 smpi_coll_tuned_bcast_mpich
1017
/*
 * Fill the global broadcast tuning table with the Stampede calibration.
 *
 * Sets mv2_size_bcast_tuning_table to 8, allocates that many rows with
 * xbt_malloc(), builds the rows in a local array and memcpy()s them into the
 * allocated table.  The rows are written for 16, 32, 64, 128, 256, 512, 1024
 * and 2048 processes; each uses segment size 8192 and k-nomial factors 4/4.
 *
 * The hard-coded row count must match the number of rows in the initialiser:
 * memcpy() reads mv2_size_bcast_tuning_table * sizeof(row) bytes from the
 * local array.
 *
 * All algorithm names used in the rows are macros defined above that expand
 * to smpi_coll_tuned_bcast_mpich.
 *
 * NOTE(review): a table already referenced by mv2_bcast_thresholds_table is
 * overwritten without being freed -- confirm this is only called once.
 */
1018 static void init_mv2_bcast_tables_stampede(){
1019  //Stampede,
1020         mv2_size_bcast_tuning_table=8;
1021         mv2_bcast_thresholds_table = xbt_malloc(mv2_size_bcast_tuning_table *
1022                                                  sizeof (mv2_bcast_tuning_table));
1023
      /* Row layout: numproc, segment size, intra factor, inter factor,
       * two-level flags, inter count, inter entries, intra count, intra entries. */
1024   mv2_bcast_tuning_table mv2_tmp_bcast_thresholds_table[]={
1025     {
1026             16,
1027             8192, 4, 4,
1028             {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
1029             11,
1030             {
1031               {0, 8, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1032               {8, 16, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1033               {16, 1024, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1034               {1024, 8192, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1035               {8192, 16384, &MPIR_Bcast_binomial_MV2, -1},
1036               {16384, 32768, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1037               {32768, 65536, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1038               {65536, 131072, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1},
1039               {131072, 262144, &MPIR_Bcast_scatter_ring_allgather_MV2, -1},
1040               {262144, 524288, &MPIR_Bcast_scatter_doubling_allgather_MV2, -1},
1041               {524288, -1, &MPIR_Bcast_scatter_ring_allgather_MV2, -1}
1042             },
1043             11,
1044             {
1045               {0, 8, &MPIR_Shmem_Bcast_MV2, 2},
1046               {8, 16, &MPIR_Shmem_Bcast_MV2, 4},
1047               {16, 1024, &MPIR_Shmem_Bcast_MV2, 2},
1048               {1024, 8192, &MPIR_Shmem_Bcast_MV2, 4},
1049               {8192, 16384, &MPIR_Shmem_Bcast_MV2, -1},
1050               {16384, 32768, &MPIR_Shmem_Bcast_MV2, 4},
1051               {32768, 65536, &MPIR_Shmem_Bcast_MV2, 2},
1052               {65536, 131072, &MPIR_Shmem_Bcast_MV2, -1},
1053               {131072, 262144, &MPIR_Shmem_Bcast_MV2, -1},
1054               {262144, 524288, &MPIR_Shmem_Bcast_MV2, -1},
1055               {524288, -1, &MPIR_Shmem_Bcast_MV2, -1}
1056             }
1057     },
1058     {
1059             32,
1060             8192, 4, 4,
1061             {1, 1, 1, 1, 1, 1, 1, 1},
1062             8,
1063             {
1064               {0, 128, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1065               {128, 256, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1066               {256, 32768, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1067               {32768, 65536, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1068               {65536, 131072, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1069               {131072, 262144, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1070               {262144, 524288, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1071               {524288, -1, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8}
1072             },
1073             8,
1074             {
1075               {0, 128, &MPIR_Shmem_Bcast_MV2, 2},
1076               {128, 256, &MPIR_Shmem_Bcast_MV2, 4},
1077               {256, 32768, &MPIR_Shmem_Bcast_MV2, 2},
1078               {32768, 65536, &MPIR_Shmem_Bcast_MV2, 4},
1079               {65536, 131072, &MPIR_Shmem_Bcast_MV2, 2},
1080               {131072, 262144, &MPIR_Shmem_Bcast_MV2, 8},
1081               {262144, 524288, &MPIR_Shmem_Bcast_MV2, 2},
1082               {524288, -1, &MPIR_Shmem_Bcast_MV2, 8}
1083             }
1084     },
1085     {
1086             64,
1087             8192, 4, 4,
1088             {1, 1, 1, 1, 1, 1, 1, 1, 1},
1089             9,
1090             {
1091               {0, 2, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1092               {2, 4, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1093               {4, 16, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1094               {16, 32, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1095               {32, 128, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1096               {128, 256, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1097               {256, 4096, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1098               {4096, 32768, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1099               {32768, -1, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2}
1100             },
1101             9,
1102             {
1103               {0, 2, &MPIR_Shmem_Bcast_MV2, 4},
1104               {2, 4, &MPIR_Shmem_Bcast_MV2, 8},
1105               {4, 16, &MPIR_Shmem_Bcast_MV2, 4},
1106               {16, 32, &MPIR_Shmem_Bcast_MV2, 8},
1107               {32, 128, &MPIR_Shmem_Bcast_MV2, 4},
1108               {128, 256, &MPIR_Shmem_Bcast_MV2, 8},
1109               {256, 4096, &MPIR_Shmem_Bcast_MV2, 4},
1110               {4096, 32768, &MPIR_Shmem_Bcast_MV2, 8},
1111               {32768, -1, &MPIR_Shmem_Bcast_MV2, 2}
1112             }
1113     },
      /* 128 processes: the only row whose last range is flagged 0 in the
       * two-level array; its matching intra-node entry stores a NULL routine. */
1114     {
1115             128,
1116             8192, 4, 4,
1117             {1, 1, 1, 0},
1118             4,
1119             {
1120               {0, 8192, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1121               {8192, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1122               {16384, 524288, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1123               {524288, -1, &MPIR_Bcast_scatter_ring_allgather_MV2, -1}
1124             },
1125             4,
1126             {
1127               {0, 8192, &MPIR_Shmem_Bcast_MV2, 8},
1128               {8192, 16384, &MPIR_Shmem_Bcast_MV2, 4},
1129               {16384, 524288, &MPIR_Shmem_Bcast_MV2, 2},
1130               {524288, -1, NULL, -1}
1131             }
1132     },
1133     {
1134             256,
1135             8192, 4, 4,
1136             {1, 1, 1, 1, 1},
1137             5,
1138             {
1139               {0, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1140               {16384, 131072, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1141               {131072, 262144, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1},
1142               {262144, 524288, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1143               {524288, -1, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1}
1144             },
1145             5,
1146             {
1147               {0, 16384, &MPIR_Shmem_Bcast_MV2, 4},
1148               {16384, 131072, &MPIR_Shmem_Bcast_MV2, 2},
1149               {131072, 262144, &MPIR_Shmem_Bcast_MV2, -1},
1150               {262144, 524288, &MPIR_Shmem_Bcast_MV2, 2},
1151               {524288, -1, &MPIR_Shmem_Bcast_MV2, -1}
1152             }
1153     },
1154     {
1155             512,
1156             8192, 4, 4,
1157             {1, 1, 1, 1, 1},
1158             5,
1159             {
1160               {0, 4096, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1161               {4096, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1162               {16384, 131072, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1163               {131072, 262144, &MPIR_Pipelined_Bcast_MV2, -1},
1164               {262144, -1, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1}
1165             },
1166             5,
1167             {
1168               {0, 4096, &MPIR_Shmem_Bcast_MV2, 8},
1169               {4096, 16384, &MPIR_Shmem_Bcast_MV2, 4},
1170               {16384, 131072, &MPIR_Shmem_Bcast_MV2, 2},
1171               {131072, 262144, &MPIR_Shmem_Bcast_MV2, -1},
1172               {262144, -1, &MPIR_Shmem_Bcast_MV2, -1}
1173             }
1174     },
1175     {
1176             1024,
1177             8192, 4, 4,
1178             {1, 1, 1, 1, 1},
1179             5,
1180             {
1181               {0, 8192, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1182               {8192, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1183               {16384, 65536, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1184               {65536, 524288, &MPIR_Pipelined_Bcast_MV2, -1},
1185               {524288, -1, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1}
1186             },
1187             5,
1188             {
1189               {0, 8192, &MPIR_Shmem_Bcast_MV2, 8},
1190               {8192, 16384, &MPIR_Shmem_Bcast_MV2, 4},
1191               {16384, 65536, &MPIR_Shmem_Bcast_MV2, 2},
1192               {65536, 524288, &MPIR_Shmem_Bcast_MV2, -1},
1193               {524288, -1, &MPIR_Shmem_Bcast_MV2, -1}
1194             }
1195     },
1196     {
1197             2048,
1198             8192, 4, 4,
1199             {1, 1, 1, 1, 1, 1, 1},
1200             7,
1201             {
1202               {0, 16, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1203               {16, 32, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1204               {32, 4096, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1205               {4096, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1206               {16384, 32768, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1207               {32768, 524288, &MPIR_Pipelined_Bcast_MV2, -1},
1208               {524288, -1, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1}
1209             },
1210             7,
1211             {
1212               {0, 16, &MPIR_Shmem_Bcast_MV2, 8},
1213               {16, 32, &MPIR_Shmem_Bcast_MV2, 4},
1214               {32, 4096, &MPIR_Shmem_Bcast_MV2, 8},
1215               {4096, 16384, &MPIR_Shmem_Bcast_MV2, 4},
1216               {16384, 32768, &MPIR_Shmem_Bcast_MV2, 2},
1217               {32768, 524288, &MPIR_Shmem_Bcast_MV2, -1},
1218               {524288, -1, &MPIR_Shmem_Bcast_MV2, -1}
1219             }
1220     }
1221   };
1222
      /* Copy the stack-local rows into the heap table that outlives this call. */
1223         memcpy(mv2_bcast_thresholds_table, mv2_tmp_bcast_thresholds_table,
1224                     mv2_size_bcast_tuning_table * sizeof (mv2_bcast_tuning_table));
1225 }
1226
1227
1228 /************ Reduce variables and initializers                        */
1229
/* One entry of a reduce threshold list: a range delimited by min and max and
 * the reduce routine recorded for it.  In the reduce tables initialised below,
 * entries chain (each min equals the previous max) and the last entry of each
 * list has max == -1.
 * NOTE(review): bounds are presumably message sizes in bytes, with -1 meaning
 * "no upper bound" -- confirm against the selector, which is not visible here. */
1230 typedef struct {
1231   int min;   /* lower bound of the range */
1232   int max;   /* upper bound of the range; -1 on the final entry */
    /* routine recorded for this range (same signature as MV2_Reduce_function) */
1233   int (*MV2_pt_Reduce_function)(void *sendbuf,
1234       void *recvbuf,
1235       int count,
1236       MPI_Datatype datatype,
1237       MPI_Op op,
1238       int root,
1239       MPI_Comm  comm_ptr);
1240 } mv2_reduce_tuning_element;
1241
/* One row of the reduce tuning table, keyed by a process count.
 * Arrays have MV2_MAX_NB_THRESHOLDS slots; the size_* fields give the number
 * of slots the initialiser fills.  The inter and intra lists of a row may have
 * different lengths (e.g. 7 and 6 in the 32-process row below).
 * NOTE(review): is_two_level_reduce[i] presumably pairs with inter_leader[i]
 * (the rows below always provide size_inter_table flags) -- confirm. */
1242 typedef struct {
1243   int numproc;          /* process count this row is written for */
1244   int inter_k_degree;   /* 4 in every row initialised below */
1245   int intra_k_degree;   /* 4 in every row initialised below */
1246   int is_two_level_reduce[MV2_MAX_NB_THRESHOLDS];  /* 0/1 flag per range */
1247   int size_inter_table; /* number of filled inter_leader entries */
1248   mv2_reduce_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
1249   int size_intra_table; /* number of filled intra_node entries */
1250   mv2_reduce_tuning_element intra_node[MV2_MAX_NB_THRESHOLDS];
1251 } mv2_reduce_tuning_table;
1252
/* Reduce tuning table and its row count; empty until
 * init_mv2_reduce_tables_stampede() fills them. */
1253 int mv2_size_reduce_tuning_table = 0;
1254 mv2_reduce_tuning_table *mv2_reduce_thresholds_table = NULL;
1255
1256
/* Default k-nomial factors for reduce.  Note that the table rows below store
 * 4 for both degrees, not these defaults. */
1257 int mv2_reduce_intra_knomial_factor = 2;
1258 int mv2_reduce_inter_knomial_factor = 2;
1259
/* Function-pointer slots with the reduce signature, initially NULL.
 * NOTE(review): presumably set by the selector from the table entries;
 * nothing in this section assigns them. */
1260 int (*MV2_Reduce_function)( void *sendbuf,
1261     void *recvbuf,
1262     int count,
1263     MPI_Datatype datatype,
1264     MPI_Op op,
1265     int root,
1266     MPI_Comm  comm_ptr)=NULL;
1267
1268 int (*MV2_Reduce_intra_function)( void *sendbuf,
1269     void *recvbuf,
1270     int count,
1271     MPI_Datatype datatype,
1272     MPI_Op op,
1273     int root,
1274     MPI_Comm  comm_ptr)=NULL;
1275
1276
/* Map the MVAPICH2 reduce algorithm names used in the table onto SMPI
 * routines.  The inter- and intra-node k-nomial wrappers share one routine. */
1277 #define MPIR_Reduce_inter_knomial_wrapper_MV2 smpi_coll_tuned_reduce_mvapich2_knomial
1278 #define MPIR_Reduce_intra_knomial_wrapper_MV2 smpi_coll_tuned_reduce_mvapich2_knomial
1279 #define MPIR_Reduce_binomial_MV2 smpi_coll_tuned_reduce_binomial
1280 #define MPIR_Reduce_redscat_gather_MV2 smpi_coll_tuned_reduce_scatter_gather
1281 #define MPIR_Reduce_shmem_MV2 smpi_coll_tuned_reduce_ompi_basic_linear
1282 #define MPIR_Reduce_two_level_helper_MV2 smpi_coll_tuned_reduce_mvapich2_two_level
1283
1284
/*
 * Fill the global reduce tuning table with the Stampede calibration.
 *
 * Sets mv2_size_reduce_tuning_table to 8, allocates that many rows with
 * xbt_malloc(), builds the rows in a local array and memcpy()s them into the
 * allocated table.  Rows are written for 16, 32, 64, 128, 256, 512, 1024 and
 * 2048 processes, all with inter/intra k degree 4.
 *
 * The hard-coded row count must match the number of rows in the initialiser,
 * because memcpy() reads mv2_size_reduce_tuning_table * sizeof(row) bytes
 * from the local array.
 *
 * NOTE(review): a table already referenced by mv2_reduce_thresholds_table is
 * overwritten without being freed -- confirm this is only called once.
 */
1285 static void init_mv2_reduce_tables_stampede(){
1286   /*Stampede*/
1287   mv2_size_reduce_tuning_table = 8;
1288   mv2_reduce_thresholds_table = xbt_malloc(mv2_size_reduce_tuning_table *
1289       sizeof (mv2_reduce_tuning_table));
    /* Row layout: numproc, inter degree, intra degree, two-level flags,
     * inter count, inter entries, intra count, intra entries. */
1290   mv2_reduce_tuning_table mv2_tmp_reduce_thresholds_table[] = {
1291       {
1292           16,
1293           4,
1294           4,
1295           {1, 0, 0},
1296           3,
1297           {
1298               {0, 262144, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1299               {262144, 1048576, &MPIR_Reduce_binomial_MV2},
1300               {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
1301           },
1302           2,
1303           {
1304               {0, 65536, &MPIR_Reduce_shmem_MV2},
1305               {65536,-1,  &MPIR_Reduce_binomial_MV2},
1306           },
1307       },
1308       {
1309           32,
1310           4,
1311           4,
1312           {1, 1, 1, 1, 0, 0, 0},
1313           7,
1314           {
1315               {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1316               {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1317               {16384, 32768, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1318               {32768, 65536, &MPIR_Reduce_binomial_MV2},
1319               {65536, 262144, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1320               {262144, 1048576, &MPIR_Reduce_binomial_MV2},
1321               {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
1322           },
1323           6,
1324           {
1325               {0, 8192, &MPIR_Reduce_shmem_MV2},
1326               {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1327               {16384, 32768, &MPIR_Reduce_shmem_MV2},
1328               {32768, 65536, &MPIR_Reduce_shmem_MV2},
1329               {65536, 262144, &MPIR_Reduce_shmem_MV2},
1330               {262144,-1,  &MPIR_Reduce_binomial_MV2},
1331           },
1332       },
1333       {
1334           64,
1335           4,
1336           4,
1337           {1, 1, 1, 1, 0},
1338           5,
1339           {
1340               {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1341               {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1342               {16384, 65536, &MPIR_Reduce_binomial_MV2},
1343               {65536, 262144, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1344               {262144, -1, &MPIR_Reduce_redscat_gather_MV2},
1345           },
1346           5,
1347           {
1348               {0, 8192, &MPIR_Reduce_shmem_MV2},
1349               {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1350               {16384, 65536, &MPIR_Reduce_shmem_MV2},
1351               {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1352               {262144, -1, &MPIR_Reduce_binomial_MV2},
1353           },
1354       },
1355       {
1356           128,
1357           4,
1358           4,
1359           {1, 0, 1, 0, 1, 0},
1360           6,
1361           {
1362               {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1363               {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1364               {16384, 65536, &MPIR_Reduce_binomial_MV2},
1365               {65536, 262144, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1366               {262144, 1048576, &MPIR_Reduce_binomial_MV2},
1367               {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
1368           },
1369           5,
1370           {
1371               {0, 8192, &MPIR_Reduce_shmem_MV2},
1372               {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1373               {16384, 65536, &MPIR_Reduce_shmem_MV2},
1374               {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1375               {262144, -1, &MPIR_Reduce_binomial_MV2},
1376           },
1377       },
1378       {
1379           256,
1380           4,
1381           4,
1382           {1, 1, 1, 0, 1, 1, 0},
1383           7,
1384           {
1385               {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1386               {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1387               {16384, 32768, &MPIR_Reduce_binomial_MV2},
1388               {32768, 65536, &MPIR_Reduce_binomial_MV2},
1389               {65536, 262144, &MPIR_Reduce_binomial_MV2},
1390               {262144, 1048576, &MPIR_Reduce_binomial_MV2},
1391               {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
1392           },
1393           6,
1394           {
1395               {0, 8192, &MPIR_Reduce_shmem_MV2},
1396               {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1397               {16384, 32768, &MPIR_Reduce_shmem_MV2},
1398               {32768, 65536, &MPIR_Reduce_shmem_MV2},
1399               {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1400               {262144, -1, &MPIR_Reduce_binomial_MV2},
1401           },
1402       },
1403       {
1404           512,
1405           4,
1406           4,
1407           {1, 0, 1, 1, 1, 0},
1408           6,
1409           {
1410               {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1411               {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1412               {16384, 65536, &MPIR_Reduce_binomial_MV2},
1413               {65536, 262144, &MPIR_Reduce_binomial_MV2},
1414               {262144, 1048576, &MPIR_Reduce_binomial_MV2},
1415               {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
1416           },
1417           5,
1418           {
1419               {0, 8192, &MPIR_Reduce_shmem_MV2},
1420               {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1421               {16384, 65536, &MPIR_Reduce_shmem_MV2},
1422               {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1423               {262144, -1, &MPIR_Reduce_binomial_MV2},
1424           },
1425       },
1426       {
1427           1024,
1428           4,
1429           4,
1430           {1, 0, 1, 1, 1},
1431           5,
1432           {
1433               {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1434               {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1435               {16384, 65536, &MPIR_Reduce_binomial_MV2},
1436               {65536, 262144, &MPIR_Reduce_binomial_MV2},
1437               {262144, -1, &MPIR_Reduce_binomial_MV2},
1438           },
1439           5,
1440           {
1441               {0, 8192, &MPIR_Reduce_shmem_MV2},
1442               {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1443               {16384, 65536, &MPIR_Reduce_shmem_MV2},
1444               {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1445               {262144, -1, &MPIR_Reduce_binomial_MV2},
1446           },
1447       },
1448       {
1449           2048,
1450           4,
1451           4,
1452           {1, 0, 1, 1, 1,1},
1453           6,
1454           {
1455               {0, 2048, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1456               {2048, 4096, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1457               {4096, 16384, &MPIR_Reduce_binomial_MV2},
1458               {16384, 65536, &MPIR_Reduce_binomial_MV2},
1459               {65536, 131072, &MPIR_Reduce_binomial_MV2},
1460               {131072, -1, &MPIR_Reduce_binomial_MV2},
1461           },
1462           6,
1463           {
1464               {0, 2048, &MPIR_Reduce_shmem_MV2},
1465               {2048, 4096, &MPIR_Reduce_shmem_MV2},
1466               {4096, 16384, &MPIR_Reduce_shmem_MV2},
1467               {16384, 65536, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1468               {65536, 131072, &MPIR_Reduce_binomial_MV2},
1469               {131072, -1, &MPIR_Reduce_shmem_MV2},
1470           },
1471       },
1472
1473   };
    /* Copy the stack-local rows into the heap table that outlives this call. */
1474   memcpy(mv2_reduce_thresholds_table, mv2_tmp_reduce_thresholds_table,
1475       mv2_size_reduce_tuning_table * sizeof (mv2_reduce_tuning_table));
1476 }
1477
1478 /************ Reduce scatter variables and initializers                        */
1479
/* One entry of a reduce-scatter threshold list: a range delimited by min and
 * max and the reduce-scatter routine recorded for it.  In the table
 * initialised below, entries chain (each min equals the previous max) and the
 * last entry of each list has max == -1.
 * NOTE(review): bounds are presumably message sizes in bytes, with -1 meaning
 * "no upper bound" -- confirm against the selector, which is not visible here. */
1480 typedef struct {
1481   int min;   /* lower bound of the range */
1482   int max;   /* upper bound of the range; -1 on the final entry */
    /* routine recorded for this range (same signature as MV2_Red_scat_function) */
1483   int (*MV2_pt_Red_scat_function)(void *sendbuf,
1484       void *recvbuf,
1485       int *recvcnts,
1486       MPI_Datatype datatype,
1487       MPI_Op op,
1488       MPI_Comm comm_ptr);
1489 } mv2_red_scat_tuning_element;
1490
/* One row of the reduce-scatter tuning table, keyed by a process count.
 * Unlike the bcast and reduce rows, it holds a single threshold list and no
 * intra-node list or two-level flags. */
1491 typedef struct {
1492   int numproc;           /* process count this row is written for */
1493   int size_inter_table;  /* number of filled inter_leader entries */
1494   mv2_red_scat_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
1495 } mv2_red_scat_tuning_table;
1496
/* Reduce-scatter tuning table and its row count; empty until
 * init_mv2_reduce_scatter_tables_stampede() fills them. */
1497 int mv2_size_red_scat_tuning_table = 0;
1498 mv2_red_scat_tuning_table *mv2_red_scat_thresholds_table = NULL;
1499
1500
/* Function-pointer slot with the reduce-scatter signature.  Unlike the other
 * MV2_*_function slots in this file it has no explicit "= NULL"; as a
 * file-scope object it is still zero-initialised, i.e. starts as a null
 * pointer.
 * NOTE(review): presumably assigned by the selector; nothing here writes it. */
1501 int (*MV2_Red_scat_function)(void *sendbuf,
1502     void *recvbuf,
1503     int *recvcnts,
1504     MPI_Datatype datatype,
1505     MPI_Op op,
1506     MPI_Comm comm_ptr);
1507
1508
1509
/*
 * Adapter giving smpi_mpi_reduce_scatter() the int-returning signature stored
 * in the reduce-scatter tuning table.
 *
 * Forwards all six arguments unchanged to smpi_mpi_reduce_scatter() and then
 * returns MPI_SUCCESS unconditionally; whatever the forwarded call reports
 * (if anything) is not propagated to the caller.
 */
1510 static  int MPIR_Reduce_Scatter_Basic_MV2(void *sendbuf,
1511     void *recvbuf,
1512     int *recvcnts,
1513     MPI_Datatype datatype,
1514     MPI_Op op,
1515     MPI_Comm comm)
1516 {
1517   smpi_mpi_reduce_scatter(sendbuf,recvbuf,recvcnts,datatype,op,comm);
1518   return MPI_SUCCESS;
1519 }
/* Map the MVAPICH2 reduce-scatter algorithm names onto SMPI routines.  The
 * table below uses the Rec_Halving and Pair_Wise names (plus the Basic
 * adapter defined above); non_comm is defined but not referenced by it. */
1520 #define MPIR_Reduce_scatter_non_comm_MV2 smpi_coll_tuned_reduce_scatter_mpich_noncomm
1521 #define MPIR_Reduce_scatter_Rec_Halving_MV2 smpi_coll_tuned_reduce_scatter_ompi_basic_recursivehalving
1522 #define MPIR_Reduce_scatter_Pair_Wise_MV2 smpi_coll_tuned_reduce_scatter_mpich_pair
1523
1524
1525
1526
/*
 * Fill the global reduce-scatter tuning table with the Stampede calibration.
 *
 * Sets mv2_size_red_scat_tuning_table to 6, allocates that many rows with
 * xbt_malloc(), builds the rows in a local array and memcpy()s them into the
 * allocated table.  Rows are written for 16, 32, 64, 128, 256 and 512
 * processes; the first three have three ranges, the last three have two.
 *
 * The hard-coded row count must match the number of rows in the initialiser,
 * because memcpy() reads mv2_size_red_scat_tuning_table * sizeof(row) bytes
 * from the local array.
 *
 * NOTE(review): a table already referenced by mv2_red_scat_thresholds_table
 * is overwritten without being freed -- confirm this is only called once.
 */
1527 static void init_mv2_reduce_scatter_tables_stampede(){
1528   mv2_size_red_scat_tuning_table = 6;
1529   mv2_red_scat_thresholds_table = xbt_malloc(mv2_size_red_scat_tuning_table *
1530       sizeof (mv2_red_scat_tuning_table));
    /* Row layout: numproc, entry count, entries {min, max, routine}. */
1531   mv2_red_scat_tuning_table mv2_tmp_red_scat_thresholds_table[] = {
1532       {
1533           16,
1534           3,
1535           {
1536               {0, 64, &MPIR_Reduce_Scatter_Basic_MV2},
1537               {64, 65536, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1538               {65536, -1, &MPIR_Reduce_scatter_Pair_Wise_MV2},
1539           },
1540       },
1541       {
1542           32,
1543           3,
1544           {
1545               {0, 64, &MPIR_Reduce_Scatter_Basic_MV2},
1546               {64, 131072, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1547               {131072, -1, &MPIR_Reduce_scatter_Pair_Wise_MV2},
1548           },
1549       },
1550       {
1551           64,
1552           3,
1553           {
1554               {0, 1024, &MPIR_Reduce_Scatter_Basic_MV2},
1555               {1024, 262144, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1556               {262144, -1, &MPIR_Reduce_scatter_Pair_Wise_MV2},
1557           },
1558       },
1559       {
1560           128,
1561           2,
1562           {
1563               {0, 128, &MPIR_Reduce_Scatter_Basic_MV2},
1564               {128, -1, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1565           },
1566       },
1567       {
1568           256,
1569           2,
1570           {
1571               {0, 128, &MPIR_Reduce_Scatter_Basic_MV2},
1572               {128, -1, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1573           },
1574       },
1575       {
1576           512,
1577           2,
1578           {
1579               {0, 256, &MPIR_Reduce_Scatter_Basic_MV2},
1580               {256, -1, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1581           },
1582       },
1583
1584   };
    /* Copy the stack-local rows into the heap table that outlives this call. */
1585   memcpy(mv2_red_scat_thresholds_table, mv2_tmp_red_scat_thresholds_table,
1586       mv2_size_red_scat_tuning_table * sizeof (mv2_red_scat_tuning_table));
1587 }
1588
1589 /************ Scatter variables and initializers                        */
1590
/* One entry of a scatter threshold list: a range delimited by min and max and
 * the scatter routine recorded for it.  The scatter tables below use
 * max == -1 on the last entry of each list.
 * NOTE(review): bounds are presumably message sizes in bytes, with -1 meaning
 * "no upper bound" -- confirm against the selector, which is not visible here. */
1591 typedef struct {
1592   int min;   /* lower bound of the range */
1593   int max;   /* upper bound of the range; -1 on the final entry */
    /* routine recorded for this range (same signature as MV2_Scatter_function) */
1594   int (*MV2_pt_Scatter_function)(void *sendbuf,
1595       int sendcnt,
1596       MPI_Datatype sendtype,
1597       void *recvbuf,
1598       int recvcnt,
1599       MPI_Datatype recvtype,
1600       int root, MPI_Comm comm);
1601 } mv2_scatter_tuning_element;
1602
/* One row of a scatter tuning table, keyed by a process count, holding two
 * fixed-size threshold lists (MV2_MAX_NB_THRESHOLDS slots each) and the number
 * of entries the initialiser declares for each list. */
1603 typedef struct {
1604   int numproc;           /* process count this row is written for */
1605   int size_inter_table;  /* declared number of inter_leader entries */
1606   mv2_scatter_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
1607   int size_intra_table;  /* declared number of intra_node entries */
1608   mv2_scatter_tuning_element intra_node[MV2_MAX_NB_THRESHOLDS];
1609 } mv2_scatter_tuning_table;
1610
1611
/* Scatter keeps one tuning table per processes-per-node configuration.
 * init_mv2_scatter_tables_stampede() allocates these arrays, sets the number
 * of configurations to 3, records the ppn values 1, 2 and 16, and stores a
 * row count for each configuration. */
1612 int *mv2_scatter_table_ppn_conf = NULL;              /* ppn value of each configuration */
1613 int mv2_scatter_num_ppn_conf = 1;                    /* number of configurations */
1614 int *mv2_size_scatter_tuning_table = NULL;           /* row count of each configuration's table */
1615 mv2_scatter_tuning_table **mv2_scatter_thresholds_table = NULL;  /* one table per configuration */
1616
/* Function-pointer slots with the scatter signature, initially NULL.
 * NOTE(review): presumably assigned by the selector from the table entries;
 * nothing in this section writes to them. */
1617 int (*MV2_Scatter_function) (void *sendbuf, int sendcount, MPI_Datatype sendtype,
1618     void *recvbuf, int recvcount, MPI_Datatype recvtype,
1619     int root, MPI_Comm comm)=NULL;
1620
1621 int (*MV2_Scatter_intra_function) (void *sendbuf, int sendcount, MPI_Datatype sendtype,
1622     void *recvbuf, int recvcount, MPI_Datatype recvtype,
1623     int root, MPI_Comm comm)=NULL;
/*
 * Placeholder for MVAPICH2's MPIR_Scatter_mcst_wrap_MV2 entry.
 *
 * The prototype is followed directly by a definition that ignores every
 * argument and returns 0 without touching any buffer.  It exists so that the
 * 16-ppn scatter table below can reference the name as its first {0, 16}
 * entry.
 *
 * NOTE(review): the function has external linkage although it is defined in a
 * header; this is only safe if the header is included from a single
 * translation unit -- confirm.
 * NOTE(review): presumably the selector never actually dispatches to this
 * stub (the table lists a second {0, 16} entry right after it) -- confirm.
 */
1624 int MPIR_Scatter_mcst_wrap_MV2(void *sendbuf,
1625     int sendcnt,
1626     MPI_Datatype sendtype,
1627     void *recvbuf,
1628     int recvcnt,
1629     MPI_Datatype recvtype,
1630     int root, MPI_Comm comm_ptr);
1631
1632 int MPIR_Scatter_mcst_wrap_MV2(void *sendbuf,
1633     int sendcnt,
1634     MPI_Datatype sendtype,
1635     void *recvbuf,
1636     int recvcnt,
1637     MPI_Datatype recvtype,
1638     int root, MPI_Comm comm_ptr)
1639 {
1640   return 0;
1641 }
1642
/* Map the MVAPICH2 scatter algorithm names used in the scatter tables onto
 * SMPI routines: the two flat variants go to Open MPI-derived routines, the
 * two two-level variants to the mvapich2-named ones. */
1643 #define MPIR_Scatter_MV2_Binomial smpi_coll_tuned_scatter_ompi_binomial
1644 #define MPIR_Scatter_MV2_Direct smpi_coll_tuned_scatter_ompi_basic_linear
1645 #define MPIR_Scatter_MV2_two_level_Binomial smpi_coll_tuned_scatter_mvapich2_two_level_binomial
1646 #define MPIR_Scatter_MV2_two_level_Direct smpi_coll_tuned_scatter_mvapich2_two_level_direct
1647
1648
1649
1650
1651 static void init_mv2_scatter_tables_stampede(){
1652   {
1653     int agg_table_sum = 0;
1654     int i;
1655     mv2_scatter_tuning_table **table_ptrs = NULL;
1656     mv2_scatter_num_ppn_conf = 3;
1657     mv2_scatter_thresholds_table
1658     = xbt_malloc(sizeof(mv2_scatter_tuning_table *)
1659         * mv2_scatter_num_ppn_conf);
1660     table_ptrs = xbt_malloc(sizeof(mv2_scatter_tuning_table *)
1661         * mv2_scatter_num_ppn_conf);
1662     mv2_size_scatter_tuning_table = xbt_malloc(sizeof(int) *
1663         mv2_scatter_num_ppn_conf);
1664     mv2_scatter_table_ppn_conf
1665     = xbt_malloc(mv2_scatter_num_ppn_conf * sizeof(int));
1666     mv2_scatter_table_ppn_conf[0] = 1;
1667     mv2_size_scatter_tuning_table[0] = 6;
1668     mv2_scatter_tuning_table mv2_tmp_scatter_thresholds_table_1ppn[] = {
1669         {2,
1670             1,
1671             {
1672                 {0, -1, &MPIR_Scatter_MV2_Binomial},
1673             },
1674             1,
1675             {
1676                 {0, -1, &MPIR_Scatter_MV2_Binomial},
1677             },
1678         },
1679
1680         {4,
1681             1,
1682             {
1683                 {0, -1, &MPIR_Scatter_MV2_Direct},
1684             },
1685             1,
1686             {
1687                 {0, -1, &MPIR_Scatter_MV2_Direct},
1688             },
1689         },
1690
1691         {8,
1692             1,
1693             {
1694                 {0, -1, &MPIR_Scatter_MV2_Direct},
1695             },
1696             1,
1697             {
1698                 {0, -1, &MPIR_Scatter_MV2_Direct},
1699             },
1700         },
1701
1702         {16,
1703             1,
1704             {
1705                 {0, -1, &MPIR_Scatter_MV2_Direct},
1706             },
1707             1,
1708             {
1709                 {0, -1, &MPIR_Scatter_MV2_Direct},
1710             },
1711         },
1712
1713         {32,
1714             1,
1715             {
1716                 {0, -1, &MPIR_Scatter_MV2_Direct},
1717             },
1718             1,
1719             {
1720                 {0, -1, &MPIR_Scatter_MV2_Direct},
1721             },
1722         },
1723
1724         {64,
1725             2,
1726             {
1727                 {0, 32, &MPIR_Scatter_MV2_Binomial},
1728                 {32, -1, &MPIR_Scatter_MV2_Direct},
1729             },
1730             1,
1731             {
1732                 {0, -1, &MPIR_Scatter_MV2_Binomial},
1733             },
1734         },
1735     };
1736     table_ptrs[0] = mv2_tmp_scatter_thresholds_table_1ppn;
1737     mv2_scatter_table_ppn_conf[1] = 2;
1738     mv2_size_scatter_tuning_table[1] = 6;
1739     mv2_scatter_tuning_table mv2_tmp_scatter_thresholds_table_2ppn[] = {
1740         {4,
1741             2,
1742             {
1743                 {0, 4096, &MPIR_Scatter_MV2_Binomial},
1744                 {4096, -1, &MPIR_Scatter_MV2_Direct},
1745             },
1746             1,
1747             {
1748                 {0, -1, &MPIR_Scatter_MV2_Direct},
1749             },
1750         },
1751
1752         {8,
1753             2,
1754             {
1755                 {0, 512, &MPIR_Scatter_MV2_two_level_Direct},
1756                 {512, -1, &MPIR_Scatter_MV2_Direct},
1757             },
1758             1,
1759             {
1760                 {0, -1, &MPIR_Scatter_MV2_Binomial},
1761             },
1762         },
1763
1764         {16,
1765             2,
1766             {
1767                 {0, 2048, &MPIR_Scatter_MV2_two_level_Direct},
1768                 {2048, -1, &MPIR_Scatter_MV2_Direct},
1769             },
1770             1,
1771             {
1772                 {0, -1, &MPIR_Scatter_MV2_Binomial},
1773             },
1774         },
1775
1776         {32,
1777             2,
1778             {
1779                 {0, 2048, &MPIR_Scatter_MV2_two_level_Direct},
1780                 {2048, -1, &MPIR_Scatter_MV2_Direct},
1781             },
1782             1,
1783             {
1784                 {0, -1, &MPIR_Scatter_MV2_Binomial},
1785             },
1786         },
1787
1788         {64,
1789             2,
1790             {
1791                 {0, 8192, &MPIR_Scatter_MV2_two_level_Direct},
1792                 {8192, -1, &MPIR_Scatter_MV2_Direct},
1793             },
1794             1,
1795             {
1796                 {0, -1, &MPIR_Scatter_MV2_Binomial},
1797             },
1798         },
1799
1800         {128,
1801             4,
1802             {
1803                 {0, 16, &MPIR_Scatter_MV2_Binomial},
1804                 {16, 128, &MPIR_Scatter_MV2_two_level_Binomial},
1805                 {128, 16384, &MPIR_Scatter_MV2_two_level_Direct},
1806                 {16384, -1, &MPIR_Scatter_MV2_Direct},
1807             },
1808             1,
1809             {
1810                 {0, 128, &MPIR_Scatter_MV2_Direct},
1811                 {128, -1, &MPIR_Scatter_MV2_Binomial},
1812             },
1813         },
1814     };
1815     table_ptrs[1] = mv2_tmp_scatter_thresholds_table_2ppn;
1816     mv2_scatter_table_ppn_conf[2] = 16;
1817     mv2_size_scatter_tuning_table[2] = 8;
1818     mv2_scatter_tuning_table mv2_tmp_scatter_thresholds_table_16ppn[] = {
1819         {
1820             16,
1821             2,
1822             {
1823                 {0, 256, &MPIR_Scatter_MV2_Binomial},
1824                 {256, -1, &MPIR_Scatter_MV2_Direct},
1825             },
1826             1,
1827             {
1828                 { 0, -1, &MPIR_Scatter_MV2_Direct},
1829             },
1830         },
1831
1832         {
1833             32,
1834             2,
1835             {
1836                 {0, 512, &MPIR_Scatter_MV2_Binomial},
1837                 {512, -1, &MPIR_Scatter_MV2_Direct},
1838             },
1839             1,
1840             {
1841                 { 0, -1, &MPIR_Scatter_MV2_Direct},
1842             },
1843         },
1844
1845         {
1846             64,
1847             2,
1848             {
1849                 {0, 1024, &MPIR_Scatter_MV2_two_level_Direct},
1850                 {1024, -1, &MPIR_Scatter_MV2_Direct},
1851             },
1852             1,
1853             {
1854                 { 0, -1, &MPIR_Scatter_MV2_Direct},
1855             },
1856         },
1857
1858         {
1859             128,
1860             4,
1861             {
1862                 {0, 16, &MPIR_Scatter_mcst_wrap_MV2},
1863                 {0, 16, &MPIR_Scatter_MV2_two_level_Direct},
1864                 {16, 2048, &MPIR_Scatter_MV2_two_level_Direct},
1865                 {2048, -1, &MPIR_Scatter_MV2_Direct},
1866             },
1867             1,
1868             {
1869                 { 0, -1, &MPIR_Scatter_MV2_Direct},
1870             },
1871         },
1872
1873         {
1874             256,
1875             4,
1876             {
1877                 {0, 16, &MPIR_Scatter_mcst_wrap_MV2},
1878                 {0, 16, &MPIR_Scatter_MV2_two_level_Direct},
1879                 {16, 2048, &MPIR_Scatter_MV2_two_level_Direct},
1880                 {2048, -1,  &MPIR_Scatter_MV2_Direct},
1881             },
1882             1,
1883             {
1884                 { 0, -1, &MPIR_Scatter_MV2_Direct},
1885             },
1886         },
1887
1888         {
1889             512,
1890             4,
1891             {
1892                 {0, 16, &MPIR_Scatter_mcst_wrap_MV2},
1893                 {16, 16, &MPIR_Scatter_MV2_two_level_Direct},
1894                 {16, 4096, &MPIR_Scatter_MV2_two_level_Direct},
1895                 {4096, -1, &MPIR_Scatter_MV2_Direct},
1896             },
1897             1,
1898             {
1899                 { 0, -1, &MPIR_Scatter_MV2_Binomial},
1900             },
1901         },
1902         {
1903             1024,
1904             5,
1905             {
1906                 {0, 16, &MPIR_Scatter_mcst_wrap_MV2},
1907                 {0, 16,  &MPIR_Scatter_MV2_Binomial},
1908                 {16, 32, &MPIR_Scatter_MV2_Binomial},
1909                 {32, 4096, &MPIR_Scatter_MV2_two_level_Direct},
1910                 {4096, -1, &MPIR_Scatter_MV2_Direct},
1911             },
1912             1,
1913             {
1914                 { 0, -1, &MPIR_Scatter_MV2_Binomial},
1915             },
1916         },
1917         {
1918             2048,
1919             7,
1920             {
1921                 {0, 16, &MPIR_Scatter_mcst_wrap_MV2},
1922                 {0, 16,  &MPIR_Scatter_MV2_two_level_Binomial},
1923                 {16, 128, &MPIR_Scatter_MV2_two_level_Binomial},
1924                 {128, 1024, &MPIR_Scatter_MV2_two_level_Direct},
1925                 {1024, 16384, &MPIR_Scatter_MV2_two_level_Direct},
1926                 {16384, 65536, &MPIR_Scatter_MV2_Direct},
1927                 {65536, -1, &MPIR_Scatter_MV2_two_level_Direct},
1928             },
1929             6,
1930             {
1931                 {0, 16, &MPIR_Scatter_MV2_Binomial},
1932                 {16, 128, &MPIR_Scatter_MV2_Binomial},
1933                 {128, 1024, &MPIR_Scatter_MV2_Binomial},
1934                 {1024, 16384, &MPIR_Scatter_MV2_Direct},
1935                 {16384, 65536, &MPIR_Scatter_MV2_Direct},
1936                 {65536, -1, &MPIR_Scatter_MV2_Direct},
1937             },
1938         },
1939     };
1940     table_ptrs[2] = mv2_tmp_scatter_thresholds_table_16ppn;
1941     agg_table_sum = 0;
1942     for (i = 0; i < mv2_scatter_num_ppn_conf; i++) {
1943         agg_table_sum += mv2_size_scatter_tuning_table[i];
1944     }
1945     mv2_scatter_thresholds_table[0] =
1946         xbt_malloc(agg_table_sum * sizeof (mv2_scatter_tuning_table));
1947     memcpy(mv2_scatter_thresholds_table[0], table_ptrs[0],
1948         (sizeof(mv2_scatter_tuning_table)
1949             * mv2_size_scatter_tuning_table[0]));
1950     for (i = 1; i < mv2_scatter_num_ppn_conf; i++) {
1951         mv2_scatter_thresholds_table[i] =
1952             mv2_scatter_thresholds_table[i - 1]
1953                                          + mv2_size_scatter_tuning_table[i - 1];
1954         memcpy(mv2_scatter_thresholds_table[i], table_ptrs[i],
1955             (sizeof(mv2_scatter_tuning_table)
1956                 * mv2_size_scatter_tuning_table[i]));
1957     }
1958     xbt_free(table_ptrs);
1959   }
1960 }
1961