Logo AND Algorithmique Numérique Distribuée

Public GIT Repository
07614eaa052c16089f2ca3e41531cba5db72dc4d
[simgrid.git] / src / smpi / colls / smpi_mvapich2_selector_stampede.h
1 /* selector for collective algorithms based on mvapich decision logic, with calibration from Stampede cluster at TACC*/
2
3 /* Copyright (c) 2009-2010, 2013-2014. The SimGrid Team.
4  * All rights reserved.                                                     */
5
6 /* This is the tuning used by MVAPICH for Stampede platform based on (MV2_ARCH_INTEL_XEON_E5_2680_16, MV2_HCA_MLX_CX_FDR) */
7
8
9
10 /************ Alltoall variables and initializers                        */
11
12 #define MV2_MAX_NB_THRESHOLDS  32
13 typedef struct {
14   int min;
15   int max;
16   int (*MV2_pt_Alltoall_function) (void *sendbuf, int sendcount, MPI_Datatype sendtype,
17       void *recvbuf, int recvcount, MPI_Datatype recvtype,
18       MPI_Comm comm_ptr );
19 } mv2_alltoall_tuning_element;
20
21 typedef struct {
22   int numproc;
23   int size_table;
24   mv2_alltoall_tuning_element algo_table[MV2_MAX_NB_THRESHOLDS];
25   mv2_alltoall_tuning_element in_place_algo_table[MV2_MAX_NB_THRESHOLDS];
26 } mv2_alltoall_tuning_table;
27
28 int (*MV2_Alltoall_function) (void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype, MPI_Comm comm_ptr)=NULL;
29
30 /* Indicates number of processes per node */
31 int *mv2_alltoall_table_ppn_conf = NULL;
32 /* Indicates total number of configurations */
33 int mv2_alltoall_num_ppn_conf = 1;
34 int *mv2_size_alltoall_tuning_table = NULL;
35 mv2_alltoall_tuning_table **mv2_alltoall_thresholds_table = NULL;
36
37
38 #define MPIR_Alltoall_bruck_MV2 smpi_coll_tuned_alltoall_bruck
39 #define MPIR_Alltoall_RD_MV2 smpi_coll_tuned_alltoall_rdb
40 #define MPIR_Alltoall_Scatter_dest_MV2 smpi_coll_tuned_alltoall_mvapich2_scatter_dest
41 #define MPIR_Alltoall_pairwise_MV2 smpi_coll_tuned_alltoall_pair
42 #define MPIR_Alltoall_inplace_MV2 smpi_coll_tuned_alltoall_ring 
43
44
45 static void init_mv2_alltoall_tables_stampede(){
46   int i;
47   int agg_table_sum = 0;
48   mv2_alltoall_tuning_table **table_ptrs = NULL;
49   mv2_alltoall_num_ppn_conf = 3;
50   mv2_alltoall_thresholds_table = xbt_malloc(sizeof(mv2_alltoall_tuning_table *)
51       * mv2_alltoall_num_ppn_conf);
52   table_ptrs = xbt_malloc(sizeof(mv2_alltoall_tuning_table *)
53       * mv2_alltoall_num_ppn_conf);
54   mv2_size_alltoall_tuning_table = xbt_malloc(sizeof(int) *
55       mv2_alltoall_num_ppn_conf);
56   mv2_alltoall_table_ppn_conf = xbt_malloc(mv2_alltoall_num_ppn_conf * sizeof(int));
57   mv2_alltoall_table_ppn_conf[0] = 1;
58   mv2_size_alltoall_tuning_table[0] = 6;
59   mv2_alltoall_tuning_table mv2_tmp_alltoall_thresholds_table_1ppn[] = {
60       {2,
61           1,
62           {{0, -1, &MPIR_Alltoall_pairwise_MV2},
63           },
64
65           {{0, -1, &MPIR_Alltoall_inplace_MV2},
66           },
67       },
68
69       {4,
70           2,
71           {{0, 262144, &MPIR_Alltoall_Scatter_dest_MV2},
72               {262144, -1, &MPIR_Alltoall_pairwise_MV2},
73           },
74
75           {{0, -1, &MPIR_Alltoall_inplace_MV2},
76           },
77       },
78
79       {8,
80           2,
81           {{0, 8, &MPIR_Alltoall_RD_MV2},
82               {8, -1, &MPIR_Alltoall_Scatter_dest_MV2},
83           },
84
85           {{0, -1, &MPIR_Alltoall_inplace_MV2},
86           },
87       },
88
89       {16,
90           3,
91           {{0, 64, &MPIR_Alltoall_RD_MV2},
92               {64, 512, &MPIR_Alltoall_bruck_MV2},
93               {512, -1, &MPIR_Alltoall_Scatter_dest_MV2},
94           },
95
96           {{0,-1, &MPIR_Alltoall_inplace_MV2},
97           },
98       },
99
100       {32,
101           3,
102           {{0, 32, &MPIR_Alltoall_RD_MV2},
103               {32, 2048, &MPIR_Alltoall_bruck_MV2},
104               {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
105           },
106
107           {{0, -1, &MPIR_Alltoall_inplace_MV2},
108           },
109       },
110
111       {64,
112           3,
113           {{0, 8, &MPIR_Alltoall_RD_MV2},
114               {8, 1024, &MPIR_Alltoall_bruck_MV2},
115               {1024, -1, &MPIR_Alltoall_Scatter_dest_MV2},
116           },
117
118           {{0, -1, &MPIR_Alltoall_inplace_MV2},
119           },
120       },
121   };
122   table_ptrs[0] = mv2_tmp_alltoall_thresholds_table_1ppn;
123   mv2_alltoall_table_ppn_conf[1] = 2;
124   mv2_size_alltoall_tuning_table[1] = 6;
125   mv2_alltoall_tuning_table mv2_tmp_alltoall_thresholds_table_2ppn[] = {
126       {4,
127           2,
128           {{0, 32, &MPIR_Alltoall_RD_MV2},
129               {32, -1, &MPIR_Alltoall_Scatter_dest_MV2},
130           },
131
132           {{0, -1, &MPIR_Alltoall_inplace_MV2},
133           },
134       },
135
136       {8,
137           2,
138           {{0, 64, &MPIR_Alltoall_RD_MV2},
139               {64, -1, &MPIR_Alltoall_Scatter_dest_MV2},
140           },
141
142           {{0, -1, &MPIR_Alltoall_inplace_MV2},
143           },
144       },
145
146       {16,
147           3,
148           {{0, 64, &MPIR_Alltoall_RD_MV2},
149               {64, 2048, &MPIR_Alltoall_bruck_MV2},
150               {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
151           },
152
153           {{0,-1, &MPIR_Alltoall_inplace_MV2},
154           },
155       },
156
157       {32,
158           3,
159           {{0, 16, &MPIR_Alltoall_RD_MV2},
160               {16, 2048, &MPIR_Alltoall_bruck_MV2},
161               {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
162           },
163
164           {{0, -1, &MPIR_Alltoall_inplace_MV2},
165           },
166       },
167
168       {64,
169           3,
170           {{0, 8, &MPIR_Alltoall_RD_MV2},
171               {8, 1024, &MPIR_Alltoall_bruck_MV2},
172               {1024, -1, &MPIR_Alltoall_Scatter_dest_MV2},
173           },
174
175           {{0, -1, &MPIR_Alltoall_inplace_MV2},
176           },
177       },
178
179       {128,
180           3,
181           {{0, 4, &MPIR_Alltoall_RD_MV2},
182               {4, 2048, &MPIR_Alltoall_bruck_MV2},
183               {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
184           },
185
186           {{0, -1, &MPIR_Alltoall_inplace_MV2},
187           },
188       },
189   };
190   table_ptrs[1] = mv2_tmp_alltoall_thresholds_table_2ppn;
191   mv2_alltoall_table_ppn_conf[2] = 16;
192   mv2_size_alltoall_tuning_table[2] = 7;
193   mv2_alltoall_tuning_table mv2_tmp_alltoall_thresholds_table_16ppn[] = {
194       {16,
195           2,
196           {{0, 2048, &MPIR_Alltoall_bruck_MV2},
197               {2048, -1,  &MPIR_Alltoall_Scatter_dest_MV2},
198           },
199
200           {{32768, -1, &MPIR_Alltoall_inplace_MV2},
201           },
202       },
203
204       {32,
205           2,
206           {{0, 2048, &MPIR_Alltoall_bruck_MV2},
207               {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
208           },
209
210           {{16384, -1, &MPIR_Alltoall_inplace_MV2},
211           },
212       },
213
214       {64,
215           3,
216           {{0, 2048, &MPIR_Alltoall_bruck_MV2},
217               {2048, 16384, &MPIR_Alltoall_Scatter_dest_MV2},
218               {16384, -1, &MPIR_Alltoall_pairwise_MV2},
219           },
220
221           {{32768, 131072, &MPIR_Alltoall_inplace_MV2},
222           },
223       },
224
225       {128,
226           2,
227           {{0, 2048, &MPIR_Alltoall_bruck_MV2},
228               {2048, -1, &MPIR_Alltoall_pairwise_MV2},
229           },
230
231           {{16384,65536, &MPIR_Alltoall_inplace_MV2},
232           },
233       },
234
235       {256,
236           2,
237           {{0, 1024, &MPIR_Alltoall_bruck_MV2},
238               {1024, -1, &MPIR_Alltoall_pairwise_MV2},
239           },
240
241           {{16384, 65536, &MPIR_Alltoall_inplace_MV2},
242           },
243       },
244
245       {512,
246           2,
247           {{0, 1024, &MPIR_Alltoall_bruck_MV2},
248               {1024, -1, &MPIR_Alltoall_pairwise_MV2},
249           },
250
251           {{16384, 65536, &MPIR_Alltoall_inplace_MV2},
252           },
253       },
254       {1024,
255           2,
256           {{0, 1024, &MPIR_Alltoall_bruck_MV2},
257               {1024, -1, &MPIR_Alltoall_pairwise_MV2},
258           },
259
260           {{16384, 65536, &MPIR_Alltoall_inplace_MV2},
261           },
262       },
263
264   };
265   table_ptrs[2] = mv2_tmp_alltoall_thresholds_table_16ppn;
266   agg_table_sum = 0;
267   for (i = 0; i < mv2_alltoall_num_ppn_conf; i++) {
268       agg_table_sum += mv2_size_alltoall_tuning_table[i];
269   }
270   mv2_alltoall_thresholds_table[0] =
271       xbt_malloc(agg_table_sum * sizeof (mv2_alltoall_tuning_table));
272   memcpy(mv2_alltoall_thresholds_table[0], table_ptrs[0],
273       (sizeof(mv2_alltoall_tuning_table)
274           * mv2_size_alltoall_tuning_table[0]));
275   for (i = 1; i < mv2_alltoall_num_ppn_conf; i++) {
276       mv2_alltoall_thresholds_table[i] =
277           mv2_alltoall_thresholds_table[i - 1]
278                                         + mv2_size_alltoall_tuning_table[i - 1];
279       memcpy(mv2_alltoall_thresholds_table[i], table_ptrs[i],
280           (sizeof(mv2_alltoall_tuning_table)
281               * mv2_size_alltoall_tuning_table[i]));
282   }
283   xbt_free(table_ptrs);
284
285
286 }
287
288
289 /************ Allgather variables and initializers                        */
290
291 typedef struct {
292   int min;
293   int max;
294   int (*MV2_pt_Allgather_function)(void *sendbuf,
295       int sendcount,
296       MPI_Datatype sendtype,
297       void *recvbuf,
298       int recvcount,
299       MPI_Datatype recvtype, MPI_Comm comm_ptr);
300 } mv2_allgather_tuning_element;
301
302 typedef struct {
303   int numproc;
304   int two_level[MV2_MAX_NB_THRESHOLDS];
305   int size_inter_table;
306   mv2_allgather_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
307 } mv2_allgather_tuning_table;
308
309 int (*MV2_Allgather_function)(void *sendbuf,
310     int sendcount,
311     MPI_Datatype sendtype,
312     void *recvbuf,
313     int recvcount,
314     MPI_Datatype recvtype, MPI_Comm comm);
315
316 int *mv2_allgather_table_ppn_conf = NULL;
317 int mv2_allgather_num_ppn_conf = 1;
318 int *mv2_size_allgather_tuning_table = NULL;
319 mv2_allgather_tuning_table **mv2_allgather_thresholds_table = NULL;
320
321 static int MPIR_Allgather_RD_Allgather_Comm_MV2( void *sendbuf,
322                                  int sendcount,
323                                  MPI_Datatype sendtype,
324                                  void *recvbuf,
325                                  int recvcount,
326                                  MPI_Datatype recvtype, MPI_Comm comm_ptr)
327 {
328     return 0;
329 }
330
/* MVAPICH2 allgather algorithm names mapped onto the SMPI implementations. */
#define MPIR_Allgather_Bruck_MV2 smpi_coll_tuned_allgather_bruck
#define MPIR_Allgather_RD_MV2    smpi_coll_tuned_allgather_rdb
#define MPIR_Allgather_Ring_MV2  smpi_coll_tuned_allgather_ring
#define MPIR_2lvl_Allgather_MV2  smpi_coll_tuned_allgather_mvapich2_smp
335
336 static void init_mv2_allgather_tables_stampede(){
337   int i;
338   int agg_table_sum = 0;
339   mv2_allgather_tuning_table **table_ptrs = NULL;
340   mv2_allgather_num_ppn_conf = 3;
341   mv2_allgather_thresholds_table
342   = xbt_malloc(sizeof(mv2_allgather_tuning_table *)
343       * mv2_allgather_num_ppn_conf);
344   table_ptrs = xbt_malloc(sizeof(mv2_allgather_tuning_table *)
345       * mv2_allgather_num_ppn_conf);
346   mv2_size_allgather_tuning_table = xbt_malloc(sizeof(int) *
347       mv2_allgather_num_ppn_conf);
348   mv2_allgather_table_ppn_conf
349   = xbt_malloc(mv2_allgather_num_ppn_conf * sizeof(int));
350   mv2_allgather_table_ppn_conf[0] = 1;
351   mv2_size_allgather_tuning_table[0] = 6;
352   mv2_allgather_tuning_table mv2_tmp_allgather_thresholds_table_1ppn[] = {
353       {
354           2,
355           {0},
356           1,
357           {
358               {0, -1, &MPIR_Allgather_Ring_MV2},
359           },
360       },
361       {
362           4,
363           {0,0},
364           2,
365           {
366               {0, 262144, &MPIR_Allgather_RD_MV2},
367               {262144, -1, &MPIR_Allgather_Ring_MV2},
368           },
369       },
370       {
371           8,
372           {0,0},
373           2,
374           {
375               {0, 131072, &MPIR_Allgather_RD_MV2},
376               {131072, -1, &MPIR_Allgather_Ring_MV2},
377           },
378       },
379       {
380           16,
381           {0,0},
382           2,
383           {
384               {0, 131072, &MPIR_Allgather_RD_MV2},
385               {131072, -1, &MPIR_Allgather_Ring_MV2},
386           },
387       },
388       {
389           32,
390           {0,0},
391           2,
392           {
393               {0, 65536, &MPIR_Allgather_RD_MV2},
394               {65536, -1, &MPIR_Allgather_Ring_MV2},
395           },
396       },
397       {
398           64,
399           {0,0},
400           2,
401           {
402               {0, 32768, &MPIR_Allgather_RD_MV2},
403               {32768, -1, &MPIR_Allgather_Ring_MV2},
404           },
405       },
406   };
407   table_ptrs[0] = mv2_tmp_allgather_thresholds_table_1ppn;
408   mv2_allgather_table_ppn_conf[1] = 2;
409   mv2_size_allgather_tuning_table[1] = 6;
410   mv2_allgather_tuning_table mv2_tmp_allgather_thresholds_table_2ppn[] = {
411       {
412           4,
413           {0,0},
414           2,
415           {
416               {0, 524288, &MPIR_Allgather_RD_MV2},
417               {524288, -1, &MPIR_Allgather_Ring_MV2},
418           },
419       },
420       {
421           8,
422           {0,1,0},
423           2,
424           {
425               {0, 32768, &MPIR_Allgather_RD_MV2},
426               {32768, 524288, &MPIR_Allgather_Ring_MV2},
427               {524288, -1, &MPIR_Allgather_Ring_MV2},
428           },
429       },
430       {
431           16,
432           {0,1,0},
433           2,
434           {
435               {0, 16384, &MPIR_Allgather_RD_MV2},
436               {16384, 524288, &MPIR_Allgather_Ring_MV2},
437               {524288, -1, &MPIR_Allgather_Ring_MV2},
438           },
439       },
440       {
441           32,
442           {1,1,0},
443           2,
444           {
445               {0, 65536, &MPIR_Allgather_RD_MV2},
446               {65536, 524288, &MPIR_Allgather_Ring_MV2},
447               {524288, -1, &MPIR_Allgather_Ring_MV2},
448           },
449       },
450       {
451           64,
452           {1,1,0},
453           2,
454           {
455               {0, 32768, &MPIR_Allgather_RD_MV2},
456               {32768, 524288, &MPIR_Allgather_Ring_MV2},
457               {524288, -1, &MPIR_Allgather_Ring_MV2},
458           },
459       },
460       {
461           128,
462           {1,1,0},
463           2,
464           {
465               {0, 65536, &MPIR_Allgather_RD_MV2},
466               {65536, 524288, &MPIR_Allgather_Ring_MV2},
467               {524288, -1, &MPIR_Allgather_Ring_MV2},
468           },
469       },
470   };
471   table_ptrs[1] = mv2_tmp_allgather_thresholds_table_2ppn;
472   mv2_allgather_table_ppn_conf[2] = 16;
473   mv2_size_allgather_tuning_table[2] = 6;
474   mv2_allgather_tuning_table mv2_tmp_allgather_thresholds_table_16ppn[] = {
475       {
476           16,
477           {0,0},
478           2,
479           {
480               {0, 1024, &MPIR_Allgather_RD_MV2},
481               {1024, -1, &MPIR_Allgather_Ring_MV2},
482           },
483       },
484       {
485           32,
486           {0,0},
487           2,
488           {
489               {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
490               {1024, -1, &MPIR_Allgather_Ring_MV2},
491           },
492       },
493       {
494           64,
495           {0,0},
496           2,
497           {
498               {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
499               {1024, -1, &MPIR_Allgather_Ring_MV2},
500           },
501       },
502       {
503           128,
504           {0,0},
505           2,
506           {
507               {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
508               {1024, -1, &MPIR_Allgather_Ring_MV2},
509           },
510       },
511       {
512           256,
513           {0,0},
514           2,
515           {
516               {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
517               {1024, -1, &MPIR_Allgather_Ring_MV2},
518           },
519       },
520       {
521           512,
522           {0,0},
523           2,
524           {
525               {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
526               {1024, -1, &MPIR_Allgather_Ring_MV2},
527           },
528       },
529
530   };
531   table_ptrs[2] = mv2_tmp_allgather_thresholds_table_16ppn;
532   agg_table_sum = 0;
533   for (i = 0; i < mv2_allgather_num_ppn_conf; i++) {
534       agg_table_sum += mv2_size_allgather_tuning_table[i];
535   }
536   mv2_allgather_thresholds_table[0] =
537       xbt_malloc(agg_table_sum * sizeof (mv2_allgather_tuning_table));
538   memcpy(mv2_allgather_thresholds_table[0], table_ptrs[0],
539       (sizeof(mv2_allgather_tuning_table)
540           * mv2_size_allgather_tuning_table[0]));
541   for (i = 1; i < mv2_allgather_num_ppn_conf; i++) {
542       mv2_allgather_thresholds_table[i] =
543           mv2_allgather_thresholds_table[i - 1]
544                                          + mv2_size_allgather_tuning_table[i - 1];
545       memcpy(mv2_allgather_thresholds_table[i], table_ptrs[i],
546           (sizeof(mv2_allgather_tuning_table)
547               * mv2_size_allgather_tuning_table[i]));
548   }
549   xbt_free(table_ptrs);
550 }
551
552
553 /************ Gather variables and initializers                        */
554
555 typedef struct {
556   int min;
557   int max;
558   int (*MV2_pt_Gather_function)(void *sendbuf, int sendcnt,
559       MPI_Datatype sendtype, void *recvbuf, int recvcnt,
560       MPI_Datatype recvtype, int root, MPI_Comm  comm_ptr);
561 } mv2_gather_tuning_element;
562
563
564 typedef struct {
565   int numproc;
566   int size_inter_table;
567   mv2_gather_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
568   int size_intra_table;
569   mv2_gather_tuning_element intra_node[MV2_MAX_NB_THRESHOLDS];
570 } mv2_gather_tuning_table;
571
572 int mv2_size_gather_tuning_table=7;
573 mv2_gather_tuning_table * mv2_gather_thresholds_table=NULL; 
574
575 typedef int (*MV2_Gather_function_ptr) (void *sendbuf,
576     int sendcnt,
577     MPI_Datatype sendtype,
578     void *recvbuf,
579     int recvcnt,
580     MPI_Datatype recvtype,
581     int root, MPI_Comm comm);
582
583 MV2_Gather_function_ptr MV2_Gather_inter_leader_function = NULL;
584 MV2_Gather_function_ptr MV2_Gather_intra_node_function = NULL;
585
586
587 #define MPIR_Gather_MV2_Direct smpi_coll_tuned_gather_ompi_basic_linear
588 #define MPIR_Gather_MV2_two_level_Direct smpi_coll_tuned_gather_mvapich2_two_level
589 #define MPIR_Gather_intra smpi_coll_tuned_gather_mpich
590
591
592 static void init_mv2_gather_tables_stampede(){
593
594   mv2_size_gather_tuning_table=7;
595   mv2_gather_thresholds_table = xbt_malloc(mv2_size_gather_tuning_table*
596       sizeof (mv2_gather_tuning_table));
597   mv2_gather_tuning_table mv2_tmp_gather_thresholds_table[]={
598       {16,
599           2,{{0, 524288, &MPIR_Gather_MV2_Direct},
600               {524288, -1, &MPIR_Gather_intra}},
601               1,{{0, -1, &MPIR_Gather_MV2_Direct}}},
602               {32,
603                   3,{{0, 16384, &MPIR_Gather_MV2_Direct},
604                       {16384, 131072, &MPIR_Gather_intra},
605                       {131072, -1, &MPIR_Gather_MV2_two_level_Direct}},
606                       1,{{0, -1, &MPIR_Gather_intra}}},
607                       {64,
608                           3,{{0, 256, &MPIR_Gather_MV2_two_level_Direct},
609                               {256, 16384, &MPIR_Gather_MV2_Direct},
610                               {256, -1, &MPIR_Gather_MV2_two_level_Direct}},
611                               1,{{0, -1, &MPIR_Gather_intra}}},
612                               {128,
613                                   3,{{0, 512, &MPIR_Gather_MV2_two_level_Direct},
614                                       {512, 16384, &MPIR_Gather_MV2_Direct},
615                                       {16384, -1, &MPIR_Gather_MV2_two_level_Direct}},
616                                       1,{{0, -1, &MPIR_Gather_intra}}},
617                                       {256,
618                                           3,{{0, 512, &MPIR_Gather_MV2_two_level_Direct},
619                                               {512, 16384, &MPIR_Gather_MV2_Direct},
620                                               {16384, -1, &MPIR_Gather_MV2_two_level_Direct}},
621                                               1,{{0, -1, &MPIR_Gather_intra}}},
622                                               {512,
623                                                   3,{{0, 512, &MPIR_Gather_MV2_two_level_Direct},
624                                                       {512, 16384, &MPIR_Gather_MV2_Direct},
625                                                       {8196, -1, &MPIR_Gather_MV2_two_level_Direct}},
626                                                       1,{{0, -1, &MPIR_Gather_intra}}},
627                                                       {1024,
628                                                           3,{{0, 512, &MPIR_Gather_MV2_two_level_Direct},
629                                                               {512, 16384, &MPIR_Gather_MV2_Direct},
630                                                               {8196, -1, &MPIR_Gather_MV2_two_level_Direct}},
631                                                               1,{{0, -1, &MPIR_Gather_intra}}},
632   };
633
634   memcpy(mv2_gather_thresholds_table, mv2_tmp_gather_thresholds_table,
635       mv2_size_gather_tuning_table * sizeof (mv2_gather_tuning_table));
636
637 }
638
639
640 /************ Allgatherv variables and initializers                        */
641
642 typedef struct {
643   int min;
644   int max;
645   int (*MV2_pt_Allgatherv_function)(void *sendbuf,
646       int sendcount,
647       MPI_Datatype sendtype,
648       void *recvbuf,
649       int *recvcounts,
650       int *displs,
651       MPI_Datatype recvtype,
652       MPI_Comm commg);
653 } mv2_allgatherv_tuning_element;
654
655 typedef struct {
656   int numproc;
657   int size_inter_table;
658   mv2_allgatherv_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
659 } mv2_allgatherv_tuning_table;
660
661 int (*MV2_Allgatherv_function)(void *sendbuf,
662     int sendcount,
663     MPI_Datatype sendtype,
664     void *recvbuf,
665     int *recvcounts,
666     int *displs,
667     MPI_Datatype recvtype,
668     MPI_Comm comm);
669
670 int mv2_size_allgatherv_tuning_table = 0;
671 mv2_allgatherv_tuning_table *mv2_allgatherv_thresholds_table = NULL;
672
673 #define MPIR_Allgatherv_Rec_Doubling_MV2 smpi_coll_tuned_allgatherv_mpich_rdb
674 #define MPIR_Allgatherv_Bruck_MV2 smpi_coll_tuned_allgatherv_ompi_bruck
675 #define MPIR_Allgatherv_Ring_MV2 smpi_coll_tuned_allgatherv_mpich_ring
676
677
678 static void init_mv2_allgatherv_tables_stampede(){
679   mv2_size_allgatherv_tuning_table = 6;
680   mv2_allgatherv_thresholds_table = xbt_malloc(mv2_size_allgatherv_tuning_table *
681       sizeof (mv2_allgatherv_tuning_table));
682   mv2_allgatherv_tuning_table mv2_tmp_allgatherv_thresholds_table[] = {
683       {
684           16,
685           2,
686           {
687               {0, 512, &MPIR_Allgatherv_Rec_Doubling_MV2},
688               {512, -1, &MPIR_Allgatherv_Ring_MV2},
689           },
690       },
691       {
692           32,
693           2,
694           {
695               {0, 512, &MPIR_Allgatherv_Rec_Doubling_MV2},
696               {512, -1, &MPIR_Allgatherv_Ring_MV2},
697           },
698       },
699       {
700           64,
701           2,
702           {
703               {0, 256, &MPIR_Allgatherv_Rec_Doubling_MV2},
704               {256, -1, &MPIR_Allgatherv_Ring_MV2},
705           },
706       },
707       {
708           128,
709           2,
710           {
711               {0, 256, &MPIR_Allgatherv_Rec_Doubling_MV2},
712               {256, -1, &MPIR_Allgatherv_Ring_MV2},
713           },
714       },
715       {
716           256,
717           2,
718           {
719               {0, 256, &MPIR_Allgatherv_Rec_Doubling_MV2},
720               {256, -1, &MPIR_Allgatherv_Ring_MV2},
721           },
722       },
723       {
724           512,
725           2,
726           {
727               {0, 256, &MPIR_Allgatherv_Rec_Doubling_MV2},
728               {256, -1, &MPIR_Allgatherv_Ring_MV2},
729           },
730       },
731
732   };
733   memcpy(mv2_allgatherv_thresholds_table, mv2_tmp_allgatherv_thresholds_table,
734       mv2_size_allgatherv_tuning_table * sizeof (mv2_allgatherv_tuning_table));
735 }
736
737
738 /************ Allreduce variables and initializers                        */
739
740 typedef struct {
741   int min;
742   int max;
743   int (*MV2_pt_Allreduce_function)(void *sendbuf,
744       void *recvbuf,
745       int count,
746       MPI_Datatype datatype,
747       MPI_Op op, MPI_Comm comm);
748 } mv2_allreduce_tuning_element;
749
750 typedef struct {
751   int numproc;
752   int mcast_enabled;
753   int is_two_level_allreduce[MV2_MAX_NB_THRESHOLDS];
754   int size_inter_table;
755   mv2_allreduce_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
756   int size_intra_table;
757   mv2_allreduce_tuning_element intra_node[MV2_MAX_NB_THRESHOLDS];
758 } mv2_allreduce_tuning_table;
759
760
761 int (*MV2_Allreduce_function)(void *sendbuf,
762     void *recvbuf,
763     int count,
764     MPI_Datatype datatype,
765     MPI_Op op, MPI_Comm comm)=NULL;
766
767
768 int (*MV2_Allreduce_intra_function)( void *sendbuf,
769     void *recvbuf,
770     int count,
771     MPI_Datatype datatype,
772     MPI_Op op, MPI_Comm comm)=NULL;
773
774 int mv2_size_allreduce_tuning_table = 0;
775 mv2_allreduce_tuning_table *mv2_allreduce_thresholds_table = NULL;
776
777
778
779
780
781 static int MPIR_Allreduce_mcst_reduce_two_level_helper_MV2( void *sendbuf,
782     void *recvbuf,
783     int count,
784     MPI_Datatype datatype,
785     MPI_Op op, MPI_Comm comm)
786
787   return 0;
788 }
789
790 static  int MPIR_Allreduce_mcst_reduce_redscat_gather_MV2( void *sendbuf,
791     void *recvbuf,
792     int count,
793     MPI_Datatype datatype,
794     MPI_Op op, MPI_Comm  comm)
795 {
796   return 0;
797 }
798
799 static  int MPIR_Allreduce_reduce_p2p_MV2( void *sendbuf,
800     void *recvbuf,
801     int count,
802     MPI_Datatype datatype,
803     MPI_Op op, MPI_Comm  comm)
804 {
805   mpi_coll_reduce_fun(sendbuf,recvbuf,count,datatype,op,0,comm);
806   return MPI_SUCCESS;
807 }
808
809 static  int MPIR_Allreduce_reduce_shmem_MV2( void *sendbuf,
810     void *recvbuf,
811     int count,
812     MPI_Datatype datatype,
813     MPI_Op op, MPI_Comm  comm)
814 {
815   mpi_coll_reduce_fun(sendbuf,recvbuf,count,datatype,op,0,comm);
816   return MPI_SUCCESS;
817 }
818
/* MVAPICH2 allreduce algorithm names mapped onto the SMPI implementations. */
#define MPIR_Allreduce_pt2pt_rd_MV2  smpi_coll_tuned_allreduce_rdb
#define MPIR_Allreduce_pt2pt_rs_MV2  smpi_coll_tuned_allreduce_mvapich2_rs
#define MPIR_Allreduce_two_level_MV2 smpi_coll_tuned_allreduce_mvapich2_two_level
822
823
824 static void init_mv2_allreduce_tables_stampede(){
825   mv2_size_allreduce_tuning_table = 8;
826   mv2_allreduce_thresholds_table = xbt_malloc(mv2_size_allreduce_tuning_table *
827       sizeof (mv2_allreduce_tuning_table));
828   mv2_allreduce_tuning_table mv2_tmp_allreduce_thresholds_table[] = {
829       {
830           16,
831           0,
832           {1, 0},
833           2,
834           {
835               {0, 1024, &MPIR_Allreduce_pt2pt_rd_MV2},
836               {1024, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
837           },
838           2,
839           {
840               {0, 1024, &MPIR_Allreduce_reduce_shmem_MV2},
841               {1024, -1, &MPIR_Allreduce_reduce_p2p_MV2},
842           },
843       },
844       {
845           32,
846           0,
847           {1, 1, 0},
848           3,
849           {
850               {0, 1024, &MPIR_Allreduce_pt2pt_rd_MV2},
851               {1024, 16384, &MPIR_Allreduce_pt2pt_rd_MV2},
852               {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
853           },
854           2,
855           {
856               {0, 1024, &MPIR_Allreduce_reduce_shmem_MV2},
857               {1024, 16384, &MPIR_Allreduce_reduce_p2p_MV2},
858           },
859       },
860       {
861           64,
862           0,
863           {1, 1, 0},
864           3,
865           {
866               {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2},
867               {512, 16384, &MPIR_Allreduce_pt2pt_rd_MV2},
868               {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
869           },
870           2,
871           {
872               {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
873               {512, 16384, &MPIR_Allreduce_reduce_p2p_MV2},
874           },
875       },
876       {
877           128,
878           0,
879           {1, 1, 0},
880           3,
881           {
882               {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2},
883               {512, 16384, &MPIR_Allreduce_pt2pt_rd_MV2},
884               {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
885           },
886           2,
887           {
888               {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
889               {512, 16384, &MPIR_Allreduce_reduce_p2p_MV2},
890           },
891       },
892       {
893           256,
894           0,
895           {1, 1, 0},
896           3,
897           {
898               {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2},
899               {512, 16384, &MPIR_Allreduce_pt2pt_rd_MV2},
900               {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
901           },
902           2,
903           {
904               {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
905               {512, -1, &MPIR_Allreduce_reduce_p2p_MV2},
906           },
907       },
908       {
909           512,
910           0,
911           {1, 1, 0},
912           3,
913           {
914               {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2},
915               {512, 16384, &MPIR_Allreduce_pt2pt_rd_MV2},
916               {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
917           },
918           2,
919           {
920               {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
921               {512, 16384, &MPIR_Allreduce_reduce_p2p_MV2},
922           },
923       },
924       {
925           1024,
926           0,
927           {1, 1, 1, 0},
928           4,
929           {
930               {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2},
931               {512, 8192, &MPIR_Allreduce_pt2pt_rd_MV2},
932               {8192, 65536, &MPIR_Allreduce_pt2pt_rs_MV2},
933               {65536, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
934           },
935           2,
936           {
937               {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
938               {512, -1, &MPIR_Allreduce_reduce_p2p_MV2},
939           },
940       },
941       {
942           2048,
943           0,
944           {1, 1, 1, 0},
945           4,
946           {
947               {0, 64, &MPIR_Allreduce_pt2pt_rd_MV2},
948               {64, 512, &MPIR_Allreduce_reduce_p2p_MV2},
949               {512, 4096, &MPIR_Allreduce_mcst_reduce_two_level_helper_MV2},
950               {4096, 16384, &MPIR_Allreduce_pt2pt_rs_MV2},
951               {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
952           },
953           2,
954           {
955               {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
956               {512, -1, &MPIR_Allreduce_reduce_p2p_MV2},
957           },
958       },
959
960   };
961   memcpy(mv2_allreduce_thresholds_table, mv2_tmp_allreduce_thresholds_table,
962       mv2_size_allreduce_tuning_table * sizeof (mv2_allreduce_tuning_table));
963 }
964
965
966 /*
967 Bcast is deactivated for now; it defaults to the MPICH one
968 typedef struct {
969     int min;
970     int max;
971     int (*MV2_pt_Bcast_function) (void *buf, int count, MPI_Datatype datatype,
972                                   int root, MPI_Comm comm_ptr);
973     int zcpy_pipelined_knomial_factor;
974 } mv2_bcast_tuning_element;
975
976 typedef struct {
977     int numproc;
978     int bcast_segment_size;
979     int intra_node_knomial_factor;
980     int inter_node_knomial_factor;
981     int is_two_level_bcast[MV2_MAX_NB_THRESHOLDS];
982     int size_inter_table;
983     mv2_bcast_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
984     int size_intra_table;
985     mv2_bcast_tuning_element intra_node[MV2_MAX_NB_THRESHOLDS];
986 } mv2_bcast_tuning_table;
987
988 int mv2_size_bcast_tuning_table = 0;
989 mv2_bcast_tuning_table *mv2_bcast_thresholds_table = NULL;
990
991
992 int (*MV2_Bcast_function) (void *buffer, int count, MPI_Datatype datatype,
993                            int root, MPI_Comm comm_ptr) = NULL;
994
995 int (*MV2_Bcast_intra_node_function) (void *buffer, int count, MPI_Datatype datatype,
996                                       int root, MPI_Comm comm_ptr) = NULL;
997
998
999  */
1000
1001
1002 /*
1003 static void init_mv2_bcast_tables_stampede(){
1004  //Stampede,
1005         mv2_size_bcast_tuning_table=8;
1006         mv2_bcast_thresholds_table = xbt_malloc(mv2_size_bcast_tuning_table *
1007                                                  sizeof (mv2_bcast_tuning_table));
1008
1009   mv2_bcast_tuning_table mv2_tmp_bcast_thresholds_table[]={
1010     {
1011             16,
1012             8192, 4, 4,
1013             {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
1014             11,
1015             {
1016               {0, 8, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1017               {8, 16, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1018               {16, 1024, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1019               {1024, 8192, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1020               {8192, 16384, &MPIR_Bcast_binomial_MV2, -1},
1021               {16384, 32768, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1022               {32768, 65536, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1023               {65536, 131072, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1},
1024               {131072, 262144, &MPIR_Bcast_scatter_ring_allgather_MV2, -1},
1025               {262144, 524288, &MPIR_Bcast_scatter_doubling_allgather_MV2, -1},
1026               {524288, -1, &MPIR_Bcast_scatter_ring_allgather_MV2, -1}
1027             },
1028             11,
1029             {
1030               {0, 8, &MPIR_Shmem_Bcast_MV2, 2},
1031               {8, 16, &MPIR_Shmem_Bcast_MV2, 4},
1032               {16, 1024, &MPIR_Shmem_Bcast_MV2, 2},
1033               {1024, 8192, &MPIR_Shmem_Bcast_MV2, 4},
1034               {8192, 16384, &MPIR_Shmem_Bcast_MV2, -1},
1035               {16384, 32768, &MPIR_Shmem_Bcast_MV2, 4},
1036               {32768, 65536, &MPIR_Shmem_Bcast_MV2, 2},
1037               {65536, 131072, &MPIR_Shmem_Bcast_MV2, -1},
1038               {131072, 262144, &MPIR_Shmem_Bcast_MV2, -1},
1039               {262144, 524288, &MPIR_Shmem_Bcast_MV2, -1},
1040               {524288, -1, &MPIR_Shmem_Bcast_MV2, -1}
1041             }
1042     },
1043     {
1044             32,
1045             8192, 4, 4,
1046             {1, 1, 1, 1, 1, 1, 1, 1},
1047             8,
1048             {
1049               {0, 128, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1050               {128, 256, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1051               {256, 32768, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1052               {32768, 65536, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1053               {65536, 131072, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1054               {131072, 262144, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1055               {262144, 524288, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1056               {524288, -1, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8}
1057             },
1058             8,
1059             {
1060               {0, 128, &MPIR_Shmem_Bcast_MV2, 2},
1061               {128, 256, &MPIR_Shmem_Bcast_MV2, 4},
1062               {256, 32768, &MPIR_Shmem_Bcast_MV2, 2},
1063               {32768, 65536, &MPIR_Shmem_Bcast_MV2, 4},
1064               {65536, 131072, &MPIR_Shmem_Bcast_MV2, 2},
1065               {131072, 262144, &MPIR_Shmem_Bcast_MV2, 8},
1066               {262144, 524288, &MPIR_Shmem_Bcast_MV2, 2},
1067               {524288, -1, &MPIR_Shmem_Bcast_MV2, 8}
1068             }
1069     },
1070     {
1071             64,
1072             8192, 4, 4,
1073             {1, 1, 1, 1, 1, 1, 1, 1, 1},
1074             9,
1075             {
1076               {0, 2, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1077               {2, 4, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1078               {4, 16, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1079               {16, 32, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1080               {32, 128, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1081               {128, 256, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1082               {256, 4096, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1083               {4096, 32768, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1084               {32768, -1, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2}
1085             },
1086             9,
1087             {
1088               {0, 2, &MPIR_Shmem_Bcast_MV2, 4},
1089               {2, 4, &MPIR_Shmem_Bcast_MV2, 8},
1090               {4, 16, &MPIR_Shmem_Bcast_MV2, 4},
1091               {16, 32, &MPIR_Shmem_Bcast_MV2, 8},
1092               {32, 128, &MPIR_Shmem_Bcast_MV2, 4},
1093               {128, 256, &MPIR_Shmem_Bcast_MV2, 8},
1094               {256, 4096, &MPIR_Shmem_Bcast_MV2, 4},
1095               {4096, 32768, &MPIR_Shmem_Bcast_MV2, 8},
1096               {32768, -1, &MPIR_Shmem_Bcast_MV2, 2}
1097             }
1098     },
1099     {
1100             128,
1101             8192, 4, 4,
1102             {1, 1, 1, 0},
1103             4,
1104             {
1105               {0, 8192, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1106               {8192, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1107               {16384, 524288, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1108               {524288, -1, &MPIR_Bcast_scatter_ring_allgather_MV2, -1}
1109             },
1110             4,
1111             {
1112               {0, 8192, &MPIR_Shmem_Bcast_MV2, 8},
1113               {8192, 16384, &MPIR_Shmem_Bcast_MV2, 4},
1114               {16384, 524288, &MPIR_Shmem_Bcast_MV2, 2},
1115               {524288, -1, NULL, -1}
1116             }
1117     },
1118     {
1119             256,
1120             8192, 4, 4,
1121             {1, 1, 1, 1, 1},
1122             5,
1123             {
1124               {0, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1125               {16384, 131072, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1126               {131072, 262144, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1},
1127               {262144, 524288, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1128               {524288, -1, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1}
1129             },
1130             5,
1131             {
1132               {0, 16384, &MPIR_Shmem_Bcast_MV2, 4},
1133               {16384, 131072, &MPIR_Shmem_Bcast_MV2, 2},
1134               {131072, 262144, &MPIR_Shmem_Bcast_MV2, -1},
1135               {262144, 524288, &MPIR_Shmem_Bcast_MV2, 2},
1136               {524288, -1, &MPIR_Shmem_Bcast_MV2, -1}
1137             }
1138     },
1139     {
1140             512,
1141             8192, 4, 4,
1142             {1, 1, 1, 1, 1},
1143             5,
1144             {
1145               {0, 4096, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1146               {4096, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1147               {16384, 131072, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1148               {131072, 262144, &MPIR_Pipelined_Bcast_MV2, -1},
1149               {262144, -1, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1}
1150             },
1151             5,
1152             {
1153               {0, 4096, &MPIR_Shmem_Bcast_MV2, 8},
1154               {4096, 16384, &MPIR_Shmem_Bcast_MV2, 4},
1155               {16384, 131072, &MPIR_Shmem_Bcast_MV2, 2},
1156               {131072, 262144, &MPIR_Shmem_Bcast_MV2, -1},
1157               {262144, -1, &MPIR_Shmem_Bcast_MV2, -1}
1158             }
1159     },
1160     {
1161             1024,
1162             8192, 4, 4,
1163             {1, 1, 1, 1, 1},
1164             5,
1165             {
1166               {0, 8192, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1167               {8192, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1168               {16384, 65536, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1169               {65536, 524288, &MPIR_Pipelined_Bcast_MV2, -1},
1170               {524288, -1, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1}
1171             },
1172             5,
1173             {
1174               {0, 8192, &MPIR_Shmem_Bcast_MV2, 8},
1175               {8192, 16384, &MPIR_Shmem_Bcast_MV2, 4},
1176               {16384, 65536, &MPIR_Shmem_Bcast_MV2, 2},
1177               {65536, 524288, &MPIR_Shmem_Bcast_MV2, -1},
1178               {524288, -1, &MPIR_Shmem_Bcast_MV2, -1}
1179             }
1180     },
1181     {
1182             2048,
1183             8192, 4, 4,
1184             {1, 1, 1, 1, 1, 1, 1},
1185             7,
1186             {
1187               {0, 16, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1188               {16, 32, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1189               {32, 4096, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1190               {4096, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1191               {16384, 32768, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1192               {32768, 524288, &MPIR_Pipelined_Bcast_MV2, -1},
1193               {524288, -1, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1}
1194             },
1195             7,
1196             {
1197               {0, 16, &MPIR_Shmem_Bcast_MV2, 8},
1198               {16, 32, &MPIR_Shmem_Bcast_MV2, 4},
1199               {32, 4096, &MPIR_Shmem_Bcast_MV2, 8},
1200               {4096, 16384, &MPIR_Shmem_Bcast_MV2, 4},
1201               {16384, 32768, &MPIR_Shmem_Bcast_MV2, 2},
1202               {32768, 524288, &MPIR_Shmem_Bcast_MV2, -1},
1203               {524288, -1, &MPIR_Shmem_Bcast_MV2, -1}
1204             }
1205     }
1206   };
1207
1208         memcpy(mv2_bcast_thresholds_table, mv2_tmp_bcast_thresholds_table,
1209                     mv2_size_bcast_tuning_table * sizeof (mv2_bcast_tuning_table));
1210 }*/
1211
1212
1213 /************ Reduce variables and initializers                        */
1214
/* One row of a reduce tuning table: a (min, max) pair plus the reduce
 * routine associated with that pair.
 * NOTE(review): min/max look like message-size thresholds, and every table
 * in this file uses -1 as the max of its last row (presumably "no upper
 * bound") -- confirm units and inclusiveness against the selector code. */
1215 typedef struct {
1216   int min;
1217   int max;
/* Routine stored for this row; same signature as MV2_Reduce_function. */
1218   int (*MV2_pt_Reduce_function)(void *sendbuf,
1219       void *recvbuf,
1220       int count,
1221       MPI_Datatype datatype,
1222       MPI_Op op,
1223       int root,
1224       MPI_Comm  comm_ptr);
1225 } mv2_reduce_tuning_element;
1226
/* Reduce tuning data for one process count (numproc).
 * In init_mv2_reduce_tables_stampede every entry sets size_inter_table and
 * size_intra_table to the number of rows actually filled in inter_leader and
 * intra_node, and provides one is_two_level_reduce flag per inter_leader row.
 * Each array has room for MV2_MAX_NB_THRESHOLDS rows. */
1227 typedef struct {
1228   int numproc;
1229   int inter_k_degree;
1230   int intra_k_degree;
1231   int is_two_level_reduce[MV2_MAX_NB_THRESHOLDS];
1232   int size_inter_table;
1233   mv2_reduce_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
1234   int size_intra_table;
1235   mv2_reduce_tuning_element intra_node[MV2_MAX_NB_THRESHOLDS];
1236 } mv2_reduce_tuning_table;
1237
/* Number of entries in mv2_reduce_thresholds_table and the table itself;
 * both are filled in by init_mv2_reduce_tables_stampede. */
1238 int mv2_size_reduce_tuning_table = 0;
1239 mv2_reduce_tuning_table *mv2_reduce_thresholds_table = NULL;
1240
1241
/* Default k-nomial factors, both 2.
 * NOTE(review): their consumers are not visible in this part of the file. */
1242 int mv2_reduce_intra_knomial_factor = 2;
1243 int mv2_reduce_inter_knomial_factor = 2;
1244
/* Currently selected reduce routine; NULL until something assigns it. */
1245 int (*MV2_Reduce_function)( void *sendbuf,
1246     void *recvbuf,
1247     int count,
1248     MPI_Datatype datatype,
1249     MPI_Op op,
1250     int root,
1251     MPI_Comm  comm_ptr)=NULL;
1252
/* Currently selected intra reduce routine; NULL until something assigns it. */
1253 int (*MV2_Reduce_intra_function)( void *sendbuf,
1254     void *recvbuf,
1255     int count,
1256     MPI_Datatype datatype,
1257     MPI_Op op,
1258     int root,
1259     MPI_Comm  comm_ptr)=NULL;
1260
1261
/* Aliases from the MVAPICH2 reduce algorithm names used in the tuning table
 * to smpi_coll_tuned_* identifiers. The inter and intra k-nomial wrappers
 * both resolve to the same smpi_coll_tuned_reduce_mvapich2_knomial, and the
 * "shmem" name resolves to smpi_coll_tuned_reduce_ompi_basic_linear. */
1262 #define MPIR_Reduce_inter_knomial_wrapper_MV2 smpi_coll_tuned_reduce_mvapich2_knomial
1263 #define MPIR_Reduce_intra_knomial_wrapper_MV2 smpi_coll_tuned_reduce_mvapich2_knomial
1264 #define MPIR_Reduce_binomial_MV2 smpi_coll_tuned_reduce_binomial
1265 #define MPIR_Reduce_redscat_gather_MV2 smpi_coll_tuned_reduce_scatter_gather
1266 #define MPIR_Reduce_shmem_MV2 smpi_coll_tuned_reduce_ompi_basic_linear
1267
1268
1269
/* Build the reduce tuning table.
 *
 * Sets mv2_size_reduce_tuning_table to 8, allocates that many
 * mv2_reduce_tuning_table entries with xbt_malloc, and copies into them a
 * local array holding one entry per process count: 16, 32, 64, 128, 256,
 * 512, 1024 and 2048.
 *
 * Initializers are positional and follow the field order of
 * mv2_reduce_tuning_table (annotated on the first entry below).
 *
 * The previous value of mv2_reduce_thresholds_table is overwritten without
 * being freed, and the xbt_malloc result is used unchecked.
 * NOTE(review): presumably xbt_malloc aborts on failure -- confirm. */
1270 static void init_mv2_reduce_tables_stampede(){
1271   /*Stampede*/
1272   mv2_size_reduce_tuning_table = 8;
1273   mv2_reduce_thresholds_table = xbt_malloc(mv2_size_reduce_tuning_table *
1274       sizeof (mv2_reduce_tuning_table));
1275   mv2_reduce_tuning_table mv2_tmp_reduce_thresholds_table[] = {
1276       {
1277           16, /* numproc */
1278           4, /* inter_k_degree */
1279           4, /* intra_k_degree */
1280           {1, 0, 0}, /* is_two_level_reduce, one flag per inter_leader row */
1281           3, /* size_inter_table */
1282           { /* inter_leader rows: {min, max, routine} */
1283               {0, 262144, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1284               {262144, 1048576, &MPIR_Reduce_binomial_MV2},
1285               {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
1286           },
1287           2, /* size_intra_table */
1288           { /* intra_node rows: {min, max, routine} */
1289               {0, 65536, &MPIR_Reduce_shmem_MV2},
1290               {65536,-1,  &MPIR_Reduce_binomial_MV2},
1291           },
1292       },
1293       {
1294           32,
1295           4,
1296           4,
1297           {1, 1, 1, 1, 0, 0, 0},
1298           7,
1299           {
1300               {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1301               {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1302               {16384, 32768, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1303               {32768, 65536, &MPIR_Reduce_binomial_MV2},
1304               {65536, 262144, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1305               {262144, 1048576, &MPIR_Reduce_binomial_MV2},
1306               {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
1307           },
1308           6,
1309           {
1310               {0, 8192, &MPIR_Reduce_shmem_MV2},
1311               {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1312               {16384, 32768, &MPIR_Reduce_shmem_MV2},
1313               {32768, 65536, &MPIR_Reduce_shmem_MV2},
1314               {65536, 262144, &MPIR_Reduce_shmem_MV2},
1315               {262144,-1,  &MPIR_Reduce_binomial_MV2},
1316           },
1317       },
1318       {
1319           64,
1320           4,
1321           4,
1322           {1, 1, 1, 1, 0},
1323           5,
1324           {
1325               {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1326               {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1327               {16384, 65536, &MPIR_Reduce_binomial_MV2},
1328               {65536, 262144, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1329               {262144, -1, &MPIR_Reduce_redscat_gather_MV2},
1330           },
1331           5,
1332           {
1333               {0, 8192, &MPIR_Reduce_shmem_MV2},
1334               {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1335               {16384, 65536, &MPIR_Reduce_shmem_MV2},
1336               {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1337               {262144, -1, &MPIR_Reduce_binomial_MV2},
1338           },
1339       },
1340       {
1341           128,
1342           4,
1343           4,
1344           {1, 0, 1, 0, 1, 0},
1345           6,
1346           {
1347               {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1348               {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1349               {16384, 65536, &MPIR_Reduce_binomial_MV2},
1350               {65536, 262144, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1351               {262144, 1048576, &MPIR_Reduce_binomial_MV2},
1352               {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
1353           },
1354           5,
1355           {
1356               {0, 8192, &MPIR_Reduce_shmem_MV2},
1357               {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1358               {16384, 65536, &MPIR_Reduce_shmem_MV2},
1359               {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1360               {262144, -1, &MPIR_Reduce_binomial_MV2},
1361           },
1362       },
1363       {
1364           256,
1365           4,
1366           4,
1367           {1, 1, 1, 0, 1, 1, 0},
1368           7,
1369           {
1370               {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1371               {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1372               {16384, 32768, &MPIR_Reduce_binomial_MV2},
1373               {32768, 65536, &MPIR_Reduce_binomial_MV2},
1374               {65536, 262144, &MPIR_Reduce_binomial_MV2},
1375               {262144, 1048576, &MPIR_Reduce_binomial_MV2},
1376               {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
1377           },
1378           6,
1379           {
1380               {0, 8192, &MPIR_Reduce_shmem_MV2},
1381               {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1382               {16384, 32768, &MPIR_Reduce_shmem_MV2},
1383               {32768, 65536, &MPIR_Reduce_shmem_MV2},
1384               {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1385               {262144, -1, &MPIR_Reduce_binomial_MV2},
1386           },
1387       },
1388       {
1389           512,
1390           4,
1391           4,
1392           {1, 0, 1, 1, 1, 0},
1393           6,
1394           {
1395               {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1396               {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1397               {16384, 65536, &MPIR_Reduce_binomial_MV2},
1398               {65536, 262144, &MPIR_Reduce_binomial_MV2},
1399               {262144, 1048576, &MPIR_Reduce_binomial_MV2},
1400               {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
1401           },
1402           5,
1403           {
1404               {0, 8192, &MPIR_Reduce_shmem_MV2},
1405               {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1406               {16384, 65536, &MPIR_Reduce_shmem_MV2},
1407               {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1408               {262144, -1, &MPIR_Reduce_binomial_MV2},
1409           },
1410       },
1411       {
1412           1024,
1413           4,
1414           4,
1415           {1, 0, 1, 1, 1},
1416           5,
1417           {
1418               {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1419               {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1420               {16384, 65536, &MPIR_Reduce_binomial_MV2},
1421               {65536, 262144, &MPIR_Reduce_binomial_MV2},
1422               {262144, -1, &MPIR_Reduce_binomial_MV2},
1423           },
1424           5,
1425           {
1426               {0, 8192, &MPIR_Reduce_shmem_MV2},
1427               {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1428               {16384, 65536, &MPIR_Reduce_shmem_MV2},
1429               {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1430               {262144, -1, &MPIR_Reduce_binomial_MV2},
1431           },
1432       },
1433       {
1434           2048,
1435           4,
1436           4,
1437           {1, 0, 1, 1, 1,1},
1438           6,
1439           {
1440               {0, 2048, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1441               {2048, 4096, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1442               {4096, 16384, &MPIR_Reduce_binomial_MV2},
1443               {16384, 65536, &MPIR_Reduce_binomial_MV2},
1444               {65536, 131072, &MPIR_Reduce_binomial_MV2},
1445               {131072, -1, &MPIR_Reduce_binomial_MV2},
1446           },
1447           6,
1448           {
1449               {0, 2048, &MPIR_Reduce_shmem_MV2},
1450               {2048, 4096, &MPIR_Reduce_shmem_MV2},
1451               {4096, 16384, &MPIR_Reduce_shmem_MV2},
1452               {16384, 65536, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1453               {65536, 131072, &MPIR_Reduce_binomial_MV2},
1454               {131072, -1, &MPIR_Reduce_shmem_MV2},
1455           },
1456       },
1457
1458   };
/* Copy the 8 entries of the local array into the heap-allocated table. */
1459   memcpy(mv2_reduce_thresholds_table, mv2_tmp_reduce_thresholds_table,
1460       mv2_size_reduce_tuning_table * sizeof (mv2_reduce_tuning_table));
1461 }
1462
1463 /************ Reduce scatter variables and initializers                        */
1464
/* One row of a reduce-scatter tuning table: a (min, max) pair plus the
 * reduce-scatter routine associated with that pair.
 * NOTE(review): min/max look like message-size thresholds with -1 meaning
 * "no upper bound" in the last row -- confirm against the selector code. */
1465 typedef struct {
1466   int min;
1467   int max;
/* Routine stored for this row; same signature as MV2_Red_scat_function. */
1468   int (*MV2_pt_Red_scat_function)(void *sendbuf,
1469       void *recvbuf,
1470       int *recvcnts,
1471       MPI_Datatype datatype,
1472       MPI_Op op,
1473       MPI_Comm comm_ptr);
1474 } mv2_red_scat_tuning_element;
1475
/* Reduce-scatter tuning data for one process count (numproc).
 * In init_mv2_reduce_scatter_tables_stampede, size_inter_table is set to the
 * number of rows filled in inter_leader. Unlike the reduce table, there is
 * no intra-node sub-table here. */
1476 typedef struct {
1477   int numproc;
1478   int size_inter_table;
1479   mv2_red_scat_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
1480 } mv2_red_scat_tuning_table;
1481
/* Number of entries in mv2_red_scat_thresholds_table and the table itself;
 * both are filled in by init_mv2_reduce_scatter_tables_stampede. */
1482 int mv2_size_red_scat_tuning_table = 0;
1483 mv2_red_scat_tuning_table *mv2_red_scat_thresholds_table = NULL;
1484
1485
/* Currently selected reduce-scatter routine. Unlike MV2_Reduce_function and
 * MV2_Scatter_function in this file, it carries no explicit "=NULL"
 * initializer. */
1486 int (*MV2_Red_scat_function)(void *sendbuf,
1487     void *recvbuf,
1488     int *recvcnts,
1489     MPI_Datatype datatype,
1490     MPI_Op op,
1491     MPI_Comm comm_ptr);
1492
1493
1494
1495 static  int MPIR_Reduce_Scatter_Basic_MV2(void *sendbuf,
1496     void *recvbuf,
1497     int *recvcnts,
1498     MPI_Datatype datatype,
1499     MPI_Op op,
1500     MPI_Comm comm)
1501 {
1502   smpi_mpi_reduce_scatter(sendbuf,recvbuf,recvcnts,datatype,op,comm);
1503   return MPI_SUCCESS;
1504 }
/* Aliases from the MVAPICH2 reduce-scatter algorithm names to
 * smpi_coll_tuned_reduce_scatter_* identifiers. */
1505 #define MPIR_Reduce_scatter_non_comm_MV2 smpi_coll_tuned_reduce_scatter_mpich_noncomm
1506 #define MPIR_Reduce_scatter_Rec_Halving_MV2 smpi_coll_tuned_reduce_scatter_ompi_basic_recursivehalving
1507 #define MPIR_Reduce_scatter_Pair_Wise_MV2 smpi_coll_tuned_reduce_scatter_mpich_pair
1508
1509
1510
1511
/* Build the reduce-scatter tuning table.
 *
 * Sets mv2_size_red_scat_tuning_table to 6, allocates that many
 * mv2_red_scat_tuning_table entries with xbt_malloc, and copies into them a
 * local array holding one entry per process count: 16, 32, 64, 128, 256
 * and 512.
 *
 * Initializers are positional and follow the field order of
 * mv2_red_scat_tuning_table (annotated on the first entry below).
 *
 * The previous value of mv2_red_scat_thresholds_table is overwritten without
 * being freed, and the xbt_malloc result is used unchecked.
 * NOTE(review): presumably xbt_malloc aborts on failure -- confirm. */
1512 static void init_mv2_reduce_scatter_tables_stampede(){
1513   mv2_size_red_scat_tuning_table = 6;
1514   mv2_red_scat_thresholds_table = xbt_malloc(mv2_size_red_scat_tuning_table *
1515       sizeof (mv2_red_scat_tuning_table));
1516   mv2_red_scat_tuning_table mv2_tmp_red_scat_thresholds_table[] = {
1517       {
1518           16, /* numproc */
1519           3, /* size_inter_table */
1520           { /* inter_leader rows: {min, max, routine} */
1521               {0, 64, &MPIR_Reduce_Scatter_Basic_MV2},
1522               {64, 65536, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1523               {65536, -1, &MPIR_Reduce_scatter_Pair_Wise_MV2},
1524           },
1525       },
1526       {
1527           32,
1528           3,
1529           {
1530               {0, 64, &MPIR_Reduce_Scatter_Basic_MV2},
1531               {64, 131072, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1532               {131072, -1, &MPIR_Reduce_scatter_Pair_Wise_MV2},
1533           },
1534       },
1535       {
1536           64,
1537           3,
1538           {
1539               {0, 1024, &MPIR_Reduce_Scatter_Basic_MV2},
1540               {1024, 262144, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1541               {262144, -1, &MPIR_Reduce_scatter_Pair_Wise_MV2},
1542           },
1543       },
1544       {
1545           128,
1546           2,
1547           {
1548               {0, 128, &MPIR_Reduce_Scatter_Basic_MV2},
1549               {128, -1, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1550           },
1551       },
1552       {
1553           256,
1554           2,
1555           {
1556               {0, 128, &MPIR_Reduce_Scatter_Basic_MV2},
1557               {128, -1, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1558           },
1559       },
1560       {
1561           512,
1562           2,
1563           {
1564               {0, 256, &MPIR_Reduce_Scatter_Basic_MV2},
1565               {256, -1, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1566           },
1567       },
1568
1569   };
/* Copy the 6 entries of the local array into the heap-allocated table. */
1570   memcpy(mv2_red_scat_thresholds_table, mv2_tmp_red_scat_thresholds_table,
1571       mv2_size_red_scat_tuning_table * sizeof (mv2_red_scat_tuning_table));
1572 }
1573
1574 /************ Scatter variables and initializers                        */
1575
/* One row of a scatter tuning table: a (min, max) pair plus the scatter
 * routine associated with that pair.
 * NOTE(review): min/max look like message-size thresholds with -1 meaning
 * "no upper bound" in the last row -- confirm against the selector code. */
1576 typedef struct {
1577   int min;
1578   int max;
/* Routine stored for this row; same signature as MV2_Scatter_function. */
1579   int (*MV2_pt_Scatter_function)(void *sendbuf,
1580       int sendcnt,
1581       MPI_Datatype sendtype,
1582       void *recvbuf,
1583       int recvcnt,
1584       MPI_Datatype recvtype,
1585       int root, MPI_Comm comm);
1586 } mv2_scatter_tuning_element;
1587
/* Scatter tuning data for one process count (numproc): a row count and row
 * array for inter_leader, followed by a row count and row array for
 * intra_node. Each array has room for MV2_MAX_NB_THRESHOLDS rows. */
1588 typedef struct {
1589   int numproc;
1590   int size_inter_table;
1591   mv2_scatter_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
1592   int size_intra_table;
1593   mv2_scatter_tuning_element intra_node[MV2_MAX_NB_THRESHOLDS];
1594 } mv2_scatter_tuning_table;
1595
1596
/* Scatter tuning state, indexed by processes-per-node (ppn) configuration.
 * init_mv2_scatter_tables_stampede sets mv2_scatter_num_ppn_conf to 3 and
 * allocates the three arrays below with that many slots; it stores the ppn
 * values 1, 2 and 16 in mv2_scatter_table_ppn_conf and a per-configuration
 * entry count in mv2_size_scatter_tuning_table. */
1597 int *mv2_scatter_table_ppn_conf = NULL;
1598 int mv2_scatter_num_ppn_conf = 1;
1599 int *mv2_size_scatter_tuning_table = NULL;
1600 mv2_scatter_tuning_table **mv2_scatter_thresholds_table = NULL;
1601
/* Currently selected scatter routine; NULL until something assigns it. */
1602 int (*MV2_Scatter_function) (void *sendbuf, int sendcount, MPI_Datatype sendtype,
1603     void *recvbuf, int recvcount, MPI_Datatype recvtype,
1604     int root, MPI_Comm comm)=NULL;
1605
/* Currently selected intra scatter routine; NULL until something assigns it. */
1606 int (*MV2_Scatter_intra_function) (void *sendbuf, int sendcount, MPI_Datatype sendtype,
1607     void *recvbuf, int recvcount, MPI_Datatype recvtype,
1608     int root, MPI_Comm comm)=NULL;
1609 int MPIR_Scatter_mcst_wrap_MV2(void *sendbuf,
1610     int sendcnt,
1611     MPI_Datatype sendtype,
1612     void *recvbuf,
1613     int recvcnt,
1614     MPI_Datatype recvtype,
1615     int root, MPI_Comm comm_ptr);
1616
1617 int MPIR_Scatter_mcst_wrap_MV2(void *sendbuf,
1618     int sendcnt,
1619     MPI_Datatype sendtype,
1620     void *recvbuf,
1621     int recvcnt,
1622     MPI_Datatype recvtype,
1623     int root, MPI_Comm comm_ptr)
1624 {
1625   return 0;
1626 }
1627
/* Aliases from the MVAPICH2 scatter algorithm names to smpi_coll_tuned_*
 * identifiers. The "two_level" names resolve to the same identifiers as
 * their plain counterparts (binomial and basic linear respectively). */
1628 #define MPIR_Scatter_MV2_Binomial smpi_coll_tuned_scatter_ompi_binomial
1629 #define MPIR_Scatter_MV2_Direct smpi_coll_tuned_scatter_ompi_basic_linear
1630 #define MPIR_Scatter_MV2_two_level_Binomial smpi_coll_tuned_scatter_ompi_binomial
1631 #define MPIR_Scatter_MV2_two_level_Direct smpi_coll_tuned_scatter_ompi_basic_linear
1632
1633
1634
1635
1636 static void init_mv2_scatter_tables_stampede(){
1637   {
1638     int agg_table_sum = 0;
1639     int i;
1640     mv2_scatter_tuning_table **table_ptrs = NULL;
1641     mv2_scatter_num_ppn_conf = 3;
1642     mv2_scatter_thresholds_table
1643     = xbt_malloc(sizeof(mv2_scatter_tuning_table *)
1644         * mv2_scatter_num_ppn_conf);
1645     table_ptrs = xbt_malloc(sizeof(mv2_scatter_tuning_table *)
1646         * mv2_scatter_num_ppn_conf);
1647     mv2_size_scatter_tuning_table = xbt_malloc(sizeof(int) *
1648         mv2_scatter_num_ppn_conf);
1649     mv2_scatter_table_ppn_conf
1650     = xbt_malloc(mv2_scatter_num_ppn_conf * sizeof(int));
1651     mv2_scatter_table_ppn_conf[0] = 1;
1652     mv2_size_scatter_tuning_table[0] = 6;
1653     mv2_scatter_tuning_table mv2_tmp_scatter_thresholds_table_1ppn[] = {
1654         {2,
1655             1,
1656             {
1657                 {0, -1, &MPIR_Scatter_MV2_Binomial},
1658             },
1659             1,
1660             {
1661                 {0, -1, &MPIR_Scatter_MV2_Binomial},
1662             },
1663         },
1664
1665         {4,
1666             1,
1667             {
1668                 {0, -1, &MPIR_Scatter_MV2_Direct},
1669             },
1670             1,
1671             {
1672                 {0, -1, &MPIR_Scatter_MV2_Direct},
1673             },
1674         },
1675
1676         {8,
1677             1,
1678             {
1679                 {0, -1, &MPIR_Scatter_MV2_Direct},
1680             },
1681             1,
1682             {
1683                 {0, -1, &MPIR_Scatter_MV2_Direct},
1684             },
1685         },
1686
1687         {16,
1688             1,
1689             {
1690                 {0, -1, &MPIR_Scatter_MV2_Direct},
1691             },
1692             1,
1693             {
1694                 {0, -1, &MPIR_Scatter_MV2_Direct},
1695             },
1696         },
1697
1698         {32,
1699             1,
1700             {
1701                 {0, -1, &MPIR_Scatter_MV2_Direct},
1702             },
1703             1,
1704             {
1705                 {0, -1, &MPIR_Scatter_MV2_Direct},
1706             },
1707         },
1708
1709         {64,
1710             2,
1711             {
1712                 {0, 32, &MPIR_Scatter_MV2_Binomial},
1713                 {32, -1, &MPIR_Scatter_MV2_Direct},
1714             },
1715             1,
1716             {
1717                 {0, -1, &MPIR_Scatter_MV2_Binomial},
1718             },
1719         },
1720     };
1721     table_ptrs[0] = mv2_tmp_scatter_thresholds_table_1ppn;
1722     mv2_scatter_table_ppn_conf[1] = 2;
1723     mv2_size_scatter_tuning_table[1] = 6;
1724     mv2_scatter_tuning_table mv2_tmp_scatter_thresholds_table_2ppn[] = {
1725         {4,
1726             2,
1727             {
1728                 {0, 4096, &MPIR_Scatter_MV2_Binomial},
1729                 {4096, -1, &MPIR_Scatter_MV2_Direct},
1730             },
1731             1,
1732             {
1733                 {0, -1, &MPIR_Scatter_MV2_Direct},
1734             },
1735         },
1736
1737         {8,
1738             2,
1739             {
1740                 {0, 512, &MPIR_Scatter_MV2_two_level_Direct},
1741                 {512, -1, &MPIR_Scatter_MV2_Direct},
1742             },
1743             1,
1744             {
1745                 {0, -1, &MPIR_Scatter_MV2_Binomial},
1746             },
1747         },
1748
1749         {16,
1750             2,
1751             {
1752                 {0, 2048, &MPIR_Scatter_MV2_two_level_Direct},
1753                 {2048, -1, &MPIR_Scatter_MV2_Direct},
1754             },
1755             1,
1756             {
1757                 {0, -1, &MPIR_Scatter_MV2_Binomial},
1758             },
1759         },
1760
1761         {32,
1762             2,
1763             {
1764                 {0, 2048, &MPIR_Scatter_MV2_two_level_Direct},
1765                 {2048, -1, &MPIR_Scatter_MV2_Direct},
1766             },
1767             1,
1768             {
1769                 {0, -1, &MPIR_Scatter_MV2_Binomial},
1770             },
1771         },
1772
1773         {64,
1774             2,
1775             {
1776                 {0, 8192, &MPIR_Scatter_MV2_two_level_Direct},
1777                 {8192, -1, &MPIR_Scatter_MV2_Direct},
1778             },
1779             1,
1780             {
1781                 {0, -1, &MPIR_Scatter_MV2_Binomial},
1782             },
1783         },
1784
1785         {128,
1786             4,
1787             {
1788                 {0, 16, &MPIR_Scatter_MV2_Binomial},
1789                 {16, 128, &MPIR_Scatter_MV2_two_level_Binomial},
1790                 {128, 16384, &MPIR_Scatter_MV2_two_level_Direct},
1791                 {16384, -1, &MPIR_Scatter_MV2_Direct},
1792             },
1793             1,
1794             {
1795                 {0, 128, &MPIR_Scatter_MV2_Direct},
1796                 {128, -1, &MPIR_Scatter_MV2_Binomial},
1797             },
1798         },
1799     };
1800     table_ptrs[1] = mv2_tmp_scatter_thresholds_table_2ppn;
1801     mv2_scatter_table_ppn_conf[2] = 16;
1802     mv2_size_scatter_tuning_table[2] = 8;
1803     mv2_scatter_tuning_table mv2_tmp_scatter_thresholds_table_16ppn[] = {
1804         {
1805             16,
1806             2,
1807             {
1808                 {0, 256, &MPIR_Scatter_MV2_Binomial},
1809                 {256, -1, &MPIR_Scatter_MV2_Direct},
1810             },
1811             1,
1812             {
1813                 { 0, -1, &MPIR_Scatter_MV2_Direct},
1814             },
1815         },
1816
1817         {
1818             32,
1819             2,
1820             {
1821                 {0, 512, &MPIR_Scatter_MV2_Binomial},
1822                 {512, -1, &MPIR_Scatter_MV2_Direct},
1823             },
1824             1,
1825             {
1826                 { 0, -1, &MPIR_Scatter_MV2_Direct},
1827             },
1828         },
1829
1830         {
1831             64,
1832             2,
1833             {
1834                 {0, 1024, &MPIR_Scatter_MV2_two_level_Direct},
1835                 {1024, -1, &MPIR_Scatter_MV2_Direct},
1836             },
1837             1,
1838             {
1839                 { 0, -1, &MPIR_Scatter_MV2_Direct},
1840             },
1841         },
1842
1843         {
1844             128,
1845             4,
1846             {
1847                 {0, 16, &MPIR_Scatter_mcst_wrap_MV2},
1848                 {0, 16, &MPIR_Scatter_MV2_two_level_Direct},
1849                 {16, 2048, &MPIR_Scatter_MV2_two_level_Direct},
1850                 {2048, -1, &MPIR_Scatter_MV2_Direct},
1851             },
1852             1,
1853             {
1854                 { 0, -1, &MPIR_Scatter_MV2_Direct},
1855             },
1856         },
1857
1858         {
1859             256,
1860             4,
1861             {
1862                 {0, 16, &MPIR_Scatter_mcst_wrap_MV2},
1863                 {0, 16, &MPIR_Scatter_MV2_two_level_Direct},
1864                 {16, 2048, &MPIR_Scatter_MV2_two_level_Direct},
1865                 {2048, -1,  &MPIR_Scatter_MV2_Direct},
1866             },
1867             1,
1868             {
1869                 { 0, -1, &MPIR_Scatter_MV2_Direct},
1870             },
1871         },
1872
1873         {
1874             512,
1875             4,
1876             {
1877                 {0, 16, &MPIR_Scatter_mcst_wrap_MV2},
1878                 {16, 16, &MPIR_Scatter_MV2_two_level_Direct},
1879                 {16, 4096, &MPIR_Scatter_MV2_two_level_Direct},
1880                 {4096, -1, &MPIR_Scatter_MV2_Direct},
1881             },
1882             1,
1883             {
1884                 { 0, -1, &MPIR_Scatter_MV2_Binomial},
1885             },
1886         },
1887         {
1888             1024,
1889             5,
1890             {
1891                 {0, 16, &MPIR_Scatter_mcst_wrap_MV2},
1892                 {0, 16,  &MPIR_Scatter_MV2_Binomial},
1893                 {16, 32, &MPIR_Scatter_MV2_Binomial},
1894                 {32, 4096, &MPIR_Scatter_MV2_two_level_Direct},
1895                 {4096, -1, &MPIR_Scatter_MV2_Direct},
1896             },
1897             1,
1898             {
1899                 { 0, -1, &MPIR_Scatter_MV2_Binomial},
1900             },
1901         },
1902         {
1903             2048,
1904             7,
1905             {
1906                 {0, 16, &MPIR_Scatter_mcst_wrap_MV2},
1907                 {0, 16,  &MPIR_Scatter_MV2_two_level_Binomial},
1908                 {16, 128, &MPIR_Scatter_MV2_two_level_Binomial},
1909                 {128, 1024, &MPIR_Scatter_MV2_two_level_Direct},
1910                 {1024, 16384, &MPIR_Scatter_MV2_two_level_Direct},
1911                 {16384, 65536, &MPIR_Scatter_MV2_Direct},
1912                 {65536, -1, &MPIR_Scatter_MV2_two_level_Direct},
1913             },
1914             6,
1915             {
1916                 {0, 16, &MPIR_Scatter_MV2_Binomial},
1917                 {16, 128, &MPIR_Scatter_MV2_Binomial},
1918                 {128, 1024, &MPIR_Scatter_MV2_Binomial},
1919                 {1024, 16384, &MPIR_Scatter_MV2_Direct},
1920                 {16384, 65536, &MPIR_Scatter_MV2_Direct},
1921                 {65536, -1, &MPIR_Scatter_MV2_Direct},
1922             },
1923         },
1924     };
1925     table_ptrs[2] = mv2_tmp_scatter_thresholds_table_16ppn;
1926     agg_table_sum = 0;
1927     for (i = 0; i < mv2_scatter_num_ppn_conf; i++) {
1928         agg_table_sum += mv2_size_scatter_tuning_table[i];
1929     }
1930     mv2_scatter_thresholds_table[0] =
1931         xbt_malloc(agg_table_sum * sizeof (mv2_scatter_tuning_table));
1932     memcpy(mv2_scatter_thresholds_table[0], table_ptrs[0],
1933         (sizeof(mv2_scatter_tuning_table)
1934             * mv2_size_scatter_tuning_table[0]));
1935     for (i = 1; i < mv2_scatter_num_ppn_conf; i++) {
1936         mv2_scatter_thresholds_table[i] =
1937             mv2_scatter_thresholds_table[i - 1]
1938                                          + mv2_size_scatter_tuning_table[i - 1];
1939         memcpy(mv2_scatter_thresholds_table[i], table_ptrs[i],
1940             (sizeof(mv2_scatter_tuning_table)
1941                 * mv2_size_scatter_tuning_table[i]));
1942     }
1943     xbt_free(table_ptrs);
1944   }
1945 }
1946