Logo AND Algorithmique Numérique Distribuée

Public GIT Repository
65796c2c1b0b6e8b24086bca37cf37f8d7218625
[simgrid.git] / src / smpi / colls / smpi_mvapich2_selector_stampede.h
1 /* selector for collective algorithms based on mvapich decision logic, with calibration from Stampede cluster at TACC*/
2
3 /* Copyright (c) 2009-2010, 2013-2014. The SimGrid Team.
4  * All rights reserved.                                                     */
5
6 /* This is the tuning used by MVAPICH for Stampede platform based on (MV2_ARCH_INTEL_XEON_E5_2680_16, MV2_HCA_MLX_CX_FDR) */
7
8
9
10 /************ Alltoall variables and initializers                        */
11
12 #define MV2_MAX_NB_THRESHOLDS  32
13
14
15 typedef struct {
16   int min;
17   int max;
18   int (*MV2_pt_Alltoall_function) (void *sendbuf, int sendcount, MPI_Datatype sendtype,
19       void *recvbuf, int recvcount, MPI_Datatype recvtype,
20       MPI_Comm comm_ptr );
21 } mv2_alltoall_tuning_element;
22
23 typedef struct {
24   int numproc;
25   int size_table;
26   mv2_alltoall_tuning_element algo_table[MV2_MAX_NB_THRESHOLDS];
27   mv2_alltoall_tuning_element in_place_algo_table[MV2_MAX_NB_THRESHOLDS];
28 } mv2_alltoall_tuning_table;
29
30 int (*MV2_Alltoall_function) (void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype, MPI_Comm comm_ptr)=NULL;
31
32 /* Indicates number of processes per node */
33 int *mv2_alltoall_table_ppn_conf = NULL;
34 /* Indicates total number of configurations */
35 int mv2_alltoall_num_ppn_conf = 1;
36 int *mv2_size_alltoall_tuning_table = NULL;
37 mv2_alltoall_tuning_table **mv2_alltoall_thresholds_table = NULL;
38
39
/* MVAPICH2 Alltoall algorithm names mapped onto the SMPI implementations
 * used in their place. */
#define MPIR_Alltoall_bruck_MV2         smpi_coll_tuned_alltoall_bruck
#define MPIR_Alltoall_RD_MV2            smpi_coll_tuned_alltoall_rdb
#define MPIR_Alltoall_Scatter_dest_MV2  smpi_coll_tuned_alltoall_mvapich2_scatter_dest
#define MPIR_Alltoall_pairwise_MV2      smpi_coll_tuned_alltoall_pair
#define MPIR_Alltoall_inplace_MV2       smpi_coll_tuned_alltoall_ring
45
46
47 static void init_mv2_alltoall_tables_stampede(){
48   int i;
49   int agg_table_sum = 0;
50   mv2_alltoall_tuning_table **table_ptrs = NULL;
51   mv2_alltoall_num_ppn_conf = 3;
52   mv2_alltoall_thresholds_table = xbt_malloc(sizeof(mv2_alltoall_tuning_table *)
53       * mv2_alltoall_num_ppn_conf);
54   table_ptrs = xbt_malloc(sizeof(mv2_alltoall_tuning_table *)
55       * mv2_alltoall_num_ppn_conf);
56   mv2_size_alltoall_tuning_table = xbt_malloc(sizeof(int) *
57       mv2_alltoall_num_ppn_conf);
58   mv2_alltoall_table_ppn_conf = xbt_malloc(mv2_alltoall_num_ppn_conf * sizeof(int));
59   mv2_alltoall_table_ppn_conf[0] = 1;
60   mv2_size_alltoall_tuning_table[0] = 6;
61   mv2_alltoall_tuning_table mv2_tmp_alltoall_thresholds_table_1ppn[] = {
62       {2,
63           1,
64           {{0, -1, &MPIR_Alltoall_pairwise_MV2},
65           },
66
67           {{0, -1, &MPIR_Alltoall_inplace_MV2},
68           },
69       },
70
71       {4,
72           2,
73           {{0, 262144, &MPIR_Alltoall_Scatter_dest_MV2},
74               {262144, -1, &MPIR_Alltoall_pairwise_MV2},
75           },
76
77           {{0, -1, &MPIR_Alltoall_inplace_MV2},
78           },
79       },
80
81       {8,
82           2,
83           {{0, 8, &MPIR_Alltoall_RD_MV2},
84               {8, -1, &MPIR_Alltoall_Scatter_dest_MV2},
85           },
86
87           {{0, -1, &MPIR_Alltoall_inplace_MV2},
88           },
89       },
90
91       {16,
92           3,
93           {{0, 64, &MPIR_Alltoall_RD_MV2},
94               {64, 512, &MPIR_Alltoall_bruck_MV2},
95               {512, -1, &MPIR_Alltoall_Scatter_dest_MV2},
96           },
97
98           {{0,-1, &MPIR_Alltoall_inplace_MV2},
99           },
100       },
101
102       {32,
103           3,
104           {{0, 32, &MPIR_Alltoall_RD_MV2},
105               {32, 2048, &MPIR_Alltoall_bruck_MV2},
106               {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
107           },
108
109           {{0, -1, &MPIR_Alltoall_inplace_MV2},
110           },
111       },
112
113       {64,
114           3,
115           {{0, 8, &MPIR_Alltoall_RD_MV2},
116               {8, 1024, &MPIR_Alltoall_bruck_MV2},
117               {1024, -1, &MPIR_Alltoall_Scatter_dest_MV2},
118           },
119
120           {{0, -1, &MPIR_Alltoall_inplace_MV2},
121           },
122       },
123   };
124   table_ptrs[0] = mv2_tmp_alltoall_thresholds_table_1ppn;
125   mv2_alltoall_table_ppn_conf[1] = 2;
126   mv2_size_alltoall_tuning_table[1] = 6;
127   mv2_alltoall_tuning_table mv2_tmp_alltoall_thresholds_table_2ppn[] = {
128       {4,
129           2,
130           {{0, 32, &MPIR_Alltoall_RD_MV2},
131               {32, -1, &MPIR_Alltoall_Scatter_dest_MV2},
132           },
133
134           {{0, -1, &MPIR_Alltoall_inplace_MV2},
135           },
136       },
137
138       {8,
139           2,
140           {{0, 64, &MPIR_Alltoall_RD_MV2},
141               {64, -1, &MPIR_Alltoall_Scatter_dest_MV2},
142           },
143
144           {{0, -1, &MPIR_Alltoall_inplace_MV2},
145           },
146       },
147
148       {16,
149           3,
150           {{0, 64, &MPIR_Alltoall_RD_MV2},
151               {64, 2048, &MPIR_Alltoall_bruck_MV2},
152               {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
153           },
154
155           {{0,-1, &MPIR_Alltoall_inplace_MV2},
156           },
157       },
158
159       {32,
160           3,
161           {{0, 16, &MPIR_Alltoall_RD_MV2},
162               {16, 2048, &MPIR_Alltoall_bruck_MV2},
163               {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
164           },
165
166           {{0, -1, &MPIR_Alltoall_inplace_MV2},
167           },
168       },
169
170       {64,
171           3,
172           {{0, 8, &MPIR_Alltoall_RD_MV2},
173               {8, 1024, &MPIR_Alltoall_bruck_MV2},
174               {1024, -1, &MPIR_Alltoall_Scatter_dest_MV2},
175           },
176
177           {{0, -1, &MPIR_Alltoall_inplace_MV2},
178           },
179       },
180
181       {128,
182           3,
183           {{0, 4, &MPIR_Alltoall_RD_MV2},
184               {4, 2048, &MPIR_Alltoall_bruck_MV2},
185               {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
186           },
187
188           {{0, -1, &MPIR_Alltoall_inplace_MV2},
189           },
190       },
191   };
192   table_ptrs[1] = mv2_tmp_alltoall_thresholds_table_2ppn;
193   mv2_alltoall_table_ppn_conf[2] = 16;
194   mv2_size_alltoall_tuning_table[2] = 7;
195   mv2_alltoall_tuning_table mv2_tmp_alltoall_thresholds_table_16ppn[] = {
196       {16,
197           2,
198           {{0, 2048, &MPIR_Alltoall_bruck_MV2},
199               {2048, -1,  &MPIR_Alltoall_Scatter_dest_MV2},
200           },
201
202           {{32768, -1, &MPIR_Alltoall_inplace_MV2},
203           },
204       },
205
206       {32,
207           2,
208           {{0, 2048, &MPIR_Alltoall_bruck_MV2},
209               {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
210           },
211
212           {{16384, -1, &MPIR_Alltoall_inplace_MV2},
213           },
214       },
215
216       {64,
217           3,
218           {{0, 2048, &MPIR_Alltoall_bruck_MV2},
219               {2048, 16384, &MPIR_Alltoall_Scatter_dest_MV2},
220               {16384, -1, &MPIR_Alltoall_pairwise_MV2},
221           },
222
223           {{32768, 131072, &MPIR_Alltoall_inplace_MV2},
224           },
225       },
226
227       {128,
228           2,
229           {{0, 2048, &MPIR_Alltoall_bruck_MV2},
230               {2048, -1, &MPIR_Alltoall_pairwise_MV2},
231           },
232
233           {{16384,65536, &MPIR_Alltoall_inplace_MV2},
234           },
235       },
236
237       {256,
238           2,
239           {{0, 1024, &MPIR_Alltoall_bruck_MV2},
240               {1024, -1, &MPIR_Alltoall_pairwise_MV2},
241           },
242
243           {{16384, 65536, &MPIR_Alltoall_inplace_MV2},
244           },
245       },
246
247       {512,
248           2,
249           {{0, 1024, &MPIR_Alltoall_bruck_MV2},
250               {1024, -1, &MPIR_Alltoall_pairwise_MV2},
251           },
252
253           {{16384, 65536, &MPIR_Alltoall_inplace_MV2},
254           },
255       },
256       {1024,
257           2,
258           {{0, 1024, &MPIR_Alltoall_bruck_MV2},
259               {1024, -1, &MPIR_Alltoall_pairwise_MV2},
260           },
261
262           {{16384, 65536, &MPIR_Alltoall_inplace_MV2},
263           },
264       },
265
266   };
267   table_ptrs[2] = mv2_tmp_alltoall_thresholds_table_16ppn;
268   agg_table_sum = 0;
269   for (i = 0; i < mv2_alltoall_num_ppn_conf; i++) {
270       agg_table_sum += mv2_size_alltoall_tuning_table[i];
271   }
272   mv2_alltoall_thresholds_table[0] =
273       xbt_malloc(agg_table_sum * sizeof (mv2_alltoall_tuning_table));
274   memcpy(mv2_alltoall_thresholds_table[0], table_ptrs[0],
275       (sizeof(mv2_alltoall_tuning_table)
276           * mv2_size_alltoall_tuning_table[0]));
277   for (i = 1; i < mv2_alltoall_num_ppn_conf; i++) {
278       mv2_alltoall_thresholds_table[i] =
279           mv2_alltoall_thresholds_table[i - 1]
280                                         + mv2_size_alltoall_tuning_table[i - 1];
281       memcpy(mv2_alltoall_thresholds_table[i], table_ptrs[i],
282           (sizeof(mv2_alltoall_tuning_table)
283               * mv2_size_alltoall_tuning_table[i]));
284   }
285   xbt_free(table_ptrs);
286
287
288 }
289
290
291 /************ Allgather variables and initializers                        */
292
293 typedef struct {
294   int min;
295   int max;
296   int (*MV2_pt_Allgather_function)(void *sendbuf,
297       int sendcount,
298       MPI_Datatype sendtype,
299       void *recvbuf,
300       int recvcount,
301       MPI_Datatype recvtype, MPI_Comm comm_ptr);
302 } mv2_allgather_tuning_element;
303
304 typedef struct {
305   int numproc;
306   int two_level[MV2_MAX_NB_THRESHOLDS];
307   int size_inter_table;
308   mv2_allgather_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
309 } mv2_allgather_tuning_table;
310
311 int (*MV2_Allgather_function)(void *sendbuf,
312     int sendcount,
313     MPI_Datatype sendtype,
314     void *recvbuf,
315     int recvcount,
316     MPI_Datatype recvtype, MPI_Comm comm);
317
318 int *mv2_allgather_table_ppn_conf = NULL;
319 int mv2_allgather_num_ppn_conf = 1;
320 int *mv2_size_allgather_tuning_table = NULL;
321 mv2_allgather_tuning_table **mv2_allgather_thresholds_table = NULL;
322
323 static int MPIR_Allgather_RD_Allgather_Comm_MV2( void *sendbuf,
324                                  int sendcount,
325                                  MPI_Datatype sendtype,
326                                  void *recvbuf,
327                                  int recvcount,
328                                  MPI_Datatype recvtype, MPI_Comm comm_ptr)
329 {
330     return 0;
331 }
332
/* MVAPICH2 Allgather algorithm names mapped onto the SMPI implementations
 * used in their place. */
#define MPIR_Allgather_Bruck_MV2  smpi_coll_tuned_allgather_bruck
#define MPIR_Allgather_RD_MV2     smpi_coll_tuned_allgather_rdb
#define MPIR_Allgather_Ring_MV2   smpi_coll_tuned_allgather_ring
#define MPIR_2lvl_Allgather_MV2   smpi_coll_tuned_allgather_mvapich2_smp
337
338 static void init_mv2_allgather_tables_stampede(){
339   int i;
340   int agg_table_sum = 0;
341   mv2_allgather_tuning_table **table_ptrs = NULL;
342   mv2_allgather_num_ppn_conf = 3;
343   mv2_allgather_thresholds_table
344   = xbt_malloc(sizeof(mv2_allgather_tuning_table *)
345       * mv2_allgather_num_ppn_conf);
346   table_ptrs = xbt_malloc(sizeof(mv2_allgather_tuning_table *)
347       * mv2_allgather_num_ppn_conf);
348   mv2_size_allgather_tuning_table = xbt_malloc(sizeof(int) *
349       mv2_allgather_num_ppn_conf);
350   mv2_allgather_table_ppn_conf
351   = xbt_malloc(mv2_allgather_num_ppn_conf * sizeof(int));
352   mv2_allgather_table_ppn_conf[0] = 1;
353   mv2_size_allgather_tuning_table[0] = 6;
354   mv2_allgather_tuning_table mv2_tmp_allgather_thresholds_table_1ppn[] = {
355       {
356           2,
357           {0},
358           1,
359           {
360               {0, -1, &MPIR_Allgather_Ring_MV2},
361           },
362       },
363       {
364           4,
365           {0,0},
366           2,
367           {
368               {0, 262144, &MPIR_Allgather_RD_MV2},
369               {262144, -1, &MPIR_Allgather_Ring_MV2},
370           },
371       },
372       {
373           8,
374           {0,0},
375           2,
376           {
377               {0, 131072, &MPIR_Allgather_RD_MV2},
378               {131072, -1, &MPIR_Allgather_Ring_MV2},
379           },
380       },
381       {
382           16,
383           {0,0},
384           2,
385           {
386               {0, 131072, &MPIR_Allgather_RD_MV2},
387               {131072, -1, &MPIR_Allgather_Ring_MV2},
388           },
389       },
390       {
391           32,
392           {0,0},
393           2,
394           {
395               {0, 65536, &MPIR_Allgather_RD_MV2},
396               {65536, -1, &MPIR_Allgather_Ring_MV2},
397           },
398       },
399       {
400           64,
401           {0,0},
402           2,
403           {
404               {0, 32768, &MPIR_Allgather_RD_MV2},
405               {32768, -1, &MPIR_Allgather_Ring_MV2},
406           },
407       },
408   };
409   table_ptrs[0] = mv2_tmp_allgather_thresholds_table_1ppn;
410   mv2_allgather_table_ppn_conf[1] = 2;
411   mv2_size_allgather_tuning_table[1] = 6;
412   mv2_allgather_tuning_table mv2_tmp_allgather_thresholds_table_2ppn[] = {
413       {
414           4,
415           {0,0},
416           2,
417           {
418               {0, 524288, &MPIR_Allgather_RD_MV2},
419               {524288, -1, &MPIR_Allgather_Ring_MV2},
420           },
421       },
422       {
423           8,
424           {0,1,0},
425           2,
426           {
427               {0, 32768, &MPIR_Allgather_RD_MV2},
428               {32768, 524288, &MPIR_Allgather_Ring_MV2},
429               {524288, -1, &MPIR_Allgather_Ring_MV2},
430           },
431       },
432       {
433           16,
434           {0,1,0},
435           2,
436           {
437               {0, 16384, &MPIR_Allgather_RD_MV2},
438               {16384, 524288, &MPIR_Allgather_Ring_MV2},
439               {524288, -1, &MPIR_Allgather_Ring_MV2},
440           },
441       },
442       {
443           32,
444           {1,1,0},
445           2,
446           {
447               {0, 65536, &MPIR_Allgather_RD_MV2},
448               {65536, 524288, &MPIR_Allgather_Ring_MV2},
449               {524288, -1, &MPIR_Allgather_Ring_MV2},
450           },
451       },
452       {
453           64,
454           {1,1,0},
455           2,
456           {
457               {0, 32768, &MPIR_Allgather_RD_MV2},
458               {32768, 524288, &MPIR_Allgather_Ring_MV2},
459               {524288, -1, &MPIR_Allgather_Ring_MV2},
460           },
461       },
462       {
463           128,
464           {1,1,0},
465           2,
466           {
467               {0, 65536, &MPIR_Allgather_RD_MV2},
468               {65536, 524288, &MPIR_Allgather_Ring_MV2},
469               {524288, -1, &MPIR_Allgather_Ring_MV2},
470           },
471       },
472   };
473   table_ptrs[1] = mv2_tmp_allgather_thresholds_table_2ppn;
474   mv2_allgather_table_ppn_conf[2] = 16;
475   mv2_size_allgather_tuning_table[2] = 6;
476   mv2_allgather_tuning_table mv2_tmp_allgather_thresholds_table_16ppn[] = {
477       {
478           16,
479           {0,0},
480           2,
481           {
482               {0, 1024, &MPIR_Allgather_RD_MV2},
483               {1024, -1, &MPIR_Allgather_Ring_MV2},
484           },
485       },
486       {
487           32,
488           {0,0},
489           2,
490           {
491               {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
492               {1024, -1, &MPIR_Allgather_Ring_MV2},
493           },
494       },
495       {
496           64,
497           {0,0},
498           2,
499           {
500               {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
501               {1024, -1, &MPIR_Allgather_Ring_MV2},
502           },
503       },
504       {
505           128,
506           {0,0},
507           2,
508           {
509               {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
510               {1024, -1, &MPIR_Allgather_Ring_MV2},
511           },
512       },
513       {
514           256,
515           {0,0},
516           2,
517           {
518               {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
519               {1024, -1, &MPIR_Allgather_Ring_MV2},
520           },
521       },
522       {
523           512,
524           {0,0},
525           2,
526           {
527               {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
528               {1024, -1, &MPIR_Allgather_Ring_MV2},
529           },
530       },
531
532   };
533   table_ptrs[2] = mv2_tmp_allgather_thresholds_table_16ppn;
534   agg_table_sum = 0;
535   for (i = 0; i < mv2_allgather_num_ppn_conf; i++) {
536       agg_table_sum += mv2_size_allgather_tuning_table[i];
537   }
538   mv2_allgather_thresholds_table[0] =
539       xbt_malloc(agg_table_sum * sizeof (mv2_allgather_tuning_table));
540   memcpy(mv2_allgather_thresholds_table[0], table_ptrs[0],
541       (sizeof(mv2_allgather_tuning_table)
542           * mv2_size_allgather_tuning_table[0]));
543   for (i = 1; i < mv2_allgather_num_ppn_conf; i++) {
544       mv2_allgather_thresholds_table[i] =
545           mv2_allgather_thresholds_table[i - 1]
546                                          + mv2_size_allgather_tuning_table[i - 1];
547       memcpy(mv2_allgather_thresholds_table[i], table_ptrs[i],
548           (sizeof(mv2_allgather_tuning_table)
549               * mv2_size_allgather_tuning_table[i]));
550   }
551   xbt_free(table_ptrs);
552 }
553
554
555 /************ Gather variables and initializers                        */
556
557 typedef struct {
558   int min;
559   int max;
560   int (*MV2_pt_Gather_function)(void *sendbuf, int sendcnt,
561       MPI_Datatype sendtype, void *recvbuf, int recvcnt,
562       MPI_Datatype recvtype, int root, MPI_Comm  comm_ptr);
563 } mv2_gather_tuning_element;
564
565
566 typedef struct {
567   int numproc;
568   int size_inter_table;
569   mv2_gather_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
570   int size_intra_table;
571   mv2_gather_tuning_element intra_node[MV2_MAX_NB_THRESHOLDS];
572 } mv2_gather_tuning_table;
573
574 int mv2_size_gather_tuning_table=7;
575 mv2_gather_tuning_table * mv2_gather_thresholds_table=NULL; 
576
577 typedef int (*MV2_Gather_function_ptr) (void *sendbuf,
578     int sendcnt,
579     MPI_Datatype sendtype,
580     void *recvbuf,
581     int recvcnt,
582     MPI_Datatype recvtype,
583     int root, MPI_Comm comm);
584
585 MV2_Gather_function_ptr MV2_Gather_inter_leader_function = NULL;
586 MV2_Gather_function_ptr MV2_Gather_intra_node_function = NULL;
587
588
/* MVAPICH2 Gather algorithm names mapped onto the SMPI implementations
 * used in their place. */
#define MPIR_Gather_MV2_Direct            smpi_coll_tuned_gather_ompi_basic_linear
#define MPIR_Gather_MV2_two_level_Direct  smpi_coll_tuned_gather_mvapich2_two_level
#define MPIR_Gather_intra                 smpi_coll_tuned_gather_mpich
592
593
594 static void init_mv2_gather_tables_stampede(){
595
596   mv2_size_gather_tuning_table=7;
597   mv2_gather_thresholds_table = xbt_malloc(mv2_size_gather_tuning_table*
598       sizeof (mv2_gather_tuning_table));
599   mv2_gather_tuning_table mv2_tmp_gather_thresholds_table[]={
600       {16,
601           2,{{0, 524288, &MPIR_Gather_MV2_Direct},
602               {524288, -1, &MPIR_Gather_intra}},
603               1,{{0, -1, &MPIR_Gather_MV2_Direct}}},
604               {32,
605                   3,{{0, 16384, &MPIR_Gather_MV2_Direct},
606                       {16384, 131072, &MPIR_Gather_intra},
607                       {131072, -1, &MPIR_Gather_MV2_two_level_Direct}},
608                       1,{{0, -1, &MPIR_Gather_intra}}},
609                       {64,
610                           3,{{0, 256, &MPIR_Gather_MV2_two_level_Direct},
611                               {256, 16384, &MPIR_Gather_MV2_Direct},
612                               {256, -1, &MPIR_Gather_MV2_two_level_Direct}},
613                               1,{{0, -1, &MPIR_Gather_intra}}},
614                               {128,
615                                   3,{{0, 512, &MPIR_Gather_MV2_two_level_Direct},
616                                       {512, 16384, &MPIR_Gather_MV2_Direct},
617                                       {16384, -1, &MPIR_Gather_MV2_two_level_Direct}},
618                                       1,{{0, -1, &MPIR_Gather_intra}}},
619                                       {256,
620                                           3,{{0, 512, &MPIR_Gather_MV2_two_level_Direct},
621                                               {512, 16384, &MPIR_Gather_MV2_Direct},
622                                               {16384, -1, &MPIR_Gather_MV2_two_level_Direct}},
623                                               1,{{0, -1, &MPIR_Gather_intra}}},
624                                               {512,
625                                                   3,{{0, 512, &MPIR_Gather_MV2_two_level_Direct},
626                                                       {512, 16384, &MPIR_Gather_MV2_Direct},
627                                                       {8196, -1, &MPIR_Gather_MV2_two_level_Direct}},
628                                                       1,{{0, -1, &MPIR_Gather_intra}}},
629                                                       {1024,
630                                                           3,{{0, 512, &MPIR_Gather_MV2_two_level_Direct},
631                                                               {512, 16384, &MPIR_Gather_MV2_Direct},
632                                                               {8196, -1, &MPIR_Gather_MV2_two_level_Direct}},
633                                                               1,{{0, -1, &MPIR_Gather_intra}}},
634   };
635
636   memcpy(mv2_gather_thresholds_table, mv2_tmp_gather_thresholds_table,
637       mv2_size_gather_tuning_table * sizeof (mv2_gather_tuning_table));
638
639 }
640
641
642 /************ Allgatherv variables and initializers                        */
643
644 typedef struct {
645   int min;
646   int max;
647   int (*MV2_pt_Allgatherv_function)(void *sendbuf,
648       int sendcount,
649       MPI_Datatype sendtype,
650       void *recvbuf,
651       int *recvcounts,
652       int *displs,
653       MPI_Datatype recvtype,
654       MPI_Comm commg);
655 } mv2_allgatherv_tuning_element;
656
657 typedef struct {
658   int numproc;
659   int size_inter_table;
660   mv2_allgatherv_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
661 } mv2_allgatherv_tuning_table;
662
663 int (*MV2_Allgatherv_function)(void *sendbuf,
664     int sendcount,
665     MPI_Datatype sendtype,
666     void *recvbuf,
667     int *recvcounts,
668     int *displs,
669     MPI_Datatype recvtype,
670     MPI_Comm comm);
671
672 int mv2_size_allgatherv_tuning_table = 0;
673 mv2_allgatherv_tuning_table *mv2_allgatherv_thresholds_table = NULL;
674
/* MVAPICH2 Allgatherv algorithm names mapped onto the SMPI implementations
 * used in their place. */
#define MPIR_Allgatherv_Rec_Doubling_MV2  smpi_coll_tuned_allgatherv_mpich_rdb
#define MPIR_Allgatherv_Bruck_MV2         smpi_coll_tuned_allgatherv_ompi_bruck
#define MPIR_Allgatherv_Ring_MV2          smpi_coll_tuned_allgatherv_mpich_ring
678
679
680 static void init_mv2_allgatherv_tables_stampede(){
681   mv2_size_allgatherv_tuning_table = 6;
682   mv2_allgatherv_thresholds_table = xbt_malloc(mv2_size_allgatherv_tuning_table *
683       sizeof (mv2_allgatherv_tuning_table));
684   mv2_allgatherv_tuning_table mv2_tmp_allgatherv_thresholds_table[] = {
685       {
686           16,
687           2,
688           {
689               {0, 512, &MPIR_Allgatherv_Rec_Doubling_MV2},
690               {512, -1, &MPIR_Allgatherv_Ring_MV2},
691           },
692       },
693       {
694           32,
695           2,
696           {
697               {0, 512, &MPIR_Allgatherv_Rec_Doubling_MV2},
698               {512, -1, &MPIR_Allgatherv_Ring_MV2},
699           },
700       },
701       {
702           64,
703           2,
704           {
705               {0, 256, &MPIR_Allgatherv_Rec_Doubling_MV2},
706               {256, -1, &MPIR_Allgatherv_Ring_MV2},
707           },
708       },
709       {
710           128,
711           2,
712           {
713               {0, 256, &MPIR_Allgatherv_Rec_Doubling_MV2},
714               {256, -1, &MPIR_Allgatherv_Ring_MV2},
715           },
716       },
717       {
718           256,
719           2,
720           {
721               {0, 256, &MPIR_Allgatherv_Rec_Doubling_MV2},
722               {256, -1, &MPIR_Allgatherv_Ring_MV2},
723           },
724       },
725       {
726           512,
727           2,
728           {
729               {0, 256, &MPIR_Allgatherv_Rec_Doubling_MV2},
730               {256, -1, &MPIR_Allgatherv_Ring_MV2},
731           },
732       },
733
734   };
735   memcpy(mv2_allgatherv_thresholds_table, mv2_tmp_allgatherv_thresholds_table,
736       mv2_size_allgatherv_tuning_table * sizeof (mv2_allgatherv_tuning_table));
737 }
738
739
740 /************ Allreduce variables and initializers                        */
741
742 typedef struct {
743   int min;
744   int max;
745   int (*MV2_pt_Allreduce_function)(void *sendbuf,
746       void *recvbuf,
747       int count,
748       MPI_Datatype datatype,
749       MPI_Op op, MPI_Comm comm);
750 } mv2_allreduce_tuning_element;
751
752 typedef struct {
753   int numproc;
754   int mcast_enabled;
755   int is_two_level_allreduce[MV2_MAX_NB_THRESHOLDS];
756   int size_inter_table;
757   mv2_allreduce_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
758   int size_intra_table;
759   mv2_allreduce_tuning_element intra_node[MV2_MAX_NB_THRESHOLDS];
760 } mv2_allreduce_tuning_table;
761
762
763 int (*MV2_Allreduce_function)(void *sendbuf,
764     void *recvbuf,
765     int count,
766     MPI_Datatype datatype,
767     MPI_Op op, MPI_Comm comm)=NULL;
768
769
770 int (*MV2_Allreduce_intra_function)( void *sendbuf,
771     void *recvbuf,
772     int count,
773     MPI_Datatype datatype,
774     MPI_Op op, MPI_Comm comm)=NULL;
775
776 int mv2_size_allreduce_tuning_table = 0;
777 mv2_allreduce_tuning_table *mv2_allreduce_thresholds_table = NULL;
778
779
780
781
782
783 static int MPIR_Allreduce_mcst_reduce_two_level_helper_MV2( void *sendbuf,
784     void *recvbuf,
785     int count,
786     MPI_Datatype datatype,
787     MPI_Op op, MPI_Comm comm)
788
789   return 0;
790 }
791
792 static  int MPIR_Allreduce_mcst_reduce_redscat_gather_MV2( void *sendbuf,
793     void *recvbuf,
794     int count,
795     MPI_Datatype datatype,
796     MPI_Op op, MPI_Comm  comm)
797 {
798   return 0;
799 }
800
801 static  int MPIR_Allreduce_reduce_p2p_MV2( void *sendbuf,
802     void *recvbuf,
803     int count,
804     MPI_Datatype datatype,
805     MPI_Op op, MPI_Comm  comm)
806 {
807   mpi_coll_reduce_fun(sendbuf,recvbuf,count,datatype,op,0,comm);
808   return MPI_SUCCESS;
809 }
810
811 static  int MPIR_Allreduce_reduce_shmem_MV2( void *sendbuf,
812     void *recvbuf,
813     int count,
814     MPI_Datatype datatype,
815     MPI_Op op, MPI_Comm  comm)
816 {
817   mpi_coll_reduce_fun(sendbuf,recvbuf,count,datatype,op,0,comm);
818   return MPI_SUCCESS;
819 }
820
/* MVAPICH2 Allreduce algorithm names mapped onto the SMPI implementations
 * used in their place. */
#define MPIR_Allreduce_pt2pt_rd_MV2   smpi_coll_tuned_allreduce_rdb
#define MPIR_Allreduce_pt2pt_rs_MV2   smpi_coll_tuned_allreduce_mvapich2_rs
#define MPIR_Allreduce_two_level_MV2  smpi_coll_tuned_allreduce_mvapich2_two_level
824
825
826 static void init_mv2_allreduce_tables_stampede(){
827   mv2_size_allreduce_tuning_table = 8;
828   mv2_allreduce_thresholds_table = xbt_malloc(mv2_size_allreduce_tuning_table *
829       sizeof (mv2_allreduce_tuning_table));
830   mv2_allreduce_tuning_table mv2_tmp_allreduce_thresholds_table[] = {
831       {
832           16,
833           0,
834           {1, 0},
835           2,
836           {
837               {0, 1024, &MPIR_Allreduce_pt2pt_rd_MV2},
838               {1024, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
839           },
840           2,
841           {
842               {0, 1024, &MPIR_Allreduce_reduce_shmem_MV2},
843               {1024, -1, &MPIR_Allreduce_reduce_p2p_MV2},
844           },
845       },
846       {
847           32,
848           0,
849           {1, 1, 0},
850           3,
851           {
852               {0, 1024, &MPIR_Allreduce_pt2pt_rd_MV2},
853               {1024, 16384, &MPIR_Allreduce_pt2pt_rd_MV2},
854               {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
855           },
856           2,
857           {
858               {0, 1024, &MPIR_Allreduce_reduce_shmem_MV2},
859               {1024, 16384, &MPIR_Allreduce_reduce_p2p_MV2},
860           },
861       },
862       {
863           64,
864           0,
865           {1, 1, 0},
866           3,
867           {
868               {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2},
869               {512, 16384, &MPIR_Allreduce_pt2pt_rd_MV2},
870               {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
871           },
872           2,
873           {
874               {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
875               {512, 16384, &MPIR_Allreduce_reduce_p2p_MV2},
876           },
877       },
878       {
879           128,
880           0,
881           {1, 1, 0},
882           3,
883           {
884               {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2},
885               {512, 16384, &MPIR_Allreduce_pt2pt_rd_MV2},
886               {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
887           },
888           2,
889           {
890               {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
891               {512, 16384, &MPIR_Allreduce_reduce_p2p_MV2},
892           },
893       },
894       {
895           256,
896           0,
897           {1, 1, 0},
898           3,
899           {
900               {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2},
901               {512, 16384, &MPIR_Allreduce_pt2pt_rd_MV2},
902               {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
903           },
904           2,
905           {
906               {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
907               {512, -1, &MPIR_Allreduce_reduce_p2p_MV2},
908           },
909       },
910       {
911           512,
912           0,
913           {1, 1, 0},
914           3,
915           {
916               {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2},
917               {512, 16384, &MPIR_Allreduce_pt2pt_rd_MV2},
918               {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
919           },
920           2,
921           {
922               {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
923               {512, 16384, &MPIR_Allreduce_reduce_p2p_MV2},
924           },
925       },
926       {
927           1024,
928           0,
929           {1, 1, 1, 0},
930           4,
931           {
932               {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2},
933               {512, 8192, &MPIR_Allreduce_pt2pt_rd_MV2},
934               {8192, 65536, &MPIR_Allreduce_pt2pt_rs_MV2},
935               {65536, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
936           },
937           2,
938           {
939               {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
940               {512, -1, &MPIR_Allreduce_reduce_p2p_MV2},
941           },
942       },
943       {
944           2048,
945           0,
946           {1, 1, 1, 0},
947           4,
948           {
949               {0, 64, &MPIR_Allreduce_pt2pt_rd_MV2},
950               {64, 512, &MPIR_Allreduce_reduce_p2p_MV2},
951               {512, 4096, &MPIR_Allreduce_mcst_reduce_two_level_helper_MV2},
952               {4096, 16384, &MPIR_Allreduce_pt2pt_rs_MV2},
953               {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
954           },
955           2,
956           {
957               {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
958               {512, -1, &MPIR_Allreduce_reduce_p2p_MV2},
959           },
960       },
961
962   };
963   memcpy(mv2_allreduce_thresholds_table, mv2_tmp_allreduce_thresholds_table,
964       mv2_size_allreduce_tuning_table * sizeof (mv2_allreduce_tuning_table));
965 }
966
967
968
969
/* One entry of a broadcast tuning table: a min/max range, the broadcast
 * routine chosen for that range, and a k-nomial factor.
 * NOTE(review): min/max look like message-size bounds, and max == -1 is used
 * on the last entry of every table below, presumably as "no upper limit" --
 * confirm against the selector code that walks these tables. */
970 typedef struct {
971     int min; /* lower bound of the range */
972     int max; /* upper bound of the range; -1 on the final entry of each table */
/* Broadcast implementation to call for this range. */
973     int (*MV2_pt_Bcast_function) (void *buf, int count, MPI_Datatype datatype,
974                                   int root, MPI_Comm comm_ptr);
975     int zcpy_pipelined_knomial_factor; /* set to -1 in entries that do not use the Zcpy/Shmem routines */
976 } mv2_bcast_tuning_element;
977
/* One row of the broadcast tuning table. The initializer in
 * init_mv2_bcast_tables_stampede() fills the fields in declaration order.
 * Each array holds at most MV2_MAX_NB_THRESHOLDS entries; only the first
 * size_inter_table / size_intra_table entries are explicitly initialized. */
978 typedef struct {
979     int numproc; /* key of the row (16, 32, ... 2048 in the Stampede data); presumably a process count -- confirm */
980     int bcast_segment_size;
981     int intra_node_knomial_factor;
982     int inter_node_knomial_factor;
983     int is_two_level_bcast[MV2_MAX_NB_THRESHOLDS]; /* one 0/1 flag per inter_leader entry */
984     int size_inter_table; /* number of initialized entries in inter_leader */
985     mv2_bcast_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
986     int size_intra_table; /* number of initialized entries in intra_node */
987     mv2_bcast_tuning_element intra_node[MV2_MAX_NB_THRESHOLDS];
988 } mv2_bcast_tuning_table;
989
/* Number of rows in mv2_bcast_thresholds_table; both are set by
 * init_mv2_bcast_tables_stampede(). */
990 int mv2_size_bcast_tuning_table = 0;
991 mv2_bcast_tuning_table *mv2_bcast_thresholds_table = NULL;
992
993
/* Currently selected broadcast routine (NULL until something assigns it). */
994 int (*MV2_Bcast_function) (void *buffer, int count, MPI_Datatype datatype,
995                            int root, MPI_Comm comm_ptr) = NULL;
996
/* Currently selected intra-node broadcast routine (NULL until assigned). */
997 int (*MV2_Bcast_intra_node_function) (void *buffer, int count, MPI_Datatype datatype,
998                                       int root, MPI_Comm comm_ptr) = NULL;
999
/* Default broadcast parameters. The segment size (8192) and the two k-nomial
 * factors (4, 4) are the same values every row of the Stampede table carries. */
1000 int zcpy_knomial_factor = 2;
1001 int mv2_pipelined_zcpy_knomial_factor = -1;
1002 int bcast_segment_size = 8192;
1003 int mv2_inter_node_knomial_factor = 4;
1004 int mv2_intra_node_knomial_factor = 4;
/* Broadcast thresholds used alongside the tuning table.
 * mv2_bcast_large_msg is parenthesized so that it expands as a single value
 * inside larger expressions (e.g. `x / mv2_bcast_large_msg`); without the
 * parentheses such an expression would be parsed as `(x / 512) * 1024`. */
#define mv2_bcast_two_level_system_size  64
#define mv2_bcast_short_msg             16384
#define mv2_bcast_large_msg            (512*1024)

/* Rank used as root for intra-node operations. */
#define INTRA_NODE_ROOT 0
1010
/* Aliases from the MVAPICH2 algorithm names used in the tuning table to the
 * smpi_coll_tuned_* routines they expand to. Several names share one target:
 * the Zcpy, Pipelined and Shmem names all expand to smpi_coll_tuned_bcast_mpich,
 * both scatter_ring_allgather names expand to the scatter_LR_allgather routine,
 * and both inter_node_helper names expand to the mvapich2_inter_node routine. */
1011 #define MPIR_Pipelined_Bcast_Zcpy_MV2 smpi_coll_tuned_bcast_mpich
1012 #define MPIR_Pipelined_Bcast_MV2 smpi_coll_tuned_bcast_mpich
1013 #define MPIR_Bcast_binomial_MV2 smpi_coll_tuned_bcast_binomial_tree
1014 #define MPIR_Bcast_scatter_ring_allgather_shm_MV2 smpi_coll_tuned_bcast_scatter_LR_allgather
1015 #define MPIR_Bcast_scatter_doubling_allgather_MV2 smpi_coll_tuned_bcast_scatter_rdb_allgather
1016 #define MPIR_Bcast_scatter_ring_allgather_MV2 smpi_coll_tuned_bcast_scatter_LR_allgather
1017 #define MPIR_Shmem_Bcast_MV2 smpi_coll_tuned_bcast_mpich
1018 #define MPIR_Bcast_tune_inter_node_helper_MV2 smpi_coll_tuned_bcast_mvapich2_inter_node
1019 #define MPIR_Bcast_inter_node_helper_MV2 smpi_coll_tuned_bcast_mvapich2_inter_node
1020 #define MPIR_Knomial_Bcast_intra_node_MV2 smpi_coll_tuned_bcast_mvapich2_knomial_intra_node
1021 #define MPIR_Bcast_intra_MV2 smpi_coll_tuned_bcast_mvapich2_intra_node
1022
/* Installs the Stampede broadcast calibration into the global tuning table.
 *
 * Sets mv2_size_bcast_tuning_table to 8, allocates that many
 * mv2_bcast_tuning_table rows with xbt_malloc, and copies them from the local
 * initializer below, which lists 8 rows (numproc 16, 32, ... 2048).
 * The hard-coded count and the number of rows in the initializer must be kept
 * in sync by hand: the final memcpy copies exactly
 * mv2_size_bcast_tuning_table rows.
 *
 * Every row follows the field order of mv2_bcast_tuning_table (annotated on
 * the first row only).
 *
 * NOTE(review): in the numproc == 128 row the last intra_node entry has a NULL
 * function pointer; its is_two_level_bcast flag is 0, so presumably it is
 * never called -- confirm in the selector. */
1023 static void init_mv2_bcast_tables_stampede(){
1024  //Stampede,
1025         mv2_size_bcast_tuning_table=8;
1026         mv2_bcast_thresholds_table = xbt_malloc(mv2_size_bcast_tuning_table *
1027                                                  sizeof (mv2_bcast_tuning_table));
1028
1029   mv2_bcast_tuning_table mv2_tmp_bcast_thresholds_table[]={
1030     {
1031             16, /* numproc */
1032             8192, 4, 4, /* bcast_segment_size, intra_node_knomial_factor, inter_node_knomial_factor */
1033             {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}, /* is_two_level_bcast */
1034             11, /* size_inter_table */
1035             { /* inter_leader: {min, max, function, zcpy_pipelined_knomial_factor} */
1036               {0, 8, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1037               {8, 16, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1038               {16, 1024, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1039               {1024, 8192, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1040               {8192, 16384, &MPIR_Bcast_binomial_MV2, -1},
1041               {16384, 32768, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1042               {32768, 65536, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1043               {65536, 131072, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1},
1044               {131072, 262144, &MPIR_Bcast_scatter_ring_allgather_MV2, -1},
1045               {262144, 524288, &MPIR_Bcast_scatter_doubling_allgather_MV2, -1},
1046               {524288, -1, &MPIR_Bcast_scatter_ring_allgather_MV2, -1}
1047             },
1048             11, /* size_intra_table */
1049             { /* intra_node */
1050               {0, 8, &MPIR_Shmem_Bcast_MV2, 2},
1051               {8, 16, &MPIR_Shmem_Bcast_MV2, 4},
1052               {16, 1024, &MPIR_Shmem_Bcast_MV2, 2},
1053               {1024, 8192, &MPIR_Shmem_Bcast_MV2, 4},
1054               {8192, 16384, &MPIR_Shmem_Bcast_MV2, -1},
1055               {16384, 32768, &MPIR_Shmem_Bcast_MV2, 4},
1056               {32768, 65536, &MPIR_Shmem_Bcast_MV2, 2},
1057               {65536, 131072, &MPIR_Shmem_Bcast_MV2, -1},
1058               {131072, 262144, &MPIR_Shmem_Bcast_MV2, -1},
1059               {262144, 524288, &MPIR_Shmem_Bcast_MV2, -1},
1060               {524288, -1, &MPIR_Shmem_Bcast_MV2, -1}
1061             }
1062     },
1063     {
1064             32,
1065             8192, 4, 4,
1066             {1, 1, 1, 1, 1, 1, 1, 1},
1067             8,
1068             {
1069               {0, 128, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1070               {128, 256, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1071               {256, 32768, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1072               {32768, 65536, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1073               {65536, 131072, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1074               {131072, 262144, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1075               {262144, 524288, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1076               {524288, -1, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8}
1077             },
1078             8,
1079             {
1080               {0, 128, &MPIR_Shmem_Bcast_MV2, 2},
1081               {128, 256, &MPIR_Shmem_Bcast_MV2, 4},
1082               {256, 32768, &MPIR_Shmem_Bcast_MV2, 2},
1083               {32768, 65536, &MPIR_Shmem_Bcast_MV2, 4},
1084               {65536, 131072, &MPIR_Shmem_Bcast_MV2, 2},
1085               {131072, 262144, &MPIR_Shmem_Bcast_MV2, 8},
1086               {262144, 524288, &MPIR_Shmem_Bcast_MV2, 2},
1087               {524288, -1, &MPIR_Shmem_Bcast_MV2, 8}
1088             }
1089     },
1090     {
1091             64,
1092             8192, 4, 4,
1093             {1, 1, 1, 1, 1, 1, 1, 1, 1},
1094             9,
1095             {
1096               {0, 2, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1097               {2, 4, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1098               {4, 16, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1099               {16, 32, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1100               {32, 128, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1101               {128, 256, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1102               {256, 4096, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1103               {4096, 32768, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1104               {32768, -1, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2}
1105             },
1106             9,
1107             {
1108               {0, 2, &MPIR_Shmem_Bcast_MV2, 4},
1109               {2, 4, &MPIR_Shmem_Bcast_MV2, 8},
1110               {4, 16, &MPIR_Shmem_Bcast_MV2, 4},
1111               {16, 32, &MPIR_Shmem_Bcast_MV2, 8},
1112               {32, 128, &MPIR_Shmem_Bcast_MV2, 4},
1113               {128, 256, &MPIR_Shmem_Bcast_MV2, 8},
1114               {256, 4096, &MPIR_Shmem_Bcast_MV2, 4},
1115               {4096, 32768, &MPIR_Shmem_Bcast_MV2, 8},
1116               {32768, -1, &MPIR_Shmem_Bcast_MV2, 2}
1117             }
1118     },
1119     {
1120             128,
1121             8192, 4, 4,
1122             {1, 1, 1, 0},
1123             4,
1124             {
1125               {0, 8192, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1126               {8192, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1127               {16384, 524288, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1128               {524288, -1, &MPIR_Bcast_scatter_ring_allgather_MV2, -1}
1129             },
1130             4,
1131             {
1132               {0, 8192, &MPIR_Shmem_Bcast_MV2, 8},
1133               {8192, 16384, &MPIR_Shmem_Bcast_MV2, 4},
1134               {16384, 524288, &MPIR_Shmem_Bcast_MV2, 2},
1135               {524288, -1, NULL, -1}
1136             }
1137     },
1138     {
1139             256,
1140             8192, 4, 4,
1141             {1, 1, 1, 1, 1},
1142             5,
1143             {
1144               {0, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1145               {16384, 131072, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1146               {131072, 262144, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1},
1147               {262144, 524288, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1148               {524288, -1, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1}
1149             },
1150             5,
1151             {
1152               {0, 16384, &MPIR_Shmem_Bcast_MV2, 4},
1153               {16384, 131072, &MPIR_Shmem_Bcast_MV2, 2},
1154               {131072, 262144, &MPIR_Shmem_Bcast_MV2, -1},
1155               {262144, 524288, &MPIR_Shmem_Bcast_MV2, 2},
1156               {524288, -1, &MPIR_Shmem_Bcast_MV2, -1}
1157             }
1158     },
1159     {
1160             512,
1161             8192, 4, 4,
1162             {1, 1, 1, 1, 1},
1163             5,
1164             {
1165               {0, 4096, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1166               {4096, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1167               {16384, 131072, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1168               {131072, 262144, &MPIR_Pipelined_Bcast_MV2, -1},
1169               {262144, -1, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1}
1170             },
1171             5,
1172             {
1173               {0, 4096, &MPIR_Shmem_Bcast_MV2, 8},
1174               {4096, 16384, &MPIR_Shmem_Bcast_MV2, 4},
1175               {16384, 131072, &MPIR_Shmem_Bcast_MV2, 2},
1176               {131072, 262144, &MPIR_Shmem_Bcast_MV2, -1},
1177               {262144, -1, &MPIR_Shmem_Bcast_MV2, -1}
1178             }
1179     },
1180     {
1181             1024,
1182             8192, 4, 4,
1183             {1, 1, 1, 1, 1},
1184             5,
1185             {
1186               {0, 8192, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1187               {8192, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1188               {16384, 65536, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1189               {65536, 524288, &MPIR_Pipelined_Bcast_MV2, -1},
1190               {524288, -1, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1}
1191             },
1192             5,
1193             {
1194               {0, 8192, &MPIR_Shmem_Bcast_MV2, 8},
1195               {8192, 16384, &MPIR_Shmem_Bcast_MV2, 4},
1196               {16384, 65536, &MPIR_Shmem_Bcast_MV2, 2},
1197               {65536, 524288, &MPIR_Shmem_Bcast_MV2, -1},
1198               {524288, -1, &MPIR_Shmem_Bcast_MV2, -1}
1199             }
1200     },
1201     {
1202             2048,
1203             8192, 4, 4,
1204             {1, 1, 1, 1, 1, 1, 1},
1205             7,
1206             {
1207               {0, 16, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1208               {16, 32, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1209               {32, 4096, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1210               {4096, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1211               {16384, 32768, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1212               {32768, 524288, &MPIR_Pipelined_Bcast_MV2, -1},
1213               {524288, -1, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1}
1214             },
1215             7,
1216             {
1217               {0, 16, &MPIR_Shmem_Bcast_MV2, 8},
1218               {16, 32, &MPIR_Shmem_Bcast_MV2, 4},
1219               {32, 4096, &MPIR_Shmem_Bcast_MV2, 8},
1220               {4096, 16384, &MPIR_Shmem_Bcast_MV2, 4},
1221               {16384, 32768, &MPIR_Shmem_Bcast_MV2, 2},
1222               {32768, 524288, &MPIR_Shmem_Bcast_MV2, -1},
1223               {524288, -1, &MPIR_Shmem_Bcast_MV2, -1}
1224             }
1225     }
1226   };
1227
/* Publish the calibration: copy all rows into the heap-allocated global table. */
1228         memcpy(mv2_bcast_thresholds_table, mv2_tmp_bcast_thresholds_table,
1229                     mv2_size_bcast_tuning_table * sizeof (mv2_bcast_tuning_table));
1230 }
1231
1232
1233 /************ Reduce variables and initializers                        */
1234
/* One entry of a reduce tuning table: a min/max range and the reduce routine
 * chosen for it.
 * NOTE(review): min/max look like message-size bounds, with max == -1 on the
 * last entry of each table presumably meaning "no upper limit" -- confirm in
 * the selector. */
1235 typedef struct {
1236   int min; /* lower bound of the range */
1237   int max; /* upper bound of the range; -1 on the final entry of each table */
/* Reduce implementation to call for this range. */
1238   int (*MV2_pt_Reduce_function)(void *sendbuf,
1239       void *recvbuf,
1240       int count,
1241       MPI_Datatype datatype,
1242       MPI_Op op,
1243       int root,
1244       MPI_Comm  comm_ptr);
1245 } mv2_reduce_tuning_element;
1246
/* One row of the reduce tuning table. The initializer in
 * init_mv2_reduce_tables_stampede() fills the fields in declaration order.
 * Each array holds at most MV2_MAX_NB_THRESHOLDS entries; only the first
 * size_inter_table / size_intra_table entries are explicitly initialized. */
1247 typedef struct {
1248   int numproc; /* key of the row (16 ... 2048 in the Stampede data); presumably a process count -- confirm */
1249   int inter_k_degree;
1250   int intra_k_degree;
1251   int is_two_level_reduce[MV2_MAX_NB_THRESHOLDS]; /* one 0/1 flag per inter_leader entry */
1252   int size_inter_table; /* number of initialized entries in inter_leader */
1253   mv2_reduce_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
1254   int size_intra_table; /* number of initialized entries in intra_node */
1255   mv2_reduce_tuning_element intra_node[MV2_MAX_NB_THRESHOLDS];
1256 } mv2_reduce_tuning_table;
1257
/* Number of rows in mv2_reduce_thresholds_table; both are set by
 * init_mv2_reduce_tables_stampede(). */
1258 int mv2_size_reduce_tuning_table = 0;
1259 mv2_reduce_tuning_table *mv2_reduce_thresholds_table = NULL;
1260
1261
/* Default k-nomial factors for reduce. */
1262 int mv2_reduce_intra_knomial_factor = 2;
1263 int mv2_reduce_inter_knomial_factor = 2;
1264
/* Currently selected reduce routine (NULL until something assigns it). */
1265 int (*MV2_Reduce_function)( void *sendbuf,
1266     void *recvbuf,
1267     int count,
1268     MPI_Datatype datatype,
1269     MPI_Op op,
1270     int root,
1271     MPI_Comm  comm_ptr)=NULL;
1272
/* Currently selected intra-node reduce routine (NULL until assigned). */
1273 int (*MV2_Reduce_intra_function)( void *sendbuf,
1274     void *recvbuf,
1275     int count,
1276     MPI_Datatype datatype,
1277     MPI_Op op,
1278     int root,
1279     MPI_Comm  comm_ptr)=NULL;
1280
1281
/* Aliases from the MVAPICH2 reduce algorithm names used in the tuning table to
 * the smpi_coll_tuned_* routines they expand to. The inter and intra knomial
 * wrappers both expand to the same routine. */
1282 #define MPIR_Reduce_inter_knomial_wrapper_MV2 smpi_coll_tuned_reduce_mvapich2_knomial
1283 #define MPIR_Reduce_intra_knomial_wrapper_MV2 smpi_coll_tuned_reduce_mvapich2_knomial
1284 #define MPIR_Reduce_binomial_MV2 smpi_coll_tuned_reduce_binomial
1285 #define MPIR_Reduce_redscat_gather_MV2 smpi_coll_tuned_reduce_scatter_gather
1286 #define MPIR_Reduce_shmem_MV2 smpi_coll_tuned_reduce_ompi_basic_linear
1287 #define MPIR_Reduce_two_level_helper_MV2 smpi_coll_tuned_reduce_mvapich2_two_level
1288
1289
/* Installs the Stampede reduce calibration into the global tuning table.
 *
 * Sets mv2_size_reduce_tuning_table to 8, allocates that many
 * mv2_reduce_tuning_table rows with xbt_malloc, and copies them from the local
 * initializer below, which lists 8 rows (numproc 16, 32, ... 2048).
 * The hard-coded count and the number of rows in the initializer must be kept
 * in sync by hand: the final memcpy copies exactly
 * mv2_size_reduce_tuning_table rows.
 *
 * Every row follows the field order of mv2_reduce_tuning_table (annotated on
 * the first row only). */
1290 static void init_mv2_reduce_tables_stampede(){
1291   /*Stampede*/
1292   mv2_size_reduce_tuning_table = 8;
1293   mv2_reduce_thresholds_table = xbt_malloc(mv2_size_reduce_tuning_table *
1294       sizeof (mv2_reduce_tuning_table));
1295   mv2_reduce_tuning_table mv2_tmp_reduce_thresholds_table[] = {
1296       {
1297           16, /* numproc */
1298           4, /* inter_k_degree */
1299           4, /* intra_k_degree */
1300           {1, 0, 0}, /* is_two_level_reduce */
1301           3, /* size_inter_table */
1302           { /* inter_leader: {min, max, function} */
1303               {0, 262144, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1304               {262144, 1048576, &MPIR_Reduce_binomial_MV2},
1305               {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
1306           },
1307           2, /* size_intra_table */
1308           { /* intra_node */
1309               {0, 65536, &MPIR_Reduce_shmem_MV2},
1310               {65536,-1,  &MPIR_Reduce_binomial_MV2},
1311           },
1312       },
1313       {
1314           32,
1315           4,
1316           4,
1317           {1, 1, 1, 1, 0, 0, 0},
1318           7,
1319           {
1320               {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1321               {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1322               {16384, 32768, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1323               {32768, 65536, &MPIR_Reduce_binomial_MV2},
1324               {65536, 262144, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1325               {262144, 1048576, &MPIR_Reduce_binomial_MV2},
1326               {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
1327           },
1328           6,
1329           {
1330               {0, 8192, &MPIR_Reduce_shmem_MV2},
1331               {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1332               {16384, 32768, &MPIR_Reduce_shmem_MV2},
1333               {32768, 65536, &MPIR_Reduce_shmem_MV2},
1334               {65536, 262144, &MPIR_Reduce_shmem_MV2},
1335               {262144,-1,  &MPIR_Reduce_binomial_MV2},
1336           },
1337       },
1338       {
1339           64,
1340           4,
1341           4,
1342           {1, 1, 1, 1, 0},
1343           5,
1344           {
1345               {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1346               {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1347               {16384, 65536, &MPIR_Reduce_binomial_MV2},
1348               {65536, 262144, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1349               {262144, -1, &MPIR_Reduce_redscat_gather_MV2},
1350           },
1351           5,
1352           {
1353               {0, 8192, &MPIR_Reduce_shmem_MV2},
1354               {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1355               {16384, 65536, &MPIR_Reduce_shmem_MV2},
1356               {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1357               {262144, -1, &MPIR_Reduce_binomial_MV2},
1358           },
1359       },
1360       {
1361           128,
1362           4,
1363           4,
1364           {1, 0, 1, 0, 1, 0},
1365           6,
1366           {
1367               {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1368               {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1369               {16384, 65536, &MPIR_Reduce_binomial_MV2},
1370               {65536, 262144, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1371               {262144, 1048576, &MPIR_Reduce_binomial_MV2},
1372               {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
1373           },
1374           5,
1375           {
1376               {0, 8192, &MPIR_Reduce_shmem_MV2},
1377               {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1378               {16384, 65536, &MPIR_Reduce_shmem_MV2},
1379               {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1380               {262144, -1, &MPIR_Reduce_binomial_MV2},
1381           },
1382       },
1383       {
1384           256,
1385           4,
1386           4,
1387           {1, 1, 1, 0, 1, 1, 0},
1388           7,
1389           {
1390               {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1391               {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1392               {16384, 32768, &MPIR_Reduce_binomial_MV2},
1393               {32768, 65536, &MPIR_Reduce_binomial_MV2},
1394               {65536, 262144, &MPIR_Reduce_binomial_MV2},
1395               {262144, 1048576, &MPIR_Reduce_binomial_MV2},
1396               {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
1397           },
1398           6,
1399           {
1400               {0, 8192, &MPIR_Reduce_shmem_MV2},
1401               {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1402               {16384, 32768, &MPIR_Reduce_shmem_MV2},
1403               {32768, 65536, &MPIR_Reduce_shmem_MV2},
1404               {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1405               {262144, -1, &MPIR_Reduce_binomial_MV2},
1406           },
1407       },
1408       {
1409           512,
1410           4,
1411           4,
1412           {1, 0, 1, 1, 1, 0},
1413           6,
1414           {
1415               {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1416               {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1417               {16384, 65536, &MPIR_Reduce_binomial_MV2},
1418               {65536, 262144, &MPIR_Reduce_binomial_MV2},
1419               {262144, 1048576, &MPIR_Reduce_binomial_MV2},
1420               {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
1421           },
1422           5,
1423           {
1424               {0, 8192, &MPIR_Reduce_shmem_MV2},
1425               {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1426               {16384, 65536, &MPIR_Reduce_shmem_MV2},
1427               {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1428               {262144, -1, &MPIR_Reduce_binomial_MV2},
1429           },
1430       },
1431       {
1432           1024,
1433           4,
1434           4,
1435           {1, 0, 1, 1, 1},
1436           5,
1437           {
1438               {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1439               {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1440               {16384, 65536, &MPIR_Reduce_binomial_MV2},
1441               {65536, 262144, &MPIR_Reduce_binomial_MV2},
1442               {262144, -1, &MPIR_Reduce_binomial_MV2},
1443           },
1444           5,
1445           {
1446               {0, 8192, &MPIR_Reduce_shmem_MV2},
1447               {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1448               {16384, 65536, &MPIR_Reduce_shmem_MV2},
1449               {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1450               {262144, -1, &MPIR_Reduce_binomial_MV2},
1451           },
1452       },
1453       {
1454           2048,
1455           4,
1456           4,
1457           {1, 0, 1, 1, 1,1},
1458           6,
1459           {
1460               {0, 2048, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1461               {2048, 4096, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1462               {4096, 16384, &MPIR_Reduce_binomial_MV2},
1463               {16384, 65536, &MPIR_Reduce_binomial_MV2},
1464               {65536, 131072, &MPIR_Reduce_binomial_MV2},
1465               {131072, -1, &MPIR_Reduce_binomial_MV2},
1466           },
1467           6,
1468           {
1469               {0, 2048, &MPIR_Reduce_shmem_MV2},
1470               {2048, 4096, &MPIR_Reduce_shmem_MV2},
1471               {4096, 16384, &MPIR_Reduce_shmem_MV2},
1472               {16384, 65536, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1473               {65536, 131072, &MPIR_Reduce_binomial_MV2},
1474               {131072, -1, &MPIR_Reduce_shmem_MV2},
1475           },
1476       },
1477
1478   };
/* Publish the calibration: copy all rows into the heap-allocated global table. */
1479   memcpy(mv2_reduce_thresholds_table, mv2_tmp_reduce_thresholds_table,
1480       mv2_size_reduce_tuning_table * sizeof (mv2_reduce_tuning_table));
1481 }
1482
1483 /************ Reduce scatter variables and initializers                        */
1484
/* One entry of a reduce-scatter tuning table: a min/max range and the
 * reduce-scatter routine chosen for it.
 * NOTE(review): min/max look like message-size bounds, with max == -1 on the
 * last entry of each table presumably meaning "no upper limit" -- confirm in
 * the selector. */
1485 typedef struct {
1486   int min; /* lower bound of the range */
1487   int max; /* upper bound of the range; -1 on the final entry of each table */
/* Reduce-scatter implementation to call for this range. */
1488   int (*MV2_pt_Red_scat_function)(void *sendbuf,
1489       void *recvbuf,
1490       int *recvcnts,
1491       MPI_Datatype datatype,
1492       MPI_Op op,
1493       MPI_Comm comm_ptr);
1494 } mv2_red_scat_tuning_element;
1495
/* One row of the reduce-scatter tuning table, filled in declaration order by
 * init_mv2_reduce_scatter_tables_stampede(). Unlike the bcast/reduce rows it
 * carries a single inter_leader list and no intra-node list. */
1496 typedef struct {
1497   int numproc; /* key of the row (16 ... 512 in the Stampede data); presumably a process count -- confirm */
1498   int size_inter_table; /* number of initialized entries in inter_leader */
1499   mv2_red_scat_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
1500 } mv2_red_scat_tuning_table;
1501
/* Number of rows in mv2_red_scat_thresholds_table; both are set by
 * init_mv2_reduce_scatter_tables_stampede(). */
1502 int mv2_size_red_scat_tuning_table = 0;
1503 mv2_red_scat_tuning_table *mv2_red_scat_thresholds_table = NULL;
1504
1505
/* Currently selected reduce-scatter routine. It has no explicit initializer,
 * unlike MV2_Reduce_function and MV2_Bcast_function which are written with
 * "= NULL"; as a file-scope object it still starts out as a null pointer. */
1506 int (*MV2_Red_scat_function)(void *sendbuf,
1507     void *recvbuf,
1508     int *recvcnts,
1509     MPI_Datatype datatype,
1510     MPI_Op op,
1511     MPI_Comm comm_ptr);
1512
1513
1514
/* Adapter giving smpi_mpi_reduce_scatter() the signature expected by
 * mv2_red_scat_tuning_element: forwards all arguments unchanged and always
 * returns MPI_SUCCESS (any result of the forwarded call is not inspected). */
1515 static  int MPIR_Reduce_Scatter_Basic_MV2(void *sendbuf,
1516     void *recvbuf,
1517     int *recvcnts,
1518     MPI_Datatype datatype,
1519     MPI_Op op,
1520     MPI_Comm comm)
1521 {
1522   smpi_mpi_reduce_scatter(sendbuf,recvbuf,recvcnts,datatype,op,comm);
1523   return MPI_SUCCESS;
1524 }
/* Aliases from the remaining MVAPICH2 reduce-scatter algorithm names to the
 * smpi_coll_tuned_* routines they expand to. */
1525 #define MPIR_Reduce_scatter_non_comm_MV2 smpi_coll_tuned_reduce_scatter_mpich_noncomm
1526 #define MPIR_Reduce_scatter_Rec_Halving_MV2 smpi_coll_tuned_reduce_scatter_ompi_basic_recursivehalving
1527 #define MPIR_Reduce_scatter_Pair_Wise_MV2 smpi_coll_tuned_reduce_scatter_mpich_pair
1528
1529
1530
1531
/* Installs the Stampede reduce-scatter calibration into the global tuning table.
 *
 * Sets mv2_size_red_scat_tuning_table to 6, allocates that many
 * mv2_red_scat_tuning_table rows with xbt_malloc, and copies them from the
 * local initializer below, which lists 6 rows (numproc 16, 32, ... 512).
 * The hard-coded count and the number of rows in the initializer must be kept
 * in sync by hand: the final memcpy copies exactly
 * mv2_size_red_scat_tuning_table rows.
 *
 * Every row is {numproc, size_inter_table, inter_leader entries}. */
1532 static void init_mv2_reduce_scatter_tables_stampede(){
1533   mv2_size_red_scat_tuning_table = 6;
1534   mv2_red_scat_thresholds_table = xbt_malloc(mv2_size_red_scat_tuning_table *
1535       sizeof (mv2_red_scat_tuning_table));
1536   mv2_red_scat_tuning_table mv2_tmp_red_scat_thresholds_table[] = {
1537       {
1538           16, /* numproc */
1539           3, /* size_inter_table */
1540           { /* inter_leader: {min, max, function} */
1541               {0, 64, &MPIR_Reduce_Scatter_Basic_MV2},
1542               {64, 65536, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1543               {65536, -1, &MPIR_Reduce_scatter_Pair_Wise_MV2},
1544           },
1545       },
1546       {
1547           32,
1548           3,
1549           {
1550               {0, 64, &MPIR_Reduce_Scatter_Basic_MV2},
1551               {64, 131072, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1552               {131072, -1, &MPIR_Reduce_scatter_Pair_Wise_MV2},
1553           },
1554       },
1555       {
1556           64,
1557           3,
1558           {
1559               {0, 1024, &MPIR_Reduce_Scatter_Basic_MV2},
1560               {1024, 262144, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1561               {262144, -1, &MPIR_Reduce_scatter_Pair_Wise_MV2},
1562           },
1563       },
1564       {
1565           128,
1566           2,
1567           {
1568               {0, 128, &MPIR_Reduce_Scatter_Basic_MV2},
1569               {128, -1, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1570           },
1571       },
1572       {
1573           256,
1574           2,
1575           {
1576               {0, 128, &MPIR_Reduce_Scatter_Basic_MV2},
1577               {128, -1, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1578           },
1579       },
1580       {
1581           512,
1582           2,
1583           {
1584               {0, 256, &MPIR_Reduce_Scatter_Basic_MV2},
1585               {256, -1, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1586           },
1587       },
1588
1589   };
/* Publish the calibration: copy all rows into the heap-allocated global table. */
1590   memcpy(mv2_red_scat_thresholds_table, mv2_tmp_red_scat_thresholds_table,
1591       mv2_size_red_scat_tuning_table * sizeof (mv2_red_scat_tuning_table));
1592 }
1593
1594 /************ Scatter variables and initializers                        */
1595
/* One entry of a scatter tuning table: a min/max range and the scatter
 * routine chosen for it.
 * NOTE(review): min/max look like message-size bounds, with max == -1
 * presumably meaning "no upper limit" -- confirm in the selector. */
1596 typedef struct {
1597   int min; /* lower bound of the range */
1598   int max; /* upper bound of the range; -1 on open-ended entries */
/* Scatter implementation to call for this range. */
1599   int (*MV2_pt_Scatter_function)(void *sendbuf,
1600       int sendcnt,
1601       MPI_Datatype sendtype,
1602       void *recvbuf,
1603       int recvcnt,
1604       MPI_Datatype recvtype,
1605       int root, MPI_Comm comm);
1606 } mv2_scatter_tuning_element;
1607
/* One row of a scatter tuning table, filled in declaration order by the
 * initializers in init_mv2_scatter_tables_stampede(). Each array holds at
 * most MV2_MAX_NB_THRESHOLDS entries; only the first size_inter_table /
 * size_intra_table entries are meant to be consulted. */
1608 typedef struct {
1609   int numproc; /* key of the row; presumably a process count -- confirm */
1610   int size_inter_table; /* number of entries used in inter_leader */
1611   mv2_scatter_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
1612   int size_intra_table; /* number of entries used in intra_node */
1613   mv2_scatter_tuning_element intra_node[MV2_MAX_NB_THRESHOLDS];
1614 } mv2_scatter_tuning_table;
1615
1616
/* Scatter tuning state, indexed by configuration. init_mv2_scatter_tables_stampede()
 * sets mv2_scatter_num_ppn_conf to 3 and fills mv2_scatter_table_ppn_conf with
 * 1, 2 and 16; mv2_size_scatter_tuning_table holds the row count of each
 * configuration's table and mv2_scatter_thresholds_table one table pointer
 * per configuration. */
1617 int *mv2_scatter_table_ppn_conf = NULL;
1618 int mv2_scatter_num_ppn_conf = 1;
1619 int *mv2_size_scatter_tuning_table = NULL;
1620 mv2_scatter_tuning_table **mv2_scatter_thresholds_table = NULL;
1621
/* Currently selected scatter routine (NULL until something assigns it). */
1622 int (*MV2_Scatter_function) (void *sendbuf, int sendcount, MPI_Datatype sendtype,
1623     void *recvbuf, int recvcount, MPI_Datatype recvtype,
1624     int root, MPI_Comm comm)=NULL;
1625
/* Currently selected intra-node scatter routine (NULL until assigned). */
1626 int (*MV2_Scatter_intra_function) (void *sendbuf, int sendcount, MPI_Datatype sendtype,
1627     void *recvbuf, int recvcount, MPI_Datatype recvtype,
1628     int root, MPI_Comm comm)=NULL;
/* Prototype for the stub defined immediately below. */
1629 int MPIR_Scatter_mcst_wrap_MV2(void *sendbuf,
1630     int sendcnt,
1631     MPI_Datatype sendtype,
1632     void *recvbuf,
1633     int recvcnt,
1634     MPI_Datatype recvtype,
1635     int root, MPI_Comm comm_ptr);
1636
/* Stub with the scatter-routine signature: ignores every argument, performs
 * no communication and returns 0. It is referenced from the 16-ppn scatter
 * table (numproc 128, first inter_leader entry).
 * NOTE(review): presumably a placeholder for MVAPICH2's multicast scatter,
 * which has no counterpart here -- confirm the selector never dispatches to it. */
1637 int MPIR_Scatter_mcst_wrap_MV2(void *sendbuf,
1638     int sendcnt,
1639     MPI_Datatype sendtype,
1640     void *recvbuf,
1641     int recvcnt,
1642     MPI_Datatype recvtype,
1643     int root, MPI_Comm comm_ptr)
1644 {
1645   return 0;
1646 }
1647
/* Aliases from the MVAPICH2 scatter algorithm names used in the tuning tables
 * to the smpi_coll_tuned_* routines they expand to. */
1648 #define MPIR_Scatter_MV2_Binomial smpi_coll_tuned_scatter_ompi_binomial
1649 #define MPIR_Scatter_MV2_Direct smpi_coll_tuned_scatter_ompi_basic_linear
1650 #define MPIR_Scatter_MV2_two_level_Binomial smpi_coll_tuned_scatter_mvapich2_two_level_binomial
1651 #define MPIR_Scatter_MV2_two_level_Direct smpi_coll_tuned_scatter_mvapich2_two_level_direct
1652
1653
1654
1655
1656 static void init_mv2_scatter_tables_stampede(){
1657   {
1658     int agg_table_sum = 0;
1659     int i;
1660     mv2_scatter_tuning_table **table_ptrs = NULL;
1661     mv2_scatter_num_ppn_conf = 3;
1662     mv2_scatter_thresholds_table
1663     = xbt_malloc(sizeof(mv2_scatter_tuning_table *)
1664         * mv2_scatter_num_ppn_conf);
1665     table_ptrs = xbt_malloc(sizeof(mv2_scatter_tuning_table *)
1666         * mv2_scatter_num_ppn_conf);
1667     mv2_size_scatter_tuning_table = xbt_malloc(sizeof(int) *
1668         mv2_scatter_num_ppn_conf);
1669     mv2_scatter_table_ppn_conf
1670     = xbt_malloc(mv2_scatter_num_ppn_conf * sizeof(int));
1671     mv2_scatter_table_ppn_conf[0] = 1;
1672     mv2_size_scatter_tuning_table[0] = 6;
1673     mv2_scatter_tuning_table mv2_tmp_scatter_thresholds_table_1ppn[] = {
1674         {2,
1675             1,
1676             {
1677                 {0, -1, &MPIR_Scatter_MV2_Binomial},
1678             },
1679             1,
1680             {
1681                 {0, -1, &MPIR_Scatter_MV2_Binomial},
1682             },
1683         },
1684
1685         {4,
1686             1,
1687             {
1688                 {0, -1, &MPIR_Scatter_MV2_Direct},
1689             },
1690             1,
1691             {
1692                 {0, -1, &MPIR_Scatter_MV2_Direct},
1693             },
1694         },
1695
1696         {8,
1697             1,
1698             {
1699                 {0, -1, &MPIR_Scatter_MV2_Direct},
1700             },
1701             1,
1702             {
1703                 {0, -1, &MPIR_Scatter_MV2_Direct},
1704             },
1705         },
1706
1707         {16,
1708             1,
1709             {
1710                 {0, -1, &MPIR_Scatter_MV2_Direct},
1711             },
1712             1,
1713             {
1714                 {0, -1, &MPIR_Scatter_MV2_Direct},
1715             },
1716         },
1717
1718         {32,
1719             1,
1720             {
1721                 {0, -1, &MPIR_Scatter_MV2_Direct},
1722             },
1723             1,
1724             {
1725                 {0, -1, &MPIR_Scatter_MV2_Direct},
1726             },
1727         },
1728
1729         {64,
1730             2,
1731             {
1732                 {0, 32, &MPIR_Scatter_MV2_Binomial},
1733                 {32, -1, &MPIR_Scatter_MV2_Direct},
1734             },
1735             1,
1736             {
1737                 {0, -1, &MPIR_Scatter_MV2_Binomial},
1738             },
1739         },
1740     };
1741     table_ptrs[0] = mv2_tmp_scatter_thresholds_table_1ppn;
1742     mv2_scatter_table_ppn_conf[1] = 2;
1743     mv2_size_scatter_tuning_table[1] = 6;
1744     mv2_scatter_tuning_table mv2_tmp_scatter_thresholds_table_2ppn[] = {
1745         {4,
1746             2,
1747             {
1748                 {0, 4096, &MPIR_Scatter_MV2_Binomial},
1749                 {4096, -1, &MPIR_Scatter_MV2_Direct},
1750             },
1751             1,
1752             {
1753                 {0, -1, &MPIR_Scatter_MV2_Direct},
1754             },
1755         },
1756
1757         {8,
1758             2,
1759             {
1760                 {0, 512, &MPIR_Scatter_MV2_two_level_Direct},
1761                 {512, -1, &MPIR_Scatter_MV2_Direct},
1762             },
1763             1,
1764             {
1765                 {0, -1, &MPIR_Scatter_MV2_Binomial},
1766             },
1767         },
1768
1769         {16,
1770             2,
1771             {
1772                 {0, 2048, &MPIR_Scatter_MV2_two_level_Direct},
1773                 {2048, -1, &MPIR_Scatter_MV2_Direct},
1774             },
1775             1,
1776             {
1777                 {0, -1, &MPIR_Scatter_MV2_Binomial},
1778             },
1779         },
1780
1781         {32,
1782             2,
1783             {
1784                 {0, 2048, &MPIR_Scatter_MV2_two_level_Direct},
1785                 {2048, -1, &MPIR_Scatter_MV2_Direct},
1786             },
1787             1,
1788             {
1789                 {0, -1, &MPIR_Scatter_MV2_Binomial},
1790             },
1791         },
1792
1793         {64,
1794             2,
1795             {
1796                 {0, 8192, &MPIR_Scatter_MV2_two_level_Direct},
1797                 {8192, -1, &MPIR_Scatter_MV2_Direct},
1798             },
1799             1,
1800             {
1801                 {0, -1, &MPIR_Scatter_MV2_Binomial},
1802             },
1803         },
1804
1805         {128,
1806             4,
1807             {
1808                 {0, 16, &MPIR_Scatter_MV2_Binomial},
1809                 {16, 128, &MPIR_Scatter_MV2_two_level_Binomial},
1810                 {128, 16384, &MPIR_Scatter_MV2_two_level_Direct},
1811                 {16384, -1, &MPIR_Scatter_MV2_Direct},
1812             },
1813             1,
1814             {
1815                 {0, 128, &MPIR_Scatter_MV2_Direct},
1816                 {128, -1, &MPIR_Scatter_MV2_Binomial},
1817             },
1818         },
1819     };
1820     table_ptrs[1] = mv2_tmp_scatter_thresholds_table_2ppn;
1821     mv2_scatter_table_ppn_conf[2] = 16; /* slot 2 is tagged 16; presumably processes per node, as the _16ppn table name suggests -- TODO confirm */
1822     mv2_size_scatter_tuning_table[2] = 8; /* must match the number of entries in the table below (16, 32, ..., 2048 = 8 entries) */
         /*
          * Scatter tuning table that is later stored in table_ptrs[2].
          *
          * NOTE(review): mv2_scatter_tuning_table is declared outside this
          * excerpt; the layout described here is read off the initializers and
          * should be confirmed against the struct definition.
          *
          * Each entry consists of, in order:
          *   - a leading integer (16, 32, ... 2048), presumably the process
          *     count the entry is tuned for;
          *   - a row count followed by a first list of {min, max, function} rows;
          *   - a second row count followed by a second list of rows.
          * Rows are presumably message-size ranges, with max == -1 standing for
          * "no upper bound" -- confirm against the code that walks this table.
          */
1823     mv2_scatter_tuning_table mv2_tmp_scatter_thresholds_table_16ppn[] = {
1824         {
1825             16,
1826             2,
1827             {
1828                 {0, 256, &MPIR_Scatter_MV2_Binomial},
1829                 {256, -1, &MPIR_Scatter_MV2_Direct},
1830             },
1831             1,
1832             {
1833                 { 0, -1, &MPIR_Scatter_MV2_Direct},
1834             },
1835         },
1836
1837         {
1838             32,
1839             2,
1840             {
1841                 {0, 512, &MPIR_Scatter_MV2_Binomial},
1842                 {512, -1, &MPIR_Scatter_MV2_Direct},
1843             },
1844             1,
1845             {
1846                 { 0, -1, &MPIR_Scatter_MV2_Direct},
1847             },
1848         },
1849
1850         {
1851             64,
1852             2,
1853             {
1854                 {0, 1024, &MPIR_Scatter_MV2_two_level_Direct},
1855                 {1024, -1, &MPIR_Scatter_MV2_Direct},
1856             },
1857             1,
1858             {
1859                 { 0, -1, &MPIR_Scatter_MV2_Direct},
1860             },
1861         },
1862
1863         {
1864             128,
1865             4,
1866             {
1867                 {0, 16, &MPIR_Scatter_mcst_wrap_MV2},
1868                 {0, 16, &MPIR_Scatter_MV2_two_level_Direct}, /* same 0..16 range as the row above -- NOTE(review): confirm which row the selector is meant to pick */
1869                 {16, 2048, &MPIR_Scatter_MV2_two_level_Direct},
1870                 {2048, -1, &MPIR_Scatter_MV2_Direct},
1871             },
1872             1,
1873             {
1874                 { 0, -1, &MPIR_Scatter_MV2_Direct},
1875             },
1876         },
1877
1878         {
1879             256,
1880             4,
1881             {
1882                 {0, 16, &MPIR_Scatter_mcst_wrap_MV2},
1883                 {0, 16, &MPIR_Scatter_MV2_two_level_Direct}, /* same 0..16 range as the row above, as in the 128 entry */
1884                 {16, 2048, &MPIR_Scatter_MV2_two_level_Direct},
1885                 {2048, -1,  &MPIR_Scatter_MV2_Direct},
1886             },
1887             1,
1888             {
1889                 { 0, -1, &MPIR_Scatter_MV2_Direct},
1890             },
1891         },
1892
1893         {
1894             512,
1895             4,
1896             {
1897                 {0, 16, &MPIR_Scatter_mcst_wrap_MV2},
1898                 {16, 16, &MPIR_Scatter_MV2_two_level_Direct}, /* NOTE(review): min == max here, while the 128 and 256 entries use {0, 16} for this row -- possible typo, check against upstream MVAPICH2 */
1899                 {16, 4096, &MPIR_Scatter_MV2_two_level_Direct},
1900                 {4096, -1, &MPIR_Scatter_MV2_Direct},
1901             },
1902             1,
1903             {
1904                 { 0, -1, &MPIR_Scatter_MV2_Binomial},
1905             },
1906         },
1907         {
1908             1024,
1909             5,
1910             {
1911                 {0, 16, &MPIR_Scatter_mcst_wrap_MV2},
1912                 {0, 16,  &MPIR_Scatter_MV2_Binomial}, /* same 0..16 range as the row above */
1913                 {16, 32, &MPIR_Scatter_MV2_Binomial},
1914                 {32, 4096, &MPIR_Scatter_MV2_two_level_Direct},
1915                 {4096, -1, &MPIR_Scatter_MV2_Direct},
1916             },
1917             1,
1918             {
1919                 { 0, -1, &MPIR_Scatter_MV2_Binomial},
1920             },
1921         },
1922         {
1923             2048,
1924             7,
1925             {
1926                 {0, 16, &MPIR_Scatter_mcst_wrap_MV2},
1927                 {0, 16,  &MPIR_Scatter_MV2_two_level_Binomial}, /* same 0..16 range as the row above */
1928                 {16, 128, &MPIR_Scatter_MV2_two_level_Binomial},
1929                 {128, 1024, &MPIR_Scatter_MV2_two_level_Direct},
1930                 {1024, 16384, &MPIR_Scatter_MV2_two_level_Direct},
1931                 {16384, 65536, &MPIR_Scatter_MV2_Direct},
1932                 {65536, -1, &MPIR_Scatter_MV2_two_level_Direct},
1933             },
1934             6, /* the only entry in this table whose second list has more than one row */
1935             {
1936                 {0, 16, &MPIR_Scatter_MV2_Binomial},
1937                 {16, 128, &MPIR_Scatter_MV2_Binomial},
1938                 {128, 1024, &MPIR_Scatter_MV2_Binomial},
1939                 {1024, 16384, &MPIR_Scatter_MV2_Direct},
1940                 {16384, 65536, &MPIR_Scatter_MV2_Direct},
1941                 {65536, -1, &MPIR_Scatter_MV2_Direct},
1942             },
1943         },
1944     };
1945     table_ptrs[2] = mv2_tmp_scatter_thresholds_table_16ppn; /* the 16ppn table becomes the copy source for slot 2 */
         /* Add up the entry counts of every configuration to size one shared buffer. */
1946     agg_table_sum = 0;
1947     for (i = 0; i < mv2_scatter_num_ppn_conf; i++) {
1948         agg_table_sum += mv2_size_scatter_tuning_table[i];
1949     }
         /* A single allocation holds all tables back to back; slot 0 points at
          * its start and receives a copy of the first source table. */
1950     mv2_scatter_thresholds_table[0] =
1951         xbt_malloc(agg_table_sum * sizeof (mv2_scatter_tuning_table));
1952     memcpy(mv2_scatter_thresholds_table[0], table_ptrs[0],
1953         (sizeof(mv2_scatter_tuning_table)
1954             * mv2_size_scatter_tuning_table[0]));
         /* Every later slot starts right after the previous slot's entries, so
          * slots i > 0 point into the allocation made for slot 0 rather than
          * owning memory of their own. */
1955     for (i = 1; i < mv2_scatter_num_ppn_conf; i++) {
1956         mv2_scatter_thresholds_table[i] =
1957             mv2_scatter_thresholds_table[i - 1]
1958                                          + mv2_size_scatter_tuning_table[i - 1];
1959         memcpy(mv2_scatter_thresholds_table[i], table_ptrs[i],
1960             (sizeof(mv2_scatter_tuning_table)
1961                 * mv2_size_scatter_tuning_table[i]));
1962     }
1963     xbt_free(table_ptrs); /* releases only the array of source pointers; the copied entries stay in the buffer allocated above */
1964   }
1965 }
1966