Logo AND Algorithmique Numérique Distribuée

Public GIT Repository
Add Reduce SMP collective from MVAPICH2
[simgrid.git] / src / smpi / colls / smpi_mvapich2_selector_stampede.h
1 /* selector for collective algorithms based on mvapich decision logic, with calibration from Stampede cluster at TACC*/
2
3 /* Copyright (c) 2009-2010, 2013-2014. The SimGrid Team.
4  * All rights reserved.                                                     */
5
6 /* This is the tuning used by MVAPICH for Stampede platform based on (MV2_ARCH_INTEL_XEON_E5_2680_16, MV2_HCA_MLX_CX_FDR) */
7
8
9
10 /************ Alltoall variables and initializers                        */
11
12 #define MV2_MAX_NB_THRESHOLDS  32
13
14
15 typedef struct {
16   int min;
17   int max;
18   int (*MV2_pt_Alltoall_function) (void *sendbuf, int sendcount, MPI_Datatype sendtype,
19       void *recvbuf, int recvcount, MPI_Datatype recvtype,
20       MPI_Comm comm_ptr );
21 } mv2_alltoall_tuning_element;
22
23 typedef struct {
24   int numproc;
25   int size_table;
26   mv2_alltoall_tuning_element algo_table[MV2_MAX_NB_THRESHOLDS];
27   mv2_alltoall_tuning_element in_place_algo_table[MV2_MAX_NB_THRESHOLDS];
28 } mv2_alltoall_tuning_table;
29
30 int (*MV2_Alltoall_function) (void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype, MPI_Comm comm_ptr)=NULL;
31
32 /* Indicates number of processes per node */
33 int *mv2_alltoall_table_ppn_conf = NULL;
34 /* Indicates total number of configurations */
35 int mv2_alltoall_num_ppn_conf = 1;
36 int *mv2_size_alltoall_tuning_table = NULL;
37 mv2_alltoall_tuning_table **mv2_alltoall_thresholds_table = NULL;
38
39
40 #define MPIR_Alltoall_bruck_MV2 smpi_coll_tuned_alltoall_bruck
41 #define MPIR_Alltoall_RD_MV2 smpi_coll_tuned_alltoall_rdb
42 #define MPIR_Alltoall_Scatter_dest_MV2 smpi_coll_tuned_alltoall_mvapich2_scatter_dest
43 #define MPIR_Alltoall_pairwise_MV2 smpi_coll_tuned_alltoall_pair
44 #define MPIR_Alltoall_inplace_MV2 smpi_coll_tuned_alltoall_ring 
45
46
47 static void init_mv2_alltoall_tables_stampede(){
48   int i;
49   int agg_table_sum = 0;
50   mv2_alltoall_tuning_table **table_ptrs = NULL;
51   mv2_alltoall_num_ppn_conf = 3;
52   mv2_alltoall_thresholds_table = xbt_malloc(sizeof(mv2_alltoall_tuning_table *)
53       * mv2_alltoall_num_ppn_conf);
54   table_ptrs = xbt_malloc(sizeof(mv2_alltoall_tuning_table *)
55       * mv2_alltoall_num_ppn_conf);
56   mv2_size_alltoall_tuning_table = xbt_malloc(sizeof(int) *
57       mv2_alltoall_num_ppn_conf);
58   mv2_alltoall_table_ppn_conf = xbt_malloc(mv2_alltoall_num_ppn_conf * sizeof(int));
59   mv2_alltoall_table_ppn_conf[0] = 1;
60   mv2_size_alltoall_tuning_table[0] = 6;
61   mv2_alltoall_tuning_table mv2_tmp_alltoall_thresholds_table_1ppn[] = {
62       {2,
63           1,
64           {{0, -1, &MPIR_Alltoall_pairwise_MV2},
65           },
66
67           {{0, -1, &MPIR_Alltoall_inplace_MV2},
68           },
69       },
70
71       {4,
72           2,
73           {{0, 262144, &MPIR_Alltoall_Scatter_dest_MV2},
74               {262144, -1, &MPIR_Alltoall_pairwise_MV2},
75           },
76
77           {{0, -1, &MPIR_Alltoall_inplace_MV2},
78           },
79       },
80
81       {8,
82           2,
83           {{0, 8, &MPIR_Alltoall_RD_MV2},
84               {8, -1, &MPIR_Alltoall_Scatter_dest_MV2},
85           },
86
87           {{0, -1, &MPIR_Alltoall_inplace_MV2},
88           },
89       },
90
91       {16,
92           3,
93           {{0, 64, &MPIR_Alltoall_RD_MV2},
94               {64, 512, &MPIR_Alltoall_bruck_MV2},
95               {512, -1, &MPIR_Alltoall_Scatter_dest_MV2},
96           },
97
98           {{0,-1, &MPIR_Alltoall_inplace_MV2},
99           },
100       },
101
102       {32,
103           3,
104           {{0, 32, &MPIR_Alltoall_RD_MV2},
105               {32, 2048, &MPIR_Alltoall_bruck_MV2},
106               {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
107           },
108
109           {{0, -1, &MPIR_Alltoall_inplace_MV2},
110           },
111       },
112
113       {64,
114           3,
115           {{0, 8, &MPIR_Alltoall_RD_MV2},
116               {8, 1024, &MPIR_Alltoall_bruck_MV2},
117               {1024, -1, &MPIR_Alltoall_Scatter_dest_MV2},
118           },
119
120           {{0, -1, &MPIR_Alltoall_inplace_MV2},
121           },
122       },
123   };
124   table_ptrs[0] = mv2_tmp_alltoall_thresholds_table_1ppn;
125   mv2_alltoall_table_ppn_conf[1] = 2;
126   mv2_size_alltoall_tuning_table[1] = 6;
127   mv2_alltoall_tuning_table mv2_tmp_alltoall_thresholds_table_2ppn[] = {
128       {4,
129           2,
130           {{0, 32, &MPIR_Alltoall_RD_MV2},
131               {32, -1, &MPIR_Alltoall_Scatter_dest_MV2},
132           },
133
134           {{0, -1, &MPIR_Alltoall_inplace_MV2},
135           },
136       },
137
138       {8,
139           2,
140           {{0, 64, &MPIR_Alltoall_RD_MV2},
141               {64, -1, &MPIR_Alltoall_Scatter_dest_MV2},
142           },
143
144           {{0, -1, &MPIR_Alltoall_inplace_MV2},
145           },
146       },
147
148       {16,
149           3,
150           {{0, 64, &MPIR_Alltoall_RD_MV2},
151               {64, 2048, &MPIR_Alltoall_bruck_MV2},
152               {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
153           },
154
155           {{0,-1, &MPIR_Alltoall_inplace_MV2},
156           },
157       },
158
159       {32,
160           3,
161           {{0, 16, &MPIR_Alltoall_RD_MV2},
162               {16, 2048, &MPIR_Alltoall_bruck_MV2},
163               {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
164           },
165
166           {{0, -1, &MPIR_Alltoall_inplace_MV2},
167           },
168       },
169
170       {64,
171           3,
172           {{0, 8, &MPIR_Alltoall_RD_MV2},
173               {8, 1024, &MPIR_Alltoall_bruck_MV2},
174               {1024, -1, &MPIR_Alltoall_Scatter_dest_MV2},
175           },
176
177           {{0, -1, &MPIR_Alltoall_inplace_MV2},
178           },
179       },
180
181       {128,
182           3,
183           {{0, 4, &MPIR_Alltoall_RD_MV2},
184               {4, 2048, &MPIR_Alltoall_bruck_MV2},
185               {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
186           },
187
188           {{0, -1, &MPIR_Alltoall_inplace_MV2},
189           },
190       },
191   };
192   table_ptrs[1] = mv2_tmp_alltoall_thresholds_table_2ppn;
193   mv2_alltoall_table_ppn_conf[2] = 16;
194   mv2_size_alltoall_tuning_table[2] = 7;
195   mv2_alltoall_tuning_table mv2_tmp_alltoall_thresholds_table_16ppn[] = {
196       {16,
197           2,
198           {{0, 2048, &MPIR_Alltoall_bruck_MV2},
199               {2048, -1,  &MPIR_Alltoall_Scatter_dest_MV2},
200           },
201
202           {{32768, -1, &MPIR_Alltoall_inplace_MV2},
203           },
204       },
205
206       {32,
207           2,
208           {{0, 2048, &MPIR_Alltoall_bruck_MV2},
209               {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
210           },
211
212           {{16384, -1, &MPIR_Alltoall_inplace_MV2},
213           },
214       },
215
216       {64,
217           3,
218           {{0, 2048, &MPIR_Alltoall_bruck_MV2},
219               {2048, 16384, &MPIR_Alltoall_Scatter_dest_MV2},
220               {16384, -1, &MPIR_Alltoall_pairwise_MV2},
221           },
222
223           {{32768, 131072, &MPIR_Alltoall_inplace_MV2},
224           },
225       },
226
227       {128,
228           2,
229           {{0, 2048, &MPIR_Alltoall_bruck_MV2},
230               {2048, -1, &MPIR_Alltoall_pairwise_MV2},
231           },
232
233           {{16384,65536, &MPIR_Alltoall_inplace_MV2},
234           },
235       },
236
237       {256,
238           2,
239           {{0, 1024, &MPIR_Alltoall_bruck_MV2},
240               {1024, -1, &MPIR_Alltoall_pairwise_MV2},
241           },
242
243           {{16384, 65536, &MPIR_Alltoall_inplace_MV2},
244           },
245       },
246
247       {512,
248           2,
249           {{0, 1024, &MPIR_Alltoall_bruck_MV2},
250               {1024, -1, &MPIR_Alltoall_pairwise_MV2},
251           },
252
253           {{16384, 65536, &MPIR_Alltoall_inplace_MV2},
254           },
255       },
256       {1024,
257           2,
258           {{0, 1024, &MPIR_Alltoall_bruck_MV2},
259               {1024, -1, &MPIR_Alltoall_pairwise_MV2},
260           },
261
262           {{16384, 65536, &MPIR_Alltoall_inplace_MV2},
263           },
264       },
265
266   };
267   table_ptrs[2] = mv2_tmp_alltoall_thresholds_table_16ppn;
268   agg_table_sum = 0;
269   for (i = 0; i < mv2_alltoall_num_ppn_conf; i++) {
270       agg_table_sum += mv2_size_alltoall_tuning_table[i];
271   }
272   mv2_alltoall_thresholds_table[0] =
273       xbt_malloc(agg_table_sum * sizeof (mv2_alltoall_tuning_table));
274   memcpy(mv2_alltoall_thresholds_table[0], table_ptrs[0],
275       (sizeof(mv2_alltoall_tuning_table)
276           * mv2_size_alltoall_tuning_table[0]));
277   for (i = 1; i < mv2_alltoall_num_ppn_conf; i++) {
278       mv2_alltoall_thresholds_table[i] =
279           mv2_alltoall_thresholds_table[i - 1]
280                                         + mv2_size_alltoall_tuning_table[i - 1];
281       memcpy(mv2_alltoall_thresholds_table[i], table_ptrs[i],
282           (sizeof(mv2_alltoall_tuning_table)
283               * mv2_size_alltoall_tuning_table[i]));
284   }
285   xbt_free(table_ptrs);
286
287
288 }
289
290
291 /************ Allgather variables and initializers                        */
292
293 typedef struct {
294   int min;
295   int max;
296   int (*MV2_pt_Allgather_function)(void *sendbuf,
297       int sendcount,
298       MPI_Datatype sendtype,
299       void *recvbuf,
300       int recvcount,
301       MPI_Datatype recvtype, MPI_Comm comm_ptr);
302 } mv2_allgather_tuning_element;
303
304 typedef struct {
305   int numproc;
306   int two_level[MV2_MAX_NB_THRESHOLDS];
307   int size_inter_table;
308   mv2_allgather_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
309 } mv2_allgather_tuning_table;
310
311 int (*MV2_Allgather_function)(void *sendbuf,
312     int sendcount,
313     MPI_Datatype sendtype,
314     void *recvbuf,
315     int recvcount,
316     MPI_Datatype recvtype, MPI_Comm comm);
317
318 int *mv2_allgather_table_ppn_conf = NULL;
319 int mv2_allgather_num_ppn_conf = 1;
320 int *mv2_size_allgather_tuning_table = NULL;
321 mv2_allgather_tuning_table **mv2_allgather_thresholds_table = NULL;
322
323 static int MPIR_Allgather_RD_Allgather_Comm_MV2( void *sendbuf,
324                                  int sendcount,
325                                  MPI_Datatype sendtype,
326                                  void *recvbuf,
327                                  int recvcount,
328                                  MPI_Datatype recvtype, MPI_Comm comm_ptr)
329 {
330     return 0;
331 }
332
/* MVAPICH2 allgather algorithm names mapped onto SMPI functions. */
#define MPIR_Allgather_Bruck_MV2  smpi_coll_tuned_allgather_bruck
#define MPIR_Allgather_RD_MV2     smpi_coll_tuned_allgather_rdb
#define MPIR_Allgather_Ring_MV2   smpi_coll_tuned_allgather_ring
#define MPIR_2lvl_Allgather_MV2   smpi_coll_tuned_allgather_mvapich2_smp
337
338 static void init_mv2_allgather_tables_stampede(){
339   int i;
340   int agg_table_sum = 0;
341   mv2_allgather_tuning_table **table_ptrs = NULL;
342   mv2_allgather_num_ppn_conf = 3;
343   mv2_allgather_thresholds_table
344   = xbt_malloc(sizeof(mv2_allgather_tuning_table *)
345       * mv2_allgather_num_ppn_conf);
346   table_ptrs = xbt_malloc(sizeof(mv2_allgather_tuning_table *)
347       * mv2_allgather_num_ppn_conf);
348   mv2_size_allgather_tuning_table = xbt_malloc(sizeof(int) *
349       mv2_allgather_num_ppn_conf);
350   mv2_allgather_table_ppn_conf
351   = xbt_malloc(mv2_allgather_num_ppn_conf * sizeof(int));
352   mv2_allgather_table_ppn_conf[0] = 1;
353   mv2_size_allgather_tuning_table[0] = 6;
354   mv2_allgather_tuning_table mv2_tmp_allgather_thresholds_table_1ppn[] = {
355       {
356           2,
357           {0},
358           1,
359           {
360               {0, -1, &MPIR_Allgather_Ring_MV2},
361           },
362       },
363       {
364           4,
365           {0,0},
366           2,
367           {
368               {0, 262144, &MPIR_Allgather_RD_MV2},
369               {262144, -1, &MPIR_Allgather_Ring_MV2},
370           },
371       },
372       {
373           8,
374           {0,0},
375           2,
376           {
377               {0, 131072, &MPIR_Allgather_RD_MV2},
378               {131072, -1, &MPIR_Allgather_Ring_MV2},
379           },
380       },
381       {
382           16,
383           {0,0},
384           2,
385           {
386               {0, 131072, &MPIR_Allgather_RD_MV2},
387               {131072, -1, &MPIR_Allgather_Ring_MV2},
388           },
389       },
390       {
391           32,
392           {0,0},
393           2,
394           {
395               {0, 65536, &MPIR_Allgather_RD_MV2},
396               {65536, -1, &MPIR_Allgather_Ring_MV2},
397           },
398       },
399       {
400           64,
401           {0,0},
402           2,
403           {
404               {0, 32768, &MPIR_Allgather_RD_MV2},
405               {32768, -1, &MPIR_Allgather_Ring_MV2},
406           },
407       },
408   };
409   table_ptrs[0] = mv2_tmp_allgather_thresholds_table_1ppn;
410   mv2_allgather_table_ppn_conf[1] = 2;
411   mv2_size_allgather_tuning_table[1] = 6;
412   mv2_allgather_tuning_table mv2_tmp_allgather_thresholds_table_2ppn[] = {
413       {
414           4,
415           {0,0},
416           2,
417           {
418               {0, 524288, &MPIR_Allgather_RD_MV2},
419               {524288, -1, &MPIR_Allgather_Ring_MV2},
420           },
421       },
422       {
423           8,
424           {0,1,0},
425           2,
426           {
427               {0, 32768, &MPIR_Allgather_RD_MV2},
428               {32768, 524288, &MPIR_Allgather_Ring_MV2},
429               {524288, -1, &MPIR_Allgather_Ring_MV2},
430           },
431       },
432       {
433           16,
434           {0,1,0},
435           2,
436           {
437               {0, 16384, &MPIR_Allgather_RD_MV2},
438               {16384, 524288, &MPIR_Allgather_Ring_MV2},
439               {524288, -1, &MPIR_Allgather_Ring_MV2},
440           },
441       },
442       {
443           32,
444           {1,1,0},
445           2,
446           {
447               {0, 65536, &MPIR_Allgather_RD_MV2},
448               {65536, 524288, &MPIR_Allgather_Ring_MV2},
449               {524288, -1, &MPIR_Allgather_Ring_MV2},
450           },
451       },
452       {
453           64,
454           {1,1,0},
455           2,
456           {
457               {0, 32768, &MPIR_Allgather_RD_MV2},
458               {32768, 524288, &MPIR_Allgather_Ring_MV2},
459               {524288, -1, &MPIR_Allgather_Ring_MV2},
460           },
461       },
462       {
463           128,
464           {1,1,0},
465           2,
466           {
467               {0, 65536, &MPIR_Allgather_RD_MV2},
468               {65536, 524288, &MPIR_Allgather_Ring_MV2},
469               {524288, -1, &MPIR_Allgather_Ring_MV2},
470           },
471       },
472   };
473   table_ptrs[1] = mv2_tmp_allgather_thresholds_table_2ppn;
474   mv2_allgather_table_ppn_conf[2] = 16;
475   mv2_size_allgather_tuning_table[2] = 6;
476   mv2_allgather_tuning_table mv2_tmp_allgather_thresholds_table_16ppn[] = {
477       {
478           16,
479           {0,0},
480           2,
481           {
482               {0, 1024, &MPIR_Allgather_RD_MV2},
483               {1024, -1, &MPIR_Allgather_Ring_MV2},
484           },
485       },
486       {
487           32,
488           {0,0},
489           2,
490           {
491               {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
492               {1024, -1, &MPIR_Allgather_Ring_MV2},
493           },
494       },
495       {
496           64,
497           {0,0},
498           2,
499           {
500               {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
501               {1024, -1, &MPIR_Allgather_Ring_MV2},
502           },
503       },
504       {
505           128,
506           {0,0},
507           2,
508           {
509               {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
510               {1024, -1, &MPIR_Allgather_Ring_MV2},
511           },
512       },
513       {
514           256,
515           {0,0},
516           2,
517           {
518               {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
519               {1024, -1, &MPIR_Allgather_Ring_MV2},
520           },
521       },
522       {
523           512,
524           {0,0},
525           2,
526           {
527               {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
528               {1024, -1, &MPIR_Allgather_Ring_MV2},
529           },
530       },
531
532   };
533   table_ptrs[2] = mv2_tmp_allgather_thresholds_table_16ppn;
534   agg_table_sum = 0;
535   for (i = 0; i < mv2_allgather_num_ppn_conf; i++) {
536       agg_table_sum += mv2_size_allgather_tuning_table[i];
537   }
538   mv2_allgather_thresholds_table[0] =
539       xbt_malloc(agg_table_sum * sizeof (mv2_allgather_tuning_table));
540   memcpy(mv2_allgather_thresholds_table[0], table_ptrs[0],
541       (sizeof(mv2_allgather_tuning_table)
542           * mv2_size_allgather_tuning_table[0]));
543   for (i = 1; i < mv2_allgather_num_ppn_conf; i++) {
544       mv2_allgather_thresholds_table[i] =
545           mv2_allgather_thresholds_table[i - 1]
546                                          + mv2_size_allgather_tuning_table[i - 1];
547       memcpy(mv2_allgather_thresholds_table[i], table_ptrs[i],
548           (sizeof(mv2_allgather_tuning_table)
549               * mv2_size_allgather_tuning_table[i]));
550   }
551   xbt_free(table_ptrs);
552 }
553
554
555 /************ Gather variables and initializers                        */
556
557 typedef struct {
558   int min;
559   int max;
560   int (*MV2_pt_Gather_function)(void *sendbuf, int sendcnt,
561       MPI_Datatype sendtype, void *recvbuf, int recvcnt,
562       MPI_Datatype recvtype, int root, MPI_Comm  comm_ptr);
563 } mv2_gather_tuning_element;
564
565
566 typedef struct {
567   int numproc;
568   int size_inter_table;
569   mv2_gather_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
570   int size_intra_table;
571   mv2_gather_tuning_element intra_node[MV2_MAX_NB_THRESHOLDS];
572 } mv2_gather_tuning_table;
573
574 int mv2_size_gather_tuning_table=7;
575 mv2_gather_tuning_table * mv2_gather_thresholds_table=NULL; 
576
577 typedef int (*MV2_Gather_function_ptr) (void *sendbuf,
578     int sendcnt,
579     MPI_Datatype sendtype,
580     void *recvbuf,
581     int recvcnt,
582     MPI_Datatype recvtype,
583     int root, MPI_Comm comm);
584
585 MV2_Gather_function_ptr MV2_Gather_inter_leader_function = NULL;
586 MV2_Gather_function_ptr MV2_Gather_intra_node_function = NULL;
587
588
589 #define MPIR_Gather_MV2_Direct smpi_coll_tuned_gather_ompi_basic_linear
590 #define MPIR_Gather_MV2_two_level_Direct smpi_coll_tuned_gather_mvapich2_two_level
591 #define MPIR_Gather_intra smpi_coll_tuned_gather_mpich
592
593
594 static void init_mv2_gather_tables_stampede(){
595
596   mv2_size_gather_tuning_table=7;
597   mv2_gather_thresholds_table = xbt_malloc(mv2_size_gather_tuning_table*
598       sizeof (mv2_gather_tuning_table));
599   mv2_gather_tuning_table mv2_tmp_gather_thresholds_table[]={
600       {16,
601           2,{{0, 524288, &MPIR_Gather_MV2_Direct},
602               {524288, -1, &MPIR_Gather_intra}},
603               1,{{0, -1, &MPIR_Gather_MV2_Direct}}},
604               {32,
605                   3,{{0, 16384, &MPIR_Gather_MV2_Direct},
606                       {16384, 131072, &MPIR_Gather_intra},
607                       {131072, -1, &MPIR_Gather_MV2_two_level_Direct}},
608                       1,{{0, -1, &MPIR_Gather_intra}}},
609                       {64,
610                           3,{{0, 256, &MPIR_Gather_MV2_two_level_Direct},
611                               {256, 16384, &MPIR_Gather_MV2_Direct},
612                               {256, -1, &MPIR_Gather_MV2_two_level_Direct}},
613                               1,{{0, -1, &MPIR_Gather_intra}}},
614                               {128,
615                                   3,{{0, 512, &MPIR_Gather_MV2_two_level_Direct},
616                                       {512, 16384, &MPIR_Gather_MV2_Direct},
617                                       {16384, -1, &MPIR_Gather_MV2_two_level_Direct}},
618                                       1,{{0, -1, &MPIR_Gather_intra}}},
619                                       {256,
620                                           3,{{0, 512, &MPIR_Gather_MV2_two_level_Direct},
621                                               {512, 16384, &MPIR_Gather_MV2_Direct},
622                                               {16384, -1, &MPIR_Gather_MV2_two_level_Direct}},
623                                               1,{{0, -1, &MPIR_Gather_intra}}},
624                                               {512,
625                                                   3,{{0, 512, &MPIR_Gather_MV2_two_level_Direct},
626                                                       {512, 16384, &MPIR_Gather_MV2_Direct},
627                                                       {8196, -1, &MPIR_Gather_MV2_two_level_Direct}},
628                                                       1,{{0, -1, &MPIR_Gather_intra}}},
629                                                       {1024,
630                                                           3,{{0, 512, &MPIR_Gather_MV2_two_level_Direct},
631                                                               {512, 16384, &MPIR_Gather_MV2_Direct},
632                                                               {8196, -1, &MPIR_Gather_MV2_two_level_Direct}},
633                                                               1,{{0, -1, &MPIR_Gather_intra}}},
634   };
635
636   memcpy(mv2_gather_thresholds_table, mv2_tmp_gather_thresholds_table,
637       mv2_size_gather_tuning_table * sizeof (mv2_gather_tuning_table));
638
639 }
640
641
642 /************ Allgatherv variables and initializers                        */
643
644 typedef struct {
645   int min;
646   int max;
647   int (*MV2_pt_Allgatherv_function)(void *sendbuf,
648       int sendcount,
649       MPI_Datatype sendtype,
650       void *recvbuf,
651       int *recvcounts,
652       int *displs,
653       MPI_Datatype recvtype,
654       MPI_Comm commg);
655 } mv2_allgatherv_tuning_element;
656
657 typedef struct {
658   int numproc;
659   int size_inter_table;
660   mv2_allgatherv_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
661 } mv2_allgatherv_tuning_table;
662
663 int (*MV2_Allgatherv_function)(void *sendbuf,
664     int sendcount,
665     MPI_Datatype sendtype,
666     void *recvbuf,
667     int *recvcounts,
668     int *displs,
669     MPI_Datatype recvtype,
670     MPI_Comm comm);
671
672 int mv2_size_allgatherv_tuning_table = 0;
673 mv2_allgatherv_tuning_table *mv2_allgatherv_thresholds_table = NULL;
674
675 #define MPIR_Allgatherv_Rec_Doubling_MV2 smpi_coll_tuned_allgatherv_mpich_rdb
676 #define MPIR_Allgatherv_Bruck_MV2 smpi_coll_tuned_allgatherv_ompi_bruck
677 #define MPIR_Allgatherv_Ring_MV2 smpi_coll_tuned_allgatherv_mpich_ring
678
679
680 static void init_mv2_allgatherv_tables_stampede(){
681   mv2_size_allgatherv_tuning_table = 6;
682   mv2_allgatherv_thresholds_table = xbt_malloc(mv2_size_allgatherv_tuning_table *
683       sizeof (mv2_allgatherv_tuning_table));
684   mv2_allgatherv_tuning_table mv2_tmp_allgatherv_thresholds_table[] = {
685       {
686           16,
687           2,
688           {
689               {0, 512, &MPIR_Allgatherv_Rec_Doubling_MV2},
690               {512, -1, &MPIR_Allgatherv_Ring_MV2},
691           },
692       },
693       {
694           32,
695           2,
696           {
697               {0, 512, &MPIR_Allgatherv_Rec_Doubling_MV2},
698               {512, -1, &MPIR_Allgatherv_Ring_MV2},
699           },
700       },
701       {
702           64,
703           2,
704           {
705               {0, 256, &MPIR_Allgatherv_Rec_Doubling_MV2},
706               {256, -1, &MPIR_Allgatherv_Ring_MV2},
707           },
708       },
709       {
710           128,
711           2,
712           {
713               {0, 256, &MPIR_Allgatherv_Rec_Doubling_MV2},
714               {256, -1, &MPIR_Allgatherv_Ring_MV2},
715           },
716       },
717       {
718           256,
719           2,
720           {
721               {0, 256, &MPIR_Allgatherv_Rec_Doubling_MV2},
722               {256, -1, &MPIR_Allgatherv_Ring_MV2},
723           },
724       },
725       {
726           512,
727           2,
728           {
729               {0, 256, &MPIR_Allgatherv_Rec_Doubling_MV2},
730               {256, -1, &MPIR_Allgatherv_Ring_MV2},
731           },
732       },
733
734   };
735   memcpy(mv2_allgatherv_thresholds_table, mv2_tmp_allgatherv_thresholds_table,
736       mv2_size_allgatherv_tuning_table * sizeof (mv2_allgatherv_tuning_table));
737 }
738
739
740 /************ Allreduce variables and initializers                        */
741
742 typedef struct {
743   int min;
744   int max;
745   int (*MV2_pt_Allreduce_function)(void *sendbuf,
746       void *recvbuf,
747       int count,
748       MPI_Datatype datatype,
749       MPI_Op op, MPI_Comm comm);
750 } mv2_allreduce_tuning_element;
751
752 typedef struct {
753   int numproc;
754   int mcast_enabled;
755   int is_two_level_allreduce[MV2_MAX_NB_THRESHOLDS];
756   int size_inter_table;
757   mv2_allreduce_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
758   int size_intra_table;
759   mv2_allreduce_tuning_element intra_node[MV2_MAX_NB_THRESHOLDS];
760 } mv2_allreduce_tuning_table;
761
762
763 int (*MV2_Allreduce_function)(void *sendbuf,
764     void *recvbuf,
765     int count,
766     MPI_Datatype datatype,
767     MPI_Op op, MPI_Comm comm)=NULL;
768
769
770 int (*MV2_Allreduce_intra_function)( void *sendbuf,
771     void *recvbuf,
772     int count,
773     MPI_Datatype datatype,
774     MPI_Op op, MPI_Comm comm)=NULL;
775
776 int mv2_size_allreduce_tuning_table = 0;
777 mv2_allreduce_tuning_table *mv2_allreduce_thresholds_table = NULL;
778
779
780
781
782
783 static int MPIR_Allreduce_mcst_reduce_two_level_helper_MV2( void *sendbuf,
784     void *recvbuf,
785     int count,
786     MPI_Datatype datatype,
787     MPI_Op op, MPI_Comm comm)
788
789   return 0;
790 }
791
792 static  int MPIR_Allreduce_mcst_reduce_redscat_gather_MV2( void *sendbuf,
793     void *recvbuf,
794     int count,
795     MPI_Datatype datatype,
796     MPI_Op op, MPI_Comm  comm)
797 {
798   return 0;
799 }
800
801 static  int MPIR_Allreduce_reduce_p2p_MV2( void *sendbuf,
802     void *recvbuf,
803     int count,
804     MPI_Datatype datatype,
805     MPI_Op op, MPI_Comm  comm)
806 {
807   mpi_coll_reduce_fun(sendbuf,recvbuf,count,datatype,op,0,comm);
808   return MPI_SUCCESS;
809 }
810
811 static  int MPIR_Allreduce_reduce_shmem_MV2( void *sendbuf,
812     void *recvbuf,
813     int count,
814     MPI_Datatype datatype,
815     MPI_Op op, MPI_Comm  comm)
816 {
817   mpi_coll_reduce_fun(sendbuf,recvbuf,count,datatype,op,0,comm);
818   return MPI_SUCCESS;
819 }
820
/* MVAPICH2 allreduce algorithm names mapped onto SMPI functions. */
#define MPIR_Allreduce_pt2pt_rd_MV2   smpi_coll_tuned_allreduce_rdb
#define MPIR_Allreduce_pt2pt_rs_MV2   smpi_coll_tuned_allreduce_mvapich2_rs
#define MPIR_Allreduce_two_level_MV2  smpi_coll_tuned_allreduce_mvapich2_two_level
824
825
826 static void init_mv2_allreduce_tables_stampede(){
827   mv2_size_allreduce_tuning_table = 8;
828   mv2_allreduce_thresholds_table = xbt_malloc(mv2_size_allreduce_tuning_table *
829       sizeof (mv2_allreduce_tuning_table));
830   mv2_allreduce_tuning_table mv2_tmp_allreduce_thresholds_table[] = {
831       {
832           16,
833           0,
834           {1, 0},
835           2,
836           {
837               {0, 1024, &MPIR_Allreduce_pt2pt_rd_MV2},
838               {1024, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
839           },
840           2,
841           {
842               {0, 1024, &MPIR_Allreduce_reduce_shmem_MV2},
843               {1024, -1, &MPIR_Allreduce_reduce_p2p_MV2},
844           },
845       },
846       {
847           32,
848           0,
849           {1, 1, 0},
850           3,
851           {
852               {0, 1024, &MPIR_Allreduce_pt2pt_rd_MV2},
853               {1024, 16384, &MPIR_Allreduce_pt2pt_rd_MV2},
854               {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
855           },
856           2,
857           {
858               {0, 1024, &MPIR_Allreduce_reduce_shmem_MV2},
859               {1024, 16384, &MPIR_Allreduce_reduce_p2p_MV2},
860           },
861       },
862       {
863           64,
864           0,
865           {1, 1, 0},
866           3,
867           {
868               {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2},
869               {512, 16384, &MPIR_Allreduce_pt2pt_rd_MV2},
870               {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
871           },
872           2,
873           {
874               {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
875               {512, 16384, &MPIR_Allreduce_reduce_p2p_MV2},
876           },
877       },
878       {
879           128,
880           0,
881           {1, 1, 0},
882           3,
883           {
884               {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2},
885               {512, 16384, &MPIR_Allreduce_pt2pt_rd_MV2},
886               {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
887           },
888           2,
889           {
890               {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
891               {512, 16384, &MPIR_Allreduce_reduce_p2p_MV2},
892           },
893       },
894       {
895           256,
896           0,
897           {1, 1, 0},
898           3,
899           {
900               {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2},
901               {512, 16384, &MPIR_Allreduce_pt2pt_rd_MV2},
902               {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
903           },
904           2,
905           {
906               {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
907               {512, -1, &MPIR_Allreduce_reduce_p2p_MV2},
908           },
909       },
910       {
911           512,
912           0,
913           {1, 1, 0},
914           3,
915           {
916               {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2},
917               {512, 16384, &MPIR_Allreduce_pt2pt_rd_MV2},
918               {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
919           },
920           2,
921           {
922               {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
923               {512, 16384, &MPIR_Allreduce_reduce_p2p_MV2},
924           },
925       },
926       {
927           1024,
928           0,
929           {1, 1, 1, 0},
930           4,
931           {
932               {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2},
933               {512, 8192, &MPIR_Allreduce_pt2pt_rd_MV2},
934               {8192, 65536, &MPIR_Allreduce_pt2pt_rs_MV2},
935               {65536, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
936           },
937           2,
938           {
939               {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
940               {512, -1, &MPIR_Allreduce_reduce_p2p_MV2},
941           },
942       },
943       {
944           2048,
945           0,
946           {1, 1, 1, 0},
947           4,
948           {
949               {0, 64, &MPIR_Allreduce_pt2pt_rd_MV2},
950               {64, 512, &MPIR_Allreduce_reduce_p2p_MV2},
951               {512, 4096, &MPIR_Allreduce_mcst_reduce_two_level_helper_MV2},
952               {4096, 16384, &MPIR_Allreduce_pt2pt_rs_MV2},
953               {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
954           },
955           2,
956           {
957               {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
958               {512, -1, &MPIR_Allreduce_reduce_p2p_MV2},
959           },
960       },
961
962   };
963   memcpy(mv2_allreduce_thresholds_table, mv2_tmp_allreduce_thresholds_table,
964       mv2_size_allreduce_tuning_table * sizeof (mv2_allreduce_tuning_table));
965 }
966
967
968 /*
969 Bcast is deactivated for now; it defaults to the MPICH one.
970 typedef struct {
971     int min;
972     int max;
973     int (*MV2_pt_Bcast_function) (void *buf, int count, MPI_Datatype datatype,
974                                   int root, MPI_Comm comm_ptr);
975     int zcpy_pipelined_knomial_factor;
976 } mv2_bcast_tuning_element;
977
978 typedef struct {
979     int numproc;
980     int bcast_segment_size;
981     int intra_node_knomial_factor;
982     int inter_node_knomial_factor;
983     int is_two_level_bcast[MV2_MAX_NB_THRESHOLDS];
984     int size_inter_table;
985     mv2_bcast_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
986     int size_intra_table;
987     mv2_bcast_tuning_element intra_node[MV2_MAX_NB_THRESHOLDS];
988 } mv2_bcast_tuning_table;
989
990 int mv2_size_bcast_tuning_table = 0;
991 mv2_bcast_tuning_table *mv2_bcast_thresholds_table = NULL;
992
993
994 int (*MV2_Bcast_function) (void *buffer, int count, MPI_Datatype datatype,
995                            int root, MPI_Comm comm_ptr) = NULL;
996
997 int (*MV2_Bcast_intra_node_function) (void *buffer, int count, MPI_Datatype datatype,
998                                       int root, MPI_Comm comm_ptr) = NULL;
999
1000
1001  */
1002
1003
1004 /*
1005 static void init_mv2_bcast_tables_stampede(){
1006  //Stampede,
1007         mv2_size_bcast_tuning_table=8;
1008         mv2_bcast_thresholds_table = xbt_malloc(mv2_size_bcast_tuning_table *
1009                                                  sizeof (mv2_bcast_tuning_table));
1010
1011   mv2_bcast_tuning_table mv2_tmp_bcast_thresholds_table[]={
1012     {
1013             16,
1014             8192, 4, 4,
1015             {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
1016             11,
1017             {
1018               {0, 8, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1019               {8, 16, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1020               {16, 1024, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1021               {1024, 8192, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1022               {8192, 16384, &MPIR_Bcast_binomial_MV2, -1},
1023               {16384, 32768, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1024               {32768, 65536, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1025               {65536, 131072, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1},
1026               {131072, 262144, &MPIR_Bcast_scatter_ring_allgather_MV2, -1},
1027               {262144, 524288, &MPIR_Bcast_scatter_doubling_allgather_MV2, -1},
1028               {524288, -1, &MPIR_Bcast_scatter_ring_allgather_MV2, -1}
1029             },
1030             11,
1031             {
1032               {0, 8, &MPIR_Shmem_Bcast_MV2, 2},
1033               {8, 16, &MPIR_Shmem_Bcast_MV2, 4},
1034               {16, 1024, &MPIR_Shmem_Bcast_MV2, 2},
1035               {1024, 8192, &MPIR_Shmem_Bcast_MV2, 4},
1036               {8192, 16384, &MPIR_Shmem_Bcast_MV2, -1},
1037               {16384, 32768, &MPIR_Shmem_Bcast_MV2, 4},
1038               {32768, 65536, &MPIR_Shmem_Bcast_MV2, 2},
1039               {65536, 131072, &MPIR_Shmem_Bcast_MV2, -1},
1040               {131072, 262144, &MPIR_Shmem_Bcast_MV2, -1},
1041               {262144, 524288, &MPIR_Shmem_Bcast_MV2, -1},
1042               {524288, -1, &MPIR_Shmem_Bcast_MV2, -1}
1043             }
1044     },
1045     {
1046             32,
1047             8192, 4, 4,
1048             {1, 1, 1, 1, 1, 1, 1, 1},
1049             8,
1050             {
1051               {0, 128, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1052               {128, 256, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1053               {256, 32768, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1054               {32768, 65536, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1055               {65536, 131072, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1056               {131072, 262144, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1057               {262144, 524288, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1058               {524288, -1, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8}
1059             },
1060             8,
1061             {
1062               {0, 128, &MPIR_Shmem_Bcast_MV2, 2},
1063               {128, 256, &MPIR_Shmem_Bcast_MV2, 4},
1064               {256, 32768, &MPIR_Shmem_Bcast_MV2, 2},
1065               {32768, 65536, &MPIR_Shmem_Bcast_MV2, 4},
1066               {65536, 131072, &MPIR_Shmem_Bcast_MV2, 2},
1067               {131072, 262144, &MPIR_Shmem_Bcast_MV2, 8},
1068               {262144, 524288, &MPIR_Shmem_Bcast_MV2, 2},
1069               {524288, -1, &MPIR_Shmem_Bcast_MV2, 8}
1070             }
1071     },
1072     {
1073             64,
1074             8192, 4, 4,
1075             {1, 1, 1, 1, 1, 1, 1, 1, 1},
1076             9,
1077             {
1078               {0, 2, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1079               {2, 4, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1080               {4, 16, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1081               {16, 32, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1082               {32, 128, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1083               {128, 256, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1084               {256, 4096, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1085               {4096, 32768, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1086               {32768, -1, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2}
1087             },
1088             9,
1089             {
1090               {0, 2, &MPIR_Shmem_Bcast_MV2, 4},
1091               {2, 4, &MPIR_Shmem_Bcast_MV2, 8},
1092               {4, 16, &MPIR_Shmem_Bcast_MV2, 4},
1093               {16, 32, &MPIR_Shmem_Bcast_MV2, 8},
1094               {32, 128, &MPIR_Shmem_Bcast_MV2, 4},
1095               {128, 256, &MPIR_Shmem_Bcast_MV2, 8},
1096               {256, 4096, &MPIR_Shmem_Bcast_MV2, 4},
1097               {4096, 32768, &MPIR_Shmem_Bcast_MV2, 8},
1098               {32768, -1, &MPIR_Shmem_Bcast_MV2, 2}
1099             }
1100     },
1101     {
1102             128,
1103             8192, 4, 4,
1104             {1, 1, 1, 0},
1105             4,
1106             {
1107               {0, 8192, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1108               {8192, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1109               {16384, 524288, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1110               {524288, -1, &MPIR_Bcast_scatter_ring_allgather_MV2, -1}
1111             },
1112             4,
1113             {
1114               {0, 8192, &MPIR_Shmem_Bcast_MV2, 8},
1115               {8192, 16384, &MPIR_Shmem_Bcast_MV2, 4},
1116               {16384, 524288, &MPIR_Shmem_Bcast_MV2, 2},
1117               {524288, -1, NULL, -1}
1118             }
1119     },
1120     {
1121             256,
1122             8192, 4, 4,
1123             {1, 1, 1, 1, 1},
1124             5,
1125             {
1126               {0, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1127               {16384, 131072, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1128               {131072, 262144, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1},
1129               {262144, 524288, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1130               {524288, -1, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1}
1131             },
1132             5,
1133             {
1134               {0, 16384, &MPIR_Shmem_Bcast_MV2, 4},
1135               {16384, 131072, &MPIR_Shmem_Bcast_MV2, 2},
1136               {131072, 262144, &MPIR_Shmem_Bcast_MV2, -1},
1137               {262144, 524288, &MPIR_Shmem_Bcast_MV2, 2},
1138               {524288, -1, &MPIR_Shmem_Bcast_MV2, -1}
1139             }
1140     },
1141     {
1142             512,
1143             8192, 4, 4,
1144             {1, 1, 1, 1, 1},
1145             5,
1146             {
1147               {0, 4096, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1148               {4096, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1149               {16384, 131072, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1150               {131072, 262144, &MPIR_Pipelined_Bcast_MV2, -1},
1151               {262144, -1, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1}
1152             },
1153             5,
1154             {
1155               {0, 4096, &MPIR_Shmem_Bcast_MV2, 8},
1156               {4096, 16384, &MPIR_Shmem_Bcast_MV2, 4},
1157               {16384, 131072, &MPIR_Shmem_Bcast_MV2, 2},
1158               {131072, 262144, &MPIR_Shmem_Bcast_MV2, -1},
1159               {262144, -1, &MPIR_Shmem_Bcast_MV2, -1}
1160             }
1161     },
1162     {
1163             1024,
1164             8192, 4, 4,
1165             {1, 1, 1, 1, 1},
1166             5,
1167             {
1168               {0, 8192, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1169               {8192, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1170               {16384, 65536, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1171               {65536, 524288, &MPIR_Pipelined_Bcast_MV2, -1},
1172               {524288, -1, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1}
1173             },
1174             5,
1175             {
1176               {0, 8192, &MPIR_Shmem_Bcast_MV2, 8},
1177               {8192, 16384, &MPIR_Shmem_Bcast_MV2, 4},
1178               {16384, 65536, &MPIR_Shmem_Bcast_MV2, 2},
1179               {65536, 524288, &MPIR_Shmem_Bcast_MV2, -1},
1180               {524288, -1, &MPIR_Shmem_Bcast_MV2, -1}
1181             }
1182     },
1183     {
1184             2048,
1185             8192, 4, 4,
1186             {1, 1, 1, 1, 1, 1, 1},
1187             7,
1188             {
1189               {0, 16, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1190               {16, 32, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1191               {32, 4096, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1192               {4096, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1193               {16384, 32768, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1194               {32768, 524288, &MPIR_Pipelined_Bcast_MV2, -1},
1195               {524288, -1, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1}
1196             },
1197             7,
1198             {
1199               {0, 16, &MPIR_Shmem_Bcast_MV2, 8},
1200               {16, 32, &MPIR_Shmem_Bcast_MV2, 4},
1201               {32, 4096, &MPIR_Shmem_Bcast_MV2, 8},
1202               {4096, 16384, &MPIR_Shmem_Bcast_MV2, 4},
1203               {16384, 32768, &MPIR_Shmem_Bcast_MV2, 2},
1204               {32768, 524288, &MPIR_Shmem_Bcast_MV2, -1},
1205               {524288, -1, &MPIR_Shmem_Bcast_MV2, -1}
1206             }
1207     }
1208   };
1209
1210         memcpy(mv2_bcast_thresholds_table, mv2_tmp_bcast_thresholds_table,
1211                     mv2_size_bcast_tuning_table * sizeof (mv2_bcast_tuning_table));
1212 }*/
1213
1214
1215 /************ Reduce variables and initializers                        */
1216
/* One row of a Reduce tuning table: a (min, max) pair and the Reduce
 * implementation associated with it.
 * NOTE(review): in the initializers of this file each row's max equals the
 * next row's min and the last row uses max = -1; presumably this is a
 * message-size band with -1 meaning "no upper bound" -- confirm against the
 * selector code that reads these tables. */
1217 typedef struct {
1218   int min; /* lower bound of the band */
1219   int max; /* upper bound of the band; -1 on the last row of every table */
1220   int (*MV2_pt_Reduce_function)(void *sendbuf, /* implementation for this band */
1221       void *recvbuf,
1222       int count,
1223       MPI_Datatype datatype,
1224       MPI_Op op,
1225       int root,
1226       MPI_Comm  comm_ptr);
1227 } mv2_reduce_tuning_element;
1228
/* Reduce tuning data for one process count.
 * The initializers in init_mv2_reduce_tables_stampede() are positional, so
 * the order of these fields must not change. */
1229 typedef struct {
1230   int numproc; /* process count this entry is calibrated for */
1231   int inter_k_degree;
1232   int intra_k_degree;
1233   int is_two_level_reduce[MV2_MAX_NB_THRESHOLDS]; /* one 0/1 flag per inter_leader row in the initializers */
1234   int size_inter_table; /* number of rows filled in inter_leader */
1235   mv2_reduce_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
1236   int size_intra_table; /* number of rows filled in intra_node */
1237   mv2_reduce_tuning_element intra_node[MV2_MAX_NB_THRESHOLDS];
1238 } mv2_reduce_tuning_table;
1239
/* Reduce tuning state. The size and the table pointer start empty and are
 * set by init_mv2_reduce_tables_stampede(). */
1240 int mv2_size_reduce_tuning_table = 0; /* number of entries in mv2_reduce_thresholds_table */
1241 mv2_reduce_tuning_table *mv2_reduce_thresholds_table = NULL; /* xbt_malloc'ed array of entries */
1242
1243
/* Both k-nomial factors default to 2; nothing in this part of the file
 * changes them. */
1244 int mv2_reduce_intra_knomial_factor = 2;
1245 int mv2_reduce_inter_knomial_factor = 2;
1246
/* Reduce implementation pointers, initially NULL. They are not assigned in
 * this part of the file; presumably the selector picks them from the tuning
 * table rows -- TODO confirm. */
1247 int (*MV2_Reduce_function)( void *sendbuf,
1248     void *recvbuf,
1249     int count,
1250     MPI_Datatype datatype,
1251     MPI_Op op,
1252     int root,
1253     MPI_Comm  comm_ptr)=NULL;
1254
1255 int (*MV2_Reduce_intra_function)( void *sendbuf,
1256     void *recvbuf,
1257     int count,
1258     MPI_Datatype datatype,
1259     MPI_Op op,
1260     int root,
1261     MPI_Comm  comm_ptr)=NULL;
1262
1263
/* Map the MVAPICH2 Reduce algorithm names used in the tuning tables onto
 * SMPI implementations. The inter- and intra-node k-nomial wrappers both
 * resolve to the same SMPI function, and the "shmem" name resolves to the
 * ompi_basic_linear Reduce. */
1264 #define MPIR_Reduce_inter_knomial_wrapper_MV2 smpi_coll_tuned_reduce_mvapich2_knomial
1265 #define MPIR_Reduce_intra_knomial_wrapper_MV2 smpi_coll_tuned_reduce_mvapich2_knomial
1266 #define MPIR_Reduce_binomial_MV2 smpi_coll_tuned_reduce_binomial
1267 #define MPIR_Reduce_redscat_gather_MV2 smpi_coll_tuned_reduce_scatter_gather
1268 #define MPIR_Reduce_shmem_MV2 smpi_coll_tuned_reduce_ompi_basic_linear
1269 #define MPIR_Reduce_two_level_helper_MV2 smpi_coll_tuned_reduce_mvapich2_two_level
1270
1271
/* Allocate mv2_reduce_thresholds_table and fill it with the Stampede Reduce
 * tuning data: eight entries, for numproc = 16, 32, 64, ..., 2048.
 * Each entry is a positional initializer of mv2_reduce_tuning_table:
 *   numproc, inter_k_degree, intra_k_degree, is_two_level_reduce[],
 *   size_inter_table, inter_leader[], size_intra_table, intra_node[].
 * NOTE(review): the entry count (8) is hard-coded and also drives the memcpy
 * length at the end; it must stay equal to the number of initializers in the
 * local array, otherwise the copy reads past the array or drops entries. */
1272 static void init_mv2_reduce_tables_stampede(){
1273   /*Stampede*/
1274   mv2_size_reduce_tuning_table = 8; /* must match the number of entries listed below */
1275   mv2_reduce_thresholds_table = xbt_malloc(mv2_size_reduce_tuning_table *
1276       sizeof (mv2_reduce_tuning_table));
1277   mv2_reduce_tuning_table mv2_tmp_reduce_thresholds_table[] = {
1278       {
1279           16, /* numproc */
1280           4, /* inter_k_degree */
1281           4, /* intra_k_degree */
1282           {1, 0, 0}, /* is_two_level_reduce: one flag per inter_leader row */
1283           3, /* size_inter_table */
1284           { /* inter_leader rows: {min, max, function} */
1285               {0, 262144, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1286               {262144, 1048576, &MPIR_Reduce_binomial_MV2},
1287               {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
1288           },
1289           2, /* size_intra_table */
1290           { /* intra_node rows: {min, max, function} */
1291               {0, 65536, &MPIR_Reduce_shmem_MV2},
1292               {65536,-1,  &MPIR_Reduce_binomial_MV2},
1293           },
1294       },
1295       {
1296           32,
1297           4,
1298           4,
1299           {1, 1, 1, 1, 0, 0, 0},
1300           7,
1301           {
1302               {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1303               {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1304               {16384, 32768, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1305               {32768, 65536, &MPIR_Reduce_binomial_MV2},
1306               {65536, 262144, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1307               {262144, 1048576, &MPIR_Reduce_binomial_MV2},
1308               {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
1309           },
1310           6,
1311           {
1312               {0, 8192, &MPIR_Reduce_shmem_MV2},
1313               {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1314               {16384, 32768, &MPIR_Reduce_shmem_MV2},
1315               {32768, 65536, &MPIR_Reduce_shmem_MV2},
1316               {65536, 262144, &MPIR_Reduce_shmem_MV2},
1317               {262144,-1,  &MPIR_Reduce_binomial_MV2},
1318           },
1319       },
1320       {
1321           64,
1322           4,
1323           4,
1324           {1, 1, 1, 1, 0},
1325           5,
1326           {
1327               {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1328               {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1329               {16384, 65536, &MPIR_Reduce_binomial_MV2},
1330               {65536, 262144, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1331               {262144, -1, &MPIR_Reduce_redscat_gather_MV2},
1332           },
1333           5,
1334           {
1335               {0, 8192, &MPIR_Reduce_shmem_MV2},
1336               {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1337               {16384, 65536, &MPIR_Reduce_shmem_MV2},
1338               {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1339               {262144, -1, &MPIR_Reduce_binomial_MV2},
1340           },
1341       },
1342       {
1343           128,
1344           4,
1345           4,
1346           {1, 0, 1, 0, 1, 0},
1347           6,
1348           {
1349               {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1350               {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1351               {16384, 65536, &MPIR_Reduce_binomial_MV2},
1352               {65536, 262144, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1353               {262144, 1048576, &MPIR_Reduce_binomial_MV2},
1354               {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
1355           },
1356           5,
1357           {
1358               {0, 8192, &MPIR_Reduce_shmem_MV2},
1359               {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1360               {16384, 65536, &MPIR_Reduce_shmem_MV2},
1361               {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1362               {262144, -1, &MPIR_Reduce_binomial_MV2},
1363           },
1364       },
1365       {
1366           256,
1367           4,
1368           4,
1369           {1, 1, 1, 0, 1, 1, 0},
1370           7,
1371           {
1372               {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1373               {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1374               {16384, 32768, &MPIR_Reduce_binomial_MV2},
1375               {32768, 65536, &MPIR_Reduce_binomial_MV2},
1376               {65536, 262144, &MPIR_Reduce_binomial_MV2},
1377               {262144, 1048576, &MPIR_Reduce_binomial_MV2},
1378               {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
1379           },
1380           6,
1381           {
1382               {0, 8192, &MPIR_Reduce_shmem_MV2},
1383               {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1384               {16384, 32768, &MPIR_Reduce_shmem_MV2},
1385               {32768, 65536, &MPIR_Reduce_shmem_MV2},
1386               {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1387               {262144, -1, &MPIR_Reduce_binomial_MV2},
1388           },
1389       },
1390       {
1391           512,
1392           4,
1393           4,
1394           {1, 0, 1, 1, 1, 0},
1395           6,
1396           {
1397               {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1398               {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1399               {16384, 65536, &MPIR_Reduce_binomial_MV2},
1400               {65536, 262144, &MPIR_Reduce_binomial_MV2},
1401               {262144, 1048576, &MPIR_Reduce_binomial_MV2},
1402               {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
1403           },
1404           5,
1405           {
1406               {0, 8192, &MPIR_Reduce_shmem_MV2},
1407               {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1408               {16384, 65536, &MPIR_Reduce_shmem_MV2},
1409               {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1410               {262144, -1, &MPIR_Reduce_binomial_MV2},
1411           },
1412       },
1413       {
1414           1024,
1415           4,
1416           4,
1417           {1, 0, 1, 1, 1},
1418           5,
1419           {
1420               {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1421               {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1422               {16384, 65536, &MPIR_Reduce_binomial_MV2},
1423               {65536, 262144, &MPIR_Reduce_binomial_MV2},
1424               {262144, -1, &MPIR_Reduce_binomial_MV2},
1425           },
1426           5,
1427           {
1428               {0, 8192, &MPIR_Reduce_shmem_MV2},
1429               {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1430               {16384, 65536, &MPIR_Reduce_shmem_MV2},
1431               {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1432               {262144, -1, &MPIR_Reduce_binomial_MV2},
1433           },
1434       },
1435       {
1436           2048,
1437           4,
1438           4,
1439           {1, 0, 1, 1, 1,1},
1440           6,
1441           {
1442               {0, 2048, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1443               {2048, 4096, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1444               {4096, 16384, &MPIR_Reduce_binomial_MV2},
1445               {16384, 65536, &MPIR_Reduce_binomial_MV2},
1446               {65536, 131072, &MPIR_Reduce_binomial_MV2},
1447               {131072, -1, &MPIR_Reduce_binomial_MV2},
1448           },
1449           6,
1450           {
1451               {0, 2048, &MPIR_Reduce_shmem_MV2},
1452               {2048, 4096, &MPIR_Reduce_shmem_MV2},
1453               {4096, 16384, &MPIR_Reduce_shmem_MV2},
1454               {16384, 65536, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1455               {65536, 131072, &MPIR_Reduce_binomial_MV2},
1456               {131072, -1, &MPIR_Reduce_shmem_MV2},
1457           },
1458       },
1459
1460   };
/* Copy the local table into the xbt_malloc'ed global array. */
1461   memcpy(mv2_reduce_thresholds_table, mv2_tmp_reduce_thresholds_table,
1462       mv2_size_reduce_tuning_table * sizeof (mv2_reduce_tuning_table));
1463 }
1464
1465 /************ Reduce scatter variables and initializers                        */
1466
/* One row of a Reduce_scatter tuning table: a (min, max) pair and the
 * Reduce_scatter implementation associated with it.
 * NOTE(review): rows in the initializers chain (each max equals the next
 * min) and the last row uses max = -1; presumably a message-size band with
 * -1 meaning "no upper bound" -- confirm against the selector code. */
1467 typedef struct {
1468   int min; /* lower bound of the band */
1469   int max; /* upper bound of the band; -1 on the last row of every table */
1470   int (*MV2_pt_Red_scat_function)(void *sendbuf, /* implementation for this band */
1471       void *recvbuf,
1472       int *recvcnts,
1473       MPI_Datatype datatype,
1474       MPI_Op op,
1475       MPI_Comm comm_ptr);
1476 } mv2_red_scat_tuning_element;
1477
/* Reduce_scatter tuning data for one process count. Filled by positional
 * initializers, so the field order must not change. */
1478 typedef struct {
1479   int numproc; /* process count this entry is calibrated for */
1480   int size_inter_table; /* number of rows filled in inter_leader */
1481   mv2_red_scat_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
1482 } mv2_red_scat_tuning_table;
1483
/* Reduce_scatter tuning state; the size and the table pointer are set by
 * init_mv2_reduce_scatter_tables_stampede(). */
1484 int mv2_size_red_scat_tuning_table = 0; /* number of entries in mv2_red_scat_thresholds_table */
1485 mv2_red_scat_tuning_table *mv2_red_scat_thresholds_table = NULL; /* xbt_malloc'ed array of entries */
1486
1487
/* Reduce_scatter implementation pointer. Unlike MV2_Reduce_function it has no
 * explicit "= NULL"; as a file-scope object it still starts out as a null
 * pointer. It is not assigned in this part of the file. */
1488 int (*MV2_Red_scat_function)(void *sendbuf,
1489     void *recvbuf,
1490     int *recvcnts,
1491     MPI_Datatype datatype,
1492     MPI_Op op,
1493     MPI_Comm comm_ptr);
1494
1495
1496
/* Adapter that gives smpi_mpi_reduce_scatter() the signature expected by
 * mv2_red_scat_tuning_element rows. All arguments are forwarded unchanged;
 * any result of the call is not used and MPI_SUCCESS is always returned. */
1497 static  int MPIR_Reduce_Scatter_Basic_MV2(void *sendbuf,
1498     void *recvbuf,
1499     int *recvcnts,
1500     MPI_Datatype datatype,
1501     MPI_Op op,
1502     MPI_Comm comm)
1503 {
1504   smpi_mpi_reduce_scatter(sendbuf,recvbuf,recvcnts,datatype,op,comm);
1505   return MPI_SUCCESS;
1506 }
/* Map the MVAPICH2 Reduce_scatter algorithm names onto SMPI implementations. */
1507 #define MPIR_Reduce_scatter_non_comm_MV2 smpi_coll_tuned_reduce_scatter_mpich_noncomm
1508 #define MPIR_Reduce_scatter_Rec_Halving_MV2 smpi_coll_tuned_reduce_scatter_ompi_basic_recursivehalving
1509 #define MPIR_Reduce_scatter_Pair_Wise_MV2 smpi_coll_tuned_reduce_scatter_mpich_pair
1510
1511
1512
1513
/* Allocate mv2_red_scat_thresholds_table and fill it with the Reduce_scatter
 * tuning data: six entries, for numproc = 16, 32, 64, 128, 256 and 512.
 * Each entry is a positional initializer of mv2_red_scat_tuning_table:
 *   numproc, size_inter_table, inter_leader[].
 * NOTE(review): the entry count (6) is hard-coded and also drives the memcpy
 * length at the end; it must stay equal to the number of initializers in the
 * local array. */
1514 static void init_mv2_reduce_scatter_tables_stampede(){
1515   mv2_size_red_scat_tuning_table = 6; /* must match the number of entries listed below */
1516   mv2_red_scat_thresholds_table = xbt_malloc(mv2_size_red_scat_tuning_table *
1517       sizeof (mv2_red_scat_tuning_table));
1518   mv2_red_scat_tuning_table mv2_tmp_red_scat_thresholds_table[] = {
1519       {
1520           16, /* numproc */
1521           3, /* size_inter_table */
1522           { /* inter_leader rows: {min, max, function} */
1523               {0, 64, &MPIR_Reduce_Scatter_Basic_MV2},
1524               {64, 65536, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1525               {65536, -1, &MPIR_Reduce_scatter_Pair_Wise_MV2},
1526           },
1527       },
1528       {
1529           32,
1530           3,
1531           {
1532               {0, 64, &MPIR_Reduce_Scatter_Basic_MV2},
1533               {64, 131072, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1534               {131072, -1, &MPIR_Reduce_scatter_Pair_Wise_MV2},
1535           },
1536       },
1537       {
1538           64,
1539           3,
1540           {
1541               {0, 1024, &MPIR_Reduce_Scatter_Basic_MV2},
1542               {1024, 262144, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1543               {262144, -1, &MPIR_Reduce_scatter_Pair_Wise_MV2},
1544           },
1545       },
1546       {
1547           128,
1548           2,
1549           {
1550               {0, 128, &MPIR_Reduce_Scatter_Basic_MV2},
1551               {128, -1, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1552           },
1553       },
1554       {
1555           256,
1556           2,
1557           {
1558               {0, 128, &MPIR_Reduce_Scatter_Basic_MV2},
1559               {128, -1, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1560           },
1561       },
1562       {
1563           512,
1564           2,
1565           {
1566               {0, 256, &MPIR_Reduce_Scatter_Basic_MV2},
1567               {256, -1, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1568           },
1569       },
1570
1571   };
/* Copy the local table into the xbt_malloc'ed global array. */
1572   memcpy(mv2_red_scat_thresholds_table, mv2_tmp_red_scat_thresholds_table,
1573       mv2_size_red_scat_tuning_table * sizeof (mv2_red_scat_tuning_table));
1574 }
1575
1576 /************ Scatter variables and initializers                        */
1577
/* One row of a Scatter tuning table: a (min, max) pair and the Scatter
 * implementation associated with it.
 * NOTE(review): presumably a message-size band, with max = -1 on a row
 * meaning "no upper bound" -- confirm against the selector code. */
1578 typedef struct {
1579   int min; /* lower bound of the band */
1580   int max; /* upper bound of the band; -1 is used on final rows */
1581   int (*MV2_pt_Scatter_function)(void *sendbuf, /* implementation for this band */
1582       int sendcnt,
1583       MPI_Datatype sendtype,
1584       void *recvbuf,
1585       int recvcnt,
1586       MPI_Datatype recvtype,
1587       int root, MPI_Comm comm);
1588 } mv2_scatter_tuning_element;
1589
/* Scatter tuning data for one process count. Filled by positional
 * initializers, so the field order must not change. */
1590 typedef struct {
1591   int numproc; /* process count this entry is calibrated for */
1592   int size_inter_table; /* row count declared for inter_leader */
1593   mv2_scatter_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
1594   int size_intra_table; /* row count declared for intra_node */
1595   mv2_scatter_tuning_element intra_node[MV2_MAX_NB_THRESHOLDS];
1596 } mv2_scatter_tuning_table;
1597
1598
/* Scatter tuning state, organised per processes-per-node (ppn)
 * configuration. init_mv2_scatter_tables_stampede() allocates the three
 * arrays and raises the configuration count to 3. */
1599 int *mv2_scatter_table_ppn_conf = NULL; /* ppn value of each configuration (the initializer stores 1, 2 and 16) */
1600 int mv2_scatter_num_ppn_conf = 1; /* number of ppn configurations */
1601 int *mv2_size_scatter_tuning_table = NULL; /* entry count of each configuration's table */
1602 mv2_scatter_tuning_table **mv2_scatter_thresholds_table = NULL; /* one tuning table per configuration */
1603
/* Scatter implementation pointers, initially NULL. They are not assigned in
 * this part of the file; presumably the selector picks them from the tuning
 * table rows -- TODO confirm. */
1604 int (*MV2_Scatter_function) (void *sendbuf, int sendcount, MPI_Datatype sendtype,
1605     void *recvbuf, int recvcount, MPI_Datatype recvtype,
1606     int root, MPI_Comm comm)=NULL;
1607
1608 int (*MV2_Scatter_intra_function) (void *sendbuf, int sendcount, MPI_Datatype sendtype,
1609     void *recvbuf, int recvcount, MPI_Datatype recvtype,
1610     int root, MPI_Comm comm)=NULL;
/* Prototype of the Scatter "mcst wrap" entry referenced by the 16-ppn
 * Scatter tuning rows. */
1611 int MPIR_Scatter_mcst_wrap_MV2(void *sendbuf,
1612     int sendcnt,
1613     MPI_Datatype sendtype,
1614     void *recvbuf,
1615     int recvcnt,
1616     MPI_Datatype recvtype,
1617     int root, MPI_Comm comm_ptr);
1618
/* Stub: ignores every argument, performs no communication and returns 0.
 * NOTE(review): this is a non-static function defined in a header; including
 * the header from more than one translation unit would define the symbol
 * twice -- confirm it is included from a single source file. */
1619 int MPIR_Scatter_mcst_wrap_MV2(void *sendbuf,
1620     int sendcnt,
1621     MPI_Datatype sendtype,
1622     void *recvbuf,
1623     int recvcnt,
1624     MPI_Datatype recvtype,
1625     int root, MPI_Comm comm_ptr)
1626 {
1627   return 0;
1628 }
1629
/* Map the MVAPICH2 Scatter algorithm names onto SMPI implementations. The
 * "two_level" names resolve to the same SMPI functions as their plain
 * counterparts, so rows that differ only in that respect select the same
 * code. */
1630 #define MPIR_Scatter_MV2_Binomial smpi_coll_tuned_scatter_ompi_binomial
1631 #define MPIR_Scatter_MV2_Direct smpi_coll_tuned_scatter_ompi_basic_linear
1632 #define MPIR_Scatter_MV2_two_level_Binomial smpi_coll_tuned_scatter_ompi_binomial
1633 #define MPIR_Scatter_MV2_two_level_Direct smpi_coll_tuned_scatter_ompi_basic_linear
1634
1635
1636
1637
1638 static void init_mv2_scatter_tables_stampede(){
1639   {
1640     int agg_table_sum = 0;
1641     int i;
1642     mv2_scatter_tuning_table **table_ptrs = NULL;
1643     mv2_scatter_num_ppn_conf = 3;
1644     mv2_scatter_thresholds_table
1645     = xbt_malloc(sizeof(mv2_scatter_tuning_table *)
1646         * mv2_scatter_num_ppn_conf);
1647     table_ptrs = xbt_malloc(sizeof(mv2_scatter_tuning_table *)
1648         * mv2_scatter_num_ppn_conf);
1649     mv2_size_scatter_tuning_table = xbt_malloc(sizeof(int) *
1650         mv2_scatter_num_ppn_conf);
1651     mv2_scatter_table_ppn_conf
1652     = xbt_malloc(mv2_scatter_num_ppn_conf * sizeof(int));
1653     mv2_scatter_table_ppn_conf[0] = 1;
1654     mv2_size_scatter_tuning_table[0] = 6;
1655     mv2_scatter_tuning_table mv2_tmp_scatter_thresholds_table_1ppn[] = {
1656         {2,
1657             1,
1658             {
1659                 {0, -1, &MPIR_Scatter_MV2_Binomial},
1660             },
1661             1,
1662             {
1663                 {0, -1, &MPIR_Scatter_MV2_Binomial},
1664             },
1665         },
1666
1667         {4,
1668             1,
1669             {
1670                 {0, -1, &MPIR_Scatter_MV2_Direct},
1671             },
1672             1,
1673             {
1674                 {0, -1, &MPIR_Scatter_MV2_Direct},
1675             },
1676         },
1677
1678         {8,
1679             1,
1680             {
1681                 {0, -1, &MPIR_Scatter_MV2_Direct},
1682             },
1683             1,
1684             {
1685                 {0, -1, &MPIR_Scatter_MV2_Direct},
1686             },
1687         },
1688
1689         {16,
1690             1,
1691             {
1692                 {0, -1, &MPIR_Scatter_MV2_Direct},
1693             },
1694             1,
1695             {
1696                 {0, -1, &MPIR_Scatter_MV2_Direct},
1697             },
1698         },
1699
1700         {32,
1701             1,
1702             {
1703                 {0, -1, &MPIR_Scatter_MV2_Direct},
1704             },
1705             1,
1706             {
1707                 {0, -1, &MPIR_Scatter_MV2_Direct},
1708             },
1709         },
1710
1711         {64,
1712             2,
1713             {
1714                 {0, 32, &MPIR_Scatter_MV2_Binomial},
1715                 {32, -1, &MPIR_Scatter_MV2_Direct},
1716             },
1717             1,
1718             {
1719                 {0, -1, &MPIR_Scatter_MV2_Binomial},
1720             },
1721         },
1722     };
1723     table_ptrs[0] = mv2_tmp_scatter_thresholds_table_1ppn;
1724     mv2_scatter_table_ppn_conf[1] = 2;
1725     mv2_size_scatter_tuning_table[1] = 6;
1726     mv2_scatter_tuning_table mv2_tmp_scatter_thresholds_table_2ppn[] = {
1727         {4,
1728             2,
1729             {
1730                 {0, 4096, &MPIR_Scatter_MV2_Binomial},
1731                 {4096, -1, &MPIR_Scatter_MV2_Direct},
1732             },
1733             1,
1734             {
1735                 {0, -1, &MPIR_Scatter_MV2_Direct},
1736             },
1737         },
1738
1739         {8,
1740             2,
1741             {
1742                 {0, 512, &MPIR_Scatter_MV2_two_level_Direct},
1743                 {512, -1, &MPIR_Scatter_MV2_Direct},
1744             },
1745             1,
1746             {
1747                 {0, -1, &MPIR_Scatter_MV2_Binomial},
1748             },
1749         },
1750
1751         {16,
1752             2,
1753             {
1754                 {0, 2048, &MPIR_Scatter_MV2_two_level_Direct},
1755                 {2048, -1, &MPIR_Scatter_MV2_Direct},
1756             },
1757             1,
1758             {
1759                 {0, -1, &MPIR_Scatter_MV2_Binomial},
1760             },
1761         },
1762
1763         {32,
1764             2,
1765             {
1766                 {0, 2048, &MPIR_Scatter_MV2_two_level_Direct},
1767                 {2048, -1, &MPIR_Scatter_MV2_Direct},
1768             },
1769             1,
1770             {
1771                 {0, -1, &MPIR_Scatter_MV2_Binomial},
1772             },
1773         },
1774
1775         {64,
1776             2,
1777             {
1778                 {0, 8192, &MPIR_Scatter_MV2_two_level_Direct},
1779                 {8192, -1, &MPIR_Scatter_MV2_Direct},
1780             },
1781             1,
1782             {
1783                 {0, -1, &MPIR_Scatter_MV2_Binomial},
1784             },
1785         },
1786
1787         {128,
1788             4,
1789             {
1790                 {0, 16, &MPIR_Scatter_MV2_Binomial},
1791                 {16, 128, &MPIR_Scatter_MV2_two_level_Binomial},
1792                 {128, 16384, &MPIR_Scatter_MV2_two_level_Direct},
1793                 {16384, -1, &MPIR_Scatter_MV2_Direct},
1794             },
1795             1,
1796             {
1797                 {0, 128, &MPIR_Scatter_MV2_Direct},
1798                 {128, -1, &MPIR_Scatter_MV2_Binomial},
1799             },
1800         },
1801     };
1802     table_ptrs[1] = mv2_tmp_scatter_thresholds_table_2ppn;
1803     mv2_scatter_table_ppn_conf[2] = 16;
1804     mv2_size_scatter_tuning_table[2] = 8;
1805     mv2_scatter_tuning_table mv2_tmp_scatter_thresholds_table_16ppn[] = {
1806         {
1807             16,
1808             2,
1809             {
1810                 {0, 256, &MPIR_Scatter_MV2_Binomial},
1811                 {256, -1, &MPIR_Scatter_MV2_Direct},
1812             },
1813             1,
1814             {
1815                 { 0, -1, &MPIR_Scatter_MV2_Direct},
1816             },
1817         },
1818
1819         {
1820             32,
1821             2,
1822             {
1823                 {0, 512, &MPIR_Scatter_MV2_Binomial},
1824                 {512, -1, &MPIR_Scatter_MV2_Direct},
1825             },
1826             1,
1827             {
1828                 { 0, -1, &MPIR_Scatter_MV2_Direct},
1829             },
1830         },
1831
1832         {
1833             64,
1834             2,
1835             {
1836                 {0, 1024, &MPIR_Scatter_MV2_two_level_Direct},
1837                 {1024, -1, &MPIR_Scatter_MV2_Direct},
1838             },
1839             1,
1840             {
1841                 { 0, -1, &MPIR_Scatter_MV2_Direct},
1842             },
1843         },
1844
1845         {
1846             128,
1847             4,
1848             {
1849                 {0, 16, &MPIR_Scatter_mcst_wrap_MV2},
1850                 {0, 16, &MPIR_Scatter_MV2_two_level_Direct},
1851                 {16, 2048, &MPIR_Scatter_MV2_two_level_Direct},
1852                 {2048, -1, &MPIR_Scatter_MV2_Direct},
1853             },
1854             1,
1855             {
1856                 { 0, -1, &MPIR_Scatter_MV2_Direct},
1857             },
1858         },
1859
1860         {
1861             256,
1862             4,
1863             {
1864                 {0, 16, &MPIR_Scatter_mcst_wrap_MV2},
1865                 {0, 16, &MPIR_Scatter_MV2_two_level_Direct},
1866                 {16, 2048, &MPIR_Scatter_MV2_two_level_Direct},
1867                 {2048, -1,  &MPIR_Scatter_MV2_Direct},
1868             },
1869             1,
1870             {
1871                 { 0, -1, &MPIR_Scatter_MV2_Direct},
1872             },
1873         },
1874
1875         {
1876             512,
1877             4,
1878             {
1879                 {0, 16, &MPIR_Scatter_mcst_wrap_MV2},
1880                 {16, 16, &MPIR_Scatter_MV2_two_level_Direct},
1881                 {16, 4096, &MPIR_Scatter_MV2_two_level_Direct},
1882                 {4096, -1, &MPIR_Scatter_MV2_Direct},
1883             },
1884             1,
1885             {
1886                 { 0, -1, &MPIR_Scatter_MV2_Binomial},
1887             },
1888         },
1889         {
1890             1024,
1891             5,
1892             {
1893                 {0, 16, &MPIR_Scatter_mcst_wrap_MV2},
1894                 {0, 16,  &MPIR_Scatter_MV2_Binomial},
1895                 {16, 32, &MPIR_Scatter_MV2_Binomial},
1896                 {32, 4096, &MPIR_Scatter_MV2_two_level_Direct},
1897                 {4096, -1, &MPIR_Scatter_MV2_Direct},
1898             },
1899             1,
1900             {
1901                 { 0, -1, &MPIR_Scatter_MV2_Binomial},
1902             },
1903         },
1904         {
1905             2048,
1906             7,
1907             {
1908                 {0, 16, &MPIR_Scatter_mcst_wrap_MV2},
1909                 {0, 16,  &MPIR_Scatter_MV2_two_level_Binomial},
1910                 {16, 128, &MPIR_Scatter_MV2_two_level_Binomial},
1911                 {128, 1024, &MPIR_Scatter_MV2_two_level_Direct},
1912                 {1024, 16384, &MPIR_Scatter_MV2_two_level_Direct},
1913                 {16384, 65536, &MPIR_Scatter_MV2_Direct},
1914                 {65536, -1, &MPIR_Scatter_MV2_two_level_Direct},
1915             },
1916             6,
1917             {
1918                 {0, 16, &MPIR_Scatter_MV2_Binomial},
1919                 {16, 128, &MPIR_Scatter_MV2_Binomial},
1920                 {128, 1024, &MPIR_Scatter_MV2_Binomial},
1921                 {1024, 16384, &MPIR_Scatter_MV2_Direct},
1922                 {16384, 65536, &MPIR_Scatter_MV2_Direct},
1923                 {65536, -1, &MPIR_Scatter_MV2_Direct},
1924             },
1925         },
1926     };
1927     table_ptrs[2] = mv2_tmp_scatter_thresholds_table_16ppn;
1928     agg_table_sum = 0;
1929     for (i = 0; i < mv2_scatter_num_ppn_conf; i++) {
1930         agg_table_sum += mv2_size_scatter_tuning_table[i];
1931     }
1932     mv2_scatter_thresholds_table[0] =
1933         xbt_malloc(agg_table_sum * sizeof (mv2_scatter_tuning_table));
1934     memcpy(mv2_scatter_thresholds_table[0], table_ptrs[0],
1935         (sizeof(mv2_scatter_tuning_table)
1936             * mv2_size_scatter_tuning_table[0]));
1937     for (i = 1; i < mv2_scatter_num_ppn_conf; i++) {
1938         mv2_scatter_thresholds_table[i] =
1939             mv2_scatter_thresholds_table[i - 1]
1940                                          + mv2_size_scatter_tuning_table[i - 1];
1941         memcpy(mv2_scatter_thresholds_table[i], table_ptrs[i],
1942             (sizeof(mv2_scatter_tuning_table)
1943                 * mv2_size_scatter_tuning_table[i]));
1944     }
1945     xbt_free(table_ptrs);
1946   }
1947 }
1948