Logo AND Algorithmique Numérique Distribuée

Public GIT Repository
72450900803db9da00232d05b9ef49da50d149e1
[simgrid.git] / src / smpi / colls / smpi_mvapich2_selector_stampede.h
1 /* selector for collective algorithms based on mvapich decision logic, with calibration from Stampede cluster at TACC*/
2
3 /* Copyright (c) 2009-2010, 2013-2014. The SimGrid Team.
4  * All rights reserved.                                                     */
5
6 /* This is the tuning used by MVAPICH for Stampede platform based on (MV2_ARCH_INTEL_XEON_E5_2680_16, MV2_HCA_MLX_CX_FDR) */
7
8
9
10 /************ Alltoall variables and initializers                        */
11
/* Capacity of every fixed-size threshold array embedded in the tuning-table structs of this file. */
12 #define MV2_MAX_NB_THRESHOLDS  32
13
14
/* One row of an alltoall decision table: a [min, max] pair and the algorithm attached to it.
 * NOTE(review): min/max are presumably message-size bounds; how they are compared (and what
 * max == -1 means) is decided by the selector, which is not part of this chunk. */
15 typedef struct {
16   int min; /* first bound of the row */
17   int max; /* second bound of the row; the tables below use -1 in their last rows */
18   int (*MV2_pt_Alltoall_function) (void *sendbuf, int sendcount, MPI_Datatype sendtype,
19       void *recvbuf, int recvcount, MPI_Datatype recvtype,
20       MPI_Comm comm_ptr ); /* alltoall implementation associated with this row */
21 } mv2_alltoall_tuning_element;
22
/* Alltoall decision table for one process count: regular rows plus a separate set of rows
 * for the in-place case. Both arrays have room for MV2_MAX_NB_THRESHOLDS rows. */
23 typedef struct {
24   int numproc;    /* process count this entry was calibrated for */
25   int size_table; /* in the initializers of this file it equals the number of rows given for algo_table */
26   mv2_alltoall_tuning_element algo_table[MV2_MAX_NB_THRESHOLDS];          /* regular rows */
27   mv2_alltoall_tuning_element in_place_algo_table[MV2_MAX_NB_THRESHOLDS]; /* in-place rows */
28 } mv2_alltoall_tuning_table;
29
/* Global alltoall function pointer, NULL at start-up; nothing in this chunk assigns it. */
30 int (*MV2_Alltoall_function) (void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype, MPI_Comm comm_ptr)=NULL;
31
32 /* Processes-per-node value of each configuration; allocated and filled by init_mv2_alltoall_tables_stampede() */
33 int *mv2_alltoall_table_ppn_conf = NULL;
34 /* Total number of processes-per-node configurations (the Stampede initializer sets it to 3) */
35 int mv2_alltoall_num_ppn_conf = 1;
/* Number of table entries available for each configuration */
36 int *mv2_size_alltoall_tuning_table = NULL;
/* One pointer per configuration, each addressing that configuration's first table entry */
37 mv2_alltoall_tuning_table **mv2_alltoall_thresholds_table = NULL;
38
39
/* Aliases letting the tables below keep MVAPICH2-style algorithm names while resolving
 * to the smpi_coll_tuned_alltoall_* implementations. */
40 #define MPIR_Alltoall_bruck_MV2 smpi_coll_tuned_alltoall_bruck
41 #define MPIR_Alltoall_RD_MV2 smpi_coll_tuned_alltoall_rdb
42 #define MPIR_Alltoall_Scatter_dest_MV2 smpi_coll_tuned_alltoall_mvapich2_scatter_dest
43 #define MPIR_Alltoall_pairwise_MV2 smpi_coll_tuned_alltoall_pair
/* The in-place alias resolves to the ring alltoall. */
44 #define MPIR_Alltoall_inplace_MV2 smpi_coll_tuned_alltoall_ring 
45
46
/* Builds the alltoall decision tables for the Stampede calibration.
 *
 * Three processes-per-node configurations are set up (1, 2 and 16 ppn) holding 6, 6 and 7
 * entries respectively. The entries are first written as local arrays, then copied back to
 * back into one heap buffer; mv2_alltoall_thresholds_table[i] ends up pointing at the first
 * entry of configuration i inside that buffer.
 *
 * Side effects: sets mv2_alltoall_num_ppn_conf, allocates mv2_alltoall_thresholds_table,
 * mv2_size_alltoall_tuning_table and mv2_alltoall_table_ppn_conf, and installs
 * smpi_coll_cleanup_mvapich2 as cleanup callback when none is registered yet.
 *
 * Each entry reads {numproc, size_table, {algo_table rows}, {in_place_algo_table rows}} and
 * each row reads {min, max, function}. */
47 static void init_mv2_alltoall_tables_stampede(){
48   int i;
49   int agg_table_sum = 0;
50   mv2_alltoall_tuning_table **table_ptrs = NULL;
51   mv2_alltoall_num_ppn_conf = 3;
52   if(smpi_coll_cleanup_callback==NULL)
53     smpi_coll_cleanup_callback=&smpi_coll_cleanup_mvapich2;
  /* Per-configuration bookkeeping arrays; table_ptrs is a scratch array freed before returning. */
54   mv2_alltoall_thresholds_table = static_cast<mv2_alltoall_tuning_table**>(xbt_malloc(sizeof(mv2_alltoall_tuning_table *)
55       * mv2_alltoall_num_ppn_conf));
56   table_ptrs = static_cast<mv2_alltoall_tuning_table**>(xbt_malloc(sizeof(mv2_alltoall_tuning_table *)
57       * mv2_alltoall_num_ppn_conf));
58   mv2_size_alltoall_tuning_table = static_cast<int*>(xbt_malloc(sizeof(int) *
59       mv2_alltoall_num_ppn_conf));
60   mv2_alltoall_table_ppn_conf = static_cast<int*>(xbt_malloc(mv2_alltoall_num_ppn_conf * sizeof(int)));
  /* Configuration 0: 1 process per node, 6 entries (2 to 64 processes). */
61   mv2_alltoall_table_ppn_conf[0] = 1;
62   mv2_size_alltoall_tuning_table[0] = 6;
63   mv2_alltoall_tuning_table mv2_tmp_alltoall_thresholds_table_1ppn[] = {
64       {2,
65           1,
66           {{0, -1, &MPIR_Alltoall_pairwise_MV2},
67           },
68
69           {{0, -1, &MPIR_Alltoall_inplace_MV2},
70           },
71       },
72
73       {4,
74           2,
75           {{0, 262144, &MPIR_Alltoall_Scatter_dest_MV2},
76               {262144, -1, &MPIR_Alltoall_pairwise_MV2},
77           },
78
79           {{0, -1, &MPIR_Alltoall_inplace_MV2},
80           },
81       },
82
83       {8,
84           2,
85           {{0, 8, &MPIR_Alltoall_RD_MV2},
86               {8, -1, &MPIR_Alltoall_Scatter_dest_MV2},
87           },
88
89           {{0, -1, &MPIR_Alltoall_inplace_MV2},
90           },
91       },
92
93       {16,
94           3,
95           {{0, 64, &MPIR_Alltoall_RD_MV2},
96               {64, 512, &MPIR_Alltoall_bruck_MV2},
97               {512, -1, &MPIR_Alltoall_Scatter_dest_MV2},
98           },
99
100           {{0,-1, &MPIR_Alltoall_inplace_MV2},
101           },
102       },
103
104       {32,
105           3,
106           {{0, 32, &MPIR_Alltoall_RD_MV2},
107               {32, 2048, &MPIR_Alltoall_bruck_MV2},
108               {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
109           },
110
111           {{0, -1, &MPIR_Alltoall_inplace_MV2},
112           },
113       },
114
115       {64,
116           3,
117           {{0, 8, &MPIR_Alltoall_RD_MV2},
118               {8, 1024, &MPIR_Alltoall_bruck_MV2},
119               {1024, -1, &MPIR_Alltoall_Scatter_dest_MV2},
120           },
121
122           {{0, -1, &MPIR_Alltoall_inplace_MV2},
123           },
124       },
125   };
126   table_ptrs[0] = mv2_tmp_alltoall_thresholds_table_1ppn;
  /* Configuration 1: 2 processes per node, 6 entries (4 to 128 processes). */
127   mv2_alltoall_table_ppn_conf[1] = 2;
128   mv2_size_alltoall_tuning_table[1] = 6;
129   mv2_alltoall_tuning_table mv2_tmp_alltoall_thresholds_table_2ppn[] = {
130       {4,
131           2,
132           {{0, 32, &MPIR_Alltoall_RD_MV2},
133               {32, -1, &MPIR_Alltoall_Scatter_dest_MV2},
134           },
135
136           {{0, -1, &MPIR_Alltoall_inplace_MV2},
137           },
138       },
139
140       {8,
141           2,
142           {{0, 64, &MPIR_Alltoall_RD_MV2},
143               {64, -1, &MPIR_Alltoall_Scatter_dest_MV2},
144           },
145
146           {{0, -1, &MPIR_Alltoall_inplace_MV2},
147           },
148       },
149
150       {16,
151           3,
152           {{0, 64, &MPIR_Alltoall_RD_MV2},
153               {64, 2048, &MPIR_Alltoall_bruck_MV2},
154               {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
155           },
156
157           {{0,-1, &MPIR_Alltoall_inplace_MV2},
158           },
159       },
160
161       {32,
162           3,
163           {{0, 16, &MPIR_Alltoall_RD_MV2},
164               {16, 2048, &MPIR_Alltoall_bruck_MV2},
165               {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
166           },
167
168           {{0, -1, &MPIR_Alltoall_inplace_MV2},
169           },
170       },
171
172       {64,
173           3,
174           {{0, 8, &MPIR_Alltoall_RD_MV2},
175               {8, 1024, &MPIR_Alltoall_bruck_MV2},
176               {1024, -1, &MPIR_Alltoall_Scatter_dest_MV2},
177           },
178
179           {{0, -1, &MPIR_Alltoall_inplace_MV2},
180           },
181       },
182
183       {128,
184           3,
185           {{0, 4, &MPIR_Alltoall_RD_MV2},
186               {4, 2048, &MPIR_Alltoall_bruck_MV2},
187               {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
188           },
189
190           {{0, -1, &MPIR_Alltoall_inplace_MV2},
191           },
192       },
193   };
194   table_ptrs[1] = mv2_tmp_alltoall_thresholds_table_2ppn;
  /* Configuration 2: 16 processes per node, 7 entries (16 to 1024 processes).
   * Unlike the other two configurations, the in-place rows here start at a non-zero min. */
195   mv2_alltoall_table_ppn_conf[2] = 16;
196   mv2_size_alltoall_tuning_table[2] = 7;
197   mv2_alltoall_tuning_table mv2_tmp_alltoall_thresholds_table_16ppn[] = {
198       {16,
199           2,
200           {{0, 2048, &MPIR_Alltoall_bruck_MV2},
201               {2048, -1,  &MPIR_Alltoall_Scatter_dest_MV2},
202           },
203
204           {{32768, -1, &MPIR_Alltoall_inplace_MV2},
205           },
206       },
207
208       {32,
209           2,
210           {{0, 2048, &MPIR_Alltoall_bruck_MV2},
211               {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
212           },
213
214           {{16384, -1, &MPIR_Alltoall_inplace_MV2},
215           },
216       },
217
218       {64,
219           3,
220           {{0, 2048, &MPIR_Alltoall_bruck_MV2},
221               {2048, 16384, &MPIR_Alltoall_Scatter_dest_MV2},
222               {16384, -1, &MPIR_Alltoall_pairwise_MV2},
223           },
224
225           {{32768, 131072, &MPIR_Alltoall_inplace_MV2},
226           },
227       },
228
229       {128,
230           2,
231           {{0, 2048, &MPIR_Alltoall_bruck_MV2},
232               {2048, -1, &MPIR_Alltoall_pairwise_MV2},
233           },
234
235           {{16384,65536, &MPIR_Alltoall_inplace_MV2},
236           },
237       },
238
239       {256,
240           2,
241           {{0, 1024, &MPIR_Alltoall_bruck_MV2},
242               {1024, -1, &MPIR_Alltoall_pairwise_MV2},
243           },
244
245           {{16384, 65536, &MPIR_Alltoall_inplace_MV2},
246           },
247       },
248
249       {512,
250           2,
251           {{0, 1024, &MPIR_Alltoall_bruck_MV2},
252               {1024, -1, &MPIR_Alltoall_pairwise_MV2},
253           },
254
255           {{16384, 65536, &MPIR_Alltoall_inplace_MV2},
256           },
257       },
258       {1024,
259           2,
260           {{0, 1024, &MPIR_Alltoall_bruck_MV2},
261               {1024, -1, &MPIR_Alltoall_pairwise_MV2},
262           },
263
264           {{16384, 65536, &MPIR_Alltoall_inplace_MV2},
265           },
266       },
267
268   };
269   table_ptrs[2] = mv2_tmp_alltoall_thresholds_table_16ppn;
  /* Total entry count over all configurations: size of the single heap buffer. */
270   agg_table_sum = 0;
271   for (i = 0; i < mv2_alltoall_num_ppn_conf; i++) {
272       agg_table_sum += mv2_size_alltoall_tuning_table[i];
273   }
274   mv2_alltoall_thresholds_table[0] =
275       static_cast<mv2_alltoall_tuning_table*>(xbt_malloc(agg_table_sum * sizeof (mv2_alltoall_tuning_table)));
276   memcpy(mv2_alltoall_thresholds_table[0], table_ptrs[0],
277       (sizeof(mv2_alltoall_tuning_table)
278           * mv2_size_alltoall_tuning_table[0]));
  /* Each further configuration starts right after the previous one inside the same buffer. */
279   for (i = 1; i < mv2_alltoall_num_ppn_conf; i++) {
280       mv2_alltoall_thresholds_table[i] =
281           mv2_alltoall_thresholds_table[i - 1]
282                                         + mv2_size_alltoall_tuning_table[i - 1];
283       memcpy(mv2_alltoall_thresholds_table[i], table_ptrs[i],
284           (sizeof(mv2_alltoall_tuning_table)
285               * mv2_size_alltoall_tuning_table[i]));
286   }
  /* Only the scratch pointer array is released; the local tables were copied above. */
287   xbt_free(table_ptrs);
288
289
290 }
291
292
293 /************ Allgather variables and initializers                        */
294
/* One row of an allgather decision table: a [min, max] pair and the algorithm attached to it.
 * NOTE(review): min/max are presumably message-size bounds; the comparison rule and the
 * meaning of max == -1 live in the selector, not in this chunk. */
295 typedef struct {
296   int min; /* first bound of the row */
297   int max; /* second bound of the row; -1 appears in the last rows of the tables below */
298   int (*MV2_pt_Allgather_function)(void *sendbuf,
299       int sendcount,
300       MPI_Datatype sendtype,
301       void *recvbuf,
302       int recvcount,
303       MPI_Datatype recvtype, MPI_Comm comm_ptr); /* allgather implementation for this row */
304 } mv2_allgather_tuning_element;
305
/* Allgather decision table for one process count. */
306 typedef struct {
307   int numproc;                          /* process count this entry was calibrated for */
308   int two_level[MV2_MAX_NB_THRESHOLDS]; /* one 0/1 flag per slot; presumably selects the two-level scheme per row -- confirm in the selector */
309   int size_inter_table;                 /* row count recorded for inter_leader */
310   mv2_allgather_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS]; /* decision rows */
311 } mv2_allgather_tuning_table;
312
/* Global allgather function pointer. It has no explicit initializer, unlike its alltoall
 * counterpart; as a namespace-scope object it still starts out zero-initialized. */
313 int (*MV2_Allgather_function)(void *sendbuf,
314     int sendcount,
315     MPI_Datatype sendtype,
316     void *recvbuf,
317     int recvcount,
318     MPI_Datatype recvtype, MPI_Comm comm);
319
/* Processes-per-node value of each configuration (filled by init_mv2_allgather_tables_stampede()) */
320 int *mv2_allgather_table_ppn_conf = NULL;
/* Number of processes-per-node configurations */
321 int mv2_allgather_num_ppn_conf = 1;
/* Number of table entries available for each configuration */
322 int *mv2_size_allgather_tuning_table = NULL;
/* One pointer per configuration, each addressing that configuration's first table entry */
323 mv2_allgather_tuning_table **mv2_allgather_thresholds_table = NULL;
324
/* Stub: ignores every argument, moves no data and returns 0.
 * It is referenced by the 16-ppn allgather table rows below, so it needs the same
 * signature as the other allgather algorithms. */
325 static int MPIR_Allgather_RD_Allgather_Comm_MV2( void *sendbuf,
326                                  int sendcount,
327                                  MPI_Datatype sendtype,
328                                  void *recvbuf,
329                                  int recvcount,
330                                  MPI_Datatype recvtype, MPI_Comm comm_ptr)
331 {
332     return 0;
333 }
334
/* Aliases mapping MVAPICH2-style allgather names onto smpi_coll_tuned_allgather_* implementations. */
335 #define MPIR_Allgather_Bruck_MV2 smpi_coll_tuned_allgather_bruck
336 #define MPIR_Allgather_RD_MV2 smpi_coll_tuned_allgather_rdb
337 #define MPIR_Allgather_Ring_MV2 smpi_coll_tuned_allgather_ring
338 #define MPIR_2lvl_Allgather_MV2 smpi_coll_tuned_allgather_mvapich2_smp
339
/* Builds the allgather decision tables for the Stampede calibration.
 *
 * Three processes-per-node configurations are set up (1, 2 and 16 ppn), six entries each.
 * The entries are written as local arrays, then copied back to back into one heap buffer;
 * mv2_allgather_thresholds_table[i] points at the first entry of configuration i in it.
 *
 * Side effects: sets mv2_allgather_num_ppn_conf, allocates mv2_allgather_thresholds_table,
 * mv2_size_allgather_tuning_table and mv2_allgather_table_ppn_conf, and installs
 * smpi_coll_cleanup_mvapich2 as cleanup callback when none is registered yet.
 *
 * Each entry reads {numproc, {two_level flags}, size_inter_table, {inter_leader rows}} and
 * each row reads {min, max, function}. */
340 static void init_mv2_allgather_tables_stampede(){
341   int i;
342   int agg_table_sum = 0;
343
344   if(smpi_coll_cleanup_callback==NULL)
345     smpi_coll_cleanup_callback=&smpi_coll_cleanup_mvapich2;
346   mv2_allgather_tuning_table **table_ptrs = NULL;
347   mv2_allgather_num_ppn_conf = 3;
  /* Per-configuration bookkeeping arrays; table_ptrs is a scratch array freed before returning. */
348   mv2_allgather_thresholds_table
349   = static_cast<mv2_allgather_tuning_table**>(xbt_malloc(sizeof(mv2_allgather_tuning_table *)
350       * mv2_allgather_num_ppn_conf));
351   table_ptrs = static_cast<mv2_allgather_tuning_table**>(xbt_malloc(sizeof(mv2_allgather_tuning_table *)
352       * mv2_allgather_num_ppn_conf));
353   mv2_size_allgather_tuning_table = static_cast<int*>(xbt_malloc(sizeof(int) *
354       mv2_allgather_num_ppn_conf));
355   mv2_allgather_table_ppn_conf
356   = static_cast<int*>(xbt_malloc(mv2_allgather_num_ppn_conf * sizeof(int)));
  /* Configuration 0: 1 process per node, 6 entries (2 to 64 processes). */
357   mv2_allgather_table_ppn_conf[0] = 1;
358   mv2_size_allgather_tuning_table[0] = 6;
359   mv2_allgather_tuning_table mv2_tmp_allgather_thresholds_table_1ppn[] = {
360       {
361           2,
362           {0},
363           1,
364           {
365               {0, -1, &MPIR_Allgather_Ring_MV2},
366           },
367       },
368       {
369           4,
370           {0,0},
371           2,
372           {
373               {0, 262144, &MPIR_Allgather_RD_MV2},
374               {262144, -1, &MPIR_Allgather_Ring_MV2},
375           },
376       },
377       {
378           8,
379           {0,0},
380           2,
381           {
382               {0, 131072, &MPIR_Allgather_RD_MV2},
383               {131072, -1, &MPIR_Allgather_Ring_MV2},
384           },
385       },
386       {
387           16,
388           {0,0},
389           2,
390           {
391               {0, 131072, &MPIR_Allgather_RD_MV2},
392               {131072, -1, &MPIR_Allgather_Ring_MV2},
393           },
394       },
395       {
396           32,
397           {0,0},
398           2,
399           {
400               {0, 65536, &MPIR_Allgather_RD_MV2},
401               {65536, -1, &MPIR_Allgather_Ring_MV2},
402           },
403       },
404       {
405           64,
406           {0,0},
407           2,
408           {
409               {0, 32768, &MPIR_Allgather_RD_MV2},
410               {32768, -1, &MPIR_Allgather_Ring_MV2},
411           },
412       },
413   };
414   table_ptrs[0] = mv2_tmp_allgather_thresholds_table_1ppn;
  /* Configuration 1: 2 processes per node, 6 entries (4 to 128 processes).
   * NOTE(review): from 8 processes up, each entry lists three rows and three two_level
   * flags while size_inter_table stays 2 -- whether the third row is ever consulted
   * depends on the selector, which is not visible here. */
415   mv2_allgather_table_ppn_conf[1] = 2;
416   mv2_size_allgather_tuning_table[1] = 6;
417   mv2_allgather_tuning_table mv2_tmp_allgather_thresholds_table_2ppn[] = {
418       {
419           4,
420           {0,0},
421           2,
422           {
423               {0, 524288, &MPIR_Allgather_RD_MV2},
424               {524288, -1, &MPIR_Allgather_Ring_MV2},
425           },
426       },
427       {
428           8,
429           {0,1,0},
430           2,
431           {
432               {0, 32768, &MPIR_Allgather_RD_MV2},
433               {32768, 524288, &MPIR_Allgather_Ring_MV2},
434               {524288, -1, &MPIR_Allgather_Ring_MV2},
435           },
436       },
437       {
438           16,
439           {0,1,0},
440           2,
441           {
442               {0, 16384, &MPIR_Allgather_RD_MV2},
443               {16384, 524288, &MPIR_Allgather_Ring_MV2},
444               {524288, -1, &MPIR_Allgather_Ring_MV2},
445           },
446       },
447       {
448           32,
449           {1,1,0},
450           2,
451           {
452               {0, 65536, &MPIR_Allgather_RD_MV2},
453               {65536, 524288, &MPIR_Allgather_Ring_MV2},
454               {524288, -1, &MPIR_Allgather_Ring_MV2},
455           },
456       },
457       {
458           64,
459           {1,1,0},
460           2,
461           {
462               {0, 32768, &MPIR_Allgather_RD_MV2},
463               {32768, 524288, &MPIR_Allgather_Ring_MV2},
464               {524288, -1, &MPIR_Allgather_Ring_MV2},
465           },
466       },
467       {
468           128,
469           {1,1,0},
470           2,
471           {
472               {0, 65536, &MPIR_Allgather_RD_MV2},
473               {65536, 524288, &MPIR_Allgather_Ring_MV2},
474               {524288, -1, &MPIR_Allgather_Ring_MV2},
475           },
476       },
477   };
478   table_ptrs[1] = mv2_tmp_allgather_thresholds_table_2ppn;
  /* Configuration 2: 16 processes per node, 6 entries (16 to 512 processes).
   * From 32 processes up, the first row names MPIR_Allgather_RD_Allgather_Comm_MV2,
   * which in this file is a stub that returns 0 without moving data. */
479   mv2_allgather_table_ppn_conf[2] = 16;
480   mv2_size_allgather_tuning_table[2] = 6;
481   mv2_allgather_tuning_table mv2_tmp_allgather_thresholds_table_16ppn[] = {
482       {
483           16,
484           {0,0},
485           2,
486           {
487               {0, 1024, &MPIR_Allgather_RD_MV2},
488               {1024, -1, &MPIR_Allgather_Ring_MV2},
489           },
490       },
491       {
492           32,
493           {0,0},
494           2,
495           {
496               {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
497               {1024, -1, &MPIR_Allgather_Ring_MV2},
498           },
499       },
500       {
501           64,
502           {0,0},
503           2,
504           {
505               {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
506               {1024, -1, &MPIR_Allgather_Ring_MV2},
507           },
508       },
509       {
510           128,
511           {0,0},
512           2,
513           {
514               {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
515               {1024, -1, &MPIR_Allgather_Ring_MV2},
516           },
517       },
518       {
519           256,
520           {0,0},
521           2,
522           {
523               {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
524               {1024, -1, &MPIR_Allgather_Ring_MV2},
525           },
526       },
527       {
528           512,
529           {0,0},
530           2,
531           {
532               {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
533               {1024, -1, &MPIR_Allgather_Ring_MV2},
534           },
535       },
536
537   };
538   table_ptrs[2] = mv2_tmp_allgather_thresholds_table_16ppn;
  /* Total entry count over all configurations: size of the single heap buffer. */
539   agg_table_sum = 0;
540   for (i = 0; i < mv2_allgather_num_ppn_conf; i++) {
541       agg_table_sum += mv2_size_allgather_tuning_table[i];
542   }
543   mv2_allgather_thresholds_table[0] =
544       static_cast<mv2_allgather_tuning_table*>(xbt_malloc(agg_table_sum * sizeof (mv2_allgather_tuning_table)));
545   memcpy(mv2_allgather_thresholds_table[0], table_ptrs[0],
546       (sizeof(mv2_allgather_tuning_table)
547           * mv2_size_allgather_tuning_table[0]));
  /* Each further configuration starts right after the previous one inside the same buffer. */
548   for (i = 1; i < mv2_allgather_num_ppn_conf; i++) {
549       mv2_allgather_thresholds_table[i] =
550           mv2_allgather_thresholds_table[i - 1]
551                                          + mv2_size_allgather_tuning_table[i - 1];
552       memcpy(mv2_allgather_thresholds_table[i], table_ptrs[i],
553           (sizeof(mv2_allgather_tuning_table)
554               * mv2_size_allgather_tuning_table[i]));
555   }
  /* Only the scratch pointer array is released; the local tables were copied above. */
556   xbt_free(table_ptrs);
557 }
558
559
560 /************ Gather variables and initializers                        */
561
/* One row of a gather decision table: a [min, max] pair and the algorithm attached to it.
 * NOTE(review): min/max are presumably message-size bounds; the comparison rule and the
 * meaning of max == -1 live in the selector, not in this chunk. */
562 typedef struct {
563   int min; /* first bound of the row */
564   int max; /* second bound of the row; -1 appears in the last rows of the table below */
565   int (*MV2_pt_Gather_function)(void *sendbuf, int sendcnt,
566       MPI_Datatype sendtype, void *recvbuf, int recvcnt,
567       MPI_Datatype recvtype, int root, MPI_Comm  comm_ptr); /* gather implementation for this row */
568 } mv2_gather_tuning_element;
569
570
/* Gather decision table for one process count, with separate row sets named
 * inter_leader and intra_node. */
571 typedef struct {
572   int numproc;          /* process count this entry was calibrated for */
573   int size_inter_table; /* row count recorded for inter_leader */
574   mv2_gather_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
575   int size_intra_table; /* row count recorded for intra_node */
576   mv2_gather_tuning_element intra_node[MV2_MAX_NB_THRESHOLDS];
577 } mv2_gather_tuning_table;
578
/* Number of entries in mv2_gather_thresholds_table (init_mv2_gather_tables_stampede() sets it to 7 again). */
579 int mv2_size_gather_tuning_table=7;
/* Heap copy of the gather table; NULL until init_mv2_gather_tables_stampede() runs. */
580 mv2_gather_tuning_table * mv2_gather_thresholds_table=NULL; 
581
/* Signature shared by the gather function pointers declared below. */
582 typedef int (*MV2_Gather_function_ptr) (void *sendbuf,
583     int sendcnt,
584     MPI_Datatype sendtype,
585     void *recvbuf,
586     int recvcnt,
587     MPI_Datatype recvtype,
588     int root, MPI_Comm comm);
589
/* Global gather function pointers, NULL at start-up; nothing in this chunk assigns them. */
590 MV2_Gather_function_ptr MV2_Gather_inter_leader_function = NULL;
591 MV2_Gather_function_ptr MV2_Gather_intra_node_function = NULL;
592
593
/* Aliases mapping MVAPICH2-style gather names onto smpi_coll_tuned_gather_* implementations. */
594 #define MPIR_Gather_MV2_Direct smpi_coll_tuned_gather_ompi_basic_linear
595 #define MPIR_Gather_MV2_two_level_Direct smpi_coll_tuned_gather_mvapich2_two_level
596 #define MPIR_Gather_intra smpi_coll_tuned_gather_mpich
597
598
/* Builds the gather decision table for the Stampede calibration.
 *
 * Seven entries (16 to 1024 processes) are written as a local array and copied into a
 * freshly allocated mv2_gather_thresholds_table. Also installs smpi_coll_cleanup_mvapich2
 * as cleanup callback when none is registered yet.
 *
 * Each entry reads {numproc, size_inter_table, {inter_leader rows},
 * size_intra_table, {intra_node rows}} and each row reads {min, max, function}.
 *
 * NOTE(review): a few third rows do not start where the second row ends -- 256 instead of
 * 16384 in the 64-process entry, 8196 instead of 16384 in the 512 and 1024 entries. They
 * may come from the upstream calibration as-is; check upstream before changing them. */
599 static void init_mv2_gather_tables_stampede(){
600
601   if(smpi_coll_cleanup_callback==NULL)
602     smpi_coll_cleanup_callback=&smpi_coll_cleanup_mvapich2;
603   mv2_size_gather_tuning_table=7;
604   mv2_gather_thresholds_table = static_cast<mv2_gather_tuning_table*>(xbt_malloc(mv2_size_gather_tuning_table*
605       sizeof (mv2_gather_tuning_table)));
606   mv2_gather_tuning_table mv2_tmp_gather_thresholds_table[]={
607       {16,
608           2,{{0, 524288, &MPIR_Gather_MV2_Direct},
609               {524288, -1, &MPIR_Gather_intra}},
610               1,{{0, -1, &MPIR_Gather_MV2_Direct}}},
611               {32,
612                   3,{{0, 16384, &MPIR_Gather_MV2_Direct},
613                       {16384, 131072, &MPIR_Gather_intra},
614                       {131072, -1, &MPIR_Gather_MV2_two_level_Direct}},
615                       1,{{0, -1, &MPIR_Gather_intra}}},
616                       {64,
617                           3,{{0, 256, &MPIR_Gather_MV2_two_level_Direct},
618                               {256, 16384, &MPIR_Gather_MV2_Direct},
619                               {256, -1, &MPIR_Gather_MV2_two_level_Direct}},
620                               1,{{0, -1, &MPIR_Gather_intra}}},
621                               {128,
622                                   3,{{0, 512, &MPIR_Gather_MV2_two_level_Direct},
623                                       {512, 16384, &MPIR_Gather_MV2_Direct},
624                                       {16384, -1, &MPIR_Gather_MV2_two_level_Direct}},
625                                       1,{{0, -1, &MPIR_Gather_intra}}},
626                                       {256,
627                                           3,{{0, 512, &MPIR_Gather_MV2_two_level_Direct},
628                                               {512, 16384, &MPIR_Gather_MV2_Direct},
629                                               {16384, -1, &MPIR_Gather_MV2_two_level_Direct}},
630                                               1,{{0, -1, &MPIR_Gather_intra}}},
631                                               {512,
632                                                   3,{{0, 512, &MPIR_Gather_MV2_two_level_Direct},
633                                                       {512, 16384, &MPIR_Gather_MV2_Direct},
634                                                       {8196, -1, &MPIR_Gather_MV2_two_level_Direct}},
635                                                       1,{{0, -1, &MPIR_Gather_intra}}},
636                                                       {1024,
637                                                           3,{{0, 512, &MPIR_Gather_MV2_two_level_Direct},
638                                                               {512, 16384, &MPIR_Gather_MV2_Direct},
639                                                               {8196, -1, &MPIR_Gather_MV2_two_level_Direct}},
640                                                               1,{{0, -1, &MPIR_Gather_intra}}},
641   };
642
  /* Publish the table: copy all seven entries into the heap buffer allocated above. */
643   memcpy(mv2_gather_thresholds_table, mv2_tmp_gather_thresholds_table,
644       mv2_size_gather_tuning_table * sizeof (mv2_gather_tuning_table));
645
646 }
647
648
649 /************ Allgatherv variables and initializers                        */
650
/* One row of an allgatherv decision table: a [min, max] pair and the algorithm attached to it.
 * NOTE(review): min/max are presumably message-size bounds; the comparison rule and the
 * meaning of max == -1 live in the selector, not in this chunk. */
651 typedef struct {
652   int min; /* first bound of the row */
653   int max; /* second bound of the row; -1 appears in the last rows of the table below */
654   int (*MV2_pt_Allgatherv_function)(void *sendbuf,
655       int sendcount,
656       MPI_Datatype sendtype,
657       void *recvbuf,
658       int *recvcounts,
659       int *displs,
660       MPI_Datatype recvtype,
661       MPI_Comm commg); /* allgatherv implementation for this row */
662 } mv2_allgatherv_tuning_element;
663
/* Allgatherv decision table for one process count. */
664 typedef struct {
665   int numproc;          /* process count this entry was calibrated for */
666   int size_inter_table; /* row count recorded for inter_leader */
667   mv2_allgatherv_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS]; /* decision rows */
668 } mv2_allgatherv_tuning_table;
669
/* Global allgatherv function pointer; no explicit initializer, so it starts out
 * zero-initialized like any namespace-scope object. */
670 int (*MV2_Allgatherv_function)(void *sendbuf,
671     int sendcount,
672     MPI_Datatype sendtype,
673     void *recvbuf,
674     int *recvcounts,
675     int *displs,
676     MPI_Datatype recvtype,
677     MPI_Comm comm);
678
/* Number of entries in mv2_allgatherv_thresholds_table (0 until the initializer runs). */
679 int mv2_size_allgatherv_tuning_table = 0;
/* Heap copy of the allgatherv table; NULL until init_mv2_allgatherv_tables_stampede() runs. */
680 mv2_allgatherv_tuning_table *mv2_allgatherv_thresholds_table = NULL;
681
/* Aliases mapping MVAPICH2-style allgatherv names onto smpi_coll_tuned_allgatherv_* implementations.
 * MPIR_Allgatherv_Bruck_MV2 is defined but not referenced by the table built below. */
682 #define MPIR_Allgatherv_Rec_Doubling_MV2 smpi_coll_tuned_allgatherv_mpich_rdb
683 #define MPIR_Allgatherv_Bruck_MV2 smpi_coll_tuned_allgatherv_ompi_bruck
684 #define MPIR_Allgatherv_Ring_MV2 smpi_coll_tuned_allgatherv_mpich_ring
685
686
/* Builds the allgatherv decision table for the Stampede calibration.
 *
 * Six entries (16 to 512 processes) are written as a local array and copied into a freshly
 * allocated mv2_allgatherv_thresholds_table; mv2_size_allgatherv_tuning_table is set to 6.
 * Also installs smpi_coll_cleanup_mvapich2 as cleanup callback when none is registered yet.
 *
 * Each entry reads {numproc, size_inter_table, {inter_leader rows}} and each row reads
 * {min, max, function}. Every entry pairs recursive doubling (first row) with ring (second row). */
687 static void init_mv2_allgatherv_tables_stampede(){
688   if(smpi_coll_cleanup_callback==NULL)
689     smpi_coll_cleanup_callback=&smpi_coll_cleanup_mvapich2;
690   mv2_size_allgatherv_tuning_table = 6;
691   mv2_allgatherv_thresholds_table = static_cast<mv2_allgatherv_tuning_table*>(xbt_malloc(mv2_size_allgatherv_tuning_table *
692       sizeof (mv2_allgatherv_tuning_table)));
693   mv2_allgatherv_tuning_table mv2_tmp_allgatherv_thresholds_table[] = {
694       {
695           16,
696           2,
697           {
698               {0, 512, &MPIR_Allgatherv_Rec_Doubling_MV2},
699               {512, -1, &MPIR_Allgatherv_Ring_MV2},
700           },
701       },
702       {
703           32,
704           2,
705           {
706               {0, 512, &MPIR_Allgatherv_Rec_Doubling_MV2},
707               {512, -1, &MPIR_Allgatherv_Ring_MV2},
708           },
709       },
710       {
711           64,
712           2,
713           {
714               {0, 256, &MPIR_Allgatherv_Rec_Doubling_MV2},
715               {256, -1, &MPIR_Allgatherv_Ring_MV2},
716           },
717       },
718       {
719           128,
720           2,
721           {
722               {0, 256, &MPIR_Allgatherv_Rec_Doubling_MV2},
723               {256, -1, &MPIR_Allgatherv_Ring_MV2},
724           },
725       },
726       {
727           256,
728           2,
729           {
730               {0, 256, &MPIR_Allgatherv_Rec_Doubling_MV2},
731               {256, -1, &MPIR_Allgatherv_Ring_MV2},
732           },
733       },
734       {
735           512,
736           2,
737           {
738               {0, 256, &MPIR_Allgatherv_Rec_Doubling_MV2},
739               {256, -1, &MPIR_Allgatherv_Ring_MV2},
740           },
741       },
742
743   };
  /* Publish the table: copy all six entries into the heap buffer allocated above. */
744   memcpy(mv2_allgatherv_thresholds_table, mv2_tmp_allgatherv_thresholds_table,
745       mv2_size_allgatherv_tuning_table * sizeof (mv2_allgatherv_tuning_table));
746 }
747
748
749 /************ Allreduce variables and initializers                        */
750
/* One row of an allreduce decision table: a [min, max] pair and the algorithm attached to it.
 * NOTE(review): min/max are presumably message-size bounds; the comparison rule and the
 * meaning of max == -1 live in the selector, not in this chunk. */
751 typedef struct {
752   int min; /* first bound of the row */
753   int max; /* second bound of the row; -1 appears in the last rows of the table below */
754   int (*MV2_pt_Allreduce_function)(void *sendbuf,
755       void *recvbuf,
756       int count,
757       MPI_Datatype datatype,
758       MPI_Op op, MPI_Comm comm); /* allreduce implementation for this row */
759 } mv2_allreduce_tuning_element;
760
/* Allreduce decision table for one process count, with separate row sets named
 * inter_leader and intra_node. */
761 typedef struct {
762   int numproc;       /* process count this entry was calibrated for */
763   int mcast_enabled; /* 0 in every entry built in this file */
764   int is_two_level_allreduce[MV2_MAX_NB_THRESHOLDS]; /* one 0/1 flag per slot; presumably per inter_leader row -- confirm in the selector */
765   int size_inter_table; /* row count recorded for inter_leader */
766   mv2_allreduce_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
767   int size_intra_table; /* row count recorded for intra_node */
768   mv2_allreduce_tuning_element intra_node[MV2_MAX_NB_THRESHOLDS];
769 } mv2_allreduce_tuning_table;
770
771
/* Global allreduce function pointer, NULL at start-up; nothing in this chunk assigns it. */
772 int (*MV2_Allreduce_function)(void *sendbuf,
773     void *recvbuf,
774     int count,
775     MPI_Datatype datatype,
776     MPI_Op op, MPI_Comm comm)=NULL;
777
778
/* Global pointer for the intra-node allreduce function, NULL at start-up. */
779 int (*MV2_Allreduce_intra_function)( void *sendbuf,
780     void *recvbuf,
781     int count,
782     MPI_Datatype datatype,
783     MPI_Op op, MPI_Comm comm)=NULL;
784
/* Number of entries in mv2_allreduce_thresholds_table (0 until the initializer runs). */
785 int mv2_size_allreduce_tuning_table = 0;
/* Heap copy of the allreduce table; NULL until init_mv2_allreduce_tables_stampede() runs. */
786 mv2_allreduce_tuning_table *mv2_allreduce_thresholds_table = NULL;
787
788
789
790
791
/* Stub: ignores every argument, performs no reduction and returns 0.
 * It is referenced by the 2048-process allreduce table entry below, so it needs the same
 * signature as the other allreduce algorithms.
 * Fix: the opening brace of the function body was missing, which left the definition
 * syntactically invalid. */
792 static int MPIR_Allreduce_mcst_reduce_two_level_helper_MV2( void *sendbuf,
793     void *recvbuf,
794     int count,
795     MPI_Datatype datatype,
796     MPI_Op op, MPI_Comm comm)
797 {
798   return 0;
799 }
800
/* Stub: ignores every argument, performs no reduction and returns 0.
 * Not referenced by the table built in this file. */
801 static  int MPIR_Allreduce_mcst_reduce_redscat_gather_MV2( void *sendbuf,
802     void *recvbuf,
803     int count,
804     MPI_Datatype datatype,
805     MPI_Op op, MPI_Comm  comm)
806 {
807   return 0;
808 }
809
/* Allreduce-shaped adapter around mpi_coll_reduce_fun.
 * Forwards all arguments, inserting 0 as the sixth one (presumably the root rank --
 * confirm against mpi_coll_reduce_fun's declaration). The callee's result is not
 * inspected: MPI_SUCCESS is returned unconditionally. */
810 static  int MPIR_Allreduce_reduce_p2p_MV2( void *sendbuf,
811     void *recvbuf,
812     int count,
813     MPI_Datatype datatype,
814     MPI_Op op, MPI_Comm  comm)
815 {
816   mpi_coll_reduce_fun(sendbuf,recvbuf,count,datatype,op,0,comm);
817   return MPI_SUCCESS;
818 }
819
/* Allreduce-shaped adapter around mpi_coll_reduce_fun; its body is identical to
 * MPIR_Allreduce_reduce_p2p_MV2. Forwards all arguments with 0 as the sixth one
 * (presumably the root rank -- confirm) and returns MPI_SUCCESS unconditionally. */
820 static  int MPIR_Allreduce_reduce_shmem_MV2( void *sendbuf,
821     void *recvbuf,
822     int count,
823     MPI_Datatype datatype,
824     MPI_Op op, MPI_Comm  comm)
825 {
826   mpi_coll_reduce_fun(sendbuf,recvbuf,count,datatype,op,0,comm);
827   return MPI_SUCCESS;
828 }
829
/* Aliases mapping MVAPICH2-style allreduce names onto smpi_coll_tuned_allreduce_* implementations.
 * MPIR_Allreduce_two_level_MV2 is defined but not referenced by the table built below. */
830 #define MPIR_Allreduce_pt2pt_rd_MV2 smpi_coll_tuned_allreduce_rdb
831 #define MPIR_Allreduce_pt2pt_rs_MV2 smpi_coll_tuned_allreduce_mvapich2_rs
832 #define MPIR_Allreduce_two_level_MV2 smpi_coll_tuned_allreduce_mvapich2_two_level
833
834
/* Builds the allreduce decision table for the Stampede calibration.
 *
 * Eight entries (16 to 2048 processes) are written as a local array and copied into a
 * freshly allocated mv2_allreduce_thresholds_table; mv2_size_allreduce_tuning_table is set
 * to 8. Also installs smpi_coll_cleanup_mvapich2 as cleanup callback when none is
 * registered yet.
 *
 * Each entry reads {numproc, mcast_enabled, {is_two_level_allreduce flags},
 * size_inter_table, {inter_leader rows}, size_intra_table, {intra_node rows}} and each
 * row reads {min, max, function}.
 *
 * NOTE(review): the 2048-process entry lists five inter_leader rows while its
 * size_inter_table is 4, and one of them points at a stub that returns 0. Several
 * intra_node sets end at 16384 instead of -1. These may come from the upstream
 * calibration as-is; check upstream before changing them. */
835 static void init_mv2_allreduce_tables_stampede(){
836   if(smpi_coll_cleanup_callback==NULL)
837     smpi_coll_cleanup_callback=&smpi_coll_cleanup_mvapich2;
838   mv2_size_allreduce_tuning_table = 8;
839   mv2_allreduce_thresholds_table = static_cast<mv2_allreduce_tuning_table*>(xbt_malloc(mv2_size_allreduce_tuning_table *
840       sizeof (mv2_allreduce_tuning_table)));
841   mv2_allreduce_tuning_table mv2_tmp_allreduce_thresholds_table[] = {
842       {
843           16,
844           0,
845           {1, 0},
846           2,
847           {
848               {0, 1024, &MPIR_Allreduce_pt2pt_rd_MV2},
849               {1024, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
850           },
851           2,
852           {
853               {0, 1024, &MPIR_Allreduce_reduce_shmem_MV2},
854               {1024, -1, &MPIR_Allreduce_reduce_p2p_MV2},
855           },
856       },
857       {
858           32,
859           0,
860           {1, 1, 0},
861           3,
862           {
863               {0, 1024, &MPIR_Allreduce_pt2pt_rd_MV2},
864               {1024, 16384, &MPIR_Allreduce_pt2pt_rd_MV2},
865               {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
866           },
867           2,
868           {
869               {0, 1024, &MPIR_Allreduce_reduce_shmem_MV2},
870               {1024, 16384, &MPIR_Allreduce_reduce_p2p_MV2},
871           },
872       },
873       {
874           64,
875           0,
876           {1, 1, 0},
877           3,
878           {
879               {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2},
880               {512, 16384, &MPIR_Allreduce_pt2pt_rd_MV2},
881               {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
882           },
883           2,
884           {
885               {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
886               {512, 16384, &MPIR_Allreduce_reduce_p2p_MV2},
887           },
888       },
889       {
890           128,
891           0,
892           {1, 1, 0},
893           3,
894           {
895               {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2},
896               {512, 16384, &MPIR_Allreduce_pt2pt_rd_MV2},
897               {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
898           },
899           2,
900           {
901               {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
902               {512, 16384, &MPIR_Allreduce_reduce_p2p_MV2},
903           },
904       },
905       {
906           256,
907           0,
908           {1, 1, 0},
909           3,
910           {
911               {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2},
912               {512, 16384, &MPIR_Allreduce_pt2pt_rd_MV2},
913               {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
914           },
915           2,
916           {
917               {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
918               {512, -1, &MPIR_Allreduce_reduce_p2p_MV2},
919           },
920       },
921       {
922           512,
923           0,
924           {1, 1, 0},
925           3,
926           {
927               {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2},
928               {512, 16384, &MPIR_Allreduce_pt2pt_rd_MV2},
929               {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
930           },
931           2,
932           {
933               {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
934               {512, 16384, &MPIR_Allreduce_reduce_p2p_MV2},
935           },
936       },
937       {
938           1024,
939           0,
940           {1, 1, 1, 0},
941           4,
942           {
943               {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2},
944               {512, 8192, &MPIR_Allreduce_pt2pt_rd_MV2},
945               {8192, 65536, &MPIR_Allreduce_pt2pt_rs_MV2},
946               {65536, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
947           },
948           2,
949           {
950               {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
951               {512, -1, &MPIR_Allreduce_reduce_p2p_MV2},
952           },
953       },
954       {
955           2048,
956           0,
957           {1, 1, 1, 0},
958           4,
959           {
960               {0, 64, &MPIR_Allreduce_pt2pt_rd_MV2},
961               {64, 512, &MPIR_Allreduce_reduce_p2p_MV2},
962               {512, 4096, &MPIR_Allreduce_mcst_reduce_two_level_helper_MV2},
963               {4096, 16384, &MPIR_Allreduce_pt2pt_rs_MV2},
964               {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
965           },
966           2,
967           {
968               {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
969               {512, -1, &MPIR_Allreduce_reduce_p2p_MV2},
970           },
971       },
972
973   };
  /* Publish the table: copy all eight entries into the heap buffer allocated above. */
974   memcpy(mv2_allreduce_thresholds_table, mv2_tmp_allreduce_thresholds_table,
975       mv2_size_allreduce_tuning_table * sizeof (mv2_allreduce_tuning_table));
976 }
977
978
979
980
/* One row of a broadcast tuning table: a min/max pair, the broadcast routine
 * attached to that pair, and a k-nomial factor for the zero-copy pipelined
 * variant.
 * NOTE(review): min/max look like message-size bounds (the tables in this file
 * end each list with max == -1, presumably "no upper limit") -- confirm against
 * the selector code that reads these rows.
 * Field order matters: rows are written as positional initializers. */
981 typedef struct {
982     int min;
983     int max;
984     int (*MV2_pt_Bcast_function) (void *buf, int count, MPI_Datatype datatype,
985                                   int root, MPI_Comm comm_ptr);
986     int zcpy_pipelined_knomial_factor; /* many rows in the tables of this file carry -1 here */
987 } mv2_bcast_tuning_element;
988
/* Broadcast tuning data keyed by numproc: a segment size, two k-nomial factors,
 * one two-level flag per row, and two row arrays (inter_leader / intra_node)
 * together with the number of rows actually filled in each.
 * Every array is sized MV2_MAX_NB_THRESHOLDS; the size_* fields say how many
 * slots the initializer populated.
 * Field order matters: the tables in this file use positional initializers. */
989 typedef struct {
990     int numproc;
991     int bcast_segment_size;
992     int intra_node_knomial_factor;
993     int inter_node_knomial_factor;
994     int is_two_level_bcast[MV2_MAX_NB_THRESHOLDS];
995     int size_inter_table;
996     mv2_bcast_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
997     int size_intra_table;
998     mv2_bcast_tuning_element intra_node[MV2_MAX_NB_THRESHOLDS];
999 } mv2_bcast_tuning_table;
1000
/* Entry count and heap storage of the broadcast tuning table; both start out
 * empty and are filled in by init_mv2_bcast_tables_stampede(). */
1001 int mv2_size_bcast_tuning_table = 0;
1002 mv2_bcast_tuning_table *mv2_bcast_thresholds_table = NULL;
1003
1004
/* Slot for a broadcast routine; starts as NULL and is not assigned anywhere in
 * this part of the file. */
1005 int (*MV2_Bcast_function) (void *buffer, int count, MPI_Datatype datatype,
1006                            int root, MPI_Comm comm_ptr) = NULL;
1007
/* Slot for an intra-node broadcast routine; same remarks as above. */
1008 int (*MV2_Bcast_intra_node_function) (void *buffer, int count, MPI_Datatype datatype,
1009                                       int root, MPI_Comm comm_ptr) = NULL;
1010
/* Scalar broadcast parameters with their initial values. The 8192 / 4 / 4
 * values coincide with the segment size and k-nomial factors stored in every
 * entry of the Stampede broadcast table in this file. */
1011 int zcpy_knomial_factor = 2;
1012 int mv2_pipelined_zcpy_knomial_factor = -1;
1013 int bcast_segment_size = 8192;
1014 int mv2_inter_node_knomial_factor = 4;
1015 int mv2_intra_node_knomial_factor = 4;
/* Named constants for broadcast tuning (their users are outside this part of
 * the file). */
#define mv2_bcast_two_level_system_size  64
#define mv2_bcast_short_msg             16384
/* The product is parenthesized so the macro behaves as a single operand:
 * without the parentheses, `x / mv2_bcast_large_msg` would expand to
 * `x / 512 * 1024` and `x % mv2_bcast_large_msg` to `x % 512 * 1024`. */
#define mv2_bcast_large_msg            (512*1024)
1019
/* Value 0, named for use as an intra-node root (users are outside this chunk). */
1020 #define INTRA_NODE_ROOT 0
1021
/* Aliases mapping MVAPICH2-style broadcast routine names onto SMPI
 * implementations. Several aliases share one target: the two pipelined names
 * and the shmem name all resolve to smpi_coll_tuned_bcast_mpich, both
 * scatter_ring_allgather names resolve to the same LR_allgather routine, and
 * both inter-node helper names resolve to the same mvapich2_inter_node routine. */
1022 #define MPIR_Pipelined_Bcast_Zcpy_MV2 smpi_coll_tuned_bcast_mpich
1023 #define MPIR_Pipelined_Bcast_MV2 smpi_coll_tuned_bcast_mpich
1024 #define MPIR_Bcast_binomial_MV2 smpi_coll_tuned_bcast_binomial_tree
1025 #define MPIR_Bcast_scatter_ring_allgather_shm_MV2 smpi_coll_tuned_bcast_scatter_LR_allgather
1026 #define MPIR_Bcast_scatter_doubling_allgather_MV2 smpi_coll_tuned_bcast_scatter_rdb_allgather
1027 #define MPIR_Bcast_scatter_ring_allgather_MV2 smpi_coll_tuned_bcast_scatter_LR_allgather
1028 #define MPIR_Shmem_Bcast_MV2 smpi_coll_tuned_bcast_mpich
1029 #define MPIR_Bcast_tune_inter_node_helper_MV2 smpi_coll_tuned_bcast_mvapich2_inter_node
1030 #define MPIR_Bcast_inter_node_helper_MV2 smpi_coll_tuned_bcast_mvapich2_inter_node
1031 #define MPIR_Knomial_Bcast_intra_node_MV2 smpi_coll_tuned_bcast_mvapich2_knomial_intra_node
1032 #define MPIR_Bcast_intra_MV2 smpi_coll_tuned_bcast_mvapich2_intra_node
1033
/* Build the Stampede broadcast tuning table.
 *
 * Steps, in order:
 *   1. register smpi_coll_cleanup_mvapich2 as cleanup callback if none is set;
 *   2. set mv2_size_bcast_tuning_table to 8 and allocate that many entries
 *      with xbt_malloc into mv2_bcast_thresholds_table;
 *   3. describe the entries in a local array (one per numproc value:
 *      16, 32, 64, 128, 256, 512, 1024, 2048);
 *   4. memcpy the local array into the allocated storage.
 *
 * The literal 8 and the number of entries in the local array must stay in
 * sync: the memcpy copies mv2_size_bcast_tuning_table elements out of the
 * local array. A previously stored table pointer is overwritten without
 * being released here.
 * Takes no arguments and returns nothing. */
1034 static void init_mv2_bcast_tables_stampede(){
1035  //Stampede,
1036   if(smpi_coll_cleanup_callback==NULL)
1037     smpi_coll_cleanup_callback=&smpi_coll_cleanup_mvapich2;
1038   mv2_size_bcast_tuning_table=8; /* must equal the number of entries listed below */
1039   mv2_bcast_thresholds_table = static_cast<mv2_bcast_tuning_table*>(xbt_malloc(mv2_size_bcast_tuning_table *
1040   sizeof (mv2_bcast_tuning_table)));
1041
1042   mv2_bcast_tuning_table mv2_tmp_bcast_thresholds_table[]={
1043     {
1044             16, /* numproc */
1045             8192, 4, 4, /* bcast_segment_size, intra_node_knomial_factor, inter_node_knomial_factor */
1046             {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}, /* is_two_level_bcast, one flag per row */
1047             11, /* size_inter_table */
1048             { /* inter_leader rows: {min, max, routine, zcpy_pipelined_knomial_factor} */
1049               {0, 8, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1050               {8, 16, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1051               {16, 1024, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1052               {1024, 8192, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1053               {8192, 16384, &MPIR_Bcast_binomial_MV2, -1},
1054               {16384, 32768, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1055               {32768, 65536, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1056               {65536, 131072, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1},
1057               {131072, 262144, &MPIR_Bcast_scatter_ring_allgather_MV2, -1},
1058               {262144, 524288, &MPIR_Bcast_scatter_doubling_allgather_MV2, -1},
1059               {524288, -1, &MPIR_Bcast_scatter_ring_allgather_MV2, -1}
1060             },
1061             11, /* size_intra_table */
1062             { /* intra_node rows, same layout as above */
1063               {0, 8, &MPIR_Shmem_Bcast_MV2, 2},
1064               {8, 16, &MPIR_Shmem_Bcast_MV2, 4},
1065               {16, 1024, &MPIR_Shmem_Bcast_MV2, 2},
1066               {1024, 8192, &MPIR_Shmem_Bcast_MV2, 4},
1067               {8192, 16384, &MPIR_Shmem_Bcast_MV2, -1},
1068               {16384, 32768, &MPIR_Shmem_Bcast_MV2, 4},
1069               {32768, 65536, &MPIR_Shmem_Bcast_MV2, 2},
1070               {65536, 131072, &MPIR_Shmem_Bcast_MV2, -1},
1071               {131072, 262144, &MPIR_Shmem_Bcast_MV2, -1},
1072               {262144, 524288, &MPIR_Shmem_Bcast_MV2, -1},
1073               {524288, -1, &MPIR_Shmem_Bcast_MV2, -1}
1074             }
1075     },
1076     {
1077             32,
1078             8192, 4, 4,
1079             {1, 1, 1, 1, 1, 1, 1, 1},
1080             8,
1081             {
1082               {0, 128, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1083               {128, 256, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1084               {256, 32768, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1085               {32768, 65536, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1086               {65536, 131072, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1087               {131072, 262144, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1088               {262144, 524288, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1089               {524288, -1, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8}
1090             },
1091             8,
1092             {
1093               {0, 128, &MPIR_Shmem_Bcast_MV2, 2},
1094               {128, 256, &MPIR_Shmem_Bcast_MV2, 4},
1095               {256, 32768, &MPIR_Shmem_Bcast_MV2, 2},
1096               {32768, 65536, &MPIR_Shmem_Bcast_MV2, 4},
1097               {65536, 131072, &MPIR_Shmem_Bcast_MV2, 2},
1098               {131072, 262144, &MPIR_Shmem_Bcast_MV2, 8},
1099               {262144, 524288, &MPIR_Shmem_Bcast_MV2, 2},
1100               {524288, -1, &MPIR_Shmem_Bcast_MV2, 8}
1101             }
1102     },
1103     {
1104             64,
1105             8192, 4, 4,
1106             {1, 1, 1, 1, 1, 1, 1, 1, 1},
1107             9,
1108             {
1109               {0, 2, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1110               {2, 4, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1111               {4, 16, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1112               {16, 32, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1113               {32, 128, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1114               {128, 256, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1115               {256, 4096, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1116               {4096, 32768, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1117               {32768, -1, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2}
1118             },
1119             9,
1120             {
1121               {0, 2, &MPIR_Shmem_Bcast_MV2, 4},
1122               {2, 4, &MPIR_Shmem_Bcast_MV2, 8},
1123               {4, 16, &MPIR_Shmem_Bcast_MV2, 4},
1124               {16, 32, &MPIR_Shmem_Bcast_MV2, 8},
1125               {32, 128, &MPIR_Shmem_Bcast_MV2, 4},
1126               {128, 256, &MPIR_Shmem_Bcast_MV2, 8},
1127               {256, 4096, &MPIR_Shmem_Bcast_MV2, 4},
1128               {4096, 32768, &MPIR_Shmem_Bcast_MV2, 8},
1129               {32768, -1, &MPIR_Shmem_Bcast_MV2, 2}
1130             }
1131     },
1132     {
1133             128,
1134             8192, 4, 4,
1135             {1, 1, 1, 0},
1136             4,
1137             {
1138               {0, 8192, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1139               {8192, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1140               {16384, 524288, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1141               {524288, -1, &MPIR_Bcast_scatter_ring_allgather_MV2, -1}
1142             },
1143             4,
1144             {
1145               {0, 8192, &MPIR_Shmem_Bcast_MV2, 8},
1146               {8192, 16384, &MPIR_Shmem_Bcast_MV2, 4},
1147               {16384, 524288, &MPIR_Shmem_Bcast_MV2, 2},
1148               {524288, -1, NULL, -1} /* no intra-node routine; the matching two-level flag above is 0 */
1149             }
1150     },
1151     {
1152             256,
1153             8192, 4, 4,
1154             {1, 1, 1, 1, 1},
1155             5,
1156             {
1157               {0, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1158               {16384, 131072, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1159               {131072, 262144, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1},
1160               {262144, 524288, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1161               {524288, -1, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1}
1162             },
1163             5,
1164             {
1165               {0, 16384, &MPIR_Shmem_Bcast_MV2, 4},
1166               {16384, 131072, &MPIR_Shmem_Bcast_MV2, 2},
1167               {131072, 262144, &MPIR_Shmem_Bcast_MV2, -1},
1168               {262144, 524288, &MPIR_Shmem_Bcast_MV2, 2},
1169               {524288, -1, &MPIR_Shmem_Bcast_MV2, -1}
1170             }
1171     },
1172     {
1173             512,
1174             8192, 4, 4,
1175             {1, 1, 1, 1, 1},
1176             5,
1177             {
1178               {0, 4096, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1179               {4096, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1180               {16384, 131072, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1181               {131072, 262144, &MPIR_Pipelined_Bcast_MV2, -1},
1182               {262144, -1, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1}
1183             },
1184             5,
1185             {
1186               {0, 4096, &MPIR_Shmem_Bcast_MV2, 8},
1187               {4096, 16384, &MPIR_Shmem_Bcast_MV2, 4},
1188               {16384, 131072, &MPIR_Shmem_Bcast_MV2, 2},
1189               {131072, 262144, &MPIR_Shmem_Bcast_MV2, -1},
1190               {262144, -1, &MPIR_Shmem_Bcast_MV2, -1}
1191             }
1192     },
1193     {
1194             1024,
1195             8192, 4, 4,
1196             {1, 1, 1, 1, 1},
1197             5,
1198             {
1199               {0, 8192, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1200               {8192, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1201               {16384, 65536, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1202               {65536, 524288, &MPIR_Pipelined_Bcast_MV2, -1},
1203               {524288, -1, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1}
1204             },
1205             5,
1206             {
1207               {0, 8192, &MPIR_Shmem_Bcast_MV2, 8},
1208               {8192, 16384, &MPIR_Shmem_Bcast_MV2, 4},
1209               {16384, 65536, &MPIR_Shmem_Bcast_MV2, 2},
1210               {65536, 524288, &MPIR_Shmem_Bcast_MV2, -1},
1211               {524288, -1, &MPIR_Shmem_Bcast_MV2, -1}
1212             }
1213     },
1214     {
1215             2048,
1216             8192, 4, 4,
1217             {1, 1, 1, 1, 1, 1, 1},
1218             7,
1219             {
1220               {0, 16, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1221               {16, 32, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1222               {32, 4096, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1223               {4096, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1224               {16384, 32768, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1225               {32768, 524288, &MPIR_Pipelined_Bcast_MV2, -1},
1226               {524288, -1, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1}
1227             },
1228             7,
1229             {
1230               {0, 16, &MPIR_Shmem_Bcast_MV2, 8},
1231               {16, 32, &MPIR_Shmem_Bcast_MV2, 4},
1232               {32, 4096, &MPIR_Shmem_Bcast_MV2, 8},
1233               {4096, 16384, &MPIR_Shmem_Bcast_MV2, 4},
1234               {16384, 32768, &MPIR_Shmem_Bcast_MV2, 2},
1235               {32768, 524288, &MPIR_Shmem_Bcast_MV2, -1},
1236               {524288, -1, &MPIR_Shmem_Bcast_MV2, -1}
1237             }
1238     }
1239   };
1240
/* Publish the table: copy all mv2_size_bcast_tuning_table entries to the heap copy. */
1241         memcpy(mv2_bcast_thresholds_table, mv2_tmp_bcast_thresholds_table,
1242                     mv2_size_bcast_tuning_table * sizeof (mv2_bcast_tuning_table));
1243 }
1244
1245
1246 /************ Reduce variables and initializers                        */
1247
/* One row of a reduce tuning table: a min/max pair plus the reduce routine
 * attached to it.
 * NOTE(review): min/max look like message-size bounds (each list in this file
 * ends with max == -1, presumably "no upper limit") -- confirm against the
 * selector that consumes the rows.
 * Field order matters: rows are written as positional initializers. */
1248 typedef struct {
1249   int min;
1250   int max;
1251   int (*MV2_pt_Reduce_function)(void *sendbuf,
1252       void *recvbuf,
1253       int count,
1254       MPI_Datatype datatype,
1255       MPI_Op op,
1256       int root,
1257       MPI_Comm  comm_ptr);
1258 } mv2_reduce_tuning_element;
1259
/* Reduce tuning data keyed by numproc: two k-degree values, one two-level flag
 * per row, and two row arrays (inter_leader / intra_node) with the number of
 * rows actually filled in each. Arrays are sized MV2_MAX_NB_THRESHOLDS.
 * Field order matters: the tables in this file use positional initializers. */
1260 typedef struct {
1261   int numproc;
1262   int inter_k_degree;
1263   int intra_k_degree;
1264   int is_two_level_reduce[MV2_MAX_NB_THRESHOLDS];
1265   int size_inter_table;
1266   mv2_reduce_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
1267   int size_intra_table;
1268   mv2_reduce_tuning_element intra_node[MV2_MAX_NB_THRESHOLDS];
1269 } mv2_reduce_tuning_table;
1270
/* Entry count and heap storage of the reduce tuning table; both start out
 * empty and are filled in by init_mv2_reduce_tables_stampede(). */
1271 int mv2_size_reduce_tuning_table = 0;
1272 mv2_reduce_tuning_table *mv2_reduce_thresholds_table = NULL;
1273
1274
/* K-nomial factors for reduce, initialised to -1; nothing in this part of the
 * file changes them. */
1275 int mv2_reduce_intra_knomial_factor = -1;
1276 int mv2_reduce_inter_knomial_factor = -1;
1277
/* Slot for a reduce routine; starts as NULL and is not assigned anywhere in
 * this part of the file. */
1278 int (*MV2_Reduce_function)( void *sendbuf,
1279     void *recvbuf,
1280     int count,
1281     MPI_Datatype datatype,
1282     MPI_Op op,
1283     int root,
1284     MPI_Comm  comm_ptr)=NULL;
1285
/* Slot for an intra-node reduce routine; same signature and same remarks. */
1286 int (*MV2_Reduce_intra_function)( void *sendbuf,
1287     void *recvbuf,
1288     int count,
1289     MPI_Datatype datatype,
1290     MPI_Op op,
1291     int root,
1292     MPI_Comm  comm_ptr)=NULL;
1293
1294
/* Aliases mapping MVAPICH2-style reduce routine names onto SMPI
 * implementations. The inter and intra knomial wrapper names resolve to the
 * same routine (smpi_coll_tuned_reduce_mvapich2_knomial). */
1295 #define MPIR_Reduce_inter_knomial_wrapper_MV2 smpi_coll_tuned_reduce_mvapich2_knomial
1296 #define MPIR_Reduce_intra_knomial_wrapper_MV2 smpi_coll_tuned_reduce_mvapich2_knomial
1297 #define MPIR_Reduce_binomial_MV2 smpi_coll_tuned_reduce_binomial
1298 #define MPIR_Reduce_redscat_gather_MV2 smpi_coll_tuned_reduce_scatter_gather
1299 #define MPIR_Reduce_shmem_MV2 smpi_coll_tuned_reduce_ompi_basic_linear
1300 #define MPIR_Reduce_two_level_helper_MV2 smpi_coll_tuned_reduce_mvapich2_two_level
1301
1302
/* Build the Stampede reduce tuning table.
 *
 * Steps, in order:
 *   1. register smpi_coll_cleanup_mvapich2 as cleanup callback if none is set;
 *   2. set mv2_size_reduce_tuning_table to 8 and allocate that many entries
 *      with xbt_malloc into mv2_reduce_thresholds_table;
 *   3. describe the entries in a local array (one per numproc value:
 *      16, 32, 64, 128, 256, 512, 1024, 2048);
 *   4. memcpy the local array into the allocated storage.
 *
 * The literal 8 and the number of entries in the local array must stay in
 * sync: the memcpy copies mv2_size_reduce_tuning_table elements out of the
 * local array. A previously stored table pointer is overwritten without
 * being released here.
 * Takes no arguments and returns nothing. */
1303 static void init_mv2_reduce_tables_stampede(){
1304   if(smpi_coll_cleanup_callback==NULL)
1305     smpi_coll_cleanup_callback=&smpi_coll_cleanup_mvapich2;
1306   /*Stampede*/
1307   mv2_size_reduce_tuning_table = 8; /* must equal the number of entries listed below */
1308   mv2_reduce_thresholds_table = static_cast<mv2_reduce_tuning_table*>(xbt_malloc(mv2_size_reduce_tuning_table *
1309       sizeof (mv2_reduce_tuning_table)));
1310   mv2_reduce_tuning_table mv2_tmp_reduce_thresholds_table[] = {
1311       {
1312           16, /* numproc */
1313           4, /* inter_k_degree */
1314           4, /* intra_k_degree */
1315           {1, 0, 0}, /* is_two_level_reduce, one flag per row */
1316           3, /* size_inter_table */
1317           { /* inter_leader rows: {min, max, routine} */
1318               {0, 262144, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1319               {262144, 1048576, &MPIR_Reduce_binomial_MV2},
1320               {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
1321           },
1322           2, /* size_intra_table */
1323           { /* intra_node rows, same layout as above */
1324               {0, 65536, &MPIR_Reduce_shmem_MV2},
1325               {65536,-1,  &MPIR_Reduce_binomial_MV2},
1326           },
1327       },
1328       {
1329           32,
1330           4,
1331           4,
1332           {1, 1, 1, 1, 0, 0, 0},
1333           7,
1334           {
1335               {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1336               {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1337               {16384, 32768, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1338               {32768, 65536, &MPIR_Reduce_binomial_MV2},
1339               {65536, 262144, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1340               {262144, 1048576, &MPIR_Reduce_binomial_MV2},
1341               {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
1342           },
1343           6,
1344           {
1345               {0, 8192, &MPIR_Reduce_shmem_MV2},
1346               {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1347               {16384, 32768, &MPIR_Reduce_shmem_MV2},
1348               {32768, 65536, &MPIR_Reduce_shmem_MV2},
1349               {65536, 262144, &MPIR_Reduce_shmem_MV2},
1350               {262144,-1,  &MPIR_Reduce_binomial_MV2},
1351           },
1352       },
1353       {
1354           64,
1355           4,
1356           4,
1357           {1, 1, 1, 1, 0},
1358           5,
1359           {
1360               {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1361               {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1362               {16384, 65536, &MPIR_Reduce_binomial_MV2},
1363               {65536, 262144, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1364               {262144, -1, &MPIR_Reduce_redscat_gather_MV2},
1365           },
1366           5,
1367           {
1368               {0, 8192, &MPIR_Reduce_shmem_MV2},
1369               {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1370               {16384, 65536, &MPIR_Reduce_shmem_MV2},
1371               {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1372               {262144, -1, &MPIR_Reduce_binomial_MV2},
1373           },
1374       },
1375       {
1376           128,
1377           4,
1378           4,
1379           {1, 0, 1, 0, 1, 0},
1380           6,
1381           {
1382               {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1383               {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1384               {16384, 65536, &MPIR_Reduce_binomial_MV2},
1385               {65536, 262144, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1386               {262144, 1048576, &MPIR_Reduce_binomial_MV2},
1387               {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
1388           },
1389           5,
1390           {
1391               {0, 8192, &MPIR_Reduce_shmem_MV2},
1392               {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1393               {16384, 65536, &MPIR_Reduce_shmem_MV2},
1394               {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1395               {262144, -1, &MPIR_Reduce_binomial_MV2},
1396           },
1397       },
1398       {
1399           256,
1400           4,
1401           4,
1402           {1, 1, 1, 0, 1, 1, 0},
1403           7,
1404           {
1405               {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1406               {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1407               {16384, 32768, &MPIR_Reduce_binomial_MV2},
1408               {32768, 65536, &MPIR_Reduce_binomial_MV2},
1409               {65536, 262144, &MPIR_Reduce_binomial_MV2},
1410               {262144, 1048576, &MPIR_Reduce_binomial_MV2},
1411               {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
1412           },
1413           6,
1414           {
1415               {0, 8192, &MPIR_Reduce_shmem_MV2},
1416               {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1417               {16384, 32768, &MPIR_Reduce_shmem_MV2},
1418               {32768, 65536, &MPIR_Reduce_shmem_MV2},
1419               {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1420               {262144, -1, &MPIR_Reduce_binomial_MV2},
1421           },
1422       },
1423       {
1424           512,
1425           4,
1426           4,
1427           {1, 0, 1, 1, 1, 0},
1428           6,
1429           {
1430               {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1431               {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1432               {16384, 65536, &MPIR_Reduce_binomial_MV2},
1433               {65536, 262144, &MPIR_Reduce_binomial_MV2},
1434               {262144, 1048576, &MPIR_Reduce_binomial_MV2},
1435               {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
1436           },
1437           5,
1438           {
1439               {0, 8192, &MPIR_Reduce_shmem_MV2},
1440               {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1441               {16384, 65536, &MPIR_Reduce_shmem_MV2},
1442               {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1443               {262144, -1, &MPIR_Reduce_binomial_MV2},
1444           },
1445       },
1446       {
1447           1024,
1448           4,
1449           4,
1450           {1, 0, 1, 1, 1},
1451           5,
1452           {
1453               {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1454               {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1455               {16384, 65536, &MPIR_Reduce_binomial_MV2},
1456               {65536, 262144, &MPIR_Reduce_binomial_MV2},
1457               {262144, -1, &MPIR_Reduce_binomial_MV2},
1458           },
1459           5,
1460           {
1461               {0, 8192, &MPIR_Reduce_shmem_MV2},
1462               {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1463               {16384, 65536, &MPIR_Reduce_shmem_MV2},
1464               {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1465               {262144, -1, &MPIR_Reduce_binomial_MV2},
1466           },
1467       },
1468       {
1469           2048,
1470           4,
1471           4,
1472           {1, 0, 1, 1, 1,1},
1473           6,
1474           {
1475               {0, 2048, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1476               {2048, 4096, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1477               {4096, 16384, &MPIR_Reduce_binomial_MV2},
1478               {16384, 65536, &MPIR_Reduce_binomial_MV2},
1479               {65536, 131072, &MPIR_Reduce_binomial_MV2},
1480               {131072, -1, &MPIR_Reduce_binomial_MV2},
1481           },
1482           6,
1483           {
1484               {0, 2048, &MPIR_Reduce_shmem_MV2},
1485               {2048, 4096, &MPIR_Reduce_shmem_MV2},
1486               {4096, 16384, &MPIR_Reduce_shmem_MV2},
1487               {16384, 65536, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1488               {65536, 131072, &MPIR_Reduce_binomial_MV2},
1489               {131072, -1, &MPIR_Reduce_shmem_MV2},
1490           },
1491       },
1492
1493   };
/* Publish the table: copy all mv2_size_reduce_tuning_table entries to the heap copy. */
1494   memcpy(mv2_reduce_thresholds_table, mv2_tmp_reduce_thresholds_table,
1495       mv2_size_reduce_tuning_table * sizeof (mv2_reduce_tuning_table));
1496 }
1497
1498 /************ Reduce scatter variables and initializers                        */
1499
/* One row of a reduce-scatter tuning table: a min/max pair plus the
 * reduce-scatter routine attached to it.
 * NOTE(review): min/max look like message-size bounds (each list in this file
 * ends with max == -1, presumably "no upper limit") -- confirm against the
 * selector that consumes the rows.
 * Field order matters: rows are written as positional initializers. */
1500 typedef struct {
1501   int min;
1502   int max;
1503   int (*MV2_pt_Red_scat_function)(void *sendbuf,
1504       void *recvbuf,
1505       int *recvcnts,
1506       MPI_Datatype datatype,
1507       MPI_Op op,
1508       MPI_Comm comm_ptr);
1509 } mv2_red_scat_tuning_element;
1510
/* Reduce-scatter tuning data keyed by numproc: a single row array
 * (inter_leader, sized MV2_MAX_NB_THRESHOLDS) and the number of rows filled.
 * Field order matters: the tables in this file use positional initializers. */
1511 typedef struct {
1512   int numproc;
1513   int size_inter_table;
1514   mv2_red_scat_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
1515 } mv2_red_scat_tuning_table;
1516
/* Entry count and heap storage of the reduce-scatter tuning table; both start
 * out empty and are filled in by init_mv2_reduce_scatter_tables_stampede(). */
1517 int mv2_size_red_scat_tuning_table = 0;
1518 mv2_red_scat_tuning_table *mv2_red_scat_thresholds_table = NULL;
1519
1520
/* Slot for a reduce-scatter routine. Unlike the other function-pointer slots in
 * this file it has no explicit "= NULL"; as a namespace-scope object it is
 * still zero-initialised. Nothing in this part of the file assigns it. */
1521 int (*MV2_Red_scat_function)(void *sendbuf,
1522     void *recvbuf,
1523     int *recvcnts,
1524     MPI_Datatype datatype,
1525     MPI_Op op,
1526     MPI_Comm comm_ptr);
1527
1528
1529
1530 static  int MPIR_Reduce_Scatter_Basic_MV2(void *sendbuf,
1531     void *recvbuf,
1532     int *recvcnts,
1533     MPI_Datatype datatype,
1534     MPI_Op op,
1535     MPI_Comm comm)
1536 {
1537   smpi_mpi_reduce_scatter(sendbuf,recvbuf,recvcnts,datatype,op,comm);
1538   return MPI_SUCCESS;
1539 }
/* Aliases mapping MVAPICH2-style reduce-scatter routine names onto SMPI
 * implementations. */
1540 #define MPIR_Reduce_scatter_non_comm_MV2 smpi_coll_tuned_reduce_scatter_mpich_noncomm
1541 #define MPIR_Reduce_scatter_Rec_Halving_MV2 smpi_coll_tuned_reduce_scatter_ompi_basic_recursivehalving
1542 #define MPIR_Reduce_scatter_Pair_Wise_MV2 smpi_coll_tuned_reduce_scatter_mpich_pair
1543
1544
1545
1546
/* Build the Stampede reduce-scatter tuning table.
 *
 * Steps, in order:
 *   1. register smpi_coll_cleanup_mvapich2 as cleanup callback if none is set;
 *   2. set mv2_size_red_scat_tuning_table to 6 and allocate that many entries
 *      with xbt_malloc into mv2_red_scat_thresholds_table;
 *   3. describe the entries in a local array (one per numproc value:
 *      16, 32, 64, 128, 256, 512);
 *   4. memcpy the local array into the allocated storage.
 *
 * The literal 6 and the number of entries in the local array must stay in
 * sync: the memcpy copies mv2_size_red_scat_tuning_table elements out of the
 * local array. A previously stored table pointer is overwritten without
 * being released here.
 * Takes no arguments and returns nothing. */
1547 static void init_mv2_reduce_scatter_tables_stampede(){
1548   if(smpi_coll_cleanup_callback==NULL)
1549     smpi_coll_cleanup_callback=&smpi_coll_cleanup_mvapich2;
1550   mv2_size_red_scat_tuning_table = 6; /* must equal the number of entries listed below */
1551   mv2_red_scat_thresholds_table = static_cast<mv2_red_scat_tuning_table*>(xbt_malloc(mv2_size_red_scat_tuning_table *
1552       sizeof (mv2_red_scat_tuning_table)));
1553   mv2_red_scat_tuning_table mv2_tmp_red_scat_thresholds_table[] = {
1554       {
1555           16, /* numproc */
1556           3, /* size_inter_table */
1557           { /* inter_leader rows: {min, max, routine} */
1558               {0, 64, &MPIR_Reduce_Scatter_Basic_MV2},
1559               {64, 65536, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1560               {65536, -1, &MPIR_Reduce_scatter_Pair_Wise_MV2},
1561           },
1562       },
1563       {
1564           32,
1565           3,
1566           {
1567               {0, 64, &MPIR_Reduce_Scatter_Basic_MV2},
1568               {64, 131072, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1569               {131072, -1, &MPIR_Reduce_scatter_Pair_Wise_MV2},
1570           },
1571       },
1572       {
1573           64,
1574           3,
1575           {
1576               {0, 1024, &MPIR_Reduce_Scatter_Basic_MV2},
1577               {1024, 262144, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1578               {262144, -1, &MPIR_Reduce_scatter_Pair_Wise_MV2},
1579           },
1580       },
1581       {
1582           128,
1583           2,
1584           {
1585               {0, 128, &MPIR_Reduce_Scatter_Basic_MV2},
1586               {128, -1, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1587           },
1588       },
1589       {
1590           256,
1591           2,
1592           {
1593               {0, 128, &MPIR_Reduce_Scatter_Basic_MV2},
1594               {128, -1, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1595           },
1596       },
1597       {
1598           512,
1599           2,
1600           {
1601               {0, 256, &MPIR_Reduce_Scatter_Basic_MV2},
1602               {256, -1, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1603           },
1604       },
1605
1606   };
/* Publish the table: copy all mv2_size_red_scat_tuning_table entries to the heap copy. */
1607   memcpy(mv2_red_scat_thresholds_table, mv2_tmp_red_scat_thresholds_table,
1608       mv2_size_red_scat_tuning_table * sizeof (mv2_red_scat_tuning_table));
1609 }
1610
1611 /************ Scatter variables and initializers                        */
1612
/* One row of a scatter tuning table: a min/max pair plus the scatter routine
 * attached to it.
 * NOTE(review): min/max look like message-size bounds (each list in this file
 * ends with max == -1, presumably "no upper limit") -- confirm against the
 * selector that consumes the rows.
 * Field order matters: rows are written as positional initializers. */
1613 typedef struct {
1614   int min;
1615   int max;
1616   int (*MV2_pt_Scatter_function)(void *sendbuf,
1617       int sendcnt,
1618       MPI_Datatype sendtype,
1619       void *recvbuf,
1620       int recvcnt,
1621       MPI_Datatype recvtype,
1622       int root, MPI_Comm comm);
1623 } mv2_scatter_tuning_element;
1624
/* Scatter tuning data keyed by numproc: two row arrays (inter_leader /
 * intra_node, each sized MV2_MAX_NB_THRESHOLDS) with the number of rows
 * actually filled in each.
 * Field order matters: the tables in this file use positional initializers. */
1625 typedef struct {
1626   int numproc;
1627   int size_inter_table;
1628   mv2_scatter_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
1629   int size_intra_table;
1630   mv2_scatter_tuning_element intra_node[MV2_MAX_NB_THRESHOLDS];
1631 } mv2_scatter_tuning_table;
1632
1633
/* Scatter tuning state. mv2_scatter_num_ppn_conf counts configurations
 * (initially 1); the three pointers start as NULL and are allocated with one
 * slot per configuration by init_mv2_scatter_tables_stampede():
 *   - mv2_scatter_table_ppn_conf:    an int per configuration,
 *   - mv2_size_scatter_tuning_table: an int per configuration,
 *   - mv2_scatter_thresholds_table:  a table pointer per configuration. */
1634 int *mv2_scatter_table_ppn_conf = NULL;
1635 int mv2_scatter_num_ppn_conf = 1;
1636 int *mv2_size_scatter_tuning_table = NULL;
1637 mv2_scatter_tuning_table **mv2_scatter_thresholds_table = NULL;
1638
/* Slot for a scatter routine; starts as NULL and is not assigned anywhere in
 * this part of the file. */
1639 int (*MV2_Scatter_function) (void *sendbuf, int sendcount, MPI_Datatype sendtype,
1640     void *recvbuf, int recvcount, MPI_Datatype recvtype,
1641     int root, MPI_Comm comm)=NULL;
1642
/* Slot for an intra-node scatter routine; same signature and same remarks. */
1643 int (*MV2_Scatter_intra_function) (void *sendbuf, int sendcount, MPI_Datatype sendtype,
1644     void *recvbuf, int recvcount, MPI_Datatype recvtype,
1645     int root, MPI_Comm comm)=NULL;
1646 int MPIR_Scatter_mcst_wrap_MV2(void *sendbuf,
1647     int sendcnt,
1648     MPI_Datatype sendtype,
1649     void *recvbuf,
1650     int recvcnt,
1651     MPI_Datatype recvtype,
1652     int root, MPI_Comm comm_ptr);
1653
1654 int MPIR_Scatter_mcst_wrap_MV2(void *sendbuf,
1655     int sendcnt,
1656     MPI_Datatype sendtype,
1657     void *recvbuf,
1658     int recvcnt,
1659     MPI_Datatype recvtype,
1660     int root, MPI_Comm comm_ptr)
1661 {
1662   return 0;
1663 }
1664
/* Aliases mapping MVAPICH2-style scatter routine names onto SMPI
 * implementations. */
1665 #define MPIR_Scatter_MV2_Binomial smpi_coll_tuned_scatter_ompi_binomial
1666 #define MPIR_Scatter_MV2_Direct smpi_coll_tuned_scatter_ompi_basic_linear
1667 #define MPIR_Scatter_MV2_two_level_Binomial smpi_coll_tuned_scatter_mvapich2_two_level_binomial
1668 #define MPIR_Scatter_MV2_two_level_Direct smpi_coll_tuned_scatter_mvapich2_two_level_direct
1669
1670
1671
1672
1673 static void init_mv2_scatter_tables_stampede(){
1674     if(smpi_coll_cleanup_callback==NULL)
1675       smpi_coll_cleanup_callback=&smpi_coll_cleanup_mvapich2;
1676
1677     int agg_table_sum = 0;
1678     int i;
1679     mv2_scatter_tuning_table **table_ptrs = NULL;
1680     mv2_scatter_num_ppn_conf = 3;
1681     mv2_scatter_thresholds_table
1682     = static_cast<mv2_scatter_tuning_table**>(xbt_malloc(sizeof(mv2_scatter_tuning_table *)
1683         * mv2_scatter_num_ppn_conf));
1684     table_ptrs = static_cast<mv2_scatter_tuning_table**>(xbt_malloc(sizeof(mv2_scatter_tuning_table *)
1685         * mv2_scatter_num_ppn_conf));
1686     mv2_size_scatter_tuning_table = static_cast<int*>(xbt_malloc(sizeof(int) *
1687         mv2_scatter_num_ppn_conf));
1688     mv2_scatter_table_ppn_conf
1689     = static_cast<int*>(xbt_malloc(mv2_scatter_num_ppn_conf * sizeof(int)));
1690     mv2_scatter_table_ppn_conf[0] = 1;
1691     mv2_size_scatter_tuning_table[0] = 6;
1692     mv2_scatter_tuning_table mv2_tmp_scatter_thresholds_table_1ppn[] = {
1693         {2,
1694             1,
1695             {
1696                 {0, -1, &MPIR_Scatter_MV2_Binomial},
1697             },
1698             1,
1699             {
1700                 {0, -1, &MPIR_Scatter_MV2_Binomial},
1701             },
1702         },
1703
1704         {4,
1705             1,
1706             {
1707                 {0, -1, &MPIR_Scatter_MV2_Direct},
1708             },
1709             1,
1710             {
1711                 {0, -1, &MPIR_Scatter_MV2_Direct},
1712             },
1713         },
1714
1715         {8,
1716             1,
1717             {
1718                 {0, -1, &MPIR_Scatter_MV2_Direct},
1719             },
1720             1,
1721             {
1722                 {0, -1, &MPIR_Scatter_MV2_Direct},
1723             },
1724         },
1725
1726         {16,
1727             1,
1728             {
1729                 {0, -1, &MPIR_Scatter_MV2_Direct},
1730             },
1731             1,
1732             {
1733                 {0, -1, &MPIR_Scatter_MV2_Direct},
1734             },
1735         },
1736
1737         {32,
1738             1,
1739             {
1740                 {0, -1, &MPIR_Scatter_MV2_Direct},
1741             },
1742             1,
1743             {
1744                 {0, -1, &MPIR_Scatter_MV2_Direct},
1745             },
1746         },
1747
1748         {64,
1749             2,
1750             {
1751                 {0, 32, &MPIR_Scatter_MV2_Binomial},
1752                 {32, -1, &MPIR_Scatter_MV2_Direct},
1753             },
1754             1,
1755             {
1756                 {0, -1, &MPIR_Scatter_MV2_Binomial},
1757             },
1758         },
1759     };
1760     table_ptrs[0] = mv2_tmp_scatter_thresholds_table_1ppn;
1761     mv2_scatter_table_ppn_conf[1] = 2;
1762     mv2_size_scatter_tuning_table[1] = 6;
1763     mv2_scatter_tuning_table mv2_tmp_scatter_thresholds_table_2ppn[] = {
1764         {4,
1765             2,
1766             {
1767                 {0, 4096, &MPIR_Scatter_MV2_Binomial},
1768                 {4096, -1, &MPIR_Scatter_MV2_Direct},
1769             },
1770             1,
1771             {
1772                 {0, -1, &MPIR_Scatter_MV2_Direct},
1773             },
1774         },
1775
1776         {8,
1777             2,
1778             {
1779                 {0, 512, &MPIR_Scatter_MV2_two_level_Direct},
1780                 {512, -1, &MPIR_Scatter_MV2_Direct},
1781             },
1782             1,
1783             {
1784                 {0, -1, &MPIR_Scatter_MV2_Binomial},
1785             },
1786         },
1787
1788         {16,
1789             2,
1790             {
1791                 {0, 2048, &MPIR_Scatter_MV2_two_level_Direct},
1792                 {2048, -1, &MPIR_Scatter_MV2_Direct},
1793             },
1794             1,
1795             {
1796                 {0, -1, &MPIR_Scatter_MV2_Binomial},
1797             },
1798         },
1799
1800         {32,
1801             2,
1802             {
1803                 {0, 2048, &MPIR_Scatter_MV2_two_level_Direct},
1804                 {2048, -1, &MPIR_Scatter_MV2_Direct},
1805             },
1806             1,
1807             {
1808                 {0, -1, &MPIR_Scatter_MV2_Binomial},
1809             },
1810         },
1811
1812         {64,
1813             2,
1814             {
1815                 {0, 8192, &MPIR_Scatter_MV2_two_level_Direct},
1816                 {8192, -1, &MPIR_Scatter_MV2_Direct},
1817             },
1818             1,
1819             {
1820                 {0, -1, &MPIR_Scatter_MV2_Binomial},
1821             },
1822         },
1823
1824         {128,
1825             4,
1826             {
1827                 {0, 16, &MPIR_Scatter_MV2_Binomial},
1828                 {16, 128, &MPIR_Scatter_MV2_two_level_Binomial},
1829                 {128, 16384, &MPIR_Scatter_MV2_two_level_Direct},
1830                 {16384, -1, &MPIR_Scatter_MV2_Direct},
1831             },
1832             1,
1833             {
1834                 {0, 128, &MPIR_Scatter_MV2_Direct},
1835                 {128, -1, &MPIR_Scatter_MV2_Binomial},
1836             },
1837         },
1838     };
1839     table_ptrs[1] = mv2_tmp_scatter_thresholds_table_2ppn;
1840     mv2_scatter_table_ppn_conf[2] = 16;
1841     mv2_size_scatter_tuning_table[2] = 8;
1842     mv2_scatter_tuning_table mv2_tmp_scatter_thresholds_table_16ppn[] = {
1843         {
1844             16,
1845             2,
1846             {
1847                 {0, 256, &MPIR_Scatter_MV2_Binomial},
1848                 {256, -1, &MPIR_Scatter_MV2_Direct},
1849             },
1850             1,
1851             {
1852                 { 0, -1, &MPIR_Scatter_MV2_Direct},
1853             },
1854         },
1855
1856         {
1857             32,
1858             2,
1859             {
1860                 {0, 512, &MPIR_Scatter_MV2_Binomial},
1861                 {512, -1, &MPIR_Scatter_MV2_Direct},
1862             },
1863             1,
1864             {
1865                 { 0, -1, &MPIR_Scatter_MV2_Direct},
1866             },
1867         },
1868
1869         {
1870             64,
1871             2,
1872             {
1873                 {0, 1024, &MPIR_Scatter_MV2_two_level_Direct},
1874                 {1024, -1, &MPIR_Scatter_MV2_Direct},
1875             },
1876             1,
1877             {
1878                 { 0, -1, &MPIR_Scatter_MV2_Direct},
1879             },
1880         },
1881
1882         {
1883             128,
1884             4,
1885             {
1886                 {0, 16, &MPIR_Scatter_mcst_wrap_MV2},
1887                 {0, 16, &MPIR_Scatter_MV2_two_level_Direct},
1888                 {16, 2048, &MPIR_Scatter_MV2_two_level_Direct},
1889                 {2048, -1, &MPIR_Scatter_MV2_Direct},
1890             },
1891             1,
1892             {
1893                 { 0, -1, &MPIR_Scatter_MV2_Direct},
1894             },
1895         },
1896
1897         {
1898             256,
1899             4,
1900             {
1901                 {0, 16, &MPIR_Scatter_mcst_wrap_MV2},
1902                 {0, 16, &MPIR_Scatter_MV2_two_level_Direct},
1903                 {16, 2048, &MPIR_Scatter_MV2_two_level_Direct},
1904                 {2048, -1,  &MPIR_Scatter_MV2_Direct},
1905             },
1906             1,
1907             {
1908                 { 0, -1, &MPIR_Scatter_MV2_Direct},
1909             },
1910         },
1911
1912         {
1913             512,
1914             4,
1915             {
1916                 {0, 16, &MPIR_Scatter_mcst_wrap_MV2},
1917                 {16, 16, &MPIR_Scatter_MV2_two_level_Direct},
1918                 {16, 4096, &MPIR_Scatter_MV2_two_level_Direct},
1919                 {4096, -1, &MPIR_Scatter_MV2_Direct},
1920             },
1921             1,
1922             {
1923                 { 0, -1, &MPIR_Scatter_MV2_Binomial},
1924             },
1925         },
1926         {
1927             1024,
1928             5,
1929             {
1930                 {0, 16, &MPIR_Scatter_mcst_wrap_MV2},
1931                 {0, 16,  &MPIR_Scatter_MV2_Binomial},
1932                 {16, 32, &MPIR_Scatter_MV2_Binomial},
1933                 {32, 4096, &MPIR_Scatter_MV2_two_level_Direct},
1934                 {4096, -1, &MPIR_Scatter_MV2_Direct},
1935             },
1936             1,
1937             {
1938                 { 0, -1, &MPIR_Scatter_MV2_Binomial},
1939             },
1940         },
1941         {
1942             2048,
1943             7,
1944             {
1945                 {0, 16, &MPIR_Scatter_mcst_wrap_MV2},
1946                 {0, 16,  &MPIR_Scatter_MV2_two_level_Binomial},
1947                 {16, 128, &MPIR_Scatter_MV2_two_level_Binomial},
1948                 {128, 1024, &MPIR_Scatter_MV2_two_level_Direct},
1949                 {1024, 16384, &MPIR_Scatter_MV2_two_level_Direct},
1950                 {16384, 65536, &MPIR_Scatter_MV2_Direct},
1951                 {65536, -1, &MPIR_Scatter_MV2_two_level_Direct},
1952             },
1953             6,
1954             {
1955                 {0, 16, &MPIR_Scatter_MV2_Binomial},
1956                 {16, 128, &MPIR_Scatter_MV2_Binomial},
1957                 {128, 1024, &MPIR_Scatter_MV2_Binomial},
1958                 {1024, 16384, &MPIR_Scatter_MV2_Direct},
1959                 {16384, 65536, &MPIR_Scatter_MV2_Direct},
1960                 {65536, -1, &MPIR_Scatter_MV2_Direct},
1961             },
1962         },
1963     };
1964     table_ptrs[2] = mv2_tmp_scatter_thresholds_table_16ppn;
1965     agg_table_sum = 0;
1966     for (i = 0; i < mv2_scatter_num_ppn_conf; i++) {
1967         agg_table_sum += mv2_size_scatter_tuning_table[i];
1968     }
1969     mv2_scatter_thresholds_table[0] =
1970         static_cast<mv2_scatter_tuning_table*>(xbt_malloc(agg_table_sum * sizeof (mv2_scatter_tuning_table)));
1971     memcpy(mv2_scatter_thresholds_table[0], table_ptrs[0],
1972         (sizeof(mv2_scatter_tuning_table)
1973             * mv2_size_scatter_tuning_table[0]));
1974     for (i = 1; i < mv2_scatter_num_ppn_conf; i++) {
1975         mv2_scatter_thresholds_table[i] =
1976             mv2_scatter_thresholds_table[i - 1]
1977                                          + mv2_size_scatter_tuning_table[i - 1];
1978         memcpy(mv2_scatter_thresholds_table[i], table_ptrs[i],
1979             (sizeof(mv2_scatter_tuning_table)
1980                 * mv2_size_scatter_tuning_table[i]));
1981     }
1982     xbt_free(table_ptrs);
1983   
1984 }
1985