Logo AND Algorithmique Numérique Distribuée

Public GIT Repository
e0f2156377caeefdf51273a5e09a8559d264b2a8
[simgrid.git] / src / smpi / colls / smpi_mvapich2_selector_stampede.h
1 /* selector for collective algorithms based on mvapich decision logic, with calibration from Stampede cluster at TACC*/
2 /* This is the tuning used by MVAPICH for Stampede platform based on (MV2_ARCH_INTEL_XEON_E5_2680_16, MV2_HCA_MLX_CX_FDR) */
3
4 /* Copyright (c) 2009-2017. The SimGrid Team. All rights reserved.          */
5
6 /* This program is free software; you can redistribute it and/or modify it
7  * under the terms of the license (GNU LGPL) which comes with this package. */
8
9 /************ Alltoall variables and initializers                        */
10
11 #define MV2_MAX_NB_THRESHOLDS  32
12
13
14 typedef struct {
15   int min;
16   int max;
17   int (*MV2_pt_Alltoall_function) (void *sendbuf, int sendcount, MPI_Datatype sendtype,
18       void *recvbuf, int recvcount, MPI_Datatype recvtype,
19       MPI_Comm comm_ptr );
20 } mv2_alltoall_tuning_element;
21
22 typedef struct {
23   int numproc;
24   int size_table;
25   mv2_alltoall_tuning_element algo_table[MV2_MAX_NB_THRESHOLDS];
26   mv2_alltoall_tuning_element in_place_algo_table[MV2_MAX_NB_THRESHOLDS];
27 } mv2_alltoall_tuning_table;
28
29 int (*MV2_Alltoall_function) (void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype, MPI_Comm comm_ptr)=NULL;
30
31 /* Indicates number of processes per node */
32 int *mv2_alltoall_table_ppn_conf = NULL;
33 /* Indicates total number of configurations */
34 int mv2_alltoall_num_ppn_conf = 1;
35 int *mv2_size_alltoall_tuning_table = NULL;
36 mv2_alltoall_tuning_table **mv2_alltoall_thresholds_table = NULL;
37
38
39 #define MPIR_Alltoall_bruck_MV2 smpi_coll_tuned_alltoall_bruck
40 #define MPIR_Alltoall_RD_MV2 smpi_coll_tuned_alltoall_rdb
41 #define MPIR_Alltoall_Scatter_dest_MV2 smpi_coll_tuned_alltoall_mvapich2_scatter_dest
42 #define MPIR_Alltoall_pairwise_MV2 smpi_coll_tuned_alltoall_pair
43 #define MPIR_Alltoall_inplace_MV2 smpi_coll_tuned_alltoall_ring 
44
45
46 static void init_mv2_alltoall_tables_stampede(){
47   int i;
48   int agg_table_sum = 0;
49   mv2_alltoall_tuning_table **table_ptrs = NULL;
50   mv2_alltoall_num_ppn_conf = 3;
51   if(smpi_coll_cleanup_callback==NULL)
52     smpi_coll_cleanup_callback=&smpi_coll_cleanup_mvapich2;
53   mv2_alltoall_thresholds_table = static_cast<mv2_alltoall_tuning_table**>(xbt_malloc(sizeof(mv2_alltoall_tuning_table *)
54       * mv2_alltoall_num_ppn_conf));
55   table_ptrs = static_cast<mv2_alltoall_tuning_table**>(xbt_malloc(sizeof(mv2_alltoall_tuning_table *)
56       * mv2_alltoall_num_ppn_conf));
57   mv2_size_alltoall_tuning_table = static_cast<int*>(xbt_malloc(sizeof(int) *
58       mv2_alltoall_num_ppn_conf));
59   mv2_alltoall_table_ppn_conf = static_cast<int*>(xbt_malloc(mv2_alltoall_num_ppn_conf * sizeof(int)));
60   mv2_alltoall_table_ppn_conf[0] = 1;
61   mv2_size_alltoall_tuning_table[0] = 6;
62   mv2_alltoall_tuning_table mv2_tmp_alltoall_thresholds_table_1ppn[] = {
63       {2,
64           1,
65           {{0, -1, &MPIR_Alltoall_pairwise_MV2},
66           },
67
68           {{0, -1, &MPIR_Alltoall_inplace_MV2},
69           },
70       },
71
72       {4,
73           2,
74           {{0, 262144, &MPIR_Alltoall_Scatter_dest_MV2},
75               {262144, -1, &MPIR_Alltoall_pairwise_MV2},
76           },
77
78           {{0, -1, &MPIR_Alltoall_inplace_MV2},
79           },
80       },
81
82       {8,
83           2,
84           {{0, 8, &MPIR_Alltoall_RD_MV2},
85               {8, -1, &MPIR_Alltoall_Scatter_dest_MV2},
86           },
87
88           {{0, -1, &MPIR_Alltoall_inplace_MV2},
89           },
90       },
91
92       {16,
93           3,
94           {{0, 64, &MPIR_Alltoall_RD_MV2},
95               {64, 512, &MPIR_Alltoall_bruck_MV2},
96               {512, -1, &MPIR_Alltoall_Scatter_dest_MV2},
97           },
98
99           {{0,-1, &MPIR_Alltoall_inplace_MV2},
100           },
101       },
102
103       {32,
104           3,
105           {{0, 32, &MPIR_Alltoall_RD_MV2},
106               {32, 2048, &MPIR_Alltoall_bruck_MV2},
107               {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
108           },
109
110           {{0, -1, &MPIR_Alltoall_inplace_MV2},
111           },
112       },
113
114       {64,
115           3,
116           {{0, 8, &MPIR_Alltoall_RD_MV2},
117               {8, 1024, &MPIR_Alltoall_bruck_MV2},
118               {1024, -1, &MPIR_Alltoall_Scatter_dest_MV2},
119           },
120
121           {{0, -1, &MPIR_Alltoall_inplace_MV2},
122           },
123       },
124   };
125   table_ptrs[0] = mv2_tmp_alltoall_thresholds_table_1ppn;
126   mv2_alltoall_table_ppn_conf[1] = 2;
127   mv2_size_alltoall_tuning_table[1] = 6;
128   mv2_alltoall_tuning_table mv2_tmp_alltoall_thresholds_table_2ppn[] = {
129       {4,
130           2,
131           {{0, 32, &MPIR_Alltoall_RD_MV2},
132               {32, -1, &MPIR_Alltoall_Scatter_dest_MV2},
133           },
134
135           {{0, -1, &MPIR_Alltoall_inplace_MV2},
136           },
137       },
138
139       {8,
140           2,
141           {{0, 64, &MPIR_Alltoall_RD_MV2},
142               {64, -1, &MPIR_Alltoall_Scatter_dest_MV2},
143           },
144
145           {{0, -1, &MPIR_Alltoall_inplace_MV2},
146           },
147       },
148
149       {16,
150           3,
151           {{0, 64, &MPIR_Alltoall_RD_MV2},
152               {64, 2048, &MPIR_Alltoall_bruck_MV2},
153               {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
154           },
155
156           {{0,-1, &MPIR_Alltoall_inplace_MV2},
157           },
158       },
159
160       {32,
161           3,
162           {{0, 16, &MPIR_Alltoall_RD_MV2},
163               {16, 2048, &MPIR_Alltoall_bruck_MV2},
164               {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
165           },
166
167           {{0, -1, &MPIR_Alltoall_inplace_MV2},
168           },
169       },
170
171       {64,
172           3,
173           {{0, 8, &MPIR_Alltoall_RD_MV2},
174               {8, 1024, &MPIR_Alltoall_bruck_MV2},
175               {1024, -1, &MPIR_Alltoall_Scatter_dest_MV2},
176           },
177
178           {{0, -1, &MPIR_Alltoall_inplace_MV2},
179           },
180       },
181
182       {128,
183           3,
184           {{0, 4, &MPIR_Alltoall_RD_MV2},
185               {4, 2048, &MPIR_Alltoall_bruck_MV2},
186               {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
187           },
188
189           {{0, -1, &MPIR_Alltoall_inplace_MV2},
190           },
191       },
192   };
193   table_ptrs[1] = mv2_tmp_alltoall_thresholds_table_2ppn;
194   mv2_alltoall_table_ppn_conf[2] = 16;
195   mv2_size_alltoall_tuning_table[2] = 7;
196   mv2_alltoall_tuning_table mv2_tmp_alltoall_thresholds_table_16ppn[] = {
197       {16,
198           2,
199           {{0, 2048, &MPIR_Alltoall_bruck_MV2},
200               {2048, -1,  &MPIR_Alltoall_Scatter_dest_MV2},
201           },
202
203           {{32768, -1, &MPIR_Alltoall_inplace_MV2},
204           },
205       },
206
207       {32,
208           2,
209           {{0, 2048, &MPIR_Alltoall_bruck_MV2},
210               {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
211           },
212
213           {{16384, -1, &MPIR_Alltoall_inplace_MV2},
214           },
215       },
216
217       {64,
218           3,
219           {{0, 2048, &MPIR_Alltoall_bruck_MV2},
220               {2048, 16384, &MPIR_Alltoall_Scatter_dest_MV2},
221               {16384, -1, &MPIR_Alltoall_pairwise_MV2},
222           },
223
224           {{32768, 131072, &MPIR_Alltoall_inplace_MV2},
225           },
226       },
227
228       {128,
229           2,
230           {{0, 2048, &MPIR_Alltoall_bruck_MV2},
231               {2048, -1, &MPIR_Alltoall_pairwise_MV2},
232           },
233
234           {{16384,65536, &MPIR_Alltoall_inplace_MV2},
235           },
236       },
237
238       {256,
239           2,
240           {{0, 1024, &MPIR_Alltoall_bruck_MV2},
241               {1024, -1, &MPIR_Alltoall_pairwise_MV2},
242           },
243
244           {{16384, 65536, &MPIR_Alltoall_inplace_MV2},
245           },
246       },
247
248       {512,
249           2,
250           {{0, 1024, &MPIR_Alltoall_bruck_MV2},
251               {1024, -1, &MPIR_Alltoall_pairwise_MV2},
252           },
253
254           {{16384, 65536, &MPIR_Alltoall_inplace_MV2},
255           },
256       },
257       {1024,
258           2,
259           {{0, 1024, &MPIR_Alltoall_bruck_MV2},
260               {1024, -1, &MPIR_Alltoall_pairwise_MV2},
261           },
262
263           {{16384, 65536, &MPIR_Alltoall_inplace_MV2},
264           },
265       },
266
267   };
268   table_ptrs[2] = mv2_tmp_alltoall_thresholds_table_16ppn;
269   agg_table_sum = 0;
270   for (i = 0; i < mv2_alltoall_num_ppn_conf; i++) {
271       agg_table_sum += mv2_size_alltoall_tuning_table[i];
272   }
273   mv2_alltoall_thresholds_table[0] =
274       static_cast<mv2_alltoall_tuning_table*>(xbt_malloc(agg_table_sum * sizeof (mv2_alltoall_tuning_table)));
275   memcpy(mv2_alltoall_thresholds_table[0], table_ptrs[0],
276       (sizeof(mv2_alltoall_tuning_table)
277           * mv2_size_alltoall_tuning_table[0]));
278   for (i = 1; i < mv2_alltoall_num_ppn_conf; i++) {
279       mv2_alltoall_thresholds_table[i] =
280           mv2_alltoall_thresholds_table[i - 1]
281                                         + mv2_size_alltoall_tuning_table[i - 1];
282       memcpy(mv2_alltoall_thresholds_table[i], table_ptrs[i],
283           (sizeof(mv2_alltoall_tuning_table)
284               * mv2_size_alltoall_tuning_table[i]));
285   }
286   xbt_free(table_ptrs);
287
288
289 }
290
291
292 /************ Allgather variables and initializers                        */
293
294 typedef struct {
295   int min;
296   int max;
297   int (*MV2_pt_Allgather_function)(void *sendbuf,
298       int sendcount,
299       MPI_Datatype sendtype,
300       void *recvbuf,
301       int recvcount,
302       MPI_Datatype recvtype, MPI_Comm comm_ptr);
303 } mv2_allgather_tuning_element;
304
305 typedef struct {
306   int numproc;
307   int two_level[MV2_MAX_NB_THRESHOLDS];
308   int size_inter_table;
309   mv2_allgather_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
310 } mv2_allgather_tuning_table;
311
312 int (*MV2_Allgather_function)(void *sendbuf,
313     int sendcount,
314     MPI_Datatype sendtype,
315     void *recvbuf,
316     int recvcount,
317     MPI_Datatype recvtype, MPI_Comm comm);
318
319 int *mv2_allgather_table_ppn_conf = NULL;
320 int mv2_allgather_num_ppn_conf = 1;
321 int *mv2_size_allgather_tuning_table = NULL;
322 mv2_allgather_tuning_table **mv2_allgather_thresholds_table = NULL;
323
324 static int MPIR_Allgather_RD_Allgather_Comm_MV2( void *sendbuf,
325                                  int sendcount,
326                                  MPI_Datatype sendtype,
327                                  void *recvbuf,
328                                  int recvcount,
329                                  MPI_Datatype recvtype, MPI_Comm comm_ptr)
330 {
331     return 0;
332 }
333
/* MVAPICH2 routine names used in the tables, mapped onto the SMPI implementations. */
#define MPIR_Allgather_Bruck_MV2 smpi_coll_tuned_allgather_bruck
#define MPIR_Allgather_RD_MV2    smpi_coll_tuned_allgather_rdb
#define MPIR_Allgather_Ring_MV2  smpi_coll_tuned_allgather_ring
#define MPIR_2lvl_Allgather_MV2  smpi_coll_tuned_allgather_mvapich2_smp
338
339 static void init_mv2_allgather_tables_stampede(){
340   int i;
341   int agg_table_sum = 0;
342
343   if(smpi_coll_cleanup_callback==NULL)
344     smpi_coll_cleanup_callback=&smpi_coll_cleanup_mvapich2;
345   mv2_allgather_tuning_table **table_ptrs = NULL;
346   mv2_allgather_num_ppn_conf = 3;
347   mv2_allgather_thresholds_table
348   = static_cast<mv2_allgather_tuning_table**>(xbt_malloc(sizeof(mv2_allgather_tuning_table *)
349       * mv2_allgather_num_ppn_conf));
350   table_ptrs = static_cast<mv2_allgather_tuning_table**>(xbt_malloc(sizeof(mv2_allgather_tuning_table *)
351       * mv2_allgather_num_ppn_conf));
352   mv2_size_allgather_tuning_table = static_cast<int*>(xbt_malloc(sizeof(int) *
353       mv2_allgather_num_ppn_conf));
354   mv2_allgather_table_ppn_conf
355   = static_cast<int*>(xbt_malloc(mv2_allgather_num_ppn_conf * sizeof(int)));
356   mv2_allgather_table_ppn_conf[0] = 1;
357   mv2_size_allgather_tuning_table[0] = 6;
358   mv2_allgather_tuning_table mv2_tmp_allgather_thresholds_table_1ppn[] = {
359       {
360           2,
361           {0},
362           1,
363           {
364               {0, -1, &MPIR_Allgather_Ring_MV2},
365           },
366       },
367       {
368           4,
369           {0,0},
370           2,
371           {
372               {0, 262144, &MPIR_Allgather_RD_MV2},
373               {262144, -1, &MPIR_Allgather_Ring_MV2},
374           },
375       },
376       {
377           8,
378           {0,0},
379           2,
380           {
381               {0, 131072, &MPIR_Allgather_RD_MV2},
382               {131072, -1, &MPIR_Allgather_Ring_MV2},
383           },
384       },
385       {
386           16,
387           {0,0},
388           2,
389           {
390               {0, 131072, &MPIR_Allgather_RD_MV2},
391               {131072, -1, &MPIR_Allgather_Ring_MV2},
392           },
393       },
394       {
395           32,
396           {0,0},
397           2,
398           {
399               {0, 65536, &MPIR_Allgather_RD_MV2},
400               {65536, -1, &MPIR_Allgather_Ring_MV2},
401           },
402       },
403       {
404           64,
405           {0,0},
406           2,
407           {
408               {0, 32768, &MPIR_Allgather_RD_MV2},
409               {32768, -1, &MPIR_Allgather_Ring_MV2},
410           },
411       },
412   };
413   table_ptrs[0] = mv2_tmp_allgather_thresholds_table_1ppn;
414   mv2_allgather_table_ppn_conf[1] = 2;
415   mv2_size_allgather_tuning_table[1] = 6;
416   mv2_allgather_tuning_table mv2_tmp_allgather_thresholds_table_2ppn[] = {
417       {
418           4,
419           {0,0},
420           2,
421           {
422               {0, 524288, &MPIR_Allgather_RD_MV2},
423               {524288, -1, &MPIR_Allgather_Ring_MV2},
424           },
425       },
426       {
427           8,
428           {0,1,0},
429           2,
430           {
431               {0, 32768, &MPIR_Allgather_RD_MV2},
432               {32768, 524288, &MPIR_Allgather_Ring_MV2},
433               {524288, -1, &MPIR_Allgather_Ring_MV2},
434           },
435       },
436       {
437           16,
438           {0,1,0},
439           2,
440           {
441               {0, 16384, &MPIR_Allgather_RD_MV2},
442               {16384, 524288, &MPIR_Allgather_Ring_MV2},
443               {524288, -1, &MPIR_Allgather_Ring_MV2},
444           },
445       },
446       {
447           32,
448           {1,1,0},
449           2,
450           {
451               {0, 65536, &MPIR_Allgather_RD_MV2},
452               {65536, 524288, &MPIR_Allgather_Ring_MV2},
453               {524288, -1, &MPIR_Allgather_Ring_MV2},
454           },
455       },
456       {
457           64,
458           {1,1,0},
459           2,
460           {
461               {0, 32768, &MPIR_Allgather_RD_MV2},
462               {32768, 524288, &MPIR_Allgather_Ring_MV2},
463               {524288, -1, &MPIR_Allgather_Ring_MV2},
464           },
465       },
466       {
467           128,
468           {1,1,0},
469           2,
470           {
471               {0, 65536, &MPIR_Allgather_RD_MV2},
472               {65536, 524288, &MPIR_Allgather_Ring_MV2},
473               {524288, -1, &MPIR_Allgather_Ring_MV2},
474           },
475       },
476   };
477   table_ptrs[1] = mv2_tmp_allgather_thresholds_table_2ppn;
478   mv2_allgather_table_ppn_conf[2] = 16;
479   mv2_size_allgather_tuning_table[2] = 6;
480   mv2_allgather_tuning_table mv2_tmp_allgather_thresholds_table_16ppn[] = {
481       {
482           16,
483           {0,0},
484           2,
485           {
486               {0, 1024, &MPIR_Allgather_RD_MV2},
487               {1024, -1, &MPIR_Allgather_Ring_MV2},
488           },
489       },
490       {
491           32,
492           {0,0},
493           2,
494           {
495               {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
496               {1024, -1, &MPIR_Allgather_Ring_MV2},
497           },
498       },
499       {
500           64,
501           {0,0},
502           2,
503           {
504               {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
505               {1024, -1, &MPIR_Allgather_Ring_MV2},
506           },
507       },
508       {
509           128,
510           {0,0},
511           2,
512           {
513               {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
514               {1024, -1, &MPIR_Allgather_Ring_MV2},
515           },
516       },
517       {
518           256,
519           {0,0},
520           2,
521           {
522               {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
523               {1024, -1, &MPIR_Allgather_Ring_MV2},
524           },
525       },
526       {
527           512,
528           {0,0},
529           2,
530           {
531               {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
532               {1024, -1, &MPIR_Allgather_Ring_MV2},
533           },
534       },
535
536   };
537   table_ptrs[2] = mv2_tmp_allgather_thresholds_table_16ppn;
538   agg_table_sum = 0;
539   for (i = 0; i < mv2_allgather_num_ppn_conf; i++) {
540       agg_table_sum += mv2_size_allgather_tuning_table[i];
541   }
542   mv2_allgather_thresholds_table[0] =
543       static_cast<mv2_allgather_tuning_table*>(xbt_malloc(agg_table_sum * sizeof (mv2_allgather_tuning_table)));
544   memcpy(mv2_allgather_thresholds_table[0], table_ptrs[0],
545       (sizeof(mv2_allgather_tuning_table)
546           * mv2_size_allgather_tuning_table[0]));
547   for (i = 1; i < mv2_allgather_num_ppn_conf; i++) {
548       mv2_allgather_thresholds_table[i] =
549           mv2_allgather_thresholds_table[i - 1]
550                                          + mv2_size_allgather_tuning_table[i - 1];
551       memcpy(mv2_allgather_thresholds_table[i], table_ptrs[i],
552           (sizeof(mv2_allgather_tuning_table)
553               * mv2_size_allgather_tuning_table[i]));
554   }
555   xbt_free(table_ptrs);
556 }
557
558
559 /************ Gather variables and initializers                        */
560
561 typedef struct {
562   int min;
563   int max;
564   int (*MV2_pt_Gather_function)(void *sendbuf, int sendcnt,
565       MPI_Datatype sendtype, void *recvbuf, int recvcnt,
566       MPI_Datatype recvtype, int root, MPI_Comm  comm_ptr);
567 } mv2_gather_tuning_element;
568
569
570 typedef struct {
571   int numproc;
572   int size_inter_table;
573   mv2_gather_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
574   int size_intra_table;
575   mv2_gather_tuning_element intra_node[MV2_MAX_NB_THRESHOLDS];
576 } mv2_gather_tuning_table;
577
578 int mv2_size_gather_tuning_table=7;
579 mv2_gather_tuning_table * mv2_gather_thresholds_table=NULL; 
580
581 typedef int (*MV2_Gather_function_ptr) (void *sendbuf,
582     int sendcnt,
583     MPI_Datatype sendtype,
584     void *recvbuf,
585     int recvcnt,
586     MPI_Datatype recvtype,
587     int root, MPI_Comm comm);
588
589 MV2_Gather_function_ptr MV2_Gather_inter_leader_function = NULL;
590 MV2_Gather_function_ptr MV2_Gather_intra_node_function = NULL;
591
592
593 #define MPIR_Gather_MV2_Direct smpi_coll_tuned_gather_ompi_basic_linear
594 #define MPIR_Gather_MV2_two_level_Direct smpi_coll_tuned_gather_mvapich2_two_level
595 #define MPIR_Gather_intra smpi_coll_tuned_gather_mpich
596
597
598 static void init_mv2_gather_tables_stampede(){
599
600   if(smpi_coll_cleanup_callback==NULL)
601     smpi_coll_cleanup_callback=&smpi_coll_cleanup_mvapich2;
602   mv2_size_gather_tuning_table=7;
603   mv2_gather_thresholds_table = static_cast<mv2_gather_tuning_table*>(xbt_malloc(mv2_size_gather_tuning_table*
604       sizeof (mv2_gather_tuning_table)));
605   mv2_gather_tuning_table mv2_tmp_gather_thresholds_table[]={
606       {16,
607           2,{{0, 524288, &MPIR_Gather_MV2_Direct},
608               {524288, -1, &MPIR_Gather_intra}},
609               1,{{0, -1, &MPIR_Gather_MV2_Direct}}},
610               {32,
611                   3,{{0, 16384, &MPIR_Gather_MV2_Direct},
612                       {16384, 131072, &MPIR_Gather_intra},
613                       {131072, -1, &MPIR_Gather_MV2_two_level_Direct}},
614                       1,{{0, -1, &MPIR_Gather_intra}}},
615                       {64,
616                           3,{{0, 256, &MPIR_Gather_MV2_two_level_Direct},
617                               {256, 16384, &MPIR_Gather_MV2_Direct},
618                               {256, -1, &MPIR_Gather_MV2_two_level_Direct}},
619                               1,{{0, -1, &MPIR_Gather_intra}}},
620                               {128,
621                                   3,{{0, 512, &MPIR_Gather_MV2_two_level_Direct},
622                                       {512, 16384, &MPIR_Gather_MV2_Direct},
623                                       {16384, -1, &MPIR_Gather_MV2_two_level_Direct}},
624                                       1,{{0, -1, &MPIR_Gather_intra}}},
625                                       {256,
626                                           3,{{0, 512, &MPIR_Gather_MV2_two_level_Direct},
627                                               {512, 16384, &MPIR_Gather_MV2_Direct},
628                                               {16384, -1, &MPIR_Gather_MV2_two_level_Direct}},
629                                               1,{{0, -1, &MPIR_Gather_intra}}},
630                                               {512,
631                                                   3,{{0, 512, &MPIR_Gather_MV2_two_level_Direct},
632                                                       {512, 16384, &MPIR_Gather_MV2_Direct},
633                                                       {8196, -1, &MPIR_Gather_MV2_two_level_Direct}},
634                                                       1,{{0, -1, &MPIR_Gather_intra}}},
635                                                       {1024,
636                                                           3,{{0, 512, &MPIR_Gather_MV2_two_level_Direct},
637                                                               {512, 16384, &MPIR_Gather_MV2_Direct},
638                                                               {8196, -1, &MPIR_Gather_MV2_two_level_Direct}},
639                                                               1,{{0, -1, &MPIR_Gather_intra}}},
640   };
641
642   memcpy(mv2_gather_thresholds_table, mv2_tmp_gather_thresholds_table,
643       mv2_size_gather_tuning_table * sizeof (mv2_gather_tuning_table));
644
645 }
646
647
648 /************ Allgatherv variables and initializers                        */
649
650 typedef struct {
651   int min;
652   int max;
653   int (*MV2_pt_Allgatherv_function)(void *sendbuf,
654       int sendcount,
655       MPI_Datatype sendtype,
656       void *recvbuf,
657       int *recvcounts,
658       int *displs,
659       MPI_Datatype recvtype,
660       MPI_Comm commg);
661 } mv2_allgatherv_tuning_element;
662
663 typedef struct {
664   int numproc;
665   int size_inter_table;
666   mv2_allgatherv_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
667 } mv2_allgatherv_tuning_table;
668
669 int (*MV2_Allgatherv_function)(void *sendbuf,
670     int sendcount,
671     MPI_Datatype sendtype,
672     void *recvbuf,
673     int *recvcounts,
674     int *displs,
675     MPI_Datatype recvtype,
676     MPI_Comm comm);
677
678 int mv2_size_allgatherv_tuning_table = 0;
679 mv2_allgatherv_tuning_table *mv2_allgatherv_thresholds_table = NULL;
680
681 #define MPIR_Allgatherv_Rec_Doubling_MV2 smpi_coll_tuned_allgatherv_mpich_rdb
682 #define MPIR_Allgatherv_Bruck_MV2 smpi_coll_tuned_allgatherv_ompi_bruck
683 #define MPIR_Allgatherv_Ring_MV2 smpi_coll_tuned_allgatherv_mpich_ring
684
685
686 static void init_mv2_allgatherv_tables_stampede(){
687   if(smpi_coll_cleanup_callback==NULL)
688     smpi_coll_cleanup_callback=&smpi_coll_cleanup_mvapich2;
689   mv2_size_allgatherv_tuning_table = 6;
690   mv2_allgatherv_thresholds_table = static_cast<mv2_allgatherv_tuning_table*>(xbt_malloc(mv2_size_allgatherv_tuning_table *
691       sizeof (mv2_allgatherv_tuning_table)));
692   mv2_allgatherv_tuning_table mv2_tmp_allgatherv_thresholds_table[] = {
693       {
694           16,
695           2,
696           {
697               {0, 512, &MPIR_Allgatherv_Rec_Doubling_MV2},
698               {512, -1, &MPIR_Allgatherv_Ring_MV2},
699           },
700       },
701       {
702           32,
703           2,
704           {
705               {0, 512, &MPIR_Allgatherv_Rec_Doubling_MV2},
706               {512, -1, &MPIR_Allgatherv_Ring_MV2},
707           },
708       },
709       {
710           64,
711           2,
712           {
713               {0, 256, &MPIR_Allgatherv_Rec_Doubling_MV2},
714               {256, -1, &MPIR_Allgatherv_Ring_MV2},
715           },
716       },
717       {
718           128,
719           2,
720           {
721               {0, 256, &MPIR_Allgatherv_Rec_Doubling_MV2},
722               {256, -1, &MPIR_Allgatherv_Ring_MV2},
723           },
724       },
725       {
726           256,
727           2,
728           {
729               {0, 256, &MPIR_Allgatherv_Rec_Doubling_MV2},
730               {256, -1, &MPIR_Allgatherv_Ring_MV2},
731           },
732       },
733       {
734           512,
735           2,
736           {
737               {0, 256, &MPIR_Allgatherv_Rec_Doubling_MV2},
738               {256, -1, &MPIR_Allgatherv_Ring_MV2},
739           },
740       },
741
742   };
743   memcpy(mv2_allgatherv_thresholds_table, mv2_tmp_allgatherv_thresholds_table,
744       mv2_size_allgatherv_tuning_table * sizeof (mv2_allgatherv_tuning_table));
745 }
746
747
748 /************ Allreduce variables and initializers                        */
749
750 typedef struct {
751   int min;
752   int max;
753   int (*MV2_pt_Allreduce_function)(void *sendbuf,
754       void *recvbuf,
755       int count,
756       MPI_Datatype datatype,
757       MPI_Op op, MPI_Comm comm);
758 } mv2_allreduce_tuning_element;
759
760 typedef struct {
761   int numproc;
762   int mcast_enabled;
763   int is_two_level_allreduce[MV2_MAX_NB_THRESHOLDS];
764   int size_inter_table;
765   mv2_allreduce_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
766   int size_intra_table;
767   mv2_allreduce_tuning_element intra_node[MV2_MAX_NB_THRESHOLDS];
768 } mv2_allreduce_tuning_table;
769
770
771 int (*MV2_Allreduce_function)(void *sendbuf,
772     void *recvbuf,
773     int count,
774     MPI_Datatype datatype,
775     MPI_Op op, MPI_Comm comm)=NULL;
776
777
778 int (*MV2_Allreduce_intra_function)( void *sendbuf,
779     void *recvbuf,
780     int count,
781     MPI_Datatype datatype,
782     MPI_Op op, MPI_Comm comm)=NULL;
783
784 int mv2_size_allreduce_tuning_table = 0;
785 mv2_allreduce_tuning_table *mv2_allreduce_thresholds_table = NULL;
786
787
788
789
790
791 static int MPIR_Allreduce_mcst_reduce_two_level_helper_MV2( void *sendbuf,
792     void *recvbuf,
793     int count,
794     MPI_Datatype datatype,
795     MPI_Op op, MPI_Comm comm)
796
797   return 0;
798 }
799
800 static  int MPIR_Allreduce_mcst_reduce_redscat_gather_MV2( void *sendbuf,
801     void *recvbuf,
802     int count,
803     MPI_Datatype datatype,
804     MPI_Op op, MPI_Comm  comm)
805 {
806   return 0;
807 }
808
809 static  int MPIR_Allreduce_reduce_p2p_MV2( void *sendbuf,
810     void *recvbuf,
811     int count,
812     MPI_Datatype datatype,
813     MPI_Op op, MPI_Comm  comm)
814 {
815   mpi_coll_reduce_fun(sendbuf,recvbuf,count,datatype,op,0,comm);
816   return MPI_SUCCESS;
817 }
818
819 static  int MPIR_Allreduce_reduce_shmem_MV2( void *sendbuf,
820     void *recvbuf,
821     int count,
822     MPI_Datatype datatype,
823     MPI_Op op, MPI_Comm  comm)
824 {
825   mpi_coll_reduce_fun(sendbuf,recvbuf,count,datatype,op,0,comm);
826   return MPI_SUCCESS;
827 }
828
/* MVAPICH2 routine names used in the table, mapped onto the SMPI implementations. */
#define MPIR_Allreduce_pt2pt_rd_MV2  smpi_coll_tuned_allreduce_rdb
#define MPIR_Allreduce_pt2pt_rs_MV2  smpi_coll_tuned_allreduce_mvapich2_rs
#define MPIR_Allreduce_two_level_MV2 smpi_coll_tuned_allreduce_mvapich2_two_level
832
833
834 static void init_mv2_allreduce_tables_stampede(){
835   if(smpi_coll_cleanup_callback==NULL)
836     smpi_coll_cleanup_callback=&smpi_coll_cleanup_mvapich2;
837   mv2_size_allreduce_tuning_table = 8;
838   mv2_allreduce_thresholds_table = static_cast<mv2_allreduce_tuning_table*>(xbt_malloc(mv2_size_allreduce_tuning_table *
839       sizeof (mv2_allreduce_tuning_table)));
840   mv2_allreduce_tuning_table mv2_tmp_allreduce_thresholds_table[] = {
841       {
842           16,
843           0,
844           {1, 0},
845           2,
846           {
847               {0, 1024, &MPIR_Allreduce_pt2pt_rd_MV2},
848               {1024, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
849           },
850           2,
851           {
852               {0, 1024, &MPIR_Allreduce_reduce_shmem_MV2},
853               {1024, -1, &MPIR_Allreduce_reduce_p2p_MV2},
854           },
855       },
856       {
857           32,
858           0,
859           {1, 1, 0},
860           3,
861           {
862               {0, 1024, &MPIR_Allreduce_pt2pt_rd_MV2},
863               {1024, 16384, &MPIR_Allreduce_pt2pt_rd_MV2},
864               {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
865           },
866           2,
867           {
868               {0, 1024, &MPIR_Allreduce_reduce_shmem_MV2},
869               {1024, 16384, &MPIR_Allreduce_reduce_p2p_MV2},
870           },
871       },
872       {
873           64,
874           0,
875           {1, 1, 0},
876           3,
877           {
878               {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2},
879               {512, 16384, &MPIR_Allreduce_pt2pt_rd_MV2},
880               {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
881           },
882           2,
883           {
884               {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
885               {512, 16384, &MPIR_Allreduce_reduce_p2p_MV2},
886           },
887       },
888       {
889           128,
890           0,
891           {1, 1, 0},
892           3,
893           {
894               {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2},
895               {512, 16384, &MPIR_Allreduce_pt2pt_rd_MV2},
896               {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
897           },
898           2,
899           {
900               {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
901               {512, 16384, &MPIR_Allreduce_reduce_p2p_MV2},
902           },
903       },
904       {
905           256,
906           0,
907           {1, 1, 0},
908           3,
909           {
910               {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2},
911               {512, 16384, &MPIR_Allreduce_pt2pt_rd_MV2},
912               {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
913           },
914           2,
915           {
916               {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
917               {512, -1, &MPIR_Allreduce_reduce_p2p_MV2},
918           },
919       },
920       {
921           512,
922           0,
923           {1, 1, 0},
924           3,
925           {
926               {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2},
927               {512, 16384, &MPIR_Allreduce_pt2pt_rd_MV2},
928               {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
929           },
930           2,
931           {
932               {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
933               {512, 16384, &MPIR_Allreduce_reduce_p2p_MV2},
934           },
935       },
936       {
937           1024,
938           0,
939           {1, 1, 1, 0},
940           4,
941           {
942               {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2},
943               {512, 8192, &MPIR_Allreduce_pt2pt_rd_MV2},
944               {8192, 65536, &MPIR_Allreduce_pt2pt_rs_MV2},
945               {65536, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
946           },
947           2,
948           {
949               {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
950               {512, -1, &MPIR_Allreduce_reduce_p2p_MV2},
951           },
952       },
953       {
954           2048,
955           0,
956           {1, 1, 1, 0},
957           4,
958           {
959               {0, 64, &MPIR_Allreduce_pt2pt_rd_MV2},
960               {64, 512, &MPIR_Allreduce_reduce_p2p_MV2},
961               {512, 4096, &MPIR_Allreduce_mcst_reduce_two_level_helper_MV2},
962               {4096, 16384, &MPIR_Allreduce_pt2pt_rs_MV2},
963               {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
964           },
965           2,
966           {
967               {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
968               {512, -1, &MPIR_Allreduce_reduce_p2p_MV2},
969           },
970       },
971
972   };
973   memcpy(mv2_allreduce_thresholds_table, mv2_tmp_allreduce_thresholds_table,
974       mv2_size_allreduce_tuning_table * sizeof (mv2_allreduce_tuning_table));
975 }
976
977
978
979
980 typedef struct {
981     int min;
982     int max;
983     int (*MV2_pt_Bcast_function) (void *buf, int count, MPI_Datatype datatype,
984                                   int root, MPI_Comm comm_ptr);
985     int zcpy_pipelined_knomial_factor;
986 } mv2_bcast_tuning_element;
987
988 typedef struct {
989     int numproc;
990     int bcast_segment_size;
991     int intra_node_knomial_factor;
992     int inter_node_knomial_factor;
993     int is_two_level_bcast[MV2_MAX_NB_THRESHOLDS];
994     int size_inter_table;
995     mv2_bcast_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
996     int size_intra_table;
997     mv2_bcast_tuning_element intra_node[MV2_MAX_NB_THRESHOLDS];
998 } mv2_bcast_tuning_table;
999
1000 int mv2_size_bcast_tuning_table = 0;
1001 mv2_bcast_tuning_table *mv2_bcast_thresholds_table = NULL;
1002
1003
1004 int (*MV2_Bcast_function) (void *buffer, int count, MPI_Datatype datatype,
1005                            int root, MPI_Comm comm_ptr) = NULL;
1006
1007 int (*MV2_Bcast_intra_node_function) (void *buffer, int count, MPI_Datatype datatype,
1008                                       int root, MPI_Comm comm_ptr) = NULL;
1009
/* Default values of the broadcast tuning knobs. */
int zcpy_knomial_factor = 2;
int mv2_pipelined_zcpy_knomial_factor = -1;
int bcast_segment_size = 8192;
int mv2_inter_node_knomial_factor = 4;
int mv2_intra_node_knomial_factor = 4;

/* Broadcast thresholds. */
#define mv2_bcast_two_level_system_size  64
#define mv2_bcast_short_msg             16384
/* Parenthesized so the macro expands as a single value inside any expression
 * (e.g. `x / mv2_bcast_large_msg` or `x % mv2_bcast_large_msg`). */
#define mv2_bcast_large_msg            (512*1024)

#define INTRA_NODE_ROOT 0

/* MVAPICH2 broadcast routine names mapped onto the SMPI implementations used in their place. */
#define MPIR_Pipelined_Bcast_Zcpy_MV2 smpi_coll_tuned_bcast_mpich
#define MPIR_Pipelined_Bcast_MV2 smpi_coll_tuned_bcast_mpich
#define MPIR_Bcast_binomial_MV2 smpi_coll_tuned_bcast_binomial_tree
#define MPIR_Bcast_scatter_ring_allgather_shm_MV2 smpi_coll_tuned_bcast_scatter_LR_allgather
#define MPIR_Bcast_scatter_doubling_allgather_MV2 smpi_coll_tuned_bcast_scatter_rdb_allgather
#define MPIR_Bcast_scatter_ring_allgather_MV2 smpi_coll_tuned_bcast_scatter_LR_allgather
#define MPIR_Shmem_Bcast_MV2 smpi_coll_tuned_bcast_mpich
#define MPIR_Bcast_tune_inter_node_helper_MV2 smpi_coll_tuned_bcast_mvapich2_inter_node
#define MPIR_Bcast_inter_node_helper_MV2 smpi_coll_tuned_bcast_mvapich2_inter_node
#define MPIR_Knomial_Bcast_intra_node_MV2 smpi_coll_tuned_bcast_mvapich2_knomial_intra_node
#define MPIR_Bcast_intra_MV2 smpi_coll_tuned_bcast_mvapich2_intra_node
1032
1033 static void init_mv2_bcast_tables_stampede(){
1034  //Stampede,
1035   if(smpi_coll_cleanup_callback==NULL)
1036     smpi_coll_cleanup_callback=&smpi_coll_cleanup_mvapich2;
1037   mv2_size_bcast_tuning_table=8;
1038   mv2_bcast_thresholds_table = static_cast<mv2_bcast_tuning_table*>(xbt_malloc(mv2_size_bcast_tuning_table *
1039   sizeof (mv2_bcast_tuning_table)));
1040
1041   mv2_bcast_tuning_table mv2_tmp_bcast_thresholds_table[]={
1042     {
1043             16,
1044             8192, 4, 4,
1045             {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
1046             11,
1047             {
1048               {0, 8, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1049               {8, 16, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1050               {16, 1024, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1051               {1024, 8192, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1052               {8192, 16384, &MPIR_Bcast_binomial_MV2, -1},
1053               {16384, 32768, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1054               {32768, 65536, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1055               {65536, 131072, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1},
1056               {131072, 262144, &MPIR_Bcast_scatter_ring_allgather_MV2, -1},
1057               {262144, 524288, &MPIR_Bcast_scatter_doubling_allgather_MV2, -1},
1058               {524288, -1, &MPIR_Bcast_scatter_ring_allgather_MV2, -1}
1059             },
1060             11,
1061             {
1062               {0, 8, &MPIR_Shmem_Bcast_MV2, 2},
1063               {8, 16, &MPIR_Shmem_Bcast_MV2, 4},
1064               {16, 1024, &MPIR_Shmem_Bcast_MV2, 2},
1065               {1024, 8192, &MPIR_Shmem_Bcast_MV2, 4},
1066               {8192, 16384, &MPIR_Shmem_Bcast_MV2, -1},
1067               {16384, 32768, &MPIR_Shmem_Bcast_MV2, 4},
1068               {32768, 65536, &MPIR_Shmem_Bcast_MV2, 2},
1069               {65536, 131072, &MPIR_Shmem_Bcast_MV2, -1},
1070               {131072, 262144, &MPIR_Shmem_Bcast_MV2, -1},
1071               {262144, 524288, &MPIR_Shmem_Bcast_MV2, -1},
1072               {524288, -1, &MPIR_Shmem_Bcast_MV2, -1}
1073             }
1074     },
1075     {
1076             32,
1077             8192, 4, 4,
1078             {1, 1, 1, 1, 1, 1, 1, 1},
1079             8,
1080             {
1081               {0, 128, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1082               {128, 256, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1083               {256, 32768, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1084               {32768, 65536, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1085               {65536, 131072, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1086               {131072, 262144, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1087               {262144, 524288, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1088               {524288, -1, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8}
1089             },
1090             8,
1091             {
1092               {0, 128, &MPIR_Shmem_Bcast_MV2, 2},
1093               {128, 256, &MPIR_Shmem_Bcast_MV2, 4},
1094               {256, 32768, &MPIR_Shmem_Bcast_MV2, 2},
1095               {32768, 65536, &MPIR_Shmem_Bcast_MV2, 4},
1096               {65536, 131072, &MPIR_Shmem_Bcast_MV2, 2},
1097               {131072, 262144, &MPIR_Shmem_Bcast_MV2, 8},
1098               {262144, 524288, &MPIR_Shmem_Bcast_MV2, 2},
1099               {524288, -1, &MPIR_Shmem_Bcast_MV2, 8}
1100             }
1101     },
1102     {
1103             64,
1104             8192, 4, 4,
1105             {1, 1, 1, 1, 1, 1, 1, 1, 1},
1106             9,
1107             {
1108               {0, 2, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1109               {2, 4, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1110               {4, 16, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1111               {16, 32, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1112               {32, 128, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1113               {128, 256, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1114               {256, 4096, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1115               {4096, 32768, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1116               {32768, -1, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2}
1117             },
1118             9,
1119             {
1120               {0, 2, &MPIR_Shmem_Bcast_MV2, 4},
1121               {2, 4, &MPIR_Shmem_Bcast_MV2, 8},
1122               {4, 16, &MPIR_Shmem_Bcast_MV2, 4},
1123               {16, 32, &MPIR_Shmem_Bcast_MV2, 8},
1124               {32, 128, &MPIR_Shmem_Bcast_MV2, 4},
1125               {128, 256, &MPIR_Shmem_Bcast_MV2, 8},
1126               {256, 4096, &MPIR_Shmem_Bcast_MV2, 4},
1127               {4096, 32768, &MPIR_Shmem_Bcast_MV2, 8},
1128               {32768, -1, &MPIR_Shmem_Bcast_MV2, 2}
1129             }
1130     },
1131     {
1132             128,
1133             8192, 4, 4,
1134             {1, 1, 1, 0},
1135             4,
1136             {
1137               {0, 8192, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1138               {8192, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1139               {16384, 524288, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1140               {524288, -1, &MPIR_Bcast_scatter_ring_allgather_MV2, -1}
1141             },
1142             4,
1143             {
1144               {0, 8192, &MPIR_Shmem_Bcast_MV2, 8},
1145               {8192, 16384, &MPIR_Shmem_Bcast_MV2, 4},
1146               {16384, 524288, &MPIR_Shmem_Bcast_MV2, 2},
1147               {524288, -1, NULL, -1}
1148             }
1149     },
1150     {
1151             256,
1152             8192, 4, 4,
1153             {1, 1, 1, 1, 1},
1154             5,
1155             {
1156               {0, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1157               {16384, 131072, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1158               {131072, 262144, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1},
1159               {262144, 524288, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1160               {524288, -1, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1}
1161             },
1162             5,
1163             {
1164               {0, 16384, &MPIR_Shmem_Bcast_MV2, 4},
1165               {16384, 131072, &MPIR_Shmem_Bcast_MV2, 2},
1166               {131072, 262144, &MPIR_Shmem_Bcast_MV2, -1},
1167               {262144, 524288, &MPIR_Shmem_Bcast_MV2, 2},
1168               {524288, -1, &MPIR_Shmem_Bcast_MV2, -1}
1169             }
1170     },
1171     {
1172             512,
1173             8192, 4, 4,
1174             {1, 1, 1, 1, 1},
1175             5,
1176             {
1177               {0, 4096, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1178               {4096, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1179               {16384, 131072, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1180               {131072, 262144, &MPIR_Pipelined_Bcast_MV2, -1},
1181               {262144, -1, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1}
1182             },
1183             5,
1184             {
1185               {0, 4096, &MPIR_Shmem_Bcast_MV2, 8},
1186               {4096, 16384, &MPIR_Shmem_Bcast_MV2, 4},
1187               {16384, 131072, &MPIR_Shmem_Bcast_MV2, 2},
1188               {131072, 262144, &MPIR_Shmem_Bcast_MV2, -1},
1189               {262144, -1, &MPIR_Shmem_Bcast_MV2, -1}
1190             }
1191     },
1192     {
1193             1024,
1194             8192, 4, 4,
1195             {1, 1, 1, 1, 1},
1196             5,
1197             {
1198               {0, 8192, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1199               {8192, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1200               {16384, 65536, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1201               {65536, 524288, &MPIR_Pipelined_Bcast_MV2, -1},
1202               {524288, -1, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1}
1203             },
1204             5,
1205             {
1206               {0, 8192, &MPIR_Shmem_Bcast_MV2, 8},
1207               {8192, 16384, &MPIR_Shmem_Bcast_MV2, 4},
1208               {16384, 65536, &MPIR_Shmem_Bcast_MV2, 2},
1209               {65536, 524288, &MPIR_Shmem_Bcast_MV2, -1},
1210               {524288, -1, &MPIR_Shmem_Bcast_MV2, -1}
1211             }
1212     },
1213     {
1214             2048,
1215             8192, 4, 4,
1216             {1, 1, 1, 1, 1, 1, 1},
1217             7,
1218             {
1219               {0, 16, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1220               {16, 32, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1221               {32, 4096, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1222               {4096, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1223               {16384, 32768, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1224               {32768, 524288, &MPIR_Pipelined_Bcast_MV2, -1},
1225               {524288, -1, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1}
1226             },
1227             7,
1228             {
1229               {0, 16, &MPIR_Shmem_Bcast_MV2, 8},
1230               {16, 32, &MPIR_Shmem_Bcast_MV2, 4},
1231               {32, 4096, &MPIR_Shmem_Bcast_MV2, 8},
1232               {4096, 16384, &MPIR_Shmem_Bcast_MV2, 4},
1233               {16384, 32768, &MPIR_Shmem_Bcast_MV2, 2},
1234               {32768, 524288, &MPIR_Shmem_Bcast_MV2, -1},
1235               {524288, -1, &MPIR_Shmem_Bcast_MV2, -1}
1236             }
1237     }
1238   };
1239
1240         memcpy(mv2_bcast_thresholds_table, mv2_tmp_bcast_thresholds_table,
1241                     mv2_size_bcast_tuning_table * sizeof (mv2_bcast_tuning_table));
1242 }
1243
1244
1245 /************ Reduce variables and initializers                        */
1246
1247 typedef struct {
1248   int min;
1249   int max;
1250   int (*MV2_pt_Reduce_function)(void *sendbuf,
1251       void *recvbuf,
1252       int count,
1253       MPI_Datatype datatype,
1254       MPI_Op op,
1255       int root,
1256       MPI_Comm  comm_ptr);
1257 } mv2_reduce_tuning_element;
1258
1259 typedef struct {
1260   int numproc;
1261   int inter_k_degree;
1262   int intra_k_degree;
1263   int is_two_level_reduce[MV2_MAX_NB_THRESHOLDS];
1264   int size_inter_table;
1265   mv2_reduce_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
1266   int size_intra_table;
1267   mv2_reduce_tuning_element intra_node[MV2_MAX_NB_THRESHOLDS];
1268 } mv2_reduce_tuning_table;
1269
1270 int mv2_size_reduce_tuning_table = 0;
1271 mv2_reduce_tuning_table *mv2_reduce_thresholds_table = NULL;
1272
1273
1274 int mv2_reduce_intra_knomial_factor = -1;
1275 int mv2_reduce_inter_knomial_factor = -1;
1276
1277 int (*MV2_Reduce_function)( void *sendbuf,
1278     void *recvbuf,
1279     int count,
1280     MPI_Datatype datatype,
1281     MPI_Op op,
1282     int root,
1283     MPI_Comm  comm_ptr)=NULL;
1284
1285 int (*MV2_Reduce_intra_function)( void *sendbuf,
1286     void *recvbuf,
1287     int count,
1288     MPI_Datatype datatype,
1289     MPI_Op op,
1290     int root,
1291     MPI_Comm  comm_ptr)=NULL;
1292
1293
1294 #define MPIR_Reduce_inter_knomial_wrapper_MV2 smpi_coll_tuned_reduce_mvapich2_knomial
1295 #define MPIR_Reduce_intra_knomial_wrapper_MV2 smpi_coll_tuned_reduce_mvapich2_knomial
1296 #define MPIR_Reduce_binomial_MV2 smpi_coll_tuned_reduce_binomial
1297 #define MPIR_Reduce_redscat_gather_MV2 smpi_coll_tuned_reduce_scatter_gather
1298 #define MPIR_Reduce_shmem_MV2 smpi_coll_tuned_reduce_ompi_basic_linear
1299 #define MPIR_Reduce_two_level_helper_MV2 smpi_coll_tuned_reduce_mvapich2_two_level
1300
1301
1302 static void init_mv2_reduce_tables_stampede(){
1303   if(smpi_coll_cleanup_callback==NULL)
1304     smpi_coll_cleanup_callback=&smpi_coll_cleanup_mvapich2;
1305   /*Stampede*/
1306   mv2_size_reduce_tuning_table = 8;
1307   mv2_reduce_thresholds_table = static_cast<mv2_reduce_tuning_table*>(xbt_malloc(mv2_size_reduce_tuning_table *
1308       sizeof (mv2_reduce_tuning_table)));
1309   mv2_reduce_tuning_table mv2_tmp_reduce_thresholds_table[] = {
1310       {
1311           16,
1312           4,
1313           4,
1314           {1, 0, 0},
1315           3,
1316           {
1317               {0, 262144, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1318               {262144, 1048576, &MPIR_Reduce_binomial_MV2},
1319               {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
1320           },
1321           2,
1322           {
1323               {0, 65536, &MPIR_Reduce_shmem_MV2},
1324               {65536,-1,  &MPIR_Reduce_binomial_MV2},
1325           },
1326       },
1327       {
1328           32,
1329           4,
1330           4,
1331           {1, 1, 1, 1, 0, 0, 0},
1332           7,
1333           {
1334               {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1335               {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1336               {16384, 32768, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1337               {32768, 65536, &MPIR_Reduce_binomial_MV2},
1338               {65536, 262144, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1339               {262144, 1048576, &MPIR_Reduce_binomial_MV2},
1340               {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
1341           },
1342           6,
1343           {
1344               {0, 8192, &MPIR_Reduce_shmem_MV2},
1345               {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1346               {16384, 32768, &MPIR_Reduce_shmem_MV2},
1347               {32768, 65536, &MPIR_Reduce_shmem_MV2},
1348               {65536, 262144, &MPIR_Reduce_shmem_MV2},
1349               {262144,-1,  &MPIR_Reduce_binomial_MV2},
1350           },
1351       },
1352       {
1353           64,
1354           4,
1355           4,
1356           {1, 1, 1, 1, 0},
1357           5,
1358           {
1359               {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1360               {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1361               {16384, 65536, &MPIR_Reduce_binomial_MV2},
1362               {65536, 262144, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1363               {262144, -1, &MPIR_Reduce_redscat_gather_MV2},
1364           },
1365           5,
1366           {
1367               {0, 8192, &MPIR_Reduce_shmem_MV2},
1368               {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1369               {16384, 65536, &MPIR_Reduce_shmem_MV2},
1370               {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1371               {262144, -1, &MPIR_Reduce_binomial_MV2},
1372           },
1373       },
1374       {
1375           128,
1376           4,
1377           4,
1378           {1, 0, 1, 0, 1, 0},
1379           6,
1380           {
1381               {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1382               {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1383               {16384, 65536, &MPIR_Reduce_binomial_MV2},
1384               {65536, 262144, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1385               {262144, 1048576, &MPIR_Reduce_binomial_MV2},
1386               {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
1387           },
1388           5,
1389           {
1390               {0, 8192, &MPIR_Reduce_shmem_MV2},
1391               {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1392               {16384, 65536, &MPIR_Reduce_shmem_MV2},
1393               {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1394               {262144, -1, &MPIR_Reduce_binomial_MV2},
1395           },
1396       },
1397       {
1398           256,
1399           4,
1400           4,
1401           {1, 1, 1, 0, 1, 1, 0},
1402           7,
1403           {
1404               {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1405               {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1406               {16384, 32768, &MPIR_Reduce_binomial_MV2},
1407               {32768, 65536, &MPIR_Reduce_binomial_MV2},
1408               {65536, 262144, &MPIR_Reduce_binomial_MV2},
1409               {262144, 1048576, &MPIR_Reduce_binomial_MV2},
1410               {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
1411           },
1412           6,
1413           {
1414               {0, 8192, &MPIR_Reduce_shmem_MV2},
1415               {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1416               {16384, 32768, &MPIR_Reduce_shmem_MV2},
1417               {32768, 65536, &MPIR_Reduce_shmem_MV2},
1418               {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1419               {262144, -1, &MPIR_Reduce_binomial_MV2},
1420           },
1421       },
1422       {
1423           512,
1424           4,
1425           4,
1426           {1, 0, 1, 1, 1, 0},
1427           6,
1428           {
1429               {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1430               {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1431               {16384, 65536, &MPIR_Reduce_binomial_MV2},
1432               {65536, 262144, &MPIR_Reduce_binomial_MV2},
1433               {262144, 1048576, &MPIR_Reduce_binomial_MV2},
1434               {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
1435           },
1436           5,
1437           {
1438               {0, 8192, &MPIR_Reduce_shmem_MV2},
1439               {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1440               {16384, 65536, &MPIR_Reduce_shmem_MV2},
1441               {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1442               {262144, -1, &MPIR_Reduce_binomial_MV2},
1443           },
1444       },
1445       {
1446           1024,
1447           4,
1448           4,
1449           {1, 0, 1, 1, 1},
1450           5,
1451           {
1452               {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1453               {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1454               {16384, 65536, &MPIR_Reduce_binomial_MV2},
1455               {65536, 262144, &MPIR_Reduce_binomial_MV2},
1456               {262144, -1, &MPIR_Reduce_binomial_MV2},
1457           },
1458           5,
1459           {
1460               {0, 8192, &MPIR_Reduce_shmem_MV2},
1461               {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1462               {16384, 65536, &MPIR_Reduce_shmem_MV2},
1463               {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1464               {262144, -1, &MPIR_Reduce_binomial_MV2},
1465           },
1466       },
1467       {
1468           2048,
1469           4,
1470           4,
1471           {1, 0, 1, 1, 1,1},
1472           6,
1473           {
1474               {0, 2048, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1475               {2048, 4096, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1476               {4096, 16384, &MPIR_Reduce_binomial_MV2},
1477               {16384, 65536, &MPIR_Reduce_binomial_MV2},
1478               {65536, 131072, &MPIR_Reduce_binomial_MV2},
1479               {131072, -1, &MPIR_Reduce_binomial_MV2},
1480           },
1481           6,
1482           {
1483               {0, 2048, &MPIR_Reduce_shmem_MV2},
1484               {2048, 4096, &MPIR_Reduce_shmem_MV2},
1485               {4096, 16384, &MPIR_Reduce_shmem_MV2},
1486               {16384, 65536, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1487               {65536, 131072, &MPIR_Reduce_binomial_MV2},
1488               {131072, -1, &MPIR_Reduce_shmem_MV2},
1489           },
1490       },
1491
1492   };
1493   memcpy(mv2_reduce_thresholds_table, mv2_tmp_reduce_thresholds_table,
1494       mv2_size_reduce_tuning_table * sizeof (mv2_reduce_tuning_table));
1495 }
1496
1497 /************ Reduce scatter variables and initializers                        */
1498
1499 typedef struct {
1500   int min;
1501   int max;
1502   int (*MV2_pt_Red_scat_function)(void *sendbuf,
1503       void *recvbuf,
1504       int *recvcnts,
1505       MPI_Datatype datatype,
1506       MPI_Op op,
1507       MPI_Comm comm_ptr);
1508 } mv2_red_scat_tuning_element;
1509
1510 typedef struct {
1511   int numproc;
1512   int size_inter_table;
1513   mv2_red_scat_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
1514 } mv2_red_scat_tuning_table;
1515
1516 int mv2_size_red_scat_tuning_table = 0;
1517 mv2_red_scat_tuning_table *mv2_red_scat_thresholds_table = NULL;
1518
1519
1520 int (*MV2_Red_scat_function)(void *sendbuf,
1521     void *recvbuf,
1522     int *recvcnts,
1523     MPI_Datatype datatype,
1524     MPI_Op op,
1525     MPI_Comm comm_ptr);
1526
1527
1528
1529 static  int MPIR_Reduce_Scatter_Basic_MV2(void *sendbuf,
1530     void *recvbuf,
1531     int *recvcnts,
1532     MPI_Datatype datatype,
1533     MPI_Op op,
1534     MPI_Comm comm)
1535 {
1536   smpi_mpi_reduce_scatter(sendbuf,recvbuf,recvcnts,datatype,op,comm);
1537   return MPI_SUCCESS;
1538 }
/* MVAPICH2 reduce-scatter routine names mapped onto the SMPI implementations used in their place. */
#define MPIR_Reduce_scatter_non_comm_MV2    smpi_coll_tuned_reduce_scatter_mpich_noncomm
#define MPIR_Reduce_scatter_Rec_Halving_MV2 smpi_coll_tuned_reduce_scatter_ompi_basic_recursivehalving
#define MPIR_Reduce_scatter_Pair_Wise_MV2   smpi_coll_tuned_reduce_scatter_mpich_pair
1542
1543
1544
1545
1546 static void init_mv2_reduce_scatter_tables_stampede(){
1547   if(smpi_coll_cleanup_callback==NULL)
1548     smpi_coll_cleanup_callback=&smpi_coll_cleanup_mvapich2;
1549   mv2_size_red_scat_tuning_table = 6;
1550   mv2_red_scat_thresholds_table = static_cast<mv2_red_scat_tuning_table*>(xbt_malloc(mv2_size_red_scat_tuning_table *
1551       sizeof (mv2_red_scat_tuning_table)));
1552   mv2_red_scat_tuning_table mv2_tmp_red_scat_thresholds_table[] = {
1553       {
1554           16,
1555           3,
1556           {
1557               {0, 64, &MPIR_Reduce_Scatter_Basic_MV2},
1558               {64, 65536, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1559               {65536, -1, &MPIR_Reduce_scatter_Pair_Wise_MV2},
1560           },
1561       },
1562       {
1563           32,
1564           3,
1565           {
1566               {0, 64, &MPIR_Reduce_Scatter_Basic_MV2},
1567               {64, 131072, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1568               {131072, -1, &MPIR_Reduce_scatter_Pair_Wise_MV2},
1569           },
1570       },
1571       {
1572           64,
1573           3,
1574           {
1575               {0, 1024, &MPIR_Reduce_Scatter_Basic_MV2},
1576               {1024, 262144, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1577               {262144, -1, &MPIR_Reduce_scatter_Pair_Wise_MV2},
1578           },
1579       },
1580       {
1581           128,
1582           2,
1583           {
1584               {0, 128, &MPIR_Reduce_Scatter_Basic_MV2},
1585               {128, -1, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1586           },
1587       },
1588       {
1589           256,
1590           2,
1591           {
1592               {0, 128, &MPIR_Reduce_Scatter_Basic_MV2},
1593               {128, -1, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1594           },
1595       },
1596       {
1597           512,
1598           2,
1599           {
1600               {0, 256, &MPIR_Reduce_Scatter_Basic_MV2},
1601               {256, -1, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1602           },
1603       },
1604
1605   };
1606   memcpy(mv2_red_scat_thresholds_table, mv2_tmp_red_scat_thresholds_table,
1607       mv2_size_red_scat_tuning_table * sizeof (mv2_red_scat_tuning_table));
1608 }
1609
1610 /************ Scatter variables and initializers                        */
1611
1612 typedef struct {
1613   int min;
1614   int max;
1615   int (*MV2_pt_Scatter_function)(void *sendbuf,
1616       int sendcnt,
1617       MPI_Datatype sendtype,
1618       void *recvbuf,
1619       int recvcnt,
1620       MPI_Datatype recvtype,
1621       int root, MPI_Comm comm);
1622 } mv2_scatter_tuning_element;
1623
1624 typedef struct {
1625   int numproc;
1626   int size_inter_table;
1627   mv2_scatter_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
1628   int size_intra_table;
1629   mv2_scatter_tuning_element intra_node[MV2_MAX_NB_THRESHOLDS];
1630 } mv2_scatter_tuning_table;
1631
1632
1633 int *mv2_scatter_table_ppn_conf = NULL;
1634 int mv2_scatter_num_ppn_conf = 1;
1635 int *mv2_size_scatter_tuning_table = NULL;
1636 mv2_scatter_tuning_table **mv2_scatter_thresholds_table = NULL;
1637
1638 int (*MV2_Scatter_function) (void *sendbuf, int sendcount, MPI_Datatype sendtype,
1639     void *recvbuf, int recvcount, MPI_Datatype recvtype,
1640     int root, MPI_Comm comm)=NULL;
1641
1642 int (*MV2_Scatter_intra_function) (void *sendbuf, int sendcount, MPI_Datatype sendtype,
1643     void *recvbuf, int recvcount, MPI_Datatype recvtype,
1644     int root, MPI_Comm comm)=NULL;
1645 int MPIR_Scatter_mcst_wrap_MV2(void *sendbuf,
1646     int sendcnt,
1647     MPI_Datatype sendtype,
1648     void *recvbuf,
1649     int recvcnt,
1650     MPI_Datatype recvtype,
1651     int root, MPI_Comm comm_ptr);
1652
1653 int MPIR_Scatter_mcst_wrap_MV2(void *sendbuf,
1654     int sendcnt,
1655     MPI_Datatype sendtype,
1656     void *recvbuf,
1657     int recvcnt,
1658     MPI_Datatype recvtype,
1659     int root, MPI_Comm comm_ptr)
1660 {
1661   return 0;
1662 }
1663
/* MVAPICH2 scatter routine names mapped onto the SMPI implementations used in their place. */
#define MPIR_Scatter_MV2_Binomial           smpi_coll_tuned_scatter_ompi_binomial
#define MPIR_Scatter_MV2_Direct             smpi_coll_tuned_scatter_ompi_basic_linear
#define MPIR_Scatter_MV2_two_level_Binomial smpi_coll_tuned_scatter_mvapich2_two_level_binomial
#define MPIR_Scatter_MV2_two_level_Direct   smpi_coll_tuned_scatter_mvapich2_two_level_direct
1668
1669
1670
1671
1672 static void init_mv2_scatter_tables_stampede(){
1673     if(smpi_coll_cleanup_callback==NULL)
1674       smpi_coll_cleanup_callback=&smpi_coll_cleanup_mvapich2;
1675
1676     int agg_table_sum = 0;
1677     int i;
1678     mv2_scatter_tuning_table **table_ptrs = NULL;
1679     mv2_scatter_num_ppn_conf = 3;
1680     mv2_scatter_thresholds_table
1681     = static_cast<mv2_scatter_tuning_table**>(xbt_malloc(sizeof(mv2_scatter_tuning_table *)
1682         * mv2_scatter_num_ppn_conf));
1683     table_ptrs = static_cast<mv2_scatter_tuning_table**>(xbt_malloc(sizeof(mv2_scatter_tuning_table *)
1684         * mv2_scatter_num_ppn_conf));
1685     mv2_size_scatter_tuning_table = static_cast<int*>(xbt_malloc(sizeof(int) *
1686         mv2_scatter_num_ppn_conf));
1687     mv2_scatter_table_ppn_conf
1688     = static_cast<int*>(xbt_malloc(mv2_scatter_num_ppn_conf * sizeof(int)));
1689     mv2_scatter_table_ppn_conf[0] = 1;
1690     mv2_size_scatter_tuning_table[0] = 6;
1691     mv2_scatter_tuning_table mv2_tmp_scatter_thresholds_table_1ppn[] = {
1692         {2,
1693             1,
1694             {
1695                 {0, -1, &MPIR_Scatter_MV2_Binomial},
1696             },
1697             1,
1698             {
1699                 {0, -1, &MPIR_Scatter_MV2_Binomial},
1700             },
1701         },
1702
1703         {4,
1704             1,
1705             {
1706                 {0, -1, &MPIR_Scatter_MV2_Direct},
1707             },
1708             1,
1709             {
1710                 {0, -1, &MPIR_Scatter_MV2_Direct},
1711             },
1712         },
1713
1714         {8,
1715             1,
1716             {
1717                 {0, -1, &MPIR_Scatter_MV2_Direct},
1718             },
1719             1,
1720             {
1721                 {0, -1, &MPIR_Scatter_MV2_Direct},
1722             },
1723         },
1724
1725         {16,
1726             1,
1727             {
1728                 {0, -1, &MPIR_Scatter_MV2_Direct},
1729             },
1730             1,
1731             {
1732                 {0, -1, &MPIR_Scatter_MV2_Direct},
1733             },
1734         },
1735
1736         {32,
1737             1,
1738             {
1739                 {0, -1, &MPIR_Scatter_MV2_Direct},
1740             },
1741             1,
1742             {
1743                 {0, -1, &MPIR_Scatter_MV2_Direct},
1744             },
1745         },
1746
1747         {64,
1748             2,
1749             {
1750                 {0, 32, &MPIR_Scatter_MV2_Binomial},
1751                 {32, -1, &MPIR_Scatter_MV2_Direct},
1752             },
1753             1,
1754             {
1755                 {0, -1, &MPIR_Scatter_MV2_Binomial},
1756             },
1757         },
1758     };
1759     table_ptrs[0] = mv2_tmp_scatter_thresholds_table_1ppn;
1760     mv2_scatter_table_ppn_conf[1] = 2;
1761     mv2_size_scatter_tuning_table[1] = 6;
1762     mv2_scatter_tuning_table mv2_tmp_scatter_thresholds_table_2ppn[] = {
1763         {4,
1764             2,
1765             {
1766                 {0, 4096, &MPIR_Scatter_MV2_Binomial},
1767                 {4096, -1, &MPIR_Scatter_MV2_Direct},
1768             },
1769             1,
1770             {
1771                 {0, -1, &MPIR_Scatter_MV2_Direct},
1772             },
1773         },
1774
1775         {8,
1776             2,
1777             {
1778                 {0, 512, &MPIR_Scatter_MV2_two_level_Direct},
1779                 {512, -1, &MPIR_Scatter_MV2_Direct},
1780             },
1781             1,
1782             {
1783                 {0, -1, &MPIR_Scatter_MV2_Binomial},
1784             },
1785         },
1786
1787         {16,
1788             2,
1789             {
1790                 {0, 2048, &MPIR_Scatter_MV2_two_level_Direct},
1791                 {2048, -1, &MPIR_Scatter_MV2_Direct},
1792             },
1793             1,
1794             {
1795                 {0, -1, &MPIR_Scatter_MV2_Binomial},
1796             },
1797         },
1798
1799         {32,
1800             2,
1801             {
1802                 {0, 2048, &MPIR_Scatter_MV2_two_level_Direct},
1803                 {2048, -1, &MPIR_Scatter_MV2_Direct},
1804             },
1805             1,
1806             {
1807                 {0, -1, &MPIR_Scatter_MV2_Binomial},
1808             },
1809         },
1810
1811         {64,
1812             2,
1813             {
1814                 {0, 8192, &MPIR_Scatter_MV2_two_level_Direct},
1815                 {8192, -1, &MPIR_Scatter_MV2_Direct},
1816             },
1817             1,
1818             {
1819                 {0, -1, &MPIR_Scatter_MV2_Binomial},
1820             },
1821         },
1822
1823         {128,
1824             4,
1825             {
1826                 {0, 16, &MPIR_Scatter_MV2_Binomial},
1827                 {16, 128, &MPIR_Scatter_MV2_two_level_Binomial},
1828                 {128, 16384, &MPIR_Scatter_MV2_two_level_Direct},
1829                 {16384, -1, &MPIR_Scatter_MV2_Direct},
1830             },
1831             1,
1832             {
1833                 {0, 128, &MPIR_Scatter_MV2_Direct},
1834                 {128, -1, &MPIR_Scatter_MV2_Binomial},
1835             },
1836         },
1837     };
1838     table_ptrs[1] = mv2_tmp_scatter_thresholds_table_2ppn;
1839     mv2_scatter_table_ppn_conf[2] = 16;
1840     mv2_size_scatter_tuning_table[2] = 8;
1841     mv2_scatter_tuning_table mv2_tmp_scatter_thresholds_table_16ppn[] = {
1842         {
1843             16,
1844             2,
1845             {
1846                 {0, 256, &MPIR_Scatter_MV2_Binomial},
1847                 {256, -1, &MPIR_Scatter_MV2_Direct},
1848             },
1849             1,
1850             {
1851                 { 0, -1, &MPIR_Scatter_MV2_Direct},
1852             },
1853         },
1854
1855         {
1856             32,
1857             2,
1858             {
1859                 {0, 512, &MPIR_Scatter_MV2_Binomial},
1860                 {512, -1, &MPIR_Scatter_MV2_Direct},
1861             },
1862             1,
1863             {
1864                 { 0, -1, &MPIR_Scatter_MV2_Direct},
1865             },
1866         },
1867
1868         {
1869             64,
1870             2,
1871             {
1872                 {0, 1024, &MPIR_Scatter_MV2_two_level_Direct},
1873                 {1024, -1, &MPIR_Scatter_MV2_Direct},
1874             },
1875             1,
1876             {
1877                 { 0, -1, &MPIR_Scatter_MV2_Direct},
1878             },
1879         },
1880
1881         {
1882             128,
1883             4,
1884             {
1885                 {0, 16, &MPIR_Scatter_mcst_wrap_MV2},
1886                 {0, 16, &MPIR_Scatter_MV2_two_level_Direct},
1887                 {16, 2048, &MPIR_Scatter_MV2_two_level_Direct},
1888                 {2048, -1, &MPIR_Scatter_MV2_Direct},
1889             },
1890             1,
1891             {
1892                 { 0, -1, &MPIR_Scatter_MV2_Direct},
1893             },
1894         },
1895
1896         {
1897             256,
1898             4,
1899             {
1900                 {0, 16, &MPIR_Scatter_mcst_wrap_MV2},
1901                 {0, 16, &MPIR_Scatter_MV2_two_level_Direct},
1902                 {16, 2048, &MPIR_Scatter_MV2_two_level_Direct},
1903                 {2048, -1,  &MPIR_Scatter_MV2_Direct},
1904             },
1905             1,
1906             {
1907                 { 0, -1, &MPIR_Scatter_MV2_Direct},
1908             },
1909         },
1910
1911         {
1912             512,
1913             4,
1914             {
1915                 {0, 16, &MPIR_Scatter_mcst_wrap_MV2},
1916                 {16, 16, &MPIR_Scatter_MV2_two_level_Direct},
1917                 {16, 4096, &MPIR_Scatter_MV2_two_level_Direct},
1918                 {4096, -1, &MPIR_Scatter_MV2_Direct},
1919             },
1920             1,
1921             {
1922                 { 0, -1, &MPIR_Scatter_MV2_Binomial},
1923             },
1924         },
1925         {
1926             1024,
1927             5,
1928             {
1929                 {0, 16, &MPIR_Scatter_mcst_wrap_MV2},
1930                 {0, 16,  &MPIR_Scatter_MV2_Binomial},
1931                 {16, 32, &MPIR_Scatter_MV2_Binomial},
1932                 {32, 4096, &MPIR_Scatter_MV2_two_level_Direct},
1933                 {4096, -1, &MPIR_Scatter_MV2_Direct},
1934             },
1935             1,
1936             {
1937                 { 0, -1, &MPIR_Scatter_MV2_Binomial},
1938             },
1939         },
1940         {
1941             2048,
1942             7,
1943             {
1944                 {0, 16, &MPIR_Scatter_mcst_wrap_MV2},
1945                 {0, 16,  &MPIR_Scatter_MV2_two_level_Binomial},
1946                 {16, 128, &MPIR_Scatter_MV2_two_level_Binomial},
1947                 {128, 1024, &MPIR_Scatter_MV2_two_level_Direct},
1948                 {1024, 16384, &MPIR_Scatter_MV2_two_level_Direct},
1949                 {16384, 65536, &MPIR_Scatter_MV2_Direct},
1950                 {65536, -1, &MPIR_Scatter_MV2_two_level_Direct},
1951             },
1952             6,
1953             {
1954                 {0, 16, &MPIR_Scatter_MV2_Binomial},
1955                 {16, 128, &MPIR_Scatter_MV2_Binomial},
1956                 {128, 1024, &MPIR_Scatter_MV2_Binomial},
1957                 {1024, 16384, &MPIR_Scatter_MV2_Direct},
1958                 {16384, 65536, &MPIR_Scatter_MV2_Direct},
1959                 {65536, -1, &MPIR_Scatter_MV2_Direct},
1960             },
1961         },
1962     };
1963     table_ptrs[2] = mv2_tmp_scatter_thresholds_table_16ppn;
1964     agg_table_sum = 0;
1965     for (i = 0; i < mv2_scatter_num_ppn_conf; i++) {
1966         agg_table_sum += mv2_size_scatter_tuning_table[i];
1967     }
1968     mv2_scatter_thresholds_table[0] =
1969         static_cast<mv2_scatter_tuning_table*>(xbt_malloc(agg_table_sum * sizeof (mv2_scatter_tuning_table)));
1970     memcpy(mv2_scatter_thresholds_table[0], table_ptrs[0],
1971         (sizeof(mv2_scatter_tuning_table)
1972             * mv2_size_scatter_tuning_table[0]));
1973     for (i = 1; i < mv2_scatter_num_ppn_conf; i++) {
1974         mv2_scatter_thresholds_table[i] =
1975             mv2_scatter_thresholds_table[i - 1]
1976                                          + mv2_size_scatter_tuning_table[i - 1];
1977         memcpy(mv2_scatter_thresholds_table[i], table_ptrs[i],
1978             (sizeof(mv2_scatter_tuning_table)
1979                 * mv2_size_scatter_tuning_table[i]));
1980     }
1981     xbt_free(table_ptrs);
1982   
1983 }
1984