Logo AND Algorithmique Numérique Distribuée

Public GIT Repository
protect these calls against MPI_DATATYPE_NULL errors
[simgrid.git] / src / smpi / colls / smpi_mvapich2_selector_stampede.h
1 /* selector for collective algorithms based on mvapich decision logic, with calibration from Stampede cluster at TACC*/
2
3 /* Copyright (c) 2009-2010, 2013-2014. The SimGrid Team.
4  * All rights reserved.                                                     */
5
6 /* This is the tuning used by MVAPICH for Stampede platform based on (MV2_ARCH_INTEL_XEON_E5_2680_16, MV2_HCA_MLX_CX_FDR) */
7
8
9
10 /************ Alltoall variables and initializers                        */
11
12 #define MV2_MAX_NB_THRESHOLDS  32
13 typedef struct {
14     int min;
15     int max;
16     int (*MV2_pt_Alltoall_function) (void *sendbuf, int sendcount, MPI_Datatype sendtype,
17                                      void *recvbuf, int recvcount, MPI_Datatype recvtype,
18                                      MPI_Comm comm_ptr );
19 } mv2_alltoall_tuning_element;
20
21 typedef struct {
22     int numproc;
23     int size_table;
24     mv2_alltoall_tuning_element algo_table[MV2_MAX_NB_THRESHOLDS];
25     mv2_alltoall_tuning_element in_place_algo_table[MV2_MAX_NB_THRESHOLDS];
26 } mv2_alltoall_tuning_table;
27
28 int (*MV2_Alltoall_function) (void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype, MPI_Comm comm_ptr)=NULL;
29
30 /* Indicates number of processes per node */
31 int *mv2_alltoall_table_ppn_conf = NULL;
32 /* Indicates total number of configurations */
33 int mv2_alltoall_num_ppn_conf = 1;
34 int *mv2_size_alltoall_tuning_table = NULL;
35 mv2_alltoall_tuning_table **mv2_alltoall_thresholds_table = NULL;
36
37
38 #define MPIR_Alltoall_bruck_MV2 smpi_coll_tuned_alltoall_bruck
39 #define MPIR_Alltoall_RD_MV2 smpi_coll_tuned_alltoall_rdb
40 #define MPIR_Alltoall_Scatter_dest_MV2 smpi_coll_tuned_alltoall_ring
41 #define MPIR_Alltoall_pairwise_MV2 smpi_coll_tuned_alltoall_pair
42 #define MPIR_Alltoall_inplace_MV2 smpi_coll_tuned_alltoall_ring 
43
44
45 static void init_mv2_alltoall_tables_stampede(){
46 int i;
47   int agg_table_sum = 0;
48   mv2_alltoall_tuning_table **table_ptrs = NULL;
49   mv2_alltoall_num_ppn_conf = 3;
50   mv2_alltoall_thresholds_table = xbt_malloc(sizeof(mv2_alltoall_tuning_table *)
51                                   * mv2_alltoall_num_ppn_conf);
52   table_ptrs = xbt_malloc(sizeof(mv2_alltoall_tuning_table *)
53               * mv2_alltoall_num_ppn_conf);
54   mv2_size_alltoall_tuning_table = xbt_malloc(sizeof(int) *
55                                    mv2_alltoall_num_ppn_conf);
56   mv2_alltoall_table_ppn_conf = xbt_malloc(mv2_alltoall_num_ppn_conf * sizeof(int));
57   mv2_alltoall_table_ppn_conf[0] = 1;
58   mv2_size_alltoall_tuning_table[0] = 6;
59   mv2_alltoall_tuning_table mv2_tmp_alltoall_thresholds_table_1ppn[] = {
60     {2,
61      1, 
62      {{0, -1, &MPIR_Alltoall_pairwise_MV2},
63      },
64   
65      {{0, -1, &MPIR_Alltoall_inplace_MV2},
66      },
67     },
68   
69     {4,
70      2,
71      {{0, 262144, &MPIR_Alltoall_Scatter_dest_MV2},
72       {262144, -1, &MPIR_Alltoall_pairwise_MV2},
73      },
74                 
75      {{0, -1, &MPIR_Alltoall_inplace_MV2},
76      },
77     },
78   
79     {8,
80      2,
81      {{0, 8, &MPIR_Alltoall_RD_MV2},
82       {8, -1, &MPIR_Alltoall_Scatter_dest_MV2},
83      },
84   
85      {{0, -1, &MPIR_Alltoall_inplace_MV2},
86      },
87     },
88   
89     {16,
90      3,
91      {{0, 64, &MPIR_Alltoall_RD_MV2},
92       {64, 512, &MPIR_Alltoall_bruck_MV2},
93       {512, -1, &MPIR_Alltoall_Scatter_dest_MV2},
94      },
95   
96      {{0,-1, &MPIR_Alltoall_inplace_MV2},
97      },
98     },
99   
100     {32,
101      3,
102      {{0, 32, &MPIR_Alltoall_RD_MV2},
103       {32, 2048, &MPIR_Alltoall_bruck_MV2},
104       {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
105      },
106   
107      {{0, -1, &MPIR_Alltoall_inplace_MV2},
108      },
109     },
110   
111     {64,
112      3,
113      {{0, 8, &MPIR_Alltoall_RD_MV2},
114       {8, 1024, &MPIR_Alltoall_bruck_MV2},
115       {1024, -1, &MPIR_Alltoall_Scatter_dest_MV2},
116      },
117   
118      {{0, -1, &MPIR_Alltoall_inplace_MV2},
119      },
120     },
121         };
122         table_ptrs[0] = mv2_tmp_alltoall_thresholds_table_1ppn;
123         mv2_alltoall_table_ppn_conf[1] = 2;
124         mv2_size_alltoall_tuning_table[1] = 6;
125         mv2_alltoall_tuning_table mv2_tmp_alltoall_thresholds_table_2ppn[] = {
126     {4,
127      2,
128      {{0, 32, &MPIR_Alltoall_RD_MV2},
129       {32, -1, &MPIR_Alltoall_Scatter_dest_MV2},
130      },
131                 
132      {{0, -1, &MPIR_Alltoall_inplace_MV2},
133      },
134     },
135   
136     {8,
137      2,
138      {{0, 64, &MPIR_Alltoall_RD_MV2},
139       {64, -1, &MPIR_Alltoall_Scatter_dest_MV2},
140      },
141                 
142      {{0, -1, &MPIR_Alltoall_inplace_MV2},
143      },
144     },
145   
146     {16,
147      3,
148      {{0, 64, &MPIR_Alltoall_RD_MV2},
149       {64, 2048, &MPIR_Alltoall_bruck_MV2},
150       {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
151      },
152   
153      {{0,-1, &MPIR_Alltoall_inplace_MV2},
154      },
155     },
156   
157     {32,
158      3,
159      {{0, 16, &MPIR_Alltoall_RD_MV2},
160       {16, 2048, &MPIR_Alltoall_bruck_MV2},
161       {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
162      },
163   
164      {{0, -1, &MPIR_Alltoall_inplace_MV2},
165      },
166     },
167   
168     {64,
169      3,
170      {{0, 8, &MPIR_Alltoall_RD_MV2},
171       {8, 1024, &MPIR_Alltoall_bruck_MV2},
172       {1024, -1, &MPIR_Alltoall_Scatter_dest_MV2},
173      },
174   
175      {{0, -1, &MPIR_Alltoall_inplace_MV2},
176      },
177     },
178
179     {128,
180      3,
181      {{0, 4, &MPIR_Alltoall_RD_MV2},
182       {4, 2048, &MPIR_Alltoall_bruck_MV2},
183       {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
184      },
185   
186      {{0, -1, &MPIR_Alltoall_inplace_MV2},
187      },
188     },
189         };
190         table_ptrs[1] = mv2_tmp_alltoall_thresholds_table_2ppn;
191         mv2_alltoall_table_ppn_conf[2] = 16;
192         mv2_size_alltoall_tuning_table[2] = 7;
193         mv2_alltoall_tuning_table mv2_tmp_alltoall_thresholds_table_16ppn[] = {
194     {16,
195      2, 
196      {{0, 2048, &MPIR_Alltoall_bruck_MV2},
197       {2048, -1,  &MPIR_Alltoall_Scatter_dest_MV2},
198      },
199   
200      {{32768, -1, &MPIR_Alltoall_inplace_MV2},
201      },
202     },
203   
204     {32,
205      2,
206      {{0, 2048, &MPIR_Alltoall_bruck_MV2},
207       {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
208      },
209                 
210      {{16384, -1, &MPIR_Alltoall_inplace_MV2},
211      },
212     },
213   
214     {64,
215      3,
216      {{0, 2048, &MPIR_Alltoall_bruck_MV2},
217       {2048, 16384, &MPIR_Alltoall_Scatter_dest_MV2},
218       {16384, -1, &MPIR_Alltoall_pairwise_MV2},
219      },
220   
221      {{32768, 131072, &MPIR_Alltoall_inplace_MV2},
222      },
223     },
224   
225     {128,
226      2,
227      {{0, 2048, &MPIR_Alltoall_bruck_MV2},
228       {2048, -1, &MPIR_Alltoall_pairwise_MV2},
229      },
230   
231      {{16384,65536, &MPIR_Alltoall_inplace_MV2},
232      },
233     },
234   
235     {256,
236      2,
237      {{0, 1024, &MPIR_Alltoall_bruck_MV2},
238       {1024, -1, &MPIR_Alltoall_pairwise_MV2},
239      },
240   
241      {{16384, 65536, &MPIR_Alltoall_inplace_MV2},
242      },
243     },
244   
245     {512,
246      2,
247      {{0, 1024, &MPIR_Alltoall_bruck_MV2},
248       {1024, -1, &MPIR_Alltoall_pairwise_MV2},
249      },
250   
251      {{16384, 65536, &MPIR_Alltoall_inplace_MV2},
252      },
253     },
254     {1024,
255      2,
256      {{0, 1024, &MPIR_Alltoall_bruck_MV2},
257       {1024, -1, &MPIR_Alltoall_pairwise_MV2},
258      },
259   
260      {{16384, 65536, &MPIR_Alltoall_inplace_MV2},
261      },
262     },
263   
264         };
265   table_ptrs[2] = mv2_tmp_alltoall_thresholds_table_16ppn;
266         agg_table_sum = 0;
267         for (i = 0; i < mv2_alltoall_num_ppn_conf; i++) {
268     agg_table_sum += mv2_size_alltoall_tuning_table[i];
269         }
270         mv2_alltoall_thresholds_table[0] =
271     xbt_malloc(agg_table_sum * sizeof (mv2_alltoall_tuning_table));
272         memcpy(mv2_alltoall_thresholds_table[0], table_ptrs[0],
273                     (sizeof(mv2_alltoall_tuning_table)
274                      * mv2_size_alltoall_tuning_table[0]));
275         for (i = 1; i < mv2_alltoall_num_ppn_conf; i++) {
276     mv2_alltoall_thresholds_table[i] =
277             mv2_alltoall_thresholds_table[i - 1]
278             + mv2_size_alltoall_tuning_table[i - 1];
279     memcpy(mv2_alltoall_thresholds_table[i], table_ptrs[i],
280                       (sizeof(mv2_alltoall_tuning_table)
281                        * mv2_size_alltoall_tuning_table[i]));
282         }
283         xbt_free(table_ptrs);
284         
285         
286 }
287
288
289 /************ Allgather variables and initializers                        */
290
291 typedef struct {
292     int min;
293     int max;
294     int (*MV2_pt_Allgather_function)(void *sendbuf,
295                                  int sendcount,
296                                  MPI_Datatype sendtype,
297                                  void *recvbuf,
298                                  int recvcount,
299                                  MPI_Datatype recvtype, MPI_Comm comm_ptr);
300 } mv2_allgather_tuning_element;
301
302 typedef struct {
303     int numproc; 
304     int two_level[MV2_MAX_NB_THRESHOLDS];
305     int size_inter_table;
306     mv2_allgather_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
307 } mv2_allgather_tuning_table;
308
309 int (*MV2_Allgather_function)(void *sendbuf,
310                              int sendcount,
311                              MPI_Datatype sendtype,
312                              void *recvbuf,
313                              int recvcount,
314                              MPI_Datatype recvtype, MPI_Comm comm);
315
316 int *mv2_allgather_table_ppn_conf = NULL;
317 int mv2_allgather_num_ppn_conf = 1;
318 int *mv2_size_allgather_tuning_table = NULL;
319 mv2_allgather_tuning_table **mv2_allgather_thresholds_table = NULL;
320
321 #define MPIR_Allgather_Bruck_MV2 smpi_coll_tuned_allgather_bruck
322 #define MPIR_Allgather_RD_MV2 smpi_coll_tuned_allgather_rdb
323 #define MPIR_Allgather_RD_Allgather_Comm_MV2 smpi_coll_tuned_allgather_rdb
324 #define MPIR_Allgather_Ring_MV2 smpi_coll_tuned_allgather_ring
325
326
327 static void init_mv2_allgather_tables_stampede(){
328 int i;
329   int agg_table_sum = 0;
330 mv2_allgather_tuning_table **table_ptrs = NULL;
331  mv2_allgather_num_ppn_conf = 3;
332         mv2_allgather_thresholds_table
333             = xbt_malloc(sizeof(mv2_allgather_tuning_table *)
334                   * mv2_allgather_num_ppn_conf);
335         table_ptrs = xbt_malloc(sizeof(mv2_allgather_tuning_table *)
336                                  * mv2_allgather_num_ppn_conf);
337         mv2_size_allgather_tuning_table = xbt_malloc(sizeof(int) *
338                                                       mv2_allgather_num_ppn_conf);
339         mv2_allgather_table_ppn_conf 
340             = xbt_malloc(mv2_allgather_num_ppn_conf * sizeof(int));
341         mv2_allgather_table_ppn_conf[0] = 1;
342         mv2_size_allgather_tuning_table[0] = 6;
343         mv2_allgather_tuning_table mv2_tmp_allgather_thresholds_table_1ppn[] = {
344             {
345                 2,
346                 {0},
347                 1,
348                 {
349                     {0, -1, &MPIR_Allgather_Ring_MV2},
350                 },
351             },
352             {
353                 4,
354                 {0,0},
355                 2,
356                 {
357                     {0, 262144, &MPIR_Allgather_RD_MV2},
358                     {262144, -1, &MPIR_Allgather_Ring_MV2},
359                 },
360             },
361             {
362                 8,
363                 {0,0},
364                 2,
365                 {
366                     {0, 131072, &MPIR_Allgather_RD_MV2},
367                     {131072, -1, &MPIR_Allgather_Ring_MV2},
368                 },
369             },
370             {
371                 16,
372                 {0,0},
373                 2,
374                 {
375                     {0, 131072, &MPIR_Allgather_RD_MV2},
376                     {131072, -1, &MPIR_Allgather_Ring_MV2},
377                 },
378             },
379             {
380                 32,
381                 {0,0},
382                 2,
383                 {
384                     {0, 65536, &MPIR_Allgather_RD_MV2},
385                     {65536, -1, &MPIR_Allgather_Ring_MV2},
386                 },
387             },
388             {
389                 64,
390                 {0,0},
391                 2,
392                 {
393                     {0, 32768, &MPIR_Allgather_RD_MV2},
394                     {32768, -1, &MPIR_Allgather_Ring_MV2},
395                 },
396             },
397         };
398         table_ptrs[0] = mv2_tmp_allgather_thresholds_table_1ppn;
399         mv2_allgather_table_ppn_conf[1] = 2;
400         mv2_size_allgather_tuning_table[1] = 6;
401         mv2_allgather_tuning_table mv2_tmp_allgather_thresholds_table_2ppn[] = {
402             {
403                 4,
404                 {0,0},
405                 2,
406                 {
407                     {0, 524288, &MPIR_Allgather_RD_MV2},
408                     {524288, -1, &MPIR_Allgather_Ring_MV2},
409                 },
410             },
411             {
412                 8,
413                 {0,1,0},
414                 2,
415                 {
416                     {0, 32768, &MPIR_Allgather_RD_MV2},
417                     {32768, 524288, &MPIR_Allgather_Ring_MV2},
418                     {524288, -1, &MPIR_Allgather_Ring_MV2},
419                 },
420             },
421             {
422                 16,
423                 {0,1,0},
424                 2,
425                 {
426                     {0, 16384, &MPIR_Allgather_RD_MV2},
427                     {16384, 524288, &MPIR_Allgather_Ring_MV2},
428                     {524288, -1, &MPIR_Allgather_Ring_MV2},
429                 },
430             },
431             {
432                 32,
433                 {1,1,0},
434                 2,
435                 {
436                     {0, 65536, &MPIR_Allgather_RD_MV2},
437                     {65536, 524288, &MPIR_Allgather_Ring_MV2},
438                     {524288, -1, &MPIR_Allgather_Ring_MV2},
439                 },
440             },
441             {
442                 64,
443                 {1,1,0},
444                 2,
445                 {
446                     {0, 32768, &MPIR_Allgather_RD_MV2},
447                     {32768, 524288, &MPIR_Allgather_Ring_MV2},
448                     {524288, -1, &MPIR_Allgather_Ring_MV2},
449                 },
450             },
451             {
452                 128,
453                 {1,1,0},
454                 2,
455                 {
456                     {0, 65536, &MPIR_Allgather_RD_MV2},
457                     {65536, 524288, &MPIR_Allgather_Ring_MV2},
458                     {524288, -1, &MPIR_Allgather_Ring_MV2},
459                 },
460             },
461         };
462         table_ptrs[1] = mv2_tmp_allgather_thresholds_table_2ppn;
463         mv2_allgather_table_ppn_conf[2] = 16;
464         mv2_size_allgather_tuning_table[2] = 6;
465         mv2_allgather_tuning_table mv2_tmp_allgather_thresholds_table_16ppn[] = {
466             {
467                 16,
468                 {0,0},
469                 2,
470                 {
471                     {0, 1024, &MPIR_Allgather_RD_MV2},
472                     {1024, -1, &MPIR_Allgather_Ring_MV2},
473                 },
474             },
475             {
476                 32,
477                 {0,0},
478                 2,
479                 {
480                     {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
481                     {1024, -1, &MPIR_Allgather_Ring_MV2},
482                 },
483             },
484             {
485                 64,
486                 {0,0},
487                 2,
488                 {
489                     {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
490                     {1024, -1, &MPIR_Allgather_Ring_MV2},
491                 },
492             },
493             {
494                 128,
495                 {0,0},
496                 2,
497                 {
498                     {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
499                     {1024, -1, &MPIR_Allgather_Ring_MV2},
500                 },
501             },
502             {
503                 256,
504                 {0,0},
505                 2,
506                 {
507                     {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
508                     {1024, -1, &MPIR_Allgather_Ring_MV2},
509                 },
510             },
511             {
512                 512,
513                 {0,0},
514                 2,
515                 {
516                     {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
517                     {1024, -1, &MPIR_Allgather_Ring_MV2},
518                 },
519             },
520
521         };
522         table_ptrs[2] = mv2_tmp_allgather_thresholds_table_16ppn;
523         agg_table_sum = 0;
524         for (i = 0; i < mv2_allgather_num_ppn_conf; i++) {
525             agg_table_sum += mv2_size_allgather_tuning_table[i];
526         }
527         mv2_allgather_thresholds_table[0] =
528             xbt_malloc(agg_table_sum * sizeof (mv2_allgather_tuning_table));
529         memcpy(mv2_allgather_thresholds_table[0], table_ptrs[0],
530             (sizeof(mv2_allgather_tuning_table)
531                      * mv2_size_allgather_tuning_table[0]));
532         for (i = 1; i < mv2_allgather_num_ppn_conf; i++) {
533             mv2_allgather_thresholds_table[i] =
534             mv2_allgather_thresholds_table[i - 1]
535             + mv2_size_allgather_tuning_table[i - 1];
536             memcpy(mv2_allgather_thresholds_table[i], table_ptrs[i],
537                       (sizeof(mv2_allgather_tuning_table)
538                        * mv2_size_allgather_tuning_table[i]));
539         }
540         xbt_free(table_ptrs);
541 }
542
543
544 /************ Gather variables and initializers                        */
545
546 typedef struct {
547     int min;
548     int max;
549     int (*MV2_pt_Gather_function)(void *sendbuf, int sendcnt,
550                                   MPI_Datatype sendtype, void *recvbuf, int recvcnt,
551                                   MPI_Datatype recvtype, int root, MPI_Comm  comm_ptr);
552 } mv2_gather_tuning_element;
553
554
555 typedef struct {
556     int numproc;
557     int size_inter_table;
558     mv2_gather_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
559     int size_intra_table;
560     mv2_gather_tuning_element intra_node[MV2_MAX_NB_THRESHOLDS];
561 } mv2_gather_tuning_table;
562
563 int mv2_size_gather_tuning_table=7;
564 mv2_gather_tuning_table * mv2_gather_thresholds_table=NULL; 
565
566 typedef int (*MV2_Gather_function_ptr) (void *sendbuf,
567                                         int sendcnt,
568                                         MPI_Datatype sendtype,
569                                         void *recvbuf,
570                                         int recvcnt,
571                                         MPI_Datatype recvtype,
572                                         int root, MPI_Comm comm);
573
574 MV2_Gather_function_ptr MV2_Gather_inter_leader_function = NULL;
575 MV2_Gather_function_ptr MV2_Gather_intra_node_function = NULL;
576
577
578 #define MPIR_Gather_MV2_Direct smpi_coll_tuned_gather_ompi_basic_linear
579 #define MPIR_Gather_MV2_two_level_Direct smpi_coll_tuned_gather_ompi_basic_linear
580 #define MPIR_Gather_intra smpi_coll_tuned_gather_mpich
581
582
583 static void init_mv2_gather_tables_stampede(){
584
585  mv2_size_gather_tuning_table=7;
586       mv2_gather_thresholds_table = xbt_malloc(mv2_size_gather_tuning_table*
587             sizeof (mv2_gather_tuning_table)); 
588       mv2_gather_tuning_table mv2_tmp_gather_thresholds_table[]={
589   {16,
590    2,{{0, 524288, &MPIR_Gather_MV2_Direct},
591       {524288, -1, &MPIR_Gather_intra}},
592    1,{{0, -1, &MPIR_Gather_MV2_Direct}}},
593   {32,
594    3,{{0, 16384, &MPIR_Gather_MV2_Direct}, 
595       {16384, 131072, &MPIR_Gather_intra},
596       {131072, -1, &MPIR_Gather_MV2_two_level_Direct}},
597    1,{{0, -1, &MPIR_Gather_intra}}},
598   {64,
599    3,{{0, 256, &MPIR_Gather_MV2_two_level_Direct}, 
600       {256, 16384, &MPIR_Gather_MV2_Direct},
601       {256, -1, &MPIR_Gather_MV2_two_level_Direct}},
602    1,{{0, -1, &MPIR_Gather_intra}}},
603   {128,
604    3,{{0, 512, &MPIR_Gather_MV2_two_level_Direct}, 
605       {512, 16384, &MPIR_Gather_MV2_Direct},
606       {16384, -1, &MPIR_Gather_MV2_two_level_Direct}},
607    1,{{0, -1, &MPIR_Gather_intra}}},
608   {256,
609    3,{{0, 512, &MPIR_Gather_MV2_two_level_Direct}, 
610       {512, 16384, &MPIR_Gather_MV2_Direct},
611       {16384, -1, &MPIR_Gather_MV2_two_level_Direct}},
612    1,{{0, -1, &MPIR_Gather_intra}}},
613   {512,
614    3,{{0, 512, &MPIR_Gather_MV2_two_level_Direct}, 
615       {512, 16384, &MPIR_Gather_MV2_Direct},
616       {8196, -1, &MPIR_Gather_MV2_two_level_Direct}},
617    1,{{0, -1, &MPIR_Gather_intra}}},
618   {1024,
619    3,{{0, 512, &MPIR_Gather_MV2_two_level_Direct}, 
620       {512, 16384, &MPIR_Gather_MV2_Direct},
621       {8196, -1, &MPIR_Gather_MV2_two_level_Direct}},
622    1,{{0, -1, &MPIR_Gather_intra}}},
623       };
624
625       memcpy(mv2_gather_thresholds_table, mv2_tmp_gather_thresholds_table,
626       mv2_size_gather_tuning_table * sizeof (mv2_gather_tuning_table));
627
628 }
629
630
631 /************ Allgatherv variables and initializers                        */
632
633 typedef struct {
634     int min;
635     int max;
636     int (*MV2_pt_Allgatherv_function)(void *sendbuf,
637                                       int sendcount,
638                                       MPI_Datatype sendtype,
639                                       void *recvbuf,
640                                       int *recvcounts,
641                                       int *displs,
642                                       MPI_Datatype recvtype,
643                                       MPI_Comm commg);
644 } mv2_allgatherv_tuning_element;
645
646 typedef struct {
647     int numproc; 
648     int size_inter_table;
649     mv2_allgatherv_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
650 } mv2_allgatherv_tuning_table;
651
652 int (*MV2_Allgatherv_function)(void *sendbuf,
653                                int sendcount,
654                                MPI_Datatype sendtype,
655                                void *recvbuf,
656                                int *recvcounts,
657                                int *displs,
658                                MPI_Datatype recvtype,
659                                MPI_Comm comm);
660                                
661 int mv2_size_allgatherv_tuning_table = 0;
662 mv2_allgatherv_tuning_table *mv2_allgatherv_thresholds_table = NULL;
663
664 #define MPIR_Allgatherv_Rec_Doubling_MV2 smpi_coll_tuned_allgatherv_mpich_rdb
665 #define MPIR_Allgatherv_Bruck_MV2 smpi_coll_tuned_allgatherv_ompi_bruck
666 #define MPIR_Allgatherv_Ring_MV2 smpi_coll_tuned_allgatherv_mpich_ring
667
668
669 static void init_mv2_allgatherv_tables_stampede(){
670  mv2_size_allgatherv_tuning_table = 6;
671  mv2_allgatherv_thresholds_table = xbt_malloc(mv2_size_allgatherv_tuning_table *
672                                                   sizeof (mv2_allgatherv_tuning_table));
673         mv2_allgatherv_tuning_table mv2_tmp_allgatherv_thresholds_table[] = {
674             {
675                 16,
676                 2,
677                 {
678                     {0, 512, &MPIR_Allgatherv_Rec_Doubling_MV2},
679                     {512, -1, &MPIR_Allgatherv_Ring_MV2},
680                 },
681             },
682             {
683                 32,
684                 2,
685                 {
686                     {0, 512, &MPIR_Allgatherv_Rec_Doubling_MV2},
687                     {512, -1, &MPIR_Allgatherv_Ring_MV2},
688                 },
689             },
690             {
691                 64,
692                 2,
693                 {
694                     {0, 256, &MPIR_Allgatherv_Rec_Doubling_MV2},
695                     {256, -1, &MPIR_Allgatherv_Ring_MV2},
696                 },
697             },
698             {
699                 128,
700                 2,
701                 {
702                     {0, 256, &MPIR_Allgatherv_Rec_Doubling_MV2},
703                     {256, -1, &MPIR_Allgatherv_Ring_MV2},
704                 },
705             },
706             {
707                 256,
708                 2,
709                 {
710                     {0, 256, &MPIR_Allgatherv_Rec_Doubling_MV2},
711                     {256, -1, &MPIR_Allgatherv_Ring_MV2},
712                 },
713             },
714             {
715                 512,
716                 2,
717                 {
718                     {0, 256, &MPIR_Allgatherv_Rec_Doubling_MV2},
719                     {256, -1, &MPIR_Allgatherv_Ring_MV2},
720                 },
721             },
722
723         }; 
724         memcpy(mv2_allgatherv_thresholds_table, mv2_tmp_allgatherv_thresholds_table,
725                   mv2_size_allgatherv_tuning_table * sizeof (mv2_allgatherv_tuning_table));
726 }
727
728
729 /************ Allreduce variables and initializers                        */
730
731 typedef struct {
732     int min;
733     int max;
734     int (*MV2_pt_Allreduce_function)(void *sendbuf,
735                                    void *recvbuf,
736                                    int count,
737                                    MPI_Datatype datatype,
738                                    MPI_Op op, MPI_Comm comm);
739 } mv2_allreduce_tuning_element;
740
741 typedef struct {
742     int numproc; 
743     int mcast_enabled;  
744     int is_two_level_allreduce[MV2_MAX_NB_THRESHOLDS];   
745     int size_inter_table;
746     mv2_allreduce_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
747     int size_intra_table;
748     mv2_allreduce_tuning_element intra_node[MV2_MAX_NB_THRESHOLDS];
749 } mv2_allreduce_tuning_table;
750
751
752 int (*MV2_Allreduce_function)(void *sendbuf,
753                              void *recvbuf,
754                              int count,
755                              MPI_Datatype datatype,
756                              MPI_Op op, MPI_Comm comm)=NULL;
757
758
759 int (*MV2_Allreduce_intra_function)( void *sendbuf,
760                              void *recvbuf,
761                              int count,
762                              MPI_Datatype datatype,
763                              MPI_Op op, MPI_Comm comm)=NULL;
764
765 int mv2_size_allreduce_tuning_table = 0;
766 mv2_allreduce_tuning_table *mv2_allreduce_thresholds_table = NULL;
767
768
769
770
771
772 static int MPIR_Allreduce_mcst_reduce_two_level_helper_MV2( void *sendbuf,
773                              void *recvbuf,
774                              int count,
775                              MPI_Datatype datatype,
776                              MPI_Op op, MPI_Comm comm)
777
778     return 0;
779 }
780
781 static  int MPIR_Allreduce_mcst_reduce_redscat_gather_MV2( void *sendbuf,
782                              void *recvbuf,
783                              int count,
784                              MPI_Datatype datatype,
785                              MPI_Op op, MPI_Comm  comm)
786 {
787     return 0;
788 }
789
790 static  int MPIR_Allreduce_reduce_p2p_MV2( void *sendbuf,
791                              void *recvbuf,
792                              int count,
793                              MPI_Datatype datatype,
794                              MPI_Op op, MPI_Comm  comm)
795 {
796     mpi_coll_reduce_fun(sendbuf,recvbuf,count,datatype,op,0,comm);
797     return MPI_SUCCESS;
798 }
799
800 static  int MPIR_Allreduce_reduce_shmem_MV2( void *sendbuf,
801                              void *recvbuf,
802                              int count,
803                              MPI_Datatype datatype,
804                              MPI_Op op, MPI_Comm  comm)
805 {
806     mpi_coll_reduce_fun(sendbuf,recvbuf,count,datatype,op,0,comm);
807     return MPI_SUCCESS;
808 }
809
/* MVAPICH2 point-to-point allreduce names mapped onto SMPI implementations. */
#define MPIR_Allreduce_pt2pt_rd_MV2  smpi_coll_tuned_allreduce_rdb
#define MPIR_Allreduce_pt2pt_rs_MV2  smpi_coll_tuned_allreduce_mvapich2_rs
812
813
814
815 static void init_mv2_allreduce_tables_stampede(){
816 mv2_size_allreduce_tuning_table = 8;
817       mv2_allreduce_thresholds_table = xbt_malloc(mv2_size_allreduce_tuning_table *
818                sizeof (mv2_allreduce_tuning_table));
819       mv2_allreduce_tuning_table mv2_tmp_allreduce_thresholds_table[] = {
820   {
821     16,
822     0,
823     {1, 0},
824     2,
825     {
826       {0, 1024, &MPIR_Allreduce_pt2pt_rd_MV2},
827       {1024, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
828     },
829     2,
830     {
831       {0, 1024, &MPIR_Allreduce_reduce_shmem_MV2},
832       {1024, -1, &MPIR_Allreduce_reduce_p2p_MV2},
833     },
834   },
835   {
836     32,
837     0,
838     {1, 1, 0},
839     3,
840     {
841       {0, 1024, &MPIR_Allreduce_pt2pt_rd_MV2},
842       {1024, 16384, &MPIR_Allreduce_pt2pt_rd_MV2},
843       {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
844     },
845     2,
846     {
847       {0, 1024, &MPIR_Allreduce_reduce_shmem_MV2},
848       {1024, 16384, &MPIR_Allreduce_reduce_p2p_MV2},
849     },
850   },
851   {
852     64,
853     0,
854     {1, 1, 0},
855     3,
856     {
857       {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2},
858       {512, 16384, &MPIR_Allreduce_pt2pt_rd_MV2},
859       {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
860     },
861     2,
862     {
863       {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
864       {512, 16384, &MPIR_Allreduce_reduce_p2p_MV2},
865     },
866   },
867   {
868     128,
869     0,
870     {1, 1, 0},
871     3,
872     {
873       {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2},
874       {512, 16384, &MPIR_Allreduce_pt2pt_rd_MV2},
875       {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
876     },
877     2,
878     {
879       {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
880       {512, 16384, &MPIR_Allreduce_reduce_p2p_MV2},
881     },
882   },
883   {
884     256,
885     0,
886     {1, 1, 0},
887     3,
888     {
889       {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2},
890       {512, 16384, &MPIR_Allreduce_pt2pt_rd_MV2},
891       {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
892     },
893     2,
894     {
895       {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
896       {512, -1, &MPIR_Allreduce_reduce_p2p_MV2},
897     },
898   },
899   {
900     512,
901     0,
902     {1, 1, 0},
903     3,
904     {
905       {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2},
906       {512, 16384, &MPIR_Allreduce_pt2pt_rd_MV2},
907       {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
908     },
909     2,
910     {
911       {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
912       {512, 16384, &MPIR_Allreduce_reduce_p2p_MV2},
913     },
914   },
915   {
916     1024,
917     0,
918     {1, 1, 1, 0},
919     4,
920     {
921       {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2},
922       {512, 8192, &MPIR_Allreduce_pt2pt_rd_MV2},
923       {8192, 65536, &MPIR_Allreduce_pt2pt_rs_MV2},
924       {65536, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
925     },
926     2,
927     {
928       {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
929       {512, -1, &MPIR_Allreduce_reduce_p2p_MV2},
930     },
931   },
932   {
933     2048,
934     0,
935     {1, 1, 1, 0},
936     4,
937     {
938       {0, 64, &MPIR_Allreduce_pt2pt_rd_MV2},
939       {64, 512, &MPIR_Allreduce_reduce_p2p_MV2},
940       {512, 4096, &MPIR_Allreduce_mcst_reduce_two_level_helper_MV2},
941       {4096, 16384, &MPIR_Allreduce_pt2pt_rs_MV2},
942       {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
943     },
944     2,
945     {
946       {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
947       {512, -1, &MPIR_Allreduce_reduce_p2p_MV2},
948     },
949   },
950  
951       }; 
952       memcpy(mv2_allreduce_thresholds_table, mv2_tmp_allreduce_thresholds_table,
953       mv2_size_allreduce_tuning_table * sizeof (mv2_allreduce_tuning_table));
954 }
955
956
957 /*
958 Bcast tuning is deactivated for now; broadcast defaults to the MPICH one.
959 typedef struct {
960     int min;
961     int max;
962     int (*MV2_pt_Bcast_function) (void *buf, int count, MPI_Datatype datatype,
963                                   int root, MPI_Comm comm_ptr);
964     int zcpy_pipelined_knomial_factor;
965 } mv2_bcast_tuning_element;
966
967 typedef struct {
968     int numproc;
969     int bcast_segment_size;
970     int intra_node_knomial_factor;
971     int inter_node_knomial_factor;
972     int is_two_level_bcast[MV2_MAX_NB_THRESHOLDS];
973     int size_inter_table;
974     mv2_bcast_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
975     int size_intra_table;
976     mv2_bcast_tuning_element intra_node[MV2_MAX_NB_THRESHOLDS];
977 } mv2_bcast_tuning_table;
978
979 int mv2_size_bcast_tuning_table = 0;
980 mv2_bcast_tuning_table *mv2_bcast_thresholds_table = NULL;
981
982
983 int (*MV2_Bcast_function) (void *buffer, int count, MPI_Datatype datatype,
984                            int root, MPI_Comm comm_ptr) = NULL;
985
986 int (*MV2_Bcast_intra_node_function) (void *buffer, int count, MPI_Datatype datatype,
987                                       int root, MPI_Comm comm_ptr) = NULL;
988                                       
989                                       
990 */
991
992
993 /*
994 static void init_mv2_bcast_tables_stampede(){
995  //Stampede,
996         mv2_size_bcast_tuning_table=8;
997         mv2_bcast_thresholds_table = xbt_malloc(mv2_size_bcast_tuning_table *
998                                                  sizeof (mv2_bcast_tuning_table));
999
1000   mv2_bcast_tuning_table mv2_tmp_bcast_thresholds_table[]={
1001     {
1002             16,
1003             8192, 4, 4,
1004             {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
1005             11,
1006             {
1007               {0, 8, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1008               {8, 16, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1009               {16, 1024, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1010               {1024, 8192, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1011               {8192, 16384, &MPIR_Bcast_binomial_MV2, -1},
1012               {16384, 32768, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1013               {32768, 65536, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1014               {65536, 131072, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1},
1015               {131072, 262144, &MPIR_Bcast_scatter_ring_allgather_MV2, -1},
1016               {262144, 524288, &MPIR_Bcast_scatter_doubling_allgather_MV2, -1},
1017               {524288, -1, &MPIR_Bcast_scatter_ring_allgather_MV2, -1}
1018             },
1019             11,
1020             {
1021               {0, 8, &MPIR_Shmem_Bcast_MV2, 2},
1022               {8, 16, &MPIR_Shmem_Bcast_MV2, 4},
1023               {16, 1024, &MPIR_Shmem_Bcast_MV2, 2},
1024               {1024, 8192, &MPIR_Shmem_Bcast_MV2, 4},
1025               {8192, 16384, &MPIR_Shmem_Bcast_MV2, -1},
1026               {16384, 32768, &MPIR_Shmem_Bcast_MV2, 4},
1027               {32768, 65536, &MPIR_Shmem_Bcast_MV2, 2},
1028               {65536, 131072, &MPIR_Shmem_Bcast_MV2, -1},
1029               {131072, 262144, &MPIR_Shmem_Bcast_MV2, -1},
1030               {262144, 524288, &MPIR_Shmem_Bcast_MV2, -1},
1031               {524288, -1, &MPIR_Shmem_Bcast_MV2, -1}
1032             }
1033     },
1034     {
1035             32,
1036             8192, 4, 4,
1037             {1, 1, 1, 1, 1, 1, 1, 1},
1038             8,
1039             {
1040               {0, 128, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1041               {128, 256, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1042               {256, 32768, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1043               {32768, 65536, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1044               {65536, 131072, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1045               {131072, 262144, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1046               {262144, 524288, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1047               {524288, -1, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8}
1048             },
1049             8,
1050             {
1051               {0, 128, &MPIR_Shmem_Bcast_MV2, 2},
1052               {128, 256, &MPIR_Shmem_Bcast_MV2, 4},
1053               {256, 32768, &MPIR_Shmem_Bcast_MV2, 2},
1054               {32768, 65536, &MPIR_Shmem_Bcast_MV2, 4},
1055               {65536, 131072, &MPIR_Shmem_Bcast_MV2, 2},
1056               {131072, 262144, &MPIR_Shmem_Bcast_MV2, 8},
1057               {262144, 524288, &MPIR_Shmem_Bcast_MV2, 2},
1058               {524288, -1, &MPIR_Shmem_Bcast_MV2, 8}
1059             }
1060     },
1061     {
1062             64,
1063             8192, 4, 4,
1064             {1, 1, 1, 1, 1, 1, 1, 1, 1},
1065             9,
1066             {
1067               {0, 2, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1068               {2, 4, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1069               {4, 16, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1070               {16, 32, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1071               {32, 128, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1072               {128, 256, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1073               {256, 4096, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1074               {4096, 32768, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1075               {32768, -1, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2}
1076             },
1077             9,
1078             {
1079               {0, 2, &MPIR_Shmem_Bcast_MV2, 4},
1080               {2, 4, &MPIR_Shmem_Bcast_MV2, 8},
1081               {4, 16, &MPIR_Shmem_Bcast_MV2, 4},
1082               {16, 32, &MPIR_Shmem_Bcast_MV2, 8},
1083               {32, 128, &MPIR_Shmem_Bcast_MV2, 4},
1084               {128, 256, &MPIR_Shmem_Bcast_MV2, 8},
1085               {256, 4096, &MPIR_Shmem_Bcast_MV2, 4},
1086               {4096, 32768, &MPIR_Shmem_Bcast_MV2, 8},
1087               {32768, -1, &MPIR_Shmem_Bcast_MV2, 2}
1088             }
1089     },
1090     {
1091             128,
1092             8192, 4, 4,
1093             {1, 1, 1, 0},
1094             4,
1095             {
1096               {0, 8192, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1097               {8192, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1098               {16384, 524288, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1099               {524288, -1, &MPIR_Bcast_scatter_ring_allgather_MV2, -1}
1100             },
1101             4,
1102             {
1103               {0, 8192, &MPIR_Shmem_Bcast_MV2, 8},
1104               {8192, 16384, &MPIR_Shmem_Bcast_MV2, 4},
1105               {16384, 524288, &MPIR_Shmem_Bcast_MV2, 2},
1106               {524288, -1, NULL, -1}
1107             }
1108     },
1109     {
1110             256,
1111             8192, 4, 4,
1112             {1, 1, 1, 1, 1},
1113             5,
1114             {
1115               {0, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1116               {16384, 131072, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1117               {131072, 262144, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1},
1118               {262144, 524288, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1119               {524288, -1, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1}
1120             },
1121             5,
1122             {
1123               {0, 16384, &MPIR_Shmem_Bcast_MV2, 4},
1124               {16384, 131072, &MPIR_Shmem_Bcast_MV2, 2},
1125               {131072, 262144, &MPIR_Shmem_Bcast_MV2, -1},
1126               {262144, 524288, &MPIR_Shmem_Bcast_MV2, 2},
1127               {524288, -1, &MPIR_Shmem_Bcast_MV2, -1}
1128             }
1129     },
1130     {
1131             512,
1132             8192, 4, 4,
1133             {1, 1, 1, 1, 1},
1134             5,
1135             {
1136               {0, 4096, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1137               {4096, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1138               {16384, 131072, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1139               {131072, 262144, &MPIR_Pipelined_Bcast_MV2, -1},
1140               {262144, -1, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1}
1141             },
1142             5,
1143             {
1144               {0, 4096, &MPIR_Shmem_Bcast_MV2, 8},
1145               {4096, 16384, &MPIR_Shmem_Bcast_MV2, 4},
1146               {16384, 131072, &MPIR_Shmem_Bcast_MV2, 2},
1147               {131072, 262144, &MPIR_Shmem_Bcast_MV2, -1},
1148               {262144, -1, &MPIR_Shmem_Bcast_MV2, -1}
1149             }
1150     },
1151     {
1152             1024,
1153             8192, 4, 4,
1154             {1, 1, 1, 1, 1},
1155             5,
1156             {
1157               {0, 8192, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1158               {8192, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1159               {16384, 65536, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1160               {65536, 524288, &MPIR_Pipelined_Bcast_MV2, -1},
1161               {524288, -1, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1}
1162             },
1163             5,
1164             {
1165               {0, 8192, &MPIR_Shmem_Bcast_MV2, 8},
1166               {8192, 16384, &MPIR_Shmem_Bcast_MV2, 4},
1167               {16384, 65536, &MPIR_Shmem_Bcast_MV2, 2},
1168               {65536, 524288, &MPIR_Shmem_Bcast_MV2, -1},
1169               {524288, -1, &MPIR_Shmem_Bcast_MV2, -1}
1170             }
1171     },
1172     {
1173             2048,
1174             8192, 4, 4,
1175             {1, 1, 1, 1, 1, 1, 1},
1176             7,
1177             {
1178               {0, 16, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1179               {16, 32, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1180               {32, 4096, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1181               {4096, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1182               {16384, 32768, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1183               {32768, 524288, &MPIR_Pipelined_Bcast_MV2, -1},
1184               {524288, -1, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1}
1185             },
1186             7,
1187             {
1188               {0, 16, &MPIR_Shmem_Bcast_MV2, 8},
1189               {16, 32, &MPIR_Shmem_Bcast_MV2, 4},
1190               {32, 4096, &MPIR_Shmem_Bcast_MV2, 8},
1191               {4096, 16384, &MPIR_Shmem_Bcast_MV2, 4},
1192               {16384, 32768, &MPIR_Shmem_Bcast_MV2, 2},
1193               {32768, 524288, &MPIR_Shmem_Bcast_MV2, -1},
1194               {524288, -1, &MPIR_Shmem_Bcast_MV2, -1}
1195             }
1196     }
1197   };
1198
1199         memcpy(mv2_bcast_thresholds_table, mv2_tmp_bcast_thresholds_table,
1200                     mv2_size_bcast_tuning_table * sizeof (mv2_bcast_tuning_table));
1201 }*/
1202
1203
1204 /************ Reduce variables and initializers                        */
1205
1206 typedef struct {
1207     int min;
1208     int max;
1209     int (*MV2_pt_Reduce_function)(void *sendbuf,
1210                                  void *recvbuf,
1211                                  int count,
1212                                  MPI_Datatype datatype,
1213                                  MPI_Op op,
1214                                  int root,
1215                                  MPI_Comm  comm_ptr);
1216 } mv2_reduce_tuning_element;
1217
1218 typedef struct {
1219     int numproc; 
1220     int inter_k_degree;
1221     int intra_k_degree;
1222     int is_two_level_reduce[MV2_MAX_NB_THRESHOLDS];
1223     int size_inter_table;
1224     mv2_reduce_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
1225     int size_intra_table;
1226     mv2_reduce_tuning_element intra_node[MV2_MAX_NB_THRESHOLDS];
1227 } mv2_reduce_tuning_table;
1228
1229 int mv2_size_reduce_tuning_table = 0;
1230 mv2_reduce_tuning_table *mv2_reduce_thresholds_table = NULL;
1231
1232
1233 int mv2_reduce_intra_knomial_factor = -1;
1234 int mv2_reduce_inter_knomial_factor = -1;
1235
1236 int (*MV2_Reduce_function)( void *sendbuf,
1237                            void *recvbuf,
1238                            int count,
1239                            MPI_Datatype datatype,
1240                            MPI_Op op,
1241                            int root,
1242                            MPI_Comm  comm_ptr)=NULL;
1243
1244 int (*MV2_Reduce_intra_function)( void *sendbuf,
1245                                  void *recvbuf,
1246                                  int count,
1247                                  MPI_Datatype datatype,
1248                                  MPI_Op op,
1249                                  int root,
1250                                  MPI_Comm  comm_ptr)=NULL;
1251                                  
1252                                  
/*
 * Map the MVAPICH2 Reduce algorithm names used by the tuning table onto SMPI
 * implementations. Both knomial wrappers and the plain binomial name resolve
 * to the same SMPI binomial reduce.
 */
#define MPIR_Reduce_inter_knomial_wrapper_MV2  smpi_coll_tuned_reduce_binomial
#define MPIR_Reduce_intra_knomial_wrapper_MV2  smpi_coll_tuned_reduce_binomial
#define MPIR_Reduce_binomial_MV2               smpi_coll_tuned_reduce_binomial
#define MPIR_Reduce_redscat_gather_MV2         smpi_coll_tuned_reduce_scatter_gather
#define MPIR_Reduce_shmem_MV2                  smpi_coll_tuned_reduce_ompi_basic_linear
1258
1259
1260
1261 static void init_mv2_reduce_tables_stampede(){
1262  /*Stampede*/
1263         mv2_size_reduce_tuning_table = 8;
1264         mv2_reduce_thresholds_table = xbt_malloc(mv2_size_reduce_tuning_table *
1265                                                   sizeof (mv2_reduce_tuning_table));
1266         mv2_reduce_tuning_table mv2_tmp_reduce_thresholds_table[] = {
1267     {
1268       16,
1269       4,
1270       4,
1271       {1, 0, 0},
1272       3,
1273       {
1274         {0, 262144, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1275         {262144, 1048576, &MPIR_Reduce_binomial_MV2},
1276         {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
1277       },
1278       2,
1279       {
1280         {0, 65536, &MPIR_Reduce_shmem_MV2},
1281         {65536,-1,  &MPIR_Reduce_binomial_MV2},
1282       },
1283     },
1284     {
1285       32,
1286       4,
1287       4,
1288       {1, 1, 1, 1, 0, 0, 0},
1289       7,
1290       {
1291         {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1292         {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1293         {16384, 32768, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1294         {32768, 65536, &MPIR_Reduce_binomial_MV2},
1295         {65536, 262144, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1296         {262144, 1048576, &MPIR_Reduce_binomial_MV2},
1297         {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
1298       },
1299       6,
1300       {
1301         {0, 8192, &MPIR_Reduce_shmem_MV2},
1302         {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1303         {16384, 32768, &MPIR_Reduce_shmem_MV2},
1304         {32768, 65536, &MPIR_Reduce_shmem_MV2},
1305         {65536, 262144, &MPIR_Reduce_shmem_MV2},
1306         {262144,-1,  &MPIR_Reduce_binomial_MV2},
1307       },
1308     },
1309     {
1310       64,
1311       4,
1312       4,
1313       {1, 1, 1, 1, 0},
1314       5,
1315       {
1316         {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1317         {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1318         {16384, 65536, &MPIR_Reduce_binomial_MV2},
1319         {65536, 262144, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1320         {262144, -1, &MPIR_Reduce_redscat_gather_MV2},
1321       },
1322       5,
1323       {
1324         {0, 8192, &MPIR_Reduce_shmem_MV2},
1325         {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1326         {16384, 65536, &MPIR_Reduce_shmem_MV2},
1327         {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1328         {262144, -1, &MPIR_Reduce_binomial_MV2},
1329       },
1330     },
1331     {
1332       128,
1333       4,
1334       4,
1335       {1, 0, 1, 0, 1, 0},
1336       6,
1337       {
1338         {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1339         {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1340         {16384, 65536, &MPIR_Reduce_binomial_MV2},
1341         {65536, 262144, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1342         {262144, 1048576, &MPIR_Reduce_binomial_MV2},
1343         {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
1344       },
1345       5,
1346       {
1347         {0, 8192, &MPIR_Reduce_shmem_MV2},
1348         {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1349         {16384, 65536, &MPIR_Reduce_shmem_MV2},
1350         {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1351         {262144, -1, &MPIR_Reduce_binomial_MV2},
1352       },
1353     },
1354     {
1355       256,
1356       4,
1357       4,
1358       {1, 1, 1, 0, 1, 1, 0},
1359       7,
1360       {
1361         {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1362         {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1363         {16384, 32768, &MPIR_Reduce_binomial_MV2},
1364         {32768, 65536, &MPIR_Reduce_binomial_MV2},
1365         {65536, 262144, &MPIR_Reduce_binomial_MV2},
1366         {262144, 1048576, &MPIR_Reduce_binomial_MV2},
1367         {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
1368       },
1369       6,
1370       {
1371         {0, 8192, &MPIR_Reduce_shmem_MV2},
1372         {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1373         {16384, 32768, &MPIR_Reduce_shmem_MV2},
1374         {32768, 65536, &MPIR_Reduce_shmem_MV2},
1375         {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1376         {262144, -1, &MPIR_Reduce_binomial_MV2},
1377       },
1378     },
1379     {
1380       512,
1381       4,
1382       4,
1383       {1, 0, 1, 1, 1, 0},
1384       6,
1385       {
1386         {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1387         {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1388         {16384, 65536, &MPIR_Reduce_binomial_MV2},
1389         {65536, 262144, &MPIR_Reduce_binomial_MV2},
1390         {262144, 1048576, &MPIR_Reduce_binomial_MV2},
1391         {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
1392       },
1393       5,
1394       {
1395         {0, 8192, &MPIR_Reduce_shmem_MV2},
1396         {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1397         {16384, 65536, &MPIR_Reduce_shmem_MV2},
1398         {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1399         {262144, -1, &MPIR_Reduce_binomial_MV2},
1400       },
1401     },
1402     {
1403       1024,
1404       4,
1405       4,
1406       {1, 0, 1, 1, 1},
1407       5,
1408       {
1409         {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1410         {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1411         {16384, 65536, &MPIR_Reduce_binomial_MV2},
1412         {65536, 262144, &MPIR_Reduce_binomial_MV2},
1413         {262144, -1, &MPIR_Reduce_binomial_MV2},
1414       },
1415       5,
1416       {
1417         {0, 8192, &MPIR_Reduce_shmem_MV2},
1418         {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1419         {16384, 65536, &MPIR_Reduce_shmem_MV2},
1420         {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1421         {262144, -1, &MPIR_Reduce_binomial_MV2},
1422       },
1423     },
1424     {
1425       2048,
1426       4,
1427       4,
1428       {1, 0, 1, 1, 1,1},
1429       6,
1430       {
1431         {0, 2048, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1432         {2048, 4096, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1433         {4096, 16384, &MPIR_Reduce_binomial_MV2},
1434         {16384, 65536, &MPIR_Reduce_binomial_MV2},
1435         {65536, 131072, &MPIR_Reduce_binomial_MV2},
1436         {131072, -1, &MPIR_Reduce_binomial_MV2},
1437       },
1438       6,
1439       {
1440         {0, 2048, &MPIR_Reduce_shmem_MV2},
1441         {2048, 4096, &MPIR_Reduce_shmem_MV2},
1442         {4096, 16384, &MPIR_Reduce_shmem_MV2},
1443         {16384, 65536, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1444         {65536, 131072, &MPIR_Reduce_binomial_MV2},
1445         {131072, -1, &MPIR_Reduce_shmem_MV2},
1446       },
1447     },
1448
1449         }; 
1450         memcpy(mv2_reduce_thresholds_table, mv2_tmp_reduce_thresholds_table,
1451         mv2_size_reduce_tuning_table * sizeof (mv2_reduce_tuning_table));
1452 }
1453
1454 /************ Reduce scatter variables and initializers                        */
1455
1456 typedef struct {
1457     int min;
1458     int max;
1459     int (*MV2_pt_Red_scat_function)(void *sendbuf,
1460                                     void *recvbuf,
1461                                     int *recvcnts,
1462                                     MPI_Datatype datatype,
1463                                     MPI_Op op,
1464                                     MPI_Comm comm_ptr);
1465 } mv2_red_scat_tuning_element;
1466
1467 typedef struct {
1468     int numproc; 
1469     int size_inter_table;
1470     mv2_red_scat_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
1471 } mv2_red_scat_tuning_table;
1472
1473 int mv2_size_red_scat_tuning_table = 0;
1474 mv2_red_scat_tuning_table *mv2_red_scat_thresholds_table = NULL;
1475
1476
1477 int (*MV2_Red_scat_function)(void *sendbuf,
1478                              void *recvbuf,
1479                              int *recvcnts,
1480                              MPI_Datatype datatype,
1481                              MPI_Op op,
1482                              MPI_Comm comm_ptr);
1483                              
1484                              
1485
1486 static  int MPIR_Reduce_Scatter_Basic_MV2(void *sendbuf,
1487                              void *recvbuf,
1488                              int *recvcnts,
1489                              MPI_Datatype datatype,
1490                              MPI_Op op,
1491                              MPI_Comm comm)
1492 {
1493     smpi_mpi_reduce_scatter(sendbuf,recvbuf,recvcnts,datatype,op,comm);
1494     return MPI_SUCCESS;
1495 }
/* Map the MVAPICH2 Reduce_scatter algorithm names onto SMPI implementations. */
#define MPIR_Reduce_scatter_non_comm_MV2     smpi_coll_tuned_reduce_scatter_mpich_noncomm
#define MPIR_Reduce_scatter_Rec_Halving_MV2  smpi_coll_tuned_reduce_scatter_ompi_basic_recursivehalving
#define MPIR_Reduce_scatter_Pair_Wise_MV2    smpi_coll_tuned_reduce_scatter_mpich_pair
1499
1500
1501
1502
1503 static void init_mv2_reduce_scatter_tables_stampede(){
1504         mv2_size_red_scat_tuning_table = 6;
1505         mv2_red_scat_thresholds_table = xbt_malloc(mv2_size_red_scat_tuning_table *
1506                                                   sizeof (mv2_red_scat_tuning_table));
1507         mv2_red_scat_tuning_table mv2_tmp_red_scat_thresholds_table[] = {
1508             {
1509                 16,
1510                 3,
1511                 {
1512                     {0, 64, &MPIR_Reduce_Scatter_Basic_MV2},
1513                     {64, 65536, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1514                     {65536, -1, &MPIR_Reduce_scatter_Pair_Wise_MV2},
1515                 },
1516             },
1517             {
1518                 32,
1519                 3,
1520                 {
1521                     {0, 64, &MPIR_Reduce_Scatter_Basic_MV2},
1522                     {64, 131072, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1523                     {131072, -1, &MPIR_Reduce_scatter_Pair_Wise_MV2},
1524                 },
1525             },
1526             {
1527                 64,
1528                 3,
1529                 {
1530                     {0, 1024, &MPIR_Reduce_Scatter_Basic_MV2},
1531                     {1024, 262144, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1532                     {262144, -1, &MPIR_Reduce_scatter_Pair_Wise_MV2},
1533                 },
1534             },
1535             {
1536                 128,
1537                 2,
1538                 {
1539                     {0, 128, &MPIR_Reduce_Scatter_Basic_MV2},
1540                     {128, -1, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1541                 },
1542             },
1543             {
1544                 256,
1545                 2,
1546                 {
1547                     {0, 128, &MPIR_Reduce_Scatter_Basic_MV2},
1548                     {128, -1, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1549                 },
1550             },
1551             {
1552                 512,
1553                 2,
1554                 {
1555                     {0, 256, &MPIR_Reduce_Scatter_Basic_MV2},
1556                     {256, -1, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1557                 },
1558             },
1559
1560         }; 
1561         memcpy(mv2_red_scat_thresholds_table, mv2_tmp_red_scat_thresholds_table,
1562                   mv2_size_red_scat_tuning_table * sizeof (mv2_red_scat_tuning_table));
1563 }
1564
1565 /************ Scatter variables and initializers                        */
1566
1567 typedef struct {
1568     int min;
1569     int max;
1570     int (*MV2_pt_Scatter_function)(void *sendbuf,
1571                                    int sendcnt,
1572                                    MPI_Datatype sendtype,
1573                                    void *recvbuf,
1574                                    int recvcnt,
1575                                    MPI_Datatype recvtype,
1576                                    int root, MPI_Comm comm);
1577 } mv2_scatter_tuning_element;
1578
1579 typedef struct {
1580     int numproc;
1581     int size_inter_table;
1582     mv2_scatter_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
1583     int size_intra_table;
1584     mv2_scatter_tuning_element intra_node[MV2_MAX_NB_THRESHOLDS];
1585 } mv2_scatter_tuning_table;
1586
1587
1588 int *mv2_scatter_table_ppn_conf = NULL;
1589 int mv2_scatter_num_ppn_conf = 1;
1590 int *mv2_size_scatter_tuning_table = NULL;
1591 mv2_scatter_tuning_table **mv2_scatter_thresholds_table = NULL;
1592
1593 int (*MV2_Scatter_function) (void *sendbuf, int sendcount, MPI_Datatype sendtype,
1594                              void *recvbuf, int recvcount, MPI_Datatype recvtype,
1595                              int root, MPI_Comm comm)=NULL;
1596
1597 int (*MV2_Scatter_intra_function) (void *sendbuf, int sendcount, MPI_Datatype sendtype,
1598                              void *recvbuf, int recvcount, MPI_Datatype recvtype,
1599                              int root, MPI_Comm comm)=NULL;
1600 int MPIR_Scatter_mcst_wrap_MV2(void *sendbuf,
1601                               int sendcnt,
1602                               MPI_Datatype sendtype,
1603                               void *recvbuf,
1604                               int recvcnt,
1605                               MPI_Datatype recvtype,
1606                               int root, MPI_Comm comm_ptr);
1607                               
1608 int MPIR_Scatter_mcst_wrap_MV2(void *sendbuf,
1609                               int sendcnt,
1610                               MPI_Datatype sendtype,
1611                               void *recvbuf,
1612                               int recvcnt,
1613                               MPI_Datatype recvtype,
1614                               int root, MPI_Comm comm_ptr)
1615 {
1616     return 0;
1617 }
1618
/*
 * Map the MVAPICH2 Scatter algorithm names onto SMPI implementations. Each
 * two-level name resolves to the same SMPI routine as its flat counterpart.
 */
#define MPIR_Scatter_MV2_Binomial            smpi_coll_tuned_scatter_ompi_binomial
#define MPIR_Scatter_MV2_Direct              smpi_coll_tuned_scatter_ompi_basic_linear
#define MPIR_Scatter_MV2_two_level_Binomial  smpi_coll_tuned_scatter_ompi_binomial
#define MPIR_Scatter_MV2_two_level_Direct    smpi_coll_tuned_scatter_ompi_basic_linear
1623
1624
1625
1626
1627 static void init_mv2_scatter_tables_stampede(){
1628 {
1629     int agg_table_sum = 0;
1630     int i;
1631     mv2_scatter_tuning_table **table_ptrs = NULL;
1632      mv2_scatter_num_ppn_conf = 3;
1633         mv2_scatter_thresholds_table
1634     = xbt_malloc(sizeof(mv2_scatter_tuning_table *)
1635       * mv2_scatter_num_ppn_conf);
1636         table_ptrs = xbt_malloc(sizeof(mv2_scatter_tuning_table *)
1637                                  * mv2_scatter_num_ppn_conf);
1638         mv2_size_scatter_tuning_table = xbt_malloc(sizeof(int) *
1639                 mv2_scatter_num_ppn_conf);
1640         mv2_scatter_table_ppn_conf 
1641     = xbt_malloc(mv2_scatter_num_ppn_conf * sizeof(int));
1642         mv2_scatter_table_ppn_conf[0] = 1;
1643         mv2_size_scatter_tuning_table[0] = 6;
1644         mv2_scatter_tuning_table mv2_tmp_scatter_thresholds_table_1ppn[] = {
1645     {2,
1646      1, 
1647      {
1648        {0, -1, &MPIR_Scatter_MV2_Binomial},
1649      },
1650      1,
1651      {
1652        {0, -1, &MPIR_Scatter_MV2_Binomial},
1653      },
1654     },
1655
1656     {4,
1657      1, 
1658      {
1659        {0, -1, &MPIR_Scatter_MV2_Direct},
1660      },
1661      1,
1662      {
1663        {0, -1, &MPIR_Scatter_MV2_Direct},
1664      },
1665     },
1666   
1667     {8,
1668      1, 
1669      {
1670        {0, -1, &MPIR_Scatter_MV2_Direct},
1671      },
1672      1,
1673      {
1674        {0, -1, &MPIR_Scatter_MV2_Direct},
1675      },
1676     },
1677   
1678     {16,
1679      1, 
1680      {
1681        {0, -1, &MPIR_Scatter_MV2_Direct},
1682      },
1683      1,
1684      {
1685        {0, -1, &MPIR_Scatter_MV2_Direct},
1686      },
1687     },
1688   
1689     {32,
1690      1, 
1691      {
1692        {0, -1, &MPIR_Scatter_MV2_Direct},
1693      },
1694      1,
1695      {
1696        {0, -1, &MPIR_Scatter_MV2_Direct},
1697      },
1698     },
1699   
1700     {64,
1701      2, 
1702      {
1703        {0, 32, &MPIR_Scatter_MV2_Binomial},
1704        {32, -1, &MPIR_Scatter_MV2_Direct},
1705      },
1706      1,
1707      {
1708        {0, -1, &MPIR_Scatter_MV2_Binomial},
1709      },
1710     },
1711         };
1712         table_ptrs[0] = mv2_tmp_scatter_thresholds_table_1ppn;
1713         mv2_scatter_table_ppn_conf[1] = 2;
1714         mv2_size_scatter_tuning_table[1] = 6;
1715         mv2_scatter_tuning_table mv2_tmp_scatter_thresholds_table_2ppn[] = {
1716     {4,
1717      2, 
1718      {
1719        {0, 4096, &MPIR_Scatter_MV2_Binomial},
1720        {4096, -1, &MPIR_Scatter_MV2_Direct},
1721      },
1722      1,
1723      {
1724        {0, -1, &MPIR_Scatter_MV2_Direct},
1725      },
1726     },
1727   
1728     {8,
1729      2, 
1730      {
1731        {0, 512, &MPIR_Scatter_MV2_two_level_Direct},
1732        {512, -1, &MPIR_Scatter_MV2_Direct},
1733      },
1734      1,
1735      {
1736        {0, -1, &MPIR_Scatter_MV2_Binomial},
1737      },
1738     },
1739   
1740     {16,
1741      2, 
1742      {
1743        {0, 2048, &MPIR_Scatter_MV2_two_level_Direct},
1744        {2048, -1, &MPIR_Scatter_MV2_Direct},
1745      },
1746      1,
1747      {
1748        {0, -1, &MPIR_Scatter_MV2_Binomial},
1749      },
1750     },
1751   
1752     {32,
1753      2, 
1754      {
1755        {0, 2048, &MPIR_Scatter_MV2_two_level_Direct},
1756        {2048, -1, &MPIR_Scatter_MV2_Direct},
1757      },
1758      1,
1759      {
1760        {0, -1, &MPIR_Scatter_MV2_Binomial},
1761      },
1762     },
1763   
1764     {64,
1765      2, 
1766      {
1767        {0, 8192, &MPIR_Scatter_MV2_two_level_Direct},
1768        {8192, -1, &MPIR_Scatter_MV2_Direct},
1769      },
1770      1,
1771      {
1772        {0, -1, &MPIR_Scatter_MV2_Binomial},
1773      },
1774     },
1775   
1776     {128,
1777      4, 
1778      {
1779        {0, 16, &MPIR_Scatter_MV2_Binomial},
1780        {16, 128, &MPIR_Scatter_MV2_two_level_Binomial},
1781        {128, 16384, &MPIR_Scatter_MV2_two_level_Direct},
1782        {16384, -1, &MPIR_Scatter_MV2_Direct},
1783      },
1784      1,
1785      {
1786        {0, 128, &MPIR_Scatter_MV2_Direct},
1787        {128, -1, &MPIR_Scatter_MV2_Binomial},
1788      },
1789     },
1790         };
1791         table_ptrs[1] = mv2_tmp_scatter_thresholds_table_2ppn;
1792         mv2_scatter_table_ppn_conf[2] = 16;
1793         mv2_size_scatter_tuning_table[2] = 8;
1794         mv2_scatter_tuning_table mv2_tmp_scatter_thresholds_table_16ppn[] = {
1795     {
1796       16,
1797       2,
1798       { 
1799         {0, 256, &MPIR_Scatter_MV2_Binomial}, 
1800         {256, -1, &MPIR_Scatter_MV2_Direct},
1801       },
1802       1, 
1803       { 
1804         { 0, -1, &MPIR_Scatter_MV2_Direct},
1805       },
1806     },
1807
1808     {
1809       32,
1810       2,
1811       {
1812         {0, 512, &MPIR_Scatter_MV2_Binomial}, 
1813         {512, -1, &MPIR_Scatter_MV2_Direct},
1814       },
1815       1, 
1816       { 
1817         { 0, -1, &MPIR_Scatter_MV2_Direct},
1818       },
1819     },
1820
1821     {
1822       64,
1823       2,
1824       {
1825         {0, 1024, &MPIR_Scatter_MV2_two_level_Direct},
1826         {1024, -1, &MPIR_Scatter_MV2_Direct},
1827       },
1828       1,
1829       {
1830         { 0, -1, &MPIR_Scatter_MV2_Direct},
1831       },
1832     },
1833
1834     {
1835       128,
1836       4,
1837       {
1838         {0, 16, &MPIR_Scatter_mcst_wrap_MV2},
1839         {0, 16, &MPIR_Scatter_MV2_two_level_Direct},
1840         {16, 2048, &MPIR_Scatter_MV2_two_level_Direct},
1841         {2048, -1, &MPIR_Scatter_MV2_Direct},
1842       },
1843       1,
1844       {
1845         { 0, -1, &MPIR_Scatter_MV2_Direct},
1846       },
1847     },
1848
1849     {
1850       256,
1851       4,
1852       {
1853         {0, 16, &MPIR_Scatter_mcst_wrap_MV2},
1854         {0, 16, &MPIR_Scatter_MV2_two_level_Direct},
1855         {16, 2048, &MPIR_Scatter_MV2_two_level_Direct},
1856         {2048, -1,  &MPIR_Scatter_MV2_Direct},
1857       },
1858       1,
1859       {
1860         { 0, -1, &MPIR_Scatter_MV2_Direct},
1861       },
1862     },
1863
1864     {
1865       512,
1866       4,
1867       {
1868         {0, 16, &MPIR_Scatter_mcst_wrap_MV2},
1869         {16, 16, &MPIR_Scatter_MV2_two_level_Direct},
1870         {16, 4096, &MPIR_Scatter_MV2_two_level_Direct},
1871         {4096, -1, &MPIR_Scatter_MV2_Direct},
1872       },
1873       1,
1874       {
1875         { 0, -1, &MPIR_Scatter_MV2_Binomial},
1876       }, 
1877     },  
1878     {
1879       1024,
1880       5,
1881       {
1882         {0, 16, &MPIR_Scatter_mcst_wrap_MV2},
1883         {0, 16,  &MPIR_Scatter_MV2_Binomial},
1884         {16, 32, &MPIR_Scatter_MV2_Binomial},
1885         {32, 4096, &MPIR_Scatter_MV2_two_level_Direct},
1886         {4096, -1, &MPIR_Scatter_MV2_Direct},
1887       },
1888       1,
1889       {
1890         { 0, -1, &MPIR_Scatter_MV2_Binomial},
1891       },  
1892     },  
1893     {
1894       2048,
1895       7,
1896       {
1897         {0, 16, &MPIR_Scatter_mcst_wrap_MV2},
1898         {0, 16,  &MPIR_Scatter_MV2_two_level_Binomial},
1899         {16, 128, &MPIR_Scatter_MV2_two_level_Binomial},
1900         {128, 1024, &MPIR_Scatter_MV2_two_level_Direct},
1901         {1024, 16384, &MPIR_Scatter_MV2_two_level_Direct},
1902         {16384, 65536, &MPIR_Scatter_MV2_Direct},
1903         {65536, -1, &MPIR_Scatter_MV2_two_level_Direct},
1904       },
1905       6,
1906       {
1907         {0, 16, &MPIR_Scatter_MV2_Binomial},
1908         {16, 128, &MPIR_Scatter_MV2_Binomial},
1909         {128, 1024, &MPIR_Scatter_MV2_Binomial},
1910         {1024, 16384, &MPIR_Scatter_MV2_Direct},
1911         {16384, 65536, &MPIR_Scatter_MV2_Direct},
1912         {65536, -1, &MPIR_Scatter_MV2_Direct},
1913       },
1914     }, 
1915         };
1916         table_ptrs[2] = mv2_tmp_scatter_thresholds_table_16ppn;
1917         agg_table_sum = 0;
1918         for (i = 0; i < mv2_scatter_num_ppn_conf; i++) {
1919     agg_table_sum += mv2_size_scatter_tuning_table[i];
1920         }
1921         mv2_scatter_thresholds_table[0] =
1922     xbt_malloc(agg_table_sum * sizeof (mv2_scatter_tuning_table));
1923         memcpy(mv2_scatter_thresholds_table[0], table_ptrs[0],
1924         (sizeof(mv2_scatter_tuning_table)
1925                      * mv2_size_scatter_tuning_table[0]));
1926         for (i = 1; i < mv2_scatter_num_ppn_conf; i++) {
1927     mv2_scatter_thresholds_table[i] =
1928             mv2_scatter_thresholds_table[i - 1]
1929             + mv2_size_scatter_tuning_table[i - 1];
1930     memcpy(mv2_scatter_thresholds_table[i], table_ptrs[i],
1931                       (sizeof(mv2_scatter_tuning_table)
1932                        * mv2_size_scatter_tuning_table[i]));
1933         }
1934         xbt_free(table_ptrs);
1935    }
1936 }
1937