Logo AND Algorithmique Numérique Distribuée

Public GIT Repository
Simplify, cleanup, remove useless bits, and some files.
[simgrid.git] / src / smpi / colls / smpi_mvapich2_selector_stampede.h
1 /* selector for collective algorithms based on mvapich decision logic, with calibration from Stampede cluster at TACC*/
2 /* This is the tuning used by MVAPICH for Stampede platform based on (MV2_ARCH_INTEL_XEON_E5_2680_16, MV2_HCA_MLX_CX_FDR) */
3
4 /* Copyright (c) 2009-2017. The SimGrid Team. All rights reserved.          */
5
6 /* This program is free software; you can redistribute it and/or modify it
7  * under the terms of the license (GNU LGPL) which comes with this package. */
8
9 /************ Alltoall variables and initializers                        */
10
11 #define MV2_MAX_NB_THRESHOLDS  32
12
13 using namespace simgrid::smpi;
14
15 XBT_PUBLIC(void) smpi_coll_cleanup_mvapich2(void);
16
17 typedef struct {
18   int min;
19   int max;
20   int (*MV2_pt_Alltoall_function) (void *sendbuf, int sendcount, MPI_Datatype sendtype,
21       void *recvbuf, int recvcount, MPI_Datatype recvtype,
22       MPI_Comm comm_ptr );
23 } mv2_alltoall_tuning_element;
24
25 typedef struct {
26   int numproc;
27   int size_table;
28   mv2_alltoall_tuning_element algo_table[MV2_MAX_NB_THRESHOLDS];
29   mv2_alltoall_tuning_element in_place_algo_table[MV2_MAX_NB_THRESHOLDS];
30 } mv2_alltoall_tuning_table;
31
32 int (*MV2_Alltoall_function) (void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype, MPI_Comm comm_ptr)=NULL;
33
34 /* Indicates number of processes per node */
35 int *mv2_alltoall_table_ppn_conf = NULL;
36 /* Indicates total number of configurations */
37 int mv2_alltoall_num_ppn_conf = 1;
38 int *mv2_size_alltoall_tuning_table = NULL;
39 mv2_alltoall_tuning_table **mv2_alltoall_thresholds_table = NULL;
40
41
42 #define MPIR_Alltoall_bruck_MV2 Coll_alltoall_bruck::alltoall
43 #define MPIR_Alltoall_RD_MV2 Coll_alltoall_rdb::alltoall
44 #define MPIR_Alltoall_Scatter_dest_MV2 Coll_alltoall_mvapich2_scatter_dest::alltoall
45 #define MPIR_Alltoall_pairwise_MV2 Coll_alltoall_pair::alltoall
46 #define MPIR_Alltoall_inplace_MV2 Coll_alltoall_ring::alltoall 
47
48
49 static void init_mv2_alltoall_tables_stampede(){
50   int i;
51   int agg_table_sum = 0;
52   mv2_alltoall_tuning_table **table_ptrs = NULL;
53   mv2_alltoall_num_ppn_conf = 3;
54   if(Colls::smpi_coll_cleanup_callback==NULL)
55     Colls::smpi_coll_cleanup_callback=&smpi_coll_cleanup_mvapich2;
56   mv2_alltoall_thresholds_table = static_cast<mv2_alltoall_tuning_table**>(xbt_malloc(sizeof(mv2_alltoall_tuning_table *)
57       * mv2_alltoall_num_ppn_conf));
58   table_ptrs = static_cast<mv2_alltoall_tuning_table**>(xbt_malloc(sizeof(mv2_alltoall_tuning_table *)
59       * mv2_alltoall_num_ppn_conf));
60   mv2_size_alltoall_tuning_table = static_cast<int*>(xbt_malloc(sizeof(int) *
61       mv2_alltoall_num_ppn_conf));
62   mv2_alltoall_table_ppn_conf = static_cast<int*>(xbt_malloc(mv2_alltoall_num_ppn_conf * sizeof(int)));
63   mv2_alltoall_table_ppn_conf[0] = 1;
64   mv2_size_alltoall_tuning_table[0] = 6;
65   mv2_alltoall_tuning_table mv2_tmp_alltoall_thresholds_table_1ppn[] = {
66       {2,
67           1,
68           {{0, -1, &MPIR_Alltoall_pairwise_MV2},
69           },
70
71           {{0, -1, &MPIR_Alltoall_inplace_MV2},
72           },
73       },
74
75       {4,
76           2,
77           {{0, 262144, &MPIR_Alltoall_Scatter_dest_MV2},
78               {262144, -1, &MPIR_Alltoall_pairwise_MV2},
79           },
80
81           {{0, -1, &MPIR_Alltoall_inplace_MV2},
82           },
83       },
84
85       {8,
86           2,
87           {{0, 8, &MPIR_Alltoall_RD_MV2},
88               {8, -1, &MPIR_Alltoall_Scatter_dest_MV2},
89           },
90
91           {{0, -1, &MPIR_Alltoall_inplace_MV2},
92           },
93       },
94
95       {16,
96           3,
97           {{0, 64, &MPIR_Alltoall_RD_MV2},
98               {64, 512, &MPIR_Alltoall_bruck_MV2},
99               {512, -1, &MPIR_Alltoall_Scatter_dest_MV2},
100           },
101
102           {{0,-1, &MPIR_Alltoall_inplace_MV2},
103           },
104       },
105
106       {32,
107           3,
108           {{0, 32, &MPIR_Alltoall_RD_MV2},
109               {32, 2048, &MPIR_Alltoall_bruck_MV2},
110               {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
111           },
112
113           {{0, -1, &MPIR_Alltoall_inplace_MV2},
114           },
115       },
116
117       {64,
118           3,
119           {{0, 8, &MPIR_Alltoall_RD_MV2},
120               {8, 1024, &MPIR_Alltoall_bruck_MV2},
121               {1024, -1, &MPIR_Alltoall_Scatter_dest_MV2},
122           },
123
124           {{0, -1, &MPIR_Alltoall_inplace_MV2},
125           },
126       },
127   };
128   table_ptrs[0] = mv2_tmp_alltoall_thresholds_table_1ppn;
129   mv2_alltoall_table_ppn_conf[1] = 2;
130   mv2_size_alltoall_tuning_table[1] = 6;
131   mv2_alltoall_tuning_table mv2_tmp_alltoall_thresholds_table_2ppn[] = {
132       {4,
133           2,
134           {{0, 32, &MPIR_Alltoall_RD_MV2},
135               {32, -1, &MPIR_Alltoall_Scatter_dest_MV2},
136           },
137
138           {{0, -1, &MPIR_Alltoall_inplace_MV2},
139           },
140       },
141
142       {8,
143           2,
144           {{0, 64, &MPIR_Alltoall_RD_MV2},
145               {64, -1, &MPIR_Alltoall_Scatter_dest_MV2},
146           },
147
148           {{0, -1, &MPIR_Alltoall_inplace_MV2},
149           },
150       },
151
152       {16,
153           3,
154           {{0, 64, &MPIR_Alltoall_RD_MV2},
155               {64, 2048, &MPIR_Alltoall_bruck_MV2},
156               {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
157           },
158
159           {{0,-1, &MPIR_Alltoall_inplace_MV2},
160           },
161       },
162
163       {32,
164           3,
165           {{0, 16, &MPIR_Alltoall_RD_MV2},
166               {16, 2048, &MPIR_Alltoall_bruck_MV2},
167               {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
168           },
169
170           {{0, -1, &MPIR_Alltoall_inplace_MV2},
171           },
172       },
173
174       {64,
175           3,
176           {{0, 8, &MPIR_Alltoall_RD_MV2},
177               {8, 1024, &MPIR_Alltoall_bruck_MV2},
178               {1024, -1, &MPIR_Alltoall_Scatter_dest_MV2},
179           },
180
181           {{0, -1, &MPIR_Alltoall_inplace_MV2},
182           },
183       },
184
185       {128,
186           3,
187           {{0, 4, &MPIR_Alltoall_RD_MV2},
188               {4, 2048, &MPIR_Alltoall_bruck_MV2},
189               {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
190           },
191
192           {{0, -1, &MPIR_Alltoall_inplace_MV2},
193           },
194       },
195   };
196   table_ptrs[1] = mv2_tmp_alltoall_thresholds_table_2ppn;
197   mv2_alltoall_table_ppn_conf[2] = 16;
198   mv2_size_alltoall_tuning_table[2] = 7;
199   mv2_alltoall_tuning_table mv2_tmp_alltoall_thresholds_table_16ppn[] = {
200       {16,
201           2,
202           {{0, 2048, &MPIR_Alltoall_bruck_MV2},
203               {2048, -1,  &MPIR_Alltoall_Scatter_dest_MV2},
204           },
205
206           {{32768, -1, &MPIR_Alltoall_inplace_MV2},
207           },
208       },
209
210       {32,
211           2,
212           {{0, 2048, &MPIR_Alltoall_bruck_MV2},
213               {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
214           },
215
216           {{16384, -1, &MPIR_Alltoall_inplace_MV2},
217           },
218       },
219
220       {64,
221           3,
222           {{0, 2048, &MPIR_Alltoall_bruck_MV2},
223               {2048, 16384, &MPIR_Alltoall_Scatter_dest_MV2},
224               {16384, -1, &MPIR_Alltoall_pairwise_MV2},
225           },
226
227           {{32768, 131072, &MPIR_Alltoall_inplace_MV2},
228           },
229       },
230
231       {128,
232           2,
233           {{0, 2048, &MPIR_Alltoall_bruck_MV2},
234               {2048, -1, &MPIR_Alltoall_pairwise_MV2},
235           },
236
237           {{16384,65536, &MPIR_Alltoall_inplace_MV2},
238           },
239       },
240
241       {256,
242           2,
243           {{0, 1024, &MPIR_Alltoall_bruck_MV2},
244               {1024, -1, &MPIR_Alltoall_pairwise_MV2},
245           },
246
247           {{16384, 65536, &MPIR_Alltoall_inplace_MV2},
248           },
249       },
250
251       {512,
252           2,
253           {{0, 1024, &MPIR_Alltoall_bruck_MV2},
254               {1024, -1, &MPIR_Alltoall_pairwise_MV2},
255           },
256
257           {{16384, 65536, &MPIR_Alltoall_inplace_MV2},
258           },
259       },
260       {1024,
261           2,
262           {{0, 1024, &MPIR_Alltoall_bruck_MV2},
263               {1024, -1, &MPIR_Alltoall_pairwise_MV2},
264           },
265
266           {{16384, 65536, &MPIR_Alltoall_inplace_MV2},
267           },
268       },
269
270   };
271   table_ptrs[2] = mv2_tmp_alltoall_thresholds_table_16ppn;
272   agg_table_sum = 0;
273   for (i = 0; i < mv2_alltoall_num_ppn_conf; i++) {
274       agg_table_sum += mv2_size_alltoall_tuning_table[i];
275   }
276   mv2_alltoall_thresholds_table[0] =
277       static_cast<mv2_alltoall_tuning_table*>(xbt_malloc(agg_table_sum * sizeof (mv2_alltoall_tuning_table)));
278   memcpy(mv2_alltoall_thresholds_table[0], table_ptrs[0],
279       (sizeof(mv2_alltoall_tuning_table)
280           * mv2_size_alltoall_tuning_table[0]));
281   for (i = 1; i < mv2_alltoall_num_ppn_conf; i++) {
282       mv2_alltoall_thresholds_table[i] =
283           mv2_alltoall_thresholds_table[i - 1]
284                                         + mv2_size_alltoall_tuning_table[i - 1];
285       memcpy(mv2_alltoall_thresholds_table[i], table_ptrs[i],
286           (sizeof(mv2_alltoall_tuning_table)
287               * mv2_size_alltoall_tuning_table[i]));
288   }
289   xbt_free(table_ptrs);
290
291
292 }
293
294
295 /************ Allgather variables and initializers                        */
296
297 typedef struct {
298   int min;
299   int max;
300   int (*MV2_pt_Allgatherction)(void *sendbuf,
301       int sendcount,
302       MPI_Datatype sendtype,
303       void *recvbuf,
304       int recvcount,
305       MPI_Datatype recvtype, MPI_Comm comm_ptr);
306 } mv2_allgather_tuning_element;
307
308 typedef struct {
309   int numproc;
310   int two_level[MV2_MAX_NB_THRESHOLDS];
311   int size_inter_table;
312   mv2_allgather_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
313 } mv2_allgather_tuning_table;
314
315 int (*MV2_Allgatherction)(void *sendbuf,
316     int sendcount,
317     MPI_Datatype sendtype,
318     void *recvbuf,
319     int recvcount,
320     MPI_Datatype recvtype, MPI_Comm comm);
321
322 int *mv2_allgather_table_ppn_conf = NULL;
323 int mv2_allgather_num_ppn_conf = 1;
324 int *mv2_size_allgather_tuning_table = NULL;
325 mv2_allgather_tuning_table **mv2_allgather_thresholds_table = NULL;
326
327 static int MPIR_Allgather_RD_Allgather_Comm_MV2( void *sendbuf,
328                                  int sendcount,
329                                  MPI_Datatype sendtype,
330                                  void *recvbuf,
331                                  int recvcount,
332                                  MPI_Datatype recvtype, MPI_Comm comm_ptr)
333 {
334     return 0;
335 }
336
/* MVAPICH2 Allgather algorithm names mapped onto SMPI implementations. */
#define MPIR_Allgather_Bruck_MV2 Coll_allgather_bruck::allgather
#define MPIR_Allgather_RD_MV2    Coll_allgather_rdb::allgather
#define MPIR_Allgather_Ring_MV2  Coll_allgather_ring::allgather
#define MPIR_2lvl_Allgather_MV2  Coll_allgather_mvapich2_smp::allgather
341
342 static void init_mv2_allgather_tables_stampede(){
343   int i;
344   int agg_table_sum = 0;
345
346   if(Colls::smpi_coll_cleanup_callback==NULL)
347     Colls::smpi_coll_cleanup_callback=&smpi_coll_cleanup_mvapich2;
348   mv2_allgather_tuning_table **table_ptrs = NULL;
349   mv2_allgather_num_ppn_conf = 3;
350   mv2_allgather_thresholds_table
351   = static_cast<mv2_allgather_tuning_table**>(xbt_malloc(sizeof(mv2_allgather_tuning_table *)
352       * mv2_allgather_num_ppn_conf));
353   table_ptrs = static_cast<mv2_allgather_tuning_table**>(xbt_malloc(sizeof(mv2_allgather_tuning_table *)
354       * mv2_allgather_num_ppn_conf));
355   mv2_size_allgather_tuning_table = static_cast<int*>(xbt_malloc(sizeof(int) *
356       mv2_allgather_num_ppn_conf));
357   mv2_allgather_table_ppn_conf
358   = static_cast<int*>(xbt_malloc(mv2_allgather_num_ppn_conf * sizeof(int)));
359   mv2_allgather_table_ppn_conf[0] = 1;
360   mv2_size_allgather_tuning_table[0] = 6;
361   mv2_allgather_tuning_table mv2_tmp_allgather_thresholds_table_1ppn[] = {
362       {
363           2,
364           {0},
365           1,
366           {
367               {0, -1, &MPIR_Allgather_Ring_MV2},
368           },
369       },
370       {
371           4,
372           {0,0},
373           2,
374           {
375               {0, 262144, &MPIR_Allgather_RD_MV2},
376               {262144, -1, &MPIR_Allgather_Ring_MV2},
377           },
378       },
379       {
380           8,
381           {0,0},
382           2,
383           {
384               {0, 131072, &MPIR_Allgather_RD_MV2},
385               {131072, -1, &MPIR_Allgather_Ring_MV2},
386           },
387       },
388       {
389           16,
390           {0,0},
391           2,
392           {
393               {0, 131072, &MPIR_Allgather_RD_MV2},
394               {131072, -1, &MPIR_Allgather_Ring_MV2},
395           },
396       },
397       {
398           32,
399           {0,0},
400           2,
401           {
402               {0, 65536, &MPIR_Allgather_RD_MV2},
403               {65536, -1, &MPIR_Allgather_Ring_MV2},
404           },
405       },
406       {
407           64,
408           {0,0},
409           2,
410           {
411               {0, 32768, &MPIR_Allgather_RD_MV2},
412               {32768, -1, &MPIR_Allgather_Ring_MV2},
413           },
414       },
415   };
416   table_ptrs[0] = mv2_tmp_allgather_thresholds_table_1ppn;
417   mv2_allgather_table_ppn_conf[1] = 2;
418   mv2_size_allgather_tuning_table[1] = 6;
419   mv2_allgather_tuning_table mv2_tmp_allgather_thresholds_table_2ppn[] = {
420       {
421           4,
422           {0,0},
423           2,
424           {
425               {0, 524288, &MPIR_Allgather_RD_MV2},
426               {524288, -1, &MPIR_Allgather_Ring_MV2},
427           },
428       },
429       {
430           8,
431           {0,1,0},
432           2,
433           {
434               {0, 32768, &MPIR_Allgather_RD_MV2},
435               {32768, 524288, &MPIR_Allgather_Ring_MV2},
436               {524288, -1, &MPIR_Allgather_Ring_MV2},
437           },
438       },
439       {
440           16,
441           {0,1,0},
442           2,
443           {
444               {0, 16384, &MPIR_Allgather_RD_MV2},
445               {16384, 524288, &MPIR_Allgather_Ring_MV2},
446               {524288, -1, &MPIR_Allgather_Ring_MV2},
447           },
448       },
449       {
450           32,
451           {1,1,0},
452           2,
453           {
454               {0, 65536, &MPIR_Allgather_RD_MV2},
455               {65536, 524288, &MPIR_Allgather_Ring_MV2},
456               {524288, -1, &MPIR_Allgather_Ring_MV2},
457           },
458       },
459       {
460           64,
461           {1,1,0},
462           2,
463           {
464               {0, 32768, &MPIR_Allgather_RD_MV2},
465               {32768, 524288, &MPIR_Allgather_Ring_MV2},
466               {524288, -1, &MPIR_Allgather_Ring_MV2},
467           },
468       },
469       {
470           128,
471           {1,1,0},
472           2,
473           {
474               {0, 65536, &MPIR_Allgather_RD_MV2},
475               {65536, 524288, &MPIR_Allgather_Ring_MV2},
476               {524288, -1, &MPIR_Allgather_Ring_MV2},
477           },
478       },
479   };
480   table_ptrs[1] = mv2_tmp_allgather_thresholds_table_2ppn;
481   mv2_allgather_table_ppn_conf[2] = 16;
482   mv2_size_allgather_tuning_table[2] = 6;
483   mv2_allgather_tuning_table mv2_tmp_allgather_thresholds_table_16ppn[] = {
484       {
485           16,
486           {0,0},
487           2,
488           {
489               {0, 1024, &MPIR_Allgather_RD_MV2},
490               {1024, -1, &MPIR_Allgather_Ring_MV2},
491           },
492       },
493       {
494           32,
495           {0,0},
496           2,
497           {
498               {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
499               {1024, -1, &MPIR_Allgather_Ring_MV2},
500           },
501       },
502       {
503           64,
504           {0,0},
505           2,
506           {
507               {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
508               {1024, -1, &MPIR_Allgather_Ring_MV2},
509           },
510       },
511       {
512           128,
513           {0,0},
514           2,
515           {
516               {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
517               {1024, -1, &MPIR_Allgather_Ring_MV2},
518           },
519       },
520       {
521           256,
522           {0,0},
523           2,
524           {
525               {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
526               {1024, -1, &MPIR_Allgather_Ring_MV2},
527           },
528       },
529       {
530           512,
531           {0,0},
532           2,
533           {
534               {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
535               {1024, -1, &MPIR_Allgather_Ring_MV2},
536           },
537       },
538
539   };
540   table_ptrs[2] = mv2_tmp_allgather_thresholds_table_16ppn;
541   agg_table_sum = 0;
542   for (i = 0; i < mv2_allgather_num_ppn_conf; i++) {
543       agg_table_sum += mv2_size_allgather_tuning_table[i];
544   }
545   mv2_allgather_thresholds_table[0] =
546       static_cast<mv2_allgather_tuning_table*>(xbt_malloc(agg_table_sum * sizeof (mv2_allgather_tuning_table)));
547   memcpy(mv2_allgather_thresholds_table[0], table_ptrs[0],
548       (sizeof(mv2_allgather_tuning_table)
549           * mv2_size_allgather_tuning_table[0]));
550   for (i = 1; i < mv2_allgather_num_ppn_conf; i++) {
551       mv2_allgather_thresholds_table[i] =
552           mv2_allgather_thresholds_table[i - 1]
553                                          + mv2_size_allgather_tuning_table[i - 1];
554       memcpy(mv2_allgather_thresholds_table[i], table_ptrs[i],
555           (sizeof(mv2_allgather_tuning_table)
556               * mv2_size_allgather_tuning_table[i]));
557   }
558   xbt_free(table_ptrs);
559 }
560
561
562 /************ Gather variables and initializers                        */
563
564 typedef struct {
565   int min;
566   int max;
567   int (*MV2_pt_Gather_function)(void *sendbuf, int sendcnt,
568       MPI_Datatype sendtype, void *recvbuf, int recvcnt,
569       MPI_Datatype recvtype, int root, MPI_Comm  comm_ptr);
570 } mv2_gather_tuning_element;
571
572
573 typedef struct {
574   int numproc;
575   int size_inter_table;
576   mv2_gather_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
577   int size_intra_table;
578   mv2_gather_tuning_element intra_node[MV2_MAX_NB_THRESHOLDS];
579 } mv2_gather_tuning_table;
580
581 int mv2_size_gather_tuning_table=7;
582 mv2_gather_tuning_table * mv2_gather_thresholds_table=NULL; 
583
584 typedef int (*MV2_Gather_function_ptr) (void *sendbuf,
585     int sendcnt,
586     MPI_Datatype sendtype,
587     void *recvbuf,
588     int recvcnt,
589     MPI_Datatype recvtype,
590     int root, MPI_Comm comm);
591
592 MV2_Gather_function_ptr MV2_Gather_inter_leader_function = NULL;
593 MV2_Gather_function_ptr MV2_Gather_intra_node_function = NULL;
594
595
596
597 #define MPIR_Gather_MV2_Direct Coll_gather_ompi_basic_linear::gather
598 #define MPIR_Gather_MV2_two_level_Direct Coll_gather_mvapich2_two_level::gather
599 #define MPIR_Gather_intra Coll_gather_mpich::gather
600
601
602 static void init_mv2_gather_tables_stampede(){
603
604   if(Colls::smpi_coll_cleanup_callback==NULL)
605     Colls::smpi_coll_cleanup_callback=&smpi_coll_cleanup_mvapich2;
606   mv2_size_gather_tuning_table=7;
607   mv2_gather_thresholds_table = static_cast<mv2_gather_tuning_table*>(xbt_malloc(mv2_size_gather_tuning_table*
608       sizeof (mv2_gather_tuning_table)));
609   mv2_gather_tuning_table mv2_tmp_gather_thresholds_table[]={
610       {16,
611           2,{{0, 524288, &MPIR_Gather_MV2_Direct},
612               {524288, -1, &MPIR_Gather_intra}},
613               1,{{0, -1, &MPIR_Gather_MV2_Direct}}},
614               {32,
615                   3,{{0, 16384, &MPIR_Gather_MV2_Direct},
616                       {16384, 131072, &MPIR_Gather_intra},
617                       {131072, -1, &MPIR_Gather_MV2_two_level_Direct}},
618                       1,{{0, -1, &MPIR_Gather_intra}}},
619                       {64,
620                           3,{{0, 256, &MPIR_Gather_MV2_two_level_Direct},
621                               {256, 16384, &MPIR_Gather_MV2_Direct},
622                               {256, -1, &MPIR_Gather_MV2_two_level_Direct}},
623                               1,{{0, -1, &MPIR_Gather_intra}}},
624                               {128,
625                                   3,{{0, 512, &MPIR_Gather_MV2_two_level_Direct},
626                                       {512, 16384, &MPIR_Gather_MV2_Direct},
627                                       {16384, -1, &MPIR_Gather_MV2_two_level_Direct}},
628                                       1,{{0, -1, &MPIR_Gather_intra}}},
629                                       {256,
630                                           3,{{0, 512, &MPIR_Gather_MV2_two_level_Direct},
631                                               {512, 16384, &MPIR_Gather_MV2_Direct},
632                                               {16384, -1, &MPIR_Gather_MV2_two_level_Direct}},
633                                               1,{{0, -1, &MPIR_Gather_intra}}},
634                                               {512,
635                                                   3,{{0, 512, &MPIR_Gather_MV2_two_level_Direct},
636                                                       {512, 16384, &MPIR_Gather_MV2_Direct},
637                                                       {8196, -1, &MPIR_Gather_MV2_two_level_Direct}},
638                                                       1,{{0, -1, &MPIR_Gather_intra}}},
639                                                       {1024,
640                                                           3,{{0, 512, &MPIR_Gather_MV2_two_level_Direct},
641                                                               {512, 16384, &MPIR_Gather_MV2_Direct},
642                                                               {8196, -1, &MPIR_Gather_MV2_two_level_Direct}},
643                                                               1,{{0, -1, &MPIR_Gather_intra}}},
644   };
645
646   memcpy(mv2_gather_thresholds_table, mv2_tmp_gather_thresholds_table,
647       mv2_size_gather_tuning_table * sizeof (mv2_gather_tuning_table));
648
649 }
650
651
652 /************ Allgatherv variables and initializers                        */
653
654 typedef struct {
655   int min;
656   int max;
657   int (*MV2_pt_Allgatherv_function)(void *sendbuf,
658       int sendcount,
659       MPI_Datatype sendtype,
660       void *recvbuf,
661       int *recvcounts,
662       int *displs,
663       MPI_Datatype recvtype,
664       MPI_Comm commg);
665 } mv2_allgatherv_tuning_element;
666
667 typedef struct {
668   int numproc;
669   int size_inter_table;
670   mv2_allgatherv_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
671 } mv2_allgatherv_tuning_table;
672
673 int (*MV2_Allgatherv_function)(void *sendbuf,
674     int sendcount,
675     MPI_Datatype sendtype,
676     void *recvbuf,
677     int *recvcounts,
678     int *displs,
679     MPI_Datatype recvtype,
680     MPI_Comm comm);
681
682 int mv2_size_allgatherv_tuning_table = 0;
683 mv2_allgatherv_tuning_table *mv2_allgatherv_thresholds_table = NULL;
684
685 #define MPIR_Allgatherv_Rec_Doubling_MV2 Coll_allgatherv_mpich_rdb::allgatherv
686 #define MPIR_Allgatherv_Bruck_MV2 Coll_allgatherv_ompi_bruck::allgatherv
687 #define MPIR_Allgatherv_Ring_MV2 Coll_allgatherv_mpich_ring::allgatherv
688
689
690 static void init_mv2_allgatherv_tables_stampede(){
691   if(Colls::smpi_coll_cleanup_callback==NULL)
692     Colls::smpi_coll_cleanup_callback=&smpi_coll_cleanup_mvapich2;
693   mv2_size_allgatherv_tuning_table = 6;
694   mv2_allgatherv_thresholds_table = static_cast<mv2_allgatherv_tuning_table*>(xbt_malloc(mv2_size_allgatherv_tuning_table *
695       sizeof (mv2_allgatherv_tuning_table)));
696   mv2_allgatherv_tuning_table mv2_tmp_allgatherv_thresholds_table[] = {
697       {
698           16,
699           2,
700           {
701               {0, 512, &MPIR_Allgatherv_Rec_Doubling_MV2},
702               {512, -1, &MPIR_Allgatherv_Ring_MV2},
703           },
704       },
705       {
706           32,
707           2,
708           {
709               {0, 512, &MPIR_Allgatherv_Rec_Doubling_MV2},
710               {512, -1, &MPIR_Allgatherv_Ring_MV2},
711           },
712       },
713       {
714           64,
715           2,
716           {
717               {0, 256, &MPIR_Allgatherv_Rec_Doubling_MV2},
718               {256, -1, &MPIR_Allgatherv_Ring_MV2},
719           },
720       },
721       {
722           128,
723           2,
724           {
725               {0, 256, &MPIR_Allgatherv_Rec_Doubling_MV2},
726               {256, -1, &MPIR_Allgatherv_Ring_MV2},
727           },
728       },
729       {
730           256,
731           2,
732           {
733               {0, 256, &MPIR_Allgatherv_Rec_Doubling_MV2},
734               {256, -1, &MPIR_Allgatherv_Ring_MV2},
735           },
736       },
737       {
738           512,
739           2,
740           {
741               {0, 256, &MPIR_Allgatherv_Rec_Doubling_MV2},
742               {256, -1, &MPIR_Allgatherv_Ring_MV2},
743           },
744       },
745
746   };
747   memcpy(mv2_allgatherv_thresholds_table, mv2_tmp_allgatherv_thresholds_table,
748       mv2_size_allgatherv_tuning_table * sizeof (mv2_allgatherv_tuning_table));
749 }
750
751
752 /************ Allreduce variables and initializers                        */
753
754 typedef struct {
755   int min;
756   int max;
757   int (*MV2_pt_Allreducection)(void *sendbuf,
758       void *recvbuf,
759       int count,
760       MPI_Datatype datatype,
761       MPI_Op op, MPI_Comm comm);
762 } mv2_allreduce_tuning_element;
763
764 typedef struct {
765   int numproc;
766   int mcast_enabled;
767   int is_two_level_allreduce[MV2_MAX_NB_THRESHOLDS];
768   int size_inter_table;
769   mv2_allreduce_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
770   int size_intra_table;
771   mv2_allreduce_tuning_element intra_node[MV2_MAX_NB_THRESHOLDS];
772 } mv2_allreduce_tuning_table;
773
774
775 int (*MV2_Allreducection)(void *sendbuf,
776     void *recvbuf,
777     int count,
778     MPI_Datatype datatype,
779     MPI_Op op, MPI_Comm comm)=NULL;
780
781
782 int (*MV2_Allreduce_intra_function)( void *sendbuf,
783     void *recvbuf,
784     int count,
785     MPI_Datatype datatype,
786     MPI_Op op, MPI_Comm comm)=NULL;
787
788 int mv2_size_allreduce_tuning_table = 0;
789 mv2_allreduce_tuning_table *mv2_allreduce_thresholds_table = NULL;
790
791
792
793
794
795 static int MPIR_Allreduce_mcst_reduce_two_level_helper_MV2( void *sendbuf,
796     void *recvbuf,
797     int count,
798     MPI_Datatype datatype,
799     MPI_Op op, MPI_Comm comm)
800
801   return 0;
802 }
803
804 static  int MPIR_Allreduce_mcst_reduce_redscat_gather_MV2( void *sendbuf,
805     void *recvbuf,
806     int count,
807     MPI_Datatype datatype,
808     MPI_Op op, MPI_Comm  comm)
809 {
810   return 0;
811 }
812
813 static  int MPIR_Allreduce_reduce_p2p_MV2( void *sendbuf,
814     void *recvbuf,
815     int count,
816     MPI_Datatype datatype,
817     MPI_Op op, MPI_Comm  comm)
818 {
819   Colls::reduce(sendbuf,recvbuf,count,datatype,op,0,comm);
820   return MPI_SUCCESS;
821 }
822
823 static  int MPIR_Allreduce_reduce_shmem_MV2( void *sendbuf,
824     void *recvbuf,
825     int count,
826     MPI_Datatype datatype,
827     MPI_Op op, MPI_Comm  comm)
828 {
829   Colls::reduce(sendbuf,recvbuf,count,datatype,op,0,comm);
830   return MPI_SUCCESS;
831 }
832
/* MVAPICH2 Allreduce algorithm names mapped onto SMPI implementations. */
#define MPIR_Allreduce_pt2pt_rd_MV2  Coll_allreduce_rdb::allreduce
#define MPIR_Allreduce_pt2pt_rs_MV2  Coll_allreduce_mvapich2_rs::allreduce
#define MPIR_Allreduce_two_level_MV2 Coll_allreduce_mvapich2_two_level::allreduce
836
837
838 static void init_mv2_allreduce_tables_stampede(){
839   if(Colls::smpi_coll_cleanup_callback==NULL)
840     Colls::smpi_coll_cleanup_callback=&smpi_coll_cleanup_mvapich2;
841   mv2_size_allreduce_tuning_table = 8;
842   mv2_allreduce_thresholds_table = static_cast<mv2_allreduce_tuning_table*>(xbt_malloc(mv2_size_allreduce_tuning_table *
843       sizeof (mv2_allreduce_tuning_table)));
844   mv2_allreduce_tuning_table mv2_tmp_allreduce_thresholds_table[] = {
845       {
846           16,
847           0,
848           {1, 0},
849           2,
850           {
851               {0, 1024, &MPIR_Allreduce_pt2pt_rd_MV2},
852               {1024, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
853           },
854           2,
855           {
856               {0, 1024, &MPIR_Allreduce_reduce_shmem_MV2},
857               {1024, -1, &MPIR_Allreduce_reduce_p2p_MV2},
858           },
859       },
860       {
861           32,
862           0,
863           {1, 1, 0},
864           3,
865           {
866               {0, 1024, &MPIR_Allreduce_pt2pt_rd_MV2},
867               {1024, 16384, &MPIR_Allreduce_pt2pt_rd_MV2},
868               {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
869           },
870           2,
871           {
872               {0, 1024, &MPIR_Allreduce_reduce_shmem_MV2},
873               {1024, 16384, &MPIR_Allreduce_reduce_p2p_MV2},
874           },
875       },
876       {
877           64,
878           0,
879           {1, 1, 0},
880           3,
881           {
882               {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2},
883               {512, 16384, &MPIR_Allreduce_pt2pt_rd_MV2},
884               {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
885           },
886           2,
887           {
888               {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
889               {512, 16384, &MPIR_Allreduce_reduce_p2p_MV2},
890           },
891       },
892       {
893           128,
894           0,
895           {1, 1, 0},
896           3,
897           {
898               {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2},
899               {512, 16384, &MPIR_Allreduce_pt2pt_rd_MV2},
900               {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
901           },
902           2,
903           {
904               {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
905               {512, 16384, &MPIR_Allreduce_reduce_p2p_MV2},
906           },
907       },
908       {
909           256,
910           0,
911           {1, 1, 0},
912           3,
913           {
914               {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2},
915               {512, 16384, &MPIR_Allreduce_pt2pt_rd_MV2},
916               {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
917           },
918           2,
919           {
920               {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
921               {512, -1, &MPIR_Allreduce_reduce_p2p_MV2},
922           },
923       },
924       {
925           512,
926           0,
927           {1, 1, 0},
928           3,
929           {
930               {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2},
931               {512, 16384, &MPIR_Allreduce_pt2pt_rd_MV2},
932               {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
933           },
934           2,
935           {
936               {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
937               {512, 16384, &MPIR_Allreduce_reduce_p2p_MV2},
938           },
939       },
940       {
941           1024,
942           0,
943           {1, 1, 1, 0},
944           4,
945           {
946               {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2},
947               {512, 8192, &MPIR_Allreduce_pt2pt_rd_MV2},
948               {8192, 65536, &MPIR_Allreduce_pt2pt_rs_MV2},
949               {65536, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
950           },
951           2,
952           {
953               {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
954               {512, -1, &MPIR_Allreduce_reduce_p2p_MV2},
955           },
956       },
957       {
958           2048,
959           0,
960           {1, 1, 1, 0},
961           4,
962           {
963               {0, 64, &MPIR_Allreduce_pt2pt_rd_MV2},
964               {64, 512, &MPIR_Allreduce_reduce_p2p_MV2},
965               {512, 4096, &MPIR_Allreduce_mcst_reduce_two_level_helper_MV2},
966               {4096, 16384, &MPIR_Allreduce_pt2pt_rs_MV2},
967               {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
968           },
969           2,
970           {
971               {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
972               {512, -1, &MPIR_Allreduce_reduce_p2p_MV2},
973           },
974       },
975
976   };
977   memcpy(mv2_allreduce_thresholds_table, mv2_tmp_allreduce_thresholds_table,
978       mv2_size_allreduce_tuning_table * sizeof (mv2_allreduce_tuning_table));
979 }
980
981
982
983
984 typedef struct {
985     int min;
986     int max;
987     int (*MV2_pt_Bcast_function) (void *buf, int count, MPI_Datatype datatype,
988                                   int root, MPI_Comm comm_ptr);
989     int zcpy_pipelined_knomial_factor;
990 } mv2_bcast_tuning_element;
991
992 typedef struct {
993     int numproc;
994     int bcast_segment_size;
995     int intra_node_knomial_factor;
996     int inter_node_knomial_factor;
997     int is_two_level_bcast[MV2_MAX_NB_THRESHOLDS];
998     int size_inter_table;
999     mv2_bcast_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
1000     int size_intra_table;
1001     mv2_bcast_tuning_element intra_node[MV2_MAX_NB_THRESHOLDS];
1002 } mv2_bcast_tuning_table;
1003
1004 int mv2_size_bcast_tuning_table = 0;
1005 mv2_bcast_tuning_table *mv2_bcast_thresholds_table = NULL;
1006
1007
1008 int (*MV2_Bcast_function) (void *buffer, int count, MPI_Datatype datatype,
1009                            int root, MPI_Comm comm_ptr) = NULL;
1010
1011 int (*MV2_Bcast_intra_node_function) (void *buffer, int count, MPI_Datatype datatype,
1012                                       int root, MPI_Comm comm_ptr) = NULL;
1013
/* Default values of the broadcast tuning knobs. */
int zcpy_knomial_factor = 2;
int mv2_pipelined_zcpy_knomial_factor = -1;
int bcast_segment_size = 8192;
int mv2_inter_node_knomial_factor = 4;
int mv2_intra_node_knomial_factor = 4;

/* Broadcast thresholds. */
#define mv2_bcast_two_level_system_size  64
#define mv2_bcast_short_msg             16384
/* Parenthesized so that the macro expands to a single value: without the
 * parentheses, `x / mv2_bcast_large_msg` would expand to `x / 512 * 1024`. */
#define mv2_bcast_large_msg            (512*1024)

#define INTRA_NODE_ROOT 0
1024
/* MVAPICH2 broadcast names used by the tuning tables, mapped onto the
 * broadcast implementations available here. Several names share one target. */

/* Aliases of Coll_bcast_mpich::bcast */
#define MPIR_Pipelined_Bcast_Zcpy_MV2             Coll_bcast_mpich::bcast
#define MPIR_Pipelined_Bcast_MV2                  Coll_bcast_mpich::bcast
#define MPIR_Shmem_Bcast_MV2                      Coll_bcast_mpich::bcast

/* Binomial tree */
#define MPIR_Bcast_binomial_MV2                   Coll_bcast_binomial_tree::bcast

/* Scatter followed by allgather */
#define MPIR_Bcast_scatter_ring_allgather_shm_MV2 Coll_bcast_scatter_LR_allgather::bcast
#define MPIR_Bcast_scatter_ring_allgather_MV2     Coll_bcast_scatter_LR_allgather::bcast
#define MPIR_Bcast_scatter_doubling_allgather_MV2 Coll_bcast_scatter_rdb_allgather::bcast

/* MVAPICH2-specific inter-node / intra-node helpers */
#define MPIR_Bcast_tune_inter_node_helper_MV2     Coll_bcast_mvapich2_inter_node::bcast
#define MPIR_Bcast_inter_node_helper_MV2          Coll_bcast_mvapich2_inter_node::bcast
#define MPIR_Knomial_Bcast_intra_node_MV2         Coll_bcast_mvapich2_knomial_intra_node::bcast
#define MPIR_Bcast_intra_MV2                      Coll_bcast_mvapich2_intra_node::bcast
1036
1037 static void init_mv2_bcast_tables_stampede(){
1038  //Stampede,
1039   if(Colls::smpi_coll_cleanup_callback==NULL)
1040     Colls::smpi_coll_cleanup_callback=&smpi_coll_cleanup_mvapich2;
1041   mv2_size_bcast_tuning_table=8;
1042   mv2_bcast_thresholds_table = static_cast<mv2_bcast_tuning_table*>(xbt_malloc(mv2_size_bcast_tuning_table *
1043   sizeof (mv2_bcast_tuning_table)));
1044
1045   mv2_bcast_tuning_table mv2_tmp_bcast_thresholds_table[]={
1046     {
1047             16,
1048             8192, 4, 4,
1049             {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
1050             11,
1051             {
1052               {0, 8, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1053               {8, 16, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1054               {16, 1024, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1055               {1024, 8192, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1056               {8192, 16384, &MPIR_Bcast_binomial_MV2, -1},
1057               {16384, 32768, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1058               {32768, 65536, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1059               {65536, 131072, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1},
1060               {131072, 262144, &MPIR_Bcast_scatter_ring_allgather_MV2, -1},
1061               {262144, 524288, &MPIR_Bcast_scatter_doubling_allgather_MV2, -1},
1062               {524288, -1, &MPIR_Bcast_scatter_ring_allgather_MV2, -1}
1063             },
1064             11,
1065             {
1066               {0, 8, &MPIR_Shmem_Bcast_MV2, 2},
1067               {8, 16, &MPIR_Shmem_Bcast_MV2, 4},
1068               {16, 1024, &MPIR_Shmem_Bcast_MV2, 2},
1069               {1024, 8192, &MPIR_Shmem_Bcast_MV2, 4},
1070               {8192, 16384, &MPIR_Shmem_Bcast_MV2, -1},
1071               {16384, 32768, &MPIR_Shmem_Bcast_MV2, 4},
1072               {32768, 65536, &MPIR_Shmem_Bcast_MV2, 2},
1073               {65536, 131072, &MPIR_Shmem_Bcast_MV2, -1},
1074               {131072, 262144, &MPIR_Shmem_Bcast_MV2, -1},
1075               {262144, 524288, &MPIR_Shmem_Bcast_MV2, -1},
1076               {524288, -1, &MPIR_Shmem_Bcast_MV2, -1}
1077             }
1078     },
1079     {
1080             32,
1081             8192, 4, 4,
1082             {1, 1, 1, 1, 1, 1, 1, 1},
1083             8,
1084             {
1085               {0, 128, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1086               {128, 256, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1087               {256, 32768, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1088               {32768, 65536, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1089               {65536, 131072, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1090               {131072, 262144, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1091               {262144, 524288, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1092               {524288, -1, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8}
1093             },
1094             8,
1095             {
1096               {0, 128, &MPIR_Shmem_Bcast_MV2, 2},
1097               {128, 256, &MPIR_Shmem_Bcast_MV2, 4},
1098               {256, 32768, &MPIR_Shmem_Bcast_MV2, 2},
1099               {32768, 65536, &MPIR_Shmem_Bcast_MV2, 4},
1100               {65536, 131072, &MPIR_Shmem_Bcast_MV2, 2},
1101               {131072, 262144, &MPIR_Shmem_Bcast_MV2, 8},
1102               {262144, 524288, &MPIR_Shmem_Bcast_MV2, 2},
1103               {524288, -1, &MPIR_Shmem_Bcast_MV2, 8}
1104             }
1105     },
1106     {
1107             64,
1108             8192, 4, 4,
1109             {1, 1, 1, 1, 1, 1, 1, 1, 1},
1110             9,
1111             {
1112               {0, 2, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1113               {2, 4, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1114               {4, 16, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1115               {16, 32, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1116               {32, 128, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1117               {128, 256, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1118               {256, 4096, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1119               {4096, 32768, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1120               {32768, -1, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2}
1121             },
1122             9,
1123             {
1124               {0, 2, &MPIR_Shmem_Bcast_MV2, 4},
1125               {2, 4, &MPIR_Shmem_Bcast_MV2, 8},
1126               {4, 16, &MPIR_Shmem_Bcast_MV2, 4},
1127               {16, 32, &MPIR_Shmem_Bcast_MV2, 8},
1128               {32, 128, &MPIR_Shmem_Bcast_MV2, 4},
1129               {128, 256, &MPIR_Shmem_Bcast_MV2, 8},
1130               {256, 4096, &MPIR_Shmem_Bcast_MV2, 4},
1131               {4096, 32768, &MPIR_Shmem_Bcast_MV2, 8},
1132               {32768, -1, &MPIR_Shmem_Bcast_MV2, 2}
1133             }
1134     },
1135     {
1136             128,
1137             8192, 4, 4,
1138             {1, 1, 1, 0},
1139             4,
1140             {
1141               {0, 8192, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1142               {8192, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1143               {16384, 524288, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1144               {524288, -1, &MPIR_Bcast_scatter_ring_allgather_MV2, -1}
1145             },
1146             4,
1147             {
1148               {0, 8192, &MPIR_Shmem_Bcast_MV2, 8},
1149               {8192, 16384, &MPIR_Shmem_Bcast_MV2, 4},
1150               {16384, 524288, &MPIR_Shmem_Bcast_MV2, 2},
1151               {524288, -1, NULL, -1}
1152             }
1153     },
1154     {
1155             256,
1156             8192, 4, 4,
1157             {1, 1, 1, 1, 1},
1158             5,
1159             {
1160               {0, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1161               {16384, 131072, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1162               {131072, 262144, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1},
1163               {262144, 524288, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1164               {524288, -1, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1}
1165             },
1166             5,
1167             {
1168               {0, 16384, &MPIR_Shmem_Bcast_MV2, 4},
1169               {16384, 131072, &MPIR_Shmem_Bcast_MV2, 2},
1170               {131072, 262144, &MPIR_Shmem_Bcast_MV2, -1},
1171               {262144, 524288, &MPIR_Shmem_Bcast_MV2, 2},
1172               {524288, -1, &MPIR_Shmem_Bcast_MV2, -1}
1173             }
1174     },
1175     {
1176             512,
1177             8192, 4, 4,
1178             {1, 1, 1, 1, 1},
1179             5,
1180             {
1181               {0, 4096, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1182               {4096, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1183               {16384, 131072, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1184               {131072, 262144, &MPIR_Pipelined_Bcast_MV2, -1},
1185               {262144, -1, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1}
1186             },
1187             5,
1188             {
1189               {0, 4096, &MPIR_Shmem_Bcast_MV2, 8},
1190               {4096, 16384, &MPIR_Shmem_Bcast_MV2, 4},
1191               {16384, 131072, &MPIR_Shmem_Bcast_MV2, 2},
1192               {131072, 262144, &MPIR_Shmem_Bcast_MV2, -1},
1193               {262144, -1, &MPIR_Shmem_Bcast_MV2, -1}
1194             }
1195     },
1196     {
1197             1024,
1198             8192, 4, 4,
1199             {1, 1, 1, 1, 1},
1200             5,
1201             {
1202               {0, 8192, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1203               {8192, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1204               {16384, 65536, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1205               {65536, 524288, &MPIR_Pipelined_Bcast_MV2, -1},
1206               {524288, -1, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1}
1207             },
1208             5,
1209             {
1210               {0, 8192, &MPIR_Shmem_Bcast_MV2, 8},
1211               {8192, 16384, &MPIR_Shmem_Bcast_MV2, 4},
1212               {16384, 65536, &MPIR_Shmem_Bcast_MV2, 2},
1213               {65536, 524288, &MPIR_Shmem_Bcast_MV2, -1},
1214               {524288, -1, &MPIR_Shmem_Bcast_MV2, -1}
1215             }
1216     },
1217     {
1218             2048,
1219             8192, 4, 4,
1220             {1, 1, 1, 1, 1, 1, 1},
1221             7,
1222             {
1223               {0, 16, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1224               {16, 32, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1225               {32, 4096, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1226               {4096, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1227               {16384, 32768, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1228               {32768, 524288, &MPIR_Pipelined_Bcast_MV2, -1},
1229               {524288, -1, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1}
1230             },
1231             7,
1232             {
1233               {0, 16, &MPIR_Shmem_Bcast_MV2, 8},
1234               {16, 32, &MPIR_Shmem_Bcast_MV2, 4},
1235               {32, 4096, &MPIR_Shmem_Bcast_MV2, 8},
1236               {4096, 16384, &MPIR_Shmem_Bcast_MV2, 4},
1237               {16384, 32768, &MPIR_Shmem_Bcast_MV2, 2},
1238               {32768, 524288, &MPIR_Shmem_Bcast_MV2, -1},
1239               {524288, -1, &MPIR_Shmem_Bcast_MV2, -1}
1240             }
1241     }
1242   };
1243
1244         memcpy(mv2_bcast_thresholds_table, mv2_tmp_bcast_thresholds_table,
1245                     mv2_size_bcast_tuning_table * sizeof (mv2_bcast_tuning_table));
1246 }
1247
1248
1249 /************ Reduce variables and initializers                        */
1250
1251 typedef struct {
1252   int min;
1253   int max;
1254   int (*MV2_pt_Reduce_function)(void *sendbuf,
1255       void *recvbuf,
1256       int count,
1257       MPI_Datatype datatype,
1258       MPI_Op op,
1259       int root,
1260       MPI_Comm  comm_ptr);
1261 } mv2_reduce_tuning_element;
1262
1263 typedef struct {
1264   int numproc;
1265   int inter_k_degree;
1266   int intra_k_degree;
1267   int is_two_level_reduce[MV2_MAX_NB_THRESHOLDS];
1268   int size_inter_table;
1269   mv2_reduce_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
1270   int size_intra_table;
1271   mv2_reduce_tuning_element intra_node[MV2_MAX_NB_THRESHOLDS];
1272 } mv2_reduce_tuning_table;
1273
1274 int mv2_size_reduce_tuning_table = 0;
1275 mv2_reduce_tuning_table *mv2_reduce_thresholds_table = NULL;
1276
1277
1278 int mv2_reduce_intra_knomial_factor = -1;
1279 int mv2_reduce_inter_knomial_factor = -1;
1280
1281 int (*MV2_Reduce_function)( void *sendbuf,
1282     void *recvbuf,
1283     int count,
1284     MPI_Datatype datatype,
1285     MPI_Op op,
1286     int root,
1287     MPI_Comm  comm_ptr)=NULL;
1288
1289 int (*MV2_Reduce_intra_function)( void *sendbuf,
1290     void *recvbuf,
1291     int count,
1292     MPI_Datatype datatype,
1293     MPI_Op op,
1294     int root,
1295     MPI_Comm  comm_ptr)=NULL;
1296
1297
/* MVAPICH2 reduce names used by the tuning tables, mapped onto the reduce
 * implementations available here. */

/* Both k-nomial wrappers resolve to the same implementation. */
#define MPIR_Reduce_inter_knomial_wrapper_MV2 Coll_reduce_mvapich2_knomial::reduce
#define MPIR_Reduce_intra_knomial_wrapper_MV2 Coll_reduce_mvapich2_knomial::reduce

#define MPIR_Reduce_two_level_helper_MV2      Coll_reduce_mvapich2_two_level::reduce
#define MPIR_Reduce_binomial_MV2              Coll_reduce_binomial::reduce
#define MPIR_Reduce_redscat_gather_MV2        Coll_reduce_scatter_gather::reduce
#define MPIR_Reduce_shmem_MV2                 Coll_reduce_ompi_basic_linear::reduce
1304
1305
1306 static void init_mv2_reduce_tables_stampede(){
1307   if(Colls::smpi_coll_cleanup_callback==NULL)
1308     Colls::smpi_coll_cleanup_callback=&smpi_coll_cleanup_mvapich2;
1309   /*Stampede*/
1310   mv2_size_reduce_tuning_table = 8;
1311   mv2_reduce_thresholds_table = static_cast<mv2_reduce_tuning_table*>(xbt_malloc(mv2_size_reduce_tuning_table *
1312       sizeof (mv2_reduce_tuning_table)));
1313   mv2_reduce_tuning_table mv2_tmp_reduce_thresholds_table[] = {
1314       {
1315           16,
1316           4,
1317           4,
1318           {1, 0, 0},
1319           3,
1320           {
1321               {0, 262144, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1322               {262144, 1048576, &MPIR_Reduce_binomial_MV2},
1323               {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
1324           },
1325           2,
1326           {
1327               {0, 65536, &MPIR_Reduce_shmem_MV2},
1328               {65536,-1,  &MPIR_Reduce_binomial_MV2},
1329           },
1330       },
1331       {
1332           32,
1333           4,
1334           4,
1335           {1, 1, 1, 1, 0, 0, 0},
1336           7,
1337           {
1338               {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1339               {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1340               {16384, 32768, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1341               {32768, 65536, &MPIR_Reduce_binomial_MV2},
1342               {65536, 262144, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1343               {262144, 1048576, &MPIR_Reduce_binomial_MV2},
1344               {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
1345           },
1346           6,
1347           {
1348               {0, 8192, &MPIR_Reduce_shmem_MV2},
1349               {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1350               {16384, 32768, &MPIR_Reduce_shmem_MV2},
1351               {32768, 65536, &MPIR_Reduce_shmem_MV2},
1352               {65536, 262144, &MPIR_Reduce_shmem_MV2},
1353               {262144,-1,  &MPIR_Reduce_binomial_MV2},
1354           },
1355       },
1356       {
1357           64,
1358           4,
1359           4,
1360           {1, 1, 1, 1, 0},
1361           5,
1362           {
1363               {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1364               {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1365               {16384, 65536, &MPIR_Reduce_binomial_MV2},
1366               {65536, 262144, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1367               {262144, -1, &MPIR_Reduce_redscat_gather_MV2},
1368           },
1369           5,
1370           {
1371               {0, 8192, &MPIR_Reduce_shmem_MV2},
1372               {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1373               {16384, 65536, &MPIR_Reduce_shmem_MV2},
1374               {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1375               {262144, -1, &MPIR_Reduce_binomial_MV2},
1376           },
1377       },
1378       {
1379           128,
1380           4,
1381           4,
1382           {1, 0, 1, 0, 1, 0},
1383           6,
1384           {
1385               {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1386               {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1387               {16384, 65536, &MPIR_Reduce_binomial_MV2},
1388               {65536, 262144, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1389               {262144, 1048576, &MPIR_Reduce_binomial_MV2},
1390               {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
1391           },
1392           5,
1393           {
1394               {0, 8192, &MPIR_Reduce_shmem_MV2},
1395               {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1396               {16384, 65536, &MPIR_Reduce_shmem_MV2},
1397               {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1398               {262144, -1, &MPIR_Reduce_binomial_MV2},
1399           },
1400       },
1401       {
1402           256,
1403           4,
1404           4,
1405           {1, 1, 1, 0, 1, 1, 0},
1406           7,
1407           {
1408               {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1409               {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1410               {16384, 32768, &MPIR_Reduce_binomial_MV2},
1411               {32768, 65536, &MPIR_Reduce_binomial_MV2},
1412               {65536, 262144, &MPIR_Reduce_binomial_MV2},
1413               {262144, 1048576, &MPIR_Reduce_binomial_MV2},
1414               {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
1415           },
1416           6,
1417           {
1418               {0, 8192, &MPIR_Reduce_shmem_MV2},
1419               {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1420               {16384, 32768, &MPIR_Reduce_shmem_MV2},
1421               {32768, 65536, &MPIR_Reduce_shmem_MV2},
1422               {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1423               {262144, -1, &MPIR_Reduce_binomial_MV2},
1424           },
1425       },
1426       {
1427           512,
1428           4,
1429           4,
1430           {1, 0, 1, 1, 1, 0},
1431           6,
1432           {
1433               {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1434               {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1435               {16384, 65536, &MPIR_Reduce_binomial_MV2},
1436               {65536, 262144, &MPIR_Reduce_binomial_MV2},
1437               {262144, 1048576, &MPIR_Reduce_binomial_MV2},
1438               {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
1439           },
1440           5,
1441           {
1442               {0, 8192, &MPIR_Reduce_shmem_MV2},
1443               {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1444               {16384, 65536, &MPIR_Reduce_shmem_MV2},
1445               {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1446               {262144, -1, &MPIR_Reduce_binomial_MV2},
1447           },
1448       },
1449       {
1450           1024,
1451           4,
1452           4,
1453           {1, 0, 1, 1, 1},
1454           5,
1455           {
1456               {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1457               {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1458               {16384, 65536, &MPIR_Reduce_binomial_MV2},
1459               {65536, 262144, &MPIR_Reduce_binomial_MV2},
1460               {262144, -1, &MPIR_Reduce_binomial_MV2},
1461           },
1462           5,
1463           {
1464               {0, 8192, &MPIR_Reduce_shmem_MV2},
1465               {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1466               {16384, 65536, &MPIR_Reduce_shmem_MV2},
1467               {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1468               {262144, -1, &MPIR_Reduce_binomial_MV2},
1469           },
1470       },
1471       {
1472           2048,
1473           4,
1474           4,
1475           {1, 0, 1, 1, 1,1},
1476           6,
1477           {
1478               {0, 2048, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1479               {2048, 4096, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1480               {4096, 16384, &MPIR_Reduce_binomial_MV2},
1481               {16384, 65536, &MPIR_Reduce_binomial_MV2},
1482               {65536, 131072, &MPIR_Reduce_binomial_MV2},
1483               {131072, -1, &MPIR_Reduce_binomial_MV2},
1484           },
1485           6,
1486           {
1487               {0, 2048, &MPIR_Reduce_shmem_MV2},
1488               {2048, 4096, &MPIR_Reduce_shmem_MV2},
1489               {4096, 16384, &MPIR_Reduce_shmem_MV2},
1490               {16384, 65536, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1491               {65536, 131072, &MPIR_Reduce_binomial_MV2},
1492               {131072, -1, &MPIR_Reduce_shmem_MV2},
1493           },
1494       },
1495
1496   };
1497   memcpy(mv2_reduce_thresholds_table, mv2_tmp_reduce_thresholds_table,
1498       mv2_size_reduce_tuning_table * sizeof (mv2_reduce_tuning_table));
1499 }
1500
1501 /************ Reduce scatter variables and initializers                        */
1502
1503 typedef struct {
1504   int min;
1505   int max;
1506   int (*MV2_pt_Red_scat_function)(void *sendbuf,
1507       void *recvbuf,
1508       int *recvcnts,
1509       MPI_Datatype datatype,
1510       MPI_Op op,
1511       MPI_Comm comm_ptr);
1512 } mv2_red_scat_tuning_element;
1513
1514 typedef struct {
1515   int numproc;
1516   int size_inter_table;
1517   mv2_red_scat_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
1518 } mv2_red_scat_tuning_table;
1519
1520 int mv2_size_red_scat_tuning_table = 0;
1521 mv2_red_scat_tuning_table *mv2_red_scat_thresholds_table = NULL;
1522
1523
1524 int (*MV2_Red_scat_function)(void *sendbuf,
1525     void *recvbuf,
1526     int *recvcnts,
1527     MPI_Datatype datatype,
1528     MPI_Op op,
1529     MPI_Comm comm_ptr);
1530
1531
1532
1533 static  int MPIR_Reduce_Scatter_Basic_MV2(void *sendbuf,
1534     void *recvbuf,
1535     int *recvcnts,
1536     MPI_Datatype datatype,
1537     MPI_Op op,
1538     MPI_Comm comm)
1539 {
1540   Coll_reduce_scatter_default::reduce_scatter(sendbuf,recvbuf,recvcnts,datatype,op,comm);
1541   return MPI_SUCCESS;
1542 }
/* MVAPICH2 reduce-scatter names used by the tuning table, mapped onto the
 * reduce-scatter implementations available here. */
#define MPIR_Reduce_scatter_Pair_Wise_MV2   Coll_reduce_scatter_mpich_pair::reduce_scatter
#define MPIR_Reduce_scatter_Rec_Halving_MV2 Coll_reduce_scatter_ompi_basic_recursivehalving::reduce_scatter
#define MPIR_Reduce_scatter_non_comm_MV2    Coll_reduce_scatter_mpich_noncomm::reduce_scatter
1546
1547
1548
1549
1550 static void init_mv2_reduce_scatter_tables_stampede(){
1551   if(Colls::smpi_coll_cleanup_callback==NULL)
1552     Colls::smpi_coll_cleanup_callback=&smpi_coll_cleanup_mvapich2;
1553   mv2_size_red_scat_tuning_table = 6;
1554   mv2_red_scat_thresholds_table = static_cast<mv2_red_scat_tuning_table*>(xbt_malloc(mv2_size_red_scat_tuning_table *
1555       sizeof (mv2_red_scat_tuning_table)));
1556   mv2_red_scat_tuning_table mv2_tmp_red_scat_thresholds_table[] = {
1557       {
1558           16,
1559           3,
1560           {
1561               {0, 64, &MPIR_Reduce_Scatter_Basic_MV2},
1562               {64, 65536, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1563               {65536, -1, &MPIR_Reduce_scatter_Pair_Wise_MV2},
1564           },
1565       },
1566       {
1567           32,
1568           3,
1569           {
1570               {0, 64, &MPIR_Reduce_Scatter_Basic_MV2},
1571               {64, 131072, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1572               {131072, -1, &MPIR_Reduce_scatter_Pair_Wise_MV2},
1573           },
1574       },
1575       {
1576           64,
1577           3,
1578           {
1579               {0, 1024, &MPIR_Reduce_Scatter_Basic_MV2},
1580               {1024, 262144, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1581               {262144, -1, &MPIR_Reduce_scatter_Pair_Wise_MV2},
1582           },
1583       },
1584       {
1585           128,
1586           2,
1587           {
1588               {0, 128, &MPIR_Reduce_Scatter_Basic_MV2},
1589               {128, -1, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1590           },
1591       },
1592       {
1593           256,
1594           2,
1595           {
1596               {0, 128, &MPIR_Reduce_Scatter_Basic_MV2},
1597               {128, -1, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1598           },
1599       },
1600       {
1601           512,
1602           2,
1603           {
1604               {0, 256, &MPIR_Reduce_Scatter_Basic_MV2},
1605               {256, -1, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1606           },
1607       },
1608
1609   };
1610   memcpy(mv2_red_scat_thresholds_table, mv2_tmp_red_scat_thresholds_table,
1611       mv2_size_red_scat_tuning_table * sizeof (mv2_red_scat_tuning_table));
1612 }
1613
1614 /************ Scatter variables and initializers                        */
1615
1616 typedef struct {
1617   int min;
1618   int max;
1619   int (*MV2_pt_Scatter_function)(void *sendbuf,
1620       int sendcnt,
1621       MPI_Datatype sendtype,
1622       void *recvbuf,
1623       int recvcnt,
1624       MPI_Datatype recvtype,
1625       int root, MPI_Comm comm);
1626 } mv2_scatter_tuning_element;
1627
1628 typedef struct {
1629   int numproc;
1630   int size_inter_table;
1631   mv2_scatter_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
1632   int size_intra_table;
1633   mv2_scatter_tuning_element intra_node[MV2_MAX_NB_THRESHOLDS];
1634 } mv2_scatter_tuning_table;
1635
1636
1637 int *mv2_scatter_table_ppn_conf = NULL;
1638 int mv2_scatter_num_ppn_conf = 1;
1639 int *mv2_size_scatter_tuning_table = NULL;
1640 mv2_scatter_tuning_table **mv2_scatter_thresholds_table = NULL;
1641
1642 int (*MV2_Scatter_function) (void *sendbuf, int sendcount, MPI_Datatype sendtype,
1643     void *recvbuf, int recvcount, MPI_Datatype recvtype,
1644     int root, MPI_Comm comm)=NULL;
1645
1646 int (*MV2_Scatter_intra_function) (void *sendbuf, int sendcount, MPI_Datatype sendtype,
1647     void *recvbuf, int recvcount, MPI_Datatype recvtype,
1648     int root, MPI_Comm comm)=NULL;
1649 int MPIR_Scatter_mcst_wrap_MV2(void *sendbuf,
1650     int sendcnt,
1651     MPI_Datatype sendtype,
1652     void *recvbuf,
1653     int recvcnt,
1654     MPI_Datatype recvtype,
1655     int root, MPI_Comm comm_ptr);
1656
1657 int MPIR_Scatter_mcst_wrap_MV2(void *sendbuf,
1658     int sendcnt,
1659     MPI_Datatype sendtype,
1660     void *recvbuf,
1661     int recvcnt,
1662     MPI_Datatype recvtype,
1663     int root, MPI_Comm comm_ptr)
1664 {
1665   return 0;
1666 }
1667
/* MVAPICH2 scatter names used by the tuning tables, mapped onto the scatter
 * implementations available here. */

/* Flat algorithms */
#define MPIR_Scatter_MV2_Direct             Coll_scatter_ompi_basic_linear::scatter
#define MPIR_Scatter_MV2_Binomial           Coll_scatter_ompi_binomial::scatter

/* Two-level algorithms */
#define MPIR_Scatter_MV2_two_level_Direct   Coll_scatter_mvapich2_two_level_direct::scatter
#define MPIR_Scatter_MV2_two_level_Binomial Coll_scatter_mvapich2_two_level_binomial::scatter
1672
1673
1674
1675
1676 static void init_mv2_scatter_tables_stampede(){
1677     if(Colls::smpi_coll_cleanup_callback==NULL)
1678       Colls::smpi_coll_cleanup_callback=&smpi_coll_cleanup_mvapich2;
1679
1680     int agg_table_sum = 0;
1681     int i;
1682     mv2_scatter_tuning_table **table_ptrs = NULL;
1683     mv2_scatter_num_ppn_conf = 3;
1684     mv2_scatter_thresholds_table
1685     = static_cast<mv2_scatter_tuning_table**>(xbt_malloc(sizeof(mv2_scatter_tuning_table *)
1686         * mv2_scatter_num_ppn_conf));
1687     table_ptrs = static_cast<mv2_scatter_tuning_table**>(xbt_malloc(sizeof(mv2_scatter_tuning_table *)
1688         * mv2_scatter_num_ppn_conf));
1689     mv2_size_scatter_tuning_table = static_cast<int*>(xbt_malloc(sizeof(int) *
1690         mv2_scatter_num_ppn_conf));
1691     mv2_scatter_table_ppn_conf
1692     = static_cast<int*>(xbt_malloc(mv2_scatter_num_ppn_conf * sizeof(int)));
1693     mv2_scatter_table_ppn_conf[0] = 1;
1694     mv2_size_scatter_tuning_table[0] = 6;
1695     mv2_scatter_tuning_table mv2_tmp_scatter_thresholds_table_1ppn[] = {
1696         {2,
1697             1,
1698             {
1699                 {0, -1, &MPIR_Scatter_MV2_Binomial},
1700             },
1701             1,
1702             {
1703                 {0, -1, &MPIR_Scatter_MV2_Binomial},
1704             },
1705         },
1706
1707         {4,
1708             1,
1709             {
1710                 {0, -1, &MPIR_Scatter_MV2_Direct},
1711             },
1712             1,
1713             {
1714                 {0, -1, &MPIR_Scatter_MV2_Direct},
1715             },
1716         },
1717
1718         {8,
1719             1,
1720             {
1721                 {0, -1, &MPIR_Scatter_MV2_Direct},
1722             },
1723             1,
1724             {
1725                 {0, -1, &MPIR_Scatter_MV2_Direct},
1726             },
1727         },
1728
1729         {16,
1730             1,
1731             {
1732                 {0, -1, &MPIR_Scatter_MV2_Direct},
1733             },
1734             1,
1735             {
1736                 {0, -1, &MPIR_Scatter_MV2_Direct},
1737             },
1738         },
1739
1740         {32,
1741             1,
1742             {
1743                 {0, -1, &MPIR_Scatter_MV2_Direct},
1744             },
1745             1,
1746             {
1747                 {0, -1, &MPIR_Scatter_MV2_Direct},
1748             },
1749         },
1750
1751         {64,
1752             2,
1753             {
1754                 {0, 32, &MPIR_Scatter_MV2_Binomial},
1755                 {32, -1, &MPIR_Scatter_MV2_Direct},
1756             },
1757             1,
1758             {
1759                 {0, -1, &MPIR_Scatter_MV2_Binomial},
1760             },
1761         },
1762     };
1763     table_ptrs[0] = mv2_tmp_scatter_thresholds_table_1ppn;
1764     mv2_scatter_table_ppn_conf[1] = 2;
1765     mv2_size_scatter_tuning_table[1] = 6;
1766     mv2_scatter_tuning_table mv2_tmp_scatter_thresholds_table_2ppn[] = {
1767         {4,
1768             2,
1769             {
1770                 {0, 4096, &MPIR_Scatter_MV2_Binomial},
1771                 {4096, -1, &MPIR_Scatter_MV2_Direct},
1772             },
1773             1,
1774             {
1775                 {0, -1, &MPIR_Scatter_MV2_Direct},
1776             },
1777         },
1778
1779         {8,
1780             2,
1781             {
1782                 {0, 512, &MPIR_Scatter_MV2_two_level_Direct},
1783                 {512, -1, &MPIR_Scatter_MV2_Direct},
1784             },
1785             1,
1786             {
1787                 {0, -1, &MPIR_Scatter_MV2_Binomial},
1788             },
1789         },
1790
1791         {16,
1792             2,
1793             {
1794                 {0, 2048, &MPIR_Scatter_MV2_two_level_Direct},
1795                 {2048, -1, &MPIR_Scatter_MV2_Direct},
1796             },
1797             1,
1798             {
1799                 {0, -1, &MPIR_Scatter_MV2_Binomial},
1800             },
1801         },
1802
1803         {32,
1804             2,
1805             {
1806                 {0, 2048, &MPIR_Scatter_MV2_two_level_Direct},
1807                 {2048, -1, &MPIR_Scatter_MV2_Direct},
1808             },
1809             1,
1810             {
1811                 {0, -1, &MPIR_Scatter_MV2_Binomial},
1812             },
1813         },
1814
1815         {64,
1816             2,
1817             {
1818                 {0, 8192, &MPIR_Scatter_MV2_two_level_Direct},
1819                 {8192, -1, &MPIR_Scatter_MV2_Direct},
1820             },
1821             1,
1822             {
1823                 {0, -1, &MPIR_Scatter_MV2_Binomial},
1824             },
1825         },
1826
1827         {128,
1828             4,
1829             {
1830                 {0, 16, &MPIR_Scatter_MV2_Binomial},
1831                 {16, 128, &MPIR_Scatter_MV2_two_level_Binomial},
1832                 {128, 16384, &MPIR_Scatter_MV2_two_level_Direct},
1833                 {16384, -1, &MPIR_Scatter_MV2_Direct},
1834             },
1835             1,
1836             {
1837                 {0, 128, &MPIR_Scatter_MV2_Direct},
1838                 {128, -1, &MPIR_Scatter_MV2_Binomial},
1839             },
1840         },
1841     };
1842     table_ptrs[1] = mv2_tmp_scatter_thresholds_table_2ppn;
1843     mv2_scatter_table_ppn_conf[2] = 16;
1844     mv2_size_scatter_tuning_table[2] = 8;
1845     mv2_scatter_tuning_table mv2_tmp_scatter_thresholds_table_16ppn[] = {
1846         {
1847             16,
1848             2,
1849             {
1850                 {0, 256, &MPIR_Scatter_MV2_Binomial},
1851                 {256, -1, &MPIR_Scatter_MV2_Direct},
1852             },
1853             1,
1854             {
1855                 { 0, -1, &MPIR_Scatter_MV2_Direct},
1856             },
1857         },
1858
1859         {
1860             32,
1861             2,
1862             {
1863                 {0, 512, &MPIR_Scatter_MV2_Binomial},
1864                 {512, -1, &MPIR_Scatter_MV2_Direct},
1865             },
1866             1,
1867             {
1868                 { 0, -1, &MPIR_Scatter_MV2_Direct},
1869             },
1870         },
1871
1872         {
1873             64,
1874             2,
1875             {
1876                 {0, 1024, &MPIR_Scatter_MV2_two_level_Direct},
1877                 {1024, -1, &MPIR_Scatter_MV2_Direct},
1878             },
1879             1,
1880             {
1881                 { 0, -1, &MPIR_Scatter_MV2_Direct},
1882             },
1883         },
1884
1885         {
1886             128,
1887             4,
1888             {
1889                 {0, 16, &MPIR_Scatter_mcst_wrap_MV2},
1890                 {0, 16, &MPIR_Scatter_MV2_two_level_Direct},
1891                 {16, 2048, &MPIR_Scatter_MV2_two_level_Direct},
1892                 {2048, -1, &MPIR_Scatter_MV2_Direct},
1893             },
1894             1,
1895             {
1896                 { 0, -1, &MPIR_Scatter_MV2_Direct},
1897             },
1898         },
1899
1900         {
1901             256,
1902             4,
1903             {
1904                 {0, 16, &MPIR_Scatter_mcst_wrap_MV2},
1905                 {0, 16, &MPIR_Scatter_MV2_two_level_Direct},
1906                 {16, 2048, &MPIR_Scatter_MV2_two_level_Direct},
1907                 {2048, -1,  &MPIR_Scatter_MV2_Direct},
1908             },
1909             1,
1910             {
1911                 { 0, -1, &MPIR_Scatter_MV2_Direct},
1912             },
1913         },
1914
1915         {
1916             512,
1917             4,
1918             {
1919                 {0, 16, &MPIR_Scatter_mcst_wrap_MV2},
1920                 {16, 16, &MPIR_Scatter_MV2_two_level_Direct},
1921                 {16, 4096, &MPIR_Scatter_MV2_two_level_Direct},
1922                 {4096, -1, &MPIR_Scatter_MV2_Direct},
1923             },
1924             1,
1925             {
1926                 { 0, -1, &MPIR_Scatter_MV2_Binomial},
1927             },
1928         },
1929         {
1930             1024,
1931             5,
1932             {
1933                 {0, 16, &MPIR_Scatter_mcst_wrap_MV2},
1934                 {0, 16,  &MPIR_Scatter_MV2_Binomial},
1935                 {16, 32, &MPIR_Scatter_MV2_Binomial},
1936                 {32, 4096, &MPIR_Scatter_MV2_two_level_Direct},
1937                 {4096, -1, &MPIR_Scatter_MV2_Direct},
1938             },
1939             1,
1940             {
1941                 { 0, -1, &MPIR_Scatter_MV2_Binomial},
1942             },
1943         },
1944         {
1945             2048,
1946             7,
1947             {
1948                 {0, 16, &MPIR_Scatter_mcst_wrap_MV2},
1949                 {0, 16,  &MPIR_Scatter_MV2_two_level_Binomial},
1950                 {16, 128, &MPIR_Scatter_MV2_two_level_Binomial},
1951                 {128, 1024, &MPIR_Scatter_MV2_two_level_Direct},
1952                 {1024, 16384, &MPIR_Scatter_MV2_two_level_Direct},
1953                 {16384, 65536, &MPIR_Scatter_MV2_Direct},
1954                 {65536, -1, &MPIR_Scatter_MV2_two_level_Direct},
1955             },
1956             6,
1957             {
1958                 {0, 16, &MPIR_Scatter_MV2_Binomial},
1959                 {16, 128, &MPIR_Scatter_MV2_Binomial},
1960                 {128, 1024, &MPIR_Scatter_MV2_Binomial},
1961                 {1024, 16384, &MPIR_Scatter_MV2_Direct},
1962                 {16384, 65536, &MPIR_Scatter_MV2_Direct},
1963                 {65536, -1, &MPIR_Scatter_MV2_Direct},
1964             },
1965         },
1966     };
1967     table_ptrs[2] = mv2_tmp_scatter_thresholds_table_16ppn;
1968     agg_table_sum = 0;
1969     for (i = 0; i < mv2_scatter_num_ppn_conf; i++) {
1970         agg_table_sum += mv2_size_scatter_tuning_table[i];
1971     }
1972     mv2_scatter_thresholds_table[0] =
1973         static_cast<mv2_scatter_tuning_table*>(xbt_malloc(agg_table_sum * sizeof (mv2_scatter_tuning_table)));
1974     memcpy(mv2_scatter_thresholds_table[0], table_ptrs[0],
1975         (sizeof(mv2_scatter_tuning_table)
1976             * mv2_size_scatter_tuning_table[0]));
1977     for (i = 1; i < mv2_scatter_num_ppn_conf; i++) {
1978         mv2_scatter_thresholds_table[i] =
1979             mv2_scatter_thresholds_table[i - 1]
1980                                          + mv2_size_scatter_tuning_table[i - 1];
1981         memcpy(mv2_scatter_thresholds_table[i], table_ptrs[i],
1982             (sizeof(mv2_scatter_tuning_table)
1983                 * mv2_size_scatter_tuning_table[i]));
1984     }
1985     xbt_free(table_ptrs);
1986   
1987 }
1988