Logo AND Algorithmique Numérique Distribuée

Public GIT Repository
Merge branch 'master' of git+ssh://scm.gforge.inria.fr//gitroot/simgrid/simgrid
[simgrid.git] / src / smpi / colls / smpi_mvapich2_selector_stampede.h
1 /* selector for collective algorithms based on mvapich decision logic, with calibration from Stampede cluster at TACC*/
2
3 /* Copyright (c) 2009-2010, 2013-2014. The SimGrid Team.
4  * All rights reserved.                                                     */
5
6 /* This is the tuning used by MVAPICH for Stampede platform based on (MV2_ARCH_INTEL_XEON_E5_2680_16, MV2_HCA_MLX_CX_FDR) */
7
8
9
10 /************ Alltoall variables and initializers                        */
11
12 #define MV2_MAX_NB_THRESHOLDS  32
13 typedef struct {
14   int min;
15   int max;
16   int (*MV2_pt_Alltoall_function) (void *sendbuf, int sendcount, MPI_Datatype sendtype,
17       void *recvbuf, int recvcount, MPI_Datatype recvtype,
18       MPI_Comm comm_ptr );
19 } mv2_alltoall_tuning_element;
20
21 typedef struct {
22   int numproc;
23   int size_table;
24   mv2_alltoall_tuning_element algo_table[MV2_MAX_NB_THRESHOLDS];
25   mv2_alltoall_tuning_element in_place_algo_table[MV2_MAX_NB_THRESHOLDS];
26 } mv2_alltoall_tuning_table;
27
28 int (*MV2_Alltoall_function) (void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype, MPI_Comm comm_ptr)=NULL;
29
30 /* Indicates number of processes per node */
31 int *mv2_alltoall_table_ppn_conf = NULL;
32 /* Indicates total number of configurations */
33 int mv2_alltoall_num_ppn_conf = 1;
34 int *mv2_size_alltoall_tuning_table = NULL;
35 mv2_alltoall_tuning_table **mv2_alltoall_thresholds_table = NULL;
36
37
38 #define MPIR_Alltoall_bruck_MV2 smpi_coll_tuned_alltoall_bruck
39 #define MPIR_Alltoall_RD_MV2 smpi_coll_tuned_alltoall_rdb
40 #define MPIR_Alltoall_Scatter_dest_MV2 smpi_coll_tuned_alltoall_ring
41 #define MPIR_Alltoall_pairwise_MV2 smpi_coll_tuned_alltoall_pair
42 #define MPIR_Alltoall_inplace_MV2 smpi_coll_tuned_alltoall_ring 
43
44
45 static void init_mv2_alltoall_tables_stampede(){
46   int i;
47   int agg_table_sum = 0;
48   mv2_alltoall_tuning_table **table_ptrs = NULL;
49   mv2_alltoall_num_ppn_conf = 3;
50   mv2_alltoall_thresholds_table = xbt_malloc(sizeof(mv2_alltoall_tuning_table *)
51       * mv2_alltoall_num_ppn_conf);
52   table_ptrs = xbt_malloc(sizeof(mv2_alltoall_tuning_table *)
53       * mv2_alltoall_num_ppn_conf);
54   mv2_size_alltoall_tuning_table = xbt_malloc(sizeof(int) *
55       mv2_alltoall_num_ppn_conf);
56   mv2_alltoall_table_ppn_conf = xbt_malloc(mv2_alltoall_num_ppn_conf * sizeof(int));
57   mv2_alltoall_table_ppn_conf[0] = 1;
58   mv2_size_alltoall_tuning_table[0] = 6;
59   mv2_alltoall_tuning_table mv2_tmp_alltoall_thresholds_table_1ppn[] = {
60       {2,
61           1,
62           {{0, -1, &MPIR_Alltoall_pairwise_MV2},
63           },
64
65           {{0, -1, &MPIR_Alltoall_inplace_MV2},
66           },
67       },
68
69       {4,
70           2,
71           {{0, 262144, &MPIR_Alltoall_Scatter_dest_MV2},
72               {262144, -1, &MPIR_Alltoall_pairwise_MV2},
73           },
74
75           {{0, -1, &MPIR_Alltoall_inplace_MV2},
76           },
77       },
78
79       {8,
80           2,
81           {{0, 8, &MPIR_Alltoall_RD_MV2},
82               {8, -1, &MPIR_Alltoall_Scatter_dest_MV2},
83           },
84
85           {{0, -1, &MPIR_Alltoall_inplace_MV2},
86           },
87       },
88
89       {16,
90           3,
91           {{0, 64, &MPIR_Alltoall_RD_MV2},
92               {64, 512, &MPIR_Alltoall_bruck_MV2},
93               {512, -1, &MPIR_Alltoall_Scatter_dest_MV2},
94           },
95
96           {{0,-1, &MPIR_Alltoall_inplace_MV2},
97           },
98       },
99
100       {32,
101           3,
102           {{0, 32, &MPIR_Alltoall_RD_MV2},
103               {32, 2048, &MPIR_Alltoall_bruck_MV2},
104               {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
105           },
106
107           {{0, -1, &MPIR_Alltoall_inplace_MV2},
108           },
109       },
110
111       {64,
112           3,
113           {{0, 8, &MPIR_Alltoall_RD_MV2},
114               {8, 1024, &MPIR_Alltoall_bruck_MV2},
115               {1024, -1, &MPIR_Alltoall_Scatter_dest_MV2},
116           },
117
118           {{0, -1, &MPIR_Alltoall_inplace_MV2},
119           },
120       },
121   };
122   table_ptrs[0] = mv2_tmp_alltoall_thresholds_table_1ppn;
123   mv2_alltoall_table_ppn_conf[1] = 2;
124   mv2_size_alltoall_tuning_table[1] = 6;
125   mv2_alltoall_tuning_table mv2_tmp_alltoall_thresholds_table_2ppn[] = {
126       {4,
127           2,
128           {{0, 32, &MPIR_Alltoall_RD_MV2},
129               {32, -1, &MPIR_Alltoall_Scatter_dest_MV2},
130           },
131
132           {{0, -1, &MPIR_Alltoall_inplace_MV2},
133           },
134       },
135
136       {8,
137           2,
138           {{0, 64, &MPIR_Alltoall_RD_MV2},
139               {64, -1, &MPIR_Alltoall_Scatter_dest_MV2},
140           },
141
142           {{0, -1, &MPIR_Alltoall_inplace_MV2},
143           },
144       },
145
146       {16,
147           3,
148           {{0, 64, &MPIR_Alltoall_RD_MV2},
149               {64, 2048, &MPIR_Alltoall_bruck_MV2},
150               {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
151           },
152
153           {{0,-1, &MPIR_Alltoall_inplace_MV2},
154           },
155       },
156
157       {32,
158           3,
159           {{0, 16, &MPIR_Alltoall_RD_MV2},
160               {16, 2048, &MPIR_Alltoall_bruck_MV2},
161               {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
162           },
163
164           {{0, -1, &MPIR_Alltoall_inplace_MV2},
165           },
166       },
167
168       {64,
169           3,
170           {{0, 8, &MPIR_Alltoall_RD_MV2},
171               {8, 1024, &MPIR_Alltoall_bruck_MV2},
172               {1024, -1, &MPIR_Alltoall_Scatter_dest_MV2},
173           },
174
175           {{0, -1, &MPIR_Alltoall_inplace_MV2},
176           },
177       },
178
179       {128,
180           3,
181           {{0, 4, &MPIR_Alltoall_RD_MV2},
182               {4, 2048, &MPIR_Alltoall_bruck_MV2},
183               {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
184           },
185
186           {{0, -1, &MPIR_Alltoall_inplace_MV2},
187           },
188       },
189   };
190   table_ptrs[1] = mv2_tmp_alltoall_thresholds_table_2ppn;
191   mv2_alltoall_table_ppn_conf[2] = 16;
192   mv2_size_alltoall_tuning_table[2] = 7;
193   mv2_alltoall_tuning_table mv2_tmp_alltoall_thresholds_table_16ppn[] = {
194       {16,
195           2,
196           {{0, 2048, &MPIR_Alltoall_bruck_MV2},
197               {2048, -1,  &MPIR_Alltoall_Scatter_dest_MV2},
198           },
199
200           {{32768, -1, &MPIR_Alltoall_inplace_MV2},
201           },
202       },
203
204       {32,
205           2,
206           {{0, 2048, &MPIR_Alltoall_bruck_MV2},
207               {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
208           },
209
210           {{16384, -1, &MPIR_Alltoall_inplace_MV2},
211           },
212       },
213
214       {64,
215           3,
216           {{0, 2048, &MPIR_Alltoall_bruck_MV2},
217               {2048, 16384, &MPIR_Alltoall_Scatter_dest_MV2},
218               {16384, -1, &MPIR_Alltoall_pairwise_MV2},
219           },
220
221           {{32768, 131072, &MPIR_Alltoall_inplace_MV2},
222           },
223       },
224
225       {128,
226           2,
227           {{0, 2048, &MPIR_Alltoall_bruck_MV2},
228               {2048, -1, &MPIR_Alltoall_pairwise_MV2},
229           },
230
231           {{16384,65536, &MPIR_Alltoall_inplace_MV2},
232           },
233       },
234
235       {256,
236           2,
237           {{0, 1024, &MPIR_Alltoall_bruck_MV2},
238               {1024, -1, &MPIR_Alltoall_pairwise_MV2},
239           },
240
241           {{16384, 65536, &MPIR_Alltoall_inplace_MV2},
242           },
243       },
244
245       {512,
246           2,
247           {{0, 1024, &MPIR_Alltoall_bruck_MV2},
248               {1024, -1, &MPIR_Alltoall_pairwise_MV2},
249           },
250
251           {{16384, 65536, &MPIR_Alltoall_inplace_MV2},
252           },
253       },
254       {1024,
255           2,
256           {{0, 1024, &MPIR_Alltoall_bruck_MV2},
257               {1024, -1, &MPIR_Alltoall_pairwise_MV2},
258           },
259
260           {{16384, 65536, &MPIR_Alltoall_inplace_MV2},
261           },
262       },
263
264   };
265   table_ptrs[2] = mv2_tmp_alltoall_thresholds_table_16ppn;
266   agg_table_sum = 0;
267   for (i = 0; i < mv2_alltoall_num_ppn_conf; i++) {
268       agg_table_sum += mv2_size_alltoall_tuning_table[i];
269   }
270   mv2_alltoall_thresholds_table[0] =
271       xbt_malloc(agg_table_sum * sizeof (mv2_alltoall_tuning_table));
272   memcpy(mv2_alltoall_thresholds_table[0], table_ptrs[0],
273       (sizeof(mv2_alltoall_tuning_table)
274           * mv2_size_alltoall_tuning_table[0]));
275   for (i = 1; i < mv2_alltoall_num_ppn_conf; i++) {
276       mv2_alltoall_thresholds_table[i] =
277           mv2_alltoall_thresholds_table[i - 1]
278                                         + mv2_size_alltoall_tuning_table[i - 1];
279       memcpy(mv2_alltoall_thresholds_table[i], table_ptrs[i],
280           (sizeof(mv2_alltoall_tuning_table)
281               * mv2_size_alltoall_tuning_table[i]));
282   }
283   xbt_free(table_ptrs);
284
285
286 }
287
288
289 /************ Allgather variables and initializers                        */
290
291 typedef struct {
292   int min;
293   int max;
294   int (*MV2_pt_Allgather_function)(void *sendbuf,
295       int sendcount,
296       MPI_Datatype sendtype,
297       void *recvbuf,
298       int recvcount,
299       MPI_Datatype recvtype, MPI_Comm comm_ptr);
300 } mv2_allgather_tuning_element;
301
302 typedef struct {
303   int numproc;
304   int two_level[MV2_MAX_NB_THRESHOLDS];
305   int size_inter_table;
306   mv2_allgather_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
307 } mv2_allgather_tuning_table;
308
309 int (*MV2_Allgather_function)(void *sendbuf,
310     int sendcount,
311     MPI_Datatype sendtype,
312     void *recvbuf,
313     int recvcount,
314     MPI_Datatype recvtype, MPI_Comm comm);
315
316 int *mv2_allgather_table_ppn_conf = NULL;
317 int mv2_allgather_num_ppn_conf = 1;
318 int *mv2_size_allgather_tuning_table = NULL;
319 mv2_allgather_tuning_table **mv2_allgather_thresholds_table = NULL;
320
321 #define MPIR_Allgather_Bruck_MV2 smpi_coll_tuned_allgather_bruck
322 #define MPIR_Allgather_RD_MV2 smpi_coll_tuned_allgather_rdb
323 #define MPIR_Allgather_RD_Allgather_Comm_MV2 smpi_coll_tuned_allgather_rdb
324 #define MPIR_Allgather_Ring_MV2 smpi_coll_tuned_allgather_ring
325
326
327 static void init_mv2_allgather_tables_stampede(){
328   int i;
329   int agg_table_sum = 0;
330   mv2_allgather_tuning_table **table_ptrs = NULL;
331   mv2_allgather_num_ppn_conf = 3;
332   mv2_allgather_thresholds_table
333   = xbt_malloc(sizeof(mv2_allgather_tuning_table *)
334       * mv2_allgather_num_ppn_conf);
335   table_ptrs = xbt_malloc(sizeof(mv2_allgather_tuning_table *)
336       * mv2_allgather_num_ppn_conf);
337   mv2_size_allgather_tuning_table = xbt_malloc(sizeof(int) *
338       mv2_allgather_num_ppn_conf);
339   mv2_allgather_table_ppn_conf
340   = xbt_malloc(mv2_allgather_num_ppn_conf * sizeof(int));
341   mv2_allgather_table_ppn_conf[0] = 1;
342   mv2_size_allgather_tuning_table[0] = 6;
343   mv2_allgather_tuning_table mv2_tmp_allgather_thresholds_table_1ppn[] = {
344       {
345           2,
346           {0},
347           1,
348           {
349               {0, -1, &MPIR_Allgather_Ring_MV2},
350           },
351       },
352       {
353           4,
354           {0,0},
355           2,
356           {
357               {0, 262144, &MPIR_Allgather_RD_MV2},
358               {262144, -1, &MPIR_Allgather_Ring_MV2},
359           },
360       },
361       {
362           8,
363           {0,0},
364           2,
365           {
366               {0, 131072, &MPIR_Allgather_RD_MV2},
367               {131072, -1, &MPIR_Allgather_Ring_MV2},
368           },
369       },
370       {
371           16,
372           {0,0},
373           2,
374           {
375               {0, 131072, &MPIR_Allgather_RD_MV2},
376               {131072, -1, &MPIR_Allgather_Ring_MV2},
377           },
378       },
379       {
380           32,
381           {0,0},
382           2,
383           {
384               {0, 65536, &MPIR_Allgather_RD_MV2},
385               {65536, -1, &MPIR_Allgather_Ring_MV2},
386           },
387       },
388       {
389           64,
390           {0,0},
391           2,
392           {
393               {0, 32768, &MPIR_Allgather_RD_MV2},
394               {32768, -1, &MPIR_Allgather_Ring_MV2},
395           },
396       },
397   };
398   table_ptrs[0] = mv2_tmp_allgather_thresholds_table_1ppn;
399   mv2_allgather_table_ppn_conf[1] = 2;
400   mv2_size_allgather_tuning_table[1] = 6;
401   mv2_allgather_tuning_table mv2_tmp_allgather_thresholds_table_2ppn[] = {
402       {
403           4,
404           {0,0},
405           2,
406           {
407               {0, 524288, &MPIR_Allgather_RD_MV2},
408               {524288, -1, &MPIR_Allgather_Ring_MV2},
409           },
410       },
411       {
412           8,
413           {0,1,0},
414           2,
415           {
416               {0, 32768, &MPIR_Allgather_RD_MV2},
417               {32768, 524288, &MPIR_Allgather_Ring_MV2},
418               {524288, -1, &MPIR_Allgather_Ring_MV2},
419           },
420       },
421       {
422           16,
423           {0,1,0},
424           2,
425           {
426               {0, 16384, &MPIR_Allgather_RD_MV2},
427               {16384, 524288, &MPIR_Allgather_Ring_MV2},
428               {524288, -1, &MPIR_Allgather_Ring_MV2},
429           },
430       },
431       {
432           32,
433           {1,1,0},
434           2,
435           {
436               {0, 65536, &MPIR_Allgather_RD_MV2},
437               {65536, 524288, &MPIR_Allgather_Ring_MV2},
438               {524288, -1, &MPIR_Allgather_Ring_MV2},
439           },
440       },
441       {
442           64,
443           {1,1,0},
444           2,
445           {
446               {0, 32768, &MPIR_Allgather_RD_MV2},
447               {32768, 524288, &MPIR_Allgather_Ring_MV2},
448               {524288, -1, &MPIR_Allgather_Ring_MV2},
449           },
450       },
451       {
452           128,
453           {1,1,0},
454           2,
455           {
456               {0, 65536, &MPIR_Allgather_RD_MV2},
457               {65536, 524288, &MPIR_Allgather_Ring_MV2},
458               {524288, -1, &MPIR_Allgather_Ring_MV2},
459           },
460       },
461   };
462   table_ptrs[1] = mv2_tmp_allgather_thresholds_table_2ppn;
463   mv2_allgather_table_ppn_conf[2] = 16;
464   mv2_size_allgather_tuning_table[2] = 6;
465   mv2_allgather_tuning_table mv2_tmp_allgather_thresholds_table_16ppn[] = {
466       {
467           16,
468           {0,0},
469           2,
470           {
471               {0, 1024, &MPIR_Allgather_RD_MV2},
472               {1024, -1, &MPIR_Allgather_Ring_MV2},
473           },
474       },
475       {
476           32,
477           {0,0},
478           2,
479           {
480               {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
481               {1024, -1, &MPIR_Allgather_Ring_MV2},
482           },
483       },
484       {
485           64,
486           {0,0},
487           2,
488           {
489               {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
490               {1024, -1, &MPIR_Allgather_Ring_MV2},
491           },
492       },
493       {
494           128,
495           {0,0},
496           2,
497           {
498               {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
499               {1024, -1, &MPIR_Allgather_Ring_MV2},
500           },
501       },
502       {
503           256,
504           {0,0},
505           2,
506           {
507               {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
508               {1024, -1, &MPIR_Allgather_Ring_MV2},
509           },
510       },
511       {
512           512,
513           {0,0},
514           2,
515           {
516               {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
517               {1024, -1, &MPIR_Allgather_Ring_MV2},
518           },
519       },
520
521   };
522   table_ptrs[2] = mv2_tmp_allgather_thresholds_table_16ppn;
523   agg_table_sum = 0;
524   for (i = 0; i < mv2_allgather_num_ppn_conf; i++) {
525       agg_table_sum += mv2_size_allgather_tuning_table[i];
526   }
527   mv2_allgather_thresholds_table[0] =
528       xbt_malloc(agg_table_sum * sizeof (mv2_allgather_tuning_table));
529   memcpy(mv2_allgather_thresholds_table[0], table_ptrs[0],
530       (sizeof(mv2_allgather_tuning_table)
531           * mv2_size_allgather_tuning_table[0]));
532   for (i = 1; i < mv2_allgather_num_ppn_conf; i++) {
533       mv2_allgather_thresholds_table[i] =
534           mv2_allgather_thresholds_table[i - 1]
535                                          + mv2_size_allgather_tuning_table[i - 1];
536       memcpy(mv2_allgather_thresholds_table[i], table_ptrs[i],
537           (sizeof(mv2_allgather_tuning_table)
538               * mv2_size_allgather_tuning_table[i]));
539   }
540   xbt_free(table_ptrs);
541 }
542
543
544 /************ Gather variables and initializers                        */
545
546 typedef struct {
547   int min;
548   int max;
549   int (*MV2_pt_Gather_function)(void *sendbuf, int sendcnt,
550       MPI_Datatype sendtype, void *recvbuf, int recvcnt,
551       MPI_Datatype recvtype, int root, MPI_Comm  comm_ptr);
552 } mv2_gather_tuning_element;
553
554
555 typedef struct {
556   int numproc;
557   int size_inter_table;
558   mv2_gather_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
559   int size_intra_table;
560   mv2_gather_tuning_element intra_node[MV2_MAX_NB_THRESHOLDS];
561 } mv2_gather_tuning_table;
562
563 int mv2_size_gather_tuning_table=7;
564 mv2_gather_tuning_table * mv2_gather_thresholds_table=NULL; 
565
566 typedef int (*MV2_Gather_function_ptr) (void *sendbuf,
567     int sendcnt,
568     MPI_Datatype sendtype,
569     void *recvbuf,
570     int recvcnt,
571     MPI_Datatype recvtype,
572     int root, MPI_Comm comm);
573
574 MV2_Gather_function_ptr MV2_Gather_inter_leader_function = NULL;
575 MV2_Gather_function_ptr MV2_Gather_intra_node_function = NULL;
576
577
578 #define MPIR_Gather_MV2_Direct smpi_coll_tuned_gather_ompi_basic_linear
579 #define MPIR_Gather_MV2_two_level_Direct smpi_coll_tuned_gather_ompi_basic_linear
580 #define MPIR_Gather_intra smpi_coll_tuned_gather_mpich
581
582
583 static void init_mv2_gather_tables_stampede(){
584
585   mv2_size_gather_tuning_table=7;
586   mv2_gather_thresholds_table = xbt_malloc(mv2_size_gather_tuning_table*
587       sizeof (mv2_gather_tuning_table));
588   mv2_gather_tuning_table mv2_tmp_gather_thresholds_table[]={
589       {16,
590           2,{{0, 524288, &MPIR_Gather_MV2_Direct},
591               {524288, -1, &MPIR_Gather_intra}},
592               1,{{0, -1, &MPIR_Gather_MV2_Direct}}},
593               {32,
594                   3,{{0, 16384, &MPIR_Gather_MV2_Direct},
595                       {16384, 131072, &MPIR_Gather_intra},
596                       {131072, -1, &MPIR_Gather_MV2_two_level_Direct}},
597                       1,{{0, -1, &MPIR_Gather_intra}}},
598                       {64,
599                           3,{{0, 256, &MPIR_Gather_MV2_two_level_Direct},
600                               {256, 16384, &MPIR_Gather_MV2_Direct},
601                               {256, -1, &MPIR_Gather_MV2_two_level_Direct}},
602                               1,{{0, -1, &MPIR_Gather_intra}}},
603                               {128,
604                                   3,{{0, 512, &MPIR_Gather_MV2_two_level_Direct},
605                                       {512, 16384, &MPIR_Gather_MV2_Direct},
606                                       {16384, -1, &MPIR_Gather_MV2_two_level_Direct}},
607                                       1,{{0, -1, &MPIR_Gather_intra}}},
608                                       {256,
609                                           3,{{0, 512, &MPIR_Gather_MV2_two_level_Direct},
610                                               {512, 16384, &MPIR_Gather_MV2_Direct},
611                                               {16384, -1, &MPIR_Gather_MV2_two_level_Direct}},
612                                               1,{{0, -1, &MPIR_Gather_intra}}},
613                                               {512,
614                                                   3,{{0, 512, &MPIR_Gather_MV2_two_level_Direct},
615                                                       {512, 16384, &MPIR_Gather_MV2_Direct},
616                                                       {8196, -1, &MPIR_Gather_MV2_two_level_Direct}},
617                                                       1,{{0, -1, &MPIR_Gather_intra}}},
618                                                       {1024,
619                                                           3,{{0, 512, &MPIR_Gather_MV2_two_level_Direct},
620                                                               {512, 16384, &MPIR_Gather_MV2_Direct},
621                                                               {8196, -1, &MPIR_Gather_MV2_two_level_Direct}},
622                                                               1,{{0, -1, &MPIR_Gather_intra}}},
623   };
624
625   memcpy(mv2_gather_thresholds_table, mv2_tmp_gather_thresholds_table,
626       mv2_size_gather_tuning_table * sizeof (mv2_gather_tuning_table));
627
628 }
629
630
631 /************ Allgatherv variables and initializers                        */
632
633 typedef struct {
634   int min;
635   int max;
636   int (*MV2_pt_Allgatherv_function)(void *sendbuf,
637       int sendcount,
638       MPI_Datatype sendtype,
639       void *recvbuf,
640       int *recvcounts,
641       int *displs,
642       MPI_Datatype recvtype,
643       MPI_Comm commg);
644 } mv2_allgatherv_tuning_element;
645
646 typedef struct {
647   int numproc;
648   int size_inter_table;
649   mv2_allgatherv_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
650 } mv2_allgatherv_tuning_table;
651
652 int (*MV2_Allgatherv_function)(void *sendbuf,
653     int sendcount,
654     MPI_Datatype sendtype,
655     void *recvbuf,
656     int *recvcounts,
657     int *displs,
658     MPI_Datatype recvtype,
659     MPI_Comm comm);
660
661 int mv2_size_allgatherv_tuning_table = 0;
662 mv2_allgatherv_tuning_table *mv2_allgatherv_thresholds_table = NULL;
663
664 #define MPIR_Allgatherv_Rec_Doubling_MV2 smpi_coll_tuned_allgatherv_mpich_rdb
665 #define MPIR_Allgatherv_Bruck_MV2 smpi_coll_tuned_allgatherv_ompi_bruck
666 #define MPIR_Allgatherv_Ring_MV2 smpi_coll_tuned_allgatherv_mpich_ring
667
668
669 static void init_mv2_allgatherv_tables_stampede(){
670   mv2_size_allgatherv_tuning_table = 6;
671   mv2_allgatherv_thresholds_table = xbt_malloc(mv2_size_allgatherv_tuning_table *
672       sizeof (mv2_allgatherv_tuning_table));
673   mv2_allgatherv_tuning_table mv2_tmp_allgatherv_thresholds_table[] = {
674       {
675           16,
676           2,
677           {
678               {0, 512, &MPIR_Allgatherv_Rec_Doubling_MV2},
679               {512, -1, &MPIR_Allgatherv_Ring_MV2},
680           },
681       },
682       {
683           32,
684           2,
685           {
686               {0, 512, &MPIR_Allgatherv_Rec_Doubling_MV2},
687               {512, -1, &MPIR_Allgatherv_Ring_MV2},
688           },
689       },
690       {
691           64,
692           2,
693           {
694               {0, 256, &MPIR_Allgatherv_Rec_Doubling_MV2},
695               {256, -1, &MPIR_Allgatherv_Ring_MV2},
696           },
697       },
698       {
699           128,
700           2,
701           {
702               {0, 256, &MPIR_Allgatherv_Rec_Doubling_MV2},
703               {256, -1, &MPIR_Allgatherv_Ring_MV2},
704           },
705       },
706       {
707           256,
708           2,
709           {
710               {0, 256, &MPIR_Allgatherv_Rec_Doubling_MV2},
711               {256, -1, &MPIR_Allgatherv_Ring_MV2},
712           },
713       },
714       {
715           512,
716           2,
717           {
718               {0, 256, &MPIR_Allgatherv_Rec_Doubling_MV2},
719               {256, -1, &MPIR_Allgatherv_Ring_MV2},
720           },
721       },
722
723   };
724   memcpy(mv2_allgatherv_thresholds_table, mv2_tmp_allgatherv_thresholds_table,
725       mv2_size_allgatherv_tuning_table * sizeof (mv2_allgatherv_tuning_table));
726 }
727
728
729 /************ Allreduce variables and initializers                        */
730
731 typedef struct {
732   int min;
733   int max;
734   int (*MV2_pt_Allreduce_function)(void *sendbuf,
735       void *recvbuf,
736       int count,
737       MPI_Datatype datatype,
738       MPI_Op op, MPI_Comm comm);
739 } mv2_allreduce_tuning_element;
740
741 typedef struct {
742   int numproc;
743   int mcast_enabled;
744   int is_two_level_allreduce[MV2_MAX_NB_THRESHOLDS];
745   int size_inter_table;
746   mv2_allreduce_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
747   int size_intra_table;
748   mv2_allreduce_tuning_element intra_node[MV2_MAX_NB_THRESHOLDS];
749 } mv2_allreduce_tuning_table;
750
751
752 int (*MV2_Allreduce_function)(void *sendbuf,
753     void *recvbuf,
754     int count,
755     MPI_Datatype datatype,
756     MPI_Op op, MPI_Comm comm)=NULL;
757
758
759 int (*MV2_Allreduce_intra_function)( void *sendbuf,
760     void *recvbuf,
761     int count,
762     MPI_Datatype datatype,
763     MPI_Op op, MPI_Comm comm)=NULL;
764
765 int mv2_size_allreduce_tuning_table = 0;
766 mv2_allreduce_tuning_table *mv2_allreduce_thresholds_table = NULL;
767
768
769
770
771
772 static int MPIR_Allreduce_mcst_reduce_two_level_helper_MV2( void *sendbuf,
773     void *recvbuf,
774     int count,
775     MPI_Datatype datatype,
776     MPI_Op op, MPI_Comm comm)
777
778   return 0;
779 }
780
781 static  int MPIR_Allreduce_mcst_reduce_redscat_gather_MV2( void *sendbuf,
782     void *recvbuf,
783     int count,
784     MPI_Datatype datatype,
785     MPI_Op op, MPI_Comm  comm)
786 {
787   return 0;
788 }
789
790 static  int MPIR_Allreduce_reduce_p2p_MV2( void *sendbuf,
791     void *recvbuf,
792     int count,
793     MPI_Datatype datatype,
794     MPI_Op op, MPI_Comm  comm)
795 {
796   mpi_coll_reduce_fun(sendbuf,recvbuf,count,datatype,op,0,comm);
797   return MPI_SUCCESS;
798 }
799
800 static  int MPIR_Allreduce_reduce_shmem_MV2( void *sendbuf,
801     void *recvbuf,
802     int count,
803     MPI_Datatype datatype,
804     MPI_Op op, MPI_Comm  comm)
805 {
806   mpi_coll_reduce_fun(sendbuf,recvbuf,count,datatype,op,0,comm);
807   return MPI_SUCCESS;
808 }
809
810 #define MPIR_Allreduce_pt2pt_rd_MV2 smpi_coll_tuned_allreduce_rdb
811 #define MPIR_Allreduce_pt2pt_rs_MV2 smpi_coll_tuned_allreduce_mvapich2_rs
812
813
814
815 static void init_mv2_allreduce_tables_stampede(){
816   mv2_size_allreduce_tuning_table = 8;
817   mv2_allreduce_thresholds_table = xbt_malloc(mv2_size_allreduce_tuning_table *
818       sizeof (mv2_allreduce_tuning_table));
819   mv2_allreduce_tuning_table mv2_tmp_allreduce_thresholds_table[] = {
820       {
821           16,
822           0,
823           {1, 0},
824           2,
825           {
826               {0, 1024, &MPIR_Allreduce_pt2pt_rd_MV2},
827               {1024, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
828           },
829           2,
830           {
831               {0, 1024, &MPIR_Allreduce_reduce_shmem_MV2},
832               {1024, -1, &MPIR_Allreduce_reduce_p2p_MV2},
833           },
834       },
835       {
836           32,
837           0,
838           {1, 1, 0},
839           3,
840           {
841               {0, 1024, &MPIR_Allreduce_pt2pt_rd_MV2},
842               {1024, 16384, &MPIR_Allreduce_pt2pt_rd_MV2},
843               {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
844           },
845           2,
846           {
847               {0, 1024, &MPIR_Allreduce_reduce_shmem_MV2},
848               {1024, 16384, &MPIR_Allreduce_reduce_p2p_MV2},
849           },
850       },
851       {
852           64,
853           0,
854           {1, 1, 0},
855           3,
856           {
857               {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2},
858               {512, 16384, &MPIR_Allreduce_pt2pt_rd_MV2},
859               {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
860           },
861           2,
862           {
863               {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
864               {512, 16384, &MPIR_Allreduce_reduce_p2p_MV2},
865           },
866       },
867       {
868           128,
869           0,
870           {1, 1, 0},
871           3,
872           {
873               {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2},
874               {512, 16384, &MPIR_Allreduce_pt2pt_rd_MV2},
875               {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
876           },
877           2,
878           {
879               {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
880               {512, 16384, &MPIR_Allreduce_reduce_p2p_MV2},
881           },
882       },
883       {
884           256,
885           0,
886           {1, 1, 0},
887           3,
888           {
889               {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2},
890               {512, 16384, &MPIR_Allreduce_pt2pt_rd_MV2},
891               {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
892           },
893           2,
894           {
895               {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
896               {512, -1, &MPIR_Allreduce_reduce_p2p_MV2},
897           },
898       },
899       {
900           512,
901           0,
902           {1, 1, 0},
903           3,
904           {
905               {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2},
906               {512, 16384, &MPIR_Allreduce_pt2pt_rd_MV2},
907               {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
908           },
909           2,
910           {
911               {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
912               {512, 16384, &MPIR_Allreduce_reduce_p2p_MV2},
913           },
914       },
915       {
916           1024,
917           0,
918           {1, 1, 1, 0},
919           4,
920           {
921               {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2},
922               {512, 8192, &MPIR_Allreduce_pt2pt_rd_MV2},
923               {8192, 65536, &MPIR_Allreduce_pt2pt_rs_MV2},
924               {65536, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
925           },
926           2,
927           {
928               {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
929               {512, -1, &MPIR_Allreduce_reduce_p2p_MV2},
930           },
931       },
932       {
933           2048,
934           0,
935           {1, 1, 1, 0},
936           4,
937           {
938               {0, 64, &MPIR_Allreduce_pt2pt_rd_MV2},
939               {64, 512, &MPIR_Allreduce_reduce_p2p_MV2},
940               {512, 4096, &MPIR_Allreduce_mcst_reduce_two_level_helper_MV2},
941               {4096, 16384, &MPIR_Allreduce_pt2pt_rs_MV2},
942               {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
943           },
944           2,
945           {
946               {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
947               {512, -1, &MPIR_Allreduce_reduce_p2p_MV2},
948           },
949       },
950
951   };
952   memcpy(mv2_allreduce_thresholds_table, mv2_tmp_allreduce_thresholds_table,
953       mv2_size_allreduce_tuning_table * sizeof (mv2_allreduce_tuning_table));
954 }
955
956
957 /*
958 Bcast deactivated for now, defaults to mpich one
959 typedef struct {
960     int min;
961     int max;
962     int (*MV2_pt_Bcast_function) (void *buf, int count, MPI_Datatype datatype,
963                                   int root, MPI_Comm comm_ptr);
964     int zcpy_pipelined_knomial_factor;
965 } mv2_bcast_tuning_element;
966
967 typedef struct {
968     int numproc;
969     int bcast_segment_size;
970     int intra_node_knomial_factor;
971     int inter_node_knomial_factor;
972     int is_two_level_bcast[MV2_MAX_NB_THRESHOLDS];
973     int size_inter_table;
974     mv2_bcast_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
975     int size_intra_table;
976     mv2_bcast_tuning_element intra_node[MV2_MAX_NB_THRESHOLDS];
977 } mv2_bcast_tuning_table;
978
979 int mv2_size_bcast_tuning_table = 0;
980 mv2_bcast_tuning_table *mv2_bcast_thresholds_table = NULL;
981
982
983 int (*MV2_Bcast_function) (void *buffer, int count, MPI_Datatype datatype,
984                            int root, MPI_Comm comm_ptr) = NULL;
985
986 int (*MV2_Bcast_intra_node_function) (void *buffer, int count, MPI_Datatype datatype,
987                                       int root, MPI_Comm comm_ptr) = NULL;
988
989
990  */
991
992
993 /*
994 static void init_mv2_bcast_tables_stampede(){
995  //Stampede,
996         mv2_size_bcast_tuning_table=8;
997         mv2_bcast_thresholds_table = xbt_malloc(mv2_size_bcast_tuning_table *
998                                                  sizeof (mv2_bcast_tuning_table));
999
1000   mv2_bcast_tuning_table mv2_tmp_bcast_thresholds_table[]={
1001     {
1002             16,
1003             8192, 4, 4,
1004             {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
1005             11,
1006             {
1007               {0, 8, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1008               {8, 16, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1009               {16, 1024, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1010               {1024, 8192, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1011               {8192, 16384, &MPIR_Bcast_binomial_MV2, -1},
1012               {16384, 32768, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1013               {32768, 65536, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1014               {65536, 131072, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1},
1015               {131072, 262144, &MPIR_Bcast_scatter_ring_allgather_MV2, -1},
1016               {262144, 524288, &MPIR_Bcast_scatter_doubling_allgather_MV2, -1},
1017               {524288, -1, &MPIR_Bcast_scatter_ring_allgather_MV2, -1}
1018             },
1019             11,
1020             {
1021               {0, 8, &MPIR_Shmem_Bcast_MV2, 2},
1022               {8, 16, &MPIR_Shmem_Bcast_MV2, 4},
1023               {16, 1024, &MPIR_Shmem_Bcast_MV2, 2},
1024               {1024, 8192, &MPIR_Shmem_Bcast_MV2, 4},
1025               {8192, 16384, &MPIR_Shmem_Bcast_MV2, -1},
1026               {16384, 32768, &MPIR_Shmem_Bcast_MV2, 4},
1027               {32768, 65536, &MPIR_Shmem_Bcast_MV2, 2},
1028               {65536, 131072, &MPIR_Shmem_Bcast_MV2, -1},
1029               {131072, 262144, &MPIR_Shmem_Bcast_MV2, -1},
1030               {262144, 524288, &MPIR_Shmem_Bcast_MV2, -1},
1031               {524288, -1, &MPIR_Shmem_Bcast_MV2, -1}
1032             }
1033     },
1034     {
1035             32,
1036             8192, 4, 4,
1037             {1, 1, 1, 1, 1, 1, 1, 1},
1038             8,
1039             {
1040               {0, 128, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1041               {128, 256, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1042               {256, 32768, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1043               {32768, 65536, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1044               {65536, 131072, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1045               {131072, 262144, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1046               {262144, 524288, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1047               {524288, -1, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8}
1048             },
1049             8,
1050             {
1051               {0, 128, &MPIR_Shmem_Bcast_MV2, 2},
1052               {128, 256, &MPIR_Shmem_Bcast_MV2, 4},
1053               {256, 32768, &MPIR_Shmem_Bcast_MV2, 2},
1054               {32768, 65536, &MPIR_Shmem_Bcast_MV2, 4},
1055               {65536, 131072, &MPIR_Shmem_Bcast_MV2, 2},
1056               {131072, 262144, &MPIR_Shmem_Bcast_MV2, 8},
1057               {262144, 524288, &MPIR_Shmem_Bcast_MV2, 2},
1058               {524288, -1, &MPIR_Shmem_Bcast_MV2, 8}
1059             }
1060     },
1061     {
1062             64,
1063             8192, 4, 4,
1064             {1, 1, 1, 1, 1, 1, 1, 1, 1},
1065             9,
1066             {
1067               {0, 2, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1068               {2, 4, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1069               {4, 16, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1070               {16, 32, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1071               {32, 128, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1072               {128, 256, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1073               {256, 4096, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1074               {4096, 32768, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1075               {32768, -1, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2}
1076             },
1077             9,
1078             {
1079               {0, 2, &MPIR_Shmem_Bcast_MV2, 4},
1080               {2, 4, &MPIR_Shmem_Bcast_MV2, 8},
1081               {4, 16, &MPIR_Shmem_Bcast_MV2, 4},
1082               {16, 32, &MPIR_Shmem_Bcast_MV2, 8},
1083               {32, 128, &MPIR_Shmem_Bcast_MV2, 4},
1084               {128, 256, &MPIR_Shmem_Bcast_MV2, 8},
1085               {256, 4096, &MPIR_Shmem_Bcast_MV2, 4},
1086               {4096, 32768, &MPIR_Shmem_Bcast_MV2, 8},
1087               {32768, -1, &MPIR_Shmem_Bcast_MV2, 2}
1088             }
1089     },
1090     {
1091             128,
1092             8192, 4, 4,
1093             {1, 1, 1, 0},
1094             4,
1095             {
1096               {0, 8192, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1097               {8192, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1098               {16384, 524288, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1099               {524288, -1, &MPIR_Bcast_scatter_ring_allgather_MV2, -1}
1100             },
1101             4,
1102             {
1103               {0, 8192, &MPIR_Shmem_Bcast_MV2, 8},
1104               {8192, 16384, &MPIR_Shmem_Bcast_MV2, 4},
1105               {16384, 524288, &MPIR_Shmem_Bcast_MV2, 2},
1106               {524288, -1, NULL, -1}
1107             }
1108     },
1109     {
1110             256,
1111             8192, 4, 4,
1112             {1, 1, 1, 1, 1},
1113             5,
1114             {
1115               {0, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1116               {16384, 131072, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1117               {131072, 262144, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1},
1118               {262144, 524288, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1119               {524288, -1, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1}
1120             },
1121             5,
1122             {
1123               {0, 16384, &MPIR_Shmem_Bcast_MV2, 4},
1124               {16384, 131072, &MPIR_Shmem_Bcast_MV2, 2},
1125               {131072, 262144, &MPIR_Shmem_Bcast_MV2, -1},
1126               {262144, 524288, &MPIR_Shmem_Bcast_MV2, 2},
1127               {524288, -1, &MPIR_Shmem_Bcast_MV2, -1}
1128             }
1129     },
1130     {
1131             512,
1132             8192, 4, 4,
1133             {1, 1, 1, 1, 1},
1134             5,
1135             {
1136               {0, 4096, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1137               {4096, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1138               {16384, 131072, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1139               {131072, 262144, &MPIR_Pipelined_Bcast_MV2, -1},
1140               {262144, -1, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1}
1141             },
1142             5,
1143             {
1144               {0, 4096, &MPIR_Shmem_Bcast_MV2, 8},
1145               {4096, 16384, &MPIR_Shmem_Bcast_MV2, 4},
1146               {16384, 131072, &MPIR_Shmem_Bcast_MV2, 2},
1147               {131072, 262144, &MPIR_Shmem_Bcast_MV2, -1},
1148               {262144, -1, &MPIR_Shmem_Bcast_MV2, -1}
1149             }
1150     },
1151     {
1152             1024,
1153             8192, 4, 4,
1154             {1, 1, 1, 1, 1},
1155             5,
1156             {
1157               {0, 8192, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1158               {8192, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1159               {16384, 65536, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1160               {65536, 524288, &MPIR_Pipelined_Bcast_MV2, -1},
1161               {524288, -1, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1}
1162             },
1163             5,
1164             {
1165               {0, 8192, &MPIR_Shmem_Bcast_MV2, 8},
1166               {8192, 16384, &MPIR_Shmem_Bcast_MV2, 4},
1167               {16384, 65536, &MPIR_Shmem_Bcast_MV2, 2},
1168               {65536, 524288, &MPIR_Shmem_Bcast_MV2, -1},
1169               {524288, -1, &MPIR_Shmem_Bcast_MV2, -1}
1170             }
1171     },
1172     {
1173             2048,
1174             8192, 4, 4,
1175             {1, 1, 1, 1, 1, 1, 1},
1176             7,
1177             {
1178               {0, 16, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1179               {16, 32, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1180               {32, 4096, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1181               {4096, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1182               {16384, 32768, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1183               {32768, 524288, &MPIR_Pipelined_Bcast_MV2, -1},
1184               {524288, -1, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1}
1185             },
1186             7,
1187             {
1188               {0, 16, &MPIR_Shmem_Bcast_MV2, 8},
1189               {16, 32, &MPIR_Shmem_Bcast_MV2, 4},
1190               {32, 4096, &MPIR_Shmem_Bcast_MV2, 8},
1191               {4096, 16384, &MPIR_Shmem_Bcast_MV2, 4},
1192               {16384, 32768, &MPIR_Shmem_Bcast_MV2, 2},
1193               {32768, 524288, &MPIR_Shmem_Bcast_MV2, -1},
1194               {524288, -1, &MPIR_Shmem_Bcast_MV2, -1}
1195             }
1196     }
1197   };
1198
1199         memcpy(mv2_bcast_thresholds_table, mv2_tmp_bcast_thresholds_table,
1200                     mv2_size_bcast_tuning_table * sizeof (mv2_bcast_tuning_table));
1201 }*/
1202
1203
1204 /************ Reduce variables and initializers                        */
1205
1206 typedef struct {
1207   int min;
1208   int max;
1209   int (*MV2_pt_Reduce_function)(void *sendbuf,
1210       void *recvbuf,
1211       int count,
1212       MPI_Datatype datatype,
1213       MPI_Op op,
1214       int root,
1215       MPI_Comm  comm_ptr);
1216 } mv2_reduce_tuning_element;
1217
1218 typedef struct {
1219   int numproc;
1220   int inter_k_degree;
1221   int intra_k_degree;
1222   int is_two_level_reduce[MV2_MAX_NB_THRESHOLDS];
1223   int size_inter_table;
1224   mv2_reduce_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
1225   int size_intra_table;
1226   mv2_reduce_tuning_element intra_node[MV2_MAX_NB_THRESHOLDS];
1227 } mv2_reduce_tuning_table;
1228
1229 int mv2_size_reduce_tuning_table = 0;
1230 mv2_reduce_tuning_table *mv2_reduce_thresholds_table = NULL;
1231
1232
/* K-nomial factors for Reduce (intra and inter); both start at -1. */
int mv2_reduce_intra_knomial_factor = -1;
int mv2_reduce_inter_knomial_factor = -1;
1235
1236 int (*MV2_Reduce_function)( void *sendbuf,
1237     void *recvbuf,
1238     int count,
1239     MPI_Datatype datatype,
1240     MPI_Op op,
1241     int root,
1242     MPI_Comm  comm_ptr)=NULL;
1243
1244 int (*MV2_Reduce_intra_function)( void *sendbuf,
1245     void *recvbuf,
1246     int count,
1247     MPI_Datatype datatype,
1248     MPI_Op op,
1249     int root,
1250     MPI_Comm  comm_ptr)=NULL;
1251
1252
/* MVAPICH2 Reduce algorithm names mapped onto SMPI implementations.
 * Both knomial wrappers resolve to the same function as the plain binomial
 * variant. */
#define MPIR_Reduce_inter_knomial_wrapper_MV2  smpi_coll_tuned_reduce_binomial
#define MPIR_Reduce_intra_knomial_wrapper_MV2  smpi_coll_tuned_reduce_binomial
#define MPIR_Reduce_binomial_MV2               smpi_coll_tuned_reduce_binomial
#define MPIR_Reduce_redscat_gather_MV2         smpi_coll_tuned_reduce_scatter_gather
#define MPIR_Reduce_shmem_MV2                  smpi_coll_tuned_reduce_ompi_basic_linear
1258
1259
1260
1261 static void init_mv2_reduce_tables_stampede(){
1262   /*Stampede*/
1263   mv2_size_reduce_tuning_table = 8;
1264   mv2_reduce_thresholds_table = xbt_malloc(mv2_size_reduce_tuning_table *
1265       sizeof (mv2_reduce_tuning_table));
1266   mv2_reduce_tuning_table mv2_tmp_reduce_thresholds_table[] = {
1267       {
1268           16,
1269           4,
1270           4,
1271           {1, 0, 0},
1272           3,
1273           {
1274               {0, 262144, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1275               {262144, 1048576, &MPIR_Reduce_binomial_MV2},
1276               {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
1277           },
1278           2,
1279           {
1280               {0, 65536, &MPIR_Reduce_shmem_MV2},
1281               {65536,-1,  &MPIR_Reduce_binomial_MV2},
1282           },
1283       },
1284       {
1285           32,
1286           4,
1287           4,
1288           {1, 1, 1, 1, 0, 0, 0},
1289           7,
1290           {
1291               {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1292               {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1293               {16384, 32768, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1294               {32768, 65536, &MPIR_Reduce_binomial_MV2},
1295               {65536, 262144, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1296               {262144, 1048576, &MPIR_Reduce_binomial_MV2},
1297               {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
1298           },
1299           6,
1300           {
1301               {0, 8192, &MPIR_Reduce_shmem_MV2},
1302               {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1303               {16384, 32768, &MPIR_Reduce_shmem_MV2},
1304               {32768, 65536, &MPIR_Reduce_shmem_MV2},
1305               {65536, 262144, &MPIR_Reduce_shmem_MV2},
1306               {262144,-1,  &MPIR_Reduce_binomial_MV2},
1307           },
1308       },
1309       {
1310           64,
1311           4,
1312           4,
1313           {1, 1, 1, 1, 0},
1314           5,
1315           {
1316               {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1317               {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1318               {16384, 65536, &MPIR_Reduce_binomial_MV2},
1319               {65536, 262144, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1320               {262144, -1, &MPIR_Reduce_redscat_gather_MV2},
1321           },
1322           5,
1323           {
1324               {0, 8192, &MPIR_Reduce_shmem_MV2},
1325               {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1326               {16384, 65536, &MPIR_Reduce_shmem_MV2},
1327               {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1328               {262144, -1, &MPIR_Reduce_binomial_MV2},
1329           },
1330       },
1331       {
1332           128,
1333           4,
1334           4,
1335           {1, 0, 1, 0, 1, 0},
1336           6,
1337           {
1338               {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1339               {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1340               {16384, 65536, &MPIR_Reduce_binomial_MV2},
1341               {65536, 262144, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1342               {262144, 1048576, &MPIR_Reduce_binomial_MV2},
1343               {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
1344           },
1345           5,
1346           {
1347               {0, 8192, &MPIR_Reduce_shmem_MV2},
1348               {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1349               {16384, 65536, &MPIR_Reduce_shmem_MV2},
1350               {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1351               {262144, -1, &MPIR_Reduce_binomial_MV2},
1352           },
1353       },
1354       {
1355           256,
1356           4,
1357           4,
1358           {1, 1, 1, 0, 1, 1, 0},
1359           7,
1360           {
1361               {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1362               {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1363               {16384, 32768, &MPIR_Reduce_binomial_MV2},
1364               {32768, 65536, &MPIR_Reduce_binomial_MV2},
1365               {65536, 262144, &MPIR_Reduce_binomial_MV2},
1366               {262144, 1048576, &MPIR_Reduce_binomial_MV2},
1367               {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
1368           },
1369           6,
1370           {
1371               {0, 8192, &MPIR_Reduce_shmem_MV2},
1372               {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1373               {16384, 32768, &MPIR_Reduce_shmem_MV2},
1374               {32768, 65536, &MPIR_Reduce_shmem_MV2},
1375               {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1376               {262144, -1, &MPIR_Reduce_binomial_MV2},
1377           },
1378       },
1379       {
1380           512,
1381           4,
1382           4,
1383           {1, 0, 1, 1, 1, 0},
1384           6,
1385           {
1386               {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1387               {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1388               {16384, 65536, &MPIR_Reduce_binomial_MV2},
1389               {65536, 262144, &MPIR_Reduce_binomial_MV2},
1390               {262144, 1048576, &MPIR_Reduce_binomial_MV2},
1391               {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
1392           },
1393           5,
1394           {
1395               {0, 8192, &MPIR_Reduce_shmem_MV2},
1396               {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1397               {16384, 65536, &MPIR_Reduce_shmem_MV2},
1398               {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1399               {262144, -1, &MPIR_Reduce_binomial_MV2},
1400           },
1401       },
1402       {
1403           1024,
1404           4,
1405           4,
1406           {1, 0, 1, 1, 1},
1407           5,
1408           {
1409               {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1410               {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1411               {16384, 65536, &MPIR_Reduce_binomial_MV2},
1412               {65536, 262144, &MPIR_Reduce_binomial_MV2},
1413               {262144, -1, &MPIR_Reduce_binomial_MV2},
1414           },
1415           5,
1416           {
1417               {0, 8192, &MPIR_Reduce_shmem_MV2},
1418               {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1419               {16384, 65536, &MPIR_Reduce_shmem_MV2},
1420               {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1421               {262144, -1, &MPIR_Reduce_binomial_MV2},
1422           },
1423       },
1424       {
1425           2048,
1426           4,
1427           4,
1428           {1, 0, 1, 1, 1,1},
1429           6,
1430           {
1431               {0, 2048, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1432               {2048, 4096, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1433               {4096, 16384, &MPIR_Reduce_binomial_MV2},
1434               {16384, 65536, &MPIR_Reduce_binomial_MV2},
1435               {65536, 131072, &MPIR_Reduce_binomial_MV2},
1436               {131072, -1, &MPIR_Reduce_binomial_MV2},
1437           },
1438           6,
1439           {
1440               {0, 2048, &MPIR_Reduce_shmem_MV2},
1441               {2048, 4096, &MPIR_Reduce_shmem_MV2},
1442               {4096, 16384, &MPIR_Reduce_shmem_MV2},
1443               {16384, 65536, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1444               {65536, 131072, &MPIR_Reduce_binomial_MV2},
1445               {131072, -1, &MPIR_Reduce_shmem_MV2},
1446           },
1447       },
1448
1449   };
1450   memcpy(mv2_reduce_thresholds_table, mv2_tmp_reduce_thresholds_table,
1451       mv2_size_reduce_tuning_table * sizeof (mv2_reduce_tuning_table));
1452 }
1453
1454 /************ Reduce scatter variables and initializers                        */
1455
1456 typedef struct {
1457   int min;
1458   int max;
1459   int (*MV2_pt_Red_scat_function)(void *sendbuf,
1460       void *recvbuf,
1461       int *recvcnts,
1462       MPI_Datatype datatype,
1463       MPI_Op op,
1464       MPI_Comm comm_ptr);
1465 } mv2_red_scat_tuning_element;
1466
1467 typedef struct {
1468   int numproc;
1469   int size_inter_table;
1470   mv2_red_scat_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
1471 } mv2_red_scat_tuning_table;
1472
1473 int mv2_size_red_scat_tuning_table = 0;
1474 mv2_red_scat_tuning_table *mv2_red_scat_thresholds_table = NULL;
1475
1476
1477 int (*MV2_Red_scat_function)(void *sendbuf,
1478     void *recvbuf,
1479     int *recvcnts,
1480     MPI_Datatype datatype,
1481     MPI_Op op,
1482     MPI_Comm comm_ptr);
1483
1484
1485
1486 static  int MPIR_Reduce_Scatter_Basic_MV2(void *sendbuf,
1487     void *recvbuf,
1488     int *recvcnts,
1489     MPI_Datatype datatype,
1490     MPI_Op op,
1491     MPI_Comm comm)
1492 {
1493   smpi_mpi_reduce_scatter(sendbuf,recvbuf,recvcnts,datatype,op,comm);
1494   return MPI_SUCCESS;
1495 }
/* MVAPICH2 Reduce_scatter algorithm names mapped onto SMPI implementations. */
#define MPIR_Reduce_scatter_non_comm_MV2     smpi_coll_tuned_reduce_scatter_mpich_noncomm
#define MPIR_Reduce_scatter_Rec_Halving_MV2  smpi_coll_tuned_reduce_scatter_ompi_basic_recursivehalving
#define MPIR_Reduce_scatter_Pair_Wise_MV2    smpi_coll_tuned_reduce_scatter_mpich_pair
1499
1500
1501
1502
1503 static void init_mv2_reduce_scatter_tables_stampede(){
1504   mv2_size_red_scat_tuning_table = 6;
1505   mv2_red_scat_thresholds_table = xbt_malloc(mv2_size_red_scat_tuning_table *
1506       sizeof (mv2_red_scat_tuning_table));
1507   mv2_red_scat_tuning_table mv2_tmp_red_scat_thresholds_table[] = {
1508       {
1509           16,
1510           3,
1511           {
1512               {0, 64, &MPIR_Reduce_Scatter_Basic_MV2},
1513               {64, 65536, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1514               {65536, -1, &MPIR_Reduce_scatter_Pair_Wise_MV2},
1515           },
1516       },
1517       {
1518           32,
1519           3,
1520           {
1521               {0, 64, &MPIR_Reduce_Scatter_Basic_MV2},
1522               {64, 131072, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1523               {131072, -1, &MPIR_Reduce_scatter_Pair_Wise_MV2},
1524           },
1525       },
1526       {
1527           64,
1528           3,
1529           {
1530               {0, 1024, &MPIR_Reduce_Scatter_Basic_MV2},
1531               {1024, 262144, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1532               {262144, -1, &MPIR_Reduce_scatter_Pair_Wise_MV2},
1533           },
1534       },
1535       {
1536           128,
1537           2,
1538           {
1539               {0, 128, &MPIR_Reduce_Scatter_Basic_MV2},
1540               {128, -1, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1541           },
1542       },
1543       {
1544           256,
1545           2,
1546           {
1547               {0, 128, &MPIR_Reduce_Scatter_Basic_MV2},
1548               {128, -1, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1549           },
1550       },
1551       {
1552           512,
1553           2,
1554           {
1555               {0, 256, &MPIR_Reduce_Scatter_Basic_MV2},
1556               {256, -1, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1557           },
1558       },
1559
1560   };
1561   memcpy(mv2_red_scat_thresholds_table, mv2_tmp_red_scat_thresholds_table,
1562       mv2_size_red_scat_tuning_table * sizeof (mv2_red_scat_tuning_table));
1563 }
1564
1565 /************ Scatter variables and initializers                        */
1566
1567 typedef struct {
1568   int min;
1569   int max;
1570   int (*MV2_pt_Scatter_function)(void *sendbuf,
1571       int sendcnt,
1572       MPI_Datatype sendtype,
1573       void *recvbuf,
1574       int recvcnt,
1575       MPI_Datatype recvtype,
1576       int root, MPI_Comm comm);
1577 } mv2_scatter_tuning_element;
1578
1579 typedef struct {
1580   int numproc;
1581   int size_inter_table;
1582   mv2_scatter_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
1583   int size_intra_table;
1584   mv2_scatter_tuning_element intra_node[MV2_MAX_NB_THRESHOLDS];
1585 } mv2_scatter_tuning_table;
1586
1587
1588 int *mv2_scatter_table_ppn_conf = NULL;
1589 int mv2_scatter_num_ppn_conf = 1;
1590 int *mv2_size_scatter_tuning_table = NULL;
1591 mv2_scatter_tuning_table **mv2_scatter_thresholds_table = NULL;
1592
1593 int (*MV2_Scatter_function) (void *sendbuf, int sendcount, MPI_Datatype sendtype,
1594     void *recvbuf, int recvcount, MPI_Datatype recvtype,
1595     int root, MPI_Comm comm)=NULL;
1596
1597 int (*MV2_Scatter_intra_function) (void *sendbuf, int sendcount, MPI_Datatype sendtype,
1598     void *recvbuf, int recvcount, MPI_Datatype recvtype,
1599     int root, MPI_Comm comm)=NULL;
1600 int MPIR_Scatter_mcst_wrap_MV2(void *sendbuf,
1601     int sendcnt,
1602     MPI_Datatype sendtype,
1603     void *recvbuf,
1604     int recvcnt,
1605     MPI_Datatype recvtype,
1606     int root, MPI_Comm comm_ptr);
1607
1608 int MPIR_Scatter_mcst_wrap_MV2(void *sendbuf,
1609     int sendcnt,
1610     MPI_Datatype sendtype,
1611     void *recvbuf,
1612     int recvcnt,
1613     MPI_Datatype recvtype,
1614     int root, MPI_Comm comm_ptr)
1615 {
1616   return 0;
1617 }
1618
/* MVAPICH2 Scatter algorithm names mapped onto SMPI implementations.
 * Each two_level name resolves to the same function as its plain variant. */
#define MPIR_Scatter_MV2_Binomial            smpi_coll_tuned_scatter_ompi_binomial
#define MPIR_Scatter_MV2_Direct              smpi_coll_tuned_scatter_ompi_basic_linear
#define MPIR_Scatter_MV2_two_level_Binomial  smpi_coll_tuned_scatter_ompi_binomial
#define MPIR_Scatter_MV2_two_level_Direct    smpi_coll_tuned_scatter_ompi_basic_linear
1623
1624
1625
1626
1627 static void init_mv2_scatter_tables_stampede(){
1628   {
1629     int agg_table_sum = 0;
1630     int i;
1631     mv2_scatter_tuning_table **table_ptrs = NULL;
1632     mv2_scatter_num_ppn_conf = 3;
1633     mv2_scatter_thresholds_table
1634     = xbt_malloc(sizeof(mv2_scatter_tuning_table *)
1635         * mv2_scatter_num_ppn_conf);
1636     table_ptrs = xbt_malloc(sizeof(mv2_scatter_tuning_table *)
1637         * mv2_scatter_num_ppn_conf);
1638     mv2_size_scatter_tuning_table = xbt_malloc(sizeof(int) *
1639         mv2_scatter_num_ppn_conf);
1640     mv2_scatter_table_ppn_conf
1641     = xbt_malloc(mv2_scatter_num_ppn_conf * sizeof(int));
1642     mv2_scatter_table_ppn_conf[0] = 1;
1643     mv2_size_scatter_tuning_table[0] = 6;
1644     mv2_scatter_tuning_table mv2_tmp_scatter_thresholds_table_1ppn[] = {
1645         {2,
1646             1,
1647             {
1648                 {0, -1, &MPIR_Scatter_MV2_Binomial},
1649             },
1650             1,
1651             {
1652                 {0, -1, &MPIR_Scatter_MV2_Binomial},
1653             },
1654         },
1655
1656         {4,
1657             1,
1658             {
1659                 {0, -1, &MPIR_Scatter_MV2_Direct},
1660             },
1661             1,
1662             {
1663                 {0, -1, &MPIR_Scatter_MV2_Direct},
1664             },
1665         },
1666
1667         {8,
1668             1,
1669             {
1670                 {0, -1, &MPIR_Scatter_MV2_Direct},
1671             },
1672             1,
1673             {
1674                 {0, -1, &MPIR_Scatter_MV2_Direct},
1675             },
1676         },
1677
1678         {16,
1679             1,
1680             {
1681                 {0, -1, &MPIR_Scatter_MV2_Direct},
1682             },
1683             1,
1684             {
1685                 {0, -1, &MPIR_Scatter_MV2_Direct},
1686             },
1687         },
1688
1689         {32,
1690             1,
1691             {
1692                 {0, -1, &MPIR_Scatter_MV2_Direct},
1693             },
1694             1,
1695             {
1696                 {0, -1, &MPIR_Scatter_MV2_Direct},
1697             },
1698         },
1699
1700         {64,
1701             2,
1702             {
1703                 {0, 32, &MPIR_Scatter_MV2_Binomial},
1704                 {32, -1, &MPIR_Scatter_MV2_Direct},
1705             },
1706             1,
1707             {
1708                 {0, -1, &MPIR_Scatter_MV2_Binomial},
1709             },
1710         },
1711     };
1712     table_ptrs[0] = mv2_tmp_scatter_thresholds_table_1ppn;
1713     mv2_scatter_table_ppn_conf[1] = 2;
1714     mv2_size_scatter_tuning_table[1] = 6;
1715     mv2_scatter_tuning_table mv2_tmp_scatter_thresholds_table_2ppn[] = {
1716         {4,
1717             2,
1718             {
1719                 {0, 4096, &MPIR_Scatter_MV2_Binomial},
1720                 {4096, -1, &MPIR_Scatter_MV2_Direct},
1721             },
1722             1,
1723             {
1724                 {0, -1, &MPIR_Scatter_MV2_Direct},
1725             },
1726         },
1727
1728         {8,
1729             2,
1730             {
1731                 {0, 512, &MPIR_Scatter_MV2_two_level_Direct},
1732                 {512, -1, &MPIR_Scatter_MV2_Direct},
1733             },
1734             1,
1735             {
1736                 {0, -1, &MPIR_Scatter_MV2_Binomial},
1737             },
1738         },
1739
1740         {16,
1741             2,
1742             {
1743                 {0, 2048, &MPIR_Scatter_MV2_two_level_Direct},
1744                 {2048, -1, &MPIR_Scatter_MV2_Direct},
1745             },
1746             1,
1747             {
1748                 {0, -1, &MPIR_Scatter_MV2_Binomial},
1749             },
1750         },
1751
1752         {32,
1753             2,
1754             {
1755                 {0, 2048, &MPIR_Scatter_MV2_two_level_Direct},
1756                 {2048, -1, &MPIR_Scatter_MV2_Direct},
1757             },
1758             1,
1759             {
1760                 {0, -1, &MPIR_Scatter_MV2_Binomial},
1761             },
1762         },
1763
1764         {64,
1765             2,
1766             {
1767                 {0, 8192, &MPIR_Scatter_MV2_two_level_Direct},
1768                 {8192, -1, &MPIR_Scatter_MV2_Direct},
1769             },
1770             1,
1771             {
1772                 {0, -1, &MPIR_Scatter_MV2_Binomial},
1773             },
1774         },
1775
1776         {128,
1777             4,
1778             {
1779                 {0, 16, &MPIR_Scatter_MV2_Binomial},
1780                 {16, 128, &MPIR_Scatter_MV2_two_level_Binomial},
1781                 {128, 16384, &MPIR_Scatter_MV2_two_level_Direct},
1782                 {16384, -1, &MPIR_Scatter_MV2_Direct},
1783             },
1784             1,
1785             {
1786                 {0, 128, &MPIR_Scatter_MV2_Direct},
1787                 {128, -1, &MPIR_Scatter_MV2_Binomial},
1788             },
1789         },
1790     };
1791     table_ptrs[1] = mv2_tmp_scatter_thresholds_table_2ppn;
1792     mv2_scatter_table_ppn_conf[2] = 16;
1793     mv2_size_scatter_tuning_table[2] = 8;
1794     mv2_scatter_tuning_table mv2_tmp_scatter_thresholds_table_16ppn[] = {
1795         {
1796             16,
1797             2,
1798             {
1799                 {0, 256, &MPIR_Scatter_MV2_Binomial},
1800                 {256, -1, &MPIR_Scatter_MV2_Direct},
1801             },
1802             1,
1803             {
1804                 { 0, -1, &MPIR_Scatter_MV2_Direct},
1805             },
1806         },
1807
1808         {
1809             32,
1810             2,
1811             {
1812                 {0, 512, &MPIR_Scatter_MV2_Binomial},
1813                 {512, -1, &MPIR_Scatter_MV2_Direct},
1814             },
1815             1,
1816             {
1817                 { 0, -1, &MPIR_Scatter_MV2_Direct},
1818             },
1819         },
1820
1821         {
1822             64,
1823             2,
1824             {
1825                 {0, 1024, &MPIR_Scatter_MV2_two_level_Direct},
1826                 {1024, -1, &MPIR_Scatter_MV2_Direct},
1827             },
1828             1,
1829             {
1830                 { 0, -1, &MPIR_Scatter_MV2_Direct},
1831             },
1832         },
1833
1834         {
1835             128,
1836             4,
1837             {
1838                 {0, 16, &MPIR_Scatter_mcst_wrap_MV2},
1839                 {0, 16, &MPIR_Scatter_MV2_two_level_Direct},
1840                 {16, 2048, &MPIR_Scatter_MV2_two_level_Direct},
1841                 {2048, -1, &MPIR_Scatter_MV2_Direct},
1842             },
1843             1,
1844             {
1845                 { 0, -1, &MPIR_Scatter_MV2_Direct},
1846             },
1847         },
1848
1849         {
1850             256,
1851             4,
1852             {
1853                 {0, 16, &MPIR_Scatter_mcst_wrap_MV2},
1854                 {0, 16, &MPIR_Scatter_MV2_two_level_Direct},
1855                 {16, 2048, &MPIR_Scatter_MV2_two_level_Direct},
1856                 {2048, -1,  &MPIR_Scatter_MV2_Direct},
1857             },
1858             1,
1859             {
1860                 { 0, -1, &MPIR_Scatter_MV2_Direct},
1861             },
1862         },
1863
1864         {
1865             512,
1866             4,
1867             {
1868                 {0, 16, &MPIR_Scatter_mcst_wrap_MV2},
1869                 {16, 16, &MPIR_Scatter_MV2_two_level_Direct},
1870                 {16, 4096, &MPIR_Scatter_MV2_two_level_Direct},
1871                 {4096, -1, &MPIR_Scatter_MV2_Direct},
1872             },
1873             1,
1874             {
1875                 { 0, -1, &MPIR_Scatter_MV2_Binomial},
1876             },
1877         },
1878         {
1879             1024,
1880             5,
1881             {
1882                 {0, 16, &MPIR_Scatter_mcst_wrap_MV2},
1883                 {0, 16,  &MPIR_Scatter_MV2_Binomial},
1884                 {16, 32, &MPIR_Scatter_MV2_Binomial},
1885                 {32, 4096, &MPIR_Scatter_MV2_two_level_Direct},
1886                 {4096, -1, &MPIR_Scatter_MV2_Direct},
1887             },
1888             1,
1889             {
1890                 { 0, -1, &MPIR_Scatter_MV2_Binomial},
1891             },
1892         },
1893         {
1894             2048,
1895             7,
1896             {
1897                 {0, 16, &MPIR_Scatter_mcst_wrap_MV2},
1898                 {0, 16,  &MPIR_Scatter_MV2_two_level_Binomial},
1899                 {16, 128, &MPIR_Scatter_MV2_two_level_Binomial},
1900                 {128, 1024, &MPIR_Scatter_MV2_two_level_Direct},
1901                 {1024, 16384, &MPIR_Scatter_MV2_two_level_Direct},
1902                 {16384, 65536, &MPIR_Scatter_MV2_Direct},
1903                 {65536, -1, &MPIR_Scatter_MV2_two_level_Direct},
1904             },
1905             6,
1906             {
1907                 {0, 16, &MPIR_Scatter_MV2_Binomial},
1908                 {16, 128, &MPIR_Scatter_MV2_Binomial},
1909                 {128, 1024, &MPIR_Scatter_MV2_Binomial},
1910                 {1024, 16384, &MPIR_Scatter_MV2_Direct},
1911                 {16384, 65536, &MPIR_Scatter_MV2_Direct},
1912                 {65536, -1, &MPIR_Scatter_MV2_Direct},
1913             },
1914         },
1915     };
1916     table_ptrs[2] = mv2_tmp_scatter_thresholds_table_16ppn;
1917     agg_table_sum = 0;
1918     for (i = 0; i < mv2_scatter_num_ppn_conf; i++) {
1919         agg_table_sum += mv2_size_scatter_tuning_table[i];
1920     }
1921     mv2_scatter_thresholds_table[0] =
1922         xbt_malloc(agg_table_sum * sizeof (mv2_scatter_tuning_table));
1923     memcpy(mv2_scatter_thresholds_table[0], table_ptrs[0],
1924         (sizeof(mv2_scatter_tuning_table)
1925             * mv2_size_scatter_tuning_table[0]));
1926     for (i = 1; i < mv2_scatter_num_ppn_conf; i++) {
1927         mv2_scatter_thresholds_table[i] =
1928             mv2_scatter_thresholds_table[i - 1]
1929                                          + mv2_size_scatter_tuning_table[i - 1];
1930         memcpy(mv2_scatter_thresholds_table[i], table_ptrs[i],
1931             (sizeof(mv2_scatter_tuning_table)
1932                 * mv2_size_scatter_tuning_table[i]));
1933     }
1934     xbt_free(table_ptrs);
1935   }
1936 }
1937