Logo AND Algorithmique Numérique Distribuée

Public GIT Repository
Add last collectives from mvapich selector : bcast reduce reduce_scatter scatter
[simgrid.git] / src / smpi / colls / smpi_mvapich2_selector.c
1 /* selector for collective algorithms based on mvapich decision logic */
2
3 /* Copyright (c) 2009-2010, 2013-2014. The SimGrid Team.
4  * All rights reserved.                                                     */
5
6 /* This program is free software; you can redistribute it and/or modify it
7  * under the terms of the license (GNU LGPL) which comes with this package. */
8
9 #include "colls_private.h"
10
11 #include "smpi_mvapich2_selector_stampede.h"
12
13
/*
 * Build the alltoall tuning tables (hard-coded "stampede" values).
 *
 * Allocates the global per-configuration arrays, defines three local tables
 * (ppn configuration values 1, 2 and 16), then copies them back to back into
 * one heap buffer; mv2_alltoall_thresholds_table[i] ends up pointing at the
 * start of table i inside that buffer.
 *
 * Only the scratch array table_ptrs is freed here; the other allocations stay
 * reachable through the globals. malloc() results are not checked.
 *
 * NOTE(review): each entry looks like
 *   {numproc, size_table, algo_table{min, max, function},
 *    in_place_algo_table{min, max, function}}
 * -- field order inferred from the field names the selector reads; confirm
 * against the struct definition in the header.
 */
14 static void init_mv2_alltoall_tables_stampede(){
15 int i;
16   int agg_table_sum = 0;
17 mv2_alltoall_tuning_table **table_ptrs = NULL;
18    mv2_alltoall_num_ppn_conf = 3;
19         mv2_alltoall_thresholds_table
20           = malloc(sizeof(mv2_alltoall_tuning_table *)
21                         * mv2_alltoall_num_ppn_conf);
22         table_ptrs = malloc(sizeof(mv2_alltoall_tuning_table *)
23                                  * mv2_alltoall_num_ppn_conf);
24         mv2_size_alltoall_tuning_table = malloc(sizeof(int) *
25                                                      mv2_alltoall_num_ppn_conf);
26         mv2_alltoall_table_ppn_conf =malloc(mv2_alltoall_num_ppn_conf * sizeof(int));
/* Configuration 0: ppn value 1, 6 entries. */
27         mv2_alltoall_table_ppn_conf[0] = 1;
28         mv2_size_alltoall_tuning_table[0] = 6;
29         mv2_alltoall_tuning_table mv2_tmp_alltoall_thresholds_table_1ppn[] = {
30           {2,
31            1, 
32            {{0, -1, &MPIR_Alltoall_pairwise_MV2},
33            },
34   
35            {{0, -1, &MPIR_Alltoall_inplace_MV2},
36            },
37           },
38   
39           {4,
40            2,
41            {{0, 262144, &MPIR_Alltoall_Scatter_dest_MV2},
42             {262144, -1, &MPIR_Alltoall_pairwise_MV2},
43            },
44                 
45            {{0, -1, &MPIR_Alltoall_inplace_MV2},
46            },
47           },
48   
49           {8,
50            2,
51            {{0, 8, &MPIR_Alltoall_RD_MV2},
52             {8, -1, &MPIR_Alltoall_Scatter_dest_MV2},
53            },
54   
55            {{0, -1, &MPIR_Alltoall_inplace_MV2},
56            },
57           },
58   
59           {16,
60            3,
61            {{0, 64, &MPIR_Alltoall_RD_MV2},
62             {64, 512, &MPIR_Alltoall_bruck_MV2},
63             {512, -1, &MPIR_Alltoall_Scatter_dest_MV2},
64            },
65   
66            {{0,-1, &MPIR_Alltoall_inplace_MV2},
67            },
68           },
69   
70           {32,
71            3,
72            {{0, 32, &MPIR_Alltoall_RD_MV2},
73             {32, 2048, &MPIR_Alltoall_bruck_MV2},
74             {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
75            },
76   
77            {{0, -1, &MPIR_Alltoall_inplace_MV2},
78            },
79           },
80   
81           {64,
82            3,
83            {{0, 8, &MPIR_Alltoall_RD_MV2},
84             {8, 1024, &MPIR_Alltoall_bruck_MV2},
85             {1024, -1, &MPIR_Alltoall_Scatter_dest_MV2},
86            },
87   
88            {{0, -1, &MPIR_Alltoall_inplace_MV2},
89            },
90           },
91         };
92         table_ptrs[0] = mv2_tmp_alltoall_thresholds_table_1ppn;
/* Configuration 1: ppn value 2, 6 entries. */
93         mv2_alltoall_table_ppn_conf[1] = 2;
94         mv2_size_alltoall_tuning_table[1] = 6;
95         mv2_alltoall_tuning_table mv2_tmp_alltoall_thresholds_table_2ppn[] = {
96           {4,
97            2,
98            {{0, 32, &MPIR_Alltoall_RD_MV2},
99             {32, -1, &MPIR_Alltoall_Scatter_dest_MV2},
100            },
101                 
102            {{0, -1, &MPIR_Alltoall_inplace_MV2},
103            },
104           },
105   
106           {8,
107            2,
108            {{0, 64, &MPIR_Alltoall_RD_MV2},
109             {64, -1, &MPIR_Alltoall_Scatter_dest_MV2},
110            },
111                 
112            {{0, -1, &MPIR_Alltoall_inplace_MV2},
113            },
114           },
115   
116           {16,
117            3,
118            {{0, 64, &MPIR_Alltoall_RD_MV2},
119             {64, 2048, &MPIR_Alltoall_bruck_MV2},
120             {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
121            },
122   
123            {{0,-1, &MPIR_Alltoall_inplace_MV2},
124            },
125           },
126   
127           {32,
128            3,
129            {{0, 16, &MPIR_Alltoall_RD_MV2},
130             {16, 2048, &MPIR_Alltoall_bruck_MV2},
131             {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
132            },
133   
134            {{0, -1, &MPIR_Alltoall_inplace_MV2},
135            },
136           },
137   
138           {64,
139            3,
140            {{0, 8, &MPIR_Alltoall_RD_MV2},
141             {8, 1024, &MPIR_Alltoall_bruck_MV2},
142             {1024, -1, &MPIR_Alltoall_Scatter_dest_MV2},
143            },
144   
145            {{0, -1, &MPIR_Alltoall_inplace_MV2},
146            },
147           },
148
149           {128,
150            3,
151            {{0, 4, &MPIR_Alltoall_RD_MV2},
152             {4, 2048, &MPIR_Alltoall_bruck_MV2},
153             {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
154            },
155   
156            {{0, -1, &MPIR_Alltoall_inplace_MV2},
157            },
158           },
159         };
160         table_ptrs[1] = mv2_tmp_alltoall_thresholds_table_2ppn;
/* Configuration 2: ppn value 16, 7 entries. */
161         mv2_alltoall_table_ppn_conf[2] = 16;
162         mv2_size_alltoall_tuning_table[2] = 7;
163         mv2_alltoall_tuning_table mv2_tmp_alltoall_thresholds_table_16ppn[] = {
164           {16,
165            2, 
166            {{0, 2048, &MPIR_Alltoall_bruck_MV2},
167             {2048, -1,  &MPIR_Alltoall_Scatter_dest_MV2},
168            },
169   
170            {{32768, -1, &MPIR_Alltoall_inplace_MV2},
171            },
172           },
173   
174           {32,
175            2,
176            {{0, 2048, &MPIR_Alltoall_bruck_MV2},
177             {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
178            },
179                 
180            {{16384, -1, &MPIR_Alltoall_inplace_MV2},
181            },
182           },
183   
184           {64,
185            3,
186            {{0, 2048, &MPIR_Alltoall_bruck_MV2},
187             {2048, 16384, &MPIR_Alltoall_Scatter_dest_MV2},
188             {16384, -1, &MPIR_Alltoall_pairwise_MV2},
189            },
190   
191            {{32768, 131072, &MPIR_Alltoall_inplace_MV2},
192            },
193           },
194   
195           {128,
196            2,
197            {{0, 2048, &MPIR_Alltoall_bruck_MV2},
198             {2048, -1, &MPIR_Alltoall_pairwise_MV2},
199            },
200   
201            {{16384,65536, &MPIR_Alltoall_inplace_MV2},
202            },
203           },
204   
205           {256,
206            2,
207            {{0, 1024, &MPIR_Alltoall_bruck_MV2},
208             {1024, -1, &MPIR_Alltoall_pairwise_MV2},
209            },
210   
211            {{16384, 65536, &MPIR_Alltoall_inplace_MV2},
212            },
213           },
214   
215           {512,
216            2,
217            {{0, 1024, &MPIR_Alltoall_bruck_MV2},
218             {1024, -1, &MPIR_Alltoall_pairwise_MV2},
219            },
220   
221            {{16384, 65536, &MPIR_Alltoall_inplace_MV2},
222            },
223           },
224           {1024,
225            2,
226            {{0, 1024, &MPIR_Alltoall_bruck_MV2},
227             {1024, -1, &MPIR_Alltoall_pairwise_MV2},
228            },
229   
230            {{16384, 65536, &MPIR_Alltoall_inplace_MV2},
231            },
232           },
233   
234         };
235         table_ptrs[2] = mv2_tmp_alltoall_thresholds_table_16ppn;
/* Total number of entries over all configurations. */
236         agg_table_sum = 0;
237         for (i = 0; i < mv2_alltoall_num_ppn_conf; i++) {
238           agg_table_sum += mv2_size_alltoall_tuning_table[i];
239         }
/* One contiguous buffer holds every table; table 0 starts at its beginning. */
240         mv2_alltoall_thresholds_table[0] =
241           malloc(agg_table_sum * sizeof (mv2_alltoall_tuning_table));
242         memcpy(mv2_alltoall_thresholds_table[0], table_ptrs[0],
243                     (sizeof(mv2_alltoall_tuning_table)
244                      * mv2_size_alltoall_tuning_table[0]));
/* Each following table starts right after the previous one in that buffer. */
245         for (i = 1; i < mv2_alltoall_num_ppn_conf; i++) {
246           mv2_alltoall_thresholds_table[i] =
247             mv2_alltoall_thresholds_table[i - 1]
248             + mv2_size_alltoall_tuning_table[i - 1];
249           memcpy(mv2_alltoall_thresholds_table[i], table_ptrs[i],
250                       (sizeof(mv2_alltoall_tuning_table)
251                        * mv2_size_alltoall_tuning_table[i]));
252         }
253         free(table_ptrs);
254         
255         
256 }
257                             
258 int smpi_coll_tuned_alltoall_mvapich2( void *sendbuf, int sendcount, 
259                                              MPI_Datatype sendtype,
260                                              void* recvbuf, int recvcount, 
261                                              MPI_Datatype recvtype, 
262                                              MPI_Comm comm)
263 {
264
265     if(mv2_alltoall_table_ppn_conf==NULL)
266         init_mv2_alltoall_tables_stampede();
267         
268     int sendtype_size, recvtype_size, nbytes, comm_size;
269     char * tmp_buf = NULL;
270     int mpi_errno=MPI_SUCCESS;
271     int range = 0;
272     int range_threshold = 0;
273     int conf_index = 0;
274     comm_size =  smpi_comm_size(comm);
275
276     sendtype_size=smpi_datatype_size(sendtype);
277     recvtype_size=smpi_datatype_size(recvtype);
278     nbytes = sendtype_size * sendcount;
279
280     /* check if safe to use partial subscription mode */
281
282     /* Search for the corresponding system size inside the tuning table */
283     while ((range < (mv2_size_alltoall_tuning_table[conf_index] - 1)) &&
284            (comm_size > mv2_alltoall_thresholds_table[conf_index][range].numproc)) {
285         range++;
286     }    
287     /* Search for corresponding inter-leader function */
288     while ((range_threshold < (mv2_alltoall_thresholds_table[conf_index][range].size_table - 1))
289            && (nbytes >
290                mv2_alltoall_thresholds_table[conf_index][range].algo_table[range_threshold].max)
291            && (mv2_alltoall_thresholds_table[conf_index][range].algo_table[range_threshold].max != -1)) {
292         range_threshold++;
293     }     
294     MV2_Alltoall_function = mv2_alltoall_thresholds_table[conf_index][range].algo_table[range_threshold]
295                                 .MV2_pt_Alltoall_function;
296
297     if(sendbuf != MPI_IN_PLACE) {  
298         mpi_errno = MV2_Alltoall_function(sendbuf, sendcount, sendtype,
299                                               recvbuf, recvcount, recvtype,
300                                                comm);
301     } else {
302         range_threshold = 0; 
303         if(nbytes < 
304           mv2_alltoall_thresholds_table[conf_index][range].in_place_algo_table[range_threshold].min
305           ||nbytes > mv2_alltoall_thresholds_table[conf_index][range].in_place_algo_table[range_threshold].max
306           ) {
307             tmp_buf = (char *)malloc( comm_size * recvcount * recvtype_size );
308             mpi_errno = smpi_datatype_copy((char *)recvbuf,
309                                        comm_size*recvcount, recvtype,
310                                        (char *)tmp_buf,
311                                        comm_size*recvcount, recvtype);
312
313             mpi_errno = MV2_Alltoall_function(tmp_buf, recvcount, recvtype,
314                                                recvbuf, recvcount, recvtype,
315                                                 comm );        
316             free(tmp_buf);
317         } else { 
318             mpi_errno = MPIR_Alltoall_inplace_MV2(sendbuf, sendcount, sendtype,
319                                               recvbuf, recvcount, recvtype,
320                                                comm );
321         } 
322     }
323
324     
325     return (mpi_errno);
326 }
327
328
/*
 * Build the allgather tuning tables (hard-coded "stampede" values).
 *
 * Allocates the global per-configuration arrays, defines three local tables
 * (ppn configuration values 1, 2 and 16, 6 entries each), then copies them
 * back to back into one heap buffer; mv2_allgather_thresholds_table[i] ends up
 * pointing at the start of table i inside that buffer.
 *
 * Only the scratch array table_ptrs is freed here; the other allocations stay
 * reachable through the globals. malloc() results are not checked.
 *
 * NOTE(review): each entry looks like
 *   {numproc, two_level[], size_inter_table, inter_leader{min, max, function}}
 * -- field order inferred from the field names the selector reads; confirm
 * against the struct definition in the header.
 */
329 static void init_mv2_allgather_tables_stampede(){
330 int i;
331   int agg_table_sum = 0;
332 mv2_allgather_tuning_table **table_ptrs = NULL;
333  mv2_allgather_num_ppn_conf = 3;
334         mv2_allgather_thresholds_table
335             = malloc(sizeof(mv2_allgather_tuning_table *)
336                   * mv2_allgather_num_ppn_conf);
337         table_ptrs = malloc(sizeof(mv2_allgather_tuning_table *)
338                                  * mv2_allgather_num_ppn_conf);
339         mv2_size_allgather_tuning_table = malloc(sizeof(int) *
340                                                       mv2_allgather_num_ppn_conf);
341         mv2_allgather_table_ppn_conf 
342             = malloc(mv2_allgather_num_ppn_conf * sizeof(int));
/* Configuration 0: ppn value 1, 6 entries. */
343         mv2_allgather_table_ppn_conf[0] = 1;
344         mv2_size_allgather_tuning_table[0] = 6;
345         mv2_allgather_tuning_table mv2_tmp_allgather_thresholds_table_1ppn[] = {
346             {
347                 2,
348                 {0},
349                 1,
350                 {
351                     {0, -1, &MPIR_Allgather_Ring_MV2},
352                 },
353             },
354             {
355                 4,
356                 {0,0},
357                 2,
358                 {
359                     {0, 262144, &MPIR_Allgather_RD_MV2},
360                     {262144, -1, &MPIR_Allgather_Ring_MV2},
361                 },
362             },
363             {
364                 8,
365                 {0,0},
366                 2,
367                 {
368                     {0, 131072, &MPIR_Allgather_RD_MV2},
369                     {131072, -1, &MPIR_Allgather_Ring_MV2},
370                 },
371             },
372             {
373                 16,
374                 {0,0},
375                 2,
376                 {
377                     {0, 131072, &MPIR_Allgather_RD_MV2},
378                     {131072, -1, &MPIR_Allgather_Ring_MV2},
379                 },
380             },
381             {
382                 32,
383                 {0,0},
384                 2,
385                 {
386                     {0, 65536, &MPIR_Allgather_RD_MV2},
387                     {65536, -1, &MPIR_Allgather_Ring_MV2},
388                 },
389             },
390             {
391                 64,
392                 {0,0},
393                 2,
394                 {
395                     {0, 32768, &MPIR_Allgather_RD_MV2},
396                     {32768, -1, &MPIR_Allgather_Ring_MV2},
397                 },
398             },
399         };
400         table_ptrs[0] = mv2_tmp_allgather_thresholds_table_1ppn;
/* Configuration 1: ppn value 2, 6 entries. */
401         mv2_allgather_table_ppn_conf[1] = 2;
402         mv2_size_allgather_tuning_table[1] = 6;
/* NOTE(review): from the second entry on, three ranges are listed but the
 * count value is 2 (presumably size_inter_table), so a search bounded by that
 * count never reaches the third range -- confirm against upstream. */
403         mv2_allgather_tuning_table mv2_tmp_allgather_thresholds_table_2ppn[] = {
404             {
405                 4,
406                 {0,0},
407                 2,
408                 {
409                     {0, 524288, &MPIR_Allgather_RD_MV2},
410                     {524288, -1, &MPIR_Allgather_Ring_MV2},
411                 },
412             },
413             {
414                 8,
415                 {0,1,0},
416                 2,
417                 {
418                     {0, 32768, &MPIR_Allgather_RD_MV2},
419                     {32768, 524288, &MPIR_Allgather_Ring_MV2},
420                     {524288, -1, &MPIR_Allgather_Ring_MV2},
421                 },
422             },
423             {
424                 16,
425                 {0,1,0},
426                 2,
427                 {
428                     {0, 16384, &MPIR_Allgather_RD_MV2},
429                     {16384, 524288, &MPIR_Allgather_Ring_MV2},
430                     {524288, -1, &MPIR_Allgather_Ring_MV2},
431                 },
432             },
433             {
434                 32,
435                 {1,1,0},
436                 2,
437                 {
438                     {0, 65536, &MPIR_Allgather_RD_MV2},
439                     {65536, 524288, &MPIR_Allgather_Ring_MV2},
440                     {524288, -1, &MPIR_Allgather_Ring_MV2},
441                 },
442             },
443             {
444                 64,
445                 {1,1,0},
446                 2,
447                 {
448                     {0, 32768, &MPIR_Allgather_RD_MV2},
449                     {32768, 524288, &MPIR_Allgather_Ring_MV2},
450                     {524288, -1, &MPIR_Allgather_Ring_MV2},
451                 },
452             },
453             {
454                 128,
455                 {1,1,0},
456                 2,
457                 {
458                     {0, 65536, &MPIR_Allgather_RD_MV2},
459                     {65536, 524288, &MPIR_Allgather_Ring_MV2},
460                     {524288, -1, &MPIR_Allgather_Ring_MV2},
461                 },
462             },
463         };
464         table_ptrs[1] = mv2_tmp_allgather_thresholds_table_2ppn;
/* Configuration 2: ppn value 16, 6 entries. */
465         mv2_allgather_table_ppn_conf[2] = 16;
466         mv2_size_allgather_tuning_table[2] = 6;
467         mv2_allgather_tuning_table mv2_tmp_allgather_thresholds_table_16ppn[] = {
468             {
469                 16,
470                 {0,0},
471                 2,
472                 {
473                     {0, 1024, &MPIR_Allgather_RD_MV2},
474                     {1024, -1, &MPIR_Allgather_Ring_MV2},
475                 },
476             },
477             {
478                 32,
479                 {0,0},
480                 2,
481                 {
482                     {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
483                     {1024, -1, &MPIR_Allgather_Ring_MV2},
484                 },
485             },
486             {
487                 64,
488                 {0,0},
489                 2,
490                 {
491                     {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
492                     {1024, -1, &MPIR_Allgather_Ring_MV2},
493                 },
494             },
495             {
496                 128,
497                 {0,0},
498                 2,
499                 {
500                     {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
501                     {1024, -1, &MPIR_Allgather_Ring_MV2},
502                 },
503             },
504             {
505                 256,
506                 {0,0},
507                 2,
508                 {
509                     {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
510                     {1024, -1, &MPIR_Allgather_Ring_MV2},
511                 },
512             },
513             {
514                 512,
515                 {0,0},
516                 2,
517                 {
518                     {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
519                     {1024, -1, &MPIR_Allgather_Ring_MV2},
520                 },
521             },
522
523         };
524         table_ptrs[2] = mv2_tmp_allgather_thresholds_table_16ppn;
/* Total number of entries over all configurations. */
525         agg_table_sum = 0;
526         for (i = 0; i < mv2_allgather_num_ppn_conf; i++) {
527             agg_table_sum += mv2_size_allgather_tuning_table[i];
528         }
/* One contiguous buffer holds every table; table 0 starts at its beginning. */
529         mv2_allgather_thresholds_table[0] =
530             malloc(agg_table_sum * sizeof (mv2_allgather_tuning_table));
531         memcpy(mv2_allgather_thresholds_table[0], table_ptrs[0],
532             (sizeof(mv2_allgather_tuning_table)
533                      * mv2_size_allgather_tuning_table[0]));
/* Each following table starts right after the previous one in that buffer. */
534         for (i = 1; i < mv2_allgather_num_ppn_conf; i++) {
535             mv2_allgather_thresholds_table[i] =
536             mv2_allgather_thresholds_table[i - 1]
537             + mv2_size_allgather_tuning_table[i - 1];
538             memcpy(mv2_allgather_thresholds_table[i], table_ptrs[i],
539                       (sizeof(mv2_allgather_tuning_table)
540                        * mv2_size_allgather_tuning_table[i]));
541         }
542         free(table_ptrs);
543 }
544
545 int smpi_coll_tuned_allgather_mvapich2(void *sendbuf, int sendcount, MPI_Datatype sendtype,
546                        void *recvbuf, int recvcount, MPI_Datatype recvtype,
547                        MPI_Comm comm)
548 {
549
550     int mpi_errno = MPI_SUCCESS;
551     int nbytes = 0, comm_size, recvtype_size;
552     int range = 0;
553     //int partial_sub_ok = 0;
554     int conf_index = 0;
555     int range_threshold = 0;
556     int is_two_level = 0;
557     //int local_size = -1;
558     //MPI_Comm shmem_comm;
559     //MPI_Comm *shmem_commptr=NULL;
560     /* Get the size of the communicator */
561     comm_size = smpi_comm_size(comm);
562     recvtype_size=smpi_datatype_size(recvtype);
563     nbytes = recvtype_size * recvcount;
564
565     if(mv2_allgather_table_ppn_conf==NULL)
566         init_mv2_allgather_tables_stampede();
567         
568     //int i;
569     /* check if safe to use partial subscription mode */
570   /*  if (comm->ch.shmem_coll_ok == 1 && comm->ch.is_uniform) {
571     
572         shmem_comm = comm->ch.shmem_comm;
573         MPID_Comm_get_ptr(shmem_comm, shmem_commptr);
574         local_size = shmem_commptr->local_size;
575         i = 0;
576         if (mv2_allgather_table_ppn_conf[0] == -1) {
577             // Indicating user defined tuning
578             conf_index = 0;
579             goto conf_check_end;
580         }
581         do {
582             if (local_size == mv2_allgather_table_ppn_conf[i]) {
583                 conf_index = i;
584                 partial_sub_ok = 1;
585                 break;
586             }
587             i++;
588         } while(i < mv2_allgather_num_ppn_conf);
589     }
590
591   conf_check_end:
592     if (partial_sub_ok != 1) {
593         conf_index = 0;
594     }*/
595     /* Search for the corresponding system size inside the tuning table */
596     while ((range < (mv2_size_allgather_tuning_table[conf_index] - 1)) &&
597            (comm_size >
598             mv2_allgather_thresholds_table[conf_index][range].numproc)) {
599         range++;
600     }
601     /* Search for corresponding inter-leader function */
602     while ((range_threshold <
603          (mv2_allgather_thresholds_table[conf_index][range].size_inter_table - 1))
604            && (nbytes > mv2_allgather_thresholds_table[conf_index][range].inter_leader[range_threshold].max)
605            && (mv2_allgather_thresholds_table[conf_index][range].inter_leader[range_threshold].max !=
606                -1)) {
607         range_threshold++;
608     }
609
610     /* Set inter-leader pt */
611     MV2_Allgather_function =
612                           mv2_allgather_thresholds_table[conf_index][range].inter_leader[range_threshold].
613                           MV2_pt_Allgather_function;
614
615     is_two_level =  mv2_allgather_thresholds_table[conf_index][range].two_level[range_threshold];
616
617     /* intracommunicator */
618     if(is_two_level ==1){
619         
620  /*       if(comm->ch.shmem_coll_ok == 1){
621             MPIR_T_PVAR_COUNTER_INC(MV2, mv2_num_shmem_coll_calls, 1);
622            if (1 == comm->ch.is_blocked) {
623                 mpi_errno = MPIR_2lvl_Allgather_MV2(sendbuf, sendcount, sendtype,
624                                                     recvbuf, recvcount, recvtype,
625                                                     comm, errflag);
626            }
627            else {
628                mpi_errno = MPIR_Allgather_intra(sendbuf, sendcount, sendtype,
629                                                 recvbuf, recvcount, recvtype,
630                                                 comm, errflag);
631            }
632         } else {*/
633             mpi_errno = MPIR_Allgather_RD_MV2(sendbuf, sendcount, sendtype,
634                                                 recvbuf, recvcount, recvtype,
635                                                 comm);
636    //     }
637     } else if(MV2_Allgather_function == &MPIR_Allgather_Bruck_MV2 
638             || MV2_Allgather_function == &MPIR_Allgather_RD_MV2
639             || MV2_Allgather_function == &MPIR_Allgather_Ring_MV2) {
640             mpi_errno = MV2_Allgather_function(sendbuf, sendcount, sendtype,
641                                           recvbuf, recvcount, recvtype,
642                                           comm);
643     }else{
644       return MPI_ERR_OTHER;
645     }
646
647     return mpi_errno;
648 }
649
/*
 * Build the gather tuning table (hard-coded "stampede" values): sets the
 * entry count to 7, allocates mv2_gather_thresholds_table and copies the
 * local table below into it. The malloc() result is not checked.
 *
 * NOTE(review): each entry looks like
 *   {numproc, size_inter_table, inter_leader{min, max, function},
 *    size_intra_table, intra_node{min, max, function}}
 * -- field order inferred from the field names the selector reads; confirm
 * against the struct definition in the header.
 */
650 static void init_mv2_gather_tables_stampede(){
651
652  mv2_size_gather_tuning_table=7;
653       mv2_gather_thresholds_table = malloc(mv2_size_gather_tuning_table*
654                                                 sizeof (mv2_gather_tuning_table)); 
655       mv2_gather_tuning_table mv2_tmp_gather_thresholds_table[]={
656         {16,
657          2,{{0, 524288, &MPIR_Gather_MV2_Direct},
658             {524288, -1, &MPIR_Gather_intra}},
659          1,{{0, -1, &MPIR_Gather_MV2_Direct}}},
660         {32,
661          3,{{0, 16384, &MPIR_Gather_MV2_Direct}, 
662             {16384, 131072, &MPIR_Gather_intra},
663             {131072, -1, &MPIR_Gather_MV2_two_level_Direct}},
664          1,{{0, -1, &MPIR_Gather_intra}}},
665         {64,
666          3,{{0, 256, &MPIR_Gather_MV2_two_level_Direct}, 
667             {256, 16384, &MPIR_Gather_MV2_Direct},
/* NOTE(review): the next range starts at 256 although the previous one ends
 * at 16384 (the 128 and 256 entries use 16384) -- possibly a typo; confirm
 * against upstream. */
668             {256, -1, &MPIR_Gather_MV2_two_level_Direct}},
669          1,{{0, -1, &MPIR_Gather_intra}}},
670         {128,
671          3,{{0, 512, &MPIR_Gather_MV2_two_level_Direct}, 
672             {512, 16384, &MPIR_Gather_MV2_Direct},
673             {16384, -1, &MPIR_Gather_MV2_two_level_Direct}},
674          1,{{0, -1, &MPIR_Gather_intra}}},
675         {256,
676          3,{{0, 512, &MPIR_Gather_MV2_two_level_Direct}, 
677             {512, 16384, &MPIR_Gather_MV2_Direct},
678             {16384, -1, &MPIR_Gather_MV2_two_level_Direct}},
679          1,{{0, -1, &MPIR_Gather_intra}}},
680         {512,
681          3,{{0, 512, &MPIR_Gather_MV2_two_level_Direct}, 
682             {512, 16384, &MPIR_Gather_MV2_Direct},
/* NOTE(review): 8196 here (and in the 1024 entry) does not match the 16384
 * that ends the previous range -- confirm against upstream. */
683             {8196, -1, &MPIR_Gather_MV2_two_level_Direct}},
684          1,{{0, -1, &MPIR_Gather_intra}}},
685         {1024,
686          3,{{0, 512, &MPIR_Gather_MV2_two_level_Direct}, 
687             {512, 16384, &MPIR_Gather_MV2_Direct},
688             {8196, -1, &MPIR_Gather_MV2_two_level_Direct}},
689          1,{{0, -1, &MPIR_Gather_intra}}},
690       };
691
/* Publish the local table through the global pointer. */
692       memcpy(mv2_gather_thresholds_table, mv2_tmp_gather_thresholds_table,
693                   mv2_size_gather_tuning_table * sizeof (mv2_gather_tuning_table));
694
695 }
696
697
698 int smpi_coll_tuned_gather_mvapich2(void *sendbuf,
699                     int sendcnt,
700                     MPI_Datatype sendtype,
701                     void *recvbuf,
702                     int recvcnt,
703                     MPI_Datatype recvtype,
704                     int root, MPI_Comm  comm)
705 {
706     if(mv2_gather_thresholds_table==NULL)
707         init_mv2_gather_tables_stampede();
708         
709     int mpi_errno = MPI_SUCCESS;
710     int range = 0;
711     int range_threshold = 0;
712     int range_intra_threshold = 0;
713     int nbytes = 0;
714     int comm_size = 0;
715     int recvtype_size, sendtype_size;
716     int rank = -1;
717     comm_size = smpi_comm_size(comm);
718     rank = smpi_comm_rank(comm);
719
720     if (rank == root) {
721         recvtype_size=smpi_datatype_size(recvtype);
722         nbytes = recvcnt * recvtype_size;
723     } else {
724         sendtype_size=smpi_datatype_size(sendtype);
725         nbytes = sendcnt * sendtype_size;
726     }
727     
728     /* Search for the corresponding system size inside the tuning table */
729     while ((range < (mv2_size_gather_tuning_table - 1)) &&
730            (comm_size > mv2_gather_thresholds_table[range].numproc)) {
731         range++;
732     }
733     /* Search for corresponding inter-leader function */
734     while ((range_threshold < (mv2_gather_thresholds_table[range].size_inter_table - 1))
735            && (nbytes >
736                mv2_gather_thresholds_table[range].inter_leader[range_threshold].max)
737            && (mv2_gather_thresholds_table[range].inter_leader[range_threshold].max !=
738                -1)) {
739         range_threshold++;
740     }
741
742     /* Search for corresponding intra node function */
743     while ((range_intra_threshold < (mv2_gather_thresholds_table[range].size_intra_table - 1))
744            && (nbytes >
745                mv2_gather_thresholds_table[range].intra_node[range_intra_threshold].max)
746            && (mv2_gather_thresholds_table[range].intra_node[range_intra_threshold].max !=
747                -1)) {
748         range_intra_threshold++;
749     }
750 /*
751     if (comm->ch.is_global_block == 1 && mv2_use_direct_gather == 1 &&
752             mv2_use_two_level_gather == 1 && comm->ch.shmem_coll_ok == 1) {
753         // Set intra-node function pt for gather_two_level 
754         MV2_Gather_intra_node_function = 
755                               mv2_gather_thresholds_table[range].intra_node[range_intra_threshold].
756                               MV2_pt_Gather_function;
757         //Set inter-leader pt 
758         MV2_Gather_inter_leader_function =
759                               mv2_gather_thresholds_table[range].inter_leader[range_threshold].
760                               MV2_pt_Gather_function;
761         // We call Gather function 
762         mpi_errno =
763             MV2_Gather_inter_leader_function(sendbuf, sendcnt, sendtype, recvbuf, recvcnt,
764                                              recvtype, root, comm);
765
766     } else {*/
767     // Indded, direct (non SMP-aware)gather is MPICH one 
768         mpi_errno = smpi_coll_tuned_gather_mpich(sendbuf, sendcnt, sendtype,
769                                       recvbuf, recvcnt, recvtype,
770                                       root, comm);
771     //}
772
773     return mpi_errno;
774 }
775
776
777
/*
 * Build the allgatherv tuning table (hard-coded "stampede" values): sets the
 * entry count to 6, allocates mv2_allgatherv_thresholds_table and copies the
 * local table below into it. The malloc() result is not checked.
 *
 * NOTE(review): each entry looks like
 *   {numproc, size_inter_table, inter_leader{min, max, function}}
 * -- field order inferred from the field names the selector reads; confirm
 * against the struct definition in the header.
 */
778 static void init_mv2_allgatherv_tables_stampede(){
779  mv2_size_allgatherv_tuning_table = 6;
780  mv2_allgatherv_thresholds_table = malloc(mv2_size_allgatherv_tuning_table *
781                                                   sizeof (mv2_allgatherv_tuning_table));
782         mv2_allgatherv_tuning_table mv2_tmp_allgatherv_thresholds_table[] = {
783             {
784                 16,
785                 2,
786                 {
787                     {0, 512, &MPIR_Allgatherv_Rec_Doubling_MV2},
788                     {512, -1, &MPIR_Allgatherv_Ring_MV2},
789                 },
790             },
791             {
792                 32,
793                 2,
794                 {
795                     {0, 512, &MPIR_Allgatherv_Rec_Doubling_MV2},
796                     {512, -1, &MPIR_Allgatherv_Ring_MV2},
797                 },
798             },
799             {
800                 64,
801                 2,
802                 {
803                     {0, 256, &MPIR_Allgatherv_Rec_Doubling_MV2},
804                     {256, -1, &MPIR_Allgatherv_Ring_MV2},
805                 },
806             },
807             {
808                 128,
809                 2,
810                 {
811                     {0, 256, &MPIR_Allgatherv_Rec_Doubling_MV2},
812                     {256, -1, &MPIR_Allgatherv_Ring_MV2},
813                 },
814             },
815             {
816                 256,
817                 2,
818                 {
819                     {0, 256, &MPIR_Allgatherv_Rec_Doubling_MV2},
820                     {256, -1, &MPIR_Allgatherv_Ring_MV2},
821                 },
822             },
823             {
824                 512,
825                 2,
826                 {
827                     {0, 256, &MPIR_Allgatherv_Rec_Doubling_MV2},
828                     {256, -1, &MPIR_Allgatherv_Ring_MV2},
829                 },
830             },
831
832         }; 
/* Publish the local table through the global pointer. */
833         memcpy(mv2_allgatherv_thresholds_table, mv2_tmp_allgatherv_thresholds_table,
834                   mv2_size_allgatherv_tuning_table * sizeof (mv2_allgatherv_tuning_table));
835 }
836
837
838
839
840
841
842
843 int smpi_coll_tuned_allgatherv_mvapich2(void *sendbuf, int sendcount, MPI_Datatype sendtype,
844                         void *recvbuf, int *recvcounts, int *displs,
845                         MPI_Datatype recvtype, MPI_Comm  comm )
846 {
847     int mpi_errno = MPI_SUCCESS;
848     int range = 0, comm_size, total_count, recvtype_size, i;
849     int range_threshold = 0;
850     int nbytes = 0;
851
852     if(mv2_allgatherv_thresholds_table==NULL)
853         init_mv2_allgatherv_tables_stampede();
854         
855     comm_size = smpi_comm_size(comm);
856     total_count = 0;
857     for (i = 0; i < comm_size; i++)
858         total_count += recvcounts[i];
859
860     recvtype_size=smpi_datatype_size(recvtype);
861     nbytes = total_count * recvtype_size;
862
863     /* Search for the corresponding system size inside the tuning table */
864     while ((range < (mv2_size_allgatherv_tuning_table - 1)) &&
865            (comm_size > mv2_allgatherv_thresholds_table[range].numproc)) {
866         range++;
867     }
868     /* Search for corresponding inter-leader function */
869     while ((range_threshold < (mv2_allgatherv_thresholds_table[range].size_inter_table - 1))
870            && (nbytes >
871                comm_size * mv2_allgatherv_thresholds_table[range].inter_leader[range_threshold].max)
872            && (mv2_allgatherv_thresholds_table[range].inter_leader[range_threshold].max !=
873                -1)) {
874         range_threshold++;
875     }
876     /* Set inter-leader pt */
877     MV2_Allgatherv_function =
878                           mv2_allgatherv_thresholds_table[range].inter_leader[range_threshold].
879                           MV2_pt_Allgatherv_function;
880
881     if (MV2_Allgatherv_function == &MPIR_Allgatherv_Rec_Doubling_MV2)
882     {
883         if(!(comm_size & (comm_size - 1)))
884         {
885             mpi_errno =
886                 MPIR_Allgatherv_Rec_Doubling_MV2(sendbuf, sendcount,
887                                                  sendtype, recvbuf,
888                                                  recvcounts, displs,
889                                                  recvtype, comm);
890         } else {
891             mpi_errno =
892                 MPIR_Allgatherv_Bruck_MV2(sendbuf, sendcount,
893                                           sendtype, recvbuf,
894                                           recvcounts, displs,
895                                           recvtype, comm);
896         }
897     } else {
898         mpi_errno =
899             MV2_Allgatherv_function(sendbuf, sendcount, sendtype,
900                                     recvbuf, recvcounts, displs,
901                                     recvtype, comm);
902     }
903
904     return mpi_errno;
905 }
906
907
908 static void init_mv2_allreduce_tables_stampede(){
909 mv2_size_allreduce_tuning_table = 8;
910       mv2_allreduce_thresholds_table = malloc(mv2_size_allreduce_tuning_table *
911                                                    sizeof (mv2_allreduce_tuning_table));
912       mv2_allreduce_tuning_table mv2_tmp_allreduce_thresholds_table[] = {
913         {
914           16,
915           0,
916           {1, 0},
917           2,
918           {
919             {0, 1024, &MPIR_Allreduce_pt2pt_rd_MV2},
920             {1024, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
921           },
922           2,
923           {
924             {0, 1024, &MPIR_Allreduce_reduce_shmem_MV2},
925             {1024, -1, &MPIR_Allreduce_reduce_p2p_MV2},
926           },
927         },
928         {
929           32,
930           0,
931           {1, 1, 0},
932           3,
933           {
934             {0, 1024, &MPIR_Allreduce_pt2pt_rd_MV2},
935             {1024, 16384, &MPIR_Allreduce_pt2pt_rd_MV2},
936             {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
937           },
938           2,
939           {
940             {0, 1024, &MPIR_Allreduce_reduce_shmem_MV2},
941             {1024, 16384, &MPIR_Allreduce_reduce_p2p_MV2},
942           },
943         },
944         {
945           64,
946           0,
947           {1, 1, 0},
948           3,
949           {
950             {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2},
951             {512, 16384, &MPIR_Allreduce_pt2pt_rd_MV2},
952             {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
953           },
954           2,
955           {
956             {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
957             {512, 16384, &MPIR_Allreduce_reduce_p2p_MV2},
958           },
959         },
960         {
961           128,
962           0,
963           {1, 1, 0},
964           3,
965           {
966             {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2},
967             {512, 16384, &MPIR_Allreduce_pt2pt_rd_MV2},
968             {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
969           },
970           2,
971           {
972             {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
973             {512, 16384, &MPIR_Allreduce_reduce_p2p_MV2},
974           },
975         },
976         {
977           256,
978           0,
979           {1, 1, 0},
980           3,
981           {
982             {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2},
983             {512, 16384, &MPIR_Allreduce_pt2pt_rd_MV2},
984             {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
985           },
986           2,
987           {
988             {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
989             {512, -1, &MPIR_Allreduce_reduce_p2p_MV2},
990           },
991         },
992         {
993           512,
994           0,
995           {1, 1, 0},
996           3,
997           {
998             {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2},
999             {512, 16384, &MPIR_Allreduce_pt2pt_rd_MV2},
1000             {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
1001           },
1002           2,
1003           {
1004             {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
1005             {512, 16384, &MPIR_Allreduce_reduce_p2p_MV2},
1006           },
1007         },
1008         {
1009           1024,
1010           0,
1011           {1, 1, 1, 0},
1012           4,
1013           {
1014             {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2},
1015             {512, 8192, &MPIR_Allreduce_pt2pt_rd_MV2},
1016             {8192, 65536, &MPIR_Allreduce_pt2pt_rs_MV2},
1017             {65536, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
1018           },
1019           2,
1020           {
1021             {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
1022             {512, -1, &MPIR_Allreduce_reduce_p2p_MV2},
1023           },
1024         },
1025         {
1026           2048,
1027           0,
1028           {1, 1, 1, 0},
1029           4,
1030           {
1031             {0, 64, &MPIR_Allreduce_pt2pt_rd_MV2},
1032             {64, 512, &MPIR_Allreduce_reduce_p2p_MV2},
1033             {512, 4096, &MPIR_Allreduce_mcst_reduce_two_level_helper_MV2},
1034             {4096, 16384, &MPIR_Allreduce_pt2pt_rs_MV2},
1035             {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
1036           },
1037           2,
1038           {
1039             {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
1040             {512, -1, &MPIR_Allreduce_reduce_p2p_MV2},
1041           },
1042         },
1043  
1044       }; 
1045       memcpy(mv2_allreduce_thresholds_table, mv2_tmp_allreduce_thresholds_table,
1046                   mv2_size_allreduce_tuning_table * sizeof (mv2_allreduce_tuning_table));
1047 }
1048
1049
1050 int smpi_coll_tuned_allreduce_mvapich2(void *sendbuf,
1051                        void *recvbuf,
1052                        int count,
1053                        MPI_Datatype datatype,
1054                        MPI_Op op, MPI_Comm comm)
1055 {
1056
1057     int mpi_errno = MPI_SUCCESS;
1058     //int rank = 0, 
1059     int comm_size = 0;
1060    
1061     comm_size = smpi_comm_size(comm);
1062     //rank = smpi_comm_rank(comm);
1063
1064     if (count == 0) {
1065         return MPI_SUCCESS;
1066     }
1067
1068   if (mv2_allreduce_thresholds_table == NULL)
1069     init_mv2_allreduce_tables_stampede();
1070
1071     /* check if multiple threads are calling this collective function */
1072
1073     MPI_Aint sendtype_size = 0;
1074     int nbytes = 0;
1075     int range = 0, range_threshold = 0, range_threshold_intra = 0;
1076     int is_two_level = 0;
1077     //int is_commutative = 0;
1078     MPI_Aint true_lb, true_extent;
1079
1080     sendtype_size=smpi_datatype_size(datatype);
1081     nbytes = count * sendtype_size;
1082
1083     smpi_datatype_extent(datatype, &true_lb, &true_extent);
1084     //MPI_Op *op_ptr;
1085     //is_commutative = smpi_op_is_commute(op);
1086
1087     {
1088         /* Search for the corresponding system size inside the tuning table */
1089         while ((range < (mv2_size_allreduce_tuning_table - 1)) &&
1090                (comm_size > mv2_allreduce_thresholds_table[range].numproc)) {
1091             range++;
1092         }
1093         /* Search for corresponding inter-leader function */
1094         /* skip mcast poiters if mcast is not available */
1095         if(mv2_allreduce_thresholds_table[range].mcast_enabled != 1){
1096             while ((range_threshold < (mv2_allreduce_thresholds_table[range].size_inter_table - 1)) 
1097                     && ((mv2_allreduce_thresholds_table[range].
1098                     inter_leader[range_threshold].MV2_pt_Allreduce_function 
1099                     == &MPIR_Allreduce_mcst_reduce_redscat_gather_MV2) ||
1100                     (mv2_allreduce_thresholds_table[range].
1101                     inter_leader[range_threshold].MV2_pt_Allreduce_function
1102                     == &MPIR_Allreduce_mcst_reduce_two_level_helper_MV2)
1103                     )) {
1104                     range_threshold++;
1105             }
1106         }
1107         while ((range_threshold < (mv2_allreduce_thresholds_table[range].size_inter_table - 1))
1108                && (nbytes >
1109                mv2_allreduce_thresholds_table[range].inter_leader[range_threshold].max)
1110                && (mv2_allreduce_thresholds_table[range].inter_leader[range_threshold].max != -1)) {
1111                range_threshold++;
1112         }
1113         if(mv2_allreduce_thresholds_table[range].is_two_level_allreduce[range_threshold] == 1){
1114                is_two_level = 1;    
1115         }
1116         /* Search for corresponding intra-node function */
1117         while ((range_threshold_intra <
1118                (mv2_allreduce_thresholds_table[range].size_intra_table - 1))
1119                 && (nbytes >
1120                 mv2_allreduce_thresholds_table[range].intra_node[range_threshold_intra].max)
1121                 && (mv2_allreduce_thresholds_table[range].intra_node[range_threshold_intra].max !=
1122                 -1)) {
1123                 range_threshold_intra++;
1124         }
1125
1126         MV2_Allreduce_function = mv2_allreduce_thresholds_table[range].inter_leader[range_threshold]
1127                                 .MV2_pt_Allreduce_function;
1128
1129         MV2_Allreduce_intra_function = mv2_allreduce_thresholds_table[range].intra_node[range_threshold_intra]
1130                                 .MV2_pt_Allreduce_function;
1131
1132         /* check if mcast is ready, otherwise replace mcast with other algorithm */
1133         if((MV2_Allreduce_function == &MPIR_Allreduce_mcst_reduce_redscat_gather_MV2)||
1134           (MV2_Allreduce_function == &MPIR_Allreduce_mcst_reduce_two_level_helper_MV2)){
1135             {
1136                 MV2_Allreduce_function = &MPIR_Allreduce_pt2pt_rd_MV2;
1137             }
1138             if(is_two_level != 1) {
1139                 MV2_Allreduce_function = &MPIR_Allreduce_pt2pt_rd_MV2;
1140             }
1141         } 
1142
1143         if(is_two_level == 1){
1144                 // check if shm is ready, if not use other algorithm first 
1145                 /*if ((comm->ch.shmem_coll_ok == 1)
1146                     && (mv2_enable_shmem_allreduce)
1147                     && (is_commutative)
1148                     && (mv2_enable_shmem_collectives)) {
1149                     mpi_errno = MPIR_Allreduce_two_level_MV2(sendbuf, recvbuf, count,
1150                                                      datatype, op, comm);
1151                 } else {*/
1152                     mpi_errno = MPIR_Allreduce_pt2pt_rd_MV2(sendbuf, recvbuf, count,
1153                                                      datatype, op, comm);
1154                // }
1155         } else { 
1156             mpi_errno = MV2_Allreduce_function(sendbuf, recvbuf, count,
1157                                            datatype, op, comm);
1158         }
1159     } 
1160
1161         //comm->ch.intra_node_done=0;
1162         
1163     return (mpi_errno);
1164
1165
1166 }
1167
1168
1169 int smpi_coll_tuned_alltoallv_mvapich2(void *sbuf, int *scounts, int *sdisps,
1170                                               MPI_Datatype sdtype,
1171                                               void *rbuf, int *rcounts, int *rdisps,
1172                                               MPI_Datatype rdtype,
1173                                               MPI_Comm  comm
1174                                               )
1175 {
1176
1177 if (sbuf == MPI_IN_PLACE) {
1178     return smpi_coll_tuned_alltoallv_ompi_basic_linear(sbuf, scounts, sdisps, sdtype, 
1179                                                         rbuf, rcounts, rdisps,rdtype,
1180                                                         comm);
1181  } else     /* For starters, just keep the original algorithm. */
1182     return smpi_coll_tuned_alltoallv_pair(sbuf, scounts, sdisps, sdtype, 
1183                                                         rbuf, rcounts, rdisps,rdtype,
1184                                                         comm);
1185 }
1186
1187
1188 int smpi_coll_tuned_barrier_mvapich2(MPI_Comm  comm)
1189 {   
1190     return smpi_coll_tuned_barrier_mvapich2_pair(comm);
1191 }
1192
1193
1194 /*
1195 static void init_mv2_bcast_tables_stampede(){
1196  //Stampede,
1197         mv2_size_bcast_tuning_table=8;
1198         mv2_bcast_thresholds_table = malloc(mv2_size_bcast_tuning_table *
1199                                                  sizeof (mv2_bcast_tuning_table));
1200
1201         mv2_bcast_tuning_table mv2_tmp_bcast_thresholds_table[]={
1202           {
1203             16,
1204             8192, 4, 4,
1205             {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
1206             11,
1207             {
1208               {0, 8, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1209               {8, 16, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1210               {16, 1024, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1211               {1024, 8192, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1212               {8192, 16384, &MPIR_Bcast_binomial_MV2, -1},
1213               {16384, 32768, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1214               {32768, 65536, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1215               {65536, 131072, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1},
1216               {131072, 262144, &MPIR_Bcast_scatter_ring_allgather_MV2, -1},
1217               {262144, 524288, &MPIR_Bcast_scatter_doubling_allgather_MV2, -1},
1218               {524288, -1, &MPIR_Bcast_scatter_ring_allgather_MV2, -1}
1219             },
1220             11,
1221             {
1222               {0, 8, &MPIR_Shmem_Bcast_MV2, 2},
1223               {8, 16, &MPIR_Shmem_Bcast_MV2, 4},
1224               {16, 1024, &MPIR_Shmem_Bcast_MV2, 2},
1225               {1024, 8192, &MPIR_Shmem_Bcast_MV2, 4},
1226               {8192, 16384, &MPIR_Shmem_Bcast_MV2, -1},
1227               {16384, 32768, &MPIR_Shmem_Bcast_MV2, 4},
1228               {32768, 65536, &MPIR_Shmem_Bcast_MV2, 2},
1229               {65536, 131072, &MPIR_Shmem_Bcast_MV2, -1},
1230               {131072, 262144, &MPIR_Shmem_Bcast_MV2, -1},
1231               {262144, 524288, &MPIR_Shmem_Bcast_MV2, -1},
1232               {524288, -1, &MPIR_Shmem_Bcast_MV2, -1}
1233             }
1234           },
1235           {
1236             32,
1237             8192, 4, 4,
1238             {1, 1, 1, 1, 1, 1, 1, 1},
1239             8,
1240             {
1241               {0, 128, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1242               {128, 256, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1243               {256, 32768, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1244               {32768, 65536, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1245               {65536, 131072, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1246               {131072, 262144, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1247               {262144, 524288, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1248               {524288, -1, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8}
1249             },
1250             8,
1251             {
1252               {0, 128, &MPIR_Shmem_Bcast_MV2, 2},
1253               {128, 256, &MPIR_Shmem_Bcast_MV2, 4},
1254               {256, 32768, &MPIR_Shmem_Bcast_MV2, 2},
1255               {32768, 65536, &MPIR_Shmem_Bcast_MV2, 4},
1256               {65536, 131072, &MPIR_Shmem_Bcast_MV2, 2},
1257               {131072, 262144, &MPIR_Shmem_Bcast_MV2, 8},
1258               {262144, 524288, &MPIR_Shmem_Bcast_MV2, 2},
1259               {524288, -1, &MPIR_Shmem_Bcast_MV2, 8}
1260             }
1261           },
1262           {
1263             64,
1264             8192, 4, 4,
1265             {1, 1, 1, 1, 1, 1, 1, 1, 1},
1266             9,
1267             {
1268               {0, 2, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1269               {2, 4, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1270               {4, 16, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1271               {16, 32, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1272               {32, 128, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1273               {128, 256, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1274               {256, 4096, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1275               {4096, 32768, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1276               {32768, -1, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2}
1277             },
1278             9,
1279             {
1280               {0, 2, &MPIR_Shmem_Bcast_MV2, 4},
1281               {2, 4, &MPIR_Shmem_Bcast_MV2, 8},
1282               {4, 16, &MPIR_Shmem_Bcast_MV2, 4},
1283               {16, 32, &MPIR_Shmem_Bcast_MV2, 8},
1284               {32, 128, &MPIR_Shmem_Bcast_MV2, 4},
1285               {128, 256, &MPIR_Shmem_Bcast_MV2, 8},
1286               {256, 4096, &MPIR_Shmem_Bcast_MV2, 4},
1287               {4096, 32768, &MPIR_Shmem_Bcast_MV2, 8},
1288               {32768, -1, &MPIR_Shmem_Bcast_MV2, 2}
1289             }
1290           },
1291           {
1292             128,
1293             8192, 4, 4,
1294             {1, 1, 1, 0},
1295             4,
1296             {
1297               {0, 8192, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1298               {8192, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1299               {16384, 524288, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1300               {524288, -1, &MPIR_Bcast_scatter_ring_allgather_MV2, -1}
1301             },
1302             4,
1303             {
1304               {0, 8192, &MPIR_Shmem_Bcast_MV2, 8},
1305               {8192, 16384, &MPIR_Shmem_Bcast_MV2, 4},
1306               {16384, 524288, &MPIR_Shmem_Bcast_MV2, 2},
1307               {524288, -1, NULL, -1}
1308             }
1309           },
1310           {
1311             256,
1312             8192, 4, 4,
1313             {1, 1, 1, 1, 1},
1314             5,
1315             {
1316               {0, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1317               {16384, 131072, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1318               {131072, 262144, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1},
1319               {262144, 524288, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1320               {524288, -1, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1}
1321             },
1322             5,
1323             {
1324               {0, 16384, &MPIR_Shmem_Bcast_MV2, 4},
1325               {16384, 131072, &MPIR_Shmem_Bcast_MV2, 2},
1326               {131072, 262144, &MPIR_Shmem_Bcast_MV2, -1},
1327               {262144, 524288, &MPIR_Shmem_Bcast_MV2, 2},
1328               {524288, -1, &MPIR_Shmem_Bcast_MV2, -1}
1329             }
1330           },
1331           {
1332             512,
1333             8192, 4, 4,
1334             {1, 1, 1, 1, 1},
1335             5,
1336             {
1337               {0, 4096, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1338               {4096, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1339               {16384, 131072, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1340               {131072, 262144, &MPIR_Pipelined_Bcast_MV2, -1},
1341               {262144, -1, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1}
1342             },
1343             5,
1344             {
1345               {0, 4096, &MPIR_Shmem_Bcast_MV2, 8},
1346               {4096, 16384, &MPIR_Shmem_Bcast_MV2, 4},
1347               {16384, 131072, &MPIR_Shmem_Bcast_MV2, 2},
1348               {131072, 262144, &MPIR_Shmem_Bcast_MV2, -1},
1349               {262144, -1, &MPIR_Shmem_Bcast_MV2, -1}
1350             }
1351           },
1352           {
1353             1024,
1354             8192, 4, 4,
1355             {1, 1, 1, 1, 1},
1356             5,
1357             {
1358               {0, 8192, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1359               {8192, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1360               {16384, 65536, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1361               {65536, 524288, &MPIR_Pipelined_Bcast_MV2, -1},
1362               {524288, -1, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1}
1363             },
1364             5,
1365             {
1366               {0, 8192, &MPIR_Shmem_Bcast_MV2, 8},
1367               {8192, 16384, &MPIR_Shmem_Bcast_MV2, 4},
1368               {16384, 65536, &MPIR_Shmem_Bcast_MV2, 2},
1369               {65536, 524288, &MPIR_Shmem_Bcast_MV2, -1},
1370               {524288, -1, &MPIR_Shmem_Bcast_MV2, -1}
1371             }
1372           },
1373           {
1374             2048,
1375             8192, 4, 4,
1376             {1, 1, 1, 1, 1, 1, 1},
1377             7,
1378             {
1379               {0, 16, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1380               {16, 32, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1381               {32, 4096, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1382               {4096, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1383               {16384, 32768, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1384               {32768, 524288, &MPIR_Pipelined_Bcast_MV2, -1},
1385               {524288, -1, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1}
1386             },
1387             7,
1388             {
1389               {0, 16, &MPIR_Shmem_Bcast_MV2, 8},
1390               {16, 32, &MPIR_Shmem_Bcast_MV2, 4},
1391               {32, 4096, &MPIR_Shmem_Bcast_MV2, 8},
1392               {4096, 16384, &MPIR_Shmem_Bcast_MV2, 4},
1393               {16384, 32768, &MPIR_Shmem_Bcast_MV2, 2},
1394               {32768, 524288, &MPIR_Shmem_Bcast_MV2, -1},
1395               {524288, -1, &MPIR_Shmem_Bcast_MV2, -1}
1396             }
1397           }
1398         };
1399
1400         memcpy(mv2_bcast_thresholds_table, mv2_tmp_bcast_thresholds_table,
1401                     mv2_size_bcast_tuning_table * sizeof (mv2_bcast_tuning_table));
1402 }*/
1403
1404
1405 int smpi_coll_tuned_bcast_mvapich2(void *buffer,
1406                               int count,
1407                               MPI_Datatype datatype,
1408                               int root, MPI_Comm comm)
1409 {
1410
1411 //TODO : Bcast really needs intra/inter phases in mvapich. Default to mpich if not available
1412   return smpi_coll_tuned_bcast_mpich(buffer, count, datatype, root, comm);
1413
1414 }
1415
1416 static void init_mv2_reduce_tables_stampede(){
1417  /*Stampede*/
1418         mv2_size_reduce_tuning_table = 8;
1419         mv2_reduce_thresholds_table = malloc(mv2_size_reduce_tuning_table *
1420                                                   sizeof (mv2_reduce_tuning_table));
1421         mv2_reduce_tuning_table mv2_tmp_reduce_thresholds_table[] = {
1422           {
1423             16,
1424             4,
1425             4,
1426             {1, 0, 0},
1427             3,
1428             {
1429               {0, 262144, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1430               {262144, 1048576, &MPIR_Reduce_binomial_MV2},
1431               {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
1432             },
1433             2,
1434             {
1435               {0, 65536, &MPIR_Reduce_shmem_MV2},
1436               {65536,-1,  &MPIR_Reduce_binomial_MV2},
1437             },
1438           },
1439           {
1440             32,
1441             4,
1442             4,
1443             {1, 1, 1, 1, 0, 0, 0},
1444             7,
1445             {
1446               {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1447               {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1448               {16384, 32768, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1449               {32768, 65536, &MPIR_Reduce_binomial_MV2},
1450               {65536, 262144, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1451               {262144, 1048576, &MPIR_Reduce_binomial_MV2},
1452               {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
1453             },
1454             6,
1455             {
1456               {0, 8192, &MPIR_Reduce_shmem_MV2},
1457               {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1458               {16384, 32768, &MPIR_Reduce_shmem_MV2},
1459               {32768, 65536, &MPIR_Reduce_shmem_MV2},
1460               {65536, 262144, &MPIR_Reduce_shmem_MV2},
1461               {262144,-1,  &MPIR_Reduce_binomial_MV2},
1462             },
1463           },
1464           {
1465             64,
1466             4,
1467             4,
1468             {1, 1, 1, 1, 0},
1469             5,
1470             {
1471               {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1472               {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1473               {16384, 65536, &MPIR_Reduce_binomial_MV2},
1474               {65536, 262144, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1475               {262144, -1, &MPIR_Reduce_redscat_gather_MV2},
1476             },
1477             5,
1478             {
1479               {0, 8192, &MPIR_Reduce_shmem_MV2},
1480               {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1481               {16384, 65536, &MPIR_Reduce_shmem_MV2},
1482               {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1483               {262144, -1, &MPIR_Reduce_binomial_MV2},
1484             },
1485           },
1486           {
1487             128,
1488             4,
1489             4,
1490             {1, 0, 1, 0, 1, 0},
1491             6,
1492             {
1493               {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1494               {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1495               {16384, 65536, &MPIR_Reduce_binomial_MV2},
1496               {65536, 262144, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1497               {262144, 1048576, &MPIR_Reduce_binomial_MV2},
1498               {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
1499             },
1500             5,
1501             {
1502               {0, 8192, &MPIR_Reduce_shmem_MV2},
1503               {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1504               {16384, 65536, &MPIR_Reduce_shmem_MV2},
1505               {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1506               {262144, -1, &MPIR_Reduce_binomial_MV2},
1507             },
1508           },
1509           {
1510             256,
1511             4,
1512             4,
1513             {1, 1, 1, 0, 1, 1, 0},
1514             7,
1515             {
1516               {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1517               {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1518               {16384, 32768, &MPIR_Reduce_binomial_MV2},
1519               {32768, 65536, &MPIR_Reduce_binomial_MV2},
1520               {65536, 262144, &MPIR_Reduce_binomial_MV2},
1521               {262144, 1048576, &MPIR_Reduce_binomial_MV2},
1522               {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
1523             },
1524             6,
1525             {
1526               {0, 8192, &MPIR_Reduce_shmem_MV2},
1527               {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1528               {16384, 32768, &MPIR_Reduce_shmem_MV2},
1529               {32768, 65536, &MPIR_Reduce_shmem_MV2},
1530               {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1531               {262144, -1, &MPIR_Reduce_binomial_MV2},
1532             },
1533           },
1534           {
1535             512,
1536             4,
1537             4,
1538             {1, 0, 1, 1, 1, 0},
1539             6,
1540             {
1541               {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1542               {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1543               {16384, 65536, &MPIR_Reduce_binomial_MV2},
1544               {65536, 262144, &MPIR_Reduce_binomial_MV2},
1545               {262144, 1048576, &MPIR_Reduce_binomial_MV2},
1546               {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
1547             },
1548             5,
1549             {
1550               {0, 8192, &MPIR_Reduce_shmem_MV2},
1551               {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1552               {16384, 65536, &MPIR_Reduce_shmem_MV2},
1553               {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1554               {262144, -1, &MPIR_Reduce_binomial_MV2},
1555             },
1556           },
1557           {
1558             1024,
1559             4,
1560             4,
1561             {1, 0, 1, 1, 1},
1562             5,
1563             {
1564               {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1565               {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1566               {16384, 65536, &MPIR_Reduce_binomial_MV2},
1567               {65536, 262144, &MPIR_Reduce_binomial_MV2},
1568               {262144, -1, &MPIR_Reduce_binomial_MV2},
1569             },
1570             5,
1571             {
1572               {0, 8192, &MPIR_Reduce_shmem_MV2},
1573               {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1574               {16384, 65536, &MPIR_Reduce_shmem_MV2},
1575               {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1576               {262144, -1, &MPIR_Reduce_binomial_MV2},
1577             },
1578           },
1579           {
1580             2048,
1581             4,
1582             4,
1583             {1, 0, 1, 1, 1,1},
1584             6,
1585             {
1586               {0, 2048, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1587               {2048, 4096, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1588               {4096, 16384, &MPIR_Reduce_binomial_MV2},
1589               {16384, 65536, &MPIR_Reduce_binomial_MV2},
1590               {65536, 131072, &MPIR_Reduce_binomial_MV2},
1591               {131072, -1, &MPIR_Reduce_binomial_MV2},
1592             },
1593             6,
1594             {
1595               {0, 2048, &MPIR_Reduce_shmem_MV2},
1596               {2048, 4096, &MPIR_Reduce_shmem_MV2},
1597               {4096, 16384, &MPIR_Reduce_shmem_MV2},
1598               {16384, 65536, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1599               {65536, 131072, &MPIR_Reduce_binomial_MV2},
1600               {131072, -1, &MPIR_Reduce_shmem_MV2},
1601             },
1602           },
1603
1604         }; 
1605         memcpy(mv2_reduce_thresholds_table, mv2_tmp_reduce_thresholds_table,
1606                     mv2_size_reduce_tuning_table * sizeof (mv2_reduce_tuning_table));
1607 }
1608
1609
1610
1611 int smpi_coll_tuned_reduce_mvapich2( void *sendbuf,
1612                     void *recvbuf,
1613                     int count,
1614                     MPI_Datatype datatype,
1615                     MPI_Op op, int root, MPI_Comm comm)
1616 {
1617    if(mv2_reduce_thresholds_table == NULL)
1618      init_mv2_reduce_tables_stampede();
1619
1620     int mpi_errno = MPI_SUCCESS;
1621     int range = 0;
1622     int range_threshold = 0;
1623     int range_intra_threshold = 0;
1624     int is_commutative, pof2;
1625     int comm_size = 0;
1626     int nbytes = 0;
1627     int sendtype_size;
1628     int is_two_level = 0;
1629
1630     comm_size = smpi_comm_size(comm);
1631     sendtype_size=smpi_datatype_size(datatype);
1632     nbytes = count * sendtype_size;
1633
1634     if (count == 0)
1635         return MPI_SUCCESS;
1636
1637     is_commutative = smpi_op_is_commute(op);
1638
1639     /* find nearest power-of-two less than or equal to comm_size */
1640     for( pof2 = 1; pof2 <= comm_size; pof2 <<= 1 );
1641     pof2 >>=1;
1642     
1643
1644     /* Search for the corresponding system size inside the tuning table */
1645     while ((range < (mv2_size_reduce_tuning_table - 1)) &&
1646            (comm_size > mv2_reduce_thresholds_table[range].numproc)) {
1647         range++;
1648     }
1649     /* Search for corresponding inter-leader function */
1650     while ((range_threshold < (mv2_reduce_thresholds_table[range].size_inter_table - 1))
1651            && (nbytes >
1652                mv2_reduce_thresholds_table[range].inter_leader[range_threshold].max)
1653            && (mv2_reduce_thresholds_table[range].inter_leader[range_threshold].max !=
1654                -1)) {
1655         range_threshold++;
1656     }
1657
1658     /* Search for corresponding intra node function */
1659     while ((range_intra_threshold < (mv2_reduce_thresholds_table[range].size_intra_table - 1))
1660            && (nbytes >
1661                mv2_reduce_thresholds_table[range].intra_node[range_intra_threshold].max)
1662            && (mv2_reduce_thresholds_table[range].intra_node[range_intra_threshold].max !=
1663                -1)) {
1664         range_intra_threshold++;
1665     }
1666
1667     /* Set intra-node function pt for reduce_two_level */
1668     MV2_Reduce_intra_function = 
1669                           mv2_reduce_thresholds_table[range].intra_node[range_intra_threshold].
1670                           MV2_pt_Reduce_function;
1671     /* Set inter-leader pt */
1672     MV2_Reduce_function =
1673                           mv2_reduce_thresholds_table[range].inter_leader[range_threshold].
1674                           MV2_pt_Reduce_function;
1675
1676     if(mv2_reduce_intra_knomial_factor<0)
1677     {
1678         mv2_reduce_intra_knomial_factor = mv2_reduce_thresholds_table[range].intra_k_degree;
1679     }
1680     if(mv2_reduce_inter_knomial_factor<0)
1681     {
1682         mv2_reduce_inter_knomial_factor = mv2_reduce_thresholds_table[range].inter_k_degree;
1683     }
1684     if(mv2_reduce_thresholds_table[range].is_two_level_reduce[range_threshold] == 1){
1685                is_two_level = 1;
1686     }
1687     /* We call Reduce function */
1688     if(is_two_level == 1)
1689     {
1690        /* if (comm->ch.shmem_coll_ok == 1
1691             && is_commutative == 1) {
1692             mpi_errno = MPIR_Reduce_two_level_helper_MV2(sendbuf, recvbuf, count, 
1693                                            datatype, op, root, comm, errflag);
1694         } else {*/
1695             mpi_errno = MPIR_Reduce_binomial_MV2(sendbuf, recvbuf, count, 
1696                                            datatype, op, root, comm);
1697        //}
1698     } else if(MV2_Reduce_function == &MPIR_Reduce_inter_knomial_wrapper_MV2 ){
1699         if(is_commutative ==1)
1700         {
1701             mpi_errno = MV2_Reduce_function(sendbuf, recvbuf, count, 
1702                                            datatype, op, root, comm);
1703         } else {
1704             mpi_errno = MPIR_Reduce_binomial_MV2(sendbuf, recvbuf, count, 
1705                                            datatype, op, root, comm);
1706         }
1707     } else if(MV2_Reduce_function == &MPIR_Reduce_redscat_gather_MV2){
1708         if (/*(HANDLE_GET_KIND(op) == HANDLE_KIND_BUILTIN) &&*/ (count >= pof2))
1709         {
1710             mpi_errno = MV2_Reduce_function(sendbuf, recvbuf, count, 
1711                                             datatype, op, root, comm);
1712         } else {
1713             mpi_errno = MPIR_Reduce_binomial_MV2(sendbuf, recvbuf, count, 
1714                                             datatype, op, root, comm);
1715         }
1716     } else {
1717         mpi_errno = MV2_Reduce_function(sendbuf, recvbuf, count, 
1718                                         datatype, op, root, comm);
1719     }
1720
1721
1722       return mpi_errno;
1723
1724 }
1725
1726
1727
1728 static void init_mv2_reduce_scatter_tables_stampede(){
1729         mv2_size_red_scat_tuning_table = 6;
1730         mv2_red_scat_thresholds_table = malloc(mv2_size_red_scat_tuning_table *
1731                                                   sizeof (mv2_red_scat_tuning_table));
1732         mv2_red_scat_tuning_table mv2_tmp_red_scat_thresholds_table[] = {
1733             {
1734                 16,
1735                 3,
1736                 {
1737                     {0, 64, &MPIR_Reduce_Scatter_Basic_MV2},
1738                     {64, 65536, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1739                     {65536, -1, &MPIR_Reduce_scatter_Pair_Wise_MV2},
1740                 },
1741             },
1742             {
1743                 32,
1744                 3,
1745                 {
1746                     {0, 64, &MPIR_Reduce_Scatter_Basic_MV2},
1747                     {64, 131072, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1748                     {131072, -1, &MPIR_Reduce_scatter_Pair_Wise_MV2},
1749                 },
1750             },
1751             {
1752                 64,
1753                 3,
1754                 {
1755                     {0, 1024, &MPIR_Reduce_Scatter_Basic_MV2},
1756                     {1024, 262144, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1757                     {262144, -1, &MPIR_Reduce_scatter_Pair_Wise_MV2},
1758                 },
1759             },
1760             {
1761                 128,
1762                 2,
1763                 {
1764                     {0, 128, &MPIR_Reduce_Scatter_Basic_MV2},
1765                     {128, -1, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1766                 },
1767             },
1768             {
1769                 256,
1770                 2,
1771                 {
1772                     {0, 128, &MPIR_Reduce_Scatter_Basic_MV2},
1773                     {128, -1, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1774                 },
1775             },
1776             {
1777                 512,
1778                 2,
1779                 {
1780                     {0, 256, &MPIR_Reduce_Scatter_Basic_MV2},
1781                     {256, -1, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1782                 },
1783             },
1784
1785         }; 
1786         memcpy(mv2_red_scat_thresholds_table, mv2_tmp_red_scat_thresholds_table,
1787                   mv2_size_red_scat_tuning_table * sizeof (mv2_red_scat_tuning_table));
1788 }
1789
1790 int smpi_coll_tuned_reduce_scatter_mvapich2(void *sendbuf, void *recvbuf, int *recvcnts,
1791                                                         MPI_Datatype datatype, MPI_Op op,
1792                                                         MPI_Comm comm)
1793 {
1794         int mpi_errno = MPI_SUCCESS;
1795         int i = 0, comm_size = smpi_comm_size(comm), total_count = 0, type_size =
1796                 0, nbytes = 0;
1797     int range = 0;
1798     int range_threshold = 0;
1799         int is_commutative = 0;
1800         int *disps = malloc(comm_size * sizeof (int));
1801
1802     if(mv2_red_scat_thresholds_table==NULL)
1803       init_mv2_reduce_scatter_tables_stampede();
1804       
1805     is_commutative=smpi_op_is_commute(op);
1806         for (i = 0; i < comm_size; i++) {
1807                 disps[i] = total_count;
1808                 total_count += recvcnts[i];
1809         }
1810
1811         type_size=smpi_datatype_size(datatype);
1812         nbytes = total_count * type_size;
1813
1814         if (is_commutative) {
1815
1816         /* Search for the corresponding system size inside the tuning table */
1817         while ((range < (mv2_size_red_scat_tuning_table - 1)) &&
1818                (comm_size > mv2_red_scat_thresholds_table[range].numproc)) {
1819             range++;
1820         }
1821         /* Search for corresponding inter-leader function */
1822         while ((range_threshold < (mv2_red_scat_thresholds_table[range].size_inter_table - 1))
1823                && (nbytes >
1824                    mv2_red_scat_thresholds_table[range].inter_leader[range_threshold].max)
1825                && (mv2_red_scat_thresholds_table[range].inter_leader[range_threshold].max !=
1826                    -1)) {
1827             range_threshold++;
1828         }
1829     
1830         /* Set inter-leader pt */
1831         MV2_Red_scat_function =
1832                               mv2_red_scat_thresholds_table[range].inter_leader[range_threshold].
1833                               MV2_pt_Red_scat_function;
1834
1835                 mpi_errno = MV2_Red_scat_function(sendbuf, recvbuf,
1836                                           recvcnts, datatype,
1837                                           op, comm);
1838         } else {
1839         mpi_errno = MPIR_Reduce_scatter_non_comm_MV2(sendbuf, recvbuf,
1840                                                      recvcnts, datatype,
1841                                                      op, comm);
1842         }
1843
1844     return mpi_errno;
1845
1846 }
1847
1848
1849
1850 static void init_mv2_scatter_tables_stampede(){
1851 {
1852     int agg_table_sum = 0;
1853     int i;
1854     mv2_scatter_tuning_table **table_ptrs = NULL;
1855      mv2_scatter_num_ppn_conf = 3;
1856         mv2_scatter_thresholds_table
1857           = malloc(sizeof(mv2_scatter_tuning_table *)
1858                         * mv2_scatter_num_ppn_conf);
1859         table_ptrs = malloc(sizeof(mv2_scatter_tuning_table *)
1860                                  * mv2_scatter_num_ppn_conf);
1861         mv2_size_scatter_tuning_table = malloc(sizeof(int) *
1862                                                     mv2_scatter_num_ppn_conf);
1863         mv2_scatter_table_ppn_conf 
1864           = malloc(mv2_scatter_num_ppn_conf * sizeof(int));
1865         mv2_scatter_table_ppn_conf[0] = 1;
1866         mv2_size_scatter_tuning_table[0] = 6;
1867         mv2_scatter_tuning_table mv2_tmp_scatter_thresholds_table_1ppn[] = {
1868           {2,
1869            1, 
1870            {
1871              {0, -1, &MPIR_Scatter_MV2_Binomial},
1872            },
1873            1,
1874            {
1875              {0, -1, &MPIR_Scatter_MV2_Binomial},
1876            },
1877           },
1878
1879           {4,
1880            1, 
1881            {
1882              {0, -1, &MPIR_Scatter_MV2_Direct},
1883            },
1884            1,
1885            {
1886              {0, -1, &MPIR_Scatter_MV2_Direct},
1887            },
1888           },
1889   
1890           {8,
1891            1, 
1892            {
1893              {0, -1, &MPIR_Scatter_MV2_Direct},
1894            },
1895            1,
1896            {
1897              {0, -1, &MPIR_Scatter_MV2_Direct},
1898            },
1899           },
1900   
1901           {16,
1902            1, 
1903            {
1904              {0, -1, &MPIR_Scatter_MV2_Direct},
1905            },
1906            1,
1907            {
1908              {0, -1, &MPIR_Scatter_MV2_Direct},
1909            },
1910           },
1911   
1912           {32,
1913            1, 
1914            {
1915              {0, -1, &MPIR_Scatter_MV2_Direct},
1916            },
1917            1,
1918            {
1919              {0, -1, &MPIR_Scatter_MV2_Direct},
1920            },
1921           },
1922   
1923           {64,
1924            2, 
1925            {
1926              {0, 32, &MPIR_Scatter_MV2_Binomial},
1927              {32, -1, &MPIR_Scatter_MV2_Direct},
1928            },
1929            1,
1930            {
1931              {0, -1, &MPIR_Scatter_MV2_Binomial},
1932            },
1933           },
1934         };
1935         table_ptrs[0] = mv2_tmp_scatter_thresholds_table_1ppn;
1936         mv2_scatter_table_ppn_conf[1] = 2;
1937         mv2_size_scatter_tuning_table[1] = 6;
1938         mv2_scatter_tuning_table mv2_tmp_scatter_thresholds_table_2ppn[] = {
1939           {4,
1940            2, 
1941            {
1942              {0, 4096, &MPIR_Scatter_MV2_Binomial},
1943              {4096, -1, &MPIR_Scatter_MV2_Direct},
1944            },
1945            1,
1946            {
1947              {0, -1, &MPIR_Scatter_MV2_Direct},
1948            },
1949           },
1950   
1951           {8,
1952            2, 
1953            {
1954              {0, 512, &MPIR_Scatter_MV2_two_level_Direct},
1955              {512, -1, &MPIR_Scatter_MV2_Direct},
1956            },
1957            1,
1958            {
1959              {0, -1, &MPIR_Scatter_MV2_Binomial},
1960            },
1961           },
1962   
1963           {16,
1964            2, 
1965            {
1966              {0, 2048, &MPIR_Scatter_MV2_two_level_Direct},
1967              {2048, -1, &MPIR_Scatter_MV2_Direct},
1968            },
1969            1,
1970            {
1971              {0, -1, &MPIR_Scatter_MV2_Binomial},
1972            },
1973           },
1974   
1975           {32,
1976            2, 
1977            {
1978              {0, 2048, &MPIR_Scatter_MV2_two_level_Direct},
1979              {2048, -1, &MPIR_Scatter_MV2_Direct},
1980            },
1981            1,
1982            {
1983              {0, -1, &MPIR_Scatter_MV2_Binomial},
1984            },
1985           },
1986   
1987           {64,
1988            2, 
1989            {
1990              {0, 8192, &MPIR_Scatter_MV2_two_level_Direct},
1991              {8192, -1, &MPIR_Scatter_MV2_Direct},
1992            },
1993            1,
1994            {
1995              {0, -1, &MPIR_Scatter_MV2_Binomial},
1996            },
1997           },
1998   
1999           {128,
2000            4, 
2001            {
2002              {0, 16, &MPIR_Scatter_MV2_Binomial},
2003              {16, 128, &MPIR_Scatter_MV2_two_level_Binomial},
2004              {128, 16384, &MPIR_Scatter_MV2_two_level_Direct},
2005              {16384, -1, &MPIR_Scatter_MV2_Direct},
2006            },
2007            1,
2008            {
2009              {0, 128, &MPIR_Scatter_MV2_Direct},
2010              {128, -1, &MPIR_Scatter_MV2_Binomial},
2011            },
2012           },
2013         };
2014         table_ptrs[1] = mv2_tmp_scatter_thresholds_table_2ppn;
2015         mv2_scatter_table_ppn_conf[2] = 16;
2016         mv2_size_scatter_tuning_table[2] = 8;
2017         mv2_scatter_tuning_table mv2_tmp_scatter_thresholds_table_16ppn[] = {
2018           {
2019             16,
2020             2,
2021             { 
2022               {0, 256, &MPIR_Scatter_MV2_Binomial}, 
2023               {256, -1, &MPIR_Scatter_MV2_Direct},
2024             },
2025             1, 
2026             { 
2027               { 0, -1, &MPIR_Scatter_MV2_Direct},
2028             },
2029           },
2030
2031           {
2032             32,
2033             2,
2034             {
2035               {0, 512, &MPIR_Scatter_MV2_Binomial}, 
2036               {512, -1, &MPIR_Scatter_MV2_Direct},
2037             },
2038             1, 
2039             { 
2040               { 0, -1, &MPIR_Scatter_MV2_Direct},
2041             },
2042           },
2043
2044           {
2045             64,
2046             2,
2047             {
2048               {0, 1024, &MPIR_Scatter_MV2_two_level_Direct},
2049               {1024, -1, &MPIR_Scatter_MV2_Direct},
2050             },
2051             1,
2052             {
2053               { 0, -1, &MPIR_Scatter_MV2_Direct},
2054             },
2055           },
2056
2057           {
2058             128,
2059             4,
2060             {
2061               {0, 16, &MPIR_Scatter_mcst_wrap_MV2},
2062               {0, 16, &MPIR_Scatter_MV2_two_level_Direct},
2063               {16, 2048, &MPIR_Scatter_MV2_two_level_Direct},
2064               {2048, -1, &MPIR_Scatter_MV2_Direct},
2065             },
2066             1,
2067             {
2068               { 0, -1, &MPIR_Scatter_MV2_Direct},
2069             },
2070           },
2071
2072           {
2073             256,
2074             4,
2075             {
2076               {0, 16, &MPIR_Scatter_mcst_wrap_MV2},
2077               {0, 16, &MPIR_Scatter_MV2_two_level_Direct},
2078               {16, 2048, &MPIR_Scatter_MV2_two_level_Direct},
2079               {2048, -1,  &MPIR_Scatter_MV2_Direct},
2080             },
2081             1,
2082             {
2083               { 0, -1, &MPIR_Scatter_MV2_Direct},
2084             },
2085           },
2086
2087           {
2088             512,
2089             4,
2090             {
2091               {0, 16, &MPIR_Scatter_mcst_wrap_MV2},
2092               {16, 16, &MPIR_Scatter_MV2_two_level_Direct},
2093               {16, 4096, &MPIR_Scatter_MV2_two_level_Direct},
2094               {4096, -1, &MPIR_Scatter_MV2_Direct},
2095             },
2096             1,
2097             {
2098               { 0, -1, &MPIR_Scatter_MV2_Binomial},
2099             }, 
2100           },  
2101           {
2102             1024,
2103             5,
2104             {
2105               {0, 16, &MPIR_Scatter_mcst_wrap_MV2},
2106               {0, 16,  &MPIR_Scatter_MV2_Binomial},
2107               {16, 32, &MPIR_Scatter_MV2_Binomial},
2108               {32, 4096, &MPIR_Scatter_MV2_two_level_Direct},
2109               {4096, -1, &MPIR_Scatter_MV2_Direct},
2110             },
2111             1,
2112             {
2113               { 0, -1, &MPIR_Scatter_MV2_Binomial},
2114             },  
2115           },  
2116           {
2117             2048,
2118             7,
2119             {
2120               {0, 16, &MPIR_Scatter_mcst_wrap_MV2},
2121               {0, 16,  &MPIR_Scatter_MV2_two_level_Binomial},
2122               {16, 128, &MPIR_Scatter_MV2_two_level_Binomial},
2123               {128, 1024, &MPIR_Scatter_MV2_two_level_Direct},
2124               {1024, 16384, &MPIR_Scatter_MV2_two_level_Direct},
2125               {16384, 65536, &MPIR_Scatter_MV2_Direct},
2126               {65536, -1, &MPIR_Scatter_MV2_two_level_Direct},
2127             },
2128             6,
2129             {
2130               {0, 16, &MPIR_Scatter_MV2_Binomial},
2131               {16, 128, &MPIR_Scatter_MV2_Binomial},
2132               {128, 1024, &MPIR_Scatter_MV2_Binomial},
2133               {1024, 16384, &MPIR_Scatter_MV2_Direct},
2134               {16384, 65536, &MPIR_Scatter_MV2_Direct},
2135               {65536, -1, &MPIR_Scatter_MV2_Direct},
2136             },
2137           }, 
2138         };
2139         table_ptrs[2] = mv2_tmp_scatter_thresholds_table_16ppn;
2140         agg_table_sum = 0;
2141         for (i = 0; i < mv2_scatter_num_ppn_conf; i++) {
2142           agg_table_sum += mv2_size_scatter_tuning_table[i];
2143         }
2144         mv2_scatter_thresholds_table[0] =
2145           malloc(agg_table_sum * sizeof (mv2_scatter_tuning_table));
2146         memcpy(mv2_scatter_thresholds_table[0], table_ptrs[0],
2147                     (sizeof(mv2_scatter_tuning_table)
2148                      * mv2_size_scatter_tuning_table[0]));
2149         for (i = 1; i < mv2_scatter_num_ppn_conf; i++) {
2150           mv2_scatter_thresholds_table[i] =
2151             mv2_scatter_thresholds_table[i - 1]
2152             + mv2_size_scatter_tuning_table[i - 1];
2153           memcpy(mv2_scatter_thresholds_table[i], table_ptrs[i],
2154                       (sizeof(mv2_scatter_tuning_table)
2155                        * mv2_size_scatter_tuning_table[i]));
2156         }
2157         free(table_ptrs);
2158    }
2159 }
2160
2161 int smpi_coll_tuned_scatter_mvapich2(void *sendbuf,
2162                            int sendcnt,
2163                            MPI_Datatype sendtype,
2164                            void *recvbuf,
2165                            int recvcnt,
2166                            MPI_Datatype recvtype,
2167                            int root, MPI_Comm comm_ptr)
2168 {
2169     int range = 0, range_threshold = 0, range_threshold_intra = 0;
2170     int mpi_errno = MPI_SUCCESS;
2171  //   int mpi_errno_ret = MPI_SUCCESS;
2172     int rank, nbytes, comm_size;
2173     int recvtype_size, sendtype_size;
2174     int partial_sub_ok = 0;
2175     int conf_index = 0;
2176   //  int local_size = -1;
2177   //  int i;
2178  //   MPI_Comm shmem_comm;
2179 //    MPID_Comm *shmem_commptr=NULL;
2180     if(mv2_scatter_thresholds_table==NULL)
2181       init_mv2_scatter_tables_stampede();
2182
2183     comm_size = smpi_comm_size(comm_ptr);
2184
2185     rank = smpi_comm_rank(comm_ptr);
2186
2187     if (rank == root) {
2188         sendtype_size=smpi_datatype_size(sendtype);
2189         nbytes = sendcnt * sendtype_size;
2190     } else {
2191         recvtype_size=smpi_datatype_size(recvtype);
2192         nbytes = recvcnt * recvtype_size;
2193     }
2194 /*
2195     // check if safe to use partial subscription mode 
2196     if (comm_ptr->ch.shmem_coll_ok == 1 && comm_ptr->ch.is_uniform) {
2197     
2198         shmem_comm = comm_ptr->ch.shmem_comm;
2199         MPID_Comm_get_ptr(shmem_comm, shmem_commptr);
2200         local_size = shmem_commptr->local_size;
2201         i = 0;
2202         if (mv2_scatter_table_ppn_conf[0] == -1) {
2203             // Indicating user defined tuning 
2204             conf_index = 0;
2205             goto conf_check_end;
2206         }
2207         do {
2208             if (local_size == mv2_scatter_table_ppn_conf[i]) {
2209                 conf_index = i;
2210                 partial_sub_ok = 1;
2211                 break;
2212             }
2213             i++;
2214         } while(i < mv2_scatter_num_ppn_conf);
2215     }
2216     */
2217     if (partial_sub_ok != 1) {
2218         conf_index = 0;
2219     }
2220
2221     /* Search for the corresponding system size inside the tuning table */
2222     while ((range < (mv2_size_scatter_tuning_table[conf_index] - 1)) &&
2223            (comm_size > mv2_scatter_thresholds_table[conf_index][range].numproc)) {
2224         range++;
2225     }
2226     /* Search for corresponding inter-leader function */
2227     while ((range_threshold < (mv2_scatter_thresholds_table[conf_index][range].size_inter_table - 1))
2228            && (nbytes >
2229            mv2_scatter_thresholds_table[conf_index][range].inter_leader[range_threshold].max)
2230            && (mv2_scatter_thresholds_table[conf_index][range].inter_leader[range_threshold].max != -1)) {
2231            range_threshold++;
2232     }
2233
2234     /* Search for corresponding intra-node function */
2235     while ((range_threshold_intra <
2236            (mv2_scatter_thresholds_table[conf_index][range].size_intra_table - 1))
2237             && (nbytes >
2238                 mv2_scatter_thresholds_table[conf_index][range].intra_node[range_threshold_intra].max)
2239             && (mv2_scatter_thresholds_table[conf_index][range].intra_node[range_threshold_intra].max !=
2240             -1)) {
2241             range_threshold_intra++;
2242     }
2243
2244     MV2_Scatter_function = mv2_scatter_thresholds_table[conf_index][range].inter_leader[range_threshold]
2245                             .MV2_pt_Scatter_function;
2246
2247     if(MV2_Scatter_function == &MPIR_Scatter_mcst_wrap_MV2) { 
2248 #if defined(_MCST_SUPPORT_)
2249         if(comm_ptr->ch.is_mcast_ok == 1 
2250            && mv2_use_mcast_scatter == 1 
2251            && comm_ptr->ch.shmem_coll_ok == 1) {
2252             MV2_Scatter_function = &MPIR_Scatter_mcst_MV2; 
2253         } else
2254 #endif /*#if defined(_MCST_SUPPORT_) */
2255         {
2256             if(mv2_scatter_thresholds_table[conf_index][range].inter_leader[range_threshold + 1].
2257                MV2_pt_Scatter_function != NULL) { 
2258                   MV2_Scatter_function = mv2_scatter_thresholds_table[conf_index][range].inter_leader[range_threshold + 1]
2259                                                                           .MV2_pt_Scatter_function;
2260             } else { 
2261                   /* Fallback! */ 
2262                   MV2_Scatter_function = &MPIR_Scatter_MV2_Binomial; 
2263             }  
2264         } 
2265     } 
2266  
2267     if( (MV2_Scatter_function == &MPIR_Scatter_MV2_two_level_Direct) || 
2268         (MV2_Scatter_function == &MPIR_Scatter_MV2_two_level_Binomial)) { 
2269         /* if( comm_ptr->ch.shmem_coll_ok == 1 && 
2270              comm_ptr->ch.is_global_block == 1 ) {
2271              MV2_Scatter_intra_function = mv2_scatter_thresholds_table[conf_index][range].intra_node[range_threshold_intra]
2272                                 .MV2_pt_Scatter_function;
2273
2274              mpi_errno =
2275                    MV2_Scatter_function(sendbuf, sendcnt, sendtype,
2276                                         recvbuf, recvcnt, recvtype, root,
2277                                         comm_ptr);
2278          } else {*/
2279              mpi_errno = MPIR_Scatter_MV2_Binomial(sendbuf, sendcnt, sendtype,
2280                                         recvbuf, recvcnt, recvtype, root,
2281                                         comm_ptr);
2282
2283          //}
2284     } else { 
2285          mpi_errno = MV2_Scatter_function(sendbuf, sendcnt, sendtype,
2286                                     recvbuf, recvcnt, recvtype, root,
2287                                     comm_ptr);
2288     } 
2289     return (mpi_errno);
2290 }
2291