/* Selector for collective algorithms, based on OpenMPI's default coll_tuned_decision_fixed selector */

/* Copyright (c) 2009-2010, 2013-2014. The SimGrid Team.
 * All rights reserved. */

/* This program is free software; you can redistribute it and/or modify it
 * under the terms of the license (GNU LGPL) which comes with this package. */

#include "colls_private.h"
// This selector is based on information gathered on the Stampede cluster with Intel MPI 4.1.3.049,
// and on the Intel MPI reference manual. The data was gathered by launching runs with 1, 2, 4, 8,
// and 16 processes per node.
#define INTEL_MAX_NB_THRESHOLDS 32
#define INTEL_MAX_NB_NUMPROCS 12
#define INTEL_MAX_NB_PPN 5 /* 1, 2, 4, 8, 16 processes per node */
} intel_tuning_table_size_element;

  intel_tuning_table_size_element elems[INTEL_MAX_NB_THRESHOLDS];
} intel_tuning_table_numproc_element;

  intel_tuning_table_numproc_element elems[INTEL_MAX_NB_NUMPROCS];
} intel_tuning_table_element;
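/* A hypothetical sketch (values invented, not taken from the Intel manual) of how one entry
 * of such a tuning table is assumed to be laid out, given the fields read by the
 * IMPI_COLL_SELECT macro below: a processes-per-node value, then blocks indexed by
 * communicator size, each holding (max message size, algorithm index) pairs. Field order
 * inside the structures is an assumption. */
#if 0
static intel_tuning_table_element example_table[] = {
  {1,                       /* ppn: 1 process per node                               */
   {{2, 3,                  /* communicators of up to 2 processes, 3 size thresholds */
     {{2048, 1},            /* messages below 2 KiB   -> algorithm 1                 */
      {524288, 2},          /* messages below 512 KiB -> algorithm 2                 */
      {2147483647, 3}}}}},  /* everything larger      -> algorithm 3                 */
};
#endif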
I_MPI_ADJUST_ALLREDUCE

1 - Recursive doubling algorithm
2 - Rabenseifner's algorithm
3 - Reduce + Bcast algorithm
4 - Topology aware Reduce + Bcast algorithm
5 - Binomial gather + scatter algorithm
6 - Topology aware binomial gather + scatter algorithm
7 - Shumilin's ring algorithm

As Shumilin's ring algorithm is unknown, default to the ring algorithm.
int (*intel_allreduce_functions_table[])(void *sendbuf,
      MPI_Datatype datatype,
      MPI_Op op, MPI_Comm comm) = {
    smpi_coll_tuned_allreduce_rdb,
    smpi_coll_tuned_allreduce_rab1,
    smpi_coll_tuned_allreduce_redbcast,
    smpi_coll_tuned_allreduce_mvapich2_two_level,
    smpi_coll_tuned_allreduce_smp_binomial,
    smpi_coll_tuned_allreduce_mvapich2_two_level,
    smpi_coll_tuned_allreduce_ompi_ring_segmented,
    smpi_coll_tuned_allreduce_ompi_ring_segmented

intel_tuning_table_element intel_allreduce_table[] =
/* I_MPI_ADJUST_ALLTOALL

2. Isend/Irecv + waitall algorithm
3. Pairwise exchange algorithm

intel_tuning_table_element intel_alltoall_table[] =
int (*intel_alltoall_functions_table[])(void *sbuf, int scount,
      void* rbuf, int rcount,
    smpi_coll_tuned_alltoall_bruck,
    smpi_coll_tuned_alltoall_mvapich2_scatter_dest,
    smpi_coll_tuned_alltoall_pair,
    smpi_coll_tuned_alltoall_mvapich2 // Plum's algorithm is apparently proprietary (and very efficient)
/* I_MPI_ADJUST_BARRIER

1. Dissemination algorithm
2. Recursive doubling algorithm
3. Topology aware dissemination algorithm
4. Topology aware recursive doubling algorithm
5. Binomial gather + scatter algorithm
6. Topology aware binomial gather + scatter algorithm
static int intel_barrier_gather_scatter(MPI_Comm comm) {
    // our default barrier performs an antibcast/bcast, used here in place of the gather + scatter algorithms
    smpi_mpi_barrier(comm);
int (*intel_barrier_functions_table[])(MPI_Comm comm) = {
    smpi_coll_tuned_barrier_ompi_basic_linear,
    smpi_coll_tuned_barrier_ompi_recursivedoubling,
    smpi_coll_tuned_barrier_ompi_basic_linear,
    smpi_coll_tuned_barrier_ompi_recursivedoubling,
    intel_barrier_gather_scatter,
    intel_barrier_gather_scatter

intel_tuning_table_element intel_barrier_table[] =
1. Binomial algorithm
2. Recursive doubling algorithm
4. Topology aware binomial algorithm
5. Topology aware recursive doubling algorithm
6. Topology aware ring algorithm
7. Shumilin's bcast algorithm
int (*intel_bcast_functions_table[])(void *buff, int count,
      MPI_Datatype datatype, int root,
    smpi_coll_tuned_bcast_binomial_tree,
    //smpi_coll_tuned_bcast_scatter_rdb_allgather,
    smpi_coll_tuned_bcast_NTSL,
    smpi_coll_tuned_bcast_NTSL,
    smpi_coll_tuned_bcast_SMP_binomial,
    //smpi_coll_tuned_bcast_scatter_rdb_allgather,
    smpi_coll_tuned_bcast_NTSL,
    smpi_coll_tuned_bcast_SMP_linear,
    smpi_coll_tuned_bcast_mvapich2, // we don't know Shumilin's algorithm

intel_tuning_table_element intel_bcast_table[] =
/* I_MPI_ADJUST_REDUCE

1. Shumilin's algorithm
2. Binomial algorithm
3. Topology aware Shumilin's algorithm
4. Topology aware binomial algorithm
5. Rabenseifner's algorithm
6. Topology aware Rabenseifner's algorithm
int (*intel_reduce_functions_table[])(void *sendbuf, void *recvbuf,
      int count, MPI_Datatype datatype,
    smpi_coll_tuned_reduce_mvapich2,
    smpi_coll_tuned_reduce_binomial,
    smpi_coll_tuned_reduce_mvapich2,
    smpi_coll_tuned_reduce_mvapich2_two_level,
    smpi_coll_tuned_reduce_rab,
    smpi_coll_tuned_reduce_rab

intel_tuning_table_element intel_reduce_table[] =
/* I_MPI_ADJUST_REDUCE_SCATTER

1. Recursive halving algorithm
2. Pairwise exchange algorithm
3. Recursive doubling algorithm
4. Reduce + Scatterv algorithm
5. Topology aware Reduce + Scatterv algorithm
static int intel_reduce_scatter_reduce_scatterv(void *sbuf, void *rbuf,
    // our default reduce_scatter is used in place of the Reduce + Scatterv algorithms
    smpi_mpi_reduce_scatter(sbuf, rbuf, rcounts, dtype, op, comm);
static int intel_reduce_scatter_recursivehalving(void *sbuf, void *rbuf,
    // OMPI's basic recursive halving requires a commutative operation;
    // non-commutative operations fall back to the mvapich2 implementation
    if (smpi_op_is_commute(op))
      return smpi_coll_tuned_reduce_scatter_ompi_basic_recursivehalving(sbuf, rbuf, rcounts, dtype, op, comm);
      return smpi_coll_tuned_reduce_scatter_mvapich2(sbuf, rbuf, rcounts, dtype, op, comm);
int (*intel_reduce_scatter_functions_table[])(void *sbuf, void *rbuf,
    intel_reduce_scatter_recursivehalving,
    smpi_coll_tuned_reduce_scatter_mpich_pair,
    smpi_coll_tuned_reduce_scatter_mpich_rdb,
    intel_reduce_scatter_reduce_scatterv,
    intel_reduce_scatter_reduce_scatterv

intel_tuning_table_element intel_reduce_scatter_table[] =
/* I_MPI_ADJUST_ALLGATHER

1. Recursive doubling algorithm
2. Bruck's algorithm
4. Topology aware Gatherv + Bcast algorithm
int (*intel_allgather_functions_table[])(void *sbuf, int scount,
      MPI_Datatype sdtype,
      void* rbuf, int rcount,
      MPI_Datatype rdtype,
    smpi_coll_tuned_allgather_rdb,
    smpi_coll_tuned_allgather_bruck,
    smpi_coll_tuned_allgather_ring,
    smpi_coll_tuned_allgather_GB

intel_tuning_table_element intel_allgather_table[] =
/* I_MPI_ADJUST_ALLGATHERV

1. Recursive doubling algorithm
2. Bruck's algorithm
4. Topology aware Gatherv + Bcast algorithm
int (*intel_allgatherv_functions_table[])(void *sbuf, int scount,
      MPI_Datatype sdtype,
      void* rbuf, int *rcounts,
      MPI_Datatype rdtype,
    smpi_coll_tuned_allgatherv_mpich_rdb,
    smpi_coll_tuned_allgatherv_ompi_bruck,
    smpi_coll_tuned_allgatherv_ring,
    smpi_coll_tuned_allgatherv_GB

intel_tuning_table_element intel_allgatherv_table[] =
/* I_MPI_ADJUST_GATHER

1. Binomial algorithm
2. Topology aware binomial algorithm
3. Shumilin's algorithm
int (*intel_gather_functions_table[])(void *sbuf, int scount,
      MPI_Datatype sdtype,
      void* rbuf, int rcount,
      MPI_Datatype rdtype,
    smpi_coll_tuned_gather_ompi_binomial,
    smpi_coll_tuned_gather_ompi_binomial,
    smpi_coll_tuned_gather_mvapich2

intel_tuning_table_element intel_gather_table[] =
/* I_MPI_ADJUST_SCATTER

1. Binomial algorithm
2. Topology aware binomial algorithm
3. Shumilin's algorithm
int (*intel_scatter_functions_table[])(void *sbuf, int scount,
      MPI_Datatype sdtype,
      void* rbuf, int rcount,
      MPI_Datatype rdtype,
      int root, MPI_Comm comm
    smpi_coll_tuned_scatter_ompi_binomial,
    smpi_coll_tuned_scatter_ompi_binomial,
    smpi_coll_tuned_scatter_mvapich2

intel_tuning_table_element intel_scatter_table[] =
/* I_MPI_ADJUST_ALLTOALLV

1. Isend/Irecv + waitall algorithm
int (*intel_alltoallv_functions_table[])(void *sbuf, int *scounts, int *sdisps,
      MPI_Datatype sdtype,
      void *rbuf, int *rcounts, int *rdisps,
      MPI_Datatype rdtype,
    smpi_coll_tuned_alltoallv_ompi_basic_linear,
    smpi_coll_tuned_alltoallv_bruck
intel_tuning_table_element intel_alltoallv_table[] =

    {2147483647,1} // weirdly, Intel reports the use of algorithm 0 here
    {0,1}, // weird again: only for 0-sized messages
// These are collected from table 3.5-2 of the Intel MPI Reference Manual
#define SIZECOMP_reduce_scatter\
  int total_message_size = 0;\
  for (i = 0; i < comm_size; i++) { \
    total_message_size += rcounts[i];\
  size_t block_dsize = total_message_size * smpi_datatype_size(dtype);\

#define SIZECOMP_allreduce\
  size_t block_dsize = rcount * smpi_datatype_size(dtype);

#define SIZECOMP_alltoall\
  size_t block_dsize = send_count * smpi_datatype_size(send_type);

#define SIZECOMP_bcast\
  size_t block_dsize = count * smpi_datatype_size(datatype);

#define SIZECOMP_reduce\
  size_t block_dsize = count * smpi_datatype_size(datatype);

#define SIZECOMP_barrier\
  size_t block_dsize = 1;

#define SIZECOMP_allgather\
  size_t block_dsize = recv_count * smpi_datatype_size(recv_type);

#define SIZECOMP_allgatherv\
  int total_message_size = 0;\
  for (i = 0; i < comm_size; i++) { \
    total_message_size += recv_count[i];\
  size_t block_dsize = total_message_size * smpi_datatype_size(recv_type);

#define SIZECOMP_gather\
  int rank = smpi_comm_rank(comm);\
  size_t block_dsize = (send_buff == MPI_IN_PLACE || rank == root) ?\
    recv_count * smpi_datatype_size(recv_type) :\
    send_count * smpi_datatype_size(send_type);

#define SIZECOMP_scatter\
  int rank = smpi_comm_rank(comm);\
  size_t block_dsize = (sendbuf == MPI_IN_PLACE || rank != root) ?\
    recvcount * smpi_datatype_size(recvtype) :\
    sendcount * smpi_datatype_size(sendtype);

#define SIZECOMP_alltoallv\
  size_t block_dsize = 1;
#define IMPI_COLL_SELECT(cat, ret, args, args2)\
ret smpi_coll_tuned_ ## cat ## _impi (COLL_UNPAREN args)\
    int comm_size = smpi_comm_size(comm);\
    if (smpi_comm_get_leaders_comm(comm) == MPI_COMM_NULL) {\
      smpi_comm_init_smp(comm);\
    if (smpi_comm_is_uniform(comm)) {\
      local_size = smpi_comm_size(smpi_comm_get_intra_comm(comm));\
    while (i < INTEL_MAX_NB_PPN &&\
           local_size != intel_ ## cat ## _table[i].ppn)\
    if (i == INTEL_MAX_NB_PPN) i = 0;\
    while (comm_size > intel_ ## cat ## _table[i].elems[j].max_num_proc\
           && j < INTEL_MAX_NB_THRESHOLDS)\
    while (block_dsize >= intel_ ## cat ## _table[i].elems[j].elems[k].max_size\
           && k < intel_ ## cat ## _table[i].elems[j].num_elems)\
    return (intel_ ## cat ## _functions_table[intel_ ## cat ## _table[i].elems[j].elems[k].algo - 1]\
COLL_APPLY(IMPI_COLL_SELECT, COLL_ALLGATHERV_SIG, (send_buff, send_count, send_type, recv_buff, recv_count, recv_disps, recv_type, comm));
COLL_APPLY(IMPI_COLL_SELECT, COLL_ALLREDUCE_SIG, (sbuf, rbuf, rcount, dtype, op, comm));
COLL_APPLY(IMPI_COLL_SELECT, COLL_GATHER_SIG, (send_buff, send_count, send_type, recv_buff, recv_count, recv_type, root, comm));
COLL_APPLY(IMPI_COLL_SELECT, COLL_ALLGATHER_SIG, (send_buff, send_count, send_type, recv_buff, recv_count, recv_type, comm));
COLL_APPLY(IMPI_COLL_SELECT, COLL_ALLTOALL_SIG, (send_buff, send_count, send_type, recv_buff, recv_count, recv_type, comm));
COLL_APPLY(IMPI_COLL_SELECT, COLL_ALLTOALLV_SIG, (send_buff, send_counts, send_disps, send_type, recv_buff, recv_counts, recv_disps, recv_type, comm));
COLL_APPLY(IMPI_COLL_SELECT, COLL_BCAST_SIG, (buf, count, datatype, root, comm));
COLL_APPLY(IMPI_COLL_SELECT, COLL_REDUCE_SIG, (buf, rbuf, count, datatype, op, root, comm));
COLL_APPLY(IMPI_COLL_SELECT, COLL_REDUCE_SCATTER_SIG, (sbuf, rbuf, rcounts, dtype, op, comm));
COLL_APPLY(IMPI_COLL_SELECT, COLL_SCATTER_SIG, (sendbuf, sendcount, sendtype, recvbuf, recvcount, recvtype, root, comm));
COLL_APPLY(IMPI_COLL_SELECT, COLL_BARRIER_SIG, (comm));
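/* For illustration only: a rough, hand-expanded sketch of what the allreduce instantiation
 * above is assumed to generate (the COLL_*_SIG / COLL_UNPAREN plumbing and the SMP-comm
 * initialization are simplified away). It shows the three-level lookup performed by
 * IMPI_COLL_SELECT: processes per node -> communicator size -> message size -> algorithm. */
#if 0
int smpi_coll_tuned_allreduce_impi(void *sbuf, void *rbuf, int rcount,
                                   MPI_Datatype dtype, MPI_Op op, MPI_Comm comm)
{
  int comm_size = smpi_comm_size(comm);
  size_t block_dsize = rcount * smpi_datatype_size(dtype); /* SIZECOMP_allreduce */
  int i = 0, j = 0, k = 0;
  int local_size = 1;
  if (smpi_comm_is_uniform(comm))
    local_size = smpi_comm_size(smpi_comm_get_intra_comm(comm));
  /* 1) pick the table row matching the number of processes per node (default to row 0) */
  while (i < INTEL_MAX_NB_PPN && local_size != intel_allreduce_table[i].ppn)
    i++;
  if (i == INTEL_MAX_NB_PPN)
    i = 0;
  /* 2) pick the block matching the communicator size */
  while (comm_size > intel_allreduce_table[i].elems[j].max_num_proc && j < INTEL_MAX_NB_THRESHOLDS)
    j++;
  /* 3) pick the threshold matching the message size */
  while (block_dsize >= intel_allreduce_table[i].elems[j].elems[k].max_size &&
         k < intel_allreduce_table[i].elems[j].num_elems)
    k++;
  /* 4) dispatch to the selected implementation (Intel algorithm numbers start at 1) */
  return intel_allreduce_functions_table[intel_allreduce_table[i].elems[j].elems[k].algo - 1]
           (sbuf, rbuf, rcount, dtype, op, comm);
}
#endif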