/* Selector for collective algorithms, based on OpenMPI's default coll_tuned_decision_fixed selector */

/* Copyright (c) 2009-2010, 2013-2014. The SimGrid Team.
 * All rights reserved. */

/* This program is free software; you can redistribute it and/or modify it
 * under the terms of the license (GNU LGPL) which comes with this package. */

#include "colls_private.h"
// This selector is based on information gathered on the Stampede cluster with Intel MPI 4.1.3.049, and from the Intel MPI Reference Manual. The data was gathered by launching runs with 1, 2, 4, 8, and 16 processes per node.
#define INTEL_MAX_NB_THRESHOLDS 32
#define INTEL_MAX_NB_NUMPROCS 12
#define INTEL_MAX_NB_PPN 5 /* 1 2 4 8 16 ppn */

/* A message-size threshold: use algorithm `algo` for messages up to max_size bytes */
typedef struct {
  unsigned int max_size;
  int algo;
} intel_tuning_table_size_element;

/* The size thresholds for one communicator-size bucket */
typedef struct {
  int max_num_proc;
  int num_elems;
  intel_tuning_table_size_element elems[INTEL_MAX_NB_THRESHOLDS];
} intel_tuning_table_numproc_element;

/* The full table for one processes-per-node (ppn) configuration */
typedef struct {
  int ppn;
  intel_tuning_table_numproc_element elems[INTEL_MAX_NB_NUMPROCS];
} intel_tuning_table_element;
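/* Purely illustrative example (hypothetical values, not taken from the Intel
 * manual): one ppn bucket covering every communicator size, with two message
 * size thresholds. Algorithm ids are 1-based, following the I_MPI_ADJUST_*
 * numbering documented below; the selection macro at the end of this file
 * subtracts 1 to index the per-collective function tables. */
intel_tuning_table_element example_tuning_table[] = {
    {1,                  /* applies when one process runs per node            */
     {{2147483647, 2,    /* covers any communicator size; two size thresholds */
       {{1024, 1},       /* messages under 1 KiB -> algorithm 1               */
        {2147483647, 2}} /* everything larger    -> algorithm 2               */
     }}}
};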
/*
I_MPI_ADJUST_ALLREDUCE

1 - Recursive doubling algorithm
2 - Rabenseifner's algorithm
3 - Reduce + Bcast algorithm
4 - Topology aware Reduce + Bcast algorithm
5 - Binomial gather + scatter algorithm
6 - Topology aware binomial gather + scatter algorithm
7 - Shumilin's ring algorithm
*/

// As Shumilin's ring algorithm is unknown, default to a ring algorithm
int (*intel_allreduce_functions_table[])(void *sendbuf,
      void *recvbuf,
      int count,
      MPI_Datatype datatype,
      MPI_Op op, MPI_Comm comm) = {
    smpi_coll_tuned_allreduce_rdb,
    smpi_coll_tuned_allreduce_rab1,
    smpi_coll_tuned_allreduce_redbcast,
    smpi_coll_tuned_allreduce_mvapich2_two_level,
    smpi_coll_tuned_allreduce_smp_binomial,
    smpi_coll_tuned_allreduce_mvapich2_two_level,
    smpi_coll_tuned_allreduce_ompi_ring_segmented,
    smpi_coll_tuned_allreduce_ompi_ring_segmented
};
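/* Example of the id-to-function mapping (ids are 1-based): under Intel MPI,
 * I_MPI_ADJUST_ALLREDUCE=2 selects Rabenseifner's algorithm; the selector
 * below reaches it as intel_allreduce_functions_table[2 - 1], i.e.
 * smpi_coll_tuned_allreduce_rab1. */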
intel_tuning_table_element intel_allreduce_table[] =
/*
I_MPI_ADJUST_ALLTOALL

1. Bruck's algorithm
2. Isend/Irecv + waitall algorithm
3. Pairwise exchange algorithm
4. Plum's algorithm
*/
intel_tuning_table_element intel_alltoall_table[] =
int (*intel_alltoall_functions_table[])(void *sbuf, int scount,
      MPI_Datatype sdtype,
      void* rbuf, int rcount,
      MPI_Datatype rdtype,
      MPI_Comm comm) = {
    smpi_coll_tuned_alltoall_bruck,
    smpi_coll_tuned_alltoall_mvapich2_scatter_dest,
    smpi_coll_tuned_alltoall_pair,
    smpi_coll_tuned_alltoall_mvapich2 // Plum's algorithm is proprietary (and reportedly very efficient); use mvapich2's as a stand-in
};
/*
I_MPI_ADJUST_BARRIER

1. Dissemination algorithm
2. Recursive doubling algorithm
3. Topology aware dissemination algorithm
4. Topology aware recursive doubling algorithm
5. Binomial gather + scatter algorithm
6. Topology aware binomial gather + scatter algorithm
*/
static int intel_barrier_gather_scatter(MPI_Comm comm){
    // Our default barrier performs an antibcast/bcast, close enough to a gather + scatter
    smpi_mpi_barrier(comm);
    return MPI_SUCCESS;
}
int (*intel_barrier_functions_table[])(MPI_Comm comm) = {
    smpi_coll_tuned_barrier_ompi_basic_linear,
    smpi_coll_tuned_barrier_ompi_recursivedoubling,
    smpi_coll_tuned_barrier_ompi_basic_linear,
    smpi_coll_tuned_barrier_ompi_recursivedoubling,
    intel_barrier_gather_scatter,
    intel_barrier_gather_scatter
};
intel_tuning_table_element intel_barrier_table[] =
/*
I_MPI_ADJUST_BCAST

1. Binomial algorithm
2. Recursive doubling algorithm
3. Ring algorithm
4. Topology aware binomial algorithm
5. Topology aware recursive doubling algorithm
6. Topology aware ring algorithm
7. Shumilin's bcast algorithm
*/
int (*intel_bcast_functions_table[])(void *buff, int count,
      MPI_Datatype datatype, int root,
      MPI_Comm comm) = {
    smpi_coll_tuned_bcast_binomial_tree,
    //smpi_coll_tuned_bcast_scatter_rdb_allgather,
    smpi_coll_tuned_bcast_NTSL,
    smpi_coll_tuned_bcast_NTSL,
    smpi_coll_tuned_bcast_SMP_binomial,
    //smpi_coll_tuned_bcast_scatter_rdb_allgather,
    smpi_coll_tuned_bcast_NTSL,
    smpi_coll_tuned_bcast_SMP_linear,
    smpi_coll_tuned_bcast_mvapich2 // Shumilin's algorithm is unknown; use mvapich2's instead
};
intel_tuning_table_element intel_bcast_table[] =
/*
I_MPI_ADJUST_REDUCE

1. Shumilin's algorithm
2. Binomial algorithm
3. Topology aware Shumilin's algorithm
4. Topology aware binomial algorithm
5. Rabenseifner's algorithm
6. Topology aware Rabenseifner's algorithm
*/
int (*intel_reduce_functions_table[])(void *sendbuf, void *recvbuf,
      int count, MPI_Datatype datatype,
      MPI_Op op, int root, MPI_Comm comm) = {
    smpi_coll_tuned_reduce_mvapich2,
    smpi_coll_tuned_reduce_binomial,
    smpi_coll_tuned_reduce_mvapich2,
    smpi_coll_tuned_reduce_mvapich2_two_level,
    smpi_coll_tuned_reduce_rab,
    smpi_coll_tuned_reduce_rab
};
intel_tuning_table_element intel_reduce_table[] =
/*
I_MPI_ADJUST_REDUCE_SCATTER

1. Recursive halving algorithm
2. Pairwise exchange algorithm
3. Recursive doubling algorithm
4. Reduce + Scatterv algorithm
5. Topology aware Reduce + Scatterv algorithm
*/
static int intel_reduce_scatter_reduce_scatterv(void *sbuf, void *rbuf,
                                                int *rcounts, MPI_Datatype dtype,
                                                MPI_Op op, MPI_Comm comm)
{
    // Our default reduce_scatter already performs a reduce followed by a scatterv
    smpi_mpi_reduce_scatter(sbuf, rbuf, rcounts, dtype, op, comm);
    return MPI_SUCCESS;
}

static int intel_reduce_scatter_recursivehalving(void *sbuf, void *rbuf,
                                                 int *rcounts, MPI_Datatype dtype,
                                                 MPI_Op op, MPI_Comm comm)
{
    // Use the recursive-halving variant only for commutative operations;
    // otherwise fall back to mvapich2's implementation
    if(smpi_op_is_commute(op))
        return smpi_coll_tuned_reduce_scatter_ompi_basic_recursivehalving(sbuf, rbuf, rcounts, dtype, op, comm);
    else
        return smpi_coll_tuned_reduce_scatter_mvapich2(sbuf, rbuf, rcounts, dtype, op, comm);
}
int (*intel_reduce_scatter_functions_table[])(void *sbuf, void *rbuf,
      int *rcounts, MPI_Datatype dtype,
      MPI_Op op, MPI_Comm comm) = {
    intel_reduce_scatter_recursivehalving,
    smpi_coll_tuned_reduce_scatter_mpich_pair,
    smpi_coll_tuned_reduce_scatter_mpich_rdb,
    intel_reduce_scatter_reduce_scatterv,
    intel_reduce_scatter_reduce_scatterv
};
intel_tuning_table_element intel_reduce_scatter_table[] =
/*
I_MPI_ADJUST_ALLGATHER

1. Recursive doubling algorithm
2. Bruck's algorithm
3. Ring algorithm
4. Topology aware Gatherv + Bcast algorithm
*/
int (*intel_allgather_functions_table[])(void *sbuf, int scount,
      MPI_Datatype sdtype,
      void* rbuf, int rcount,
      MPI_Datatype rdtype,
      MPI_Comm comm) = {
    smpi_coll_tuned_allgather_rdb,
    smpi_coll_tuned_allgather_bruck,
    smpi_coll_tuned_allgather_ring,
    smpi_coll_tuned_allgather_GB
};
intel_tuning_table_element intel_allgather_table[] =
/*
I_MPI_ADJUST_ALLGATHERV

1. Recursive doubling algorithm
2. Bruck's algorithm
3. Ring algorithm
4. Topology aware Gatherv + Bcast algorithm
*/
int (*intel_allgatherv_functions_table[])(void *sbuf, int scount,
      MPI_Datatype sdtype,
      void* rbuf, int *rcounts,
      int *rdisps,
      MPI_Datatype rdtype,
      MPI_Comm comm) = {
    smpi_coll_tuned_allgatherv_mpich_rdb,
    smpi_coll_tuned_allgatherv_ompi_bruck,
    smpi_coll_tuned_allgatherv_ring,
    smpi_coll_tuned_allgatherv_GB
};
intel_tuning_table_element intel_allgatherv_table[] =
/*
I_MPI_ADJUST_GATHER

1. Binomial algorithm
2. Topology aware binomial algorithm
3. Shumilin's algorithm
*/
int (*intel_gather_functions_table[])(void *sbuf, int scount,
      MPI_Datatype sdtype,
      void* rbuf, int rcount,
      MPI_Datatype rdtype,
      int root, MPI_Comm comm) = {
    smpi_coll_tuned_gather_ompi_binomial,
    smpi_coll_tuned_gather_ompi_binomial,
    smpi_coll_tuned_gather_mvapich2
};
intel_tuning_table_element intel_gather_table[] =
/*
I_MPI_ADJUST_SCATTER

1. Binomial algorithm
2. Topology aware binomial algorithm
3. Shumilin's algorithm
*/
int (*intel_scatter_functions_table[])(void *sbuf, int scount,
      MPI_Datatype sdtype,
      void* rbuf, int rcount,
      MPI_Datatype rdtype,
      int root, MPI_Comm comm) = {
    smpi_coll_tuned_scatter_ompi_binomial,
    smpi_coll_tuned_scatter_ompi_binomial,
    smpi_coll_tuned_scatter_mvapich2
};
intel_tuning_table_element intel_scatter_table[] =
/*
I_MPI_ADJUST_ALLTOALLV

1. Isend/Irecv + waitall algorithm
2. Plum's algorithm
*/
int (*intel_alltoallv_functions_table[])(void *sbuf, int *scounts, int *sdisps,
      MPI_Datatype sdtype,
      void *rbuf, int *rcounts, int *rdisps,
      MPI_Datatype rdtype,
      MPI_Comm comm) = {
    smpi_coll_tuned_alltoallv_ompi_basic_linear,
    smpi_coll_tuned_alltoallv_bruck
};
intel_tuning_table_element intel_alltoallv_table[] =
    {2147483647,1} // oddly, Intel reports the use of algorithm 0 here
    {0,1}, // odd again: this entry only covers 0-sized messages
// These are collected from table 3.5-2 of the Intel MPI Reference Manual
#define SIZECOMP_reduce_scatter\
  int total_message_size = 0;\
  for (i = 0; i < comm_size; i++) { \
      total_message_size += rcounts[i];\
  }\
  size_t block_dsize = total_message_size*smpi_datatype_size(dtype);
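/* Worked example (hypothetical values): a reduce_scatter on comm_size = 4
 * with rcounts = {2, 2, 2, 2} and dtype = MPI_DOUBLE yields
 * total_message_size = 8 and block_dsize = 8 * 8 = 64 bytes, which is then
 * matched against the max_size thresholds of intel_reduce_scatter_table. */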
#define SIZECOMP_allreduce\
  size_t block_dsize = rcount * smpi_datatype_size(dtype);

#define SIZECOMP_alltoall\
  size_t block_dsize = send_count * smpi_datatype_size(send_type);

#define SIZECOMP_bcast\
  size_t block_dsize = count * smpi_datatype_size(datatype);

#define SIZECOMP_reduce\
  size_t block_dsize = count * smpi_datatype_size(datatype);

#define SIZECOMP_barrier\
  size_t block_dsize = 1;

#define SIZECOMP_allgather\
  size_t block_dsize = recv_count * smpi_datatype_size(recv_type);
#define SIZECOMP_allgatherv\
  int total_message_size = 0;\
  for (i = 0; i < comm_size; i++) { \
      total_message_size += recv_count[i];\
  }\
  size_t block_dsize = total_message_size*smpi_datatype_size(recv_type);
#define SIZECOMP_gather\
  int rank = smpi_comm_rank(comm);\
  size_t block_dsize = (send_buff == MPI_IN_PLACE || rank == root) ?\
                recv_count * smpi_datatype_size(recv_type) :\
                send_count * smpi_datatype_size(send_type);

#define SIZECOMP_scatter\
  int rank = smpi_comm_rank(comm);\
  size_t block_dsize = (sendbuf == MPI_IN_PLACE || rank != root) ?\
                recvcount * smpi_datatype_size(recvtype) :\
                sendcount * smpi_datatype_size(sendtype);

#define SIZECOMP_alltoallv\
  size_t block_dsize = 1;
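/* Note: for barrier there is no payload, and for alltoallv there is no single
 * message size, so block_dsize is pinned to 1 and the selection effectively
 * depends only on the ppn and communicator-size buckets. */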
#define IMPI_COLL_SELECT(cat, ret, args, args2)\
ret smpi_coll_tuned_ ## cat ## _impi (COLL_UNPAREN args)\
{\
    int comm_size = smpi_comm_size(comm);\
    int i = 0;\
    SIZECOMP_ ## cat\
    i = 0;\
    int j = 0, k = 0;\
    if(smpi_comm_get_leaders_comm(comm)==MPI_COMM_NULL){\
        smpi_comm_init_smp(comm);\
    }\
    int local_size = 1;\
    if (smpi_comm_is_uniform(comm)) {\
        local_size = smpi_comm_size(smpi_comm_get_intra_comm(comm));\
    }\
    /* Find the ppn bucket matching the intra-node size; default to the first one */\
    while(i < INTEL_MAX_NB_PPN &&\
          local_size != intel_ ## cat ## _table[i].ppn)\
        i++;\
    if(i == INTEL_MAX_NB_PPN) i = 0;\
    /* Find the bucket for the communicator size */\
    while(comm_size > intel_ ## cat ## _table[i].elems[j].max_num_proc\
          && j < INTEL_MAX_NB_THRESHOLDS)\
        j++;\
    /* Find the bucket for the message size */\
    while(block_dsize >= intel_ ## cat ## _table[i].elems[j].elems[k].max_size\
          && k < intel_ ## cat ## _table[i].elems[j].num_elems)\
        k++;\
    /* Algorithm ids in the tables are 1-based */\
    return (intel_ ## cat ## _functions_table[intel_ ## cat ## _table[i].elems[j].elems[k].algo-1]\
        args2);\
}
COLL_APPLY(IMPI_COLL_SELECT, COLL_ALLGATHERV_SIG, (send_buff, send_count, send_type, recv_buff, recv_count, recv_disps, recv_type, comm));
COLL_APPLY(IMPI_COLL_SELECT, COLL_ALLREDUCE_SIG, (sbuf, rbuf, rcount, dtype, op, comm));
COLL_APPLY(IMPI_COLL_SELECT, COLL_GATHER_SIG, (send_buff, send_count, send_type, recv_buff, recv_count, recv_type, root, comm));
COLL_APPLY(IMPI_COLL_SELECT, COLL_ALLGATHER_SIG, (send_buff, send_count, send_type, recv_buff, recv_count, recv_type, comm));
COLL_APPLY(IMPI_COLL_SELECT, COLL_ALLTOALL_SIG, (send_buff, send_count, send_type, recv_buff, recv_count, recv_type, comm));
COLL_APPLY(IMPI_COLL_SELECT, COLL_ALLTOALLV_SIG, (send_buff, send_counts, send_disps, send_type, recv_buff, recv_counts, recv_disps, recv_type, comm));
COLL_APPLY(IMPI_COLL_SELECT, COLL_BCAST_SIG, (buf, count, datatype, root, comm));
COLL_APPLY(IMPI_COLL_SELECT, COLL_REDUCE_SIG, (buf, rbuf, count, datatype, op, root, comm));
COLL_APPLY(IMPI_COLL_SELECT, COLL_REDUCE_SCATTER_SIG, (sbuf, rbuf, rcounts, dtype, op, comm));
COLL_APPLY(IMPI_COLL_SELECT, COLL_SCATTER_SIG, (sendbuf, sendcount, sendtype, recvbuf, recvcount, recvtype, root, comm));
COLL_APPLY(IMPI_COLL_SELECT, COLL_BARRIER_SIG, (comm));
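/* For reference, a sketch of what the allreduce instantiation above expands to
 * (assuming COLL_ALLREDUCE_SIG provides the usual (sbuf, rbuf, rcount, dtype,
 * op, comm) signature; kept in a comment since the macro already generates the
 * real function):
 *
 *   int smpi_coll_tuned_allreduce_impi(void *sbuf, void *rbuf, int rcount,
 *                                      MPI_Datatype dtype, MPI_Op op,
 *                                      MPI_Comm comm)
 *   {
 *     int comm_size = smpi_comm_size(comm);
 *     size_t block_dsize = rcount * smpi_datatype_size(dtype); // SIZECOMP_allreduce
 *     // ... pick i (ppn bucket), j (communicator size) and k (message size)
 *     // in intel_allreduce_table, then dispatch:
 *     return intel_allreduce_functions_table
 *              [intel_allreduce_table[i].elems[j].elems[k].algo - 1]
 *              (sbuf, rbuf, rcount, dtype, op, comm);
 *   }
 */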