1 /* selector for collective algorithms based on openmpi's default coll_tuned_decision_fixed selector */
3 /* Copyright (c) 2009-2023. The SimGrid Team.
4 * All rights reserved. */
6 /* This program is free software; you can redistribute it and/or modify it
7 * under the terms of the license (GNU LGPL) which comes with this package. */
9 #include "colls_private.hpp"
11 // This selector is based on information gathered on the Stampede cluster, with Intel MPI 4.1.3.049, and from the intel reference manual. The data was gathered launching runs with 1,2,4,8,16 processes per node.
#define INTEL_MAX_NB_THRESHOLDS 32 /* max message-size thresholds per process-count entry */
#define INTEL_MAX_NB_NUMPROCS 12 /* max process-count entries per ppn entry */
#define INTEL_MAX_NB_PPN 5 /* 1 2 4 8 16 ppn */
17 struct intel_tuning_table_size_element {
18 unsigned int max_size;
22 struct intel_tuning_table_numproc_element {
25 intel_tuning_table_size_element elems[INTEL_MAX_NB_THRESHOLDS];
28 struct intel_tuning_table_element {
30 intel_tuning_table_numproc_element elems[INTEL_MAX_NB_NUMPROCS];
34 I_MPI_ADJUST_ALLREDUCE
38 1 - Recursive doubling algorithm
39 2 - Rabenseifner's algorithm
40 3 - Reduce + Bcast algorithm
41 4 - Topology aware Reduce + Bcast algorithm
42 5 - Binomial gather + scatter algorithm
6 - Topology aware binomial gather + scatter algorithm
44 7 - Shumilin's ring algorithm
as Shumilin's ring algorithm is unknown, default to ring
50 namespace simgrid::smpi {
// Dispatch table for I_MPI_ADJUST_ALLREDUCE, indexed by (Intel algorithm id - 1).
// Shumilin's ring algorithm (id 7) is unknown, so ompi's segmented ring stands in for it.
int (*intel_allreduce_functions_table[])(const void *sendbuf,
      MPI_Datatype datatype,
      MPI_Op op, MPI_Comm comm) ={
    allreduce__mvapich2_two_level,
    allreduce__smp_binomial,
    allreduce__mvapich2_two_level,
    allreduce__ompi_ring_segmented,
    allreduce__ompi_ring_segmented
67 intel_tuning_table_element intel_allreduce_table[] =
384 /*I_MPI_ADJUST_ALLTOALL
389 2. Isend/Irecv + waitall algorithm
3. Pairwise exchange algorithm
396 intel_tuning_table_element intel_alltoall_table[] =
// Dispatch table for I_MPI_ADJUST_ALLTOALL, indexed by (Intel algorithm id - 1).
int (*intel_alltoall_functions_table[])(const void *sbuf, int scount,
    void* rbuf, int rcount,
    alltoall__mvapich2_scatter_dest,
    alltoall__mvapich2 // Plum's algorithm is proprietary (and reportedly very efficient); mvapich2 stands in for it
647 /*I_MPI_ADJUST_BARRIER
651 1. Dissemination algorithm
652 2. Recursive doubling algorithm
653 3. Topology aware dissemination algorithm
654 4. Topology aware recursive doubling algorithm
5. Binomial gather + scatter algorithm
6. Topology aware binomial gather + scatter algorithm
// Stand-in for Intel's "binomial gather + scatter" barrier (algorithm ids 5/6),
// implemented with SimGrid's default barrier.
static int intel_barrier_gather_scatter(MPI_Comm comm){
    // our default barrier performs an antibcast/bcast
    barrier__default(comm);
// Dispatch table for I_MPI_ADJUST_BARRIER, indexed by (Intel algorithm id - 1);
// ids follow the list in the comment above.
int (*intel_barrier_functions_table[])(MPI_Comm comm) ={
    barrier__ompi_basic_linear,      // 1. dissemination
    barrier__ompi_recursivedoubling, // 2. recursive doubling
    barrier__ompi_basic_linear,      // 3. topology aware dissemination
    barrier__ompi_recursivedoubling, // 4. topology aware recursive doubling
    intel_barrier_gather_scatter,    // 5. binomial gather + scatter
    intel_barrier_gather_scatter     // 6. topology aware binomial gather + scatter
674 intel_tuning_table_element intel_barrier_table[] =
791 1. Binomial algorithm
792 2. Recursive doubling algorithm
794 4. Topology aware binomial algorithm
795 5. Topology aware recursive doubling algorithm
796 6. Topology aware ring algorithm
797 7. Shumilin's bcast algorithm
// Dispatch table for I_MPI_ADJUST_BCAST, indexed by (Intel algorithm id - 1).
int (*intel_bcast_functions_table[])(void *buff, int count,
    MPI_Datatype datatype, int root,
    bcast__binomial_tree,
    //bcast__scatter_rdb_allgather,
    //bcast__scatter_rdb_allgather,
    bcast__mvapich2, // Shumilin's algorithm is unknown; mvapich2's bcast stands in for it
814 intel_tuning_table_element intel_bcast_table[] =
956 /*I_MPI_ADJUST_REDUCE
960 1. Shumilin's algorithm
961 2. Binomial algorithm
962 3. Topology aware Shumilin's algorithm
963 4. Topology aware binomial algorithm
964 5. Rabenseifner's algorithm
965 6. Topology aware Rabenseifner's algorithm
969 int (*intel_reduce_functions_table[])(const void *sendbuf, void *recvbuf,
970 int count, MPI_Datatype datatype,
976 reduce__mvapich2_two_level,
981 intel_tuning_table_element intel_reduce_table[] =
1046 /* I_MPI_ADJUST_REDUCE_SCATTER
1. Recursive halving algorithm
2. Pairwise exchange algorithm
1052 3. Recursive doubling algorithm
1053 4. Reduce + Scatterv algorithm
1054 5. Topology aware Reduce + Scatterv algorithm
1057 static int intel_reduce_scatter_reduce_scatterv(const void *sbuf, void *rbuf,
1063 reduce_scatter__default(sbuf, rbuf, rcounts,dtype, op,comm);
// Intel algorithm 1 ("recursive halving"): the ompi variant is only used when
// the reduction operation is commutative (or MPI_OP_NULL); everything else
// falls back to mvapich2's reduce_scatter.
static int intel_reduce_scatter_recursivehalving(const void *sbuf, void *rbuf,
    // MPI_OP_NULL is tested first, so a null op is never dereferenced
    if(op==MPI_OP_NULL || op->is_commutative())
      return reduce_scatter__ompi_basic_recursivehalving(sbuf, rbuf, rcounts,dtype, op,comm);
      return reduce_scatter__mvapich2(sbuf, rbuf, rcounts,dtype, op,comm);
// Dispatch table for I_MPI_ADJUST_REDUCE_SCATTER, indexed by (Intel algorithm id - 1).
int (*intel_reduce_scatter_functions_table[])( const void *sbuf, void *rbuf,
    intel_reduce_scatter_recursivehalving, // 1. recursive halving
    reduce_scatter__mpich_pair,            // 2. pairwise exchange
    reduce_scatter__mpich_rdb,             // 3. recursive doubling
    intel_reduce_scatter_reduce_scatterv,  // 4. Reduce + Scatterv
    intel_reduce_scatter_reduce_scatterv   // 5. topology aware Reduce + Scatterv
1092 intel_tuning_table_element intel_reduce_scatter_table[] =
1478 /* I_MPI_ADJUST_ALLGATHER
1482 1. Recursive doubling algorithm
1483 2. Bruck's algorithm
1485 4. Topology aware Gatherv + Bcast algorithm
1489 int (*intel_allgather_functions_table[])(const void *sbuf, int scount,
1490 MPI_Datatype sdtype,
1491 void* rbuf, int rcount,
1492 MPI_Datatype rdtype,
1501 intel_tuning_table_element intel_allgather_table[] =
1647 /* I_MPI_ADJUST_ALLGATHERV
1651 1. Recursive doubling algorithm
1652 2. Bruck's algorithm
1654 4. Topology aware Gatherv + Bcast algorithm
1658 int (*intel_allgatherv_functions_table[])(const void *sbuf, int scount,
1659 MPI_Datatype sdtype,
1660 void* rbuf, const int *rcounts,
1662 MPI_Datatype rdtype,
1665 allgatherv__mpich_rdb,
1666 allgatherv__ompi_bruck,
1671 intel_tuning_table_element intel_allgatherv_table[] =
1859 /* I_MPI_ADJUST_GATHER
1863 1. Binomial algorithm
1864 2. Topology aware binomial algorithm
1865 3. Shumilin's algorithm
1869 int (*intel_gather_functions_table[])(const void *sbuf, int scount,
1870 MPI_Datatype sdtype,
1871 void* rbuf, int rcount,
1872 MPI_Datatype rdtype,
1876 gather__ompi_binomial,
1877 gather__ompi_binomial,
1881 intel_tuning_table_element intel_gather_table[] =
1963 /* I_MPI_ADJUST_SCATTER
1967 1. Binomial algorithm
1968 2. Topology aware binomial algorithm
1969 3. Shumilin's algorithm
1973 int (*intel_scatter_functions_table[])(const void *sbuf, int scount,
1974 MPI_Datatype sdtype,
1975 void* rbuf, int rcount,
1976 MPI_Datatype rdtype,
1977 int root, MPI_Comm comm
1979 scatter__ompi_binomial,
1980 scatter__ompi_binomial,
1984 intel_tuning_table_element intel_scatter_table[] =
2138 /* I_MPI_ADJUST_ALLTOALLV
2142 1. Isend/Irecv + waitall algorithm
2147 int (*intel_alltoallv_functions_table[])(const void *sbuf, const int *scounts, const int *sdisps,
2148 MPI_Datatype sdtype,
2149 void *rbuf, const int *rcounts, const int *rdisps,
2150 MPI_Datatype rdtype,
2153 alltoallv__ompi_basic_linear,
2157 intel_tuning_table_element intel_alltoallv_table[] =
2177 {2147483647,1}//weirdly, intel reports the use of algo 0 here
2193 {0,1},//weird again, only for 0-sized messages
2214 //These are collected from table 3.5-2 of the Intel MPI Reference Manual
2217 #define SIZECOMP_reduce_scatter\
2218 int total_message_size = 0;\
2219 for (i = 0; i < comm_size; i++) { \
2220 total_message_size += rcounts[i];\
2222 size_t block_dsize = total_message_size*dtype->size();\
// Message size used for the allreduce decision: element count times datatype size.
#define SIZECOMP_allreduce\
  size_t block_dsize = dtype->size() * rcount;
// Message size used for the alltoall decision: send count times send datatype size.
#define SIZECOMP_alltoall\
  size_t block_dsize = send_type->size() * send_count;
// Message size used for the bcast decision: element count times datatype size.
#define SIZECOMP_bcast\
  size_t block_dsize = datatype->size() * count;
// Message size used for the reduce decision: element count times datatype size.
#define SIZECOMP_reduce\
  size_t block_dsize = datatype->size() * count;
// Barrier carries no payload; a constant placeholder size is enough for the lookup.
#define SIZECOMP_barrier\
  size_t block_dsize{1};
// Message size used for the allgather decision: receive count times receive datatype size.
#define SIZECOMP_allgather\
  size_t block_dsize = recv_type->size() * recv_count;
2242 #define SIZECOMP_allgatherv\
2243 int total_message_size = 0;\
2244 for (i = 0; i < comm_size; i++) { \
2245 total_message_size += recv_count[i];\
2247 size_t block_dsize = total_message_size*recv_type->size();
// Gather: non-root senders size the message by their send parameters; the root
// (or an MPI_IN_PLACE caller) sizes it by its receive parameters.
#define SIZECOMP_gather\
  int rank = comm->rank();\
  size_t block_dsize = (send_buff != MPI_IN_PLACE && rank != root) ?\
      send_count * send_type->size() :\
      recv_count * recv_type->size();
// Scatter: only the root (when not using MPI_IN_PLACE) sizes the message by its
// send parameters; every other rank sizes it by its receive parameters.
#define SIZECOMP_scatter\
  int rank = comm->rank();\
  size_t block_dsize = (sendbuf != MPI_IN_PLACE && rank == root) ?\
      sendcount * sendtype->size() :\
      recvcount * recvtype->size();
// Message size plays no role in the alltoallv decision; use a constant placeholder.
#define SIZECOMP_alltoallv\
  size_t block_dsize{1};
// Generic selector body for collective category `cat`:
//   i indexes the ppn entry, j the process-count entry, k the message-size
//   threshold; the matched entry's 1-based algorithm id picks the function
//   from intel_<cat>_functions_table.
// NOTE(review): in the j and k loops below, the table access appears before
// the bounds test in the && chain (unlike the i loop, which tests the bound
// first) — a table with no matching entry would read one element past the
// end. Verify against the full table definitions.
#define IMPI_COLL_SELECT(cat, ret, args, args2) \
  ret _XBT_CONCAT2(cat, __impi)(COLL_UNPAREN args) \
    int comm_size = comm->size(); \
    _XBT_CONCAT(SIZECOMP_, cat) \
    if (comm->get_leaders_comm() == MPI_COMM_NULL) { \
      int local_size = 1; \
      if (comm->is_uniform()) { \
        local_size = comm->get_intra_comm()->size(); \
      while (i < INTEL_MAX_NB_PPN && local_size != _XBT_CONCAT3(intel_, cat, _table)[i].ppn) \
      if (i == INTEL_MAX_NB_PPN) \
      while (comm_size > _XBT_CONCAT3(intel_, cat, _table)[i].elems[j].max_num_proc && j < INTEL_MAX_NB_THRESHOLDS) \
      while (block_dsize >= _XBT_CONCAT3(intel_, cat, _table)[i].elems[j].elems[k].max_size && \
             k < _XBT_CONCAT3(intel_, cat, _table)[i].elems[j].num_elems) \
      return (_XBT_CONCAT3(intel_, cat, \
                           _functions_table)[_XBT_CONCAT3(intel_, cat, _table)[i].elems[j].elems[k].algo - 1] args2); \
// Instantiate one <coll>__impi entry point per collective; the second argument
// list forwards the call's parameters to the implementation chosen by the table walk.
COLL_APPLY(IMPI_COLL_SELECT, COLL_ALLGATHERV_SIG, (send_buff, send_count, send_type, recv_buff, recv_count, recv_disps, recv_type, comm))
COLL_APPLY(IMPI_COLL_SELECT, COLL_ALLREDUCE_SIG, (sbuf, rbuf, rcount, dtype, op, comm))
COLL_APPLY(IMPI_COLL_SELECT, COLL_GATHER_SIG, (send_buff, send_count, send_type, recv_buff, recv_count, recv_type, root, comm))
COLL_APPLY(IMPI_COLL_SELECT, COLL_ALLGATHER_SIG, (send_buff,send_count,send_type,recv_buff,recv_count,recv_type,comm))
COLL_APPLY(IMPI_COLL_SELECT, COLL_ALLTOALL_SIG,(send_buff, send_count, send_type, recv_buff, recv_count, recv_type,comm))
COLL_APPLY(IMPI_COLL_SELECT, COLL_ALLTOALLV_SIG, (send_buff, send_counts, send_disps, send_type, recv_buff, recv_counts, recv_disps, recv_type, comm))
COLL_APPLY(IMPI_COLL_SELECT, COLL_BCAST_SIG , (buf, count, datatype, root, comm))
COLL_APPLY(IMPI_COLL_SELECT, COLL_REDUCE_SIG,(buf,rbuf, count, datatype, op, root, comm))
COLL_APPLY(IMPI_COLL_SELECT, COLL_REDUCE_SCATTER_SIG ,(sbuf,rbuf, rcounts,dtype,op,comm))
COLL_APPLY(IMPI_COLL_SELECT, COLL_SCATTER_SIG ,(sendbuf, sendcount, sendtype,recvbuf, recvcount, recvtype,root, comm))
COLL_APPLY(IMPI_COLL_SELECT, COLL_BARRIER_SIG,(comm))
2304 } // namespace simgrid::smpi