src/smpi/colls/smpi_openmpi_selector.c

   1 /* selector for collective algorithms based on openmpi's default coll_tuned_decision_fixed selector */
   2
   3 /* Copyright (c) 2009, 2010. The SimGrid Team.
   4  * All rights reserved.                                                     */
   5
   6 /* This program is free software; you can redistribute it and/or modify it
   7  * under the terms of the license (GNU LGPL) which comes with this package. */
   8
   9 #include "colls_private.h"
  10
  11
  12 int smpi_coll_tuned_allreduce_ompi(void *sbuf, void *rbuf, int count,
  13                         MPI_Datatype dtype, MPI_Op op, MPI_Comm comm)
  14 {
  15     size_t dsize, block_dsize;
  16     int comm_size = smpi_comm_size(comm);
  17     const size_t intermediate_message = 10000;
  18
  19     /**
  20      * Decision function based on MX results from the Grig cluster at UTK.
  21      *
  22      * Currently, linear, recursive doubling, and nonoverlapping algorithms
  23      * can handle both commutative and non-commutative operations.
  24      * Ring algorithm does not support non-commutative operations.
  25      */
  26     dsize = smpi_datatype_size(dtype);
  27     block_dsize = dsize * count;
  28
  29     if (block_dsize < intermediate_message) {
  30         return (smpi_coll_tuned_allreduce_rdb (sbuf, rbuf,
  31                                                                    count, dtype,
  32                                                                    op, comm));
  33     }
  34
  35     if( smpi_op_is_commute(op) && (count > comm_size) ) {
  36         const size_t segment_size = 1 << 20; /* 1 MB */
  37         if ((comm_size * segment_size >= block_dsize)) {
  38             //FIXME: ok, these are not the right algorithms, try to find closer ones
  39             // lr is a good match for allreduce_ring (difference is mainly the use of sendrecv)
  40             return smpi_coll_tuned_allreduce_lr(sbuf, rbuf, count, dtype,
  41                                               op, comm);
  42         } else {
  43            return (smpi_coll_tuned_allreduce_ompi_ring_segmented (sbuf, rbuf,
  44                                                                     count, dtype,
  45                                                                     op, comm
  46                                                                     /*segment_size*/));
  47         }
  48     }
  49
  50     return (smpi_coll_tuned_allreduce_redbcast(sbuf, rbuf, count,
  51                                                             dtype, op, comm));
  52 }
  53
  54
  55
  56 int smpi_coll_tuned_alltoall_ompi( void *sbuf, int scount,
  57                                              MPI_Datatype sdtype,
  58                                              void* rbuf, int rcount,
  59                                              MPI_Datatype rdtype,
  60                                              MPI_Comm comm)
  61 {
  62     int communicator_size;
  63     size_t dsize, block_dsize;
  64     communicator_size = smpi_comm_size(comm);
  65
  66     /* Decision function based on measurement on Grig cluster at
  67        the University of Tennessee (2GB MX) up to 64 nodes.
  68        Has better performance for messages of intermediate sizes than the old one */
  69     /* determine block size */
  70     dsize = smpi_datatype_size(sdtype);
  71     block_dsize = dsize * scount;
  72
  73     if ((block_dsize < 200) && (communicator_size > 12)) {
  74         return smpi_coll_tuned_alltoall_bruck(sbuf, scount, sdtype,
  75                                                     rbuf, rcount, rdtype,
  76                                                     comm);
  77
  78     } else if (block_dsize < 3000) {
  79         return smpi_coll_tuned_alltoall_simple(sbuf, scount, sdtype,
  80                                                            rbuf, rcount, rdtype,
  81                                                            comm);
  82     }
  83
  84     return smpi_coll_tuned_alltoall_pair (sbuf, scount, sdtype,
  85                                                     rbuf, rcount, rdtype,
  86                                                     comm);
  87 }
  88
  89 int smpi_coll_tuned_alltoallv_ompi(void *sbuf, int *scounts, int *sdisps,
  90                                               MPI_Datatype sdtype,
  91                                               void *rbuf, int *rcounts, int *rdisps,
  92                                               MPI_Datatype rdtype,
  93                                               MPI_Comm  comm
  94                                               )
  95 {
  96     /* For starters, just keep the original algorithm. */
  97     return smpi_coll_tuned_alltoallv_bruck(sbuf, scounts, sdisps, sdtype,
  98                                                         rbuf, rcounts, rdisps,rdtype,
  99                                                         comm);
 100 }
 101
 102 /*
 103 void smpi_coll_tuned_barrier_ompi(MPI_Comm  comm)
 104 {    int communicator_size = smpi_comm_size(comm);
 105
 106     if( 2 == communicator_size )
 107         return smpi_coll_tuned_barrier_intra_two_procs(comm, module);
 108      * Basic optimisation. If we have a power of 2 number of nodes
 109      * the use the recursive doubling algorithm, otherwise
 110      * bruck is the one we want.
 111     {
 112         bool has_one = false;
 113         for( ; communicator_size > 0; communicator_size >>= 1 ) {
 114             if( communicator_size & 0x1 ) {
 115                 if( has_one )
 116                     return smpi_coll_tuned_barrier_intra_bruck(comm, module);
 117                 has_one = true;
 118             }
 119         }
 120     }
 121     return smpi_coll_tuned_barrier_intra_recursivedoubling(comm, module);
 122 }*/
 123
 124 int smpi_coll_tuned_bcast_ompi(void *buff, int count,
 125                                           MPI_Datatype datatype, int root,
 126                                           MPI_Comm  comm
 127                                           )
 128 {
 129     /* Decision function based on MX results for
 130        messages up to 36MB and communicator sizes up to 64 nodes */
 131     const size_t small_message_size = 2048;
 132     const size_t intermediate_message_size = 370728;
 133     const double a_p16  = 3.2118e-6; /* [1 / byte] */
 134     const double b_p16  = 8.7936;
 135     const double a_p64  = 2.3679e-6; /* [1 / byte] */
 136     const double b_p64  = 1.1787;
 137     const double a_p128 = 1.6134e-6; /* [1 / byte] */
 138     const double b_p128 = 2.1102;
 139
 140     int communicator_size;
 141     //int segsize = 0;
 142     size_t message_size, dsize;
 143
 144     communicator_size = smpi_comm_size(comm);
 145
 146     /* else we need data size for decision function */
 147     dsize = smpi_datatype_size(datatype);
 148     message_size = dsize * (unsigned long)count;   /* needed for decision */
 149
 150     /* Handle messages of small and intermediate size, and
 151        single-element broadcasts */
 152     if ((message_size < small_message_size) || (count <= 1)) {
 153         /* Binomial without segmentation */
 154         return  smpi_coll_tuned_bcast_binomial_tree (buff, count, datatype,
 155                                                       root, comm);
 156
 157     } else if (message_size < intermediate_message_size) {
 158         // SplittedBinary with 1KB segments
 159         return smpi_coll_tuned_bcast_ompi_split_bintree(buff, count, datatype,
 160                                                          root, comm);
 161
 162     }
 163      //Handle large message sizes
 164     else if (communicator_size < (a_p128 * message_size + b_p128)) {
 165         //Pipeline with 128KB segments
 166         //segsize = 1024  << 7;
 167         return smpi_coll_tuned_bcast_ompi_pipeline (buff, count, datatype,
 168                                                      root, comm);
 169
 170
 171     } else if (communicator_size < 13) {
 172         // Split Binary with 8KB segments
 173         return smpi_coll_tuned_bcast_ompi_split_bintree(buff, count, datatype,
 174                                                          root, comm);
 175
 176     } else if (communicator_size < (a_p64 * message_size + b_p64)) {
 177         // Pipeline with 64KB segments
 178         //segsize = 1024 << 6;
 179         return smpi_coll_tuned_bcast_ompi_pipeline (buff, count, datatype,
 180                                                      root, comm);
 181
 182
 183     } else if (communicator_size < (a_p16 * message_size + b_p16)) {
 184         //Pipeline with 16KB segments
 185         //segsize = 1024 << 4;
 186         return smpi_coll_tuned_bcast_ompi_pipeline (buff, count, datatype,
 187                                                      root, comm);
 188
 189
 190     }
 191     /* Pipeline with 8KB segments */
 192     //segsize = 1024 << 3;
 193     return smpi_coll_tuned_bcast_flattree_pipeline (buff, count, datatype,
 194                                                  root, comm
 195                                                  /*segsize*/);
 196 #if 0
 197     /* this is based on gige measurements */
 198
 199     if (communicator_size  < 4) {
 200         return smpi_coll_tuned_bcast_intra_basic_linear (buff, count, datatype, root, comm, module);
 201     }
 202     if (communicator_size == 4) {
 203         if (message_size < 524288) segsize = 0;
 204         else segsize = 16384;
 205         return smpi_coll_tuned_bcast_intra_bintree (buff, count, datatype, root, comm, module, segsize);
 206     }
 207     if (communicator_size <= 8 && message_size < 4096) {
 208         return smpi_coll_tuned_bcast_intra_basic_linear (buff, count, datatype, root, comm, module);
 209     }
 210     if (communicator_size > 8 && message_size >= 32768 && message_size < 524288) {
 211         segsize = 16384;
 212         return  smpi_coll_tuned_bcast_intra_bintree (buff, count, datatype, root, comm, module, segsize);
 213     }
 214     if (message_size >= 524288) {
 215         segsize = 16384;
 216         return smpi_coll_tuned_bcast_intra_pipeline (buff, count, datatype, root, comm, module, segsize);
 217     }
 218     segsize = 0;
 219     /* once tested can swap this back in */
 220     /* return smpi_coll_tuned_bcast_intra_bmtree (buff, count, datatype, root, comm, segsize); */
 221     return smpi_coll_tuned_bcast_intra_bintree (buff, count, datatype, root, comm, module, segsize);
 222 #endif  /* 0 */
 223 }
 224
 225 int smpi_coll_tuned_reduce_ompi( void *sendbuf, void *recvbuf,
 226                                             int count, MPI_Datatype  datatype,
 227                                             MPI_Op   op, int root,
 228                                             MPI_Comm   comm
 229                                             )
 230 {
 231     int communicator_size=0;
 232     //int segsize = 0;
 233     size_t message_size, dsize;
 234     const double a1 =  0.6016 / 1024.0; /* [1/B] */
 235     const double b1 =  1.3496;
 236     const double a2 =  0.0410 / 1024.0; /* [1/B] */
 237     const double b2 =  9.7128;
 238     const double a3 =  0.0422 / 1024.0; /* [1/B] */
 239     const double b3 =  1.1614;
 240     //const double a4 =  0.0033 / 1024.0; /* [1/B] */
 241     //const double b4 =  1.6761;
 242
 243     //const int max_requests = 0; /* no limit on # of outstanding requests */
 244
 245     communicator_size = smpi_comm_size(comm);
 246
 247     /* need data size for decision function */
 248     dsize=smpi_datatype_size(datatype);
 249     message_size = dsize * count;   /* needed for decision */
 250
 251     /**
 252      * If the operation is non commutative we currently have choice of linear
 253      * or in-order binary tree algorithm.
 254      */
 255     if( !smpi_op_is_commute(op) ) {
 256         if ((communicator_size < 12) && (message_size < 2048)) {
 257             return smpi_coll_tuned_reduce_ompi_basic_linear (sendbuf, recvbuf, count, datatype, op, root, comm/*, module*/);
 258         }
 259         return smpi_coll_tuned_reduce_ompi_in_order_binary (sendbuf, recvbuf, count, datatype, op, root, comm/*, module,
 260                                                              0, max_requests*/);
 261     }
 262
 263     if ((communicator_size < 8) && (message_size < 512)){
 264         /* Linear_0K */
 265         return smpi_coll_tuned_reduce_ompi_basic_linear (sendbuf, recvbuf, count, datatype, op, root, comm);
 266     } else if (((communicator_size < 8) && (message_size < 20480)) ||
 267                (message_size < 2048) || (count <= 1)) {
 268         /* Binomial_0K */
 269         //segsize = 0;
 270         return smpi_coll_tuned_reduce_ompi_binomial(sendbuf, recvbuf, count, datatype, op, root, comm/*, module,
 271                                                      segsize, max_requests*/);
 272     } else if (communicator_size > (a1 * message_size + b1)) {
 273         // Binomial_1K
 274         //segsize = 1024;
 275         return smpi_coll_tuned_reduce_ompi_binomial(sendbuf, recvbuf, count, datatype, op, root, comm/*, module,
 276                                                      segsize, max_requests*/);
 277     } else if (communicator_size > (a2 * message_size + b2)) {
 278         // Pipeline_1K
 279         //segsize = 1024;
 280         return smpi_coll_tuned_reduce_ompi_pipeline (sendbuf, recvbuf, count, datatype, op, root, comm/*, module,
 281                                                       segsize, max_requests*/);
 282     } else if (communicator_size > (a3 * message_size + b3)) {
 283         // Binary_32K
 284         //segsize = 32*1024;
 285         return smpi_coll_tuned_reduce_ompi_binary( sendbuf, recvbuf, count, datatype, op, root,
 286                                                     comm/*, module, segsize, max_requests*/);
 287     }
 288     /*if (communicator_size > (a4 * message_size + b4)) {
 289         // Pipeline_32K
 290         segsize = 32*1024;
 291     } else {
 292         // Pipeline_64K
 293         segsize = 64*1024;
 294     }*/
 295     return smpi_coll_tuned_reduce_ompi_pipeline (sendbuf, recvbuf, count, datatype, op, root, comm/*, module,
 296                                                   segsize, max_requests*/);
 297
 298 #if 0
 299     /* for small messages use linear algorithm */
 300     if (message_size <= 4096) {
 301         segsize = 0;
 302         fanout = communicator_size - 1;
 303         /* when linear implemented or taken from basic put here, right now using chain as a linear system */
 304         /* it is implemented and I shouldn't be calling a chain with a fanout bigger than MAXTREEFANOUT from topo.h! */
 305         return smpi_coll_tuned_reduce_intra_basic_linear (sendbuf, recvbuf, count, datatype, op, root, comm, module);
 306         /*        return smpi_coll_tuned_reduce_intra_chain (sendbuf, recvbuf, count, datatype, op, root, comm, segsize, fanout); */
 307     }
 308     if (message_size < 524288) {
 309         if (message_size <= 65536 ) {
 310             segsize = 32768;
 311             fanout = 8;
 312         } else {
 313             segsize = 1024;
 314             fanout = communicator_size/2;
 315         }
 316         /* later swap this for a binary tree */
 317         /*         fanout = 2; */
 318         return smpi_coll_tuned_reduce_intra_chain (sendbuf, recvbuf, count, datatype, op, root, comm, module,
 319                                                    segsize, fanout, max_requests);
 320     }
 321     segsize = 1024;
 322     return smpi_coll_tuned_reduce_intra_pipeline (sendbuf, recvbuf, count, datatype, op, root, comm, module,
 323                                                   segsize, max_requests);
 324 #endif  /* 0 */
 325 }
 326
 327 /*int smpi_coll_tuned_reduce_scatter_ompi( void *sbuf, void *rbuf,
 328                                                     int *rcounts,
 329                                                     MPI_Datatype dtype,
 330                                                     MPI_Op  op,
 331                                                     MPI_Comm  comm,
 332                                                     )
 333 {
 334     int comm_size, i, pow2;
 335     size_t total_message_size, dsize;
 336     const double a = 0.0012;
 337     const double b = 8.0;
 338     const size_t small_message_size = 12 * 1024;
 339     const size_t large_message_size = 256 * 1024;
 340     bool zerocounts = false;
 341
 342     OPAL_OUTPUT((smpi_coll_tuned_stream, "smpi_coll_tuned_reduce_scatter_ompi"));
 343
 344     comm_size = smpi_comm_size(comm);
 345     // We need data size for decision function
 346     ompi_datatype_type_size(dtype, &dsize);
 347     total_message_size = 0;
 348     for (i = 0; i < comm_size; i++) {
 349         total_message_size += rcounts[i];
 350         if (0 == rcounts[i]) {
 351             zerocounts = true;
 352         }
 353     }
 354
 355     if( !ompi_op_is_commute(op) || (zerocounts)) {
 356         return smpi_coll_tuned_reduce_scatter_intra_nonoverlapping (sbuf, rbuf, rcounts,
 357                                                                     dtype, op,
 358                                                                     comm, module);
 359     }
 360
 361     total_message_size *= dsize;
 362
 363     // compute the nearest power of 2
 364     for (pow2 = 1; pow2 < comm_size; pow2 <<= 1);
 365
 366     if ((total_message_size <= small_message_size) ||
 367         ((total_message_size <= large_message_size) && (pow2 == comm_size)) ||
 368         (comm_size >= a * total_message_size + b)) {
 369         return
 370             smpi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(sbuf, rbuf, rcounts,
 371                                                                         dtype, op,
 372                                                                         comm, module);
 373     }
 374     return smpi_coll_tuned_reduce_scatter_intra_ring(sbuf, rbuf, rcounts,
 375                                                      dtype, op,
 376                                                      comm, module);
 377
 378
 379     return smpi_coll_tuned_reduce_scatter(sbuf, rbuf, rcounts,
 380                                                      dtype, op,
 381                                                      comm;
 382
 383 }*/
 384
 385 int smpi_coll_tuned_allgather_ompi(void *sbuf, int scount,
 386                                               MPI_Datatype sdtype,
 387                                               void* rbuf, int rcount,
 388                                               MPI_Datatype rdtype,
 389                                               MPI_Comm  comm
 390                                               )
 391 {
 392     int communicator_size, pow2_size;
 393     size_t dsize, total_dsize;
 394
 395     communicator_size = smpi_comm_size(comm);
 396
 397     /* Special case for 2 processes */
 398     if (communicator_size == 2) {
 399         return smpi_coll_tuned_allgather_pair (sbuf, scount, sdtype,
 400                                                           rbuf, rcount, rdtype,
 401                                                           comm/*, module*/);
 402     }
 403
 404     /* Determine complete data size */
 405     dsize=smpi_datatype_size(sdtype);
 406     total_dsize = dsize * scount * communicator_size;
 407
 408     for (pow2_size  = 1; pow2_size < communicator_size; pow2_size <<=1);
 409
 410     /* Decision based on MX 2Gb results from Grig cluster at
 411        The University of Tennesse, Knoxville
 412        - if total message size is less than 50KB use either bruck or
 413        recursive doubling for non-power of two and power of two nodes,
 414        respectively.
 415        - else use ring and neighbor exchange algorithms for odd and even
 416        number of nodes, respectively.
 417     */
 418     if (total_dsize < 50000) {
 419         if (pow2_size == communicator_size) {
 420             return smpi_coll_tuned_allgather_rdb(sbuf, scount, sdtype,
 421                                                                      rbuf, rcount, rdtype,
 422                                                                      comm);
 423         } else {
 424             return smpi_coll_tuned_allgather_bruck(sbuf, scount, sdtype,
 425                                                          rbuf, rcount, rdtype,
 426                                                          comm);
 427         }
 428     } else {
 429         if (communicator_size % 2) {
 430             return smpi_coll_tuned_allgather_ring(sbuf, scount, sdtype,
 431                                                         rbuf, rcount, rdtype,
 432                                                         comm);
 433         } else {
 434             return  smpi_coll_tuned_allgather_ompi_neighborexchange(sbuf, scount, sdtype,
 435                                                                      rbuf, rcount, rdtype,
 436                                                                      comm);
 437         }
 438     }
 439
 440 #if defined(USE_MPICH2_DECISION)
 441     /* Decision as in MPICH-2
 442        presented in Thakur et.al. "Optimization of Collective Communication
 443        Operations in MPICH", International Journal of High Performance Computing
 444        Applications, Vol. 19, No. 1, 49-66 (2005)
 445        - for power-of-two processes and small and medium size messages
 446        (up to 512KB) use recursive doubling
 447        - for non-power-of-two processes and small messages (80KB) use bruck,
 448        - for everything else use ring.
 449     */
 450     if ((pow2_size == communicator_size) && (total_dsize < 524288)) {
 451         return smpi_coll_tuned_allgather_rdb(sbuf, scount, sdtype,
 452                                                                  rbuf, rcount, rdtype,
 453                                                                  comm);
 454     } else if (total_dsize <= 81920) {
 455         return smpi_coll_tuned_allgather_bruck(sbuf, scount, sdtype,
 456                                                      rbuf, rcount, rdtype,
 457                                                      comm);
 458     }
 459     return smpi_coll_tuned_allgather_ring(sbuf, scount, sdtype,
 460                                                 rbuf, rcount, rdtype,
 461                                                 comm);
 462 #endif  /* defined(USE_MPICH2_DECISION) */
 463 }
 464
 465 int smpi_coll_tuned_allgatherv_ompi(void *sbuf, int scount,
 466                                                MPI_Datatype sdtype,
 467                                                void* rbuf, int *rcounts,
 468                                                int *rdispls,
 469                                                MPI_Datatype rdtype,
 470                                                MPI_Comm  comm
 471                                                )
 472 {
 473     int i;
 474     int communicator_size;
 475     size_t dsize, total_dsize;
 476
 477     communicator_size = smpi_comm_size(comm);
 478
 479     /* Special case for 2 processes */
 480     if (communicator_size == 2) {
 481         return smpi_coll_tuned_allgatherv_pair(sbuf, scount, sdtype,
 482                                                            rbuf, rcounts, rdispls, rdtype,
 483                                                            comm);
 484     }
 485
 486     /* Determine complete data size */
 487     dsize=smpi_datatype_size(sdtype);
 488     total_dsize = 0;
 489     for (i = 0; i < communicator_size; i++) {
 490         total_dsize += dsize * rcounts[i];
 491     }
 492
 493     /* Decision based on allgather decision.   */
 494     if (total_dsize < 50000) {
 495 /*        return smpi_coll_tuned_allgatherv_intra_bruck(sbuf, scount, sdtype,
 496                                                       rbuf, rcounts, rdispls, rdtype,
 497                                                       comm, module);*/
 498     return smpi_coll_tuned_allgatherv_ring(sbuf, scount, sdtype,
 499                                                       rbuf, rcounts, rdispls, rdtype,
 500                                                       comm);
 501
 502     } else {
 503         if (communicator_size % 2) {
 504             return smpi_coll_tuned_allgatherv_ring(sbuf, scount, sdtype,
 505                                                          rbuf, rcounts, rdispls, rdtype,
 506                                                          comm);
 507         } else {
 508             return  smpi_coll_tuned_allgatherv_ompi_neighborexchange(sbuf, scount, sdtype,
 509                                                                       rbuf, rcounts, rdispls, rdtype,
 510                                                                       comm);
 511         }
 512     }
 513 }
 514 /*
 515 int smpi_coll_tuned_gather_ompi(void *sbuf, int scount,
 516                                            MPI_Datatype sdtype,
 517                                            void* rbuf, int rcount,
 518                                            MPI_Datatype rdtype,
 519                                            int root,
 520                                            MPI_Comm  comm,
 521                                            )
 522 {
 523     const int large_segment_size = 32768;
 524     const int small_segment_size = 1024;
 525
 526     const size_t large_block_size = 92160;
 527     const size_t intermediate_block_size = 6000;
 528     const size_t small_block_size = 1024;
 529
 530     const int large_communicator_size = 60;
 531     const int small_communicator_size = 10;
 532
 533     int communicator_size, rank;
 534     size_t dsize, block_size;
 535
 536     OPAL_OUTPUT((smpi_coll_tuned_stream,
 537                  "smpi_coll_tuned_gather_ompi"));
 538
 539     communicator_size = smpi_comm_size(comm);
 540     rank = ompi_comm_rank(comm);
 541
 542     // Determine block size
 543     if (rank == root) {
 544         ompi_datatype_type_size(rdtype, &dsize);
 545         block_size = dsize * rcount;
 546     } else {
 547         ompi_datatype_type_size(sdtype, &dsize);
 548         block_size = dsize * scount;
 549     }
 550
 551     if (block_size > large_block_size) {
 552         return smpi_coll_tuned_gather_intra_linear_sync (sbuf, scount, sdtype,
 553                                                          rbuf, rcount, rdtype,
 554                                                          root, comm, module,
 555                                                          large_segment_size);
 556
 557     } else if (block_size > intermediate_block_size) {
 558         return smpi_coll_tuned_gather_intra_linear_sync (sbuf, scount, sdtype,
 559                                                          rbuf, rcount, rdtype,
 560                                                          root, comm, module,
 561                                                          small_segment_size);
 562
 563     } else if ((communicator_size > large_communicator_size) ||
 564                ((communicator_size > small_communicator_size) &&
 565                 (block_size < small_block_size))) {
 566         return smpi_coll_tuned_gather_intra_binomial (sbuf, scount, sdtype,
 567                                                       rbuf, rcount, rdtype,
 568                                                       root, comm, module);
 569
 570     }
 571     // Otherwise, use basic linear
 572     return smpi_coll_tuned_gather_intra_basic_linear (sbuf, scount, sdtype,
 573                                                       rbuf, rcount, rdtype,
 574                                                       root, comm, module);
 575 }*/
 576 /*
 577 int smpi_coll_tuned_scatter_ompi(void *sbuf, int scount,
 578                                             MPI_Datatype sdtype,
 579                                             void* rbuf, int rcount,
 580                                             MPI_Datatype rdtype,
 581                                             int root, MPI_Comm  comm,
 582                                             )
 583 {
 584     const size_t small_block_size = 300;
 585     const int small_comm_size = 10;
 586     int communicator_size, rank;
 587     size_t dsize, block_size;
 588
 589     OPAL_OUTPUT((smpi_coll_tuned_stream,
 590                  "smpi_coll_tuned_scatter_ompi"));
 591
 592     communicator_size = smpi_comm_size(comm);
 593     rank = ompi_comm_rank(comm);
 594     // Determine block size
 595     if (root == rank) {
 596         ompi_datatype_type_size(sdtype, &dsize);
 597         block_size = dsize * scount;
 598     } else {
 599         ompi_datatype_type_size(rdtype, &dsize);
 600         block_size = dsize * rcount;
 601     }
 602
 603     if ((communicator_size > small_comm_size) &&
 604         (block_size < small_block_size)) {
 605         return smpi_coll_tuned_scatter_intra_binomial (sbuf, scount, sdtype,
 606                                                        rbuf, rcount, rdtype,
 607                                                        root, comm, module);
 608     }
 609     return smpi_coll_tuned_scatter_intra_basic_linear (sbuf, scount, sdtype,
 610                                                        rbuf, rcount, rdtype,
 611                                                        root, comm, module);
 612 }*/
 613