/* selector for collective algorithms based on openmpi's default coll_tuned_decision_fixed selector */

/* Copyright (c) 2009, 2010. The SimGrid Team.
 * All rights reserved. */

/* This program is free software; you can redistribute it and/or modify it
 * under the terms of the license (GNU LGPL) which comes with this package. */

#include "colls_private.h"
12 int smpi_coll_tuned_allreduce_ompi(void *sbuf, void *rbuf, int count,
13 MPI_Datatype dtype, MPI_Op op, MPI_Comm comm)
15 size_t dsize, block_dsize;
16 int comm_size = smpi_comm_size(comm);
17 const size_t intermediate_message = 10000;
20 * Decision function based on MX results from the Grig cluster at UTK.
22 * Currently, linear, recursive doubling, and nonoverlapping algorithms
23 * can handle both commutative and non-commutative operations.
24 * Ring algorithm does not support non-commutative operations.
26 dsize = smpi_datatype_size(dtype);
27 block_dsize = dsize * count;
29 if (block_dsize < intermediate_message) {
30 return (smpi_coll_tuned_allreduce_rdb (sbuf, rbuf,
35 if( smpi_op_is_commute(op) && (count > comm_size) ) {
36 const size_t segment_size = 1 << 20; /* 1 MB */
37 if ((comm_size * segment_size >= block_dsize)) {
38 //FIXME: ok, these are not the right algorithms, try to find closer ones
39 // lr is a good match for allreduce_ring (difference is mainly the use of sendrecv)
40 return smpi_coll_tuned_allreduce_lr(sbuf, rbuf, count, dtype,
43 // return (smpi_coll_tuned_allreduce_intra_ring_segmented (sbuf, rbuf,
44 return (smpi_coll_tuned_allreduce_ompi_ring_segmented (sbuf, rbuf,
51 return (smpi_coll_tuned_allreduce_redbcast(sbuf, rbuf, count,
57 int smpi_coll_tuned_alltoall_ompi( void *sbuf, int scount,
59 void* rbuf, int rcount,
63 int communicator_size;
64 size_t dsize, block_dsize;
65 communicator_size = smpi_comm_size(comm);
67 /* Decision function based on measurement on Grig cluster at
68 the University of Tennessee (2GB MX) up to 64 nodes.
69 Has better performance for messages of intermediate sizes than the old one */
70 /* determine block size */
71 dsize = smpi_datatype_size(sdtype);
72 block_dsize = dsize * scount;
74 if ((block_dsize < 200) && (communicator_size > 12)) {
75 return smpi_coll_tuned_alltoall_bruck(sbuf, scount, sdtype,
79 } else if (block_dsize < 3000) {
80 return smpi_coll_tuned_alltoall_simple(sbuf, scount, sdtype,
85 return smpi_coll_tuned_alltoall_pair (sbuf, scount, sdtype,
90 int smpi_coll_tuned_alltoallv_ompi(void *sbuf, int *scounts, int *sdisps,
92 void *rbuf, int *rcounts, int *rdisps,
97 /* For starters, just keep the original algorithm. */
98 return smpi_coll_tuned_alltoallv_bruck(sbuf, scounts, sdisps, sdtype,
99 rbuf, rcounts, rdisps,rdtype,
104 void smpi_coll_tuned_barrier_ompi(MPI_Comm comm)
105 { int communicator_size = smpi_comm_size(comm);
107 if( 2 == communicator_size )
108 return smpi_coll_tuned_barrier_intra_two_procs(comm, module);
109 * Basic optimisation. If we have a power of 2 number of nodes
110 * the use the recursive doubling algorithm, otherwise
111 * bruck is the one we want.
113 bool has_one = false;
114 for( ; communicator_size > 0; communicator_size >>= 1 ) {
115 if( communicator_size & 0x1 ) {
117 return smpi_coll_tuned_barrier_intra_bruck(comm, module);
122 return smpi_coll_tuned_barrier_intra_recursivedoubling(comm, module);
125 int smpi_coll_tuned_bcast_ompi(void *buff, int count,
126 MPI_Datatype datatype, int root,
130 /* Decision function based on MX results for
131 messages up to 36MB and communicator sizes up to 64 nodes */
132 const size_t small_message_size = 2048;
133 const size_t intermediate_message_size = 370728;
134 //const double a_p16 = 3.2118e-6; /* [1 / byte] */
135 //const double b_p16 = 8.7936;
136 //const double a_p64 = 2.3679e-6; /* [1 / byte] */
137 //const double b_p64 = 1.1787;
138 //const double a_p128 = 1.6134e-6; /* [1 / byte] */
139 //const double b_p128 = 2.1102;
141 int communicator_size;
143 size_t message_size, dsize;
145 communicator_size = smpi_comm_size(comm);
147 /* else we need data size for decision function */
148 dsize = smpi_datatype_size(datatype);
149 message_size = dsize * (unsigned long)count; /* needed for decision */
151 /* Handle messages of small and intermediate size, and
152 single-element broadcasts */
153 if ((message_size < small_message_size) || (count <= 1)) {
154 /* Binomial without segmentation */
155 return smpi_coll_tuned_bcast_binomial_tree (buff, count, datatype,
158 } else if (message_size < intermediate_message_size) {
159 // SplittedBinary with 1KB segments
160 return smpi_coll_tuned_bcast_ompi_split_bintree(buff, count, datatype,
164 Handle large message sizes
165 else if (communicator_size < (a_p128 * message_size + b_p128)) {
166 Pipeline with 128KB segments
168 return smpi_coll_tuned_bcast_flattree_pipeline (buff, count, datatype,
172 }*/ else if (communicator_size < 13) {
173 // Split Binary with 8KB segments
174 return smpi_coll_tuned_bcast_ompi_split_bintree(buff, count, datatype,
177 } /*else if (communicator_size < (a_p64 * message_size + b_p64)) {
178 // Pipeline with 64KB segments
180 return smpi_coll_tuned_bcast_intra_pipeline (buff, count, datatype,
184 } else if (communicator_size < (a_p16 * message_size + b_p16)) {
185 Pipeline with 16KB segments
186 //segsize = 1024 << 4;
187 return smpi_coll_tuned_bcast_flattree_pipeline (buff, count, datatype,
193 /* Pipeline with 8KB segments */
194 //segsize = 1024 << 3;
195 return smpi_coll_tuned_bcast_flattree_pipeline (buff, count, datatype,
199 /* this is based on gige measurements */
201 if (communicator_size < 4) {
202 return smpi_coll_tuned_bcast_intra_basic_linear (buff, count, datatype, root, comm, module);
204 if (communicator_size == 4) {
205 if (message_size < 524288) segsize = 0;
206 else segsize = 16384;
207 return smpi_coll_tuned_bcast_intra_bintree (buff, count, datatype, root, comm, module, segsize);
209 if (communicator_size <= 8 && message_size < 4096) {
210 return smpi_coll_tuned_bcast_intra_basic_linear (buff, count, datatype, root, comm, module);
212 if (communicator_size > 8 && message_size >= 32768 && message_size < 524288) {
214 return smpi_coll_tuned_bcast_intra_bintree (buff, count, datatype, root, comm, module, segsize);
216 if (message_size >= 524288) {
218 return smpi_coll_tuned_bcast_intra_pipeline (buff, count, datatype, root, comm, module, segsize);
221 /* once tested can swap this back in */
222 /* return smpi_coll_tuned_bcast_intra_bmtree (buff, count, datatype, root, comm, segsize); */
223 return smpi_coll_tuned_bcast_intra_bintree (buff, count, datatype, root, comm, module, segsize);
227 int smpi_coll_tuned_reduce_ompi( void *sendbuf, void *recvbuf,
228 int count, MPI_Datatype datatype,
233 int communicator_size=0;
235 size_t message_size, dsize;
236 //const double a1 = 0.6016 / 1024.0; /* [1/B] */
237 //const double b1 = 1.3496;
238 //const double a2 = 0.0410 / 1024.0; /* [1/B] */
239 //const double b2 = 9.7128;
240 //const double a3 = 0.0422 / 1024.0; /* [1/B] */
241 //const double b3 = 1.1614;
242 //const double a4 = 0.0033 / 1024.0; /* [1/B] */
243 //const double b4 = 1.6761;
245 //const int max_requests = 0; /* no limit on # of outstanding requests */
247 communicator_size = smpi_comm_size(comm);
249 /* need data size for decision function */
250 dsize=smpi_datatype_size(datatype);
251 message_size = dsize * count; /* needed for decision */
254 * If the operation is non commutative we currently have choice of linear
255 * or in-order binary tree algorithm.
257 /* if( !ompi_op_is_commute(op) ) {
258 if ((communicator_size < 12) && (message_size < 2048)) {
259 return smpi_coll_tuned_reduce_intra_basic_linear (sendbuf, recvbuf, count, datatype, op, root, comm, module);
261 return smpi_coll_tuned_reduce_intra_in_order_binary (sendbuf, recvbuf, count, datatype, op, root, comm, module,
265 if ((communicator_size < 8) && (message_size < 512)){
267 return smpi_coll_tuned_reduce_flat_tree (sendbuf, recvbuf, count, datatype, op, root, comm);
268 } else if (((communicator_size < 8) && (message_size < 20480)) ||
269 (message_size < 2048) || (count <= 1)) {
272 return smpi_coll_tuned_reduce_binomial(sendbuf, recvbuf, count, datatype, op, root, comm/*, module,
273 segsize, max_requests*/);
274 } /*else if (communicator_size > (a1 * message_size + b1)) {
277 return smpi_coll_tuned_reduce_intra_binomial(sendbuf, recvbuf, count, datatype, op, root, comm, module,
278 segsize, max_requests);
279 } else if (communicator_size > (a2 * message_size + b2)) {
282 return smpi_coll_tuned_reduce_NTSL (sendbuf, recvbuf, count, datatype, op, root, comm, module,
283 segsize, max_requests);
284 } else if (communicator_size > (a3 * message_size + b3)) {
287 return smpi_coll_tuned_reduce_intra_binary( sendbuf, recvbuf, count, datatype, op, root,
288 comm, module, segsize, max_requests);
290 if (communicator_size > (a4 * message_size + b4)) {
297 return smpi_coll_tuned_reduce_NTSL (sendbuf, recvbuf, count, datatype, op, root, comm/*, module,
298 segsize, max_requests*/);
301 /* for small messages use linear algorithm */
302 if (message_size <= 4096) {
304 fanout = communicator_size - 1;
305 /* when linear implemented or taken from basic put here, right now using chain as a linear system */
306 /* it is implemented and I shouldn't be calling a chain with a fanout bigger than MAXTREEFANOUT from topo.h! */
307 return smpi_coll_tuned_reduce_intra_basic_linear (sendbuf, recvbuf, count, datatype, op, root, comm, module);
308 /* return smpi_coll_tuned_reduce_intra_chain (sendbuf, recvbuf, count, datatype, op, root, comm, segsize, fanout); */
310 if (message_size < 524288) {
311 if (message_size <= 65536 ) {
316 fanout = communicator_size/2;
318 /* later swap this for a binary tree */
320 return smpi_coll_tuned_reduce_intra_chain (sendbuf, recvbuf, count, datatype, op, root, comm, module,
321 segsize, fanout, max_requests);
324 return smpi_coll_tuned_reduce_intra_pipeline (sendbuf, recvbuf, count, datatype, op, root, comm, module,
325 segsize, max_requests);
329 /*int smpi_coll_tuned_reduce_scatter_ompi( void *sbuf, void *rbuf,
336 int comm_size, i, pow2;
337 size_t total_message_size, dsize;
338 const double a = 0.0012;
339 const double b = 8.0;
340 const size_t small_message_size = 12 * 1024;
341 const size_t large_message_size = 256 * 1024;
342 bool zerocounts = false;
344 OPAL_OUTPUT((smpi_coll_tuned_stream, "smpi_coll_tuned_reduce_scatter_ompi"));
346 comm_size = smpi_comm_size(comm);
347 // We need data size for decision function
348 ompi_datatype_type_size(dtype, &dsize);
349 total_message_size = 0;
350 for (i = 0; i < comm_size; i++) {
351 total_message_size += rcounts[i];
352 if (0 == rcounts[i]) {
357 if( !ompi_op_is_commute(op) || (zerocounts)) {
358 return smpi_coll_tuned_reduce_scatter_intra_nonoverlapping (sbuf, rbuf, rcounts,
363 total_message_size *= dsize;
365 // compute the nearest power of 2
366 for (pow2 = 1; pow2 < comm_size; pow2 <<= 1);
368 if ((total_message_size <= small_message_size) ||
369 ((total_message_size <= large_message_size) && (pow2 == comm_size)) ||
370 (comm_size >= a * total_message_size + b)) {
372 smpi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(sbuf, rbuf, rcounts,
376 return smpi_coll_tuned_reduce_scatter_intra_ring(sbuf, rbuf, rcounts,
381 return smpi_coll_tuned_reduce_scatter(sbuf, rbuf, rcounts,
387 int smpi_coll_tuned_allgather_ompi(void *sbuf, int scount,
389 void* rbuf, int rcount,
394 int communicator_size, pow2_size;
395 size_t dsize, total_dsize;
397 communicator_size = smpi_comm_size(comm);
399 /* Special case for 2 processes */
400 if (communicator_size == 2) {
401 return smpi_coll_tuned_allgather_pair (sbuf, scount, sdtype,
402 rbuf, rcount, rdtype,
406 /* Determine complete data size */
407 dsize=smpi_datatype_size(sdtype);
408 total_dsize = dsize * scount * communicator_size;
410 for (pow2_size = 1; pow2_size < communicator_size; pow2_size <<=1);
412 /* Decision based on MX 2Gb results from Grig cluster at
413 The University of Tennesse, Knoxville
414 - if total message size is less than 50KB use either bruck or
415 recursive doubling for non-power of two and power of two nodes,
417 - else use ring and neighbor exchange algorithms for odd and even
418 number of nodes, respectively.
420 if (total_dsize < 50000) {
421 if (pow2_size == communicator_size) {
422 return smpi_coll_tuned_allgather_rdb(sbuf, scount, sdtype,
423 rbuf, rcount, rdtype,
426 return smpi_coll_tuned_allgather_bruck(sbuf, scount, sdtype,
427 rbuf, rcount, rdtype,
431 //if (communicator_size % 2) {
432 return smpi_coll_tuned_allgather_ring(sbuf, scount, sdtype,
433 rbuf, rcount, rdtype,
436 return smpi_coll_tuned_allgather_intra_neighborexchange(sbuf, scount, sdtype,
437 rbuf, rcount, rdtype,
442 #if defined(USE_MPICH2_DECISION)
443 /* Decision as in MPICH-2
444 presented in Thakur et.al. "Optimization of Collective Communication
445 Operations in MPICH", International Journal of High Performance Computing
446 Applications, Vol. 19, No. 1, 49-66 (2005)
447 - for power-of-two processes and small and medium size messages
448 (up to 512KB) use recursive doubling
449 - for non-power-of-two processes and small messages (80KB) use bruck,
450 - for everything else use ring.
452 if ((pow2_size == communicator_size) && (total_dsize < 524288)) {
453 return smpi_coll_tuned_allgather_intra_recursivedoubling(sbuf, scount, sdtype,
454 rbuf, rcount, rdtype,
456 } else if (total_dsize <= 81920) {
457 return smpi_coll_tuned_allgather_intra_bruck(sbuf, scount, sdtype,
458 rbuf, rcount, rdtype,
461 return smpi_coll_tuned_allgather_intra_ring(sbuf, scount, sdtype,
462 rbuf, rcount, rdtype,
464 #endif /* defined(USE_MPICH2_DECISION) */
467 int smpi_coll_tuned_allgatherv_ompi(void *sbuf, int scount,
469 void* rbuf, int *rcounts,
476 int communicator_size;
477 size_t dsize, total_dsize;
479 communicator_size = smpi_comm_size(comm);
481 /* Special case for 2 processes */
482 if (communicator_size == 2) {
483 return smpi_coll_tuned_allgatherv_pair(sbuf, scount, sdtype,
484 rbuf, rcounts, rdispls, rdtype,
488 /* Determine complete data size */
489 dsize=smpi_datatype_size(sdtype);
491 for (i = 0; i < communicator_size; i++) {
492 total_dsize += dsize * rcounts[i];
495 /* Decision based on allgather decision. */
496 if (total_dsize < 50000) {
497 /* return smpi_coll_tuned_allgatherv_intra_bruck(sbuf, scount, sdtype,
498 rbuf, rcounts, rdispls, rdtype,
500 return smpi_coll_tuned_allgatherv_ring(sbuf, scount, sdtype,
501 rbuf, rcounts, rdispls, rdtype,
505 // if (communicator_size % 2) {
506 return smpi_coll_tuned_allgatherv_ring(sbuf, scount, sdtype,
507 rbuf, rcounts, rdispls, rdtype,
510 return smpi_coll_tuned_allgatherv_intra_neighborexchange(sbuf, scount, sdtype,
511 rbuf, rcounts, rdispls, rdtype,
517 int smpi_coll_tuned_gather_ompi(void *sbuf, int scount,
519 void* rbuf, int rcount,
525 const int large_segment_size = 32768;
526 const int small_segment_size = 1024;
528 const size_t large_block_size = 92160;
529 const size_t intermediate_block_size = 6000;
530 const size_t small_block_size = 1024;
532 const int large_communicator_size = 60;
533 const int small_communicator_size = 10;
535 int communicator_size, rank;
536 size_t dsize, block_size;
538 OPAL_OUTPUT((smpi_coll_tuned_stream,
539 "smpi_coll_tuned_gather_ompi"));
541 communicator_size = smpi_comm_size(comm);
542 rank = ompi_comm_rank(comm);
544 // Determine block size
546 ompi_datatype_type_size(rdtype, &dsize);
547 block_size = dsize * rcount;
549 ompi_datatype_type_size(sdtype, &dsize);
550 block_size = dsize * scount;
553 if (block_size > large_block_size) {
554 return smpi_coll_tuned_gather_intra_linear_sync (sbuf, scount, sdtype,
555 rbuf, rcount, rdtype,
559 } else if (block_size > intermediate_block_size) {
560 return smpi_coll_tuned_gather_intra_linear_sync (sbuf, scount, sdtype,
561 rbuf, rcount, rdtype,
565 } else if ((communicator_size > large_communicator_size) ||
566 ((communicator_size > small_communicator_size) &&
567 (block_size < small_block_size))) {
568 return smpi_coll_tuned_gather_intra_binomial (sbuf, scount, sdtype,
569 rbuf, rcount, rdtype,
573 // Otherwise, use basic linear
574 return smpi_coll_tuned_gather_intra_basic_linear (sbuf, scount, sdtype,
575 rbuf, rcount, rdtype,
579 int smpi_coll_tuned_scatter_ompi(void *sbuf, int scount,
581 void* rbuf, int rcount,
583 int root, MPI_Comm comm,
586 const size_t small_block_size = 300;
587 const int small_comm_size = 10;
588 int communicator_size, rank;
589 size_t dsize, block_size;
591 OPAL_OUTPUT((smpi_coll_tuned_stream,
592 "smpi_coll_tuned_scatter_ompi"));
594 communicator_size = smpi_comm_size(comm);
595 rank = ompi_comm_rank(comm);
596 // Determine block size
598 ompi_datatype_type_size(sdtype, &dsize);
599 block_size = dsize * scount;
601 ompi_datatype_type_size(rdtype, &dsize);
602 block_size = dsize * rcount;
605 if ((communicator_size > small_comm_size) &&
606 (block_size < small_block_size)) {
607 return smpi_coll_tuned_scatter_intra_binomial (sbuf, scount, sdtype,
608 rbuf, rcount, rdtype,
611 return smpi_coll_tuned_scatter_intra_basic_linear (sbuf, scount, sdtype,
612 rbuf, rcount, rdtype,