src/smpi/colls/smpi_openmpi_selector.c

   1 /* selector for collective algorithms based on openmpi's default coll_tuned_decision_fixed selector */
   2
   3 /* Copyright (c) 2009, 2010. The SimGrid Team.
   4  * All rights reserved.                                                     */
   5
   6 /* This program is free software; you can redistribute it and/or modify it
   7  * under the terms of the license (GNU LGPL) which comes with this package. */
   8
   9 #include "colls_private.h"
  10
  11
  12 int smpi_coll_tuned_allreduce_ompi(void *sbuf, void *rbuf, int count,
  13                         MPI_Datatype dtype, MPI_Op op, MPI_Comm comm)
  14 {
  15     size_t dsize, block_dsize;
  16     int comm_size = smpi_comm_size(comm);
  17     const size_t intermediate_message = 10000;
  18
  19     /**
  20      * Decision function based on MX results from the Grig cluster at UTK.
  21      *
  22      * Currently, linear, recursive doubling, and nonoverlapping algorithms
  23      * can handle both commutative and non-commutative operations.
  24      * Ring algorithm does not support non-commutative operations.
  25      */
  26     dsize = smpi_datatype_size(dtype);
  27     block_dsize = dsize * count;
  28
  29     if (block_dsize < intermediate_message) {
  30         return (smpi_coll_tuned_allreduce_rdb (sbuf, rbuf,
  31                                                                    count, dtype,
  32                                                                    op, comm));
  33     }
  34
  35     if( /*smpi_op_is_commute(op) && */(count > comm_size) ) {
  36         const size_t segment_size = 1 << 20; /* 1 MB */
  37         if ((comm_size * segment_size >= block_dsize)) {
  38             //FIXME: ok, these are not the right algorithms, try to find closer ones
  39             // lr is a good match for allreduce_ring (difference is mainly the use of sendrecv)
  40             return smpi_coll_tuned_allreduce_lr(sbuf, rbuf, count, dtype,
  41                                               op, comm);
  42         } else {
  43            // return (smpi_coll_tuned_allreduce_intra_ring_segmented (sbuf, rbuf,
  44            return (smpi_coll_tuned_allreduce_rab2 (sbuf, rbuf,
  45                                                                     count, dtype,
  46                                                                     op, comm
  47                                                                     /*segment_size*/));
  48         }
  49     }
  50
  51     return (smpi_coll_tuned_allreduce_redbcast(sbuf, rbuf, count,
  52                                                             dtype, op, comm));
  53 }
  54
  55
  56
  57 int smpi_coll_tuned_alltoall_ompi( void *sbuf, int scount,
  58                                              MPI_Datatype sdtype,
  59                                              void* rbuf, int rcount,
  60                                              MPI_Datatype rdtype,
  61                                              MPI_Comm comm)
  62 {
  63     int communicator_size;
  64     size_t dsize, block_dsize;
  65     communicator_size = smpi_comm_size(comm);
  66
  67     /* Decision function based on measurement on Grig cluster at
  68        the University of Tennessee (2GB MX) up to 64 nodes.
  69        Has better performance for messages of intermediate sizes than the old one */
  70     /* determine block size */
  71     dsize = smpi_datatype_size(sdtype);
  72     block_dsize = dsize * scount;
  73
  74     if ((block_dsize < 200) && (communicator_size > 12)) {
  75         return smpi_coll_tuned_alltoall_bruck(sbuf, scount, sdtype,
  76                                                     rbuf, rcount, rdtype,
  77                                                     comm);
  78
  79     } else if (block_dsize < 3000) {
  80         return smpi_coll_tuned_alltoall_simple(sbuf, scount, sdtype,
  81                                                            rbuf, rcount, rdtype,
  82                                                            comm);
  83     }
  84
  85     return smpi_coll_tuned_alltoall_pair (sbuf, scount, sdtype,
  86                                                     rbuf, rcount, rdtype,
  87                                                     comm);
  88 }
  89
  90 int smpi_coll_tuned_alltoallv_ompi(void *sbuf, int *scounts, int *sdisps,
  91                                               MPI_Datatype sdtype,
  92                                               void *rbuf, int *rcounts, int *rdisps,
  93                                               MPI_Datatype rdtype,
  94                                               MPI_Comm  comm
  95                                               )
  96 {
  97     /* For starters, just keep the original algorithm. */
  98     return smpi_coll_tuned_alltoallv_bruck(sbuf, scounts, sdisps, sdtype,
  99                                                         rbuf, rcounts, rdisps,rdtype,
 100                                                         comm);
 101 }
 102
 103 /*
 104 void smpi_coll_tuned_barrier_ompi(MPI_Comm  comm)
 105 {    int communicator_size = smpi_comm_size(comm);
 106
 107     if( 2 == communicator_size )
 108         return smpi_coll_tuned_barrier_intra_two_procs(comm, module);
 109      * Basic optimisation. If we have a power of 2 number of nodes
 110      * the use the recursive doubling algorithm, otherwise
 111      * bruck is the one we want.
 112     {
 113         bool has_one = false;
 114         for( ; communicator_size > 0; communicator_size >>= 1 ) {
 115             if( communicator_size & 0x1 ) {
 116                 if( has_one )
 117                     return smpi_coll_tuned_barrier_intra_bruck(comm, module);
 118                 has_one = true;
 119             }
 120         }
 121     }
 122     return smpi_coll_tuned_barrier_intra_recursivedoubling(comm, module);
 123 }*/
 124
 125 int smpi_coll_tuned_bcast_ompi(void *buff, int count,
 126                                           MPI_Datatype datatype, int root,
 127                                           MPI_Comm  comm
 128                                           )
 129 {
 130     /* Decision function based on MX results for
 131        messages up to 36MB and communicator sizes up to 64 nodes */
 132     //const size_t small_message_size = 2048;
 133     const size_t intermediate_message_size = 370728;
 134     //const double a_p16  = 3.2118e-6; /* [1 / byte] */
 135     //const double b_p16  = 8.7936;
 136     //const double a_p64  = 2.3679e-6; /* [1 / byte] */
 137     //const double b_p64  = 1.1787;
 138     //const double a_p128 = 1.6134e-6; /* [1 / byte] */
 139     //const double b_p128 = 2.1102;
 140
 141     //int communicator_size;
 142     //int segsize = 0;
 143     size_t message_size, dsize;
 144
 145     //communicator_size = smpi_comm_size(comm);
 146
 147     /* else we need data size for decision function */
 148     dsize = smpi_datatype_size(datatype);
 149     message_size = dsize * (unsigned long)count;   /* needed for decision */
 150
 151     /* Handle messages of small and intermediate size, and
 152        single-element broadcasts */
 153     if ((message_size < /*small_message_size*/intermediate_message_size) || (count <= 1)) {
 154         /* Binomial without segmentation */
 155         //segsize = 0;
 156         return  smpi_coll_tuned_bcast_binomial_tree (buff, count, datatype,
 157                                                       root, comm/*
 158                                                       segsize*/);
 159
 160     } /*else if (message_size < intermediate_message_size) {
 161         // SplittedBinary with 1KB segments
 162         segsize = 1024;
 163         return smpi_coll_tuned_bcast_split_bintree(buff, count, datatype,
 164                                                          root, comm
 165                                                          segsize);
 166
 167     }
 168      Handle large message sizes
 169     else if (communicator_size < (a_p128 * message_size + b_p128)) {
 170          Pipeline with 128KB segments
 171         segsize = 1024  << 7;
 172         return smpi_coll_tuned_bcast_flattree_pipeline (buff, count, datatype,
 173                                                      root, comm, module,
 174                                                      segsize);
 175
 176     } else if (communicator_size < 13) {
 177         // Split Binary with 8KB segments
 178         segsize = 1024 << 3;
 179         return smpi_coll_tuned_bcast_intra_split_bintree(buff, count, datatype,
 180                                                          root, comm, module,
 181                                                          segsize);
 182
 183     } else if (communicator_size < (a_p64 * message_size + b_p64)) {
 184         // Pipeline with 64KB segments
 185         segsize = 1024 << 6;
 186         return smpi_coll_tuned_bcast_intra_pipeline (buff, count, datatype,
 187                                                      root, comm, module,
 188                                                      segsize);
 189
 190     } else if (communicator_size < (a_p16 * message_size + b_p16)) {
 191          Pipeline with 16KB segments
 192         //segsize = 1024 << 4;
 193         return smpi_coll_tuned_bcast_flattree_pipeline (buff, count, datatype,
 194                                                      root, comm, module,
 195                                                      segsize);
 196
 197     }*/
 198
 199     /* Pipeline with 8KB segments */
 200     //segsize = 1024 << 3;
 201     return smpi_coll_tuned_bcast_flattree_pipeline (buff, count, datatype,
 202                                                  root, comm
 203                                                  /*segsize*/);
 204 #if 0
 205     /* this is based on gige measurements */
 206
 207     if (communicator_size  < 4) {
 208         return smpi_coll_tuned_bcast_intra_basic_linear (buff, count, datatype, root, comm, module);
 209     }
 210     if (communicator_size == 4) {
 211         if (message_size < 524288) segsize = 0;
 212         else segsize = 16384;
 213         return smpi_coll_tuned_bcast_intra_bintree (buff, count, datatype, root, comm, module, segsize);
 214     }
 215     if (communicator_size <= 8 && message_size < 4096) {
 216         return smpi_coll_tuned_bcast_intra_basic_linear (buff, count, datatype, root, comm, module);
 217     }
 218     if (communicator_size > 8 && message_size >= 32768 && message_size < 524288) {
 219         segsize = 16384;
 220         return  smpi_coll_tuned_bcast_intra_bintree (buff, count, datatype, root, comm, module, segsize);
 221     }
 222     if (message_size >= 524288) {
 223         segsize = 16384;
 224         return smpi_coll_tuned_bcast_intra_pipeline (buff, count, datatype, root, comm, module, segsize);
 225     }
 226     segsize = 0;
 227     /* once tested can swap this back in */
 228     /* return smpi_coll_tuned_bcast_intra_bmtree (buff, count, datatype, root, comm, segsize); */
 229     return smpi_coll_tuned_bcast_intra_bintree (buff, count, datatype, root, comm, module, segsize);
 230 #endif  /* 0 */
 231 }
 232
 233 int smpi_coll_tuned_reduce_ompi( void *sendbuf, void *recvbuf,
 234                                             int count, MPI_Datatype  datatype,
 235                                             MPI_Op   op, int root,
 236                                             MPI_Comm   comm
 237                                             )
 238 {
 239     int communicator_size=0;
 240     //int segsize = 0;
 241     size_t message_size, dsize;
 242     //const double a1 =  0.6016 / 1024.0; /* [1/B] */
 243     //const double b1 =  1.3496;
 244     //const double a2 =  0.0410 / 1024.0; /* [1/B] */
 245     //const double b2 =  9.7128;
 246     //const double a3 =  0.0422 / 1024.0; /* [1/B] */
 247     //const double b3 =  1.1614;
 248     //const double a4 =  0.0033 / 1024.0; /* [1/B] */
 249     //const double b4 =  1.6761;
 250
 251     //const int max_requests = 0; /* no limit on # of outstanding requests */
 252
 253     communicator_size = smpi_comm_size(comm);
 254
 255     /* need data size for decision function */
 256     dsize=smpi_datatype_size(datatype);
 257     message_size = dsize * count;   /* needed for decision */
 258
 259     /**
 260      * If the operation is non commutative we currently have choice of linear
 261      * or in-order binary tree algorithm.
 262      */
 263 /*    if( !ompi_op_is_commute(op) ) {
 264         if ((communicator_size < 12) && (message_size < 2048)) {
 265             return smpi_coll_tuned_reduce_intra_basic_linear (sendbuf, recvbuf, count, datatype, op, root, comm, module);
 266         }
 267         return smpi_coll_tuned_reduce_intra_in_order_binary (sendbuf, recvbuf, count, datatype, op, root, comm, module,
 268                                                              0, max_requests);
 269     }*/
 270
 271     if ((communicator_size < 8) && (message_size < 512)){
 272         /* Linear_0K */
 273         return smpi_coll_tuned_reduce_flat_tree (sendbuf, recvbuf, count, datatype, op, root, comm);
 274     } else if (((communicator_size < 8) && (message_size < 20480)) ||
 275                (message_size < 2048) || (count <= 1)) {
 276         /* Binomial_0K */
 277         //segsize = 0;
 278         return smpi_coll_tuned_reduce_binomial(sendbuf, recvbuf, count, datatype, op, root, comm/*, module,
 279                                                      segsize, max_requests*/);
 280     } /*else if (communicator_size > (a1 * message_size + b1)) {
 281         // Binomial_1K
 282         segsize = 1024;
 283         return smpi_coll_tuned_reduce_intra_binomial(sendbuf, recvbuf, count, datatype, op, root, comm, module,
 284                                                      segsize, max_requests);
 285     } else if (communicator_size > (a2 * message_size + b2)) {
 286         // Pipeline_1K
 287         segsize = 1024;
 288         return smpi_coll_tuned_reduce_NTSL (sendbuf, recvbuf, count, datatype, op, root, comm, module,
 289                                                       segsize, max_requests);
 290     } else if (communicator_size > (a3 * message_size + b3)) {
 291         // Binary_32K
 292         segsize = 32*1024;
 293         return smpi_coll_tuned_reduce_intra_binary( sendbuf, recvbuf, count, datatype, op, root,
 294                                                     comm, module, segsize, max_requests);
 295     }
 296     if (communicator_size > (a4 * message_size + b4)) {
 297         // Pipeline_32K
 298         segsize = 32*1024;
 299     } else {
 300         // Pipeline_64K
 301         segsize = 64*1024;
 302     }*/
 303     return smpi_coll_tuned_reduce_NTSL (sendbuf, recvbuf, count, datatype, op, root, comm/*, module,
 304                                                   segsize, max_requests*/);
 305
 306 #if 0
 307     /* for small messages use linear algorithm */
 308     if (message_size <= 4096) {
 309         segsize = 0;
 310         fanout = communicator_size - 1;
 311         /* when linear implemented or taken from basic put here, right now using chain as a linear system */
 312         /* it is implemented and I shouldn't be calling a chain with a fanout bigger than MAXTREEFANOUT from topo.h! */
 313         return smpi_coll_tuned_reduce_intra_basic_linear (sendbuf, recvbuf, count, datatype, op, root, comm, module);
 314         /*        return smpi_coll_tuned_reduce_intra_chain (sendbuf, recvbuf, count, datatype, op, root, comm, segsize, fanout); */
 315     }
 316     if (message_size < 524288) {
 317         if (message_size <= 65536 ) {
 318             segsize = 32768;
 319             fanout = 8;
 320         } else {
 321             segsize = 1024;
 322             fanout = communicator_size/2;
 323         }
 324         /* later swap this for a binary tree */
 325         /*         fanout = 2; */
 326         return smpi_coll_tuned_reduce_intra_chain (sendbuf, recvbuf, count, datatype, op, root, comm, module,
 327                                                    segsize, fanout, max_requests);
 328     }
 329     segsize = 1024;
 330     return smpi_coll_tuned_reduce_intra_pipeline (sendbuf, recvbuf, count, datatype, op, root, comm, module,
 331                                                   segsize, max_requests);
 332 #endif  /* 0 */
 333 }
 334
 335 /*int smpi_coll_tuned_reduce_scatter_ompi( void *sbuf, void *rbuf,
 336                                                     int *rcounts,
 337                                                     MPI_Datatype dtype,
 338                                                     MPI_Op  op,
 339                                                     MPI_Comm  comm,
 340                                                     )
 341 {
 342     int comm_size, i, pow2;
 343     size_t total_message_size, dsize;
 344     const double a = 0.0012;
 345     const double b = 8.0;
 346     const size_t small_message_size = 12 * 1024;
 347     const size_t large_message_size = 256 * 1024;
 348     bool zerocounts = false;
 349
 350     OPAL_OUTPUT((smpi_coll_tuned_stream, "smpi_coll_tuned_reduce_scatter_ompi"));
 351
 352     comm_size = smpi_comm_size(comm);
 353     // We need data size for decision function
 354     ompi_datatype_type_size(dtype, &dsize);
 355     total_message_size = 0;
 356     for (i = 0; i < comm_size; i++) {
 357         total_message_size += rcounts[i];
 358         if (0 == rcounts[i]) {
 359             zerocounts = true;
 360         }
 361     }
 362
 363     if( !ompi_op_is_commute(op) || (zerocounts)) {
 364         return smpi_coll_tuned_reduce_scatter_intra_nonoverlapping (sbuf, rbuf, rcounts,
 365                                                                     dtype, op,
 366                                                                     comm, module);
 367     }
 368
 369     total_message_size *= dsize;
 370
 371     // compute the nearest power of 2
 372     for (pow2 = 1; pow2 < comm_size; pow2 <<= 1);
 373
 374     if ((total_message_size <= small_message_size) ||
 375         ((total_message_size <= large_message_size) && (pow2 == comm_size)) ||
 376         (comm_size >= a * total_message_size + b)) {
 377         return
 378             smpi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(sbuf, rbuf, rcounts,
 379                                                                         dtype, op,
 380                                                                         comm, module);
 381     }
 382     return smpi_coll_tuned_reduce_scatter_intra_ring(sbuf, rbuf, rcounts,
 383                                                      dtype, op,
 384                                                      comm, module);
 385
 386
 387     return smpi_coll_tuned_reduce_scatter(sbuf, rbuf, rcounts,
 388                                                      dtype, op,
 389                                                      comm;
 390
 391 }*/
 392
 393 int smpi_coll_tuned_allgather_ompi(void *sbuf, int scount,
 394                                               MPI_Datatype sdtype,
 395                                               void* rbuf, int rcount,
 396                                               MPI_Datatype rdtype,
 397                                               MPI_Comm  comm
 398                                               )
 399 {
 400     int communicator_size, pow2_size;
 401     size_t dsize, total_dsize;
 402
 403     communicator_size = smpi_comm_size(comm);
 404
 405     /* Special case for 2 processes */
 406     if (communicator_size == 2) {
 407         return smpi_coll_tuned_allgather_pair (sbuf, scount, sdtype,
 408                                                           rbuf, rcount, rdtype,
 409                                                           comm/*, module*/);
 410     }
 411
 412     /* Determine complete data size */
 413     dsize=smpi_datatype_size(sdtype);
 414     total_dsize = dsize * scount * communicator_size;
 415
 416     for (pow2_size  = 1; pow2_size < communicator_size; pow2_size <<=1);
 417
 418     /* Decision based on MX 2Gb results from Grig cluster at
 419        The University of Tennesse, Knoxville
 420        - if total message size is less than 50KB use either bruck or
 421        recursive doubling for non-power of two and power of two nodes,
 422        respectively.
 423        - else use ring and neighbor exchange algorithms for odd and even
 424        number of nodes, respectively.
 425     */
 426     if (total_dsize < 50000) {
 427         if (pow2_size == communicator_size) {
 428             return smpi_coll_tuned_allgather_rdb(sbuf, scount, sdtype,
 429                                                                      rbuf, rcount, rdtype,
 430                                                                      comm);
 431         } else {
 432             return smpi_coll_tuned_allgather_bruck(sbuf, scount, sdtype,
 433                                                          rbuf, rcount, rdtype,
 434                                                          comm);
 435         }
 436     } else {
 437         //if (communicator_size % 2) {
 438             return smpi_coll_tuned_allgather_ring(sbuf, scount, sdtype,
 439                                                         rbuf, rcount, rdtype,
 440                                                         comm);
 441         /*} else {
 442             return  smpi_coll_tuned_allgather_intra_neighborexchange(sbuf, scount, sdtype,
 443                                                                      rbuf, rcount, rdtype,
 444                                                                      comm, module);
 445         }*/
 446     }
 447
 448 #if defined(USE_MPICH2_DECISION)
 449     /* Decision as in MPICH-2
 450        presented in Thakur et.al. "Optimization of Collective Communication
 451        Operations in MPICH", International Journal of High Performance Computing
 452        Applications, Vol. 19, No. 1, 49-66 (2005)
 453        - for power-of-two processes and small and medium size messages
 454        (up to 512KB) use recursive doubling
 455        - for non-power-of-two processes and small messages (80KB) use bruck,
 456        - for everything else use ring.
 457     */
 458     if ((pow2_size == communicator_size) && (total_dsize < 524288)) {
 459         return smpi_coll_tuned_allgather_intra_recursivedoubling(sbuf, scount, sdtype,
 460                                                                  rbuf, rcount, rdtype,
 461                                                                  comm, module);
 462     } else if (total_dsize <= 81920) {
 463         return smpi_coll_tuned_allgather_intra_bruck(sbuf, scount, sdtype,
 464                                                      rbuf, rcount, rdtype,
 465                                                      comm, module);
 466     }
 467     return smpi_coll_tuned_allgather_intra_ring(sbuf, scount, sdtype,
 468                                                 rbuf, rcount, rdtype,
 469                                                 comm, module);
 470 #endif  /* defined(USE_MPICH2_DECISION) */
 471 }
 472
 473 int smpi_coll_tuned_allgatherv_ompi(void *sbuf, int scount,
 474                                                MPI_Datatype sdtype,
 475                                                void* rbuf, int *rcounts,
 476                                                int *rdispls,
 477                                                MPI_Datatype rdtype,
 478                                                MPI_Comm  comm
 479                                                )
 480 {
 481     int i;
 482     int communicator_size;
 483     size_t dsize, total_dsize;
 484
 485     communicator_size = smpi_comm_size(comm);
 486
 487     /* Special case for 2 processes */
 488     if (communicator_size == 2) {
 489         return smpi_coll_tuned_allgatherv_pair(sbuf, scount, sdtype,
 490                                                            rbuf, rcounts, rdispls, rdtype,
 491                                                            comm);
 492     }
 493
 494     /* Determine complete data size */
 495     dsize=smpi_datatype_size(sdtype);
 496     total_dsize = 0;
 497     for (i = 0; i < communicator_size; i++) {
 498         total_dsize += dsize * rcounts[i];
 499     }
 500
 501     /* Decision based on allgather decision.   */
 502     if (total_dsize < 50000) {
 503 /*        return smpi_coll_tuned_allgatherv_intra_bruck(sbuf, scount, sdtype,
 504                                                       rbuf, rcounts, rdispls, rdtype,
 505                                                       comm, module);*/
 506     return smpi_coll_tuned_allgatherv_ring(sbuf, scount, sdtype,
 507                                                       rbuf, rcounts, rdispls, rdtype,
 508                                                       comm);
 509
 510     } else {
 511 //        if (communicator_size % 2) {
 512             return smpi_coll_tuned_allgatherv_ring(sbuf, scount, sdtype,
 513                                                          rbuf, rcounts, rdispls, rdtype,
 514                                                          comm);
 515 /*        } else {
 516             return  smpi_coll_tuned_allgatherv_intra_neighborexchange(sbuf, scount, sdtype,
 517                                                                       rbuf, rcounts, rdispls, rdtype,
 518                                                                       comm, module);
 519         }*/
 520     }
 521 }
 522 /*
 523 int smpi_coll_tuned_gather_ompi(void *sbuf, int scount,
 524                                            MPI_Datatype sdtype,
 525                                            void* rbuf, int rcount,
 526                                            MPI_Datatype rdtype,
 527                                            int root,
 528                                            MPI_Comm  comm,
 529                                            )
 530 {
 531     const int large_segment_size = 32768;
 532     const int small_segment_size = 1024;
 533
 534     const size_t large_block_size = 92160;
 535     const size_t intermediate_block_size = 6000;
 536     const size_t small_block_size = 1024;
 537
 538     const int large_communicator_size = 60;
 539     const int small_communicator_size = 10;
 540
 541     int communicator_size, rank;
 542     size_t dsize, block_size;
 543
 544     OPAL_OUTPUT((smpi_coll_tuned_stream,
 545                  "smpi_coll_tuned_gather_ompi"));
 546
 547     communicator_size = smpi_comm_size(comm);
 548     rank = ompi_comm_rank(comm);
 549
 550     // Determine block size
 551     if (rank == root) {
 552         ompi_datatype_type_size(rdtype, &dsize);
 553         block_size = dsize * rcount;
 554     } else {
 555         ompi_datatype_type_size(sdtype, &dsize);
 556         block_size = dsize * scount;
 557     }
 558
 559     if (block_size > large_block_size) {
 560         return smpi_coll_tuned_gather_intra_linear_sync (sbuf, scount, sdtype,
 561                                                          rbuf, rcount, rdtype,
 562                                                          root, comm, module,
 563                                                          large_segment_size);
 564
 565     } else if (block_size > intermediate_block_size) {
 566         return smpi_coll_tuned_gather_intra_linear_sync (sbuf, scount, sdtype,
 567                                                          rbuf, rcount, rdtype,
 568                                                          root, comm, module,
 569                                                          small_segment_size);
 570
 571     } else if ((communicator_size > large_communicator_size) ||
 572                ((communicator_size > small_communicator_size) &&
 573                 (block_size < small_block_size))) {
 574         return smpi_coll_tuned_gather_intra_binomial (sbuf, scount, sdtype,
 575                                                       rbuf, rcount, rdtype,
 576                                                       root, comm, module);
 577
 578     }
 579     // Otherwise, use basic linear
 580     return smpi_coll_tuned_gather_intra_basic_linear (sbuf, scount, sdtype,
 581                                                       rbuf, rcount, rdtype,
 582                                                       root, comm, module);
 583 }*/
 584 /*
 585 int smpi_coll_tuned_scatter_ompi(void *sbuf, int scount,
 586                                             MPI_Datatype sdtype,
 587                                             void* rbuf, int rcount,
 588                                             MPI_Datatype rdtype,
 589                                             int root, MPI_Comm  comm,
 590                                             )
 591 {
 592     const size_t small_block_size = 300;
 593     const int small_comm_size = 10;
 594     int communicator_size, rank;
 595     size_t dsize, block_size;
 596
 597     OPAL_OUTPUT((smpi_coll_tuned_stream,
 598                  "smpi_coll_tuned_scatter_ompi"));
 599
 600     communicator_size = smpi_comm_size(comm);
 601     rank = ompi_comm_rank(comm);
 602     // Determine block size
 603     if (root == rank) {
 604         ompi_datatype_type_size(sdtype, &dsize);
 605         block_size = dsize * scount;
 606     } else {
 607         ompi_datatype_type_size(rdtype, &dsize);
 608         block_size = dsize * rcount;
 609     }
 610
 611     if ((communicator_size > small_comm_size) &&
 612         (block_size < small_block_size)) {
 613         return smpi_coll_tuned_scatter_intra_binomial (sbuf, scount, sdtype,
 614                                                        rbuf, rcount, rdtype,
 615                                                        root, comm, module);
 616     }
 617     return smpi_coll_tuned_scatter_intra_basic_linear (sbuf, scount, sdtype,
 618                                                        rbuf, rcount, rdtype,
 619                                                        root, comm, module);
 620 }*/
 621