src/smpi/colls/smpi_openmpi_selector.cpp

   1 /* selector for collective algorithms based on openmpi's default coll_tuned_decision_fixed selector */
   2
   3 /* Copyright (c) 2009-2010, 2013-2014. The SimGrid Team.
   4  * All rights reserved.                                                     */
   5
   6 /* This program is free software; you can redistribute it and/or modify it
   7  * under the terms of the license (GNU LGPL) which comes with this package. */
   8
   9 #include "colls_private.h"
  10
  11
  12 int smpi_coll_tuned_allreduce_ompi(void *sbuf, void *rbuf, int count,
  13                         MPI_Datatype dtype, MPI_Op op, MPI_Comm comm)
  14 {
  15     size_t dsize, block_dsize;
  16     int comm_size = comm->size();
  17     const size_t intermediate_message = 10000;
  18
  19     /**
  20      * Decision function based on MX results from the Grig cluster at UTK.
  21      *
  22      * Currently, linear, recursive doubling, and nonoverlapping algorithms
  23      * can handle both commutative and non-commutative operations.
  24      * Ring algorithm does not support non-commutative operations.
  25      */
  26     dsize = smpi_datatype_size(dtype);
  27     block_dsize = dsize * count;
  28
  29     if (block_dsize < intermediate_message) {
  30         return (smpi_coll_tuned_allreduce_rdb (sbuf, rbuf,
  31                                                                    count, dtype,
  32                                                                    op, comm));
  33     }
  34
  35     if( smpi_op_is_commute(op) && (count > comm_size) ) {
  36         const size_t segment_size = 1 << 20; /* 1 MB */
  37         if ((comm_size * segment_size >= block_dsize)) {
  38             //FIXME: ok, these are not the right algorithms, try to find closer ones
  39             // lr is a good match for allreduce_ring (difference is mainly the use of sendrecv)
  40             return smpi_coll_tuned_allreduce_lr(sbuf, rbuf, count, dtype,
  41                                               op, comm);
  42         } else {
  43            return (smpi_coll_tuned_allreduce_ompi_ring_segmented (sbuf, rbuf,
  44                                                                     count, dtype,
  45                                                                     op, comm
  46                                                                     /*segment_size*/));
  47         }
  48     }
  49
  50     return (smpi_coll_tuned_allreduce_redbcast(sbuf, rbuf, count,
  51                                                             dtype, op, comm));
  52 }
  53
  54
  55
  56 int smpi_coll_tuned_alltoall_ompi( void *sbuf, int scount,
  57                                              MPI_Datatype sdtype,
  58                                              void* rbuf, int rcount,
  59                                              MPI_Datatype rdtype,
  60                                              MPI_Comm comm)
  61 {
  62     int communicator_size;
  63     size_t dsize, block_dsize;
  64     communicator_size = comm->size();
  65
  66     /* Decision function based on measurement on Grig cluster at
  67        the University of Tennessee (2GB MX) up to 64 nodes.
  68        Has better performance for messages of intermediate sizes than the old one */
  69     /* determine block size */
  70     dsize = smpi_datatype_size(sdtype);
  71     block_dsize = dsize * scount;
  72
  73     if ((block_dsize < 200) && (communicator_size > 12)) {
  74         return smpi_coll_tuned_alltoall_bruck(sbuf, scount, sdtype,
  75                                                     rbuf, rcount, rdtype,
  76                                                     comm);
  77
  78     } else if (block_dsize < 3000) {
  79         return smpi_coll_tuned_alltoall_basic_linear(sbuf, scount, sdtype,
  80                                                            rbuf, rcount, rdtype,
  81                                                            comm);
  82     }
  83
  84     return smpi_coll_tuned_alltoall_ring (sbuf, scount, sdtype,
  85                                                     rbuf, rcount, rdtype,
  86                                                     comm);
  87 }
  88
  89 int smpi_coll_tuned_alltoallv_ompi(void *sbuf, int *scounts, int *sdisps,
  90                                               MPI_Datatype sdtype,
  91                                               void *rbuf, int *rcounts, int *rdisps,
  92                                               MPI_Datatype rdtype,
  93                                               MPI_Comm  comm
  94                                               )
  95 {
  96     /* For starters, just keep the original algorithm. */
  97     return smpi_coll_tuned_alltoallv_ompi_basic_linear(sbuf, scounts, sdisps, sdtype,
  98                                                         rbuf, rcounts, rdisps,rdtype,
  99                                                         comm);
 100 }
 101
 102
 103 int smpi_coll_tuned_barrier_ompi(MPI_Comm  comm)
 104 {    int communicator_size = comm->size();
 105
 106     if( 2 == communicator_size )
 107         return smpi_coll_tuned_barrier_ompi_two_procs(comm);
 108 /*     * Basic optimisation. If we have a power of 2 number of nodes*/
 109 /*     * the use the recursive doubling algorithm, otherwise*/
 110 /*     * bruck is the one we want.*/
 111     {
 112         int has_one = 0;
 113         for( ; communicator_size > 0; communicator_size >>= 1 ) {
 114             if( communicator_size & 0x1 ) {
 115                 if( has_one )
 116                     return smpi_coll_tuned_barrier_ompi_bruck(comm);
 117                 has_one = 1;
 118             }
 119         }
 120     }
 121     return smpi_coll_tuned_barrier_ompi_recursivedoubling(comm);
 122 }
 123
 124 int smpi_coll_tuned_bcast_ompi(void *buff, int count,
 125                                           MPI_Datatype datatype, int root,
 126                                           MPI_Comm  comm
 127                                           )
 128 {
 129     /* Decision function based on MX results for
 130        messages up to 36MB and communicator sizes up to 64 nodes */
 131     const size_t small_message_size = 2048;
 132     const size_t intermediate_message_size = 370728;
 133     const double a_p16  = 3.2118e-6; /* [1 / byte] */
 134     const double b_p16  = 8.7936;
 135     const double a_p64  = 2.3679e-6; /* [1 / byte] */
 136     const double b_p64  = 1.1787;
 137     const double a_p128 = 1.6134e-6; /* [1 / byte] */
 138     const double b_p128 = 2.1102;
 139
 140     int communicator_size;
 141     //int segsize = 0;
 142     size_t message_size, dsize;
 143
 144     communicator_size = comm->size();
 145
 146     /* else we need data size for decision function */
 147     dsize = smpi_datatype_size(datatype);
 148     message_size = dsize * (unsigned long)count;   /* needed for decision */
 149
 150     /* Handle messages of small and intermediate size, and
 151        single-element broadcasts */
 152     if ((message_size < small_message_size) || (count <= 1)) {
 153         /* Binomial without segmentation */
 154         return  smpi_coll_tuned_bcast_binomial_tree (buff, count, datatype,
 155                                                       root, comm);
 156
 157     } else if (message_size < intermediate_message_size) {
 158         // SplittedBinary with 1KB segments
 159         return smpi_coll_tuned_bcast_ompi_split_bintree(buff, count, datatype,
 160                                                          root, comm);
 161
 162     }
 163      //Handle large message sizes
 164     else if (communicator_size < (a_p128 * message_size + b_p128)) {
 165         //Pipeline with 128KB segments
 166         //segsize = 1024  << 7;
 167         return smpi_coll_tuned_bcast_ompi_pipeline (buff, count, datatype,
 168                                                      root, comm);
 169
 170
 171     } else if (communicator_size < 13) {
 172         // Split Binary with 8KB segments
 173         return smpi_coll_tuned_bcast_ompi_split_bintree(buff, count, datatype,
 174                                                          root, comm);
 175
 176     } else if (communicator_size < (a_p64 * message_size + b_p64)) {
 177         // Pipeline with 64KB segments
 178         //segsize = 1024 << 6;
 179         return smpi_coll_tuned_bcast_ompi_pipeline (buff, count, datatype,
 180                                                      root, comm);
 181
 182
 183     } else if (communicator_size < (a_p16 * message_size + b_p16)) {
 184         //Pipeline with 16KB segments
 185         //segsize = 1024 << 4;
 186         return smpi_coll_tuned_bcast_ompi_pipeline (buff, count, datatype,
 187                                                      root, comm);
 188
 189
 190     }
 191     /* Pipeline with 8KB segments */
 192     //segsize = 1024 << 3;
 193     return smpi_coll_tuned_bcast_flattree_pipeline (buff, count, datatype,
 194                                                  root, comm
 195                                                  /*segsize*/);
 196 #if 0
 197     /* this is based on gige measurements */
 198
 199     if (communicator_size  < 4) {
 200         return smpi_coll_tuned_bcast_intra_basic_linear (buff, count, datatype, root, comm, module);
 201     }
 202     if (communicator_size == 4) {
 203         if (message_size < 524288) segsize = 0;
 204         else segsize = 16384;
 205         return smpi_coll_tuned_bcast_intra_bintree (buff, count, datatype, root, comm, module, segsize);
 206     }
 207     if (communicator_size <= 8 && message_size < 4096) {
 208         return smpi_coll_tuned_bcast_intra_basic_linear (buff, count, datatype, root, comm, module);
 209     }
 210     if (communicator_size > 8 && message_size >= 32768 && message_size < 524288) {
 211         segsize = 16384;
 212         return  smpi_coll_tuned_bcast_intra_bintree (buff, count, datatype, root, comm, module, segsize);
 213     }
 214     if (message_size >= 524288) {
 215         segsize = 16384;
 216         return smpi_coll_tuned_bcast_intra_pipeline (buff, count, datatype, root, comm, module, segsize);
 217     }
 218     segsize = 0;
 219     /* once tested can swap this back in */
 220     /* return smpi_coll_tuned_bcast_intra_bmtree (buff, count, datatype, root, comm, segsize); */
 221     return smpi_coll_tuned_bcast_intra_bintree (buff, count, datatype, root, comm, module, segsize);
 222 #endif  /* 0 */
 223 }
 224
 225 int smpi_coll_tuned_reduce_ompi( void *sendbuf, void *recvbuf,
 226                                             int count, MPI_Datatype  datatype,
 227                                             MPI_Op   op, int root,
 228                                             MPI_Comm   comm
 229                                             )
 230 {
 231     int communicator_size=0;
 232     //int segsize = 0;
 233     size_t message_size, dsize;
 234     const double a1 =  0.6016 / 1024.0; /* [1/B] */
 235     const double b1 =  1.3496;
 236     const double a2 =  0.0410 / 1024.0; /* [1/B] */
 237     const double b2 =  9.7128;
 238     const double a3 =  0.0422 / 1024.0; /* [1/B] */
 239     const double b3 =  1.1614;
 240     //const double a4 =  0.0033 / 1024.0;  [1/B]
 241     //const double b4 =  1.6761;
 242
 243     /* no limit on # of outstanding requests */
 244     //const int max_requests = 0;
 245
 246     communicator_size = comm->size();
 247
 248     /* need data size for decision function */
 249     dsize=smpi_datatype_size(datatype);
 250     message_size = dsize * count;   /* needed for decision */
 251
 252     /**
 253      * If the operation is non commutative we currently have choice of linear
 254      * or in-order binary tree algorithm.
 255      */
 256     if( !smpi_op_is_commute(op) ) {
 257         if ((communicator_size < 12) && (message_size < 2048)) {
 258             return smpi_coll_tuned_reduce_ompi_basic_linear (sendbuf, recvbuf, count, datatype, op, root, comm/*, module*/);
 259         }
 260         return smpi_coll_tuned_reduce_ompi_in_order_binary (sendbuf, recvbuf, count, datatype, op, root, comm/*, module,
 261                                                              0, max_requests*/);
 262     }
 263
 264     if ((communicator_size < 8) && (message_size < 512)){
 265         /* Linear_0K */
 266         return smpi_coll_tuned_reduce_ompi_basic_linear (sendbuf, recvbuf, count, datatype, op, root, comm);
 267     } else if (((communicator_size < 8) && (message_size < 20480)) ||
 268                (message_size < 2048) || (count <= 1)) {
 269         /* Binomial_0K */
 270         //segsize = 0;
 271         return smpi_coll_tuned_reduce_ompi_binomial(sendbuf, recvbuf, count, datatype, op, root, comm/*, module,
 272                                                      segsize, max_requests*/);
 273     } else if (communicator_size > (a1 * message_size + b1)) {
 274         // Binomial_1K
 275         //segsize = 1024;
 276         return smpi_coll_tuned_reduce_ompi_binomial(sendbuf, recvbuf, count, datatype, op, root, comm/*, module,
 277                                                      segsize, max_requests*/);
 278     } else if (communicator_size > (a2 * message_size + b2)) {
 279         // Pipeline_1K
 280         //segsize = 1024;
 281         return smpi_coll_tuned_reduce_ompi_pipeline (sendbuf, recvbuf, count, datatype, op, root, comm/*, module,
 282                                                       segsize, max_requests*/);
 283     } else if (communicator_size > (a3 * message_size + b3)) {
 284         // Binary_32K
 285         //segsize = 32*1024;
 286         return smpi_coll_tuned_reduce_ompi_binary( sendbuf, recvbuf, count, datatype, op, root,
 287                                                     comm/*, module, segsize, max_requests*/);
 288     }
 289 //    if (communicator_size > (a4 * message_size + b4)) {
 290         // Pipeline_32K
 291 //        segsize = 32*1024;
 292 //    } else {
 293         // Pipeline_64K
 294 //        segsize = 64*1024;
 295 //    }
 296     return smpi_coll_tuned_reduce_ompi_pipeline (sendbuf, recvbuf, count, datatype, op, root, comm/*, module,
 297                                                   segsize, max_requests*/);
 298
 299 #if 0
 300     /* for small messages use linear algorithm */
 301     if (message_size <= 4096) {
 302         segsize = 0;
 303         fanout = communicator_size - 1;
 304         /* when linear implemented or taken from basic put here, right now using chain as a linear system */
 305         /* it is implemented and I shouldn't be calling a chain with a fanout bigger than MAXTREEFANOUT from topo.h! */
 306         return smpi_coll_tuned_reduce_intra_basic_linear (sendbuf, recvbuf, count, datatype, op, root, comm, module);
 307         /*        return smpi_coll_tuned_reduce_intra_chain (sendbuf, recvbuf, count, datatype, op, root, comm, segsize, fanout); */
 308     }
 309     if (message_size < 524288) {
 310         if (message_size <= 65536 ) {
 311             segsize = 32768;
 312             fanout = 8;
 313         } else {
 314             segsize = 1024;
 315             fanout = communicator_size/2;
 316         }
 317         /* later swap this for a binary tree */
 318         /*         fanout = 2; */
 319         return smpi_coll_tuned_reduce_intra_chain (sendbuf, recvbuf, count, datatype, op, root, comm, module,
 320                                                    segsize, fanout, max_requests);
 321     }
 322     segsize = 1024;
 323     return smpi_coll_tuned_reduce_intra_pipeline (sendbuf, recvbuf, count, datatype, op, root, comm, module,
 324                                                   segsize, max_requests);
 325 #endif  /* 0 */
 326 }
 327
 328 int smpi_coll_tuned_reduce_scatter_ompi( void *sbuf, void *rbuf,
 329                                                     int *rcounts,
 330                                                     MPI_Datatype dtype,
 331                                                     MPI_Op  op,
 332                                                     MPI_Comm  comm
 333                                                     )
 334 {
 335     int comm_size, i, pow2;
 336     size_t total_message_size, dsize;
 337     const double a = 0.0012;
 338     const double b = 8.0;
 339     const size_t small_message_size = 12 * 1024;
 340     const size_t large_message_size = 256 * 1024;
 341     int zerocounts = 0;
 342
 343     XBT_DEBUG("smpi_coll_tuned_reduce_scatter_ompi");
 344
 345     comm_size = comm->size();
 346     // We need data size for decision function
 347     dsize=smpi_datatype_size(dtype);
 348     total_message_size = 0;
 349     for (i = 0; i < comm_size; i++) {
 350         total_message_size += rcounts[i];
 351         if (0 == rcounts[i]) {
 352             zerocounts = 1;
 353         }
 354     }
 355
 356     if( !smpi_op_is_commute(op) || (zerocounts)) {
 357         smpi_mpi_reduce_scatter (sbuf, rbuf, rcounts,
 358                                                                     dtype, op,
 359                                                                     comm);
 360         return MPI_SUCCESS;
 361     }
 362
 363     total_message_size *= dsize;
 364
 365     // compute the nearest power of 2
 366     for (pow2 = 1; pow2 < comm_size; pow2 <<= 1);
 367
 368     if ((total_message_size <= small_message_size) ||
 369         ((total_message_size <= large_message_size) && (pow2 == comm_size)) ||
 370         (comm_size >= a * total_message_size + b)) {
 371         return
 372             smpi_coll_tuned_reduce_scatter_ompi_basic_recursivehalving(sbuf, rbuf, rcounts,
 373                                                                         dtype, op,
 374                                                                         comm);
 375     }
 376     return smpi_coll_tuned_reduce_scatter_ompi_ring(sbuf, rbuf, rcounts,
 377                                                      dtype, op,
 378                                                      comm);
 379
 380
 381
 382 }
 383
 384 int smpi_coll_tuned_allgather_ompi(void *sbuf, int scount,
 385                                               MPI_Datatype sdtype,
 386                                               void* rbuf, int rcount,
 387                                               MPI_Datatype rdtype,
 388                                               MPI_Comm  comm
 389                                               )
 390 {
 391     int communicator_size, pow2_size;
 392     size_t dsize, total_dsize;
 393
 394     communicator_size = comm->size();
 395
 396     /* Special case for 2 processes */
 397     if (communicator_size == 2) {
 398         return smpi_coll_tuned_allgather_pair (sbuf, scount, sdtype,
 399                                                           rbuf, rcount, rdtype,
 400                                                           comm/*, module*/);
 401     }
 402
 403     /* Determine complete data size */
 404     dsize=smpi_datatype_size(sdtype);
 405     total_dsize = dsize * scount * communicator_size;
 406
 407     for (pow2_size  = 1; pow2_size < communicator_size; pow2_size <<=1);
 408
 409     /* Decision based on MX 2Gb results from Grig cluster at
 410        The University of Tennesse, Knoxville
 411        - if total message size is less than 50KB use either bruck or
 412        recursive doubling for non-power of two and power of two nodes,
 413        respectively.
 414        - else use ring and neighbor exchange algorithms for odd and even
 415        number of nodes, respectively.
 416     */
 417     if (total_dsize < 50000) {
 418         if (pow2_size == communicator_size) {
 419             return smpi_coll_tuned_allgather_rdb(sbuf, scount, sdtype,
 420                                                                      rbuf, rcount, rdtype,
 421                                                                      comm);
 422         } else {
 423             return smpi_coll_tuned_allgather_bruck(sbuf, scount, sdtype,
 424                                                          rbuf, rcount, rdtype,
 425                                                          comm);
 426         }
 427     } else {
 428         if (communicator_size % 2) {
 429             return smpi_coll_tuned_allgather_ring(sbuf, scount, sdtype,
 430                                                         rbuf, rcount, rdtype,
 431                                                         comm);
 432         } else {
 433             return  smpi_coll_tuned_allgather_ompi_neighborexchange(sbuf, scount, sdtype,
 434                                                                      rbuf, rcount, rdtype,
 435                                                                      comm);
 436         }
 437     }
 438
 439 #if defined(USE_MPICH2_DECISION)
 440     /* Decision as in MPICH-2
 441        presented in Thakur et.al. "Optimization of Collective Communication
 442        Operations in MPICH", International Journal of High Performance Computing
 443        Applications, Vol. 19, No. 1, 49-66 (2005)
 444        - for power-of-two processes and small and medium size messages
 445        (up to 512KB) use recursive doubling
 446        - for non-power-of-two processes and small messages (80KB) use bruck,
 447        - for everything else use ring.
 448     */
 449     if ((pow2_size == communicator_size) && (total_dsize < 524288)) {
 450         return smpi_coll_tuned_allgather_rdb(sbuf, scount, sdtype,
 451                                                                  rbuf, rcount, rdtype,
 452                                                                  comm);
 453     } else if (total_dsize <= 81920) {
 454         return smpi_coll_tuned_allgather_bruck(sbuf, scount, sdtype,
 455                                                      rbuf, rcount, rdtype,
 456                                                      comm);
 457     }
 458     return smpi_coll_tuned_allgather_ring(sbuf, scount, sdtype,
 459                                                 rbuf, rcount, rdtype,
 460                                                 comm);
 461 #endif  /* defined(USE_MPICH2_DECISION) */
 462 }
 463
 464 int smpi_coll_tuned_allgatherv_ompi(void *sbuf, int scount,
 465                                                MPI_Datatype sdtype,
 466                                                void* rbuf, int *rcounts,
 467                                                int *rdispls,
 468                                                MPI_Datatype rdtype,
 469                                                MPI_Comm  comm
 470                                                )
 471 {
 472     int i;
 473     int communicator_size;
 474     size_t dsize, total_dsize;
 475
 476     communicator_size = comm->size();
 477
 478     /* Special case for 2 processes */
 479     if (communicator_size == 2) {
 480         return smpi_coll_tuned_allgatherv_pair(sbuf, scount, sdtype,
 481                                                            rbuf, rcounts, rdispls, rdtype,
 482                                                            comm);
 483     }
 484
 485     /* Determine complete data size */
 486     dsize=smpi_datatype_size(sdtype);
 487     total_dsize = 0;
 488     for (i = 0; i < communicator_size; i++) {
 489         total_dsize += dsize * rcounts[i];
 490     }
 491
 492     /* Decision based on allgather decision.   */
 493     if (total_dsize < 50000) {
 494 /*        return smpi_coll_tuned_allgatherv_intra_bruck(sbuf, scount, sdtype,
 495                                                       rbuf, rcounts, rdispls, rdtype,
 496                                                       comm, module);*/
 497     return smpi_coll_tuned_allgatherv_ring(sbuf, scount, sdtype,
 498                                                       rbuf, rcounts, rdispls, rdtype,
 499                                                       comm);
 500
 501     } else {
 502         if (communicator_size % 2) {
 503             return smpi_coll_tuned_allgatherv_ring(sbuf, scount, sdtype,
 504                                                          rbuf, rcounts, rdispls, rdtype,
 505                                                          comm);
 506         } else {
 507             return  smpi_coll_tuned_allgatherv_ompi_neighborexchange(sbuf, scount, sdtype,
 508                                                                       rbuf, rcounts, rdispls, rdtype,
 509                                                                       comm);
 510         }
 511     }
 512 }
 513
 514 int smpi_coll_tuned_gather_ompi(void *sbuf, int scount,
 515                                            MPI_Datatype sdtype,
 516                                            void* rbuf, int rcount,
 517                                            MPI_Datatype rdtype,
 518                                            int root,
 519                                            MPI_Comm  comm
 520                                            )
 521 {
 522     //const int large_segment_size = 32768;
 523     //const int small_segment_size = 1024;
 524
 525     //const size_t large_block_size = 92160;
 526     const size_t intermediate_block_size = 6000;
 527     const size_t small_block_size = 1024;
 528
 529     const int large_communicator_size = 60;
 530     const int small_communicator_size = 10;
 531
 532     int communicator_size, rank;
 533     size_t dsize, block_size;
 534
 535     XBT_DEBUG("smpi_coll_tuned_gather_ompi");
 536
 537     communicator_size = comm->size();
 538     rank = comm->rank();
 539
 540     // Determine block size
 541     if (rank == root) {
 542         dsize = smpi_datatype_size(rdtype);
 543         block_size = dsize * rcount;
 544     } else {
 545         dsize = smpi_datatype_size(sdtype);
 546         block_size = dsize * scount;
 547     }
 548
 549 /*    if (block_size > large_block_size) {*/
 550 /*        return smpi_coll_tuned_gather_ompi_linear_sync (sbuf, scount, sdtype, */
 551 /*                                                         rbuf, rcount, rdtype, */
 552 /*                                                         root, comm);*/
 553
 554 /*    } else*/ if (block_size > intermediate_block_size) {
 555         return smpi_coll_tuned_gather_ompi_linear_sync (sbuf, scount, sdtype,
 556                                                          rbuf, rcount, rdtype,
 557                                                          root, comm);
 558
 559     } else if ((communicator_size > large_communicator_size) ||
 560                ((communicator_size > small_communicator_size) &&
 561                 (block_size < small_block_size))) {
 562         return smpi_coll_tuned_gather_ompi_binomial (sbuf, scount, sdtype,
 563                                                       rbuf, rcount, rdtype,
 564                                                       root, comm);
 565
 566     }
 567     // Otherwise, use basic linear
 568     return smpi_coll_tuned_gather_ompi_basic_linear (sbuf, scount, sdtype,
 569                                                       rbuf, rcount, rdtype,
 570                                                       root, comm);
 571 }
 572
 573 int smpi_coll_tuned_scatter_ompi(void *sbuf, int scount,
 574                                             MPI_Datatype sdtype,
 575                                             void* rbuf, int rcount,
 576                                             MPI_Datatype rdtype,
 577                                             int root, MPI_Comm  comm
 578                                             )
 579 {
 580     const size_t small_block_size = 300;
 581     const int small_comm_size = 10;
 582     int communicator_size, rank;
 583     size_t dsize, block_size;
 584
 585     XBT_DEBUG("smpi_coll_tuned_scatter_ompi");
 586
 587     communicator_size = comm->size();
 588     rank = comm->rank();
 589     // Determine block size
 590     if (root == rank) {
 591         dsize=smpi_datatype_size(sdtype);
 592         block_size = dsize * scount;
 593     } else {
 594         dsize=smpi_datatype_size(rdtype);
 595         block_size = dsize * rcount;
 596     }
 597
 598     if ((communicator_size > small_comm_size) &&
 599         (block_size < small_block_size)) {
 600         if(rank!=root){
 601             sbuf=xbt_malloc(rcount*smpi_datatype_get_extent(rdtype));
 602             scount=rcount;
 603             sdtype=rdtype;
 604         }
 605         int ret=smpi_coll_tuned_scatter_ompi_binomial (sbuf, scount, sdtype,
 606             rbuf, rcount, rdtype,
 607             root, comm);
 608         if(rank!=root){
 609             xbt_free(sbuf);
 610         }
 611         return ret;
 612     }
 613     return smpi_coll_tuned_scatter_ompi_basic_linear (sbuf, scount, sdtype,
 614                                                        rbuf, rcount, rdtype,
 615                                                        root, comm);
 616 }
 617