src/smpi/colls/allreduce-smp-binomial.c

   1 /* Copyright (c) 2013-2014. The SimGrid Team.
   2  * All rights reserved.                                                     */
   3
   4 /* This program is free software; you can redistribute it and/or modify it
   5  * under the terms of the license (GNU LGPL) which comes with this package. */
   6
   7 #include "colls_private.h"
   8 /* IMPLEMENTED BY PITCH PATARASUK
   9    Non-topology-specific (however, the number of cores per node needs to be changed)
  10    all-reduce operation designed for smp clusters
  11    It uses 2-layer communication: binomial for both intra-communication
  12    and inter-communication */
  13
  14
  15 /* ** NOTE **
  16    Use -DMPICH2 if this code does not compile.
  17    MPICH1 code also works on MPICH2 on our cluster and the performance is similar.
  18    This code assumes a commutative and associative reduce operator (MPI_SUM, MPI_MAX, etc).
  19 */
  20
  21 //#include <star-reduction.c>
  22
  23 /*
  24 This fucntion performs all-reduce operation as follow.
  25 1) binomial_tree reduce inside each SMP node
  26 2) binomial_tree reduce intra-communication between root of each SMP node
  27 3) binomial_tree bcast intra-communication between root of each SMP node
  28 4) binomial_tree bcast inside each SMP node
  29 */
  30 int smpi_coll_tuned_allreduce_smp_binomial(void *send_buf, void *recv_buf,
  31                                            int count, MPI_Datatype dtype,
  32                                            MPI_Op op, MPI_Comm comm)
  33 {
  34   int comm_size, rank;
  35   void *tmp_buf;
  36   int tag = COLL_TAG_ALLREDUCE;
  37   int mask, src, dst;
  38
  39   if(smpi_comm_get_leaders_comm(comm)==MPI_COMM_NULL){
  40     smpi_comm_init_smp(comm);
  41   }
  42   int num_core=1;
  43   if (smpi_comm_is_uniform(comm)){
  44     num_core = smpi_comm_size(smpi_comm_get_intra_comm(comm));
  45   }
  46   MPI_Status status;
  47
  48   comm_size=smpi_comm_size(comm);
  49   rank=smpi_comm_rank(comm);
  50   MPI_Aint extent, lb;
  51   smpi_datatype_extent(dtype, &lb, &extent);
  52   tmp_buf = (void *) smpi_get_tmp_sendbuffer(count * extent);
  53
  54   /* compute intra and inter ranking */
  55   int intra_rank, inter_rank;
  56   intra_rank = rank % num_core;
  57   inter_rank = rank / num_core;
  58
  59   /* size of processes participate in intra communications =>
  60      should be equal to number of machines */
  61   int inter_comm_size = (comm_size + num_core - 1) / num_core;
  62
  63   /* copy input buffer to output buffer */
  64   smpi_mpi_sendrecv(send_buf, count, dtype, rank, tag,
  65                recv_buf, count, dtype, rank, tag, comm, &status);
  66
  67   /* start binomial reduce intra communication inside each SMP node */
  68   mask = 1;
  69   while (mask < num_core) {
  70     if ((mask & intra_rank) == 0) {
  71       src = (inter_rank * num_core) + (intra_rank | mask);
  72       if (src < comm_size) {
  73         smpi_mpi_recv(tmp_buf, count, dtype, src, tag, comm, &status);
  74         smpi_op_apply(op, tmp_buf, recv_buf, &count, &dtype);
  75       }
  76     } else {
  77       dst = (inter_rank * num_core) + (intra_rank & (~mask));
  78       smpi_mpi_send(recv_buf, count, dtype, dst, tag, comm);
  79       break;
  80     }
  81     mask <<= 1;
  82   }
  83
  84   /* start binomial reduce inter-communication between each SMP nodes:
  85      each node only have one process that can communicate to other nodes */
  86   if (intra_rank == 0) {
  87     mask = 1;
  88     while (mask < inter_comm_size) {
  89       if ((mask & inter_rank) == 0) {
  90         src = (inter_rank | mask) * num_core;
  91         if (src < comm_size) {
  92           smpi_mpi_recv(tmp_buf, count, dtype, src, tag, comm, &status);
  93           smpi_op_apply(op, tmp_buf, recv_buf, &count, &dtype);
  94         }
  95       } else {
  96         dst = (inter_rank & (~mask)) * num_core;
  97         smpi_mpi_send(recv_buf, count, dtype, dst, tag, comm);
  98         break;
  99       }
 100       mask <<= 1;
 101     }
 102   }
 103
 104   /* start binomial broadcast inter-communication between each SMP nodes:
 105      each node only have one process that can communicate to other nodes */
 106   if (intra_rank == 0) {
 107     mask = 1;
 108     while (mask < inter_comm_size) {
 109       if (inter_rank & mask) {
 110         src = (inter_rank - mask) * num_core;
 111         smpi_mpi_recv(recv_buf, count, dtype, src, tag, comm, &status);
 112         break;
 113       }
 114       mask <<= 1;
 115     }
 116     mask >>= 1;
 117
 118     while (mask > 0) {
 119       if (inter_rank < inter_comm_size) {
 120         dst = (inter_rank + mask) * num_core;
 121         if (dst < comm_size) {
 122           smpi_mpi_send(recv_buf, count, dtype, dst, tag, comm);
 123         }
 124       }
 125       mask >>= 1;
 126     }
 127   }
 128
 129   /* start binomial broadcast intra-communication inside each SMP nodes */
 130   int num_core_in_current_smp = num_core;
 131   if (inter_rank == (inter_comm_size - 1)) {
 132     num_core_in_current_smp = comm_size - (inter_rank * num_core);
 133   }
 134   mask = 1;
 135   while (mask < num_core_in_current_smp) {
 136     if (intra_rank & mask) {
 137       src = (inter_rank * num_core) + (intra_rank - mask);
 138       smpi_mpi_recv(recv_buf, count, dtype, src, tag, comm, &status);
 139       break;
 140     }
 141     mask <<= 1;
 142   }
 143   mask >>= 1;
 144
 145   while (mask > 0) {
 146     dst = (inter_rank * num_core) + (intra_rank + mask);
 147     if (dst < comm_size) {
 148       smpi_mpi_send(recv_buf, count, dtype, dst, tag, comm);
 149     }
 150     mask >>= 1;
 151   }
 152
 153   smpi_free_tmp_buffer(tmp_buf);
 154   return MPI_SUCCESS;
 155 }