1 /* Copyright (c) 2013-2014. The SimGrid Team.
2 * All rights reserved. */
4 /* This program is free software; you can redistribute it and/or modify it
5 * under the terms of the license (GNU LGPL) which comes with this package. */
7 #include "colls_private.h"
8 /* IMPLEMENTED BY PITCH PATARASUK
9 Non-topology-specific (however, the number of cores per node needs to be changed)
10 all-reduce operation designed for smp clusters
11 It uses 2-layer communication: binomial for both intra-communication
14 /* change number of core per smp-node
15 we assume that the number of cores per process will be the same for all implementations */
21 Use -DMPICH2 if this code does not compile.
22 MPICH1 code also works on MPICH2 on our cluster and the performance is similar.
23 This code assumes a commutative and associative reduce operator (MPI_SUM, MPI_MAX, etc).
26 //#include <star-reduction.c>
29 This function performs the all-reduce operation as follows.
30 1) binomial_tree reduce inside each SMP node
31 2) binomial_tree reduce inter-communication between the roots of the SMP nodes
32 3) binomial_tree bcast inter-communication between the roots of the SMP nodes
33 4) binomial_tree bcast inside each SMP node
35 int smpi_coll_tuned_allreduce_smp_binomial(void *send_buf, void *recv_buf,
36 int count, MPI_Datatype dtype,
37 MPI_Op op, MPI_Comm comm)
41 int tag = COLL_TAG_ALLREDUCE;
45 int num_core = simcall_host_get_core(SIMIX_host_self());
46 // do we use the default one or the number of cores in the platform ?
47 // if the number of cores is one, the platform may be simulated with 1 node = 1 core
48 if (num_core == 1) num_core = NUM_CORE;
51 comm_size=smpi_comm_size(comm);
52 rank=smpi_comm_rank(comm);
54 smpi_datatype_extent(dtype, &lb, &extent);
55 tmp_buf = (void *) xbt_malloc(count * extent);
57 /* compute intra and inter ranking */
58 int intra_rank, inter_rank;
59 intra_rank = rank % num_core;
60 inter_rank = rank / num_core;
62 /* size of processes participate in intra communications =>
63 should be equal to number of machines */
64 int inter_comm_size = (comm_size + num_core - 1) / num_core;
66 /* copy input buffer to output buffer */
67 smpi_mpi_sendrecv(send_buf, count, dtype, rank, tag,
68 recv_buf, count, dtype, rank, tag, comm, &status);
70 /* start binomial reduce intra communication inside each SMP node */
72 while (mask < num_core) {
73 if ((mask & intra_rank) == 0) {
74 src = (inter_rank * num_core) + (intra_rank | mask);
75 if (src < comm_size) {
76 smpi_mpi_recv(tmp_buf, count, dtype, src, tag, comm, &status);
77 smpi_op_apply(op, tmp_buf, recv_buf, &count, &dtype);
80 dst = (inter_rank * num_core) + (intra_rank & (~mask));
81 smpi_mpi_send(recv_buf, count, dtype, dst, tag, comm);
87 /* start binomial reduce inter-communication between each SMP nodes:
88 each node only have one process that can communicate to other nodes */
89 if (intra_rank == 0) {
91 while (mask < inter_comm_size) {
92 if ((mask & inter_rank) == 0) {
93 src = (inter_rank | mask) * num_core;
94 if (src < comm_size) {
95 smpi_mpi_recv(tmp_buf, count, dtype, src, tag, comm, &status);
96 smpi_op_apply(op, tmp_buf, recv_buf, &count, &dtype);
99 dst = (inter_rank & (~mask)) * num_core;
100 smpi_mpi_send(recv_buf, count, dtype, dst, tag, comm);
107 /* start binomial broadcast inter-communication between each SMP nodes:
108 each node only have one process that can communicate to other nodes */
109 if (intra_rank == 0) {
111 while (mask < inter_comm_size) {
112 if (inter_rank & mask) {
113 src = (inter_rank - mask) * num_core;
114 smpi_mpi_recv(recv_buf, count, dtype, src, tag, comm, &status);
122 if (inter_rank < inter_comm_size) {
123 dst = (inter_rank + mask) * num_core;
124 if (dst < comm_size) {
125 smpi_mpi_send(recv_buf, count, dtype, dst, tag, comm);
132 /* start binomial broadcast intra-communication inside each SMP nodes */
133 int num_core_in_current_smp = num_core;
134 if (inter_rank == (inter_comm_size - 1)) {
135 num_core_in_current_smp = comm_size - (inter_rank * num_core);
138 while (mask < num_core_in_current_smp) {
139 if (intra_rank & mask) {
140 src = (inter_rank * num_core) + (intra_rank - mask);
141 smpi_mpi_recv(recv_buf, count, dtype, src, tag, comm, &status);
149 dst = (inter_rank * num_core) + (intra_rank + mask);
150 if (dst < comm_size) {
151 smpi_mpi_send(recv_buf, count, dtype, dst, tag, comm);