1 /* Copyright (c) 2013-2022. The SimGrid Team.
2 * All rights reserved. */
4 /* This program is free software; you can redistribute it and/or modify it
5 * under the terms of the license (GNU LGPL) which comes with this package. */
7 #include "../colls_private.hpp"
8 /* IMPLEMENTED BY PITCH PATARASUK
9 Non-topology-specific (however, number of cores/node need to be changed)
10 all-reduce operation designed for smp clusters
11 It uses 2-layer communication: binomial for intra-communication
12 and rdb for inter-communication*/
15 Use -DMPICH2 if this code does not compile.
16 MPICH1 code also work on MPICH2 on our cluster and the performance are similar.
17 This code assume commutative and associative reduce operator (MPI_SUM, MPI_MAX, etc).
20 //#include <star-reduction.c>
23 This function performs all-reduce operation as follow.
24 1) binomial_tree reduce inside each SMP node
25 2) Recursive doubling intra-communication between root of each SMP node
26 3) binomial_tree bcast inside each SMP node
28 namespace simgrid::smpi {
29 int allreduce__smp_rdb(const void *send_buf, void *recv_buf, int count,
30 MPI_Datatype dtype, MPI_Op op,
34 int tag = COLL_TAG_ALLREDUCE;
37 if(comm->get_leaders_comm()==MPI_COMM_NULL){
41 if (comm->is_uniform()){
42 num_core = comm->get_intra_comm()->size();
45 #ifdef MPICH2_REDUCTION
46 MPI_User_function * uop = MPIR_Op_table[op % 16 - 1];
48 MPI_User_function *uop;
50 op_ptr = MPIR_ToPointer(op);
54 comm_size = comm->size();
57 extent = dtype->get_extent();
58 unsigned char* tmp_buf = smpi_get_tmp_sendbuffer(count * extent);
60 /* compute intra and inter ranking */
61 int intra_rank, inter_rank;
62 intra_rank = rank % num_core;
63 inter_rank = rank / num_core;
65 /* size of processes participate in intra communications =>
66 should be equal to number of machines */
67 int inter_comm_size = (comm_size + num_core - 1) / num_core;
69 /* copy input buffer to output buffer */
70 Request::sendrecv(send_buf, count, dtype, rank, tag,
71 recv_buf, count, dtype, rank, tag, comm, &status);
73 /* start binomial reduce intra communication inside each SMP node */
75 while (mask < num_core) {
76 if ((mask & intra_rank) == 0) {
77 src = (inter_rank * num_core) + (intra_rank | mask);
78 if (src < comm_size) {
79 Request::recv(tmp_buf, count, dtype, src, tag, comm, &status);
80 if(op!=MPI_OP_NULL) op->apply( tmp_buf, recv_buf, &count, dtype);
83 dst = (inter_rank * num_core) + (intra_rank & (~mask));
84 Request::send(recv_buf, count, dtype, dst, tag, comm);
88 } /* end binomial reduce intra-communication */
91 /* start rdb (recursive doubling) all-reduce inter-communication
92 between each SMP nodes : each node only have one process that can communicate
94 if (intra_rank == 0) {
96 /* find nearest power-of-two less than or equal to inter_comm_size */
97 int pof2, rem, newrank, newdst;
99 while (pof2 <= inter_comm_size)
102 rem = inter_comm_size - pof2;
104 /* In the non-power-of-two case, all even-numbered
105 processes of rank < 2*rem send their data to
106 (rank+1). These even-numbered processes no longer
107 participate in the algorithm until the very end.
109 if (inter_rank < 2 * rem) {
110 if (inter_rank % 2 == 0) {
111 dst = rank + num_core;
112 Request::send(recv_buf, count, dtype, dst, tag, comm);
115 src = rank - num_core;
116 Request::recv(tmp_buf, count, dtype, src, tag, comm, &status);
117 if(op!=MPI_OP_NULL) op->apply( tmp_buf, recv_buf, &count, dtype);
118 newrank = inter_rank / 2;
121 newrank = inter_rank - rem;
124 /* example inter-communication RDB rank change algorithm
125 0,4,8,12..36 <= true rank (assume 4 core per SMP)
126 0123 4567 89 <= inter_rank
127 1 3 4567 89 (1,3 got data from 0,2 : 0,2 will be idle until the end)
129 0 1 2345 67 => newrank
134 while (mask < pof2) {
135 newdst = newrank ^ mask;
136 /* find real rank of dest */
137 dst = (newdst < rem) ? newdst * 2 + 1 : newdst + rem;
140 /* exchange data in rdb manner */
141 Request::sendrecv(recv_buf, count, dtype, dst, tag, tmp_buf, count, dtype,
142 dst, tag, comm, &status);
143 if(op!=MPI_OP_NULL) op->apply( tmp_buf, recv_buf, &count, dtype);
149 left-over processes (all even ranks: < 2 * rem) get the result
151 if (inter_rank < 2 * rem) {
152 if (inter_rank % 2) {
153 Request::send(recv_buf, count, dtype, rank - num_core, tag, comm);
155 Request::recv(recv_buf, count, dtype, rank + num_core, tag, comm, &status);
160 /* start binomial broadcast intra-communication inside each SMP nodes */
161 int num_core_in_current_smp = num_core;
162 if (inter_rank == (inter_comm_size - 1)) {
163 num_core_in_current_smp = comm_size - (inter_rank * num_core);
166 while (mask < num_core_in_current_smp) {
167 if (intra_rank & mask) {
168 src = (inter_rank * num_core) + (intra_rank - mask);
169 Request::recv(recv_buf, count, dtype, src, tag, comm, &status);
177 dst = (inter_rank * num_core) + (intra_rank + mask);
178 if (dst < comm_size) {
179 Request::send(recv_buf, count, dtype, dst, tag, comm);
184 smpi_free_tmp_buffer(tmp_buf);
187 } // namespace simgrid::smpi