1 /* Copyright (c) 2013-2019. The SimGrid Team.
2 * All rights reserved. */
4 /* This program is free software; you can redistribute it and/or modify it
5 * under the terms of the license (GNU LGPL) which comes with this package. */
7 #include "../colls_private.hpp"
8 /* IMPLEMENTED BY PITCH PATARASUK
9 Non-topology-specific (however, the number of cores per node needs to be changed)
10 all-reduce operation designed for smp clusters
11 It uses 2-layer communication: binomial for both intra-communication
16 Use -DMPICH2 if this code does not compile.
17 MPICH1 code also works on MPICH2 on our cluster and the performance is similar.
18 This code assumes a commutative and associative reduce operator (MPI_SUM, MPI_MAX, etc).
21 //#include <star-reduction.c>
24 This function performs the all-reduce operation as follows.
25 1) binomial_tree reduce inside each SMP node
26 2) binomial_tree reduce inter-communication between the roots of the SMP nodes
27 3) binomial_tree bcast inter-communication between the roots of the SMP nodes
28 4) binomial_tree bcast inside each SMP node
32 int Coll_allreduce_smp_binomial::allreduce(const void *send_buf, void *recv_buf,
33 int count, MPI_Datatype dtype,
34 MPI_Op op, MPI_Comm comm)
37 int tag = COLL_TAG_ALLREDUCE;
40 if(comm->get_leaders_comm()==MPI_COMM_NULL){
44 if (comm->is_uniform()){
45 num_core = comm->get_intra_comm()->size();
49 comm_size=comm->size();
52 dtype->extent(&lb, &extent);
53 unsigned char* tmp_buf = smpi_get_tmp_sendbuffer(count * extent);
55 /* compute intra and inter ranking */
56 int intra_rank, inter_rank;
57 intra_rank = rank % num_core;
58 inter_rank = rank / num_core;
60 /* number of processes participating in inter-node communication =>
61 should be equal to the number of machines */
62 int inter_comm_size = (comm_size + num_core - 1) / num_core;
64 /* copy input buffer to output buffer */
65 Request::sendrecv(send_buf, count, dtype, rank, tag,
66 recv_buf, count, dtype, rank, tag, comm, &status);
68 /* start binomial reduce intra communication inside each SMP node */
70 while (mask < num_core) {
71 if ((mask & intra_rank) == 0) {
72 src = (inter_rank * num_core) + (intra_rank | mask);
73 if (src < comm_size) {
74 Request::recv(tmp_buf, count, dtype, src, tag, comm, &status);
75 if(op!=MPI_OP_NULL) op->apply( tmp_buf, recv_buf, &count, dtype);
78 dst = (inter_rank * num_core) + (intra_rank & (~mask));
79 Request::send(recv_buf, count, dtype, dst, tag, comm);
85 /* start binomial reduce inter-communication between SMP nodes:
86 each node only has one process that can communicate with other nodes */
87 if (intra_rank == 0) {
89 while (mask < inter_comm_size) {
90 if ((mask & inter_rank) == 0) {
91 src = (inter_rank | mask) * num_core;
92 if (src < comm_size) {
93 Request::recv(tmp_buf, count, dtype, src, tag, comm, &status);
94 if(op!=MPI_OP_NULL) op->apply( tmp_buf, recv_buf, &count, dtype);
97 dst = (inter_rank & (~mask)) * num_core;
98 Request::send(recv_buf, count, dtype, dst, tag, comm);
105 /* start binomial broadcast inter-communication between SMP nodes:
106 each node only has one process that can communicate with other nodes */
107 if (intra_rank == 0) {
109 while (mask < inter_comm_size) {
110 if (inter_rank & mask) {
111 src = (inter_rank - mask) * num_core;
112 Request::recv(recv_buf, count, dtype, src, tag, comm, &status);
120 if (inter_rank < inter_comm_size) {
121 dst = (inter_rank + mask) * num_core;
122 if (dst < comm_size) {
123 Request::send(recv_buf, count, dtype, dst, tag, comm);
130 /* start binomial broadcast intra-communication inside each SMP node */
131 int num_core_in_current_smp = num_core;
132 if (inter_rank == (inter_comm_size - 1)) {
133 num_core_in_current_smp = comm_size - (inter_rank * num_core);
136 while (mask < num_core_in_current_smp) {
137 if (intra_rank & mask) {
138 src = (inter_rank * num_core) + (intra_rank - mask);
139 Request::recv(recv_buf, count, dtype, src, tag, comm, &status);
147 dst = (inter_rank * num_core) + (intra_rank + mask);
148 if (dst < comm_size) {
149 Request::send(recv_buf, count, dtype, dst, tag, comm);
154 smpi_free_tmp_buffer(tmp_buf);