1 /* Copyright (c) 2013-2014. The SimGrid Team.
2 * All rights reserved. */
4 /* This program is free software; you can redistribute it and/or modify it
5 * under the terms of the license (GNU LGPL) which comes with this package. */
7 #include "colls_private.h"
8 /* IMPLEMENTED BY PITCH PATARASUK
9 Non-topology-specific (however, the number of cores per node needs to be changed)
10 all-reduce operation designed for smp clusters
11 It uses 2-layer communication: binomial for both intra-communication
14 /* change number of core per smp-node
15 we assume that the number of cores per process will be the same for all implementations */
21 Use -DMPICH2 if this code does not compile.
22 MPICH1 code also works on MPICH2 on our cluster and the performance is similar.
23 This code assumes a commutative and associative reduce operator (MPI_SUM, MPI_MAX, etc).
26 //#include <star-reduction.c>
29 This function performs the all-reduce operation as follows.
30 1) binomial_tree reduce inside each SMP node
31 2) binomial_tree reduce inter-communication between the roots of the SMP nodes
32 3) binomial_tree bcast inter-communication between the roots of the SMP nodes
33 4) binomial_tree bcast inside each SMP node
35 int smpi_coll_tuned_allreduce_smp_binomial(void *send_buf, void *recv_buf,
36 int count, MPI_Datatype dtype,
37 MPI_Op op, MPI_Comm comm)
41 int tag = COLL_TAG_ALLREDUCE;
45 int num_core = simcall_host_get_core(SIMIX_host_self());
46 // do we use the default one or the number of cores in the platform ?
47 // if the number of cores is one, the platform may be simulated with 1 node = 1 core
48 if (num_core == 1) num_core = NUM_CORE;
51 comm_size=smpi_comm_size(comm);
52 rank=smpi_comm_rank(comm);
54 smpi_datatype_extent(dtype, &lb, &extent);
55 tmp_buf = (void *) xbt_malloc(count * extent);
57 /* compute intra and inter ranking */
58 int intra_rank, inter_rank;
59 intra_rank = rank % num_core;
60 inter_rank = rank / num_core;
62 /* size of processes participate in intra communications =>
63 should be equal to number of machines */
64 int inter_comm_size = (comm_size + num_core - 1) / num_core;
66 /* copy input buffer to output buffer */
67 smpi_mpi_sendrecv(send_buf, count, dtype, rank, tag,
68 recv_buf, count, dtype, rank, tag, comm, &status);
70 /* start binomial reduce intra communication inside each SMP node */
72 while (mask < num_core) {
73 if ((mask & intra_rank) == 0) {
74 src = (inter_rank * num_core) + (intra_rank | mask);
75 if (src < comm_size) {
76 smpi_mpi_recv(tmp_buf, count, dtype, src, tag, comm, &status);
77 smpi_op_apply(op, tmp_buf, recv_buf, &count, &dtype);
80 dst = (inter_rank * num_core) + (intra_rank & (~mask));
81 smpi_mpi_send(recv_buf, count, dtype, dst, tag, comm);
87 /* start binomial reduce inter-communication between each SMP nodes:
88 each node only have one process that can communicate to other nodes */
89 if (intra_rank == 0) {
91 while (mask < inter_comm_size) {
92 if ((mask & inter_rank) == 0) {
93 src = (inter_rank | mask) * num_core;
94 if (src < comm_size) {
95 smpi_mpi_recv(tmp_buf, count, dtype, src, tag, comm, &status);
96 smpi_op_apply(op, tmp_buf, recv_buf, &count, &dtype);
99 dst = (inter_rank & (~mask)) * num_core;
100 smpi_mpi_send(recv_buf, count, dtype, dst, tag, comm);
107 /* start binomial broadcast inter-communication between each SMP nodes:
108 each node only have one process that can communicate to other nodes */
109 if (intra_rank == 0) {
111 while (mask < inter_comm_size) {
112 if (inter_rank & mask) {
113 src = (inter_rank - mask) * num_core;
114 smpi_mpi_recv(recv_buf, count, dtype, src, tag, comm, &status);
122 if (inter_rank < inter_comm_size) {
123 dst = (inter_rank + mask) * num_core;
124 if (dst < comm_size) {
125 smpi_mpi_send(recv_buf, count, dtype, dst, tag, comm);
132 /* start binomial broadcast intra-communication inside each SMP nodes */
133 int num_core_in_current_smp = num_core;
134 if (inter_rank == (inter_comm_size - 1)) {
135 num_core_in_current_smp = comm_size - (inter_rank * num_core);
138 while (mask < num_core_in_current_smp) {
139 if (intra_rank & mask) {
140 src = (inter_rank * num_core) + (intra_rank - mask);
141 smpi_mpi_recv(recv_buf, count, dtype, src, tag, comm, &status);
149 dst = (inter_rank * num_core) + (intra_rank + mask);
150 if (dst < comm_size) {
151 smpi_mpi_send(recv_buf, count, dtype, dst, tag, comm);