/* Copyright (c) 2013-2014. The SimGrid Team.
 * All rights reserved. */

/* This program is free software; you can redistribute it and/or modify it
 * under the terms of the license (GNU LGPL) which comes with this package. */

/*
 * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation. All rights reserved.
 * Copyright (c) 2004-2009 The University of Tennessee and The University
 *                         of Tennessee Research Foundation. All rights
 *                         reserved.
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
 *                         University of Stuttgart. All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 *
 * Additional copyrights may follow.
 */

/* -*- Mode: C; c-basic-offset:4 ; -*- */
/* Copyright (c) 2001-2014, The Ohio State University. All rights
 * reserved.
 *
 * This file is part of the MVAPICH2 software package developed by the
 * team members of The Ohio State University's Network-Based Computing
 * Laboratory (NBCL), headed by Professor Dhabaleswar K. (DK) Panda.
 *
 * For detailed copyright and licensing information, please refer to the
 * copyright file COPYRIGHT in the top level MVAPICH2 directory.
 */

/* (C) 2001 by Argonne National Laboratory.
 * See COPYRIGHT in top-level directory. */
#include "colls_private.h"

#define MV2_INTRA_SHMEM_REDUCE_MSG 2048

#define mv2_g_shmem_coll_max_msg_size (1 << 17)
#define SHMEM_COLL_BLOCK_SIZE (local_size * mv2_g_shmem_coll_max_msg_size)
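/* Note: SHMEM_COLL_BLOCK_SIZE expands to an expression over local_size, a
 * local variable of the function below, so the macro is only meaningful
 * inside that scope. */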
#define mv2_use_knomial_reduce 1

#define MPIR_Reduce_inter_knomial_wrapper_MV2 smpi_coll_tuned_reduce_mvapich2_knomial
#define MPIR_Reduce_intra_knomial_wrapper_MV2 smpi_coll_tuned_reduce_mvapich2_knomial
#define MPIR_Reduce_binomial_MV2 smpi_coll_tuned_reduce_binomial
#define MPIR_Reduce_redscat_gather_MV2 smpi_coll_tuned_reduce_scatter_gather
#define MPIR_Reduce_shmem_MV2 smpi_coll_tuned_reduce_ompi_basic_linear
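
/* Two-level (hierarchical) reduce, ported from MVAPICH2 to SMPI. The MPIR_*
 * names above are mapped onto the SMPI implementations of the corresponding
 * MVAPICH2 algorithms. The algorithm runs in three phases: (1) an intra-node
 * reduce leaves each node's partial result on its node leader, (2) an
 * inter-leader reduce combines those partial results on the leader of the
 * root's node, and (3) a final point-to-point message delivers the result to
 * the root if the root is not that leader. */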
extern int (*MV2_Reduce_function)(void *sendbuf, void *recvbuf, int count,
                                  MPI_Datatype datatype, MPI_Op op,
                                  int root, MPI_Comm comm);

extern int (*MV2_Reduce_intra_function)(void *sendbuf, void *recvbuf, int count,
                                        MPI_Datatype datatype, MPI_Op op,
                                        int root, MPI_Comm comm);

/* Fn pointers for collectives */
static int (*reduce_fn)(void *sendbuf, void *recvbuf, int count,
                        MPI_Datatype datatype,
                        MPI_Op op, int root, MPI_Comm comm);
int smpi_coll_tuned_reduce_mvapich2_two_level(void *sendbuf,
                                              void *recvbuf,
                                              int count,
                                              MPI_Datatype datatype,
                                              MPI_Op op,
                                              int root,
                                              MPI_Comm comm)
{
    int mpi_errno = MPI_SUCCESS;
    int my_rank, total_size, local_rank, local_size;
    int leader_comm_rank = -1, leader_comm_size = 0;
    MPI_Comm shmem_comm, leader_comm;
    int leader_root, leader_of_root;
    void *in_buf = NULL, *out_buf = NULL, *tmp_buf = NULL;
    MPI_Aint true_lb, true_extent, extent;
    int is_commutative = 0, stride = 0;
    int intra_node_root = 0;
    // if not set (use of the algorithm directly, without the mvapich2 selector)
    if (MV2_Reduce_function == NULL)
        MV2_Reduce_function = smpi_coll_tuned_reduce_mpich;
    if (MV2_Reduce_intra_function == NULL)
        MV2_Reduce_intra_function = smpi_coll_tuned_reduce_mpich;
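    /* Build the SMP-aware communicators (intra-node communicator and
     * inter-node leader communicator) if the selector has not done so yet */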
    if (smpi_comm_get_leaders_comm(comm) == MPI_COMM_NULL) {
        smpi_comm_init_smp(comm);
    }
    my_rank = smpi_comm_rank(comm);
    total_size = smpi_comm_size(comm);
    shmem_comm = smpi_comm_get_intra_comm(comm);
    local_rank = smpi_comm_rank(shmem_comm);
    local_size = smpi_comm_size(shmem_comm);
    leader_comm = smpi_comm_get_leaders_comm(comm);
    int *leaders_map = smpi_comm_get_leaders_map(comm);
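    /* leader_of_root: rank, within comm, of the leader of the root's node;
     * leader_root: rank of that same leader within leader_comm */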
    leader_of_root = smpi_group_rank(smpi_comm_group(comm), leaders_map[root]);
    leader_root = smpi_group_rank(smpi_comm_group(leader_comm), leaders_map[root]);
    is_commutative = smpi_op_is_commute(op);

    smpi_datatype_extent(datatype, &true_lb, &true_extent);
    extent = smpi_datatype_get_extent(datatype);
    stride = count * MAX(extent, true_extent);
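    /* stride is the memory footprint of one message; small commutative
     * messages take the shared-memory path below */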
    if (local_size == total_size) {
        /* First handle the case where there is only one node */
        if (stride <= MV2_INTRA_SHMEM_REDUCE_MSG && is_commutative == 1) {
            if (local_rank == 0) {
                tmp_buf = (void *) smpi_get_tmp_sendbuffer(count *
                              (MAX(extent, true_extent)));
                tmp_buf = (void *) ((char *) tmp_buf - true_lb);
            }
            if (sendbuf != MPI_IN_PLACE) {
                in_buf = (void *) sendbuf;
            } else {
                in_buf = recvbuf;
            }
            if (local_rank == 0) {
                if (my_rank != root) {
                    out_buf = tmp_buf;
                } else {
                    out_buf = recvbuf;
                    if (in_buf == out_buf) {
                        in_buf = MPI_IN_PLACE;
                    }
                }
            } else {
                in_buf = (void *) sendbuf;
                out_buf = NULL;
            }

            if (count * (MAX(extent, true_extent)) < SHMEM_COLL_BLOCK_SIZE) {
                mpi_errno = MPIR_Reduce_shmem_MV2(in_buf, out_buf, count,
                                datatype, op, intra_node_root, shmem_comm);
            } else {
                mpi_errno = MPIR_Reduce_intra_knomial_wrapper_MV2(in_buf, out_buf, count,
                                datatype, op, intra_node_root, shmem_comm);
            }

            /* The result is on the node leader; forward it to the root if needed */
            if (local_rank == 0 && root != my_rank) {
                smpi_mpi_send(out_buf, count, datatype, root,
                              COLL_TAG_REDUCE + 1, comm);
            }
            if ((local_rank != 0) && (root == my_rank)) {
                smpi_mpi_recv(recvbuf, count, datatype,
                              leader_of_root, COLL_TAG_REDUCE + 1, comm,
                              MPI_STATUS_IGNORE);
            }
        } else {
            /* Large or non-commutative case: plain single-level reduce */
            if (mv2_use_knomial_reduce == 1) {
                reduce_fn = &MPIR_Reduce_intra_knomial_wrapper_MV2;
            } else {
                reduce_fn = &MPIR_Reduce_binomial_MV2;
            }
            mpi_errno = reduce_fn(sendbuf, recvbuf, count,
                                  datatype, op, root, comm);
        }
        return mpi_errno;
    }
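    /* Multi-node case: each node leader allocates a temporary buffer to
     * hold its node's partial result after the intra-node phase */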
    if (local_rank == 0) {
        leader_comm = smpi_comm_get_leaders_comm(comm);
        if (leader_comm == MPI_COMM_NULL) {
            leader_comm = MPI_COMM_WORLD;
        }
        leader_comm_size = smpi_comm_size(leader_comm);
        leader_comm_rank = smpi_comm_rank(leader_comm);
        tmp_buf = (void *) smpi_get_tmp_sendbuffer(count *
                      (MAX(extent, true_extent)));
        tmp_buf = (void *) ((char *) tmp_buf - true_lb);
    }

    if (sendbuf != MPI_IN_PLACE) {
        in_buf = (void *) sendbuf;
    } else {
        in_buf = recvbuf;
    }
    if (local_rank == 0) {
        out_buf = tmp_buf;
    } else {
        out_buf = NULL;
    }
    if (local_size > 1) {
        /* Let's do the intra-node reduce operations, if we have more than one
         * process in the node */

        /* Fix the input and output buffers for the intra-node reduce.
         * Node leaders will have the reduced data in tmp_buf after
         * this step */
        if (MV2_Reduce_intra_function == &MPIR_Reduce_shmem_MV2) {
            if (is_commutative == 1
                && (count * (MAX(extent, true_extent)) < SHMEM_COLL_BLOCK_SIZE)) {
                mpi_errno = MV2_Reduce_intra_function(in_buf, out_buf, count,
                                datatype, op, intra_node_root, shmem_comm);
            } else {
                mpi_errno = MPIR_Reduce_intra_knomial_wrapper_MV2(in_buf, out_buf, count,
                                datatype, op, intra_node_root, shmem_comm);
            }
        } else {
            mpi_errno = MV2_Reduce_intra_function(in_buf, out_buf, count,
                            datatype, op, intra_node_root, shmem_comm);
        }
    } else {
        /* Only one process on this node: its input buffer already holds the
         * node's contribution */
        tmp_buf = in_buf;
    }
    /* Now work on the inter-leader phase. Data is in tmp_buf */
    if (local_rank == 0 && leader_comm_size > 1) {
        /* The leader of root will have the global reduced data in tmp_buf
         * or recvbuf at the end of the reduce */
        if (leader_comm_rank == leader_root) {
            if (my_rank == root) {
                /* I am the root of the leader-comm, and the
                 * root of the reduce op. So, I will write the
                 * final result directly into my recvbuf */
                if (tmp_buf != recvbuf) {
                    in_buf = tmp_buf;
                    out_buf = recvbuf;
                } else {
                    in_buf = (char *) smpi_get_tmp_sendbuffer(count *
                                 smpi_datatype_get_extent(datatype));
                    smpi_datatype_copy(tmp_buf, count, datatype,
                                       in_buf, count, datatype);
                    //in_buf = MPI_IN_PLACE;
                    out_buf = recvbuf;
                }
            } else {
                in_buf = (char *) smpi_get_tmp_sendbuffer(count *
                             smpi_datatype_get_extent(datatype));
                smpi_datatype_copy(tmp_buf, count, datatype,
                                   in_buf, count, datatype);
                //in_buf = MPI_IN_PLACE;
                out_buf = tmp_buf;
            }
        } else {
            in_buf = tmp_buf;
            out_buf = NULL;
        }

        /* inter-leader communication */
        mpi_errno = MV2_Reduce_function(in_buf, out_buf, count,
                                        datatype, op,
                                        leader_root, leader_comm);
    }
    if (local_size > 1) {
        /* Send the message to the root if the leader is not the
         * root of the reduce operation. The reduced data is in tmp_buf */
        if ((local_rank == 0) && (root != my_rank)
            && (leader_root == leader_comm_rank)) {
            smpi_mpi_send(tmp_buf, count, datatype, root,
                          COLL_TAG_REDUCE + 1, comm);
        }
        if ((local_rank != 0) && (root == my_rank)) {
            smpi_mpi_recv(recvbuf, count, datatype,
                          leader_of_root,
                          COLL_TAG_REDUCE + 1, comm,
                          MPI_STATUS_IGNORE);
        }
    }

    return mpi_errno;
}