[simgrid.git] / src/smpi/colls/reduce-mvapich-two-level.c
/* Copyright (c) 2013-2014. The SimGrid Team.
 * All rights reserved.                                                     */

/* This program is free software; you can redistribute it and/or modify it
 * under the terms of the license (GNU LGPL) which comes with this package. */

/*
 * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation.  All rights reserved.
 * Copyright (c) 2004-2009 The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 *
 * Additional copyrights may follow
 */
/* -*- Mode: C; c-basic-offset:4 ; -*- */
/* Copyright (c) 2001-2014, The Ohio State University. All rights
 * reserved.
 *
 * This file is part of the MVAPICH2 software package developed by the
 * team members of The Ohio State University's Network-Based Computing
 * Laboratory (NBCL), headed by Professor Dhabaleswar K. (DK) Panda.
 *
 * For detailed copyright and licensing information, please refer to the
 * copyright file COPYRIGHT in the top level MVAPICH2 directory.
 */
/*
 *
 *  (C) 2001 by Argonne National Laboratory.
 *      See COPYRIGHT in top-level directory.
 */

#include "colls_private.h"
#define MV2_INTRA_SHMEM_REDUCE_MSG 2048

#define mv2_g_shmem_coll_max_msg_size (1 << 17)
/* Note: SHMEM_COLL_BLOCK_SIZE expands at its use sites inside the function
 * below, where the local variable local_size is in scope. */
#define SHMEM_COLL_BLOCK_SIZE (local_size * mv2_g_shmem_coll_max_msg_size)
#define mv2_use_knomial_reduce 1

#define MPIR_Reduce_inter_knomial_wrapper_MV2 smpi_coll_tuned_reduce_mvapich2_knomial
#define MPIR_Reduce_intra_knomial_wrapper_MV2 smpi_coll_tuned_reduce_mvapich2_knomial
#define MPIR_Reduce_binomial_MV2 smpi_coll_tuned_reduce_binomial
#define MPIR_Reduce_redscat_gather_MV2 smpi_coll_tuned_reduce_scatter_gather
#define MPIR_Reduce_shmem_MV2 smpi_coll_tuned_reduce_ompi_basic_linear
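
/* The MPIR_* macros above alias the reduce kernels expected by the original
 * MVAPICH2 code to their SMPI counterparts, so the two-level logic below can
 * stay close to the upstream MVAPICH2 source. */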

extern int (*MV2_Reduce_function)(void *sendbuf,
    void *recvbuf,
    int count,
    MPI_Datatype datatype,
    MPI_Op op,
    int root,
    MPI_Comm comm_ptr);

extern int (*MV2_Reduce_intra_function)(void *sendbuf,
    void *recvbuf,
    int count,
    MPI_Datatype datatype,
    MPI_Op op,
    int root,
    MPI_Comm comm_ptr);
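
/* These pointers are normally set by the MVAPICH2-style selector; when the
 * algorithm is invoked directly they remain NULL, and the function below
 * falls back to the MPICH implementations. */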

/* Function pointer used to pick the reduce kernel in the single-node case */
static int (*reduce_fn)(void *sendbuf,
                        void *recvbuf,
                        int count,
                        MPI_Datatype datatype,
                        MPI_Op op, int root, MPI_Comm comm);

int smpi_coll_tuned_reduce_mvapich2_two_level(void *sendbuf,
                                     void *recvbuf,
                                     int count,
                                     MPI_Datatype datatype,
                                     MPI_Op op,
                                     int root,
                                     MPI_Comm comm)
{
    int mpi_errno = MPI_SUCCESS;
    int my_rank, total_size, local_rank, local_size;
    int leader_comm_rank = -1, leader_comm_size = 0;
    MPI_Comm shmem_comm, leader_comm;
    int leader_root, leader_of_root;
    void *in_buf = NULL, *out_buf = NULL, *tmp_buf = NULL;
    MPI_Aint true_lb, true_extent, extent;
    int is_commutative = 0, stride = 0;
    int intra_node_root = 0;

    /* If the function pointers are not set (i.e., the algorithm is used
     * directly, without the MVAPICH2 selector), fall back to the MPICH
     * implementation. */
    if (MV2_Reduce_function == NULL)
        MV2_Reduce_function = smpi_coll_tuned_reduce_mpich;
    if (MV2_Reduce_intra_function == NULL)
        MV2_Reduce_intra_function = smpi_coll_tuned_reduce_mpich;

    if (smpi_comm_get_leaders_comm(comm) == MPI_COMM_NULL) {
        smpi_comm_init_smp(comm);
    }

    my_rank = smpi_comm_rank(comm);
    total_size = smpi_comm_size(comm);
    shmem_comm = smpi_comm_get_intra_comm(comm);
    local_rank = smpi_comm_rank(shmem_comm);
    local_size = smpi_comm_size(shmem_comm);

    leader_comm = smpi_comm_get_leaders_comm(comm);
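    /* leaders_map[r] identifies the leader of rank r's node; leader_of_root
     * is root's node leader expressed as a rank of comm, and leader_root is
     * that same process expressed as a rank of leader_comm. */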
    int *leaders_map = smpi_comm_get_leaders_map(comm);
    leader_of_root = smpi_group_rank(smpi_comm_group(comm), leaders_map[root]);
    leader_root = smpi_group_rank(smpi_comm_group(leader_comm), leaders_map[root]);

    is_commutative = smpi_op_is_commute(op);

    smpi_datatype_extent(datatype, &true_lb, &true_extent);
    extent = smpi_datatype_get_extent(datatype);
    stride = count * MAX(extent, true_extent);
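    /* stride is the message footprint in bytes; together with the
     * commutativity of op, it selects between the shared-memory and the
     * knomial/binomial code paths below. */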

    if (local_size == total_size) {
        /* First handle the case where there is only one node */
        if (stride <= MV2_INTRA_SHMEM_REDUCE_MSG && is_commutative == 1) {
            if (local_rank == 0) {
                tmp_buf = (void *) smpi_get_tmp_sendbuffer(count *
                                       (MAX(extent, true_extent)));
                tmp_buf = (void *) ((char *) tmp_buf - true_lb);
            }

            if (sendbuf != MPI_IN_PLACE) {
                in_buf = (void *) sendbuf;
            } else {
                in_buf = recvbuf;
            }

            if (local_rank == 0) {
                if (my_rank != root) {
                    out_buf = tmp_buf;
                } else {
                    out_buf = recvbuf;
                    if (in_buf == out_buf) {
                        in_buf = MPI_IN_PLACE;
                        out_buf = recvbuf;
                    }
                }
            } else {
                in_buf  = (void *) sendbuf;
                out_buf = NULL;
            }
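
            /* Every rank now reads its contribution from in_buf; only the
             * intra-node root (local rank 0) has a non-NULL out_buf:
             * recvbuf when it is also the global root, tmp_buf otherwise. */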

            if (count * (MAX(extent, true_extent)) < SHMEM_COLL_BLOCK_SIZE) {
                mpi_errno = MPIR_Reduce_shmem_MV2(in_buf, out_buf, count,
                                                  datatype, op,
                                                  0, shmem_comm);
            } else {
                mpi_errno = MPIR_Reduce_intra_knomial_wrapper_MV2(in_buf, out_buf, count,
                                                                  datatype, op,
                                                                  0, shmem_comm);
            }

            if (local_rank == 0 && root != my_rank) {
                smpi_mpi_send(out_buf, count, datatype, root,
                              COLL_TAG_REDUCE + 1, comm);
            }
            if ((local_rank != 0) && (root == my_rank)) {
                smpi_mpi_recv(recvbuf, count, datatype,
                              leader_of_root, COLL_TAG_REDUCE + 1, comm,
                              MPI_STATUS_IGNORE);
            }
        } else {
            if (mv2_use_knomial_reduce == 1) {
                reduce_fn = &MPIR_Reduce_intra_knomial_wrapper_MV2;
            } else {
                reduce_fn = &MPIR_Reduce_binomial_MV2;
            }
            mpi_errno = reduce_fn(sendbuf, recvbuf, count,
                                  datatype, op,
                                  root, comm);
        }
        /* We are done */
        if (tmp_buf != NULL)
            smpi_free_tmp_buffer((void *) ((char *) tmp_buf + true_lb));
        goto fn_exit;
    }
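
    /* Multi-node case: reduce within each node onto the node leader
     * (intra-node phase), then reduce across the leaders onto the leader of
     * root's node (inter-leader phase); finally, if that leader is not root
     * itself, ship the result to root with a point-to-point message. */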
    if (local_rank == 0) {
        leader_comm = smpi_comm_get_leaders_comm(comm);
        if (leader_comm == MPI_COMM_NULL) {
            leader_comm = MPI_COMM_WORLD;
        }
        leader_comm_size = smpi_comm_size(leader_comm);
        leader_comm_rank = smpi_comm_rank(leader_comm);
        tmp_buf = (void *) smpi_get_tmp_sendbuffer(count *
                               (MAX(extent, true_extent)));
        tmp_buf = (void *) ((char *) tmp_buf - true_lb);
    }
    if (sendbuf != MPI_IN_PLACE) {
        in_buf = (void *) sendbuf;
    } else {
        in_buf = recvbuf;
    }
    if (local_rank == 0) {
        out_buf = tmp_buf;
    } else {
        out_buf = NULL;
    }

    if (local_size > 1) {
        /* Do the intra-node reduce operations if there is more than one
         * process on this node. The input and output buffers were fixed up
         * above so that node leaders end up with the reduced data in
         * tmp_buf after this step. */
        if (MV2_Reduce_intra_function == &MPIR_Reduce_shmem_MV2) {
            if (is_commutative == 1
                && (count * (MAX(extent, true_extent)) < SHMEM_COLL_BLOCK_SIZE)) {
                mpi_errno = MV2_Reduce_intra_function(in_buf, out_buf, count,
                                                      datatype, op,
                                                      intra_node_root, shmem_comm);
            } else {
                /* The shared-memory kernel needs a commutative op and a
                 * message that fits in the shmem block; otherwise use the
                 * knomial reduce. */
                mpi_errno = MPIR_Reduce_intra_knomial_wrapper_MV2(in_buf, out_buf, count,
                                                                  datatype, op,
                                                                  intra_node_root, shmem_comm);
            }
        } else {
            mpi_errno = MV2_Reduce_intra_function(in_buf, out_buf, count,
                                                  datatype, op,
                                                  intra_node_root, shmem_comm);
        }
    } else {
        /* Only one process on this node: no intra-node reduce is needed, so
         * release the temporary buffer and let tmp_buf alias the input. */
        smpi_free_tmp_buffer((void *) ((char *) tmp_buf + true_lb));
        tmp_buf = in_buf;
    }

    /* Now work on the inter-leader phase. The data is in tmp_buf. */
    if (local_rank == 0 && leader_comm_size > 1) {
        /* The leader of root's node will have the globally reduced data in
         * tmp_buf or recvbuf at the end of the reduce. */
        if (leader_comm_rank == leader_root) {
            if (my_rank == root) {
                /* I am the root of the leader-comm and the root of the
                 * reduce op, so I write the final result directly into my
                 * recvbuf. */
                if (tmp_buf != recvbuf) {
                    in_buf = tmp_buf;
                    out_buf = recvbuf;
                } else {
                    in_buf = (char *) smpi_get_tmp_sendbuffer(count *
                                          smpi_datatype_get_extent(datatype));
                    smpi_datatype_copy(tmp_buf, count, datatype,
                                       in_buf, count, datatype);
                    //in_buf = MPI_IN_PLACE;
                    out_buf = recvbuf;
                }
            } else {
                in_buf = (char *) smpi_get_tmp_sendbuffer(count *
                                      smpi_datatype_get_extent(datatype));
                smpi_datatype_copy(tmp_buf, count, datatype,
                                   in_buf, count, datatype);
                //in_buf = MPI_IN_PLACE;
                out_buf = tmp_buf;
            }
        } else {
            in_buf = tmp_buf;
            out_buf = NULL;
        }

        /* inter-leader communication */
        mpi_errno = MV2_Reduce_function(in_buf, out_buf, count,
                                        datatype, op,
                                        leader_root, leader_comm);
    }

    if (local_size > 1) {
        /* Send the message to root if the leader is not the root of the
         * reduce operation. The reduced data is in tmp_buf. */
        if ((local_rank == 0) && (root != my_rank)
            && (leader_root == leader_comm_rank)) {
            smpi_mpi_send(tmp_buf, count, datatype, root,
                          COLL_TAG_REDUCE + 1, comm);
        }
        if ((local_rank != 0) && (root == my_rank)) {
            smpi_mpi_recv(recvbuf, count, datatype,
                          leader_of_root,
                          COLL_TAG_REDUCE + 1, comm,
                          MPI_STATUS_IGNORE);
        }
        /* Only node leaders allocated tmp_buf above; do not free the NULL,
         * true_lb-shifted pointer on the other ranks. */
        if (local_rank == 0)
            smpi_free_tmp_buffer((void *) ((char *) tmp_buf + true_lb));

        if (leader_comm_rank == leader_root) {
            /* in_buf was a temporary copy in these cases; when my_rank ==
             * root and tmp_buf != recvbuf, it aliases tmp_buf, which was
             * already freed above. */
            if (my_rank != root || (my_rank == root && tmp_buf == recvbuf)) {
                smpi_free_tmp_buffer(in_buf);
            }
        }
    }

  fn_exit:
    return mpi_errno;
}
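
A minimal usage sketch, assuming an SMPI build where "colls_private.h" is on the include path. SimGrid normally reaches this function through its run-time collective selector rather than a direct call, so the direct invocation below is purely illustrative, and example_reduce is a hypothetical helper.

/* Hypothetical example: each rank contributes its rank id; rank 0 receives
 * the sum. Calling the collective directly bypasses the usual selector. */
#include "colls_private.h"

int example_reduce(MPI_Comm comm)
{
    int value = smpi_comm_rank(comm);  /* this rank's contribution */
    int sum = 0;                       /* significant at root (rank 0) only */
    return smpi_coll_tuned_reduce_mvapich2_two_level(&value, &sum, 1,
                                                     MPI_INT, MPI_SUM,
                                                     0, comm);
}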