[simgrid.git] / src/smpi/colls/reduce-mvapich-two-level.cpp
/* Copyright (c) 2013-2014. The SimGrid Team.
 * All rights reserved.                                                     */

/* This program is free software; you can redistribute it and/or modify it
 * under the terms of the license (GNU LGPL) which comes with this package. */

/*
 * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation.  All rights reserved.
 * Copyright (c) 2004-2009 The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 *
 * Additional copyrights may follow
 */
/* -*- Mode: C; c-basic-offset:4 ; -*- */
/* Copyright (c) 2001-2014, The Ohio State University. All rights
 * reserved.
 *
 * This file is part of the MVAPICH2 software package developed by the
 * team members of The Ohio State University's Network-Based Computing
 * Laboratory (NBCL), headed by Professor Dhabaleswar K. (DK) Panda.
 *
 * For detailed copyright and licensing information, please refer to the
 * copyright file COPYRIGHT in the top level MVAPICH2 directory.
 */
/*
 *
 *  (C) 2001 by Argonne National Laboratory.
 *      See COPYRIGHT in top-level directory.
 */

#include "colls_private.h"
#include "src/smpi/smpi_group.hpp"
#define MV2_INTRA_SHMEM_REDUCE_MSG 2048

#define mv2_g_shmem_coll_max_msg_size (1 << 17)
#define SHMEM_COLL_BLOCK_SIZE (local_size * mv2_g_shmem_coll_max_msg_size)
#define mv2_use_knomial_reduce 1

#define MPIR_Reduce_inter_knomial_wrapper_MV2 smpi_coll_tuned_reduce_mvapich2_knomial
#define MPIR_Reduce_intra_knomial_wrapper_MV2 smpi_coll_tuned_reduce_mvapich2_knomial
#define MPIR_Reduce_binomial_MV2 smpi_coll_tuned_reduce_binomial
#define MPIR_Reduce_redscat_gather_MV2 smpi_coll_tuned_reduce_scatter_gather
#define MPIR_Reduce_shmem_MV2 smpi_coll_tuned_reduce_ompi_basic_linear

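/* The MPIR_* names above come from the original MVAPICH2 code; the #defines
 * map them onto the corresponding SMPI tuned reduce implementations. The
 * function pointers below select among them at run time. */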
extern int (*MV2_Reduce_function)( void *sendbuf,
    void *recvbuf,
    int count,
    MPI_Datatype datatype,
    MPI_Op op,
    int root,
    MPI_Comm  comm_ptr);

extern int (*MV2_Reduce_intra_function)( void *sendbuf,
    void *recvbuf,
    int count,
    MPI_Datatype datatype,
    MPI_Op op,
    int root,
    MPI_Comm  comm_ptr);


/* Fn pointers for collectives */
static int (*reduce_fn)(void *sendbuf,
                             void *recvbuf,
                             int count,
                             MPI_Datatype datatype,
                             MPI_Op op, int root, MPI_Comm  comm);

int smpi_coll_tuned_reduce_mvapich2_two_level( void *sendbuf,
                                     void *recvbuf,
                                     int count,
                                     MPI_Datatype datatype,
                                     MPI_Op op,
                                     int root,
                                     MPI_Comm comm)
{
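    /* Two-level reduce:
     *  - if all processes live on a single node, reduce directly inside the
     *    node (shared-memory or knomial reduce) and ship the result to the
     *    root if the node leader is not the root;
     *  - otherwise each node first reduces onto its leader (shmem_comm),
     *    the leaders then reduce among themselves (leader_comm) onto
     *    leader_root, and the result is finally forwarded to the root if
     *    the root is not that leader. */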
    int mpi_errno = MPI_SUCCESS;
    int my_rank, total_size, local_rank, local_size;
    int leader_comm_rank = -1, leader_comm_size = 0;
    MPI_Comm shmem_comm, leader_comm;
    int leader_root, leader_of_root;
    void *in_buf = NULL, *out_buf = NULL, *tmp_buf = NULL;
    MPI_Aint true_lb, true_extent, extent;
    int is_commutative = 0, stride = 0;
    int intra_node_root = 0;

    /* If not set (the algorithm is called directly, without the mvapich2
     * selector), fall back to the MPICH reduce implementations. */
    if (MV2_Reduce_function == NULL)
      MV2_Reduce_function = smpi_coll_tuned_reduce_mpich;
    if (MV2_Reduce_intra_function == NULL)
      MV2_Reduce_intra_function = smpi_coll_tuned_reduce_mpich;

    if (comm->get_leaders_comm() == MPI_COMM_NULL) {
      comm->init_smp();
    }

    my_rank = comm->rank();
    total_size = comm->size();
    shmem_comm = comm->get_intra_comm();
    local_rank = shmem_comm->rank();
    local_size = shmem_comm->size();

    leader_comm = comm->get_leaders_comm();
    int* leaders_map = comm->get_leaders_map();
    leader_of_root = comm->group()->rank(leaders_map[root]);
    leader_root = leader_comm->group()->rank(leaders_map[root]);

    is_commutative = smpi_op_is_commute(op);

    smpi_datatype_extent(datatype, &true_lb, &true_extent);
    extent = smpi_datatype_get_extent(datatype);
    stride = count * MAX(extent, true_extent);

    if (local_size == total_size) {
        /* First handle the case where there is only one node */
        if (stride <= MV2_INTRA_SHMEM_REDUCE_MSG &&
            is_commutative == 1) {
            if (local_rank == 0) {
                tmp_buf = (void *) smpi_get_tmp_sendbuffer(count *
                                    (MAX(extent, true_extent)));
                tmp_buf = (void *) ((char *) tmp_buf - true_lb);
            }

            if (sendbuf != MPI_IN_PLACE) {
                in_buf = (void *)sendbuf;
            } else {
                in_buf = recvbuf;
            }

            if (local_rank == 0) {
                 if (my_rank != root) {
                     out_buf = tmp_buf;
                 } else {
                     out_buf = recvbuf;
                     if (in_buf == out_buf) {
                        in_buf = MPI_IN_PLACE;
                        out_buf = recvbuf;
                     }
                 }
            } else {
                in_buf  = (void *)sendbuf;
                out_buf = NULL;
            }

            if (count * (MAX(extent, true_extent)) < SHMEM_COLL_BLOCK_SIZE) {
                mpi_errno = MPIR_Reduce_shmem_MV2(in_buf, out_buf, count,
                                                  datatype, op,
                                                  0, shmem_comm);
            }
            else {
                mpi_errno = MPIR_Reduce_intra_knomial_wrapper_MV2(in_buf, out_buf, count,
                                                                  datatype, op,
                                                                  0, shmem_comm);
            }

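            /* The intra-node reduce above is rooted at local rank 0, which is
             * not necessarily the global root: forward the result if needed. */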
            if (local_rank == 0 && root != my_rank) {
                smpi_mpi_send(out_buf, count, datatype, root,
                                         COLL_TAG_REDUCE+1, comm);
            }
            if ((local_rank != 0) && (root == my_rank)) {
                smpi_mpi_recv(recvbuf, count, datatype,
                                         leader_of_root, COLL_TAG_REDUCE+1, comm,
                                         MPI_STATUS_IGNORE);
            }
        } else {
            if (mv2_use_knomial_reduce == 1) {
                reduce_fn = &MPIR_Reduce_intra_knomial_wrapper_MV2;
            } else {
                reduce_fn = &MPIR_Reduce_binomial_MV2;
            }
            mpi_errno = reduce_fn(sendbuf, recvbuf, count,
                                  datatype, op,
                                  root, comm);
        }
        /* We are done */
        if (tmp_buf != NULL)
          smpi_free_tmp_buffer((void *) ((char *) tmp_buf + true_lb));
        goto fn_exit;
    }


    if (local_rank == 0) {
        leader_comm = comm->get_leaders_comm();
        if (leader_comm == MPI_COMM_NULL) {
          leader_comm = MPI_COMM_WORLD;
        }
        leader_comm_size = leader_comm->size();
        leader_comm_rank = leader_comm->rank();
        tmp_buf = (void *) smpi_get_tmp_sendbuffer(count *
                            (MAX(extent, true_extent)));
        tmp_buf = (void *) ((char *) tmp_buf - true_lb);
    }
    if (sendbuf != MPI_IN_PLACE) {
        in_buf = (void *)sendbuf;
    } else {
        in_buf = recvbuf;
    }
    if (local_rank == 0) {
        out_buf = tmp_buf;
    } else {
        out_buf = NULL;
    }


    if (local_size > 1) {
        /* Let's do the intra-node reduce operations, if we have more than one
         * process in the node */

        /* Fix the input and output buffers for the intra-node reduce.
         * Node leaders will have the reduced data in tmp_buf after
         * this step. */
        if (MV2_Reduce_intra_function == & MPIR_Reduce_shmem_MV2)
        {
            if (is_commutative == 1
                && (count * (MAX(extent, true_extent)) < SHMEM_COLL_BLOCK_SIZE)) {
                    mpi_errno = MV2_Reduce_intra_function(in_buf, out_buf, count,
                                      datatype, op,
                                      intra_node_root, shmem_comm);
            } else {
                    mpi_errno = MPIR_Reduce_intra_knomial_wrapper_MV2(in_buf, out_buf, count,
                                      datatype, op,
                                      intra_node_root, shmem_comm);
            }
        } else {

            mpi_errno = MV2_Reduce_intra_function(in_buf, out_buf, count,
                                      datatype, op,
                                      intra_node_root, shmem_comm);
        }
    } else {
        smpi_free_tmp_buffer((void *) ((char *) tmp_buf + true_lb));
        tmp_buf = in_buf;
    }


    /* Now work on the inter-leader phase. Data is in tmp_buf */
    if (local_rank == 0 && leader_comm_size > 1) {
        /* The leader of the root will have the globally reduced data in
         * tmp_buf or recvbuf at the end of the reduce */
        if (leader_comm_rank == leader_root) {
            if (my_rank == root) {
                /* I am the root of the leader-comm, and the
                 * root of the reduce op. So, I will write the
                 * final result directly into my recvbuf */
                if (tmp_buf != recvbuf) {
                    in_buf = tmp_buf;
                    out_buf = recvbuf;
                } else {

                     in_buf = (char *)smpi_get_tmp_sendbuffer(count*
                                       smpi_datatype_get_extent(datatype));
                     smpi_datatype_copy(tmp_buf, count, datatype,
                                        in_buf, count, datatype);
                    //in_buf = MPI_IN_PLACE;
                    out_buf = recvbuf;
                }
            } else {
                in_buf = (char *)smpi_get_tmp_sendbuffer(count*
                                       smpi_datatype_get_extent(datatype));
                smpi_datatype_copy(tmp_buf, count, datatype,
                                        in_buf, count, datatype);
                //in_buf = MPI_IN_PLACE;
                out_buf = tmp_buf;
            }
        } else {
            in_buf = tmp_buf;
            out_buf = NULL;
        }

        /* inter-leader communication */
        mpi_errno = MV2_Reduce_function(in_buf, out_buf, count,
                              datatype, op,
                              leader_root, leader_comm);

    }

    if (local_size > 1) {
        /* Send the message to the root if the leader is not the
         * root of the reduce operation. The reduced data is in tmp_buf */
        if ((local_rank == 0) && (root != my_rank)
            && (leader_root == leader_comm_rank)) {
            smpi_mpi_send(tmp_buf, count, datatype, root,
                                     COLL_TAG_REDUCE+1, comm);
        }
        if ((local_rank != 0) && (root == my_rank)) {
            smpi_mpi_recv(recvbuf, count, datatype,
                                     leader_of_root,
                                     COLL_TAG_REDUCE+1, comm,
                                     MPI_STATUS_IGNORE);
        }
        smpi_free_tmp_buffer((void *) ((char *) tmp_buf + true_lb));

        if (leader_comm_rank == leader_root) {
            if (my_rank != root || (my_rank == root && tmp_buf == recvbuf)) {
                smpi_free_tmp_buffer(in_buf);
            }
        }
    }

  fn_exit:
    return mpi_errno;
}
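
Usage sketch (not part of the repository file above): within SimGrid/SMPI this routine is normally reached through a plain MPI_Reduce call once the reduce algorithm has been selected on the smpirun command line; the option value mvapich2_two_level is an assumption based on the function-name suffix. A minimal caller could look like this:

/* Minimal MPI_Reduce caller; assumed to be launched with smpirun, e.g.
 * (assumption on the option value):
 *   smpirun --cfg=smpi/reduce:mvapich2_two_level ... ./reduce_example
 */
#include <mpi.h>
#include <stdio.h>

int main(int argc, char *argv[])
{
    int rank, size, value, sum = 0;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    value = rank + 1;   /* each rank contributes rank + 1 */
    MPI_Reduce(&value, &sum, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);

    if (rank == 0)
        printf("sum = %d (expected %d)\n", sum, size * (size + 1) / 2);

    MPI_Finalize();
    return 0;
}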