Logo AND Algorithmique Numérique Distribuée

Public GIT Repository
21302b781300bb43c5958254bc52ecd69e8a60f1
[simgrid.git] / src / smpi / colls / bcast-mvapich-smp.cpp
1 /* Copyright (c) 2013-2014. The SimGrid Team.
2  * All rights reserved.                                                     */
3
4 /* This program is free software; you can redistribute it and/or modify it
5  * under the terms of the license (GNU LGPL) which comes with this package. */
6
7 /*
8  * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
9  *                         University Research and Technology
10  *                         Corporation.  All rights reserved.
11  * Copyright (c) 2004-2009 The University of Tennessee and The University
12  *                         of Tennessee Research Foundation.  All rights
13  *                         reserved.
14  * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
15  *                         University of Stuttgart.  All rights reserved.
16  * Copyright (c) 2004-2005 The Regents of the University of California.
17  *                         All rights reserved.
18  *
19  * Additional copyrights may follow
20  */
21  /* -*- Mode: C; c-basic-offset:4 ; -*- */
22 /* Copyright (c) 2001-2014, The Ohio State University. All rights
23  * reserved.
24  *
25  * This file is part of the MVAPICH2 software package developed by the
26  * team members of The Ohio State University's Network-Based Computing
27  * Laboratory (NBCL), headed by Professor Dhabaleswar K. (DK) Panda.
28  *
29  * For detailed copyright and licensing information, please refer to the
30  * copyright file COPYRIGHT in the top level MVAPICH2 directory.
31  */
32 /*
33  *
34  *  (C) 2001 by Argonne National Laboratory.
35  *      See COPYRIGHT in top-level directory.
36  */
37 #include "colls_private.h"
38 #include "src/smpi/smpi_group.hpp"
39
40
41 extern int (*MV2_Bcast_function) (void *buffer, int count, MPI_Datatype datatype,
42                            int root, MPI_Comm comm_ptr);
43
44 extern int (*MV2_Bcast_intra_node_function) (void *buffer, int count, MPI_Datatype datatype,
45                                       int root, MPI_Comm comm_ptr);
46                                       
47 extern int zcpy_knomial_factor;
48 extern int mv2_pipelined_zcpy_knomial_factor;
49 extern int bcast_segment_size;
50 extern int mv2_inter_node_knomial_factor;
51 extern int mv2_intra_node_knomial_factor;
52 extern int mv2_bcast_two_level_system_size;
53 #define INTRA_NODE_ROOT 0
54
55 #define MPIR_Pipelined_Bcast_Zcpy_MV2 smpi_coll_tuned_bcast_mpich
56 #define MPIR_Pipelined_Bcast_MV2 smpi_coll_tuned_bcast_mpich
57 #define MPIR_Bcast_binomial_MV2 smpi_coll_tuned_bcast_binomial_tree
58 #define MPIR_Bcast_scatter_ring_allgather_shm_MV2 smpi_coll_tuned_bcast_scatter_LR_allgather
59 #define MPIR_Bcast_scatter_doubling_allgather_MV2 smpi_coll_tuned_bcast_scatter_rdb_allgather
60 #define MPIR_Bcast_scatter_ring_allgather_MV2 smpi_coll_tuned_bcast_scatter_LR_allgather
61 #define MPIR_Shmem_Bcast_MV2 smpi_coll_tuned_bcast_mpich
62 #define MPIR_Bcast_tune_inter_node_helper_MV2 smpi_coll_tuned_bcast_mvapich2_inter_node
63 #define MPIR_Bcast_inter_node_helper_MV2 smpi_coll_tuned_bcast_mvapich2_inter_node
64 #define MPIR_Knomial_Bcast_intra_node_MV2 smpi_coll_tuned_bcast_mvapich2_knomial_intra_node
65 #define MPIR_Bcast_intra_MV2 smpi_coll_tuned_bcast_mvapich2_intra_node
66
/* Integer tuning parameters, declared here and defined elsewhere. */
extern int zcpy_knomial_factor;
extern int mv2_pipelined_zcpy_knomial_factor;
extern int bcast_segment_size;
extern int mv2_inter_node_knomial_factor;
extern int mv2_intra_node_knomial_factor;

/*
 * Compile-time thresholds used by the broadcast selection logic in this file.
 * They are object-like macros, so from this point on each name expands to its
 * literal value. Every replacement list that is an expression is parenthesized
 * so that it keeps its value next to operators such as '/' or '%'.
 */
/* Communicator size up to which the message size decides on two-level bcast. */
#define mv2_bcast_two_level_system_size  64
/* Message-size bounds, in bytes, compared against count * datatype size. */
#define mv2_bcast_short_msg             16384
#define mv2_bcast_large_msg            (512*1024)
/* Messages larger than this many bytes use the k-nomial intra-node bcast. */
#define mv2_knomial_intra_node_threshold 131072
/* Non-zero selects MPIR_Bcast_scatter_ring_allgather_MV2 for large flat bcasts. */
#define mv2_scatter_rd_inter_leader_bcast 1
77 int smpi_coll_tuned_bcast_mvapich2_inter_node(void *buffer,
78                                                  int count,
79                                                  MPI_Datatype datatype,
80                                                  int root,
81                                                  MPI_Comm  comm)
82 {
83     int rank;
84     int mpi_errno = MPI_SUCCESS;
85     MPI_Comm shmem_comm, leader_comm;
86     int local_rank, local_size, global_rank = -1;
87     int leader_root, leader_of_root;
88
89
90     rank = smpi_comm_rank(comm);
91     //comm_size = smpi_comm_size(comm);
92
93
94     if (MV2_Bcast_function==NULL){
95       MV2_Bcast_function=smpi_coll_tuned_bcast_mpich;
96     }
97     
98     if (MV2_Bcast_intra_node_function==NULL){
99       MV2_Bcast_intra_node_function= smpi_coll_tuned_bcast_mpich;
100     }
101     
102     if(smpi_comm_get_leaders_comm(comm)==MPI_COMM_NULL){
103       smpi_comm_init_smp(comm);
104     }
105     
106     shmem_comm = smpi_comm_get_intra_comm(comm);
107     local_rank = smpi_comm_rank(shmem_comm);
108     local_size = smpi_comm_size(shmem_comm);
109
110     leader_comm = smpi_comm_get_leaders_comm(comm);
111
112     if ((local_rank == 0) && (local_size > 1)) {
113       global_rank = smpi_comm_rank(leader_comm);
114     }
115
116     int* leaders_map = smpi_comm_get_leaders_map(comm);
117     leader_of_root = smpi_comm_group(comm)->rank(leaders_map[root]);
118     leader_root = smpi_comm_group(leader_comm)->rank(leaders_map[root]);
119     
120     
121     if (local_size > 1) {
122         if ((local_rank == 0) && (root != rank) && (leader_root == global_rank)) {
123             smpi_mpi_recv(buffer, count, datatype, root,
124                                      COLL_TAG_BCAST, comm, MPI_STATUS_IGNORE);
125         }
126         if ((local_rank != 0) && (root == rank)) {
127             smpi_mpi_send(buffer, count, datatype,
128                                      leader_of_root, COLL_TAG_BCAST, comm);
129         }
130     }
131 #if defined(_MCST_SUPPORT_)
132     if (comm_ptr->ch.is_mcast_ok) {
133         mpi_errno = MPIR_Mcast_inter_node_MV2(buffer, count, datatype, root, comm_ptr,
134                                               errflag);
135         if (mpi_errno == MPI_SUCCESS) {
136             goto fn_exit;
137         } else {
138             goto fn_fail;
139         }
140     }
141 #endif
142 /*
143     if (local_rank == 0) {
144         leader_comm = smpi_comm_get_leaders_comm(comm);
145         root = leader_root;
146     }
147
148     if (MV2_Bcast_function == &MPIR_Pipelined_Bcast_MV2) {
149         mpi_errno = MPIR_Pipelined_Bcast_MV2(buffer, count, datatype,
150                                              root, comm);
151     } else if (MV2_Bcast_function == &MPIR_Bcast_scatter_ring_allgather_shm_MV2) {
152         mpi_errno = MPIR_Bcast_scatter_ring_allgather_shm_MV2(buffer, count,
153                                                               datatype, root,
154                                                               comm);
155     } else */{
156         if (local_rank == 0) {
157       /*      if (MV2_Bcast_function == &MPIR_Knomial_Bcast_inter_node_wrapper_MV2) {
158                 mpi_errno = MPIR_Knomial_Bcast_inter_node_wrapper_MV2(buffer, count,
159                                                               datatype, root,
160                                                               comm);
161             } else {*/
162                 mpi_errno = MV2_Bcast_function(buffer, count, datatype,
163                                                leader_root, leader_comm);
164           //  }
165         }
166     }
167
168     return mpi_errno;
169 }
170
171
172 int smpi_coll_tuned_bcast_mvapich2_knomial_intra_node(void *buffer,
173                                       int count,
174                                       MPI_Datatype datatype,
175                                       int root, MPI_Comm  comm)
176 {
177     int local_size = 0, rank;
178     int mpi_errno = MPI_SUCCESS;
179     MPI_Request *reqarray = NULL;
180     MPI_Status *starray = NULL;
181     int src, dst, mask, relative_rank;
182     int k;
183     if (MV2_Bcast_function==NULL){
184       MV2_Bcast_function=smpi_coll_tuned_bcast_mpich;
185     }
186     
187     if (MV2_Bcast_intra_node_function==NULL){
188       MV2_Bcast_intra_node_function= smpi_coll_tuned_bcast_mpich;
189     }
190     
191     if(smpi_comm_get_leaders_comm(comm)==MPI_COMM_NULL){
192       smpi_comm_init_smp(comm);
193     }
194     
195     local_size = smpi_comm_size(comm);
196     rank = smpi_comm_rank(comm);
197
198
199     reqarray=(MPI_Request *)xbt_malloc(2 * mv2_intra_node_knomial_factor * sizeof (MPI_Request));
200
201     starray=(MPI_Status *)xbt_malloc(2 * mv2_intra_node_knomial_factor * sizeof (MPI_Status));
202
203     /* intra-node k-nomial bcast  */
204     if (local_size > 1) {
205         relative_rank = (rank >= root) ? rank - root : rank - root + local_size;
206         mask = 0x1;
207
208         while (mask < local_size) {
209             if (relative_rank % (mv2_intra_node_knomial_factor * mask)) {
210                 src = relative_rank / (mv2_intra_node_knomial_factor * mask) *
211                     (mv2_intra_node_knomial_factor * mask) + root;
212                 if (src >= local_size) {
213                     src -= local_size;
214                 }
215
216                 smpi_mpi_recv(buffer, count, datatype, src,
217                                          COLL_TAG_BCAST, comm,
218                                          MPI_STATUS_IGNORE);
219                 break;
220             }
221             mask *= mv2_intra_node_knomial_factor;
222         }
223         mask /= mv2_intra_node_knomial_factor;
224
225         while (mask > 0) {
226             int reqs = 0;
227             for (k = 1; k < mv2_intra_node_knomial_factor; k++) {
228                 if (relative_rank + mask * k < local_size) {
229                     dst = rank + mask * k;
230                     if (dst >= local_size) {
231                         dst -= local_size;
232                     }
233                     reqarray[reqs++]=smpi_mpi_isend(buffer, count, datatype, dst,
234                                               COLL_TAG_BCAST, comm);
235                 }
236             }
237             smpi_mpi_waitall(reqs, reqarray, starray);
238
239             mask /= mv2_intra_node_knomial_factor;
240         }
241     }
242     xbt_free(reqarray);
243     xbt_free(starray);
244     return mpi_errno;
245 }
246
247
248 int smpi_coll_tuned_bcast_mvapich2_intra_node(void *buffer,
249                          int count,
250                          MPI_Datatype datatype,
251                          int root, MPI_Comm  comm)
252 {
253     int mpi_errno = MPI_SUCCESS;
254     int comm_size;
255     int two_level_bcast = 1;
256     size_t nbytes = 0; 
257     int is_homogeneous, is_contig;
258     MPI_Aint type_size;
259     void *tmp_buf = NULL;
260     MPI_Comm shmem_comm;
261
262     if (count == 0)
263         return MPI_SUCCESS;
264     if (MV2_Bcast_function==NULL){
265       MV2_Bcast_function=smpi_coll_tuned_bcast_mpich;
266     }
267     
268     if (MV2_Bcast_intra_node_function==NULL){
269       MV2_Bcast_intra_node_function= smpi_coll_tuned_bcast_mpich;
270     }
271     
272     if(smpi_comm_get_leaders_comm(comm)==MPI_COMM_NULL){
273       smpi_comm_init_smp(comm);
274     }
275     
276     comm_size = smpi_comm_size(comm);
277    // rank = smpi_comm_rank(comm);
278 /*
279     if (HANDLE_GET_KIND(datatype) == HANDLE_KIND_BUILTIN)*/
280         is_contig = 1;
281 /*    else {
282         MPID_Datatype_get_ptr(datatype, dtp);
283         is_contig = dtp->is_contig;
284     }
285 */
286     is_homogeneous = 1;
287 #ifdef MPID_HAS_HETERO
288     if (comm_ptr->is_hetero)
289         is_homogeneous = 0;
290 #endif
291
292     /* MPI_Type_size() might not give the accurate size of the packed
293      * datatype for heterogeneous systems (because of padding, encoding,
294      * etc). On the other hand, MPI_Pack_size() can become very
295      * expensive, depending on the implementation, especially for
296      * heterogeneous systems. We want to use MPI_Type_size() wherever
297      * possible, and MPI_Pack_size() in other places.
298      */
299     //if (is_homogeneous) {
300         type_size=smpi_datatype_size(datatype);
301     //}
302 /*    else {*/
303 /*        MPIR_Pack_size_impl(1, datatype, &type_size);*/
304 /*    }*/
305     nbytes = (size_t) (count) * (type_size);
306     if (comm_size <= mv2_bcast_two_level_system_size) {
307         if (nbytes > mv2_bcast_short_msg && nbytes < mv2_bcast_large_msg) {
308             two_level_bcast = 1;
309         } else {
310             two_level_bcast = 0;
311         }
312     }
313
314     if (two_level_bcast == 1
315 #if defined(_MCST_SUPPORT_)
316             || comm_ptr->ch.is_mcast_ok
317 #endif
318         ) {
319
320         if (!is_contig || !is_homogeneous) {
321             tmp_buf=(void *)smpi_get_tmp_sendbuffer(nbytes);
322
323             /* TODO: Pipeline the packing and communication */
324            // position = 0;
325 /*            if (rank == root) {*/
326 /*                mpi_errno =*/
327 /*                    MPIR_Pack_impl(buffer, count, datatype, tmp_buf, nbytes, &position);*/
328 /*                if (mpi_errno)*/
329 /*                    MPIU_ERR_POP(mpi_errno);*/
330 /*            }*/
331         }
332
333         shmem_comm = smpi_comm_get_intra_comm(comm);
334         if (!is_contig || !is_homogeneous) {
335             mpi_errno =
336                 MPIR_Bcast_inter_node_helper_MV2(tmp_buf, nbytes, MPI_BYTE,
337                                                  root, comm);
338         } else {
339             mpi_errno =
340                 MPIR_Bcast_inter_node_helper_MV2(buffer, count, datatype, root,
341                                                  comm);
342         }
343
344         /* We are now done with the inter-node phase */
345             if (nbytes <= mv2_knomial_intra_node_threshold) {
346                 if (!is_contig || !is_homogeneous) {
347                     mpi_errno = MPIR_Shmem_Bcast_MV2(tmp_buf, nbytes, MPI_BYTE,
348                                                      root, shmem_comm);
349                 } else {
350                     mpi_errno = MPIR_Shmem_Bcast_MV2(buffer, count, datatype,
351                                                      root, shmem_comm);
352                 }
353             } else {
354                 if (!is_contig || !is_homogeneous) {
355                     mpi_errno =
356                         MPIR_Knomial_Bcast_intra_node_MV2(tmp_buf, nbytes,
357                                                           MPI_BYTE,
358                                                           INTRA_NODE_ROOT,
359                                                           shmem_comm);
360                 } else {
361                     mpi_errno =
362                         MPIR_Knomial_Bcast_intra_node_MV2(buffer, count,
363                                                           datatype,
364                                                           INTRA_NODE_ROOT,
365                                                           shmem_comm);
366                 }
367             }
368
369     } else {
370         if (nbytes <= mv2_bcast_short_msg) {
371             mpi_errno = MPIR_Bcast_binomial_MV2(buffer, count, datatype, root,
372                                                 comm);
373         } else {
374             if (mv2_scatter_rd_inter_leader_bcast) {
375                 mpi_errno = MPIR_Bcast_scatter_ring_allgather_MV2(buffer, count,
376                                                                   datatype,
377                                                                   root,
378                                                                   comm);
379             } else {
380                 mpi_errno =
381                     MPIR_Bcast_scatter_doubling_allgather_MV2(buffer, count,
382                                                               datatype, root,
383                                                               comm);
384             }
385         }
386     }
387
388
389     return mpi_errno;
390
391 }