SimGrid — Algorithmique Numérique Distribuée

Public GIT Repository

[simgrid.git] / src / smpi / colls / bcast-mvapich-smp.c
1 /* Copyright (c) 2013-2014. The SimGrid Team.
2  * All rights reserved.                                                     */
3
4 /* This program is free software; you can redistribute it and/or modify it
5  * under the terms of the license (GNU LGPL) which comes with this package. */
6
7 /*
8  * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
9  *                         University Research and Technology
10  *                         Corporation.  All rights reserved.
11  * Copyright (c) 2004-2009 The University of Tennessee and The University
12  *                         of Tennessee Research Foundation.  All rights
13  *                         reserved.
14  * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
15  *                         University of Stuttgart.  All rights reserved.
16  * Copyright (c) 2004-2005 The Regents of the University of California.
17  *                         All rights reserved.
18  *
19  * Additional copyrights may follow
20  */
21  /* -*- Mode: C; c-basic-offset:4 ; -*- */
22 /* Copyright (c) 2001-2014, The Ohio State University. All rights
23  * reserved.
24  *
25  * This file is part of the MVAPICH2 software package developed by the
26  * team members of The Ohio State University's Network-Based Computing
27  * Laboratory (NBCL), headed by Professor Dhabaleswar K. (DK) Panda.
28  *
29  * For detailed copyright and licensing information, please refer to the
30  * copyright file COPYRIGHT in the top level MVAPICH2 directory.
31  */
32 /*
33  *
34  *  (C) 2001 by Argonne National Laboratory.
35  *      See COPYRIGHT in top-level directory.
36  */
37 #include "colls_private.h"
38
39
40 extern int (*MV2_Bcast_function) (void *buffer, int count, MPI_Datatype datatype,
41                            int root, MPI_Comm comm_ptr);
42
43 extern int (*MV2_Bcast_intra_node_function) (void *buffer, int count, MPI_Datatype datatype,
44                                       int root, MPI_Comm comm_ptr);
45                                       
46 extern int zcpy_knomial_factor;
47 extern int mv2_pipelined_zcpy_knomial_factor;
48 extern int bcast_segment_size;
49 extern int mv2_inter_node_knomial_factor;
50 extern int mv2_intra_node_knomial_factor;
51 extern int mv2_bcast_two_level_system_size;
52 #define INTRA_NODE_ROOT 0
53
54 #define MPIR_Pipelined_Bcast_Zcpy_MV2 smpi_coll_tuned_bcast_mpich
55 #define MPIR_Pipelined_Bcast_MV2 smpi_coll_tuned_bcast_mpich
56 #define MPIR_Bcast_binomial_MV2 smpi_coll_tuned_bcast_binomial_tree
57 #define MPIR_Bcast_scatter_ring_allgather_shm_MV2 smpi_coll_tuned_bcast_scatter_LR_allgather
58 #define MPIR_Bcast_scatter_doubling_allgather_MV2 smpi_coll_tuned_bcast_scatter_rdb_allgather
59 #define MPIR_Bcast_scatter_ring_allgather_MV2 smpi_coll_tuned_bcast_scatter_LR_allgather
60 #define MPIR_Shmem_Bcast_MV2 smpi_coll_tuned_bcast_mpich
61 #define MPIR_Bcast_tune_inter_node_helper_MV2 smpi_coll_tuned_bcast_mvapich2_inter_node
62 #define MPIR_Bcast_inter_node_helper_MV2 smpi_coll_tuned_bcast_mvapich2_inter_node
63 #define MPIR_Knomial_Bcast_intra_node_MV2 smpi_coll_tuned_bcast_mvapich2_knomial_intra_node
64 #define MPIR_Bcast_intra_MV2 smpi_coll_tuned_bcast_mvapich2_intra_node
65
66 extern int zcpy_knomial_factor;
67 extern int mv2_pipelined_zcpy_knomial_factor;
68 extern int bcast_segment_size;
69 extern int mv2_inter_node_knomial_factor;
70 extern int mv2_intra_node_knomial_factor;
71 #define mv2_bcast_two_level_system_size  64
72 #define mv2_bcast_short_msg             16384
73 #define mv2_bcast_large_msg            512*1024
74 #define mv2_knomial_intra_node_threshold 131072
75 #define mv2_scatter_rd_inter_leader_bcast 1
76 int smpi_coll_tuned_bcast_mvapich2_inter_node(void *buffer,
77                                                  int count,
78                                                  MPI_Datatype datatype,
79                                                  int root,
80                                                  MPI_Comm  comm)
81 {
82     int rank;
83     int mpi_errno = MPI_SUCCESS;
84     MPI_Comm shmem_comm, leader_comm;
85     int local_rank, local_size, global_rank = -1;
86     int leader_root, leader_of_root;
87
88
89     rank = smpi_comm_rank(comm);
90     //comm_size = smpi_comm_size(comm);
91
92
93     if (MV2_Bcast_function==NULL){
94       MV2_Bcast_function=smpi_coll_tuned_bcast_mpich;
95     }
96     
97     if (MV2_Bcast_intra_node_function==NULL){
98       MV2_Bcast_intra_node_function= smpi_coll_tuned_bcast_mpich;
99     }
100     
101     if(smpi_comm_get_leaders_comm(comm)==MPI_COMM_NULL){
102       smpi_comm_init_smp(comm);
103     }
104     
105     shmem_comm = smpi_comm_get_intra_comm(comm);
106     local_rank = smpi_comm_rank(shmem_comm);
107     local_size = smpi_comm_size(shmem_comm);
108
109     leader_comm = smpi_comm_get_leaders_comm(comm);
110
111     if ((local_rank == 0) && (local_size > 1)) {
112       global_rank = smpi_comm_rank(leader_comm);
113     }
114
115     int* leaders_map = smpi_comm_get_leaders_map(comm);
116     leader_of_root = smpi_group_rank(smpi_comm_group(comm),leaders_map[root]);
117     leader_root = smpi_group_rank(smpi_comm_group(leader_comm),leaders_map[root]);
118     
119     
120     if (local_size > 1) {
121         if ((local_rank == 0) && (root != rank) && (leader_root == global_rank)) {
122             smpi_mpi_recv(buffer, count, datatype, root,
123                                      COLL_TAG_BCAST, comm, MPI_STATUS_IGNORE);
124         }
125         if ((local_rank != 0) && (root == rank)) {
126             smpi_mpi_send(buffer, count, datatype,
127                                      leader_of_root, COLL_TAG_BCAST, comm);
128         }
129     }
130 #if defined(_MCST_SUPPORT_)
131     if (comm_ptr->ch.is_mcast_ok) {
132         mpi_errno = MPIR_Mcast_inter_node_MV2(buffer, count, datatype, root, comm_ptr,
133                                               errflag);
134         if (mpi_errno == MPI_SUCCESS) {
135             goto fn_exit;
136         } else {
137             goto fn_fail;
138         }
139     }
140 #endif
141 /*
142     if (local_rank == 0) {
143         leader_comm = smpi_comm_get_leaders_comm(comm);
144         root = leader_root;
145     }
146
147     if (MV2_Bcast_function == &MPIR_Pipelined_Bcast_MV2) {
148         mpi_errno = MPIR_Pipelined_Bcast_MV2(buffer, count, datatype,
149                                              root, comm);
150     } else if (MV2_Bcast_function == &MPIR_Bcast_scatter_ring_allgather_shm_MV2) {
151         mpi_errno = MPIR_Bcast_scatter_ring_allgather_shm_MV2(buffer, count,
152                                                               datatype, root,
153                                                               comm);
154     } else */{
155         if (local_rank == 0) {
156       /*      if (MV2_Bcast_function == &MPIR_Knomial_Bcast_inter_node_wrapper_MV2) {
157                 mpi_errno = MPIR_Knomial_Bcast_inter_node_wrapper_MV2(buffer, count,
158                                                               datatype, root,
159                                                               comm);
160             } else {*/
161                 mpi_errno = MV2_Bcast_function(buffer, count, datatype,
162                                                leader_root, leader_comm);
163           //  }
164         }
165     }
166
167     return mpi_errno;
168 }
169
170
171 int smpi_coll_tuned_bcast_mvapich2_knomial_intra_node(void *buffer,
172                                       int count,
173                                       MPI_Datatype datatype,
174                                       int root, MPI_Comm  comm)
175 {
176     int local_size = 0, rank;
177     int mpi_errno = MPI_SUCCESS;
178     MPI_Request *reqarray = NULL;
179     MPI_Status *starray = NULL;
180     int src, dst, mask, relative_rank;
181     int k;
182     if (MV2_Bcast_function==NULL){
183       MV2_Bcast_function=smpi_coll_tuned_bcast_mpich;
184     }
185     
186     if (MV2_Bcast_intra_node_function==NULL){
187       MV2_Bcast_intra_node_function= smpi_coll_tuned_bcast_mpich;
188     }
189     
190     if(smpi_comm_get_leaders_comm(comm)==MPI_COMM_NULL){
191       smpi_comm_init_smp(comm);
192     }
193     
194     local_size = smpi_comm_size(comm);
195     rank = smpi_comm_rank(comm);
196
197
198     reqarray=(MPI_Request *)xbt_malloc(2 * mv2_intra_node_knomial_factor * sizeof (MPI_Request));
199
200     starray=(MPI_Status *)xbt_malloc(2 * mv2_intra_node_knomial_factor * sizeof (MPI_Status));
201
202     /* intra-node k-nomial bcast  */
203     if (local_size > 1) {
204         relative_rank = (rank >= root) ? rank - root : rank - root + local_size;
205         mask = 0x1;
206
207         while (mask < local_size) {
208             if (relative_rank % (mv2_intra_node_knomial_factor * mask)) {
209                 src = relative_rank / (mv2_intra_node_knomial_factor * mask) *
210                     (mv2_intra_node_knomial_factor * mask) + root;
211                 if (src >= local_size) {
212                     src -= local_size;
213                 }
214
215                 smpi_mpi_recv(buffer, count, datatype, src,
216                                          COLL_TAG_BCAST, comm,
217                                          MPI_STATUS_IGNORE);
218                 break;
219             }
220             mask *= mv2_intra_node_knomial_factor;
221         }
222         mask /= mv2_intra_node_knomial_factor;
223
224         while (mask > 0) {
225             int reqs = 0;
226             for (k = 1; k < mv2_intra_node_knomial_factor; k++) {
227                 if (relative_rank + mask * k < local_size) {
228                     dst = rank + mask * k;
229                     if (dst >= local_size) {
230                         dst -= local_size;
231                     }
232                     reqarray[reqs++]=smpi_mpi_isend(buffer, count, datatype, dst,
233                                               COLL_TAG_BCAST, comm);
234                 }
235             }
236             smpi_mpi_waitall(reqs, reqarray, starray);
237
238             mask /= mv2_intra_node_knomial_factor;
239         }
240     }
241
242     return mpi_errno;
243 }
244
245
/* Top-level SMP-aware MVAPICH2 broadcast.  Decides between the two-level
 * scheme (inter-node broadcast among leaders, then an intra-node broadcast
 * inside each node) and a flat single-level broadcast, based on communicator
 * size and message size.  Returns the error code of the chosen algorithm. */
int smpi_coll_tuned_bcast_mvapich2_intra_node(void *buffer,
                         int count,
                         MPI_Datatype datatype,
                         int root, MPI_Comm  comm)
{
    int mpi_errno = MPI_SUCCESS;
    int comm_size;
    int two_level_bcast = 1;   /* default: use the leader + shared-memory scheme */
    size_t nbytes = 0; 
    int is_homogeneous, is_contig;
    MPI_Aint type_size;
    void *tmp_buf = NULL;      /* packing buffer; only used when !is_contig (dead here, see below) */
    MPI_Comm shmem_comm;

    /* Nothing to do for an empty broadcast. */
    if (count == 0)
        return MPI_SUCCESS;
    /* Lazily fall back to the MPICH broadcast when the tuning tables did not
     * select an implementation. */
    if (MV2_Bcast_function==NULL){
      MV2_Bcast_function=smpi_coll_tuned_bcast_mpich;
    }
    
    if (MV2_Bcast_intra_node_function==NULL){
      MV2_Bcast_intra_node_function= smpi_coll_tuned_bcast_mpich;
    }
    
    /* Make sure the SMP-aware sub-communicators exist. */
    if(smpi_comm_get_leaders_comm(comm)==MPI_COMM_NULL){
      smpi_comm_init_smp(comm);
    }
    
    comm_size = smpi_comm_size(comm);
   // rank = smpi_comm_rank(comm);
/*
    if (HANDLE_GET_KIND(datatype) == HANDLE_KIND_BUILTIN)*/
        /* SMPI datatypes are treated as contiguous here, so the packing
         * branches below are effectively dead code kept from upstream. */
        is_contig = 1;
/*    else {
        MPID_Datatype_get_ptr(datatype, dtp);
        is_contig = dtp->is_contig;
    }
*/
    is_homogeneous = 1;
#ifdef MPID_HAS_HETERO
    /* Upstream heterogeneous-cluster support; not compiled in SMPI. */
    if (comm_ptr->is_hetero)
        is_homogeneous = 0;
#endif

    /* MPI_Type_size() might not give the accurate size of the packed
     * datatype for heterogeneous systems (because of padding, encoding,
     * etc). On the other hand, MPI_Pack_size() can become very
     * expensive, depending on the implementation, especially for
     * heterogeneous systems. We want to use MPI_Type_size() wherever
     * possible, and MPI_Pack_size() in other places.
     */
    //if (is_homogeneous) {
        type_size=smpi_datatype_size(datatype);
    //} /*else {*/
/*        MPIR_Pack_size_impl(1, datatype, &type_size);*/
/*    }*/
    nbytes = (size_t) (count) * (type_size);
    /* On small systems, only mid-sized messages benefit from the two-level
     * scheme; everything else goes through a flat algorithm.  On systems
     * larger than mv2_bcast_two_level_system_size, two-level is always used. */
    if (comm_size <= mv2_bcast_two_level_system_size) {
        if (nbytes > mv2_bcast_short_msg && nbytes < mv2_bcast_large_msg) {
            two_level_bcast = 1;
        } else {
            two_level_bcast = 0;
        }
    }

    if (two_level_bcast == 1
#if defined(_MCST_SUPPORT_)
            || comm_ptr->ch.is_mcast_ok
#endif
        ) {

        /* Non-contiguous/heterogeneous path (unreachable here, see above):
         * pack into a temporary byte buffer before broadcasting. */
        if (!is_contig || !is_homogeneous) {
            tmp_buf=(void *)smpi_get_tmp_sendbuffer(nbytes);

            /* TODO: Pipeline the packing and communication */
           // position = 0;
/*            if (rank == root) {*/
/*                mpi_errno =*/
/*                    MPIR_Pack_impl(buffer, count, datatype, tmp_buf, nbytes, &position);*/
/*                if (mpi_errno)*/
/*                    MPIU_ERR_POP(mpi_errno);*/
/*            }*/
        }

        shmem_comm = smpi_comm_get_intra_comm(comm);
        /* Phase 1: broadcast across nodes (leaders only). */
        if (!is_contig || !is_homogeneous) {
            mpi_errno =
                MPIR_Bcast_inter_node_helper_MV2(tmp_buf, nbytes, MPI_BYTE,
                                                 root, comm);
        } else {
            mpi_errno =
                MPIR_Bcast_inter_node_helper_MV2(buffer, count, datatype, root,
                                                 comm);
        }

        /* We are now done with the inter-node phase */
        /* Phase 2: broadcast inside each node — shared-memory bcast for small
         * messages, k-nomial tree above the threshold. */
            if (nbytes <= mv2_knomial_intra_node_threshold) {
                if (!is_contig || !is_homogeneous) {
                    mpi_errno = MPIR_Shmem_Bcast_MV2(tmp_buf, nbytes, MPI_BYTE,
                                                     root, shmem_comm);
                } else {
                    mpi_errno = MPIR_Shmem_Bcast_MV2(buffer, count, datatype,
                                                     root, shmem_comm);
                }
            } else {
                if (!is_contig || !is_homogeneous) {
                    mpi_errno =
                        MPIR_Knomial_Bcast_intra_node_MV2(tmp_buf, nbytes,
                                                          MPI_BYTE,
                                                          INTRA_NODE_ROOT,
                                                          shmem_comm);
                } else {
                    mpi_errno =
                        MPIR_Knomial_Bcast_intra_node_MV2(buffer, count,
                                                          datatype,
                                                          INTRA_NODE_ROOT,
                                                          shmem_comm);
                }
            }

    } else {
        /* Flat (single-level) broadcast: binomial tree for short messages,
         * scatter + allgather variants for large ones. */
        if (nbytes <= mv2_bcast_short_msg) {
            mpi_errno = MPIR_Bcast_binomial_MV2(buffer, count, datatype, root,
                                                comm);
        } else {
            if (mv2_scatter_rd_inter_leader_bcast) {
                mpi_errno = MPIR_Bcast_scatter_ring_allgather_MV2(buffer, count,
                                                                  datatype,
                                                                  root,
                                                                  comm);
            } else {
                mpi_errno =
                    MPIR_Bcast_scatter_doubling_allgather_MV2(buffer, count,
                                                              datatype, root,
                                                              comm);
            }
        }
    }


    return mpi_errno;

}