1 /* Copyright (c) 2013-2014. The SimGrid Team.
2 * All rights reserved. */
4 /* This program is free software; you can redistribute it and/or modify it
5 * under the terms of the license (GNU LGPL) which comes with this package. */
/* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
 * University Research and Technology
 * Corporation. All rights reserved.
 * Copyright (c) 2004-2009 The University of Tennessee and The University
 * of Tennessee Research Foundation. All rights reserved.
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
 * University of Stuttgart. All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 * All rights reserved.
 *
 * Additional copyrights may follow
 */
21 /* -*- Mode: C; c-basic-offset:4 ; -*- */
/* Copyright (c) 2001-2014, The Ohio State University. All rights
 * reserved.
 *
 * This file is part of the MVAPICH2 software package developed by the
 * team members of The Ohio State University's Network-Based Computing
 * Laboratory (NBCL), headed by Professor Dhabaleswar K. (DK) Panda.
 *
 * For detailed copyright and licensing information, please refer to the
 * copyright file COPYRIGHT in the top level MVAPICH2 directory.
 *
 * (C) 2001 by Argonne National Laboratory.
 * See COPYRIGHT in top-level directory.
 */
37 #include "colls_private.h"
/* Algorithm hooks: the two-level SMP broadcast dispatches its inter-node
 * and intra-node steps through these pointers; the entry points below
 * default them to the MPICH bcast when they are still NULL. */
40 extern int (*MV2_Bcast_function) (void *buffer, int count, MPI_Datatype datatype,
41 int root, MPI_Comm comm_ptr);
43 extern int (*MV2_Bcast_intra_node_function) (void *buffer, int count, MPI_Datatype datatype,
44 int root, MPI_Comm comm_ptr);
/* Tuning knobs defined elsewhere in SMPI. */
46 extern int zcpy_knomial_factor;
47 extern int mv2_pipelined_zcpy_knomial_factor;
48 extern int bcast_segment_size;
49 extern int mv2_inter_node_knomial_factor;
50 extern int mv2_intra_node_knomial_factor;
51 extern int mv2_bcast_two_level_system_size;
/* Root rank used for the intra-node (shared-memory) phase. */
52 #define INTRA_NODE_ROOT 0
/* Map the MVAPICH2 algorithm names onto SMPI collective implementations
 * so that the ported MVAPICH2 code below compiles unchanged. */
54 #define MPIR_Pipelined_Bcast_Zcpy_MV2 smpi_coll_tuned_bcast_mpich
55 #define MPIR_Pipelined_Bcast_MV2 smpi_coll_tuned_bcast_mpich
56 #define MPIR_Bcast_binomial_MV2 smpi_coll_tuned_bcast_binomial_tree
57 #define MPIR_Bcast_scatter_ring_allgather_shm_MV2 smpi_coll_tuned_bcast_scatter_LR_allgather
58 #define MPIR_Bcast_scatter_doubling_allgather_MV2 smpi_coll_tuned_bcast_scatter_rdb_allgather
59 #define MPIR_Bcast_scatter_ring_allgather_MV2 smpi_coll_tuned_bcast_scatter_LR_allgather
60 #define MPIR_Shmem_Bcast_MV2 smpi_coll_tuned_bcast_mpich
61 #define MPIR_Bcast_tune_inter_node_helper_MV2 smpi_coll_tuned_bcast_mvapich2_inter_node
62 #define MPIR_Bcast_inter_node_helper_MV2 smpi_coll_tuned_bcast_mvapich2_inter_node
63 #define MPIR_Knomial_Bcast_intra_node_MV2 smpi_coll_tuned_bcast_mvapich2_knomial_intra_node
64 #define MPIR_Bcast_intra_MV2 smpi_coll_tuned_bcast_mvapich2_intra_node
/* NOTE(review): the five externs below duplicate the declarations above —
 * legal but redundant in C; candidates for removal in a cleanup pass. */
66 extern int zcpy_knomial_factor;
67 extern int mv2_pipelined_zcpy_knomial_factor;
68 extern int bcast_segment_size;
69 extern int mv2_inter_node_knomial_factor;
70 extern int mv2_intra_node_knomial_factor;
/* NOTE(review): this macro shadows the extern of the same name declared
 * above for every use that follows — confirm this pinning to 64 is intended. */
71 #define mv2_bcast_two_level_system_size 64
/* Message-size thresholds (bytes) steering the algorithm choice below. */
72 #define mv2_bcast_short_msg 16384
73 #define mv2_bcast_large_msg 512*1024
74 #define mv2_knomial_intra_node_threshold 131072
75 #define mv2_scatter_rd_inter_leader_bcast 1
/* Inter-node (leader) phase of MVAPICH2's two-level SMP broadcast: the
 * root first hands the buffer to its node leader, then the node leaders
 * broadcast among themselves through MV2_Bcast_function.
 * NOTE(review): gaps in the embedded line numbering show that the
 * extraction dropped source lines (parameters, closing braces, returns);
 * the surviving code text is kept untouched below. */
76 int smpi_coll_tuned_bcast_mvapich2_inter_node(void *buffer,
78 MPI_Datatype datatype,
83 int mpi_errno = MPI_SUCCESS;
84 MPI_Comm shmem_comm, leader_comm;
85 int local_rank, local_size, global_rank = -1;
86 int leader_root, leader_of_root;
89 rank = smpi_comm_rank(comm);
90 //comm_size = smpi_comm_size(comm);
/* Default both algorithm hooks to the MPICH bcast when still unset. */
93 if (MV2_Bcast_function==NULL){
94 MV2_Bcast_function=smpi_coll_tuned_bcast_mpich;
97 if (MV2_Bcast_intra_node_function==NULL){
98 MV2_Bcast_intra_node_function= smpi_coll_tuned_bcast_mpich;
/* Lazily build the SMP structure (intra-node and leader communicators). */
101 if(smpi_comm_get_leaders_comm(comm)==MPI_COMM_NULL){
102 smpi_comm_init_smp(comm);
/* shmem_comm groups the ranks of this node; leader_comm holds one leader per node. */
105 shmem_comm = smpi_comm_get_intra_comm(comm);
106 local_rank = smpi_comm_rank(shmem_comm);
107 local_size = smpi_comm_size(shmem_comm);
109 leader_comm = smpi_comm_get_leaders_comm(comm);
111 if ((local_rank == 0) && (local_size > 1)) {
112 global_rank = smpi_comm_rank(leader_comm);
/* Locate the leader of the root's node, both as a rank in comm and in leader_comm. */
115 int* leaders_map = smpi_comm_get_leaders_map(comm);
116 leader_of_root = smpi_group_rank(smpi_comm_group(comm),leaders_map[root]);
117 leader_root = smpi_group_rank(smpi_comm_group(leader_comm),leaders_map[root]);
/* If the root is not itself a node leader, relay the payload
 * root -> leader_of_root with a point-to-point message first. */
120 if (local_size > 1) {
121 if ((local_rank == 0) && (root != rank) && (leader_root == global_rank)) {
122 smpi_mpi_recv(buffer, count, datatype, root,
123 COLL_TAG_BCAST, comm, MPI_STATUS_IGNORE);
125 if ((local_rank != 0) && (root == rank)) {
126 smpi_mpi_send(buffer, count, datatype,
127 leader_of_root, COLL_TAG_BCAST, comm);
/* Hardware-multicast path of the original MVAPICH2 code; compiled out in this port. */
130 #if defined(_MCST_SUPPORT_)
131 if (comm_ptr->ch.is_mcast_ok) {
132 mpi_errno = MPIR_Mcast_inter_node_MV2(buffer, count, datatype, root, comm_ptr,
134 if (mpi_errno == MPI_SUCCESS) {
/* Node leaders run the selected inter-node broadcast algorithm,
 * rooted at leader_root inside leader_comm. */
142 if (local_rank == 0) {
143 leader_comm = smpi_comm_get_leaders_comm(comm);
147 if (MV2_Bcast_function == &MPIR_Pipelined_Bcast_MV2) {
148 mpi_errno = MPIR_Pipelined_Bcast_MV2(buffer, count, datatype,
150 } else if (MV2_Bcast_function == &MPIR_Bcast_scatter_ring_allgather_shm_MV2) {
151 mpi_errno = MPIR_Bcast_scatter_ring_allgather_shm_MV2(buffer, count,
155 if (local_rank == 0) {
156 /* if (MV2_Bcast_function == &MPIR_Knomial_Bcast_inter_node_wrapper_MV2) {
157 mpi_errno = MPIR_Knomial_Bcast_inter_node_wrapper_MV2(buffer, count,
161 mpi_errno = MV2_Bcast_function(buffer, count, datatype,
162 leader_root, leader_comm);
/* Intra-node k-nomial broadcast with arity mv2_intra_node_knomial_factor:
 * every rank first receives from its k-nomial parent, then forwards to up
 * to (factor - 1) children per mask level with non-blocking sends.
 * NOTE(review): gaps in the embedded numbering show dropped lines
 * (mask/reqs initialisation, closing braces, frees); code kept untouched. */
171 int smpi_coll_tuned_bcast_mvapich2_knomial_intra_node(void *buffer,
173 MPI_Datatype datatype,
174 int root, MPI_Comm comm)
176 int local_size = 0, rank;
177 int mpi_errno = MPI_SUCCESS;
178 MPI_Request *reqarray = NULL;
179 MPI_Status *starray = NULL;
180 int src, dst, mask, relative_rank;
/* Same lazy defaults / SMP initialisation as the other bcast entry points. */
182 if (MV2_Bcast_function==NULL){
183 MV2_Bcast_function=smpi_coll_tuned_bcast_mpich;
186 if (MV2_Bcast_intra_node_function==NULL){
187 MV2_Bcast_intra_node_function= smpi_coll_tuned_bcast_mpich;
190 if(smpi_comm_get_leaders_comm(comm)==MPI_COMM_NULL){
191 smpi_comm_init_smp(comm);
194 local_size = smpi_comm_size(comm);
195 rank = smpi_comm_rank(comm);
/* Room for 2*factor outstanding child requests per level. */
198 reqarray=(MPI_Request *)xbt_malloc(2 * mv2_intra_node_knomial_factor * sizeof (MPI_Request));
200 starray=(MPI_Status *)xbt_malloc(2 * mv2_intra_node_knomial_factor * sizeof (MPI_Status));
202 /* intra-node k-nomial bcast */
203 if (local_size > 1) {
/* Rotate ranks so the root becomes rank 0 of the virtual tree. */
204 relative_rank = (rank >= root) ? rank - root : rank - root + local_size;
/* Receive phase: grow mask until this rank is a subtree root at that
 * level, receiving the payload from its k-nomial parent on the way. */
207 while (mask < local_size) {
208 if (relative_rank % (mv2_intra_node_knomial_factor * mask)) {
209 src = relative_rank / (mv2_intra_node_knomial_factor * mask) *
210 (mv2_intra_node_knomial_factor * mask) + root;
211 if (src >= local_size) {
215 smpi_mpi_recv(buffer, count, datatype, src,
216 COLL_TAG_BCAST, comm,
220 mask *= mv2_intra_node_knomial_factor;
222 mask /= mv2_intra_node_knomial_factor;
/* Send phase: walk mask back down, posting non-blocking sends to the
 * children of each level, then wait for all of them to complete. */
226 for (k = 1; k < mv2_intra_node_knomial_factor; k++) {
227 if (relative_rank + mask * k < local_size) {
228 dst = rank + mask * k;
229 if (dst >= local_size) {
232 reqarray[reqs++]=smpi_mpi_isend(buffer, count, datatype, dst,
233 COLL_TAG_BCAST, comm);
236 smpi_mpi_waitall(reqs, reqarray, starray);
238 mask /= mv2_intra_node_knomial_factor;
247 int smpi_coll_tuned_bcast_mvapich2_intra_node(void *buffer,
249 MPI_Datatype datatype,
250 int root, MPI_Comm comm)
252 int mpi_errno = MPI_SUCCESS;
254 int two_level_bcast = 1;
256 int is_homogeneous, is_contig;
258 void *tmp_buf = NULL;
263 if (MV2_Bcast_function==NULL){
264 MV2_Bcast_function=smpi_coll_tuned_bcast_mpich;
267 if (MV2_Bcast_intra_node_function==NULL){
268 MV2_Bcast_intra_node_function= smpi_coll_tuned_bcast_mpich;
271 if(smpi_comm_get_leaders_comm(comm)==MPI_COMM_NULL){
272 smpi_comm_init_smp(comm);
275 comm_size = smpi_comm_size(comm);
276 // rank = smpi_comm_rank(comm);
278 if (HANDLE_GET_KIND(datatype) == HANDLE_KIND_BUILTIN)*/
281 MPID_Datatype_get_ptr(datatype, dtp);
282 is_contig = dtp->is_contig;
286 #ifdef MPID_HAS_HETERO
287 if (comm_ptr->is_hetero)
291 /* MPI_Type_size() might not give the accurate size of the packed
292 * datatype for heterogeneous systems (because of padding, encoding,
293 * etc). On the other hand, MPI_Pack_size() can become very
294 * expensive, depending on the implementation, especially for
295 * heterogeneous systems. We want to use MPI_Type_size() wherever
296 * possible, and MPI_Pack_size() in other places.
298 //if (is_homogeneous) {
299 type_size=smpi_datatype_size(datatype);
302 /* MPIR_Pack_size_impl(1, datatype, &type_size);*/
304 nbytes = (size_t) (count) * (type_size);
305 if (comm_size <= mv2_bcast_two_level_system_size) {
306 if (nbytes > mv2_bcast_short_msg && nbytes < mv2_bcast_large_msg) {
313 if (two_level_bcast == 1
314 #if defined(_MCST_SUPPORT_)
315 || comm_ptr->ch.is_mcast_ok
319 if (!is_contig || !is_homogeneous) {
320 tmp_buf=(void *)smpi_get_tmp_sendbuffer(nbytes);
322 /* TODO: Pipeline the packing and communication */
324 /* if (rank == root) {*/
326 /* MPIR_Pack_impl(buffer, count, datatype, tmp_buf, nbytes, &position);*/
328 /* MPIU_ERR_POP(mpi_errno);*/
332 shmem_comm = smpi_comm_get_intra_comm(comm);
333 if (!is_contig || !is_homogeneous) {
335 MPIR_Bcast_inter_node_helper_MV2(tmp_buf, nbytes, MPI_BYTE,
339 MPIR_Bcast_inter_node_helper_MV2(buffer, count, datatype, root,
343 /* We are now done with the inter-node phase */
344 if (nbytes <= mv2_knomial_intra_node_threshold) {
345 if (!is_contig || !is_homogeneous) {
346 mpi_errno = MPIR_Shmem_Bcast_MV2(tmp_buf, nbytes, MPI_BYTE,
349 mpi_errno = MPIR_Shmem_Bcast_MV2(buffer, count, datatype,
353 if (!is_contig || !is_homogeneous) {
355 MPIR_Knomial_Bcast_intra_node_MV2(tmp_buf, nbytes,
361 MPIR_Knomial_Bcast_intra_node_MV2(buffer, count,
369 if (nbytes <= mv2_bcast_short_msg) {
370 mpi_errno = MPIR_Bcast_binomial_MV2(buffer, count, datatype, root,
373 if (mv2_scatter_rd_inter_leader_bcast) {
374 mpi_errno = MPIR_Bcast_scatter_ring_allgather_MV2(buffer, count,
380 MPIR_Bcast_scatter_doubling_allgather_MV2(buffer, count,