1 /* Copyright (c) 2013-2022. The SimGrid Team.
2 * All rights reserved. */
4 /* This program is free software; you can redistribute it and/or modify it
5 * under the terms of the license (GNU LGPL) which comes with this package. */
/* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation.  All rights reserved.
 * Copyright (c) 2004-2009 The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 *
 * Additional copyrights may follow
 */
21 /* -*- Mode: C; c-basic-offset:4 ; -*- */
/* Copyright (c) 2001-2014, The Ohio State University. All rights
 * reserved.
 *
 * This file is part of the MVAPICH2 software package developed by the
 * team members of The Ohio State University's Network-Based Computing
 * Laboratory (NBCL), headed by Professor Dhabaleswar K. (DK) Panda.
 *
 * For detailed copyright and licensing information, please refer to the
 * copyright file COPYRIGHT in the top level MVAPICH2 directory.
 */
/*
 * (C) 2001 by Argonne National Laboratory.
 *     See COPYRIGHT in top-level directory.
 */
37 #include "../colls_private.hpp"
39 #define MPIR_Scatter_MV2_Binomial scatter__ompi_binomial
40 #define MPIR_Scatter_MV2_Direct scatter__ompi_basic_linear
42 extern int (*MV2_Scatter_intra_function) (const void *sendbuf, int sendcount, MPI_Datatype sendtype,
43 void *recvbuf, int recvcount, MPI_Datatype recvtype,
44 int root, MPI_Comm comm);
46 namespace simgrid::smpi {
48 int scatter__mvapich2_two_level_direct(const void *sendbuf,
50 MPI_Datatype sendtype,
53 MPI_Datatype recvtype,
54 int root, MPI_Comm comm)
57 int local_rank, local_size;
58 int leader_comm_rank = -1, leader_comm_size = -1;
59 int mpi_errno = MPI_SUCCESS;
60 int recvtype_size, sendtype_size, nbytes;
61 unsigned char* tmp_buf = nullptr;
62 unsigned char* leader_scatter_buf = nullptr;
64 int leader_root, leader_of_root = -1;
65 MPI_Comm shmem_comm, leader_comm;
66 //if not set (use of the algo directly, without mvapich2 selector)
67 if (MV2_Scatter_intra_function == nullptr)
68 MV2_Scatter_intra_function = scatter__mpich;
70 if(comm->get_leaders_comm()==MPI_COMM_NULL){
73 comm_size = comm->size();
76 if (((rank == root) && (recvcnt == 0))
77 || ((rank != root) && (sendcnt == 0))) {
81 /* extract the rank,size information for the intra-node
83 shmem_comm = comm->get_intra_comm();
84 local_rank = shmem_comm->rank();
85 local_size = shmem_comm->size();
87 if (local_rank == 0) {
88 /* Node leader. Extract the rank, size information for the leader
90 leader_comm = comm->get_leaders_comm();
91 leader_comm_size = leader_comm->size();
92 leader_comm_rank = leader_comm->rank();
95 if (local_size == comm_size) {
96 /* purely intra-node scatter. Just use the direct algorithm and we are done */
97 mpi_errno = MPIR_Scatter_MV2_Direct(sendbuf, sendcnt, sendtype,
98 recvbuf, recvcnt, recvtype,
102 recvtype_size=recvtype->size();
103 sendtype_size=sendtype->size();
106 nbytes = sendcnt * sendtype_size;
108 nbytes = recvcnt * recvtype_size;
111 if (local_rank == 0) {
112 /* Node leader, allocate tmp_buffer */
113 tmp_buf = smpi_get_tmp_sendbuffer(nbytes * local_size);
116 leader_comm = comm->get_leaders_comm();
117 int* leaders_map = comm->get_leaders_map();
118 leader_of_root = comm->group()->rank(leaders_map[root]);
119 leader_root = leader_comm->group()->rank(leaders_map[root]);
120 /* leader_root is the rank of the leader of the root in leader_comm.
121 * leader_root is to be used as the root of the inter-leader gather ops
124 if ((local_rank == 0) && (root != rank)
125 && (leader_of_root == rank)) {
126 /* The root of the scatter operation is not the node leader. Recv
127 * data from the node leader */
128 leader_scatter_buf = smpi_get_tmp_sendbuffer(nbytes * comm_size);
129 Request::recv(leader_scatter_buf, nbytes * comm_size, MPI_BYTE,
130 root, COLL_TAG_SCATTER, comm, &status);
134 if (rank == root && local_rank != 0) {
135 /* The root of the scatter operation is not the node leader. Send
136 * data to the node leader */
137 Request::send(sendbuf, sendcnt * comm_size, sendtype,
138 leader_of_root, COLL_TAG_SCATTER, comm
142 if (leader_comm_size > 1 && local_rank == 0) {
143 if (not comm->is_uniform()) {
144 int* displs = nullptr;
145 int* sendcnts = nullptr;
148 node_sizes = comm->get_non_uniform_map();
150 if (root != leader_of_root) {
151 if (leader_comm_rank == leader_root) {
152 displs = new int[leader_comm_size];
153 sendcnts = new int[leader_comm_size];
154 sendcnts[0] = node_sizes[0] * nbytes;
157 for (i = 1; i < leader_comm_size; i++) {
158 displs[i] = displs[i - 1] + node_sizes[i - 1] * nbytes;
159 sendcnts[i] = node_sizes[i] * nbytes;
162 colls::scatterv(leader_scatter_buf, sendcnts, displs, MPI_BYTE, tmp_buf, nbytes * local_size, MPI_BYTE,
163 leader_root, leader_comm);
165 if (leader_comm_rank == leader_root) {
166 displs = new int[leader_comm_size];
167 sendcnts = new int[leader_comm_size];
168 sendcnts[0] = node_sizes[0] * sendcnt;
171 for (i = 1; i < leader_comm_size; i++) {
172 displs[i] = displs[i - 1] + node_sizes[i - 1] * sendcnt;
173 sendcnts[i] = node_sizes[i] * sendcnt;
176 colls::scatterv(sendbuf, sendcnts, displs, sendtype, tmp_buf, nbytes * local_size, MPI_BYTE, leader_root,
179 if (leader_comm_rank == leader_root) {
184 if (leader_of_root != root) {
186 MPIR_Scatter_MV2_Direct(leader_scatter_buf,
187 nbytes * local_size, MPI_BYTE,
188 tmp_buf, nbytes * local_size,
189 MPI_BYTE, leader_root,
193 MPIR_Scatter_MV2_Direct(sendbuf, sendcnt * local_size,
195 nbytes * local_size, MPI_BYTE,
196 leader_root, leader_comm);
201 /* The leaders are now done with the inter-leader part. Scatter the data within the nodes */
203 if (rank == root && recvbuf == MPI_IN_PLACE) {
204 mpi_errno = MV2_Scatter_intra_function(tmp_buf, nbytes, MPI_BYTE,
205 (void *)sendbuf, sendcnt, sendtype,
208 mpi_errno = MV2_Scatter_intra_function(tmp_buf, nbytes, MPI_BYTE,
209 recvbuf, recvcnt, recvtype,
214 /* check if multiple threads are calling this collective function */
215 if (comm_size != local_size && local_rank == 0) {
216 smpi_free_tmp_buffer(tmp_buf);
217 if (leader_of_root == rank && root != rank) {
218 smpi_free_tmp_buffer(leader_scatter_buf);
225 int scatter__mvapich2_two_level_binomial(const void *sendbuf,
227 MPI_Datatype sendtype,
230 MPI_Datatype recvtype,
231 int root, MPI_Comm comm)
234 int local_rank, local_size;
235 int leader_comm_rank = -1, leader_comm_size = -1;
236 int mpi_errno = MPI_SUCCESS;
237 int recvtype_size, sendtype_size, nbytes;
238 unsigned char* tmp_buf = nullptr;
239 unsigned char* leader_scatter_buf = nullptr;
241 int leader_root = -1, leader_of_root = -1;
242 MPI_Comm shmem_comm, leader_comm;
245 //if not set (use of the algo directly, without mvapich2 selector)
246 if (MV2_Scatter_intra_function == nullptr)
247 MV2_Scatter_intra_function = scatter__mpich;
249 if(comm->get_leaders_comm()==MPI_COMM_NULL){
252 comm_size = comm->size();
255 if (((rank == root) && (recvcnt == 0))
256 || ((rank != root) && (sendcnt == 0))) {
260 /* extract the rank,size information for the intra-node
262 shmem_comm = comm->get_intra_comm();
263 local_rank = shmem_comm->rank();
264 local_size = shmem_comm->size();
266 if (local_rank == 0) {
267 /* Node leader. Extract the rank, size information for the leader
269 leader_comm = comm->get_leaders_comm();
270 leader_comm_size = leader_comm->size();
271 leader_comm_rank = leader_comm->rank();
274 if (local_size == comm_size) {
275 /* purely intra-node scatter. Just use the direct algorithm and we are done */
276 mpi_errno = MPIR_Scatter_MV2_Direct(sendbuf, sendcnt, sendtype,
277 recvbuf, recvcnt, recvtype,
281 recvtype_size=recvtype->size();
282 sendtype_size=sendtype->size();
285 nbytes = sendcnt * sendtype_size;
287 nbytes = recvcnt * recvtype_size;
290 if (local_rank == 0) {
291 /* Node leader, allocate tmp_buffer */
292 tmp_buf = smpi_get_tmp_sendbuffer(nbytes * local_size);
294 leader_comm = comm->get_leaders_comm();
295 int* leaders_map = comm->get_leaders_map();
296 leader_of_root = comm->group()->rank(leaders_map[root]);
297 leader_root = leader_comm->group()->rank(leaders_map[root]);
298 /* leader_root is the rank of the leader of the root in leader_comm.
299 * leader_root is to be used as the root of the inter-leader gather ops
302 if ((local_rank == 0) && (root != rank)
303 && (leader_of_root == rank)) {
304 /* The root of the scatter operation is not the node leader. Recv
305 * data from the node leader */
306 leader_scatter_buf = smpi_get_tmp_sendbuffer(nbytes * comm_size);
307 Request::recv(leader_scatter_buf, nbytes * comm_size, MPI_BYTE,
308 root, COLL_TAG_SCATTER, comm, &status);
311 if (rank == root && local_rank != 0) {
312 /* The root of the scatter operation is not the node leader. Send
313 * data to the node leader */
314 Request::send(sendbuf, sendcnt * comm_size, sendtype,
315 leader_of_root, COLL_TAG_SCATTER, comm);
318 if (leader_comm_size > 1 && local_rank == 0) {
319 if (not comm->is_uniform()) {
320 int* displs = nullptr;
321 int* sendcnts = nullptr;
324 node_sizes = comm->get_non_uniform_map();
326 if (root != leader_of_root) {
327 if (leader_comm_rank == leader_root) {
328 displs = new int[leader_comm_size];
329 sendcnts = new int[leader_comm_size];
330 sendcnts[0] = node_sizes[0] * nbytes;
333 for (i = 1; i < leader_comm_size; i++) {
334 displs[i] = displs[i - 1] + node_sizes[i - 1] * nbytes;
335 sendcnts[i] = node_sizes[i] * nbytes;
338 colls::scatterv(leader_scatter_buf, sendcnts, displs, MPI_BYTE, tmp_buf, nbytes * local_size, MPI_BYTE,
339 leader_root, leader_comm);
341 if (leader_comm_rank == leader_root) {
342 displs = new int[leader_comm_size];
343 sendcnts = new int[leader_comm_size];
344 sendcnts[0] = node_sizes[0] * sendcnt;
347 for (i = 1; i < leader_comm_size; i++) {
348 displs[i] = displs[i - 1] + node_sizes[i - 1] * sendcnt;
349 sendcnts[i] = node_sizes[i] * sendcnt;
352 colls::scatterv(sendbuf, sendcnts, displs, sendtype, tmp_buf, nbytes * local_size, MPI_BYTE, leader_root,
355 if (leader_comm_rank == leader_root) {
360 if (leader_of_root != root) {
362 MPIR_Scatter_MV2_Binomial(leader_scatter_buf,
363 nbytes * local_size, MPI_BYTE,
364 tmp_buf, nbytes * local_size,
365 MPI_BYTE, leader_root,
369 MPIR_Scatter_MV2_Binomial(sendbuf, sendcnt * local_size,
371 nbytes * local_size, MPI_BYTE,
372 leader_root, leader_comm);
377 /* The leaders are now done with the inter-leader part. Scatter the data within the nodes */
379 if (rank == root && recvbuf == MPI_IN_PLACE) {
380 mpi_errno = MV2_Scatter_intra_function(tmp_buf, nbytes, MPI_BYTE,
381 (void *)sendbuf, sendcnt, sendtype,
384 mpi_errno = MV2_Scatter_intra_function(tmp_buf, nbytes, MPI_BYTE,
385 recvbuf, recvcnt, recvtype,
392 /* check if multiple threads are calling this collective function */
393 if (comm_size != local_size && local_rank == 0) {
394 smpi_free_tmp_buffer(tmp_buf);
395 if (leader_of_root == rank && root != rank) {
396 smpi_free_tmp_buffer(leader_scatter_buf);
403 } // namespace simgrid::smpi