/* Copyright (c) 2013-2020. The SimGrid Team.
 * All rights reserved. */

/* This program is free software; you can redistribute it and/or modify it
 * under the terms of the license (GNU LGPL) which comes with this package. */
/*
 * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation. All rights reserved.
 * Copyright (c) 2004-2009 The University of Tennessee and The University
 *                         of Tennessee Research Foundation. All rights
 *                         reserved.
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
 *                         University of Stuttgart. All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 *
 * Additional copyrights may follow
 */
/* -*- Mode: C; c-basic-offset:4 ; -*- */
/* Copyright (c) 2001-2014, The Ohio State University. All rights
 * reserved.
 *
 * This file is part of the MVAPICH2 software package developed by the
 * team members of The Ohio State University's Network-Based Computing
 * Laboratory (NBCL), headed by Professor Dhabaleswar K. (DK) Panda.
 *
 * For detailed copyright and licensing information, please refer to the
 * copyright file COPYRIGHT in the top level MVAPICH2 directory.
 */

/*
 * (C) 2001 by Argonne National Laboratory.
 *     See COPYRIGHT in top-level directory.
 */
38 #include "../colls_private.hpp"
41 #define MPIR_Gather_MV2_Direct gather__ompi_basic_linear
42 #define MPIR_Gather_MV2_two_level_Direct gather__ompi_basic_linear
43 #define MPIR_Gather_intra gather__mpich
44 typedef int (*MV2_Gather_function_ptr) (const void *sendbuf,
46 MPI_Datatype sendtype,
49 MPI_Datatype recvtype,
50 int root, MPI_Comm comm);
52 extern MV2_Gather_function_ptr MV2_Gather_inter_leader_function;
53 extern MV2_Gather_function_ptr MV2_Gather_intra_node_function;
55 #define TEMP_BUF_HAS_NO_DATA (false)
56 #define TEMP_BUF_HAS_DATA (true)
/* sendbuf           - (in) sender's buffer
 * sendcnt           - (in) sender's element count
 * sendtype          - (in) sender's data type
 * recvbuf           - (in) receiver's buffer
 * recvcnt           - (in) receiver's element count
 * recvtype          - (in) receiver's data type
 * root              - (in) root for the gather operation
 * rank              - (in) global rank (rank in the global comm)
 * tmp_buf           - (out/in) tmp_buf into which intra node
 *                     data is gathered
 * is_data_avail     - (in) based on this, tmp_buf acts
 *                     as in/out parameter.
 *                     1 - tmp_buf acts as in parameter
 *                     0 - tmp_buf acts as out parameter
 * comm_ptr          - (in) pointer to the communicator
 *                     (shmem_comm or intra_sock_comm or
 *                     inter-sock_leader_comm)
 * intra_node_fn_ptr - (in) Function ptr to choose the
 *                     intra node gather function
 * errflag           - (out) to record errors
 */
82 static int MPIR_pt_pt_intra_gather(const void* sendbuf, int sendcnt, MPI_Datatype sendtype, void* recvbuf, int recvcnt,
83 MPI_Datatype recvtype, int root, int rank, void* tmp_buf, int nbytes,
84 bool is_data_avail, MPI_Comm comm, MV2_Gather_function_ptr intra_node_fn_ptr)
86 int mpi_errno = MPI_SUCCESS;
87 MPI_Aint recvtype_extent = 0; /* Datatype extent */
88 MPI_Aint true_lb, sendtype_true_extent, recvtype_true_extent;
91 if (sendtype != MPI_DATATYPE_NULL) {
92 sendtype->extent(&true_lb,
93 &sendtype_true_extent);
95 if (recvtype != MPI_DATATYPE_NULL) {
96 recvtype_extent=recvtype->get_extent();
97 recvtype->extent(&true_lb,
98 &recvtype_true_extent);
101 /* Special case, when tmp_buf itself has data */
102 if (rank == root && sendbuf == MPI_IN_PLACE && is_data_avail) {
104 mpi_errno = intra_node_fn_ptr(MPI_IN_PLACE,
105 sendcnt, sendtype, tmp_buf, nbytes,
108 } else if (rank == root && sendbuf == MPI_IN_PLACE) {
109 mpi_errno = intra_node_fn_ptr((char*)recvbuf +
110 rank * recvcnt * recvtype_extent,
111 recvcnt, recvtype, tmp_buf, nbytes,
114 mpi_errno = intra_node_fn_ptr(sendbuf, sendcnt, sendtype,
115 tmp_buf, nbytes, MPI_BYTE,
125 int gather__mvapich2_two_level(const void *sendbuf,
127 MPI_Datatype sendtype,
130 MPI_Datatype recvtype,
134 unsigned char* leader_gather_buf = nullptr;
136 int local_rank, local_size;
137 int leader_comm_rank = -1, leader_comm_size = 0;
138 int mpi_errno = MPI_SUCCESS;
139 int recvtype_size = 0, sendtype_size = 0, nbytes = 0;
140 int leader_root, leader_of_root;
142 MPI_Aint sendtype_extent = 0, recvtype_extent = 0; /* Datatype extent */
143 MPI_Aint true_lb = 0, sendtype_true_extent = 0, recvtype_true_extent = 0;
144 MPI_Comm shmem_comm, leader_comm;
145 unsigned char* tmp_buf = nullptr;
147 // if not set (use of the algo directly, without mvapich2 selector)
148 if (MV2_Gather_intra_node_function == nullptr)
149 MV2_Gather_intra_node_function = gather__mpich;
151 if (comm->get_leaders_comm() == MPI_COMM_NULL) {
154 comm_size = comm->size();
157 if (((rank == root) && (recvcnt == 0)) ||
158 ((rank != root) && (sendcnt == 0))) {
162 if (sendtype != MPI_DATATYPE_NULL) {
163 sendtype_extent=sendtype->get_extent();
164 sendtype_size=sendtype->size();
165 sendtype->extent(&true_lb,
166 &sendtype_true_extent);
168 if (recvtype != MPI_DATATYPE_NULL) {
169 recvtype_extent=recvtype->get_extent();
170 recvtype_size=recvtype->size();
171 recvtype->extent(&true_lb,
172 &recvtype_true_extent);
175 /* extract the rank,size information for the intra-node
177 shmem_comm = comm->get_intra_comm();
178 local_rank = shmem_comm->rank();
179 local_size = shmem_comm->size();
181 if (local_rank == 0) {
182 /* Node leader. Extract the rank, size information for the leader
184 leader_comm = comm->get_leaders_comm();
185 if(leader_comm==MPI_COMM_NULL){
186 leader_comm = MPI_COMM_WORLD;
188 leader_comm_size = leader_comm->size();
189 leader_comm_rank = leader_comm->size();
193 nbytes = recvcnt * recvtype_size;
196 nbytes = sendcnt * sendtype_size;
199 #if defined(_SMP_LIMIC_)
200 if((g_use_limic2_coll) && (shmem_commptr->ch.use_intra_sock_comm == 1)
201 && (use_limic_gather)
202 &&((num_scheme == USE_GATHER_PT_PT_BINOMIAL)
203 || (num_scheme == USE_GATHER_PT_PT_DIRECT)
204 ||(num_scheme == USE_GATHER_PT_LINEAR_BINOMIAL)
205 || (num_scheme == USE_GATHER_PT_LINEAR_DIRECT)
206 || (num_scheme == USE_GATHER_LINEAR_PT_BINOMIAL)
207 || (num_scheme == USE_GATHER_LINEAR_PT_DIRECT)
208 || (num_scheme == USE_GATHER_LINEAR_LINEAR)
209 || (num_scheme == USE_GATHER_SINGLE_LEADER))) {
211 mpi_errno = MV2_Gather_intra_node_function(sendbuf, sendcnt, sendtype,
212 recvbuf, recvcnt,recvtype,
216 #endif/*#if defined(_SMP_LIMIC_)*/
218 if (local_rank == 0) {
219 /* Node leader, allocate tmp_buffer */
221 tmp_buf = smpi_get_tmp_recvbuffer(recvcnt * std::max(recvtype_extent, recvtype_true_extent) * local_size);
223 tmp_buf = smpi_get_tmp_sendbuffer(sendcnt * std::max(sendtype_extent, sendtype_true_extent) * local_size);
225 if (tmp_buf == nullptr) {
226 mpi_errno = MPI_ERR_OTHER;
230 /*while testing mpich2 gather test, we see that
231 * which basically splits the comm, and we come to
232 * a point, where use_intra_sock_comm == 0, but if the
233 * intra node function is MPIR_Intra_node_LIMIC_Gather_MV2,
234 * it would use the intra sock comm. In such cases, we
235 * fallback to binomial as a default case.*/
236 #if defined(_SMP_LIMIC_)
237 if(*MV2_Gather_intra_node_function == MPIR_Intra_node_LIMIC_Gather_MV2) {
239 mpi_errno = MPIR_pt_pt_intra_gather(sendbuf,sendcnt, sendtype,
240 recvbuf, recvcnt, recvtype,
243 TEMP_BUF_HAS_NO_DATA,
249 /*We are gathering the data into tmp_buf and the output
250 * will be of MPI_BYTE datatype. Since the tmp_buf has no
251 * local data, we pass is_data_avail = TEMP_BUF_HAS_NO_DATA*/
252 mpi_errno = MPIR_pt_pt_intra_gather(sendbuf,sendcnt, sendtype,
253 recvbuf, recvcnt, recvtype,
256 TEMP_BUF_HAS_NO_DATA,
258 MV2_Gather_intra_node_function
262 leader_comm = comm->get_leaders_comm();
263 int* leaders_map = comm->get_leaders_map();
264 leader_of_root = comm->group()->rank(leaders_map[root]);
265 leader_root = leader_comm->group()->rank(leaders_map[root]);
266 /* leader_root is the rank of the leader of the root in leader_comm.
267 * leader_root is to be used as the root of the inter-leader gather ops
269 if (not comm->is_uniform()) {
270 if (local_rank == 0) {
271 int* displs = nullptr;
272 int* recvcnts = nullptr;
275 /* Node leaders have all the data. But, different nodes can have
276 * different number of processes. Do a Gather first to get the
277 * buffer lengths at each leader, followed by a Gatherv to move
280 if (leader_comm_rank == leader_root && root != leader_of_root) {
281 /* The root of the Gather operation is not a node-level
282 * leader and this process's rank in the leader_comm
283 * is the same as leader_root */
286 smpi_get_tmp_recvbuffer(recvcnt * std::max(recvtype_extent, recvtype_true_extent) * comm_size);
289 smpi_get_tmp_sendbuffer(sendcnt * std::max(sendtype_extent, sendtype_true_extent) * comm_size);
291 if (leader_gather_buf == nullptr) {
292 mpi_errno = MPI_ERR_OTHER;
297 node_sizes = comm->get_non_uniform_map();
299 if (leader_comm_rank == leader_root) {
300 displs = new int[leader_comm_size];
301 recvcnts = new int[leader_comm_size];
304 if (root == leader_of_root) {
305 /* The root of the gather operation is also the node
306 * leader. Receive into recvbuf and we are done */
307 if (leader_comm_rank == leader_root) {
308 recvcnts[0] = node_sizes[0] * recvcnt;
311 for (i = 1; i < leader_comm_size; i++) {
312 displs[i] = displs[i - 1] + node_sizes[i - 1] * recvcnt;
313 recvcnts[i] = node_sizes[i] * recvcnt;
316 colls::gatherv(tmp_buf, local_size * nbytes, MPI_BYTE, recvbuf, recvcnts, displs, recvtype, leader_root,
319 /* The root of the gather operation is not the node leader.
320 * Receive into leader_gather_buf and then send
322 if (leader_comm_rank == leader_root) {
323 recvcnts[0] = node_sizes[0] * nbytes;
326 for (i = 1; i < leader_comm_size; i++) {
327 displs[i] = displs[i - 1] + node_sizes[i - 1] * nbytes;
328 recvcnts[i] = node_sizes[i] * nbytes;
331 colls::gatherv(tmp_buf, local_size * nbytes, MPI_BYTE, leader_gather_buf, recvcnts, displs, MPI_BYTE,
332 leader_root, leader_comm);
334 if (leader_comm_rank == leader_root) {
340 /* All nodes have the same number of processes.
341 * Just do one Gather to get all
342 * the data at the leader of the root process */
343 if (local_rank == 0) {
344 if (leader_comm_rank == leader_root && root != leader_of_root) {
345 /* The root of the Gather operation is not a node-level leader
347 leader_gather_buf = smpi_get_tmp_sendbuffer(nbytes * comm_size);
348 if (leader_gather_buf == nullptr) {
349 mpi_errno = MPI_ERR_OTHER;
353 if (root == leader_of_root) {
354 mpi_errno = MPIR_Gather_MV2_Direct(tmp_buf,
357 recvcnt * local_size,
358 recvtype, leader_root,
362 mpi_errno = MPIR_Gather_MV2_Direct(tmp_buf, nbytes * local_size,
363 MPI_BYTE, leader_gather_buf,
365 MPI_BYTE, leader_root,
370 if ((local_rank == 0) && (root != rank)
371 && (leader_of_root == rank)) {
372 Request::send(leader_gather_buf,
373 nbytes * comm_size, MPI_BYTE,
374 root, COLL_TAG_GATHER, comm);
377 if (rank == root && local_rank != 0) {
378 /* The root of the gather operation is not the node leader. Receive
379 y* data from the node leader */
380 Request::recv(recvbuf, recvcnt * comm_size, recvtype,
381 leader_of_root, COLL_TAG_GATHER, comm,
385 /* check if multiple threads are calling this collective function */
386 if (local_rank == 0 ) {
387 if (tmp_buf != nullptr) {
388 smpi_free_tmp_buffer(tmp_buf);
390 if (leader_gather_buf != nullptr) {
391 smpi_free_tmp_buffer(leader_gather_buf);