1 /* Copyright (c) 2013-2017. The SimGrid Team.
2 * All rights reserved. */
4 /* This program is free software; you can redistribute it and/or modify it
5 * under the terms of the license (GNU LGPL) which comes with this package. */
8 * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
9 * University Research and Technology
10 * Corporation. All rights reserved.
11 * Copyright (c) 2004-2009 The University of Tennessee and The University
12 * of Tennessee Research Foundation. All rights
14 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
15 * University of Stuttgart. All rights reserved.
16 * Copyright (c) 2004-2005 The Regents of the University of California.
17 * All rights reserved.
19 * Additional copyrights may follow
21 /* -*- Mode: C; c-basic-offset:4 ; -*- */
22 /* Copyright (c) 2001-2014, The Ohio State University. All rights
25 * This file is part of the MVAPICH2 software package developed by the
26 * team members of The Ohio State University's Network-Based Computing
27 * Laboratory (NBCL), headed by Professor Dhabaleswar K. (DK) Panda.
29 * For detailed copyright and licensing information, please refer to the
30 * copyright file COPYRIGHT in the top level MVAPICH2 directory.
34 * (C) 2001 by Argonne National Laboratory.
35 * See COPYRIGHT in top-level directory.
38 #include "../colls_private.hpp"
// Aliases: the MVAPICH2-style gather names used in this file resolve to the
// gather members of other collective classes.
41 #define MPIR_Gather_MV2_Direct Coll_gather_ompi_basic_linear::gather
42 #define MPIR_Gather_MV2_two_level_Direct Coll_gather_ompi_basic_linear::gather
43 #define MPIR_Gather_intra Coll_gather_mpich::gather
// Pointer type for a gather routine returning an int (callers in this file
// store that value in mpi_errno).
// NOTE(review): the embedded numbering jumps over 45, 47 and 48, so some
// parameter lines look to be missing from this listing - confirm upstream.
44 typedef int (*MV2_Gather_function_ptr) (void *sendbuf,
46 MPI_Datatype sendtype,
49 MPI_Datatype recvtype,
50 int root, MPI_Comm comm);
// Function pointers naming the gather routines to use; they are only declared
// here (extern). Coll_gather_mvapich2_two_level::gather falls back to
// Coll_gather_mpich::gather when the intra-node pointer is NULL.
52 extern MV2_Gather_function_ptr MV2_Gather_inter_leader_function;
53 extern MV2_Gather_function_ptr MV2_Gather_intra_node_function;
// Values for the is_data_avail argument of MPIR_pt_pt_intra_gather: they tell
// whether tmp_buf already holds data (1) or is a pure output buffer (0).
55 #define TEMP_BUF_HAS_NO_DATA (0)
56 #define TEMP_BUF_HAS_DATA (1)
62 /* sendbuf - (in) sender's buffer
63 * sendcnt - (in) sender's element count
64 * sendtype - (in) sender's data type
65 * recvbuf - (in) receiver's buffer
66 * recvcnt - (in) receiver's element count
67 * recvtype - (in) receiver's data type
68 * root - (in) root for the gather operation
69 * rank - (in) global rank(rank in the global comm)
70 * tmp_buf - (out/in) tmp_buf into which intra node
72 * is_data_avail - (in) based on this, tmp_buf acts
73 * as in/out parameter.
74 * 1 - tmp_buf acts as in parameter
75 * 0 - tmp_buf acts as out parameter
76 * comm_ptr - (in) pointer to the communicator
77 * (shmem_comm or intra_sock_comm or
78 * inter-sock_leader_comm)
79 * intra_node_fn_ptr - (in) Function ptr to choose the
80 * intra node gather function
81 * errflag - (out) to record errors
// Intra-node gather helper: forwards to intra_node_fn_ptr with one of three
// argument sets, chosen from (rank == root), (sendbuf == MPI_IN_PLACE) and
// is_data_avail, and keeps the callee's result in mpi_errno.
// NOTE(review): rank, root, is_data_avail and the communicator are used below
// but their declarations are on lines absent from this listing (85, 87, 88);
// the tail of each call and the final return are likewise not visible.
83 static int MPIR_pt_pt_intra_gather( void *sendbuf, int sendcnt, MPI_Datatype sendtype,
84 void *recvbuf, int recvcnt, MPI_Datatype recvtype,
86 void *tmp_buf, int nbytes,
89 MV2_Gather_function_ptr intra_node_fn_ptr)
91 int mpi_errno = MPI_SUCCESS;
92 MPI_Aint recvtype_extent = 0; /* Datatype extent */
93 MPI_Aint true_lb, sendtype_true_extent, recvtype_true_extent;
// Query the datatypes only when they are not MPI_DATATYPE_NULL. The true
// extents are fetched here but not read again in the visible lines.
96 if (sendtype != MPI_DATATYPE_NULL) {
97 sendtype->extent(&true_lb,
98 &sendtype_true_extent);
100 if (recvtype != MPI_DATATYPE_NULL) {
// recvtype_extent is what the second branch uses to locate this rank's slot
// inside recvbuf.
101 recvtype_extent=recvtype->get_extent();
102 recvtype->extent(&true_lb,
103 &recvtype_true_extent);
106 /* Special case, when tmp_buf itself has data */
107 if (rank == root && sendbuf == MPI_IN_PLACE && is_data_avail) {
// Root, in place, tmp_buf already filled: MPI_IN_PLACE is passed through and
// tmp_buf (nbytes) is the receive side.
109 mpi_errno = intra_node_fn_ptr(MPI_IN_PLACE,
110 sendcnt, sendtype, tmp_buf, nbytes,
113 } else if (rank == root && sendbuf == MPI_IN_PLACE) {
// Root, in place, tmp_buf empty: the root's own contribution is read from
// recvbuf at offset rank * recvcnt * recvtype_extent.
114 mpi_errno = intra_node_fn_ptr((char*)recvbuf +
115 rank * recvcnt * recvtype_extent,
116 recvcnt, recvtype, tmp_buf, nbytes,
// Every other case: send from sendbuf, receive into tmp_buf as nbytes of
// MPI_BYTE.
119 mpi_errno = intra_node_fn_ptr(sendbuf, sendcnt, sendtype,
120 tmp_buf, nbytes, MPI_BYTE,
// Two-level gather.
//  1. Each node gathers its local contributions with MPIR_pt_pt_intra_gather
//     (node leaders, local_rank == 0, allocate tmp_buf for that).
//  2. Node leaders exchange the per-node data: gatherv with per-node counts
//     when comm->is_uniform() is false, MPIR_Gather_MV2_Direct otherwise.
//  3. When the root is not a node leader, its leader collects everything in
//     leader_gather_buf and forwards it to the root with a send/recv pair on
//     comm, tagged COLL_TAG_GATHER.
// The result code of the called routines is kept in mpi_errno.
// NOTE(review): this listing omits lines (gaps in the embedded numbering):
// parameters sendcnt, recvbuf, recvcnt, root and comm, the locals comm_size,
// rank, i, displs and node_sizes, and most braces, else and return lines are
// used or implied below but not shown. They are left exactly as found.
130 int Coll_gather_mvapich2_two_level::gather(void *sendbuf,
132 MPI_Datatype sendtype,
135 MPI_Datatype recvtype,
139 void *leader_gather_buf = NULL;
141 int local_rank, local_size;
142 int leader_comm_rank = -1, leader_comm_size = 0;
143 int mpi_errno = MPI_SUCCESS;
144 int recvtype_size = 0, sendtype_size = 0, nbytes=0;
145 int leader_root, leader_of_root;
147 MPI_Aint sendtype_extent = 0, recvtype_extent = 0; /* Datatype extent */
148 MPI_Aint true_lb = 0, sendtype_true_extent = 0, recvtype_true_extent = 0;
149 MPI_Comm shmem_comm, leader_comm;
150 void* tmp_buf = NULL;
153 //if not set (use of the algo directly, without mvapich2 selector)
154 if(MV2_Gather_intra_node_function==NULL)
155 MV2_Gather_intra_node_function= Coll_gather_mpich::gather;
// The body of this test (leaders communicator not available yet) is on a
// line that this listing does not show.
157 if(comm->get_leaders_comm()==MPI_COMM_NULL){
160 comm_size = comm->size();
// Nothing to transfer: the root expects zero elements, or a non-root rank
// contributes zero elements.
163 if (((rank == root) && (recvcnt == 0)) ||
164 ((rank != root) && (sendcnt == 0))) {
// Extent, size and true extent are read only for non-null datatypes;
// otherwise the zero initialisers above stay in effect.
168 if (sendtype != MPI_DATATYPE_NULL) {
169 sendtype_extent=sendtype->get_extent();
170 sendtype_size=sendtype->size();
171 sendtype->extent(&true_lb,
172 &sendtype_true_extent);
174 if (recvtype != MPI_DATATYPE_NULL) {
175 recvtype_extent=recvtype->get_extent();
176 recvtype_size=recvtype->size();
177 recvtype->extent(&true_lb,
178 &recvtype_true_extent);
181 /* extract the rank,size information for the intra-node
183 shmem_comm = comm->get_intra_comm();
184 local_rank = shmem_comm->rank();
185 local_size = shmem_comm->size();
187 if (local_rank == 0) {
188 /* Node leader. Extract the rank, size information for the leader
190 leader_comm = comm->get_leaders_comm();
191 if(leader_comm==MPI_COMM_NULL){
192 leader_comm = MPI_COMM_WORLD;
194 leader_comm_size = leader_comm->size();
// leader_comm_rank is this leader's rank inside leader_comm. It is compared
// with leader_root (a rank in leader_comm) further down, so it has to come
// from rank(): a size() value could never equal a valid rank.
195 leader_comm_rank = leader_comm->rank();
// Per-process payload in bytes: counted with the receive type on the root and
// with the send type everywhere else.
199 nbytes = recvcnt * recvtype_size;
202 nbytes = sendcnt * sendtype_size;
// Path compiled only when _SMP_LIMIC_ is defined: for the listed schemes the
// intra-node function is called directly on the caller's buffers.
205 #if defined(_SMP_LIMIC_)
206 if((g_use_limic2_coll) && (shmem_commptr->ch.use_intra_sock_comm == 1)
207 && (use_limic_gather)
208 &&((num_scheme == USE_GATHER_PT_PT_BINOMIAL)
209 || (num_scheme == USE_GATHER_PT_PT_DIRECT)
210 ||(num_scheme == USE_GATHER_PT_LINEAR_BINOMIAL)
211 || (num_scheme == USE_GATHER_PT_LINEAR_DIRECT)
212 || (num_scheme == USE_GATHER_LINEAR_PT_BINOMIAL)
213 || (num_scheme == USE_GATHER_LINEAR_PT_DIRECT)
214 || (num_scheme == USE_GATHER_LINEAR_LINEAR)
215 || (num_scheme == USE_GATHER_SINGLE_LEADER))) {
217 mpi_errno = MV2_Gather_intra_node_function(sendbuf, sendcnt, sendtype,
218 recvbuf, recvcnt,recvtype,
222 #endif/*#if defined(_SMP_LIMIC_)*/
224 if (local_rank == 0) {
225 /* Node leader, allocate tmp_buffer */
// tmp_buf is sized for local_size contributions, using the larger of extent
// and true extent per element (receive type on one line, send type on the
// other; the condition selecting between them is not visible here).
227 tmp_buf = smpi_get_tmp_recvbuffer(recvcnt * std::max(recvtype_extent, recvtype_true_extent) * local_size);
229 tmp_buf = smpi_get_tmp_sendbuffer(sendcnt * std::max(sendtype_extent, sendtype_true_extent) * local_size);
231 if (tmp_buf == NULL) {
232 mpi_errno = MPI_ERR_OTHER;
236 /*while testing mpich2 gather test, we see that
237 * which basically splits the comm, and we come to
238 * a point, where use_intra_sock_comm == 0, but if the
239 * intra node function is MPIR_Intra_node_LIMIC_Gather_MV2,
240 * it would use the intra sock comm. In such cases, we
241 * fallback to binomial as a default case.*/
242 #if defined(_SMP_LIMIC_)
243 if(*MV2_Gather_intra_node_function == MPIR_Intra_node_LIMIC_Gather_MV2) {
245 mpi_errno = MPIR_pt_pt_intra_gather(sendbuf,sendcnt, sendtype,
246 recvbuf, recvcnt, recvtype,
249 TEMP_BUF_HAS_NO_DATA,
255 /*We are gathering the data into tmp_buf and the output
256 * will be of MPI_BYTE datatype. Since the tmp_buf has no
257 * local data, we pass is_data_avail = TEMP_BUF_HAS_NO_DATA*/
258 mpi_errno = MPIR_pt_pt_intra_gather(sendbuf,sendcnt, sendtype,
259 recvbuf, recvcnt, recvtype,
262 TEMP_BUF_HAS_NO_DATA,
264 MV2_Gather_intra_node_function
// Locate the leader of the root's node twice: leader_of_root is its rank in
// comm, leader_root its rank in leader_comm.
268 leader_comm = comm->get_leaders_comm();
269 int* leaders_map = comm->get_leaders_map();
270 leader_of_root = comm->group()->rank(leaders_map[root]);
271 leader_root = leader_comm->group()->rank(leaders_map[root]);
272 /* leader_root is the rank of the leader of the root in leader_comm.
273 * leader_root is to be used as the root of the inter-leader gather ops
275 if (not comm->is_uniform()) {
276 if (local_rank == 0) {
278 int* recvcnts = NULL;
281 /* Node leaders have all the data. But, different nodes can have
282 * different number of processes. Do a Gather first to get the
283 * buffer lengths at each leader, followed by a Gatherv to move
286 if (leader_comm_rank == leader_root && root != leader_of_root) {
287 /* The root of the Gather operation is not a node-level
288 * leader and this process's rank in the leader_comm
289 * is the same as leader_root */
// Staging buffer for the data of all comm_size processes.
292 smpi_get_tmp_recvbuffer(recvcnt * std::max(recvtype_extent, recvtype_true_extent) * comm_size);
295 smpi_get_tmp_sendbuffer(sendcnt * std::max(sendtype_extent, sendtype_true_extent) * comm_size);
297 if (leader_gather_buf == NULL) {
298 mpi_errno = MPI_ERR_OTHER;
// node_sizes[i] is multiplied by a per-process count below, i.e. it is used
// as the number of processes behind leader i.
303 node_sizes = comm->get_non_uniform_map();
// Only the leader that roots the inter-leader gatherv needs the count and
// displacement arrays (one entry per leader).
305 if (leader_comm_rank == leader_root) {
306 displs = static_cast<int*>(xbt_malloc(sizeof(int) * leader_comm_size));
307 recvcnts = static_cast<int*>(xbt_malloc(sizeof(int) * leader_comm_size));
308 if (not displs || not recvcnts) {
309 mpi_errno = MPI_ERR_OTHER;
314 if (root == leader_of_root) {
315 /* The root of the gather operation is also the node
316 * leader. Receive into recvbuf and we are done */
// Counts and displacements are expressed in recvtype elements here, because
// the gatherv receives straight into recvbuf with recvtype.
317 if (leader_comm_rank == leader_root) {
318 recvcnts[0] = node_sizes[0] * recvcnt;
321 for (i = 1; i < leader_comm_size; i++) {
322 displs[i] = displs[i - 1] + node_sizes[i - 1] * recvcnt;
323 recvcnts[i] = node_sizes[i] * recvcnt;
326 Colls::gatherv(tmp_buf, local_size * nbytes, MPI_BYTE, recvbuf, recvcnts, displs, recvtype, leader_root,
329 /* The root of the gather operation is not the node leader.
330 * Receive into leader_gather_buf and then send
// Same layout computation, but in bytes: the staging buffer is filled as
// MPI_BYTE.
332 if (leader_comm_rank == leader_root) {
333 recvcnts[0] = node_sizes[0] * nbytes;
336 for (i = 1; i < leader_comm_size; i++) {
337 displs[i] = displs[i - 1] + node_sizes[i - 1] * nbytes;
338 recvcnts[i] = node_sizes[i] * nbytes;
341 Colls::gatherv(tmp_buf, local_size * nbytes, MPI_BYTE, leader_gather_buf, recvcnts, displs, MPI_BYTE,
342 leader_root, leader_comm);
344 if (leader_comm_rank == leader_root) {
350 /* All nodes have the same number of processes.
351 * Just do one Gather to get all
352 * the data at the leader of the root process */
353 if (local_rank == 0) {
354 if (leader_comm_rank == leader_root && root != leader_of_root) {
355 /* The root of the Gather operation is not a node-level leader
357 leader_gather_buf = smpi_get_tmp_sendbuffer(nbytes * comm_size);
358 if (leader_gather_buf == NULL) {
359 mpi_errno = MPI_ERR_OTHER;
// Root is a leader: gather with recvtype, recvcnt * local_size elements per
// leader. Otherwise gather raw bytes into leader_gather_buf.
363 if (root == leader_of_root) {
364 mpi_errno = MPIR_Gather_MV2_Direct(tmp_buf,
367 recvcnt * local_size,
368 recvtype, leader_root,
372 mpi_errno = MPIR_Gather_MV2_Direct(tmp_buf, nbytes * local_size,
373 MPI_BYTE, leader_gather_buf,
375 MPI_BYTE, leader_root,
// Last hop when the root is not a node leader: its leader ships the complete
// staging buffer (nbytes * comm_size bytes) to the root over comm.
380 if ((local_rank == 0) && (root != rank)
381 && (leader_of_root == rank)) {
382 Request::send(leader_gather_buf,
383 nbytes * comm_size, MPI_BYTE,
384 root, COLL_TAG_GATHER, comm);
387 if (rank == root && local_rank != 0) {
388 /* The root of the gather operation is not the node leader. Receive
389 * data from the node leader */
390 Request::recv(recvbuf, recvcnt * comm_size, recvtype,
391 leader_of_root, COLL_TAG_GATHER, comm,
395 /* check if multiple threads are calling this collective function */
// Temporary buffers are only ever allocated by node leaders, so only leaders
// release them.
396 if (local_rank == 0 ) {
397 if (tmp_buf != NULL) {
398 smpi_free_tmp_buffer(tmp_buf);
400 if (leader_gather_buf != NULL) {
401 smpi_free_tmp_buffer(leader_gather_buf);