1 /* Copyright (c) 2013-2017. The SimGrid Team.
2 * All rights reserved. */
4 /* This program is free software; you can redistribute it and/or modify it
5 * under the terms of the license (GNU LGPL) which comes with this package. */
8 * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
9 * University Research and Technology
10 * Corporation. All rights reserved.
11 * Copyright (c) 2004-2009 The University of Tennessee and The University
12 * of Tennessee Research Foundation. All rights
14 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
15 * University of Stuttgart. All rights reserved.
16 * Copyright (c) 2004-2005 The Regents of the University of California.
17 * All rights reserved.
19 * Additional copyrights may follow
22 #include "../colls_private.h"
23 #include "../coll_tuned_topo.h"
29 int Coll_gather_ompi_binomial::gather(void *sbuf, int scount,
31 void *rbuf, int rcount,
45 ompi_coll_tree_t* bmtree;
47 MPI_Aint sextent, slb, strue_lb, strue_extent;
48 MPI_Aint rextent, rlb, rtrue_lb, rtrue_extent;
55 "smpi_coll_tuned_gather_ompi_binomial rank %d", rank);
57 /* create the binomial tree */
58 // COLL_TUNED_UPDATE_IN_ORDER_BMTREE( comm, tuned_module, root );
59 bmtree = ompi_coll_tuned_topo_build_in_order_bmtree(comm, root);
60 // data->cached_in_order_bmtree;
62 sdtype->extent(&slb, &sextent);
63 sdtype->extent(&strue_lb, &strue_extent);
65 vrank = (rank - root + size) % size;
68 rdtype->extent(&rlb, &rextent);
69 rdtype->extent(&rtrue_lb, &rtrue_extent);
71 /* root on 0, just use the recv buffer */
73 if (sbuf != MPI_IN_PLACE) {
74 err = Datatype::copy(sbuf, scount, sdtype,
75 ptmp, rcount, rdtype);
76 if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
79 /* root is not on 0, allocate temp buffer for recv,
80 * rotate data at the end */
81 tempbuf = (char *) smpi_get_tmp_recvbuffer(rtrue_extent + (rcount*size - 1) * rextent);
82 if (NULL == tempbuf) {
83 err= MPI_ERR_OTHER; line = __LINE__; goto err_hndl;
87 if (sbuf != MPI_IN_PLACE) {
88 /* copy from sbuf to temp buffer */
89 err = Datatype::copy(sbuf, scount, sdtype,
90 ptmp, rcount, rdtype);
91 if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
93 /* copy from rbuf to temp buffer */
94 err = Datatype::copy((char *) rbuf + rank*rextent*rcount, rcount, rdtype, ptmp, rcount, rdtype );
95 if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
99 } else if (!(vrank % 2)) {
100 /* other non-leaf nodes, allocate temp buffer for data received from
101 * children, the most we need is half of the total data elements due
102 * to the property of the binomial tree */
103 tempbuf = (char *) smpi_get_tmp_sendbuffer(strue_extent + (scount*size - 1) * sextent);
104 if (NULL == tempbuf) {
105 err= MPI_ERR_OTHER; line = __LINE__; goto err_hndl;
108 ptmp = tempbuf - slb;
109 /* local copy to tempbuf */
110 err = Datatype::copy(sbuf, scount, sdtype,
111 ptmp, scount, sdtype);
112 if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
114 /* use sdtype,scount as rdtype,rdcount since they are ignored on non-root procs */
121 /* leaf nodes, no temp buffer needed, use sdtype,scount as
122 * rdtype,rdcount since they are ignored on non-root procs */
123 ptmp = (char *) sbuf;
128 /* all non-leaf nodes recv from children */
129 for (i = 0; i < bmtree->tree_nextsize; i++) {
130 int mycount = 0, vkid;
131 /* figure out how much data I have to receive from this child */
132 vkid = (bmtree->tree_next[i] - root + size) % size;
133 mycount = vkid - vrank;
134 if (mycount > (size - vkid))
135 mycount = size - vkid;
139 "smpi_coll_tuned_gather_ompi_binomial rank %d recv %d mycount = %d",
140 rank, bmtree->tree_next[i], mycount);
142 Request::recv(ptmp + total_recv*rextent, mycount, rdtype,
143 bmtree->tree_next[i], COLL_TAG_GATHER,
146 total_recv += mycount;
151 /* all nodes except root send to parents */
153 "smpi_coll_tuned_gather_ompi_binomial rank %d send %d count %d\n",
154 rank, bmtree->tree_prev, total_recv);
156 Request::send(ptmp, total_recv, sdtype,
163 /* rotate received data on root if root != 0 */
164 err = Datatype::copy(ptmp, rcount*(size - root), rdtype,
165 (char *) rbuf + rextent*root*rcount, rcount*(size - root), rdtype );
166 if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
169 err = Datatype::copy( ptmp + rextent*rcount*(size-root), rcount*root,rdtype,
170 (char *) rbuf,rcount*root,rdtype);
171 if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
173 smpi_free_tmp_buffer(tempbuf);
175 } else if (!(vrank % 2)) {
176 /* other non-leaf nodes */
177 smpi_free_tmp_buffer(tempbuf);
184 smpi_free_tmp_buffer(tempbuf);
186 XBT_DEBUG( "%s:%4d\tError occurred %d, rank %2d",
187 __FILE__, line, err, rank);
192 * gather_intra_linear_sync
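194 * Function: - synchronized gather operation: the root sends each non-root process a zero-byte message before that process sends its data in two segments
195 * Accepts: - same arguments as MPI_Gather(); the first segment size is chosen internally from the block size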
196 * Returns: - MPI_SUCCESS or error code
198 int Coll_gather_ompi_linear_sync::gather(void *sbuf, int scount,
200 void *rbuf, int rcount,
208 int first_segment_count;
213 int first_segment_size=0;
217 size_t dsize, block_size;
219 dsize= rdtype->size();
220 block_size = dsize * rcount;
222 dsize=sdtype->size();
223 block_size = dsize * scount;
226 if (block_size > 92160){
227 first_segment_size = 32768;
229 first_segment_size = 1024;
233 "smpi_coll_tuned_gather_ompi_linear_sync rank %d, segment %d", rank, first_segment_size);
236 /* Non-root processes:
237 - receive zero byte message from the root,
238 - send the first segment of the data synchronously,
239 - send the second segment of the data.
242 typelng= sdtype->size();
243 sdtype->extent(&lb, &extent);
244 first_segment_count = scount;
245 COLL_TUNED_COMPUTED_SEGCOUNT( (size_t) first_segment_size, typelng,
246 first_segment_count );
248 Request::recv(sbuf, 0, MPI_BYTE, root,
250 comm, MPI_STATUS_IGNORE);
252 Request::send(sbuf, first_segment_count, sdtype, root,
256 Request::send((char*)sbuf + extent * first_segment_count,
257 (scount - first_segment_count), sdtype,
258 root, COLL_TAG_GATHER,
264 - For every non-root node:
265 - post irecv for the first segment of the message
266 - send zero byte message to signal node to send the message
267 - post irecv for the second segment of the message
268 - wait for the first segment to complete
269 - Copy local data if necessary
270 - Waitall for all the second segments to complete.
273 MPI_Request *reqs = NULL, first_segment_req;
274 reqs = (MPI_Request *) calloc(size, sizeof(MPI_Request ));
275 if (NULL == reqs) { ret = -1; line = __LINE__; goto error_hndl; }
277 typelng=rdtype->size();
278 rdtype->extent(&lb, &extent);
279 first_segment_count = rcount;
280 COLL_TUNED_COMPUTED_SEGCOUNT( (size_t)first_segment_size, typelng,
281 first_segment_count );
283 for (i = 0; i < size; ++i) {
286 reqs[i] = MPI_REQUEST_NULL;
290 /* irecv for the first segment from i */
291 ptmp = (char*)rbuf + i * rcount * extent;
292 first_segment_req = Request::irecv(ptmp, first_segment_count, rdtype, i,
293 COLL_TAG_GATHER, comm
296 /* send sync message */
297 Request::send(rbuf, 0, MPI_BYTE, i,
301 /* irecv for the second segment */
302 ptmp = (char*)rbuf + (i * rcount + first_segment_count) * extent;
303 reqs[i]=Request::irecv(ptmp, (rcount - first_segment_count),
304 rdtype, i, COLL_TAG_GATHER, comm
307 /* wait on the first segment to complete */
308 Request::wait(&first_segment_req, MPI_STATUS_IGNORE);
311 /* copy local data if necessary */
312 if (MPI_IN_PLACE != sbuf) {
313 ret = Datatype::copy(sbuf, scount, sdtype,
314 (char*)rbuf + rank * rcount * extent,
316 if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
319 /* wait for all second segments to complete */
320 ret = Request::waitall(size, reqs, MPI_STATUSES_IGNORE);
321 if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
331 "ERROR_HNDL: node %d file %s line %d error %d\n",
332 rank, __FILE__, line, ret );
337 * Linear functions are copied from the BASIC coll module
338 * they do not segment the message and are simple implementations
339 * but for some small number of nodes and/or small data sizes they
340 * are just as fast as tuned/tree based segmenting operations
341 * and as such may be selected by the decision functions
342 * These are copied into this module due to the way we select modules
343 * in V1; in V2 we will handle this differently and so will not
344 * have to duplicate code.
345 * JPG following the examples from other coll_tuned implementations. Dec06.
348 /* copied function (with appropriate renaming) starts here */
352 * Function: - basic gather operation
353 * Accepts: - same arguments as MPI_Gather()
354 * Returns: - MPI_SUCCESS or error code
356 int Coll_gather_ompi_basic_linear::gather(void *sbuf, int scount,
358 void *rbuf, int rcount,
375 /* Everyone but root sends data and returns. */
377 "ompi_coll_tuned_gather_intra_basic_linear rank %d", rank);
380 Request::send(sbuf, scount, sdtype, root,
386 /* I am the root, loop receiving the data. */
388 rdtype->extent(&lb, &extent);
389 incr = extent * rcount;
390 for (i = 0, ptmp = (char *) rbuf; i < size; ++i, ptmp += incr) {
392 if (MPI_IN_PLACE != sbuf) {
393 err = Datatype::copy(sbuf, scount, sdtype,
394 ptmp, rcount, rdtype);
399 Request::recv(ptmp, rcount, rdtype, i,
401 comm, MPI_STATUS_IGNORE);
404 if (MPI_SUCCESS != err) {