1 /* Copyright (c) 2013-2023. The SimGrid Team.
2 * All rights reserved. */
4 /* This program is free software; you can redistribute it and/or modify it
5 * under the terms of the license (GNU LGPL) which comes with this package. */
8 * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
9 * University Research and Technology
10 * Corporation. All rights reserved.
11 * Copyright (c) 2004-2009 The University of Tennessee and The University
12 * of Tennessee Research Foundation. All rights
14 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
15 * University of Stuttgart. All rights reserved.
16 * Copyright (c) 2004-2005 The Regents of the University of California.
17 * All rights reserved.
19 * Additional copyrights may follow
22 #include "../coll_tuned_topo.hpp"
23 #include "../colls_private.hpp"
25 namespace simgrid::smpi {
/*
 * gather__ompi_binomial
 *
 * Function: - gather over the in-order binomial tree built for (comm, root)
 * Accepts: - same arguments as MPI_Gather(); sbuf is compared against MPI_IN_PLACE
 * Returns: - an MPI error code (every visible check compares err with MPI_SUCCESS)
 *
 * Ranks are renumbered relative to the root (vrank), so that the root is vrank 0.
 * Where the gathered data lands depends on the node:
 * - root equal to 0: directly in rbuf;
 * - root different from 0: in a temporary receive buffer that is rotated into rbuf at the end;
 * - other nodes with an even vrank: in a temporary send buffer holding their own block
 *   and the data received from their children;
 * - leaf nodes: nothing is received, sbuf is sent as is.
 * Every node then receives from its children in the tree, and the data is sent
 * to the parent (tree_prev).
 */
27 int gather__ompi_binomial(const void* sbuf, int scount, MPI_Datatype sdtype, void* rbuf, int rcount,
28 MPI_Datatype rdtype, int root, MPI_Comm comm)
36 unsigned char* ptmp = nullptr;
37 unsigned char* tempbuf = nullptr;
38 const unsigned char* src_buf;
40 ompi_coll_tree_t* bmtree;
42 MPI_Aint sextent, slb, strue_lb, strue_extent;
43 MPI_Aint rextent, rlb, rtrue_lb, rtrue_extent;
49 XBT_DEBUG("smpi_coll_tuned_gather_ompi_binomial rank %d", rank);
51 /* create the binomial tree */
52 // COLL_TUNED_UPDATE_IN_ORDER_BMTREE( comm, tuned_module, root );
53 bmtree = ompi_coll_tuned_topo_build_in_order_bmtree(comm, root);
54 // data->cached_in_order_bmtree;
56 sdtype->extent(&slb, &sextent);
// NOTE(review): the true lb/extent pair is filled by the same extent() call as the
// regular pair just above, so both pairs receive the same values.
57 sdtype->extent(&strue_lb, &strue_extent);
// Rank relative to the root: the root itself gets vrank 0.
59 vrank = (rank - root + size) % size;
62 rdtype->extent(&rlb, &rextent);
// NOTE(review): as for sdtype, the true lb/extent come from a second extent() call.
63 rdtype->extent(&rtrue_lb, &rtrue_extent);
65 /* root on 0, just use the recv buffer */
66 ptmp = static_cast<unsigned char*>(rbuf);
67 if (sbuf != MPI_IN_PLACE) {
68 err = Datatype::copy(sbuf, scount, sdtype, ptmp, rcount, rdtype);
69 if (MPI_SUCCESS != err) {
75 /* root is not on 0, allocate temp buffer for recv,
76 * rotate data at the end */
// Sized for rcount * size elements of rdtype.
77 tempbuf = smpi_get_tmp_recvbuffer(rtrue_extent + (rcount * size - 1) * rextent);
78 if (nullptr == tempbuf) {
85 if (sbuf != MPI_IN_PLACE) {
86 /* copy from sbuf to temp buffer */
87 err = Datatype::copy(sbuf, scount, sdtype, ptmp, rcount, rdtype);
88 if (MPI_SUCCESS != err) {
93 /* copy from rbuf to temp buffer */
// In-place case: the local block is read from rbuf at an offset of rank * rcount elements.
94 err = Datatype::copy((char*)rbuf + rank * rextent * rcount, rcount, rdtype, ptmp, rcount, rdtype);
95 if (MPI_SUCCESS != err) {
103 } else if (!(vrank % 2)) {
104 /* other non-leaf nodes, allocate temp buffer for data received from
105 * children, the most we need is half of the total data elements due
106 * to the property of binomial tree */
// NOTE(review): the request below is sized for scount * size elements, i.e. the full
// data set rather than the half mentioned in the comment above.
107 tempbuf = smpi_get_tmp_sendbuffer(strue_extent + (scount * size - 1) * sextent);
108 if (nullptr == tempbuf) {
114 ptmp = tempbuf - slb;
115 /* local copy to tempbuf */
116 err = Datatype::copy(sbuf, scount, sdtype, ptmp, scount, sdtype);
117 if (MPI_SUCCESS != err) {
122 /* use sdtype,scount as rdtype,rdcount since they are ignored on
130 /* leaf nodes, no temp buffer needed, use sdtype,scount as
131 * rdtype,rdcount since they are ignored on non-root procs */
133 src_buf = static_cast<const unsigned char*>(sbuf);
137 /* all non-leaf nodes recv from children */
138 for (i = 0; i < bmtree->tree_nextsize; i++) {
139 int mycount = 0, vkid;
140 /* figure out how much data I have to receive from this child */
141 vkid = (bmtree->tree_next[i] - root + size) % size;
// Distance to the child in virtual ranks, capped by the number of virtual ranks
// left from the child up to the end of the communicator.
142 mycount = vkid - vrank;
143 if (mycount > (size - vkid))
144 mycount = size - vkid;
147 XBT_DEBUG("smpi_coll_tuned_gather_ompi_binomial rank %d recv %d mycount = %d", rank, bmtree->tree_next[i],
150 Request::recv(ptmp + total_recv * rextent, mycount, rdtype, bmtree->tree_next[i], COLL_TAG_GATHER, comm,
153 total_recv += mycount;
158 /* all nodes except root send to parents */
159 XBT_DEBUG("smpi_coll_tuned_gather_ompi_binomial rank %d send %d count %d\n", rank, bmtree->tree_prev, total_recv);
161 Request::send(src_buf, total_recv, sdtype, bmtree->tree_prev, COLL_TAG_GATHER, comm);
165 /* rotate received data on root if root != 0 */
// The first (size - root) blocks of ptmp go to rbuf starting at block index root;
// the remaining root blocks of ptmp go to the beginning of rbuf.
166 err = Datatype::copy(ptmp, rcount * (size - root), rdtype, (char*)rbuf + rextent * root * rcount,
167 rcount * (size - root), rdtype);
168 if (MPI_SUCCESS != err) {
173 err = Datatype::copy(ptmp + rextent * rcount * (size - root), rcount * root, rdtype, (char*)rbuf, rcount * root,
175 if (MPI_SUCCESS != err) {
180 smpi_free_tmp_buffer(tempbuf);
182 } else if (!(vrank % 2)) {
183 /* other non-leaf nodes */
184 smpi_free_tmp_buffer(tempbuf);
186 ompi_coll_tuned_topo_destroy_tree(&bmtree);
// Error path: release the temporary buffer if one was obtained, then log the failure.
// NOTE(review): no call releasing bmtree is visible on this path - confirm that the
// tree is not leaked when an error occurs.
190 if (nullptr != tempbuf)
191 smpi_free_tmp_buffer(tempbuf);
193 XBT_DEBUG("%s:%4d\tError occurred %d, rank %2d", __FILE__, line, err, rank);
198 * gather_intra_linear_sync
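200 * Function: - synchronized gather operation: the root signals each sender with a zero-byte message before receiving its data in two segments
201 * Accepts: - same arguments as MPI_Gather() (the first segment size is chosen internally from the block size)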
202 * Returns: - MPI_SUCCESS or error code
/*
 * Protocol implemented below:
 * - a first segment size is picked from the size of one block (datatype size * count):
 *   32768 when the block exceeds 92160, 1024 otherwise;
 * - a non-root rank waits for a zero-byte message from the root, then sends the
 *   first segment of its block, then the rest of the block;
 * - the root goes through the ranks one at a time: it posts the receive of the first
 *   segment, sends the zero-byte message, posts the receive of the remaining data and
 *   waits for the first segment; the remaining receives are completed by one final waitall.
 */
204 int gather__ompi_linear_sync(const void *sbuf, int scount,
206 void *rbuf, int rcount,
214 int first_segment_count;
219 int first_segment_size=0;
223 size_t dsize, block_size;
// Size of one block, computed either from the receive signature or from the send one.
// NOTE(review): presumably the first form is used on the root and the second elsewhere - confirm.
225 dsize= rdtype->size();
226 block_size = dsize * rcount;
228 dsize=sdtype->size();
229 block_size = dsize * scount;
// Larger blocks get a larger first segment.
232 if (block_size > 92160){
233 first_segment_size = 32768;
235 first_segment_size = 1024;
238 XBT_DEBUG("smpi_coll_tuned_gather_ompi_linear_sync rank %d, segment %d", rank, first_segment_size);
241 /* Non-root processes:
242 - receive zero byte message from the root,
243 - send the first segment of the data synchronously,
244 - send the second segment of the data.
247 typelng = sdtype->size();
248 sdtype->extent(&lb, &extent);
// first_segment_count starts at scount; the macro presumably lowers it to the number
// of sdtype elements fitting in first_segment_size - confirm against its definition.
249 first_segment_count = scount;
250 COLL_TUNED_COMPUTED_SEGCOUNT((size_t)first_segment_size, typelng, first_segment_count);
252 Request::recv(nullptr, 0, MPI_BYTE, root, COLL_TAG_GATHER, comm, MPI_STATUS_IGNORE);
254 Request::send(sbuf, first_segment_count, sdtype, root, COLL_TAG_GATHER, comm);
// Rest of the block: it starts first_segment_count elements into sbuf.
256 Request::send((char*)sbuf + extent * first_segment_count, (scount - first_segment_count), sdtype, root,
257 COLL_TAG_GATHER, comm);
262 - For every non-root node:
263 - post irecv for the first segment of the message
264 - send zero byte message to signal node to send the message
265 - post irecv for the second segment of the message
266 - wait for the first segment to complete
267 - Copy local data if necessary
268 - Waitall for all the second segments to complete.
271 MPI_Request first_segment_req;
// nothrow new: an allocation failure shows up as nullptr and is tested right below.
// NOTE(review): the release of reqs is not among the visible statements - confirm it is
// freed on every exit path.
272 auto* reqs = new (std::nothrow) MPI_Request[size];
273 if (nullptr == reqs) {
279 typelng=rdtype->size();
280 rdtype->extent(&lb, &extent);
281 first_segment_count = rcount;
282 COLL_TUNED_COMPUTED_SEGCOUNT( (size_t)first_segment_size, typelng,
283 first_segment_count );
285 for (i = 0; i < size; ++i) {
// NOTE(review): this slot is set to MPI_REQUEST_NULL - presumably the entry of the
// root itself, for which no receive is posted; confirm against the guarding condition.
288 reqs[i] = MPI_REQUEST_NULL;
292 /* irecv for the first segment from i */
293 ptmp = (char*)rbuf + i * rcount * extent;
294 first_segment_req = Request::irecv(ptmp, first_segment_count, rdtype, i,
295 COLL_TAG_GATHER, comm
298 /* send sync message */
299 Request::send(rbuf, 0, MPI_BYTE, i,
303 /* irecv for the second segment */
304 ptmp = (char*)rbuf + (i * rcount + first_segment_count) * extent;
305 reqs[i]=Request::irecv(ptmp, (rcount - first_segment_count),
306 rdtype, i, COLL_TAG_GATHER, comm
309 /* wait on the first segment to complete */
310 Request::wait(&first_segment_req, MPI_STATUS_IGNORE);
313 /* copy local data if necessary */
314 if (MPI_IN_PLACE != sbuf) {
315 ret = Datatype::copy(sbuf, scount, sdtype,
316 (char*)rbuf + rank * rcount * extent,
318 if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
321 /* wait all second segments to complete */
322 ret = Request::waitall(size, reqs, MPI_STATUSES_IGNORE);
323 if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
333 "ERROR_HNDL: node %d file %s line %d error %d\n",
334 rank, __FILE__, line, ret );
339 * Linear functions are copied from the BASIC coll module
340 * they do not segment the message and are simple implementations
341 * but for some small number of nodes and/or small data sizes they
342 * are just as fast as tuned/tree based segmenting operations
343 * and as such may be selected by the decision functions
344 * These are copied into this module due to the way we select modules
345 * in V1. i.e. in V2 we will handle this differently and so will not
346 * have to duplicate code.
347 * JPG following the examples from other coll_tuned implementations. Dec06.
350 /* copied function (with appropriate renaming) starts here */
354 * Function: - basic gather operation
355 * Accepts: - same arguments as MPI_Gather()
356 * Returns: - MPI_SUCCESS or error code
/*
 * Non-segmented gather: a non-root rank issues a single send of its whole block to
 * the root. The root walks rbuf one block at a time (rcount elements of rdtype per
 * rank) and fills each block either with a local copy of sbuf or with a receive
 * from the rank matching the block index.
 * NOTE(review): the condition choosing between the local copy and the receive is not
 * among the visible statements - presumably the copy is for the slot of the root itself.
 */
358 int gather__ompi_basic_linear(const void* sbuf, int scount, MPI_Datatype sdtype, void* rbuf, int rcount,
359 MPI_Datatype rdtype, int root, MPI_Comm comm)
373 /* Everyone but root sends data and returns. */
374 XBT_DEBUG("ompi_coll_tuned_gather_intra_basic_linear rank %d", rank);
377 Request::send(sbuf, scount, sdtype, root,
383 /* I am the root, loop receiving the data. */
385 rdtype->extent(&lb, &extent);
// Distance between two consecutive blocks of rbuf: rcount elements of rdtype.
386 incr = extent * rcount;
387 for (i = 0, ptmp = (char *) rbuf; i < size; ++i, ptmp += incr) {
// With MPI_IN_PLACE the copy is skipped and this branch writes nothing to ptmp.
389 if (MPI_IN_PLACE != sbuf) {
390 err = Datatype::copy(sbuf, scount, sdtype,
391 ptmp, rcount, rdtype);
396 Request::recv(ptmp, rcount, rdtype, i,
398 comm, MPI_STATUS_IGNORE);
401 if (MPI_SUCCESS != err) {
411 } // namespace simgrid::smpi