X-Git-Url: http://info.iut-bm.univ-fcomte.fr/pub/gitweb/simgrid.git/blobdiff_plain/9335baa2b23f940577448a84168eb0b50d3dd966..39c935d6d5ee86d153f6f7e6a10d723ae7c57f6f:/src/smpi/colls/allreduce/allreduce-ompi-ring-segmented.cpp

diff --git a/src/smpi/colls/allreduce/allreduce-ompi-ring-segmented.cpp b/src/smpi/colls/allreduce/allreduce-ompi-ring-segmented.cpp
index 8500d14e72..eaab673af6 100644
--- a/src/smpi/colls/allreduce/allreduce-ompi-ring-segmented.cpp
+++ b/src/smpi/colls/allreduce/allreduce-ompi-ring-segmented.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2013-2017. The SimGrid Team.
+/* Copyright (c) 2013-2021. The SimGrid Team.
  * All rights reserved.                                                     */
 
 /* This program is free software; you can redistribute it and/or modify it
@@ -62,24 +62,24 @@
  *   Accepts:      Same as MPI_Allreduce(), segment size
  *   Returns:      MPI_SUCCESS or error code
  *
- *   Description:  Implements pipelined ring algorithm for allreduce:
+ *   Description:  Implements pipelined ring algorithm for allreduce:
  *                 user supplies suggested segment size for the pipelining of
  *                 reduce operation.
- *                 The segment size determines the number of phases, np, for
- *                 the algorithm execution.
- *                 The message is automatically divided into blocks of
+ *                 The segment size determines the number of phases, np, for
+ *                 the algorithm execution.
+ *                 The message is automatically divided into blocks of
  *                 approximately (count / (np * segcount)) elements.
- *                 At the end of reduction phase, allgather like step is
+ *                 At the end of reduction phase, allgather like step is
  *                 executed.
  *                 Algorithm requires (np + 1)*(N - 1) steps.
  *
- *   Limitations:  The algorithm DOES NOT preserve order of operations so it
+ *   Limitations:  The algorithm DOES NOT preserve order of operations so it
  *                 can be used only for commutative operations.
- *                 In addition, algorithm cannot work if the total size is
+ *                 In addition, algorithm cannot work if the total size is
  *                 less than size * segment size.
  *         Example on 3 nodes with 2 phases
  *         Initial state
- *   #      0              1             2
+ *   #      0              1             2
  *        [00a]          [10a]         [20a]
  *        [00b]          [10b]         [20b]
  *        [01a]          [11a]         [21a]
@@ -88,9 +88,9 @@
  *        [02b]          [12b]         [22b]
  *
  *        COMPUTATION PHASE 0 (a)
- *         Step 0: rank r sends block ra to rank (r+1) and receives bloc (r-1)a
+ *         Step 0: rank r sends block ra to rank (r+1) and receives block (r-1)a
  *                 from rank (r-1) [with wraparound].
- *   #      0              1             2
+ *   #      0              1             2
  *        [00a]        [00a+10a]       [20a]
  *        [00b]          [10b]         [20b]
  *        [01a]          [11a]       [11a+21a]
@@ -98,20 +98,20 @@
  *      [22a+02a]        [12a]         [22a]
  *        [02b]          [12b]         [22b]
  *
- *         Step 1: rank r sends block (r-1)a to rank (r+1) and receives bloc
+ *         Step 1: rank r sends block (r-1)a to rank (r+1) and receives block
  *                 (r-2)a from rank (r-1) [with wraparound].
- *   #      0              1             2
+ *   #      0              1             2
  *        [00a]        [00a+10a]   [00a+10a+20a]
  *        [00b]          [10b]         [20b]
  *    [11a+21a+01a]      [11a]       [11a+21a]
  *        [01b]          [11b]         [21b]
  *      [22a+02a]    [22a+02a+12a]     [22a]
- *        [02b]          [12b]         [22b]
+ *        [02b]          [12b]         [22b]
  *
  *        COMPUTATION PHASE 1 (b)
- *         Step 0: rank r sends block rb to rank (r+1) and receives bloc (r-1)b
+ *         Step 0: rank r sends block rb to rank (r+1) and receives block (r-1)b
  *                 from rank (r-1) [with wraparound].
- *   #      0              1             2
+ *   #      0              1             2
  *        [00a]        [00a+10a]       [20a]
  *        [00b]        [00b+10b]       [20b]
  *        [01a]          [11a]       [11a+21a]
@@ -119,22 +119,22 @@
  *      [22a+02a]        [12a]         [22a]
  *      [22b+02b]        [12b]         [22b]
  *
- *         Step 1: rank r sends block (r-1)b to rank (r+1) and receives bloc
+ *         Step 1: rank r sends block (r-1)b to rank (r+1) and receives block
  *                 (r-2)b from rank (r-1) [with wraparound].
- *   #      0              1             2
+ *   #      0              1             2
  *        [00a]        [00a+10a]   [00a+10a+20a]
  *        [00b]          [10b]     [0bb+10b+20b]
  *    [11a+21a+01a]      [11a]       [11a+21a]
  *    [11b+21b+01b]      [11b]         [21b]
  *      [22a+02a]    [22a+02a+12a]     [22a]
- *        [02b]      [22b+01b+12b]     [22b]
+ *        [02b]      [22b+01b+12b]     [22b]
+ *
  *
- *
  *        DISTRIBUTION PHASE: ring ALLGATHER with ranks shifted by 1 (same as
  *         in regular ring algorithm.
  *
  */
-
+
 #define COLL_TUNED_COMPUTED_SEGCOUNT(SEGSIZE, TYPELNG, SEGCOUNT)        \
     if( ((SEGSIZE) >= (TYPELNG)) &&                                     \
         ((SEGSIZE) < ((TYPELNG) * (SEGCOUNT))) ) {                      \
@@ -144,7 +144,7 @@
         if( residual > ((TYPELNG) >> 1) )                               \
             (SEGCOUNT)++;                                               \
     }                                                                   \
-
+
 #define COLL_TUNED_COMPUTE_BLOCKCOUNT( COUNT, NUM_BLOCKS, SPLIT_INDEX,  \
                                        EARLY_BLOCK_COUNT, LATE_BLOCK_COUNT ) \
     EARLY_BLOCK_COUNT = LATE_BLOCK_COUNT = COUNT / NUM_BLOCKS;          \
@@ -153,29 +153,29 @@
         EARLY_BLOCK_COUNT = EARLY_BLOCK_COUNT + 1;                      \
     }                                                                   \
 
-#include "../colls_private.h"
-namespace simgrid{
-namespace smpi{
-int
-Coll_allreduce_ompi_ring_segmented::allreduce(void *sbuf, void *rbuf, int count,
-                                              MPI_Datatype dtype,
-                                              MPI_Op op,
-                                              MPI_Comm comm)
+#include "../colls_private.hpp"
+
+namespace simgrid {
+namespace smpi {
+int allreduce__ompi_ring_segmented(const void *sbuf, void *rbuf, int count,
+                                   MPI_Datatype dtype,
+                                   MPI_Op op,
+                                   MPI_Comm comm)
 {
    int ret = MPI_SUCCESS;
    int line;
    int k, recv_from, send_to;
-   int early_blockcount, late_blockcount, split_rank;
+   int early_blockcount, late_blockcount, split_rank;
    int segcount, max_segcount;
    int num_phases, phase;
    int block_count;
    unsigned int inbi;
    size_t typelng;
-   char *tmpsend = NULL, *tmprecv = NULL;
-   char *inbuf[2] = {NULL, NULL};
+   char *tmpsend = nullptr, *tmprecv = nullptr;
+   unsigned char* inbuf[2] = {nullptr, nullptr};
    ptrdiff_t true_extent, extent;
    ptrdiff_t block_offset, max_real_segsize;
-   MPI_Request reqs[2] = {NULL, NULL};
+   MPI_Request reqs[2] = {nullptr, nullptr};
    const size_t segsize = 1 << 20; /* 1 MB */
    int size = comm->size();
    int rank = comm->rank();
@@ -190,7 +190,7 @@ Coll_allreduce_ompi_ring_segmented::allreduce(void *sbuf, void *rbuf, int count,
       }
       return MPI_SUCCESS;
    }
-
+
    /* Determine segment count based on the suggested segment size */
    extent = dtype->get_extent();
    if (MPI_SUCCESS != ret)  { line = __LINE__; goto error_hndl; }
@@ -204,38 +204,45 @@ Coll_allreduce_ompi_ring_segmented::allreduce(void *sbuf, void *rbuf, int count,
    /* Special case for count less than size * segcount - use regular ring */
    if (count < size * segcount) {
       XBT_DEBUG( "coll:tuned:allreduce_ring_segmented rank %d/%d, count %d, switching to regular ring", rank, size, count);
-      return (Coll_allreduce_lr::allreduce(sbuf, rbuf, count, dtype, op,
-                                           comm));
+      return (allreduce__lr(sbuf, rbuf, count, dtype, op, comm));
    }
 
    /* Determine the number of phases of the algorithm */
    num_phases = count / (size * segcount);
-   if ((count % (size * segcount) >= size) &&
+   if ((count % (size * segcount) >= size) &&
        (count % (size * segcount) > ((size * segcount) / 2))) {
       num_phases++;
    }
 
-   /* Determine the number of elements per block and corresponding
+   /* Determine the number of elements per block and corresponding
       block sizes. The blocks are divided into "early" and "late" ones:
-      blocks 0 .. (split_rank - 1) are "early" and
+      blocks 0 .. (split_rank - 1) are "early" and
       blocks (split_rank) .. (size - 1) are "late".
       Early blocks are at most 1 element larger than the late ones.
       Note, these blocks will be split into num_phases segments,
      out of the largest one will have max_segcount elements.
    */
-   COLL_TUNED_COMPUTE_BLOCKCOUNT( count, size, split_rank,
+   COLL_TUNED_COMPUTE_BLOCKCOUNT( count, size, split_rank,
                                   early_blockcount, late_blockcount )
    COLL_TUNED_COMPUTE_BLOCKCOUNT( early_blockcount, num_phases, inbi,
                                   max_segcount, k)
    max_real_segsize = true_extent + (max_segcount - 1) * extent;
 
    /* Allocate and initialize temporary buffers */
-   inbuf[0] = (char*)smpi_get_tmp_sendbuffer(max_real_segsize);
-   if (NULL == inbuf[0]) { ret = -1; line = __LINE__; goto error_hndl; }
+   inbuf[0] = smpi_get_tmp_sendbuffer(max_real_segsize);
+   if (nullptr == inbuf[0]) {
+     ret = -1;
+     line = __LINE__;
+     goto error_hndl;
+   }
    if (size > 2) {
-      inbuf[1] = (char*)smpi_get_tmp_recvbuffer(max_real_segsize);
-      if (NULL == inbuf[1]) { ret = -1; line = __LINE__; goto error_hndl; }
+      inbuf[1] = smpi_get_tmp_recvbuffer(max_real_segsize);
+      if (nullptr == inbuf[1]) {
+        ret = -1;
+        line = __LINE__;
+        goto error_hndl;
+      }
    }
 
    /* Handle MPI_IN_PLACE */
@@ -249,7 +256,7 @@ Coll_allreduce_ompi_ring_segmented::allreduce(void *sbuf, void *rbuf, int count,
       ptrdiff_t phase_offset;
       int early_phase_segcount, late_phase_segcount, split_phase, phase_count;
 
-      /*
+      /*
        For each of the remote nodes:
        - post irecv for block (r-1)
        - send block (r)
@@ -263,12 +270,12 @@ Coll_allreduce_ompi_ring_segmented::allreduce(void *sbuf, void *rbuf, int count,
        - wait on block (r + 1)
        - compute on block (r + 1)
        - send block (r + 1) to rank (r + 1)
-       Note that we must be careful when computing the begining of buffers and
+       Note that we must be careful when computing the beginning of buffers and
        for send operations and computation we must compute the exact block size.
       */
      send_to = (rank + 1) % size;
      recv_from = (rank + size - 1) % size;
-
+
      inbi = 0;
      /* Initialize first receive from the neighbor on the left */
      reqs[inbi] = Request::irecv(inbuf[inbi], max_segcount, dtype, recv_from,
@@ -276,8 +283,8 @@ Coll_allreduce_ompi_ring_segmented::allreduce(void *sbuf, void *rbuf, int count,
      /* Send first block (my block) to the neighbor on the right:
         - compute my block and phase offset
        - send data */
-     block_offset = ((rank < split_rank)?
-                     (rank * early_blockcount) :
+     block_offset = ((rank < split_rank)?
+                     (rank * early_blockcount) :
                      (rank * late_blockcount + split_rank));
      block_count = ((rank < split_rank)? early_blockcount : late_blockcount);
      COLL_TUNED_COMPUTE_BLOCKCOUNT(block_count, num_phases, split_phase,
@@ -285,39 +292,39 @@ Coll_allreduce_ompi_ring_segmented::allreduce(void *sbuf, void *rbuf, int count,
      phase_count = ((phase < split_phase)?
                     (early_phase_segcount) : (late_phase_segcount));
      phase_offset = ((phase < split_phase)?
-                     (phase * early_phase_segcount) :
+                     (phase * early_phase_segcount) :
                      (phase * late_phase_segcount + split_phase));
      tmpsend = ((char*)rbuf) + (block_offset + phase_offset) * extent;
      Request::send(tmpsend, phase_count, dtype, send_to, 666, comm);
-
+
      for (k = 2; k < size; k++) {
         const int prevblock = (rank + size - k + 1) % size;
-
+
         inbi = inbi ^ 0x1;
-
+
         /* Post irecv for the current block */
         reqs[inbi] = Request::irecv(inbuf[inbi], max_segcount, dtype, recv_from,
                                     666, comm);
        if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
-
+
        /* Wait on previous block to arrive */
        Request::wait(&reqs[inbi ^ 0x1], MPI_STATUS_IGNORE);
-
+
        /* Apply operation on previous block: result goes to rbuf
           rbuf[prevblock] = inbuf[inbi ^ 0x1] (op) rbuf[prevblock]
        */
        block_offset = ((prevblock < split_rank)?
                        (prevblock * early_blockcount) :
                        (prevblock * late_blockcount + split_rank));
-       block_count = ((prevblock < split_rank)?
+       block_count = ((prevblock < split_rank)?
                       early_blockcount : late_blockcount);
        COLL_TUNED_COMPUTE_BLOCKCOUNT(block_count, num_phases, split_phase,
                                      early_phase_segcount, late_phase_segcount)
        phase_count = ((phase < split_phase)?
                      (early_phase_segcount) : (late_phase_segcount));
        phase_offset = ((phase < split_phase)?
-                      (phase * early_phase_segcount) :
+                      (phase * early_phase_segcount) :
                       (phase * late_phase_segcount + split_phase));
        tmprecv = ((char*)rbuf) + (block_offset + phase_offset) * extent;
        if(op!=MPI_OP_NULL) op->apply( inbuf[inbi ^ 0x1], tmprecv, &phase_count, dtype);
@@ -325,25 +332,25 @@ Coll_allreduce_ompi_ring_segmented::allreduce(void *sbuf, void *rbuf, int count,
        Request::send(tmprecv, phase_count, dtype, send_to, 666, comm);
     }
-
+
     /* Wait on the last block to arrive */
     Request::wait(&reqs[inbi], MPI_STATUS_IGNORE);
-
-    /* Apply operation on the last block (from neighbor (rank + 1)
+
+    /* Apply operation on the last block (from neighbor (rank + 1)
       rbuf[rank+1] = inbuf[inbi] (op) rbuf[rank + 1] */
     recv_from = (rank + 1) % size;
     block_offset = ((recv_from < split_rank)?
                     (recv_from * early_blockcount) :
                     (recv_from * late_blockcount + split_rank));
-    block_count = ((recv_from < split_rank)?
+    block_count = ((recv_from < split_rank)?
                    early_blockcount : late_blockcount);
    COLL_TUNED_COMPUTE_BLOCKCOUNT(block_count, num_phases, split_phase,
                                  early_phase_segcount, late_phase_segcount)
    phase_count = ((phase < split_phase)?
                   (early_phase_segcount) : (late_phase_segcount));
    phase_offset = ((phase < split_phase)?
-                  (phase * early_phase_segcount) :
+                  (phase * early_phase_segcount) :
                   (phase * late_phase_segcount + split_phase));
    tmprecv = ((char*)rbuf) + (block_offset + phase_offset) * extent;
    if(op!=MPI_OP_NULL) op->apply( inbuf[inbi], tmprecv, &phase_count, dtype);
@@ -355,15 +362,15 @@ Coll_allreduce_ompi_ring_segmented::allreduce(void *sbuf, void *rbuf, int count,
    for (k = 0; k < size - 1; k++) {
       const int recv_data_from = (rank + size - k) % size;
       const int send_data_from = (rank + 1 + size - k) % size;
-      const int send_block_offset =
+      const int send_block_offset =
         ((send_data_from < split_rank)?
          (send_data_from * early_blockcount) :
          (send_data_from * late_blockcount + split_rank));
-      const int recv_block_offset =
+      const int recv_block_offset =
         ((recv_data_from < split_rank)?
          (recv_data_from * early_blockcount) :
         (recv_data_from * late_blockcount + split_rank));
-      block_count = ((send_data_from < split_rank)?
+      block_count = ((send_data_from < split_rank)?
                      early_blockcount : late_blockcount);
 
      tmprecv = (char*)rbuf + recv_block_offset * extent;
@@ -377,16 +384,16 @@ Coll_allreduce_ompi_ring_segmented::allreduce(void *sbuf, void *rbuf, int count,
    }
 
-   if (NULL != inbuf[0]) smpi_free_tmp_buffer(inbuf[0]);
-   if (NULL != inbuf[1]) smpi_free_tmp_buffer(inbuf[1]);
+   smpi_free_tmp_buffer(inbuf[0]);
+   smpi_free_tmp_buffer(inbuf[1]);
 
    return MPI_SUCCESS;
 
  error_hndl:
    XBT_DEBUG("%s:%4d\tRank %d Error occurred %d\n", __FILE__, line, rank, ret);
-   if (NULL != inbuf[0]) smpi_free_tmp_buffer(inbuf[0]);
-   if (NULL != inbuf[1]) smpi_free_tmp_buffer(inbuf[1]);
+   smpi_free_tmp_buffer(inbuf[0]);
+   smpi_free_tmp_buffer(inbuf[1]);
    return ret;
 }
 }