X-Git-Url: http://info.iut-bm.univ-fcomte.fr/pub/gitweb/simgrid.git/blobdiff_plain/727c068cd59669e79c6779f9ff93ec4ac91f7522..a2f1b23687f04169144f4ffb4f20dc4fc5c28395:/src/smpi/colls/bcast-NTSB.c diff --git a/src/smpi/colls/bcast-NTSB.c b/src/smpi/colls/bcast-NTSB.c new file mode 100644 index 0000000000..f7a7e4e73d --- /dev/null +++ b/src/smpi/colls/bcast-NTSB.c @@ -0,0 +1,178 @@ +#include "colls.h" + +int bcast_NTSB_segment_size_in_byte = 8192; + +int smpi_coll_tuned_bcast_NTSB(void *buf, int count, MPI_Datatype datatype, + int root, MPI_Comm comm) +{ + int tag = 5000; + MPI_Status status; + int rank, size; + int i; + + MPI_Request *send_request_array; + MPI_Request *recv_request_array; + MPI_Status *send_status_array; + MPI_Status *recv_status_array; + + MPI_Aint extent; + MPI_Type_extent(datatype, &extent); + + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &size); + + /* source node and destination nodes (same through out the functions) */ + int from = (rank - 1) / 2; + int to_left = rank * 2 + 1; + int to_right = rank * 2 + 2; + if (to_left >= size) + to_left = -1; + if (to_right >= size) + to_right = -1; + + /* segment is segment size in number of elements (not bytes) */ + int segment = bcast_NTSB_segment_size_in_byte / extent; + + /* pipeline length */ + int pipe_length = count / segment; + + /* use for buffer offset for sending and receiving data = segment size in byte */ + int increment = segment * extent; + + /* if the input size is not divisible by segment size => + the small remainder will be done with native implementation */ + int remainder = count % segment; + + /* if root is not zero send to rank zero first */ + if (root != 0) { + if (rank == root) { + MPI_Send(buf, count, datatype, 0, tag, comm); + } else if (rank == 0) { + MPI_Recv(buf, count, datatype, root, tag, comm, &status); + } + } + + /* when a message is smaller than a block size => no pipeline */ + if (count <= segment) { + + /* case: root */ + if (rank == 0) { + /* case root has only a left child */ + if (to_right == -1) { + MPI_Send(buf, count, datatype, to_left, tag, comm); + } + /* case root has both left and right children */ + else { + MPI_Send(buf, count, datatype, to_left, tag, comm); + MPI_Send(buf, count, datatype, to_right, tag, comm); + } + } + + /* case: leaf ==> receive only */ + else if (to_left == -1) { + MPI_Recv(buf, count, datatype, from, tag, comm, &status); + } + + /* case: intermidiate node with only left child ==> relay message */ + else if (to_right == -1) { + MPI_Recv(buf, count, datatype, from, tag, comm, &status); + MPI_Send(buf, count, datatype, to_left, tag, comm); + } + + /* case: intermidiate node with both left and right children ==> relay message */ + else { + MPI_Recv(buf, count, datatype, from, tag, comm, &status); + MPI_Send(buf, count, datatype, to_left, tag, comm); + MPI_Send(buf, count, datatype, to_right, tag, comm); + } + return MPI_SUCCESS; + } + // pipelining + else { + + send_request_array = + (MPI_Request *) malloc(2 * (size + pipe_length) * sizeof(MPI_Request)); + recv_request_array = + (MPI_Request *) malloc((size + pipe_length) * sizeof(MPI_Request)); + send_status_array = + (MPI_Status *) malloc(2 * (size + pipe_length) * sizeof(MPI_Status)); + recv_status_array = + (MPI_Status *) malloc((size + pipe_length) * sizeof(MPI_Status)); + + + + /* case: root */ + if (rank == 0) { + /* case root has only a left child */ + if (to_right == -1) { + for (i = 0; i < pipe_length; i++) { + MPI_Isend((char *) buf + (i * increment), segment, datatype, to_left, + tag + i, comm, &send_request_array[i]); + } + MPI_Waitall((pipe_length), send_request_array, send_status_array); + } + /* case root has both left and right children */ + else { + for (i = 0; i < pipe_length; i++) { + MPI_Isend((char *) buf + (i * increment), segment, datatype, to_left, + tag + i, comm, &send_request_array[i]); + MPI_Isend((char *) buf + (i * increment), segment, datatype, to_right, + tag + i, comm, &send_request_array[i + pipe_length]); + } + MPI_Waitall((2 * pipe_length), send_request_array, send_status_array); + } + } + + /* case: leaf ==> receive only */ + else if (to_left == -1) { + for (i = 0; i < pipe_length; i++) { + MPI_Irecv((char *) buf + (i * increment), segment, datatype, from, + tag + i, comm, &recv_request_array[i]); + } + MPI_Waitall((pipe_length), recv_request_array, recv_status_array); + } + + /* case: intermidiate node with only left child ==> relay message */ + else if (to_right == -1) { + for (i = 0; i < pipe_length; i++) { + MPI_Irecv((char *) buf + (i * increment), segment, datatype, from, + tag + i, comm, &recv_request_array[i]); + } + for (i = 0; i < pipe_length; i++) { + MPI_Wait(&recv_request_array[i], &status); + MPI_Isend((char *) buf + (i * increment), segment, datatype, to_left, + tag + i, comm, &send_request_array[i]); + } + MPI_Waitall(pipe_length, send_request_array, send_status_array); + + } + /* case: intermidiate node with both left and right children ==> relay message */ + else { + for (i = 0; i < pipe_length; i++) { + MPI_Irecv((char *) buf + (i * increment), segment, datatype, from, + tag + i, comm, &recv_request_array[i]); + } + for (i = 0; i < pipe_length; i++) { + MPI_Wait(&recv_request_array[i], &status); + MPI_Isend((char *) buf + (i * increment), segment, datatype, to_left, + tag + i, comm, &send_request_array[i]); + MPI_Isend((char *) buf + (i * increment), segment, datatype, to_right, + tag + i, comm, &send_request_array[i + pipe_length]); + } + MPI_Waitall((2 * pipe_length), send_request_array, send_status_array); + } + + free(send_request_array); + free(recv_request_array); + free(send_status_array); + free(recv_status_array); + } /* end pipeline */ + + /* when count is not divisible by block size, use default BCAST for the remainder */ + if ((remainder != 0) && (count > segment)) { + MPI_Bcast((char *) buf + (pipe_length * increment), remainder, datatype, + root, comm); + } + + return MPI_SUCCESS; +}