1 /* Copyright (c) 2013-2014. The SimGrid Team.
2 * All rights reserved. */
4 /* This program is free software; you can redistribute it and/or modify it
5 * under the terms of the license (GNU LGPL) which comes with this package. */
7 #include "colls_private.h"
9 #ifndef BCAST_ARRIVAL_PATTERN_AWARE_HEADER_SIZE
10 #define BCAST_ARRIVAL_PATTERN_AWARE_HEADER_SIZE 128
13 #ifndef BCAST_ARRIVAL_PATTERN_AWARE_MAX_NODE
14 #define BCAST_ARRIVAL_PATTERN_AWARE_MAX_NODE 128
17 /* Non-topology-specific pipelined linear-bcast function */
/* Arrival-pattern-aware broadcast ("arrival scatter"):
 * the root scatters distinct segments of the buffer to the subset of ranks
 * that have signalled their arrival (a 1-byte ready message), and each such
 * subset then rebuilds the full buffer with a ring allgather among itself.
 * NOTE(review): this excerpt is sampled — the embedded original line numbers
 * jump, so several declarations, guards and closing braces are not visible.
 * Comments below describe only what the visible lines establish. */
18 int smpi_coll_tuned_bcast_arrival_scatter(void *buf, int count,
19 MPI_Datatype datatype, int root,
/* Negative tag so that positive tags stay usable with MPI_ANY_TAG probing. */
22 int tag = -COLL_TAG_BCAST;//in order to use ANY_TAG, make this one positive
/* Per-peer bookkeeping, indexed by rank (bounded by the compile-time limits
 * below — presumably a fallback exists for size > max_node; not visible). */
34 MPI_Status temp_status_array[BCAST_ARRIVAL_PATTERN_AWARE_MAX_NODE];
/* flag_array[i]: iprobe hit for rank i; already_sent[i]: root already served
 * rank i; will_send[i]: rank i arrived this round and is served next. */
41 int flag_array[BCAST_ARRIVAL_PATTERN_AWARE_MAX_NODE];
42 int already_sent[BCAST_ARRIVAL_PATTERN_AWARE_MAX_NODE];
/* header_buf: list of arrived ranks sent to the group, terminated by -1. */
43 int header_buf[BCAST_ARRIVAL_PATTERN_AWARE_HEADER_SIZE];
/* temp_buf: sink/source for the 1-byte "I have arrived" messages. */
44 char temp_buf[BCAST_ARRIVAL_PATTERN_AWARE_MAX_NODE];
45 int will_send[BCAST_ARRIVAL_PATTERN_AWARE_MAX_NODE];
46 int max_node = BCAST_ARRIVAL_PATTERN_AWARE_MAX_NODE;
47 int header_size = BCAST_ARRIVAL_PATTERN_AWARE_HEADER_SIZE;
/* Datatype extent, used to turn element counts into byte offsets into buf. */
50 extent = smpi_datatype_get_extent(datatype);
53 /* source and destination */
56 rank = smpi_comm_rank(comm);
57 size = smpi_comm_size(comm);
59 /* message too small */
/* Fallback: delegate to the default broadcast (the triggering size/count
 * threshold check is on a line not visible in this excerpt). */
61 XBT_WARN("MPI_bcast_arrival_scatter use default MPI_bcast.");
62 smpi_mpi_bcast(buf, count, datatype, root, comm);
68 /* if root is not zero send to rank zero first
69 this can be modified to make it faster by using logical src, dst.
/* Relocate the data to rank 0, which acts as the effective root below. */
73 smpi_mpi_send(buf, count, datatype, 0, tag - 1, comm);
74 } else if (rank == 0) {
75 smpi_mpi_recv(buf, count, datatype, root, tag - 1, comm, &status);
80 /* value == 0 means root has not sent data (or header) to the node yet */
81 for (i = 0; i < max_node; i++) {
90 for (i = 0; i < max_node; i++)
/* === Root side: loop until every other rank has been served. === */
94 while (sent_count < (size - 1)) {
/* Up to 3 probe rounds per iteration to detect newly arrived ranks. */
96 for (k = 0; k < 3; k++) {
97 for (i = 1; i < size; i++) {
98 if ((already_sent[i] == 0) && (will_send[i] == 0)) {
99 smpi_mpi_iprobe(i, MPI_ANY_TAG, comm, &flag_array[i],
100 &temp_status_array[i]);
101 if (flag_array[i] == 1) {
/* Consume the 1-byte arrival message from rank i. */
103 smpi_mpi_recv(&temp_buf[i], 1, MPI_CHAR, i, tag, comm,
112 /* recv 1-byte message in this round */
/* Collect this round's arrivals into header_buf, in rank order. */
113 for (i = 1; i < size; i++) {
115 if ((will_send[i] == 1) && (already_sent[i] == 0)) {
116 header_buf[header_index] = i;
120 /* will send in the next step */
/* NOTE(review): debug output left enabled on stdout — consider XBT_DEBUG. */
126 if (header_index != 0) {
127 printf("header index = %d node = ",header_index);
128 for (i=0;i<header_index;i++) {
129 printf("%d ",header_buf[i]);
135 /* send header followed by data */
136 if (header_index != 0) {
/* -1 terminates the rank list so receivers can count their group size. */
137 header_buf[header_index] = -1;
140 for (i = 0; i < header_index; i++) {
142 smpi_mpi_send(header_buf, header_size, MPI_INT, to, header_tag, comm);
/* Split count into header_index equal chunks; remainder goes to the last. */
145 curr_remainder = count % header_index;
146 curr_size = (count / header_index);
147 curr_increment = curr_size * extent;
151 for (i = 0; i < header_index; i++) {
/* curr_size == 0 (count < header_index) also forces the remainder in —
 * presumably so at least one rank gets the whole payload; verify. */
153 if ((i == (header_index - 1)) || (curr_size == 0))
154 curr_size += curr_remainder;
155 //printf("Root send to %d index %d\n",to,(i*curr_increment));
156 smpi_mpi_send((char *) buf + (i * curr_increment), curr_size, datatype, to,
160 } /* while (sent_count < size-1) */
/* === Non-root side === */
166 /* send 1-byte message to root */
167 smpi_mpi_send(temp_buf, 1, MPI_CHAR, 0, tag, comm);
169 /* wait for header forward when required */
170 smpi_mpi_recv(header_buf, header_size, MPI_INT, 0, header_tag, comm, &status);
172 /* search for where it is */
/* myordering = this rank's position within its arrival group. */
174 while (rank != header_buf[myordering]) {
/* total_nodes = group size, found by scanning for the -1 terminator. */
179 while (header_buf[total_nodes] != -1) {
/* Same segment geometry the root used, but over the group size. */
183 curr_remainder = count % total_nodes;
184 curr_size = (count / total_nodes);
185 curr_increment = curr_size * extent;
186 int recv_size = curr_size;
/* The last group member's segment also carries the remainder. */
189 if (myordering == (total_nodes - 1))
190 recv_size += curr_remainder;
/* Receive this rank's own segment from the root (rank 0). */
191 smpi_mpi_recv((char *) buf + (myordering * curr_increment), recv_size, datatype,
192 0, tag, comm, &status);
194 /* at this point all nodes in this set perform all-gather operation */
/* Ring neighbours within the group; wrap-around handling for the first and
 * last member is only partially visible in this excerpt. */
195 to = header_buf[myordering + 1];
196 from = header_buf[myordering - 1];
198 from = header_buf[total_nodes - 1];
199 if (myordering == (total_nodes - 1))
203 /* last segment may have a larger size since it also includes the remainder */
204 int last_segment_ptr = (total_nodes - 1) * (count / total_nodes) * extent;
/* Ring allgather: total_nodes-1 steps; at step i each member forwards the
 * segment it obtained at step i-1 and receives the previous member's one. */
208 for (i = 0; i < total_nodes - 1; i++) {
210 ((myordering - i + total_nodes) % total_nodes) * curr_increment;
212 ((myordering - i - 1 + total_nodes) % total_nodes) * curr_increment;
/* Segments located at last_segment_ptr carry the remainder elements. */
215 if (send_offset != last_segment_ptr)
216 send_count = curr_size;
218 send_count = curr_size + curr_remainder;
220 if (recv_offset != last_segment_ptr)
221 recv_count = curr_size;
223 recv_count = curr_size + curr_remainder;
225 //printf("\t\tnode %d sent_to %d recv_from %d send_size %d recv_size %d\n",rank,to,from,send_count,recv_count);
226 //printf("\tnode %d sent_offset %d send_count %d\n",rank,send_offset,send_count);
/* Per-step tag (tag + i) keeps consecutive ring steps from matching. */
229 smpi_mpi_sendrecv((char *) buf + send_offset, send_count, datatype, to,
230 tag + i, (char *) buf + recv_offset, recv_count, datatype,
231 from, tag + i, comm, &status);