src/smpi/colls/reduce-NTSL.c

   1 #include "colls.h"
   2 //#include <star-reduction.c>
   3
   4 int reduce_NTSL_segment_size_in_byte = 8192;
   5
   6 /* Non-topology-specific pipelined linear-bcast function
   7    0->1, 1->2 ,2->3, ....., ->last node : in a pipeline fashion
   8 */
   9 int smpi_coll_tuned_reduce_NTSL(void *buf, void *rbuf, int count,
  10                                 MPI_Datatype datatype, MPI_Op op, int root,
  11                                 MPI_Comm comm)
  12 {
  13   int tag = 50;
  14   MPI_Status status;
  15   MPI_Request *send_request_array;
  16   MPI_Request *recv_request_array;
  17   MPI_Status *send_status_array;
  18   MPI_Status *recv_status_array;
  19   int rank, size;
  20   int i;
  21   MPI_Aint extent;
  22   MPI_Type_extent(datatype, &extent);
  23
  24   MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  25   MPI_Comm_size(MPI_COMM_WORLD, &size);
  26
  27   /* source node and destination nodes (same through out the functions) */
  28   int to = (rank - 1 + size) % size;
  29   int from = (rank + 1) % size;
  30
  31   /* segment is segment size in number of elements (not bytes) */
  32   int segment = reduce_NTSL_segment_size_in_byte / extent;
  33
  34   /* pipeline length */
  35   int pipe_length = count / segment;
  36
  37   /* use for buffer offset for sending and receiving data = segment size in byte */
  38   int increment = segment * extent;
  39
  40   /* if the input size is not divisible by segment size =>
  41      the small remainder will be done with native implementation */
  42   int remainder = count % segment;
  43
  44   /* if root is not zero send to rank zero first
  45      this can be modified to make it faster by using logical src, dst.
  46    */
  47
  48   /*
  49      if (root != 0) {
  50      if (rank == root){
  51      MPI_Send(buf,count,datatype,0,tag,comm);
  52      }
  53      else if (rank == 0) {
  54      MPI_Recv(buf,count,datatype,root,tag,comm,&status);
  55      }
  56      }
  57    */
  58
  59   char *tmp_buf;
  60   tmp_buf = (char *) malloc(count * extent);
  61
  62   MPI_Sendrecv(buf, count, datatype, rank, tag, rbuf, count, datatype, rank,
  63                tag, comm, &status);
  64
  65   /* when a message is smaller than a block size => no pipeline */
  66   if (count <= segment) {
  67     if (rank == root) {
  68       MPI_Recv(tmp_buf, count, datatype, from, tag, comm, &status);
  69       star_reduction(op, tmp_buf, rbuf, &count, &datatype);
  70     } else if (rank == ((root - 1 + size) % size)) {
  71       MPI_Send(rbuf, count, datatype, to, tag, comm);
  72     } else {
  73       MPI_Recv(tmp_buf, count, datatype, from, tag, comm, &status);
  74       star_reduction(op, tmp_buf, rbuf, &count, &datatype);
  75       MPI_Send(rbuf, count, datatype, to, tag, comm);
  76     }
  77     free(tmp_buf);
  78     return MPI_SUCCESS;
  79   }
  80
  81   /* pipeline */
  82   else {
  83     send_request_array =
  84         (MPI_Request *) malloc((size + pipe_length) * sizeof(MPI_Request));
  85     recv_request_array =
  86         (MPI_Request *) malloc((size + pipe_length) * sizeof(MPI_Request));
  87     send_status_array =
  88         (MPI_Status *) malloc((size + pipe_length) * sizeof(MPI_Status));
  89     recv_status_array =
  90         (MPI_Status *) malloc((size + pipe_length) * sizeof(MPI_Status));
  91
  92     /* root recv data */
  93     if (rank == root) {
  94       for (i = 0; i < pipe_length; i++) {
  95         MPI_Irecv((char *) tmp_buf + (i * increment), segment, datatype, from,
  96                   (tag + i), comm, &recv_request_array[i]);
  97       }
  98       for (i = 0; i < pipe_length; i++) {
  99         MPI_Wait(&recv_request_array[i], &status);
 100         star_reduction(op, tmp_buf + (i * increment), (char *)rbuf + (i * increment),
 101                        &segment, &datatype);
 102       }
 103     }
 104
 105     /* last node only sends data */
 106     else if (rank == ((root - 1 + size) % size)) {
 107       for (i = 0; i < pipe_length; i++) {
 108         MPI_Isend((char *)rbuf + (i * increment), segment, datatype, to, (tag + i),
 109                   comm, &send_request_array[i]);
 110       }
 111       MPI_Waitall((pipe_length), send_request_array, send_status_array);
 112     }
 113
 114     /* intermediate nodes relay (receive, reduce, then send) data */
 115     else {
 116       for (i = 0; i < pipe_length; i++) {
 117         MPI_Irecv((char *) tmp_buf + (i * increment), segment, datatype, from,
 118                   (tag + i), comm, &recv_request_array[i]);
 119       }
 120       for (i = 0; i < pipe_length; i++) {
 121         MPI_Wait(&recv_request_array[i], &status);
 122         star_reduction(op, tmp_buf + (i * increment), (char *)rbuf + (i * increment),
 123                        &segment, &datatype);
 124         MPI_Isend((char *) rbuf + (i * increment), segment, datatype, to,
 125                   (tag + i), comm, &send_request_array[i]);
 126       }
 127       MPI_Waitall((pipe_length), send_request_array, send_status_array);
 128     }
 129
 130     free(send_request_array);
 131     free(recv_request_array);
 132     free(send_status_array);
 133     free(recv_status_array);
 134   }                             /* end pipeline */
 135
 136   /* when count is not divisible by block size, use default BCAST for the remainder */
 137   if ((remainder != 0) && (count > segment)) {
 138     MPI_Reduce((char *)buf + (pipe_length * increment),
 139                (char *)rbuf + (pipe_length * increment), remainder, datatype, op, root,
 140                comm);
 141   }
 142
 143   free(tmp_buf);
 144   return MPI_SUCCESS;
 145 }