/* Copyright (c) 2013-2014. The SimGrid Team.
 * All rights reserved.                                                     */

/* This program is free software; you can redistribute it and/or modify it
 * under the terms of the license (GNU LGPL) which comes with this package. */

#include "colls_private.h"
#include <math.h>
/*****************************************************************************

 * Function: smpi_coll_tuned_alltoall_3dmesh

 * Return: int

 * Inputs:
     send_buff: send input buffer
     send_count: number of elements to send
     send_type: data type of elements being sent
     recv_buff: receive output buffer
     recv_count: number of elements to receive
     recv_type: data type of elements being received
     comm: communicator

 * Descrp: Function realizes the alltoall operation using the 3dmesh
           algorithm. It performs an allgather operation along the x
           dimension, then the y dimension, then the z dimension. Each
           node then extracts the data it needs. The communication in
           each dimension is simple.

****************************************************************************/
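/* Factors num into X * Y * Z with X == Y (a "3-D mesh"), e.g.
 * 12 -> 2 x 2 x 3 and 27 -> 3 x 3 x 3. Returns 0 when no such factorization
 * exists (e.g. for 10 processes), in which case the caller bails out with
 * MPI_ERR_OTHER. */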
static int alltoall_check_is_3dmesh(int num, int *i, int *j, int *k)
{
  int x, max = num / 3;
  x = cbrt(num);
  *i = *j = *k = 0;
  while (x <= max) {
    if ((num % (x * x)) == 0) {
      *i = *j = x;
      *k = num / (x * x);
      return 1;
    }
    x++;
  }
  return 0;
}
int smpi_coll_tuned_alltoall_3dmesh(void *send_buff, int send_count,
                                    MPI_Datatype send_type,
                                    void *recv_buff, int recv_count,
                                    MPI_Datatype recv_type, MPI_Comm comm)
{
  MPI_Request *reqs, *req_ptr;
  MPI_Aint extent;
  MPI_Status status, *statuses;
  int i, j, src, dst, rank, num_procs, num_reqs, X, Y, Z, block_size, count;
  int my_z, two_dsize, my_row_base, my_col_base, my_z_base, src_row_base;
  int src_z_base, send_offset, recv_offset, tag = COLL_TAG_ALLTOALL;

  char *tmp_buff1, *tmp_buff2;

  rank = smpi_comm_rank(comm);
  num_procs = smpi_comm_size(comm);
  extent = smpi_datatype_get_extent(send_type);
  if (!alltoall_check_is_3dmesh(num_procs, &X, &Y, &Z))
    return MPI_ERR_OTHER;

  num_reqs = X;
  if (Y > X)
    num_reqs = Y;
  if (Z > Y)
    num_reqs = Z;

  two_dsize = X * Y;
  my_z = rank / two_dsize;
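  /* Lowest-numbered ranks of this node's row and column within its z-plane,
   * and of the plane itself, in the X x Y x Z mesh. */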
  my_row_base = (rank / X) * X;
  my_col_base = (rank % Y) + (my_z * two_dsize);
  my_z_base = my_z * two_dsize;
  block_size = extent * send_count;

  tmp_buff1 = (char *) xbt_malloc(block_size * num_procs * two_dsize);
  tmp_buff2 = (char *) xbt_malloc(block_size * two_dsize);

  statuses = (MPI_Status *) xbt_malloc(num_reqs * sizeof(MPI_Status));
  reqs = (MPI_Request *) xbt_malloc(num_reqs * sizeof(MPI_Request));

  req_ptr = reqs;
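  /* Phase 1: allgather along the x dimension. Every node of a row ends up
   * holding the full send buffers of its whole row in tmp_buff1, starting
   * with a local copy of its own buffer. */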
  send_offset = recv_offset = (rank % two_dsize) * block_size * num_procs;

  smpi_mpi_sendrecv(send_buff, send_count * num_procs, send_type, rank, tag,
                    tmp_buff1 + recv_offset, num_procs * recv_count,
                    recv_type, rank, tag, comm, &status);

  count = send_count * num_procs;
  for (i = 0; i < Y; i++) {
    src = i + my_row_base;
    if (src == rank)
      continue;
    recv_offset = (src % two_dsize) * block_size * num_procs;
    *(req_ptr++) = smpi_mpi_irecv(tmp_buff1 + recv_offset, count, recv_type,
                                  src, tag, comm);
  }
  for (i = 0; i < Y; i++) {
    dst = i + my_row_base;
    if (dst == rank)
      continue;
    smpi_mpi_send(send_buff, count, send_type, dst, tag, comm);
  }

  smpi_mpi_waitall(Y - 1, reqs, statuses);
  req_ptr = reqs;
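  /* Phase 2: allgather along the y dimension. Each node exchanges the row
   * data gathered in phase 1 with the other nodes of its column, so every
   * node ends up holding the send buffers of its entire z-plane. */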
  for (i = 0; i < X; i++) {
    src = (i * Y + my_col_base);
    if (src == rank)
      continue;
    src_row_base = (src / X) * X;
    recv_offset = (src_row_base % two_dsize) * block_size * num_procs;
    *(req_ptr++) = smpi_mpi_irecv(tmp_buff1 + recv_offset,
                                  recv_count * num_procs * Y,
                                  recv_type, src, tag, comm);
  }
  send_offset = (my_row_base % two_dsize) * block_size * num_procs;
  for (i = 0; i < X; i++) {
    dst = (i * Y + my_col_base);
    if (dst == rank)
      continue;
    smpi_mpi_send(tmp_buff1 + send_offset, send_count * num_procs * Y,
                  send_type, dst, tag, comm);
  }

  smpi_mpi_waitall(X - 1, reqs, statuses);
  req_ptr = reqs;
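  /* Phase 3: local extraction. Copy the blocks addressed to this rank by the
   * nodes of its own z-plane out of tmp_buff1 into the matching slots of
   * recv_buff. */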
  for (i = 0; i < two_dsize; i++) {
    send_offset = (rank * block_size) + (i * block_size * num_procs);
    recv_offset = (my_z_base * block_size) + (i * block_size);
    smpi_mpi_sendrecv(tmp_buff1 + send_offset, send_count, send_type, rank, tag,
                      (char *) recv_buff + recv_offset, recv_count, recv_type,
                      rank, tag, comm, &status);
  }
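  /* Phase 4: exchange along the z dimension. Post one receive per other
   * z-plane; each message delivers the two_dsize blocks addressed to this
   * rank by that plane. */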
  for (i = 1; i < Z; i++) {
    src = (rank + i * two_dsize) % num_procs;
    src_z_base = (src / two_dsize) * two_dsize;

    recv_offset = (src_z_base * block_size);

    *(req_ptr++) = smpi_mpi_irecv((char *) recv_buff + recv_offset,
                                  recv_count * two_dsize,
                                  recv_type, src, tag, comm);
  }
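  /* For each destination in the other planes, pack the two_dsize blocks it
   * needs (one from each node of this plane, scattered through tmp_buff1)
   * into the contiguous tmp_buff2, then send them in a single message. */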
  for (i = 1; i < Z; i++) {
    dst = (rank + i * two_dsize) % num_procs;

    recv_offset = 0;
    for (j = 0; j < two_dsize; j++) {
      send_offset = (dst + j * num_procs) * block_size;
      smpi_mpi_sendrecv(tmp_buff1 + send_offset, send_count, send_type,
                        rank, tag, tmp_buff2 + recv_offset, recv_count,
                        recv_type, rank, tag, comm, &status);
      recv_offset += block_size;
    }

    smpi_mpi_send(tmp_buff2, send_count * two_dsize, send_type, dst, tag, comm);
  }

  smpi_mpi_waitall(Z - 1, reqs, statuses);

  free(reqs);
  free(statuses);
  free(tmp_buff1);
  free(tmp_buff2);
  return MPI_SUCCESS;
}
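/* Usage sketch (an assumption, not verified against every SimGrid release):
 * SMPI selects among its alltoall implementations at run time, so this
 * variant would typically be requested with something like
 *   smpirun --cfg=smpi/alltoall:3dmesh ...
 * where "3dmesh" matches the suffix of smpi_coll_tuned_alltoall_3dmesh. */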