src/smpi/colls/reduce_scatter/reduce_scatter-mpich.cpp

   1 /* Copyright (c) 2013-2021. The SimGrid Team.
   2  * All rights reserved.                                                     */
   3
   4 /* This program is free software; you can redistribute it and/or modify it
   5  * under the terms of the license (GNU LGPL) which comes with this package. */
   6
   7 #include "../colls_private.hpp"
   8 #include <algorithm>
   9
  10 static inline int MPIU_Mirror_permutation(unsigned int x, int bits)
  11 {
  12     /* a mask for the high order bits that should be copied as-is */
  13     int high_mask = ~((0x1 << bits) - 1);
  14     int retval = x & high_mask;
  15     int i;
  16
  17     for (i = 0; i < bits; ++i) {
  18         unsigned int bitval = (x & (0x1 << i)) >> i; /* 0x1 or 0x0 */
  19         retval |= bitval << ((bits - i) - 1);
  20     }
  21
  22     return retval;
  23 }
  24 namespace simgrid{
  25 namespace smpi{
  26
  27 int reduce_scatter__mpich_pair(const void *sendbuf, void *recvbuf, const int recvcounts[],
  28                                MPI_Datatype datatype, MPI_Op op, MPI_Comm comm)
  29 {
  30     int   rank, comm_size, i;
  31     MPI_Aint extent, true_extent, true_lb;
  32     unsigned char* tmp_recvbuf;
  33     int mpi_errno = MPI_SUCCESS;
  34     int total_count, dst, src;
  35     comm_size = comm->size();
  36     rank = comm->rank();
  37
  38     extent =datatype->get_extent();
  39     datatype->extent(&true_lb, &true_extent);
  40
  41     bool is_commutative = (op == MPI_OP_NULL || op->is_commutative());
  42
  43     int* disps = new int[comm_size];
  44
  45     total_count = 0;
  46     for (i=0; i<comm_size; i++) {
  47         disps[i] = total_count;
  48         total_count += recvcounts[i];
  49     }
  50
  51     if (total_count == 0) {
  52       delete[] disps;
  53       return MPI_ERR_COUNT;
  54     }
  55
  56         if (sendbuf != MPI_IN_PLACE) {
  57             /* copy local data into recvbuf */
  58             Datatype::copy(((char *)sendbuf+disps[rank]*extent),
  59                                        recvcounts[rank], datatype, recvbuf,
  60                                        recvcounts[rank], datatype);
  61         }
  62
  63         /* allocate temporary buffer to store incoming data */
  64         tmp_recvbuf = smpi_get_tmp_recvbuffer(recvcounts[rank] * std::max(true_extent, extent) + 1);
  65         /* adjust for potential negative lower bound in datatype */
  66         tmp_recvbuf = tmp_recvbuf - true_lb;
  67
  68         for (i=1; i<comm_size; i++) {
  69             src = (rank - i + comm_size) % comm_size;
  70             dst = (rank + i) % comm_size;
  71
  72             /* send the data that dst needs. recv data that this process
  73                needs from src into tmp_recvbuf */
  74             if (sendbuf != MPI_IN_PLACE)
  75                 Request::sendrecv(((char *)sendbuf+disps[dst]*extent),
  76                                              recvcounts[dst], datatype, dst,
  77                                              COLL_TAG_SCATTER, tmp_recvbuf,
  78                                              recvcounts[rank], datatype, src,
  79                                              COLL_TAG_SCATTER, comm,
  80                                              MPI_STATUS_IGNORE);
  81             else
  82                 Request::sendrecv(((char *)recvbuf+disps[dst]*extent),
  83                                              recvcounts[dst], datatype, dst,
  84                                              COLL_TAG_SCATTER, tmp_recvbuf,
  85                                              recvcounts[rank], datatype, src,
  86                                              COLL_TAG_SCATTER, comm,
  87                                              MPI_STATUS_IGNORE);
  88
  89             if (is_commutative || (src < rank)) {
  90                 if (sendbuf != MPI_IN_PLACE) {
  91                   if (op != MPI_OP_NULL)
  92                     op->apply(tmp_recvbuf, recvbuf, &recvcounts[rank], datatype);
  93                 }
  94                 else {
  95                   if (op != MPI_OP_NULL)
  96                     op->apply(tmp_recvbuf, ((char*)recvbuf + disps[rank] * extent), &recvcounts[rank], datatype);
  97                   /* we can't store the result at the beginning of
  98                      recvbuf right here because there is useful data
  99                      there that other process/processes need. at the
 100                      end, we will copy back the result to the
 101                      beginning of recvbuf. */
 102                 }
 103             }
 104             else {
 105                 if (sendbuf != MPI_IN_PLACE) {
 106                   if (op != MPI_OP_NULL)
 107                     op->apply(recvbuf, tmp_recvbuf, &recvcounts[rank], datatype);
 108                   /* copy result back into recvbuf */
 109                   mpi_errno =
 110                       Datatype::copy(tmp_recvbuf, recvcounts[rank], datatype, recvbuf, recvcounts[rank], datatype);
 111                   if (mpi_errno)
 112                     return (mpi_errno);
 113                 }
 114                 else {
 115                   if (op != MPI_OP_NULL)
 116                     op->apply(((char*)recvbuf + disps[rank] * extent), tmp_recvbuf, &recvcounts[rank], datatype);
 117                   /* copy result back into recvbuf */
 118                   mpi_errno = Datatype::copy(tmp_recvbuf, recvcounts[rank], datatype,
 119                                              ((char*)recvbuf + disps[rank] * extent), recvcounts[rank], datatype);
 120                   if (mpi_errno)
 121                     return (mpi_errno);
 122                 }
 123             }
 124         }
 125
 126         /* if MPI_IN_PLACE, move output data to the beginning of
 127            recvbuf. already done for rank 0. */
 128         if ((sendbuf == MPI_IN_PLACE) && (rank != 0)) {
 129             mpi_errno = Datatype::copy(((char *)recvbuf +
 130                                         disps[rank]*extent),
 131                                        recvcounts[rank], datatype,
 132                                        recvbuf,
 133                                        recvcounts[rank], datatype );
 134             if (mpi_errno) return(mpi_errno);
 135         }
 136
 137         delete[] disps;
 138         smpi_free_tmp_buffer(tmp_recvbuf);
 139
 140         return MPI_SUCCESS;
 141 }
 142
 143
 144 int reduce_scatter__mpich_noncomm(const void *sendbuf, void *recvbuf, const int recvcounts[],
 145                                   MPI_Datatype datatype, MPI_Op op, MPI_Comm comm)
 146 {
 147     int mpi_errno = MPI_SUCCESS;
 148     int comm_size = comm->size() ;
 149     int rank = comm->rank();
 150     int pof2;
 151     int log2_comm_size;
 152     int i, k;
 153     int recv_offset, send_offset;
 154     int block_size, total_count, size;
 155     MPI_Aint true_extent, true_lb;
 156     int buf0_was_inout;
 157     unsigned char* tmp_buf0;
 158     unsigned char* tmp_buf1;
 159     unsigned char* result_ptr;
 160
 161     datatype->extent(&true_lb, &true_extent);
 162
 163     pof2 = 1;
 164     log2_comm_size = 0;
 165     while (pof2 < comm_size) {
 166         pof2 <<= 1;
 167         ++log2_comm_size;
 168     }
 169
 170     /* begin error checking */
 171     xbt_assert(pof2 == comm_size); /* FIXME this version only works for power of 2 procs */
 172
 173     for (i = 0; i < (comm_size - 1); ++i) {
 174         xbt_assert(recvcounts[i] == recvcounts[i+1]);
 175     }
 176     /* end error checking */
 177
 178     /* size of a block (count of datatype per block, NOT bytes per block) */
 179     block_size = recvcounts[0];
 180     total_count = block_size * comm_size;
 181
 182     tmp_buf0                     = smpi_get_tmp_sendbuffer(true_extent * total_count);
 183     tmp_buf1                     = smpi_get_tmp_recvbuffer(true_extent * total_count);
 184     unsigned char* tmp_buf0_save = tmp_buf0;
 185     unsigned char* tmp_buf1_save = tmp_buf1;
 186
 187     /* adjust for potential negative lower bound in datatype */
 188     tmp_buf0 = tmp_buf0 - true_lb;
 189     tmp_buf1 = tmp_buf1 - true_lb;
 190
 191     /* Copy our send data to tmp_buf0.  We do this one block at a time and
 192        permute the blocks as we go according to the mirror permutation. */
 193     for (i = 0; i < comm_size; ++i) {
 194       mpi_errno = Datatype::copy(
 195           static_cast<const char*>(sendbuf == MPI_IN_PLACE ? recvbuf : sendbuf) + (i * true_extent * block_size), block_size,
 196           datatype, tmp_buf0 + (MPIU_Mirror_permutation(i, log2_comm_size) * true_extent * block_size), block_size,
 197           datatype);
 198       if (mpi_errno)
 199         return mpi_errno;
 200     }
 201     buf0_was_inout = 1;
 202
 203     send_offset = 0;
 204     recv_offset = 0;
 205     size = total_count;
 206     for (k = 0; k < log2_comm_size; ++k) {
 207         /* use a double-buffering scheme to avoid local copies */
 208         unsigned char* incoming_data = buf0_was_inout ? tmp_buf1 : tmp_buf0;
 209         unsigned char* outgoing_data = buf0_was_inout ? tmp_buf0 : tmp_buf1;
 210         int peer = rank ^ (0x1 << k);
 211         size /= 2;
 212
 213         if (rank > peer) {
 214             /* we have the higher rank: send top half, recv bottom half */
 215             recv_offset += size;
 216         }
 217         else {
 218             /* we have the lower rank: recv top half, send bottom half */
 219             send_offset += size;
 220         }
 221
 222         Request::sendrecv(outgoing_data + send_offset*true_extent,
 223                                      size, datatype, peer, COLL_TAG_SCATTER,
 224                                      incoming_data + recv_offset*true_extent,
 225                                      size, datatype, peer, COLL_TAG_SCATTER,
 226                                      comm, MPI_STATUS_IGNORE);
 227         /* always perform the reduction at recv_offset, the data at send_offset
 228            is now our peer's responsibility */
 229         if (rank > peer) {
 230             /* higher ranked value so need to call op(received_data, my_data) */
 231             if(op!=MPI_OP_NULL) op->apply(
 232                    incoming_data + recv_offset*true_extent,
 233                      outgoing_data + recv_offset*true_extent,
 234                      &size, datatype );
 235             /* buf0_was_inout = buf0_was_inout; */
 236         }
 237         else {
 238             /* lower ranked value so need to call op(my_data, received_data) */
 239             if (op != MPI_OP_NULL)
 240               op->apply(outgoing_data + recv_offset * true_extent, incoming_data + recv_offset * true_extent, &size,
 241                         datatype);
 242             buf0_was_inout = not buf0_was_inout;
 243         }
 244
 245         /* the next round of send/recv needs to happen within the block (of size
 246            "size") that we just received and reduced */
 247         send_offset = recv_offset;
 248     }
 249
 250     xbt_assert(size == recvcounts[rank]);
 251
 252     /* copy the reduced data to the recvbuf */
 253     result_ptr = (buf0_was_inout ? tmp_buf0 : tmp_buf1) + recv_offset * true_extent;
 254     mpi_errno = Datatype::copy(result_ptr, size, datatype,
 255                                recvbuf, size, datatype);
 256     smpi_free_tmp_buffer(tmp_buf0_save);
 257     smpi_free_tmp_buffer(tmp_buf1_save);
 258     if (mpi_errno) return(mpi_errno);
 259     return MPI_SUCCESS;
 260 }
 261
 262
 263
 264 int reduce_scatter__mpich_rdb(const void *sendbuf, void *recvbuf, const int recvcounts[],
 265                               MPI_Datatype datatype, MPI_Op op, MPI_Comm comm)
 266 {
 267     int   rank, comm_size, i;
 268     MPI_Aint extent, true_extent, true_lb;
 269     int mpi_errno = MPI_SUCCESS;
 270     int dis[2], blklens[2], total_count, dst;
 271     int mask, dst_tree_root, my_tree_root, j, k;
 272     int received;
 273     MPI_Datatype sendtype, recvtype;
 274     int nprocs_completed, tmp_mask, tree_root;
 275     comm_size = comm->size();
 276     rank = comm->rank();
 277
 278     extent =datatype->get_extent();
 279     datatype->extent(&true_lb, &true_extent);
 280
 281     bool is_commutative = (op == MPI_OP_NULL || op->is_commutative());
 282
 283     int* disps = new int[comm_size];
 284
 285     total_count = 0;
 286     for (i=0; i<comm_size; i++) {
 287         disps[i] = total_count;
 288         total_count += recvcounts[i];
 289     }
 290
 291             /* noncommutative and (non-pof2 or block irregular), use recursive doubling. */
 292
 293             /* need to allocate temporary buffer to receive incoming data*/
 294     unsigned char* tmp_recvbuf = smpi_get_tmp_recvbuffer(total_count * std::max(true_extent, extent));
 295     /* adjust for potential negative lower bound in datatype */
 296     tmp_recvbuf = tmp_recvbuf - true_lb;
 297
 298     /* need to allocate another temporary buffer to accumulate
 299        results */
 300     unsigned char* tmp_results = smpi_get_tmp_sendbuffer(total_count * std::max(true_extent, extent));
 301     /* adjust for potential negative lower bound in datatype */
 302     tmp_results = tmp_results - true_lb;
 303
 304     /* copy sendbuf into tmp_results */
 305     if (sendbuf != MPI_IN_PLACE)
 306       mpi_errno = Datatype::copy(sendbuf, total_count, datatype, tmp_results, total_count, datatype);
 307     else
 308       mpi_errno = Datatype::copy(recvbuf, total_count, datatype, tmp_results, total_count, datatype);
 309
 310     if (mpi_errno)
 311       return (mpi_errno);
 312
 313     mask = 0x1;
 314     i    = 0;
 315     while (mask < comm_size) {
 316       dst = rank ^ mask;
 317
 318       dst_tree_root = dst >> i;
 319       dst_tree_root <<= i;
 320
 321       my_tree_root = rank >> i;
 322       my_tree_root <<= i;
 323
 324       /* At step 1, processes exchange (n-n/p) amount of
 325          data; at step 2, (n-2n/p) amount of data; at step 3, (n-4n/p)
 326          amount of data, and so forth. We use derived datatypes for this.
 327
 328          At each step, a process does not need to send data
 329          indexed from my_tree_root to
 330          my_tree_root+mask-1. Similarly, a process won't receive
 331          data indexed from dst_tree_root to dst_tree_root+mask-1. */
 332
 333       /* calculate sendtype */
 334       blklens[0] = blklens[1] = 0;
 335       for (j = 0; j < my_tree_root; j++)
 336         blklens[0] += recvcounts[j];
 337       for (j = my_tree_root + mask; j < comm_size; j++)
 338         blklens[1] += recvcounts[j];
 339
 340       dis[0] = 0;
 341       dis[1] = blklens[0];
 342       for (j = my_tree_root; (j < my_tree_root + mask) && (j < comm_size); j++)
 343         dis[1] += recvcounts[j];
 344
 345       mpi_errno = Datatype::create_indexed(2, blklens, dis, datatype, &sendtype);
 346       if (mpi_errno)
 347         return (mpi_errno);
 348
 349       sendtype->commit();
 350
 351       /* calculate recvtype */
 352       blklens[0] = blklens[1] = 0;
 353       for (j = 0; j < dst_tree_root && j < comm_size; j++)
 354         blklens[0] += recvcounts[j];
 355       for (j = dst_tree_root + mask; j < comm_size; j++)
 356         blklens[1] += recvcounts[j];
 357
 358       dis[0] = 0;
 359       dis[1] = blklens[0];
 360       for (j = dst_tree_root; (j < dst_tree_root + mask) && (j < comm_size); j++)
 361         dis[1] += recvcounts[j];
 362
 363       mpi_errno = Datatype::create_indexed(2, blklens, dis, datatype, &recvtype);
 364       if (mpi_errno)
 365         return (mpi_errno);
 366
 367       recvtype->commit();
 368
 369       received = 0;
 370       if (dst < comm_size) {
 371         /* tmp_results contains data to be sent in each step. Data is
 372            received in tmp_recvbuf and then accumulated into
 373            tmp_results. accumulation is done later below.   */
 374
 375         Request::sendrecv(tmp_results, 1, sendtype, dst, COLL_TAG_SCATTER, tmp_recvbuf, 1, recvtype, dst,
 376                           COLL_TAG_SCATTER, comm, MPI_STATUS_IGNORE);
 377         received = 1;
 378       }
 379
 380       /* if some processes in this process's subtree in this step
 381          did not have any destination process to communicate with
 382          because of non-power-of-two, we need to send them the
 383          result. We use a logarithmic recursive-halfing algorithm
 384          for this. */
 385
 386       if (dst_tree_root + mask > comm_size) {
 387         nprocs_completed = comm_size - my_tree_root - mask;
 388         /* nprocs_completed is the number of processes in this
 389            subtree that have all the data. Send data to others
 390            in a tree fashion. First find root of current tree
 391            that is being divided into two. k is the number of
 392            least-significant bits in this process's rank that
 393            must be zeroed out to find the rank of the root */
 394         j = mask;
 395         k = 0;
 396         while (j) {
 397           j >>= 1;
 398           k++;
 399         }
 400         k--;
 401
 402         tmp_mask = mask >> 1;
 403         while (tmp_mask) {
 404           dst = rank ^ tmp_mask;
 405
 406           tree_root = rank >> k;
 407           tree_root <<= k;
 408
 409           /* send only if this proc has data and destination
 410              doesn't have data. at any step, multiple processes
 411              can send if they have the data */
 412           if ((dst > rank) && (rank < tree_root + nprocs_completed) && (dst >= tree_root + nprocs_completed)) {
 413             /* send the current result */
 414             Request::send(tmp_recvbuf, 1, recvtype, dst, COLL_TAG_SCATTER, comm);
 415           }
 416           /* recv only if this proc. doesn't have data and sender
 417              has data */
 418           else if ((dst < rank) && (dst < tree_root + nprocs_completed) && (rank >= tree_root + nprocs_completed)) {
 419             Request::recv(tmp_recvbuf, 1, recvtype, dst, COLL_TAG_SCATTER, comm, MPI_STATUS_IGNORE);
 420             received = 1;
 421           }
 422           tmp_mask >>= 1;
 423           k--;
 424         }
 425       }
 426
 427       /* The following reduction is done here instead of after
 428          the MPIC_Sendrecv_ft or MPIC_Recv_ft above. This is
 429          because to do it above, in the noncommutative
 430          case, we would need an extra temp buffer so as not to
 431          overwrite temp_recvbuf, because temp_recvbuf may have
 432          to be communicated to other processes in the
 433          non-power-of-two case. To avoid that extra allocation,
 434          we do the reduce here. */
 435       if (received) {
 436         if (is_commutative || (dst_tree_root < my_tree_root)) {
 437           {
 438             if (op != MPI_OP_NULL)
 439               op->apply(tmp_recvbuf, tmp_results, &blklens[0], datatype);
 440             if (op != MPI_OP_NULL)
 441               op->apply(tmp_recvbuf + dis[1] * extent, tmp_results + dis[1] * extent, &blklens[1], datatype);
 442           }
 443         } else {
 444           {
 445             if (op != MPI_OP_NULL)
 446               op->apply(tmp_results, tmp_recvbuf, &blklens[0], datatype);
 447             if (op != MPI_OP_NULL)
 448               op->apply(tmp_results + dis[1] * extent, tmp_recvbuf + dis[1] * extent, &blklens[1], datatype);
 449           }
 450           /* copy result back into tmp_results */
 451           mpi_errno = Datatype::copy(tmp_recvbuf, 1, recvtype, tmp_results, 1, recvtype);
 452           if (mpi_errno)
 453             return (mpi_errno);
 454         }
 455       }
 456
 457       Datatype::unref(sendtype);
 458       Datatype::unref(recvtype);
 459
 460       mask <<= 1;
 461       i++;
 462             }
 463
 464             /* now copy final results from tmp_results to recvbuf */
 465             mpi_errno = Datatype::copy(tmp_results + disps[rank] * extent, recvcounts[rank], datatype, recvbuf,
 466                                        recvcounts[rank], datatype);
 467             if (mpi_errno) return(mpi_errno);
 468
 469     delete[] disps;
 470     smpi_free_tmp_buffer(tmp_recvbuf);
 471     smpi_free_tmp_buffer(tmp_results);
 472     return MPI_SUCCESS;
 473         }
 474 }
 475 }
 476