src/smpi/colls/reduce/reduce-mvapich-two-level.cpp

   1 /* Copyright (c) 2013-2018. The SimGrid Team.
   2  * All rights reserved.                                                     */
   3
   4 /* This program is free software; you can redistribute it and/or modify it
   5  * under the terms of the license (GNU LGPL) which comes with this package. */
   6
   7 /*
   8  * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
   9  *                         University Research and Technology
  10  *                         Corporation.  All rights reserved.
  11  * Copyright (c) 2004-2009 The University of Tennessee and The University
  12  *                         of Tennessee Research Foundation.  All rights
  13  *                         reserved.
  14  * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
  15  *                         University of Stuttgart.  All rights reserved.
  16  * Copyright (c) 2004-2005 The Regents of the University of California.
  17  *                         All rights reserved.
  18  *
  19  * Additional copyrights may follow
  20  */
  21  /* -*- Mode: C; c-basic-offset:4 ; -*- */
  22 /* Copyright (c) 2001-2014, The Ohio State University. All rights
  23  * reserved.
  24  *
  25  * This file is part of the MVAPICH2 software package developed by the
  26  * team members of The Ohio State University's Network-Based Computing
  27  * Laboratory (NBCL), headed by Professor Dhabaleswar K. (DK) Panda.
  28  *
  29  * For detailed copyright and licensing information, please refer to the
  30  * copyright file COPYRIGHT in the top level MVAPICH2 directory.
  31  */
  32 /*
  33  *
  34  *  (C) 2001 by Argonne National Laboratory.
  35  *      See COPYRIGHT in top-level directory.
  36  */
  37
  38 #include "../colls_private.hpp"
  39 #include <algorithm>
  40
  41 #define MV2_INTRA_SHMEM_REDUCE_MSG 2048
  42
  43 #define mv2_g_shmem_coll_max_msg_size (1 << 17)
  44 #define SHMEM_COLL_BLOCK_SIZE (local_size * mv2_g_shmem_coll_max_msg_size)
  45 #define mv2_use_knomial_reduce 1
  46
  47 #define MPIR_Reduce_inter_knomial_wrapper_MV2 Coll_reduce_mvapich2_knomial::reduce
  48 #define MPIR_Reduce_intra_knomial_wrapper_MV2 Coll_reduce_mvapich2_knomial::reduce
  49 #define MPIR_Reduce_binomial_MV2 Coll_reduce_binomial::reduce
  50 #define MPIR_Reduce_redscat_gather_MV2 Coll_reduce_scatter_gather::reduce
  51 #define MPIR_Reduce_shmem_MV2 Coll_reduce_ompi_basic_linear::reduce
  52
  53 extern int (*MV2_Reduce_function)( void *sendbuf,
  54     void *recvbuf,
  55     int count,
  56     MPI_Datatype datatype,
  57     MPI_Op op,
  58     int root,
  59     MPI_Comm  comm_ptr);
  60
  61 extern int (*MV2_Reduce_intra_function)( void *sendbuf,
  62     void *recvbuf,
  63     int count,
  64     MPI_Datatype datatype,
  65     MPI_Op op,
  66     int root,
  67     MPI_Comm  comm_ptr);
  68
  69
  70 /*Fn pointers for collectives */
  71 static int (*reduce_fn)(void *sendbuf,
  72                              void *recvbuf,
  73                              int count,
  74                              MPI_Datatype datatype,
  75                              MPI_Op op, int root, MPI_Comm  comm);
  76 namespace simgrid{
  77 namespace smpi{
  78 int Coll_reduce_mvapich2_two_level::reduce( void *sendbuf,
  79                                      void *recvbuf,
  80                                      int count,
  81                                      MPI_Datatype datatype,
  82                                      MPI_Op op,
  83                                      int root,
  84                                      MPI_Comm comm)
  85 {
  86     int mpi_errno = MPI_SUCCESS;
  87     int my_rank, total_size, local_rank, local_size;
  88     int leader_comm_rank = -1, leader_comm_size = 0;
  89     MPI_Comm shmem_comm, leader_comm;
  90     int leader_root, leader_of_root;
  91     void *in_buf = NULL, *out_buf = NULL, *tmp_buf = NULL;
  92     MPI_Aint true_lb, true_extent, extent;
  93     int is_commutative = 0, stride = 0;
  94     int intra_node_root=0;
  95
  96     //if not set (use of the algo directly, without mvapich2 selector)
  97     if(MV2_Reduce_function==NULL)
  98       MV2_Reduce_function=Coll_reduce_mpich::reduce;
  99     if(MV2_Reduce_intra_function==NULL)
 100       MV2_Reduce_intra_function=Coll_reduce_mpich::reduce;
 101
 102     if(comm->get_leaders_comm()==MPI_COMM_NULL){
 103       comm->init_smp();
 104     }
 105
 106     my_rank = comm->rank();
 107     total_size = comm->size();
 108     shmem_comm = comm->get_intra_comm();
 109     local_rank = shmem_comm->rank();
 110     local_size = shmem_comm->size();
 111
 112     leader_comm = comm->get_leaders_comm();
 113     int* leaders_map = comm->get_leaders_map();
 114     leader_of_root = comm->group()->rank(leaders_map[root]);
 115     leader_root = leader_comm->group()->rank(leaders_map[root]);
 116
 117     is_commutative= (op==MPI_OP_NULL || op->is_commutative());
 118
 119     datatype->extent(&true_lb,
 120                                        &true_extent);
 121     extent =datatype->get_extent();
 122     stride = count * std::max(extent, true_extent);
 123
 124     if (local_size == total_size) {
 125         /* First handle the case where there is only one node */
 126         if (stride <= MV2_INTRA_SHMEM_REDUCE_MSG &&
 127             is_commutative == 1) {
 128             if (local_rank == 0 ) {
 129                 tmp_buf = (void*)smpi_get_tmp_sendbuffer(count * std::max(extent, true_extent));
 130                 tmp_buf = (void *) ((char *) tmp_buf - true_lb);
 131             }
 132
 133             if (sendbuf != MPI_IN_PLACE) {
 134                 in_buf = (void *)sendbuf;
 135             } else {
 136                 in_buf = recvbuf;
 137             }
 138
 139             if (local_rank == 0) {
 140                  if( my_rank != root) {
 141                      out_buf = tmp_buf;
 142                  } else {
 143                      out_buf = recvbuf;
 144                      if(in_buf == out_buf) {
 145                         in_buf = MPI_IN_PLACE;
 146                         out_buf = recvbuf;
 147                      }
 148                  }
 149             } else {
 150                 in_buf  = (void *)sendbuf;
 151                 out_buf = NULL;
 152             }
 153
 154             if (count * (std::max(extent, true_extent)) < SHMEM_COLL_BLOCK_SIZE) {
 155               mpi_errno = MPIR_Reduce_shmem_MV2(in_buf, out_buf, count, datatype, op, 0, shmem_comm);
 156             } else {
 157               mpi_errno = MPIR_Reduce_intra_knomial_wrapper_MV2(in_buf, out_buf, count, datatype, op, 0, shmem_comm);
 158             }
 159
 160             if (local_rank == 0 && root != my_rank) {
 161                 Request::send(out_buf, count, datatype, root,
 162                                          COLL_TAG_REDUCE+1, comm);
 163             }
 164             if ((local_rank != 0) && (root == my_rank)) {
 165                 Request::recv(recvbuf, count, datatype,
 166                                          leader_of_root, COLL_TAG_REDUCE+1, comm,
 167                                          MPI_STATUS_IGNORE);
 168             }
 169         } else {
 170             if(mv2_use_knomial_reduce == 1) {
 171                 reduce_fn = &MPIR_Reduce_intra_knomial_wrapper_MV2;
 172             } else {
 173                 reduce_fn = &MPIR_Reduce_binomial_MV2;
 174             }
 175             mpi_errno = reduce_fn(sendbuf, recvbuf, count,
 176                                   datatype, op,
 177                                   root, comm);
 178         }
 179         /* We are done */
 180         if(tmp_buf!=NULL)
 181           smpi_free_tmp_buffer((void *) ((char *) tmp_buf + true_lb));
 182         goto fn_exit;
 183     }
 184
 185
 186     if (local_rank == 0) {
 187         leader_comm = comm->get_leaders_comm();
 188         if(leader_comm==MPI_COMM_NULL){
 189           leader_comm = MPI_COMM_WORLD;
 190         }
 191         leader_comm_size = leader_comm->size();
 192         leader_comm_rank = leader_comm->rank();
 193         tmp_buf          = (void*)smpi_get_tmp_sendbuffer(count * std::max(extent, true_extent));
 194         tmp_buf = (void *) ((char *) tmp_buf - true_lb);
 195     }
 196     if (sendbuf != MPI_IN_PLACE) {
 197         in_buf = (void *)sendbuf;
 198     } else {
 199         in_buf = recvbuf;
 200     }
 201     if (local_rank == 0) {
 202         out_buf = tmp_buf;
 203     } else {
 204         out_buf = NULL;
 205     }
 206
 207
 208     if(local_size > 1) {
 209         /* Lets do the intra-node reduce operations, if we have more than one
 210          * process in the node */
 211
 212         /*Fix the input and outbuf buffers for the intra-node reduce.
 213          *Node leaders will have the reduced data in tmp_buf after
 214          *this step*/
 215         if (MV2_Reduce_intra_function == & MPIR_Reduce_shmem_MV2)
 216         {
 217           if (is_commutative == 1 && (count * (std::max(extent, true_extent)) < SHMEM_COLL_BLOCK_SIZE)) {
 218             mpi_errno = MV2_Reduce_intra_function(in_buf, out_buf, count, datatype, op, intra_node_root, shmem_comm);
 219             } else {
 220                     mpi_errno = MPIR_Reduce_intra_knomial_wrapper_MV2(in_buf, out_buf, count,
 221                                       datatype, op,
 222                                       intra_node_root, shmem_comm);
 223             }
 224         } else {
 225
 226             mpi_errno = MV2_Reduce_intra_function(in_buf, out_buf, count,
 227                                       datatype, op,
 228                                       intra_node_root, shmem_comm);
 229         }
 230     } else {
 231         smpi_free_tmp_buffer((void *) ((char *) tmp_buf + true_lb));
 232         tmp_buf = in_buf;
 233     }
 234
 235     /* Now work on the inter-leader phase. Data is in tmp_buf */
 236     if (local_rank == 0 && leader_comm_size > 1) {
 237         /*The leader of root will have the global reduced data in tmp_buf
 238            or recv_buf
 239            at the end of the reduce */
 240         if (leader_comm_rank == leader_root) {
 241             if (my_rank == root) {
 242                 /* I am the root of the leader-comm, and the
 243                  * root of the reduce op. So, I will write the
 244                  * final result directly into my recvbuf */
 245                 if(tmp_buf != recvbuf) {
 246                     in_buf = tmp_buf;
 247                     out_buf = recvbuf;
 248                 } else {
 249
 250                      in_buf = (char *)smpi_get_tmp_sendbuffer(count*
 251                                        datatype->get_extent());
 252                      Datatype::copy(tmp_buf, count, datatype,
 253                                         in_buf, count, datatype);
 254                     //in_buf = MPI_IN_PLACE;
 255                     out_buf = recvbuf;
 256                 }
 257             } else {
 258                 in_buf = (char *)smpi_get_tmp_sendbuffer(count*
 259                                        datatype->get_extent());
 260                 Datatype::copy(tmp_buf, count, datatype,
 261                                         in_buf, count, datatype);
 262                 //in_buf = MPI_IN_PLACE;
 263                 out_buf = tmp_buf;
 264             }
 265         } else {
 266             in_buf = tmp_buf;
 267             out_buf = NULL;
 268         }
 269
 270         /* inter-leader communication  */
 271         mpi_errno = MV2_Reduce_function(in_buf, out_buf, count,
 272                               datatype, op,
 273                               leader_root, leader_comm);
 274
 275     }
 276
 277     if (local_size > 1) {
 278         /* Send the message to the root if the leader is not the
 279          * root of the reduce operation. The reduced data is in tmp_buf */
 280         if ((local_rank == 0) && (root != my_rank)
 281             && (leader_root == leader_comm_rank)) {
 282             Request::send(tmp_buf, count, datatype, root,
 283                                      COLL_TAG_REDUCE+1, comm);
 284         }
 285         if ((local_rank != 0) && (root == my_rank)) {
 286             Request::recv(recvbuf, count, datatype,
 287                                      leader_of_root,
 288                                      COLL_TAG_REDUCE+1, comm,
 289                                      MPI_STATUS_IGNORE);
 290         }
 291       smpi_free_tmp_buffer((void *) ((char *) tmp_buf + true_lb));
 292
 293       if (leader_comm_rank == leader_root) {
 294         if (my_rank != root || (my_rank == root && tmp_buf == recvbuf)) {
 295           smpi_free_tmp_buffer(in_buf);
 296         }
 297       }
 298     }
 299
 300
 301
 302   fn_exit:
 303     return mpi_errno;
 304 }
 305 }
 306 }