src/smpi/colls/reduce/reduce-mvapich-two-level.cpp

   1 /* Copyright (c) 2013-2017. The SimGrid Team.
   2  * All rights reserved.                                                     */
   3
   4 /* This program is free software; you can redistribute it and/or modify it
   5  * under the terms of the license (GNU LGPL) which comes with this package. */
   6
   7 /*
   8  * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
   9  *                         University Research and Technology
  10  *                         Corporation.  All rights reserved.
  11  * Copyright (c) 2004-2009 The University of Tennessee and The University
  12  *                         of Tennessee Research Foundation.  All rights
  13  *                         reserved.
  14  * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
  15  *                         University of Stuttgart.  All rights reserved.
  16  * Copyright (c) 2004-2005 The Regents of the University of California.
  17  *                         All rights reserved.
  18  *
  19  * Additional copyrights may follow
  20  */
  21  /* -*- Mode: C; c-basic-offset:4 ; -*- */
  22 /* Copyright (c) 2001-2014, The Ohio State University. All rights
  23  * reserved.
  24  *
  25  * This file is part of the MVAPICH2 software package developed by the
  26  * team members of The Ohio State University's Network-Based Computing
  27  * Laboratory (NBCL), headed by Professor Dhabaleswar K. (DK) Panda.
  28  *
  29  * For detailed copyright and licensing information, please refer to the
  30  * copyright file COPYRIGHT in the top level MVAPICH2 directory.
  31  */
  32 /*
  33  *
  34  *  (C) 2001 by Argonne National Laboratory.
  35  *      See COPYRIGHT in top-level directory.
  36  */
  37
  38 #include "../colls_private.h"
  39 #define MV2_INTRA_SHMEM_REDUCE_MSG 2048
  40
  41 #define mv2_g_shmem_coll_max_msg_size (1 << 17)
  42 #define SHMEM_COLL_BLOCK_SIZE (local_size * mv2_g_shmem_coll_max_msg_size)
  43 #define mv2_use_knomial_reduce 1
  44
  45 #define MPIR_Reduce_inter_knomial_wrapper_MV2 Coll_reduce_mvapich2_knomial::reduce
  46 #define MPIR_Reduce_intra_knomial_wrapper_MV2 Coll_reduce_mvapich2_knomial::reduce
  47 #define MPIR_Reduce_binomial_MV2 Coll_reduce_binomial::reduce
  48 #define MPIR_Reduce_redscat_gather_MV2 Coll_reduce_scatter_gather::reduce
  49 #define MPIR_Reduce_shmem_MV2 Coll_reduce_ompi_basic_linear::reduce
  50
  51 extern int (*MV2_Reduce_function)( void *sendbuf,
  52     void *recvbuf,
  53     int count,
  54     MPI_Datatype datatype,
  55     MPI_Op op,
  56     int root,
  57     MPI_Comm  comm_ptr);
  58
  59 extern int (*MV2_Reduce_intra_function)( void *sendbuf,
  60     void *recvbuf,
  61     int count,
  62     MPI_Datatype datatype,
  63     MPI_Op op,
  64     int root,
  65     MPI_Comm  comm_ptr);
  66
  67
  68 /*Fn pointers for collectives */
  69 static int (*reduce_fn)(void *sendbuf,
  70                              void *recvbuf,
  71                              int count,
  72                              MPI_Datatype datatype,
  73                              MPI_Op op, int root, MPI_Comm  comm);
  74 namespace simgrid{
  75 namespace smpi{
  76 int Coll_reduce_mvapich2_two_level::reduce( void *sendbuf,
  77                                      void *recvbuf,
  78                                      int count,
  79                                      MPI_Datatype datatype,
  80                                      MPI_Op op,
  81                                      int root,
  82                                      MPI_Comm comm)
  83 {
  84     int mpi_errno = MPI_SUCCESS;
  85     int my_rank, total_size, local_rank, local_size;
  86     int leader_comm_rank = -1, leader_comm_size = 0;
  87     MPI_Comm shmem_comm, leader_comm;
  88     int leader_root, leader_of_root;
  89     void *in_buf = NULL, *out_buf = NULL, *tmp_buf = NULL;
  90     MPI_Aint true_lb, true_extent, extent;
  91     int is_commutative = 0, stride = 0;
  92     int intra_node_root=0;
  93
  94     //if not set (use of the algo directly, without mvapich2 selector)
  95     if(MV2_Reduce_function==NULL)
  96       MV2_Reduce_function=Coll_reduce_mpich::reduce;
  97     if(MV2_Reduce_intra_function==NULL)
  98       MV2_Reduce_intra_function=Coll_reduce_mpich::reduce;
  99
 100     if(comm->get_leaders_comm()==MPI_COMM_NULL){
 101       comm->init_smp();
 102     }
 103
 104     my_rank = comm->rank();
 105     total_size = comm->size();
 106     shmem_comm = comm->get_intra_comm();
 107     local_rank = shmem_comm->rank();
 108     local_size = shmem_comm->size();
 109
 110     leader_comm = comm->get_leaders_comm();
 111     int* leaders_map = comm->get_leaders_map();
 112     leader_of_root = comm->group()->rank(leaders_map[root]);
 113     leader_root = leader_comm->group()->rank(leaders_map[root]);
 114
 115     is_commutative= (op==MPI_OP_NULL || op->is_commutative());
 116
 117     datatype->extent(&true_lb,
 118                                        &true_extent);
 119     extent =datatype->get_extent();
 120     stride = count * MAX(extent, true_extent);
 121
 122     if (local_size == total_size) {
 123         /* First handle the case where there is only one node */
 124         if (stride <= MV2_INTRA_SHMEM_REDUCE_MSG &&
 125             is_commutative == 1) {
 126             if (local_rank == 0 ) {
 127                 tmp_buf=(void *)smpi_get_tmp_sendbuffer( count *
 128                                     (MAX(extent, true_extent)));
 129                 tmp_buf = (void *) ((char *) tmp_buf - true_lb);
 130             }
 131
 132             if (sendbuf != MPI_IN_PLACE) {
 133                 in_buf = (void *)sendbuf;
 134             } else {
 135                 in_buf = recvbuf;
 136             }
 137
 138             if (local_rank == 0) {
 139                  if( my_rank != root) {
 140                      out_buf = tmp_buf;
 141                  } else {
 142                      out_buf = recvbuf;
 143                      if(in_buf == out_buf) {
 144                         in_buf = MPI_IN_PLACE;
 145                         out_buf = recvbuf;
 146                      }
 147                  }
 148             } else {
 149                 in_buf  = (void *)sendbuf;
 150                 out_buf = NULL;
 151             }
 152
 153             if (count * (MAX(extent, true_extent)) < SHMEM_COLL_BLOCK_SIZE) {
 154               mpi_errno = MPIR_Reduce_shmem_MV2(in_buf, out_buf, count, datatype, op, 0, shmem_comm);
 155             } else {
 156               mpi_errno = MPIR_Reduce_intra_knomial_wrapper_MV2(in_buf, out_buf, count, datatype, op, 0, shmem_comm);
 157             }
 158
 159             if (local_rank == 0 && root != my_rank) {
 160                 Request::send(out_buf, count, datatype, root,
 161                                          COLL_TAG_REDUCE+1, comm);
 162             }
 163             if ((local_rank != 0) && (root == my_rank)) {
 164                 Request::recv(recvbuf, count, datatype,
 165                                          leader_of_root, COLL_TAG_REDUCE+1, comm,
 166                                          MPI_STATUS_IGNORE);
 167             }
 168         } else {
 169             if(mv2_use_knomial_reduce == 1) {
 170                 reduce_fn = &MPIR_Reduce_intra_knomial_wrapper_MV2;
 171             } else {
 172                 reduce_fn = &MPIR_Reduce_binomial_MV2;
 173             }
 174             mpi_errno = reduce_fn(sendbuf, recvbuf, count,
 175                                   datatype, op,
 176                                   root, comm);
 177         }
 178         /* We are done */
 179         if(tmp_buf!=NULL)
 180           smpi_free_tmp_buffer((void *) ((char *) tmp_buf + true_lb));
 181         goto fn_exit;
 182     }
 183
 184
 185     if (local_rank == 0) {
 186         leader_comm = comm->get_leaders_comm();
 187         if(leader_comm==MPI_COMM_NULL){
 188           leader_comm = MPI_COMM_WORLD;
 189         }
 190         leader_comm_size = leader_comm->size();
 191         leader_comm_rank = leader_comm->rank();
 192         tmp_buf=(void *)smpi_get_tmp_sendbuffer(count *
 193                             (MAX(extent, true_extent)));
 194         tmp_buf = (void *) ((char *) tmp_buf - true_lb);
 195     }
 196     if (sendbuf != MPI_IN_PLACE) {
 197         in_buf = (void *)sendbuf;
 198     } else {
 199         in_buf = recvbuf;
 200     }
 201     if (local_rank == 0) {
 202         out_buf = tmp_buf;
 203     } else {
 204         out_buf = NULL;
 205     }
 206
 207
 208     if(local_size > 1) {
 209         /* Lets do the intra-node reduce operations, if we have more than one
 210          * process in the node */
 211
 212         /*Fix the input and outbuf buffers for the intra-node reduce.
 213          *Node leaders will have the reduced data in tmp_buf after
 214          *this step*/
 215         if (MV2_Reduce_intra_function == & MPIR_Reduce_shmem_MV2)
 216         {
 217           if (is_commutative == 1 && (count * (MAX(extent, true_extent)) < SHMEM_COLL_BLOCK_SIZE)) {
 218             mpi_errno = MV2_Reduce_intra_function(in_buf, out_buf, count, datatype, op, intra_node_root, shmem_comm);
 219             } else {
 220                     mpi_errno = MPIR_Reduce_intra_knomial_wrapper_MV2(in_buf, out_buf, count,
 221                                       datatype, op,
 222                                       intra_node_root, shmem_comm);
 223             }
 224         } else {
 225
 226             mpi_errno = MV2_Reduce_intra_function(in_buf, out_buf, count,
 227                                       datatype, op,
 228                                       intra_node_root, shmem_comm);
 229         }
 230     } else {
 231         smpi_free_tmp_buffer((void *) ((char *) tmp_buf + true_lb));
 232         tmp_buf = in_buf;
 233     }
 234
 235     /* Now work on the inter-leader phase. Data is in tmp_buf */
 236     if (local_rank == 0 && leader_comm_size > 1) {
 237         /*The leader of root will have the global reduced data in tmp_buf
 238            or recv_buf
 239            at the end of the reduce */
 240         if (leader_comm_rank == leader_root) {
 241             if (my_rank == root) {
 242                 /* I am the root of the leader-comm, and the
 243                  * root of the reduce op. So, I will write the
 244                  * final result directly into my recvbuf */
 245                 if(tmp_buf != recvbuf) {
 246                     in_buf = tmp_buf;
 247                     out_buf = recvbuf;
 248                 } else {
 249
 250                      in_buf = (char *)smpi_get_tmp_sendbuffer(count*
 251                                        datatype->get_extent());
 252                      Datatype::copy(tmp_buf, count, datatype,
 253                                         in_buf, count, datatype);
 254                     //in_buf = MPI_IN_PLACE;
 255                     out_buf = recvbuf;
 256                 }
 257             } else {
 258                 in_buf = (char *)smpi_get_tmp_sendbuffer(count*
 259                                        datatype->get_extent());
 260                 Datatype::copy(tmp_buf, count, datatype,
 261                                         in_buf, count, datatype);
 262                 //in_buf = MPI_IN_PLACE;
 263                 out_buf = tmp_buf;
 264             }
 265         } else {
 266             in_buf = tmp_buf;
 267             out_buf = NULL;
 268         }
 269
 270         /* inter-leader communication  */
 271         mpi_errno = MV2_Reduce_function(in_buf, out_buf, count,
 272                               datatype, op,
 273                               leader_root, leader_comm);
 274
 275     }
 276
 277     if (local_size > 1) {
 278         /* Send the message to the root if the leader is not the
 279          * root of the reduce operation. The reduced data is in tmp_buf */
 280         if ((local_rank == 0) && (root != my_rank)
 281             && (leader_root == leader_comm_rank)) {
 282             Request::send(tmp_buf, count, datatype, root,
 283                                      COLL_TAG_REDUCE+1, comm);
 284         }
 285         if ((local_rank != 0) && (root == my_rank)) {
 286             Request::recv(recvbuf, count, datatype,
 287                                      leader_of_root,
 288                                      COLL_TAG_REDUCE+1, comm,
 289                                      MPI_STATUS_IGNORE);
 290         }
 291       smpi_free_tmp_buffer((void *) ((char *) tmp_buf + true_lb));
 292
 293       if (leader_comm_rank == leader_root) {
 294         if (my_rank != root || (my_rank == root && tmp_buf == recvbuf)) {
 295           smpi_free_tmp_buffer(in_buf);
 296         }
 297       }
 298     }
 299
 300
 301
 302   fn_exit:
 303     return mpi_errno;
 304 }
 305 }
 306 }