src/smpi/colls/reduce-mvapich-two-level.cpp

   1 /* Copyright (c) 2013-2014. The SimGrid Team.
   2  * All rights reserved.                                                     */
   3
   4 /* This program is free software; you can redistribute it and/or modify it
   5  * under the terms of the license (GNU LGPL) which comes with this package. */
   6
   7 /*
   8  * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
   9  *                         University Research and Technology
  10  *                         Corporation.  All rights reserved.
  11  * Copyright (c) 2004-2009 The University of Tennessee and The University
  12  *                         of Tennessee Research Foundation.  All rights
  13  *                         reserved.
  14  * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
  15  *                         University of Stuttgart.  All rights reserved.
  16  * Copyright (c) 2004-2005 The Regents of the University of California.
  17  *                         All rights reserved.
  18  *
  19  * Additional copyrights may follow
  20  */
  21  /* -*- Mode: C; c-basic-offset:4 ; -*- */
  22 /* Copyright (c) 2001-2014, The Ohio State University. All rights
  23  * reserved.
  24  *
  25  * This file is part of the MVAPICH2 software package developed by the
  26  * team members of The Ohio State University's Network-Based Computing
  27  * Laboratory (NBCL), headed by Professor Dhabaleswar K. (DK) Panda.
  28  *
  29  * For detailed copyright and licensing information, please refer to the
  30  * copyright file COPYRIGHT in the top level MVAPICH2 directory.
  31  */
  32 /*
  33  *
  34  *  (C) 2001 by Argonne National Laboratory.
  35  *      See COPYRIGHT in top-level directory.
  36  */
  37
  38 #include "colls_private.h"
  39 #include "src/smpi/smpi_group.hpp"
  40 #define MV2_INTRA_SHMEM_REDUCE_MSG 2048
  41
  42 #define mv2_g_shmem_coll_max_msg_size (1 << 17)
  43 #define SHMEM_COLL_BLOCK_SIZE (local_size * mv2_g_shmem_coll_max_msg_size)
  44 #define mv2_use_knomial_reduce 1
  45
  46 #define MPIR_Reduce_inter_knomial_wrapper_MV2 smpi_coll_tuned_reduce_mvapich2_knomial
  47 #define MPIR_Reduce_intra_knomial_wrapper_MV2 smpi_coll_tuned_reduce_mvapich2_knomial
  48 #define MPIR_Reduce_binomial_MV2 smpi_coll_tuned_reduce_binomial
  49 #define MPIR_Reduce_redscat_gather_MV2 smpi_coll_tuned_reduce_scatter_gather
  50 #define MPIR_Reduce_shmem_MV2 smpi_coll_tuned_reduce_ompi_basic_linear
  51
  52 extern int (*MV2_Reduce_function)( void *sendbuf,
  53     void *recvbuf,
  54     int count,
  55     MPI_Datatype datatype,
  56     MPI_Op op,
  57     int root,
  58     MPI_Comm  comm_ptr);
  59
  60 extern int (*MV2_Reduce_intra_function)( void *sendbuf,
  61     void *recvbuf,
  62     int count,
  63     MPI_Datatype datatype,
  64     MPI_Op op,
  65     int root,
  66     MPI_Comm  comm_ptr);
  67
  68
  69 /*Fn pointers for collectives */
  70 static int (*reduce_fn)(void *sendbuf,
  71                              void *recvbuf,
  72                              int count,
  73                              MPI_Datatype datatype,
  74                              MPI_Op op, int root, MPI_Comm  comm);
  75
  76 int smpi_coll_tuned_reduce_mvapich2_two_level( void *sendbuf,
  77                                      void *recvbuf,
  78                                      int count,
  79                                      MPI_Datatype datatype,
  80                                      MPI_Op op,
  81                                      int root,
  82                                      MPI_Comm comm)
  83 {
  84     int mpi_errno = MPI_SUCCESS;
  85     int my_rank, total_size, local_rank, local_size;
  86     int leader_comm_rank = -1, leader_comm_size = 0;
  87     MPI_Comm shmem_comm, leader_comm;
  88     int leader_root, leader_of_root;
  89     void *in_buf = NULL, *out_buf = NULL, *tmp_buf = NULL;
  90     MPI_Aint true_lb, true_extent, extent;
  91     int is_commutative = 0, stride = 0;
  92     int intra_node_root=0;
  93
  94     //if not set (use of the algo directly, without mvapich2 selector)
  95     if(MV2_Reduce_function==NULL)
  96       MV2_Reduce_function=smpi_coll_tuned_reduce_mpich;
  97     if(MV2_Reduce_intra_function==NULL)
  98       MV2_Reduce_intra_function=smpi_coll_tuned_reduce_mpich;
  99
 100     if(smpi_comm_get_leaders_comm(comm)==MPI_COMM_NULL){
 101       smpi_comm_init_smp(comm);
 102     }
 103
 104     my_rank = smpi_comm_rank(comm);
 105     total_size = smpi_comm_size(comm);
 106     shmem_comm = smpi_comm_get_intra_comm(comm);
 107     local_rank = smpi_comm_rank(shmem_comm);
 108     local_size = smpi_comm_size(shmem_comm);
 109
 110     leader_comm = smpi_comm_get_leaders_comm(comm);
 111     int* leaders_map = smpi_comm_get_leaders_map(comm);
 112     leader_of_root = smpi_comm_group(comm)->rank(leaders_map[root]);
 113     leader_root = smpi_comm_group(leader_comm)->rank(leaders_map[root]);
 114
 115     is_commutative=smpi_op_is_commute(op);
 116
 117     smpi_datatype_extent(datatype, &true_lb,
 118                                        &true_extent);
 119     extent =smpi_datatype_get_extent(datatype);
 120     stride = count * MAX(extent, true_extent);
 121
 122     if (local_size == total_size) {
 123         /* First handle the case where there is only one node */
 124         if (stride <= MV2_INTRA_SHMEM_REDUCE_MSG &&
 125             is_commutative == 1) {
 126             if (local_rank == 0 ) {
 127                 tmp_buf=(void *)smpi_get_tmp_sendbuffer( count *
 128                                     (MAX(extent, true_extent)));
 129                 tmp_buf = (void *) ((char *) tmp_buf - true_lb);
 130             }
 131
 132             if (sendbuf != MPI_IN_PLACE) {
 133                 in_buf = (void *)sendbuf;
 134             } else {
 135                 in_buf = recvbuf;
 136             }
 137
 138             if (local_rank == 0) {
 139                  if( my_rank != root) {
 140                      out_buf = tmp_buf;
 141                  } else {
 142                      out_buf = recvbuf;
 143                      if(in_buf == out_buf) {
 144                         in_buf = MPI_IN_PLACE;
 145                         out_buf = recvbuf;
 146                      }
 147                  }
 148             } else {
 149                 in_buf  = (void *)sendbuf;
 150                 out_buf = NULL;
 151             }
 152
 153             if (count * (MAX(extent, true_extent)) < SHMEM_COLL_BLOCK_SIZE) {
 154                 mpi_errno = MPIR_Reduce_shmem_MV2(in_buf, out_buf, count,
 155                                                   datatype, op,
 156                                                   0, shmem_comm);
 157             }
 158             else {
 159                 mpi_errno = MPIR_Reduce_intra_knomial_wrapper_MV2(in_buf, out_buf, count,
 160                                                                   datatype, op,
 161                                                                   0, shmem_comm);
 162             }
 163
 164             if (local_rank == 0 && root != my_rank) {
 165                 smpi_mpi_send(out_buf, count, datatype, root,
 166                                          COLL_TAG_REDUCE+1, comm);
 167             }
 168             if ((local_rank != 0) && (root == my_rank)) {
 169                 smpi_mpi_recv(recvbuf, count, datatype,
 170                                          leader_of_root, COLL_TAG_REDUCE+1, comm,
 171                                          MPI_STATUS_IGNORE);
 172             }
 173         } else {
 174             if(mv2_use_knomial_reduce == 1) {
 175                 reduce_fn = &MPIR_Reduce_intra_knomial_wrapper_MV2;
 176             } else {
 177                 reduce_fn = &MPIR_Reduce_binomial_MV2;
 178             }
 179             mpi_errno = reduce_fn(sendbuf, recvbuf, count,
 180                                   datatype, op,
 181                                   root, comm);
 182         }
 183         /* We are done */
 184         if(tmp_buf!=NULL)
 185           smpi_free_tmp_buffer((void *) ((char *) tmp_buf + true_lb));
 186         goto fn_exit;
 187     }
 188
 189
 190     if (local_rank == 0) {
 191         leader_comm = smpi_comm_get_leaders_comm(comm);
 192         if(leader_comm==MPI_COMM_NULL){
 193           leader_comm = MPI_COMM_WORLD;
 194         }
 195         leader_comm_size = smpi_comm_size(leader_comm);
 196         leader_comm_rank = smpi_comm_rank(leader_comm);
 197         tmp_buf=(void *)smpi_get_tmp_sendbuffer(count *
 198                             (MAX(extent, true_extent)));
 199         tmp_buf = (void *) ((char *) tmp_buf - true_lb);
 200     }
 201     if (sendbuf != MPI_IN_PLACE) {
 202         in_buf = (void *)sendbuf;
 203     } else {
 204         in_buf = recvbuf;
 205     }
 206     if (local_rank == 0) {
 207         out_buf = tmp_buf;
 208     } else {
 209         out_buf = NULL;
 210     }
 211
 212
 213     if(local_size > 1) {
 214         /* Lets do the intra-node reduce operations, if we have more than one
 215          * process in the node */
 216
 217         /*Fix the input and outbuf buffers for the intra-node reduce.
 218          *Node leaders will have the reduced data in tmp_buf after
 219          *this step*/
 220         if (MV2_Reduce_intra_function == & MPIR_Reduce_shmem_MV2)
 221         {
 222             if (is_commutative == 1
 223                 && (count * (MAX(extent, true_extent)) < SHMEM_COLL_BLOCK_SIZE)) {
 224                     mpi_errno = MV2_Reduce_intra_function(in_buf, out_buf, count,
 225                                       datatype, op,
 226                                       intra_node_root, shmem_comm);
 227             } else {
 228                     mpi_errno = MPIR_Reduce_intra_knomial_wrapper_MV2(in_buf, out_buf, count,
 229                                       datatype, op,
 230                                       intra_node_root, shmem_comm);
 231             }
 232         } else {
 233
 234             mpi_errno = MV2_Reduce_intra_function(in_buf, out_buf, count,
 235                                       datatype, op,
 236                                       intra_node_root, shmem_comm);
 237         }
 238     } else {
 239         smpi_free_tmp_buffer((void *) ((char *) tmp_buf + true_lb));
 240         tmp_buf = in_buf;
 241     }
 242
 243     /* Now work on the inter-leader phase. Data is in tmp_buf */
 244     if (local_rank == 0 && leader_comm_size > 1) {
 245         /*The leader of root will have the global reduced data in tmp_buf
 246            or recv_buf
 247            at the end of the reduce */
 248         if (leader_comm_rank == leader_root) {
 249             if (my_rank == root) {
 250                 /* I am the root of the leader-comm, and the
 251                  * root of the reduce op. So, I will write the
 252                  * final result directly into my recvbuf */
 253                 if(tmp_buf != recvbuf) {
 254                     in_buf = tmp_buf;
 255                     out_buf = recvbuf;
 256                 } else {
 257
 258                      in_buf = (char *)smpi_get_tmp_sendbuffer(count*
 259                                        smpi_datatype_get_extent(datatype));
 260                      smpi_datatype_copy(tmp_buf, count, datatype,
 261                                         in_buf, count, datatype);
 262                     //in_buf = MPI_IN_PLACE;
 263                     out_buf = recvbuf;
 264                 }
 265             } else {
 266                 in_buf = (char *)smpi_get_tmp_sendbuffer(count*
 267                                        smpi_datatype_get_extent(datatype));
 268                 smpi_datatype_copy(tmp_buf, count, datatype,
 269                                         in_buf, count, datatype);
 270                 //in_buf = MPI_IN_PLACE;
 271                 out_buf = tmp_buf;
 272             }
 273         } else {
 274             in_buf = tmp_buf;
 275             out_buf = NULL;
 276         }
 277
 278         /* inter-leader communication  */
 279         mpi_errno = MV2_Reduce_function(in_buf, out_buf, count,
 280                               datatype, op,
 281                               leader_root, leader_comm);
 282
 283     }
 284
 285     if (local_size > 1) {
 286         /* Send the message to the root if the leader is not the
 287          * root of the reduce operation. The reduced data is in tmp_buf */
 288         if ((local_rank == 0) && (root != my_rank)
 289             && (leader_root == leader_comm_rank)) {
 290             smpi_mpi_send(tmp_buf, count, datatype, root,
 291                                      COLL_TAG_REDUCE+1, comm);
 292         }
 293         if ((local_rank != 0) && (root == my_rank)) {
 294             smpi_mpi_recv(recvbuf, count, datatype,
 295                                      leader_of_root,
 296                                      COLL_TAG_REDUCE+1, comm,
 297                                      MPI_STATUS_IGNORE);
 298         }
 299       smpi_free_tmp_buffer((void *) ((char *) tmp_buf + true_lb));
 300
 301       if (leader_comm_rank == leader_root) {
 302         if (my_rank != root || (my_rank == root && tmp_buf == recvbuf)) {
 303           smpi_free_tmp_buffer(in_buf);
 304         }
 305       }
 306     }
 307
 308
 309
 310   fn_exit:
 311     return mpi_errno;
 312 }