/* Copyright (c) 2013-2014. The SimGrid Team.
 * All rights reserved. */

/* This program is free software; you can redistribute it and/or modify it
 * under the terms of the license (GNU LGPL) which comes with this package. */

/*
 * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation. All rights reserved.
 * Copyright (c) 2004-2009 The University of Tennessee and The University
 *                         of Tennessee Research Foundation. All rights
 *                         reserved.
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
 *                         University of Stuttgart. All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 *
 * Additional copyrights may follow.
 */

/* -*- Mode: C; c-basic-offset:4 ; -*- */
/* Copyright (c) 2001-2014, The Ohio State University. All rights
 * reserved.
 *
 * This file is part of the MVAPICH2 software package developed by the
 * team members of The Ohio State University's Network-Based Computing
 * Laboratory (NBCL), headed by Professor Dhabaleswar K. (DK) Panda.
 *
 * For detailed copyright and licensing information, please refer to the
 * copyright file COPYRIGHT in the top level MVAPICH2 directory.
 */

/* (C) 2001 by Argonne National Laboratory.
 * See COPYRIGHT in top-level directory. */
#include "colls_private.h"

#define MV2_INTRA_SHMEM_REDUCE_MSG 2048

#define mv2_g_shmem_coll_max_msg_size (1 << 17)
#define SHMEM_COLL_BLOCK_SIZE (local_size * mv2_g_shmem_coll_max_msg_size)
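/* Note: SHMEM_COLL_BLOCK_SIZE expands to an expression over local_size, a
 * local variable of the function below, so the macro is only meaningful
 * inside that scope. */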
#define mv2_use_knomial_reduce 1

#define MPIR_Reduce_inter_knomial_wrapper_MV2 smpi_coll_tuned_reduce_mvapich2_knomial
#define MPIR_Reduce_intra_knomial_wrapper_MV2 smpi_coll_tuned_reduce_mvapich2_knomial
#define MPIR_Reduce_binomial_MV2 smpi_coll_tuned_reduce_binomial
#define MPIR_Reduce_redscat_gather_MV2 smpi_coll_tuned_reduce_scatter_gather
#define MPIR_Reduce_shmem_MV2 smpi_coll_tuned_reduce_ompi_basic_linear
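
/* Two-level (hierarchical) reduce, ported from MVAPICH2 to SMPI. The MPIR_*
 * names above are mapped onto the SMPI implementations of the corresponding
 * MVAPICH2 algorithms. The algorithm runs in three phases: (1) an intra-node
 * reduce leaves each node's partial result on its node leader, (2) an
 * inter-leader reduce combines those partial results on the leader of the
 * root's node, and (3) a final point-to-point message delivers the result to
 * the root if the root is not that leader. */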
extern int (*MV2_Reduce_function)(void *sendbuf, void *recvbuf, int count,
                                  MPI_Datatype datatype, MPI_Op op,
                                  int root, MPI_Comm comm);

extern int (*MV2_Reduce_intra_function)(void *sendbuf, void *recvbuf, int count,
                                        MPI_Datatype datatype, MPI_Op op,
                                        int root, MPI_Comm comm);

/* Fn pointers for collectives */
static int (*reduce_fn)(void *sendbuf, void *recvbuf, int count,
                        MPI_Datatype datatype,
                        MPI_Op op, int root, MPI_Comm comm);
int smpi_coll_tuned_reduce_mvapich2_two_level(void *sendbuf,
                                              void *recvbuf,
                                              int count,
                                              MPI_Datatype datatype,
                                              MPI_Op op,
                                              int root,
                                              MPI_Comm comm)
{
    int mpi_errno = MPI_SUCCESS;
    int my_rank, total_size, local_rank, local_size;
    int leader_comm_rank = -1, leader_comm_size = 0;
    MPI_Comm shmem_comm, leader_comm;
    int leader_root, leader_of_root;
    void *in_buf = NULL, *out_buf = NULL, *tmp_buf = NULL;
    MPI_Aint true_lb, true_extent, extent;
    int is_commutative = 0, stride = 0;
    int intra_node_root = 0;
    // if not set (use of the algorithm directly, without the mvapich2 selector)
    if (MV2_Reduce_function == NULL)
        MV2_Reduce_function = smpi_coll_tuned_reduce_mpich;
    if (MV2_Reduce_intra_function == NULL)
        MV2_Reduce_intra_function = smpi_coll_tuned_reduce_mpich;
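    /* Build the SMP-aware communicators (intra-node communicator and
     * inter-node leader communicator) if the selector has not done so yet */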
    if (smpi_comm_get_leaders_comm(comm) == MPI_COMM_NULL) {
        smpi_comm_init_smp(comm);
    }
    my_rank = smpi_comm_rank(comm);
    total_size = smpi_comm_size(comm);
    shmem_comm = smpi_comm_get_intra_comm(comm);
    local_rank = smpi_comm_rank(shmem_comm);
    local_size = smpi_comm_size(shmem_comm);
    leader_comm = smpi_comm_get_leaders_comm(comm);
    int *leaders_map = smpi_comm_get_leaders_map(comm);
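    /* leader_of_root: rank, within comm, of the leader of the root's node;
     * leader_root: rank of that same leader within leader_comm */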
    leader_of_root = smpi_group_rank(smpi_comm_group(comm), leaders_map[root]);
    leader_root = smpi_group_rank(smpi_comm_group(leader_comm), leaders_map[root]);
    is_commutative = smpi_op_is_commute(op);

    smpi_datatype_extent(datatype, &true_lb, &true_extent);
    extent = smpi_datatype_get_extent(datatype);
    stride = count * MAX(extent, true_extent);
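    /* stride is the memory footprint of one message; small commutative
     * messages take the shared-memory path below */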
    if (local_size == total_size) {
        /* First handle the case where there is only one node */
        if (stride <= MV2_INTRA_SHMEM_REDUCE_MSG && is_commutative == 1) {
            if (local_rank == 0) {
                tmp_buf = (void *) smpi_get_tmp_sendbuffer(count *
                              (MAX(extent, true_extent)));
                tmp_buf = (void *) ((char *) tmp_buf - true_lb);
            }
            if (sendbuf != MPI_IN_PLACE) {
                in_buf = (void *) sendbuf;
            } else {
                in_buf = recvbuf;
            }
            if (local_rank == 0) {
                if (my_rank != root) {
                    out_buf = tmp_buf;
                } else {
                    out_buf = recvbuf;
                    if (in_buf == out_buf) {
                        in_buf = MPI_IN_PLACE;
                    }
                }
            } else {
                in_buf = (void *) sendbuf;
                out_buf = NULL;
            }

            if (count * (MAX(extent, true_extent)) < SHMEM_COLL_BLOCK_SIZE) {
                mpi_errno = MPIR_Reduce_shmem_MV2(in_buf, out_buf, count,
                                datatype, op, intra_node_root, shmem_comm);
            } else {
                mpi_errno = MPIR_Reduce_intra_knomial_wrapper_MV2(in_buf, out_buf, count,
                                datatype, op, intra_node_root, shmem_comm);
            }

            /* The result is on the node leader; forward it to the root if needed */
            if (local_rank == 0 && root != my_rank) {
                smpi_mpi_send(out_buf, count, datatype, root,
                              COLL_TAG_REDUCE + 1, comm);
            }
            if ((local_rank != 0) && (root == my_rank)) {
                smpi_mpi_recv(recvbuf, count, datatype,
                              leader_of_root, COLL_TAG_REDUCE + 1, comm,
                              MPI_STATUS_IGNORE);
            }
        } else {
            /* Large or non-commutative case: plain single-level reduce */
            if (mv2_use_knomial_reduce == 1) {
                reduce_fn = &MPIR_Reduce_intra_knomial_wrapper_MV2;
            } else {
                reduce_fn = &MPIR_Reduce_binomial_MV2;
            }
            mpi_errno = reduce_fn(sendbuf, recvbuf, count,
                                  datatype, op, root, comm);
        }
        return mpi_errno;
    }
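    /* Multi-node case: each node leader allocates a temporary buffer to
     * hold its node's partial result after the intra-node phase */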
    if (local_rank == 0) {
        leader_comm = smpi_comm_get_leaders_comm(comm);
        if (leader_comm == MPI_COMM_NULL) {
            leader_comm = MPI_COMM_WORLD;
        }
        leader_comm_size = smpi_comm_size(leader_comm);
        leader_comm_rank = smpi_comm_rank(leader_comm);
        tmp_buf = (void *) smpi_get_tmp_sendbuffer(count *
                      (MAX(extent, true_extent)));
        tmp_buf = (void *) ((char *) tmp_buf - true_lb);
    }

    if (sendbuf != MPI_IN_PLACE) {
        in_buf = (void *) sendbuf;
    } else {
        in_buf = recvbuf;
    }
    if (local_rank == 0) {
        out_buf = tmp_buf;
    } else {
        out_buf = NULL;
    }
    if (local_size > 1) {
        /* Let's do the intra-node reduce operations, if we have more than one
         * process in the node */

        /* Fix the input and output buffers for the intra-node reduce.
         * Node leaders will have the reduced data in tmp_buf after
         * this step */
        if (MV2_Reduce_intra_function == &MPIR_Reduce_shmem_MV2) {
            if (is_commutative == 1
                && (count * (MAX(extent, true_extent)) < SHMEM_COLL_BLOCK_SIZE)) {
                mpi_errno = MV2_Reduce_intra_function(in_buf, out_buf, count,
                                datatype, op, intra_node_root, shmem_comm);
            } else {
                mpi_errno = MPIR_Reduce_intra_knomial_wrapper_MV2(in_buf, out_buf, count,
                                datatype, op, intra_node_root, shmem_comm);
            }
        } else {
            mpi_errno = MV2_Reduce_intra_function(in_buf, out_buf, count,
                            datatype, op, intra_node_root, shmem_comm);
        }
    } else {
        /* Only one process on this node: its input buffer already holds the
         * node's contribution */
        tmp_buf = in_buf;
    }
    /* Now work on the inter-leader phase. Data is in tmp_buf */
    if (local_rank == 0 && leader_comm_size > 1) {
        /* The leader of root will have the global reduced data in tmp_buf
         * or recvbuf at the end of the reduce */
        if (leader_comm_rank == leader_root) {
            if (my_rank == root) {
                /* I am the root of the leader-comm, and the
                 * root of the reduce op. So, I will write the
                 * final result directly into my recvbuf */
                if (tmp_buf != recvbuf) {
                    in_buf = tmp_buf;
                    out_buf = recvbuf;
                } else {
                    in_buf = (char *) smpi_get_tmp_sendbuffer(count *
                                 smpi_datatype_get_extent(datatype));
                    smpi_datatype_copy(tmp_buf, count, datatype,
                                       in_buf, count, datatype);
                    //in_buf = MPI_IN_PLACE;
                    out_buf = recvbuf;
                }
            } else {
                in_buf = (char *) smpi_get_tmp_sendbuffer(count *
                             smpi_datatype_get_extent(datatype));
                smpi_datatype_copy(tmp_buf, count, datatype,
                                   in_buf, count, datatype);
                //in_buf = MPI_IN_PLACE;
                out_buf = tmp_buf;
            }
        } else {
            in_buf = tmp_buf;
            out_buf = NULL;
        }

        /* inter-leader communication */
        mpi_errno = MV2_Reduce_function(in_buf, out_buf, count,
                                        datatype, op,
                                        leader_root, leader_comm);
    }
    if (local_size > 1) {
        /* Send the message to the root if the leader is not the
         * root of the reduce operation. The reduced data is in tmp_buf */
        if ((local_rank == 0) && (root != my_rank)
            && (leader_root == leader_comm_rank)) {
            smpi_mpi_send(tmp_buf, count, datatype, root,
                          COLL_TAG_REDUCE + 1, comm);
        }
        if ((local_rank != 0) && (root == my_rank)) {
            smpi_mpi_recv(recvbuf, count, datatype,
                          leader_of_root,
                          COLL_TAG_REDUCE + 1, comm,
                          MPI_STATUS_IGNORE);
        }
    }

    return mpi_errno;
}