Logo AND Algorithmique Numérique Distribuée

Public GIT Repository
Move collective algorithms to separate folders
[simgrid.git] / src / smpi / colls / scatter / scatter-mvapich-two-level.cpp
diff --git a/src/smpi/colls/scatter/scatter-mvapich-two-level.cpp b/src/smpi/colls/scatter/scatter-mvapich-two-level.cpp
new file mode 100644 (file)
index 0000000..e76d756
--- /dev/null
@@ -0,0 +1,410 @@
+/* Copyright (c) 2013-2014. The SimGrid Team.
+ * All rights reserved.                                                     */
+
+/* This program is free software; you can redistribute it and/or modify it
+ * under the terms of the license (GNU LGPL) which comes with this package. */
+
+/*
+ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
+ *                         University Research and Technology
+ *                         Corporation.  All rights reserved.
+ * Copyright (c) 2004-2009 The University of Tennessee and The University
+ *                         of Tennessee Research Foundation.  All rights
+ *                         reserved.
+ * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
+ *                         University of Stuttgart.  All rights reserved.
+ * Copyright (c) 2004-2005 The Regents of the University of California.
+ *                         All rights reserved.
+ *
+ * Additional copyrights may follow
+ */
+ /* -*- Mode: C; c-basic-offset:4 ; -*- */
+/* Copyright (c) 2001-2014, The Ohio State University. All rights
+ * reserved.
+ *
+ * This file is part of the MVAPICH2 software package developed by the
+ * team members of The Ohio State University's Network-Based Computing
+ * Laboratory (NBCL), headed by Professor Dhabaleswar K. (DK) Panda.
+ *
+ * For detailed copyright and licensing information, please refer to the
+ * copyright file COPYRIGHT in the top level MVAPICH2 directory.
+ */
+/*
+ *
+ *  (C) 2001 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+#include "../colls_private.h"
+/* Map the MVAPICH2 scatter algorithm names used in this file onto the
+ * smpi_coll_tuned_scatter_ompi_* implementations. */
+#define MPIR_Scatter_MV2_Binomial smpi_coll_tuned_scatter_ompi_binomial
+#define MPIR_Scatter_MV2_Direct smpi_coll_tuned_scatter_ompi_basic_linear
+/* Scatter function used for the intra-node step. It is only declared here;
+ * the two algorithms of this file set it to smpi_coll_tuned_scatter_mpich
+ * when they find it NULL. */
+extern int (*MV2_Scatter_intra_function) (void *sendbuf, int sendcount, MPI_Datatype sendtype,
+    void *recvbuf, int recvcount, MPI_Datatype recvtype,
+    int root, MPI_Comm comm);
+/*
+ * Two-level scatter in the MVAPICH2 style: the data first travels between
+ * node leaders (local rank 0 of each intra-node communicator), then every
+ * leader scatters the share of its node inside the intra-node communicator.
+ * This variant uses MPIR_Scatter_MV2_Direct for the inter-leader step when
+ * comm->is_uniform() holds, and smpi_mpi_scatterv otherwise.
+ *
+ * sendbuf, sendcnt, sendtype: send side; sendcnt is the per-rank element
+ *     count (a non-leader root forwards sendcnt * comm_size elements to the
+ *     leader of its node).
+ * recvbuf, recvcnt, recvtype: receive side of the calling rank; recvbuf may
+ *     be MPI_IN_PLACE on the root.
+ * root: rank of the root in comm.
+ * comm: communicator; init_smp() is called on it when get_leaders_comm()
+ *     still returns MPI_COMM_NULL.
+ *
+ * Returns MPI_SUCCESS on the early-exit path, otherwise the value stored in
+ * mpi_errno by the last scatter call that assigns it.
+ *
+ * NOTE(review): the displacement computation of the non-uniform path adds up
+ * node sizes in leader order, which presumably assumes that ranks are laid
+ * out node after node in the send buffer -- TODO confirm.
+ */
+int smpi_coll_tuned_scatter_mvapich2_two_level_direct(void *sendbuf,
+                                      int sendcnt,
+                                      MPI_Datatype sendtype,
+                                      void *recvbuf,
+                                      int recvcnt,
+                                      MPI_Datatype recvtype,
+                                      int root, MPI_Comm  comm)
+{
+    int comm_size, rank;
+    int local_rank, local_size;
+    int leader_comm_rank = -1, leader_comm_size = -1; /* only set on node leaders */
+    int mpi_errno = MPI_SUCCESS;
+    int recvtype_size, sendtype_size, nbytes; /* nbytes: byte size of one per-rank chunk */
+    void *tmp_buf = NULL; /* leaders only: chunks of all ranks of this node */
+    void *leader_scatter_buf = NULL; /* leader of the root only: copy of the whole send buffer */
+    MPI_Status status;
+    int leader_root, leader_of_root = -1;
+    MPI_Comm shmem_comm, leader_comm;
+    /* When no intra-node scatter function has been installed (algorithm used
+     * directly, without the mvapich2 selector), default to the mpich one. */
+    if(MV2_Scatter_intra_function==NULL)
+      MV2_Scatter_intra_function=smpi_coll_tuned_scatter_mpich;
+    /* Call init_smp() when the leaders communicator is still MPI_COMM_NULL. */
+    if(comm->get_leaders_comm()==MPI_COMM_NULL){
+      comm->init_smp();
+    }
+    comm_size = comm->size();
+    rank = comm->rank();
+    /* Early exit: a root with recvcnt == 0 or a non-root with sendcnt == 0
+     * returns at once. NOTE(review): non-root ranks are tested on sendcnt,
+     * not recvcnt -- confirm this is intended. */
+    if (((rank == root) && (recvcnt == 0))
+        || ((rank != root) && (sendcnt == 0))) {
+        return MPI_SUCCESS;
+    }
+
+    /* Rank and size of the calling process in the intra-node
+     * communicator */
+    shmem_comm = comm->get_intra_comm();
+    local_rank = shmem_comm->rank();
+    local_size = shmem_comm->size();
+
+    if (local_rank == 0) {
+        /* Node leader. Extract the rank, size information for the leader
+         * communicator */
+        leader_comm = comm->get_leaders_comm();
+        leader_comm_size = leader_comm->size();
+        leader_comm_rank = leader_comm->rank();
+    }
+
+    if (local_size == comm_size) {
+        /* purely intra-node scatter. Just use the direct algorithm and we are done */
+        mpi_errno = MPIR_Scatter_MV2_Direct(sendbuf, sendcnt, sendtype,
+                                            recvbuf, recvcnt, recvtype,
+                                            root, comm);
+
+    } else {
+        recvtype_size=recvtype->size();
+        sendtype_size=sendtype->size();
+        /* Size of one chunk: taken from the send arguments on the root and
+         * from the receive arguments on every other rank. */
+        if (rank == root) {
+            nbytes = sendcnt * sendtype_size;
+        } else {
+            nbytes = recvcnt * recvtype_size;
+        }
+
+        if (local_rank == 0) {
+            /* Node leader: staging buffer sized for the local_size chunks of this node */
+            tmp_buf = smpi_get_tmp_sendbuffer(nbytes * local_size);
+        }
+
+        leader_comm = comm->get_leaders_comm();
+        int* leaders_map = comm->get_leaders_map();
+        leader_of_root = comm->group()->rank(leaders_map[root]);
+        leader_root = leader_comm->group()->rank(leaders_map[root]);
+        /* leader_of_root is the rank, in comm, of the leader of the root.
+         * leader_root is the rank of that same leader in leader_comm and is
+         * used as the root of the inter-leader scatter operations.
+         */
+
+        if ((local_rank == 0) && (root != rank)
+            && (leader_of_root == rank)) {
+            /* This rank leads the node of the root but is not the root itself:
+             * receive the whole send buffer (comm_size chunks of nbytes bytes)
+             * from the root */
+            leader_scatter_buf = smpi_get_tmp_sendbuffer(nbytes * comm_size);
+            Request::recv(leader_scatter_buf, nbytes * comm_size, MPI_BYTE,
+                             root, COLL_TAG_SCATTER, comm, &status);
+
+        }
+
+        if (rank == root && local_rank != 0) {
+            /* The root of the scatter operation is not the node leader. Send
+             * the whole send buffer to the node leader (matches the receive
+             * posted just above on that leader) */
+            Request::send(sendbuf, sendcnt * comm_size, sendtype,
+                                     leader_of_root, COLL_TAG_SCATTER, comm
+                                     );
+        }
+        /* Inter-leader phase: run by node leaders only, and only when the
+         * leader communicator holds more than one process. */
+        if (leader_comm_size > 1 && local_rank == 0) {
+            if (!comm->is_uniform()) { /* per-leader counts come from get_non_uniform_map() */
+                int *displs = NULL;
+                int *sendcnts = NULL;
+                int *node_sizes;
+                int i = 0;
+                node_sizes = comm->get_non_uniform_map();
+
+                if (root != leader_of_root) { /* data was staged as bytes in leader_scatter_buf */
+                    if (leader_comm_rank == leader_root) {
+                        displs = static_cast<int*>(xbt_malloc(sizeof (int) * leader_comm_size));
+                        sendcnts = static_cast<int*>(xbt_malloc(sizeof (int) * leader_comm_size));
+                        sendcnts[0] = node_sizes[0] * nbytes;
+                        displs[0] = 0;
+                        /* counts and running displacements, expressed in bytes */
+                        for (i = 1; i < leader_comm_size; i++) {
+                            displs[i] =
+                                displs[i - 1] + node_sizes[i - 1] * nbytes;
+                            sendcnts[i] = node_sizes[i] * nbytes;
+                        }
+                    }
+                        smpi_mpi_scatterv(leader_scatter_buf, sendcnts, displs,
+                                      MPI_BYTE, tmp_buf, nbytes * local_size,
+                                      MPI_BYTE, leader_root, leader_comm);
+                } else { /* the root is itself a node leader: scatter straight from sendbuf */
+                    if (leader_comm_rank == leader_root) {
+                        displs = static_cast<int*>(xbt_malloc(sizeof (int) * leader_comm_size));
+                        sendcnts = static_cast<int*>(xbt_malloc(sizeof (int) * leader_comm_size));
+                        sendcnts[0] = node_sizes[0] * sendcnt;
+                        displs[0] = 0;
+                        /* counts and running displacements, expressed in sendtype elements */
+                        for (i = 1; i < leader_comm_size; i++) {
+                            displs[i] =
+                                displs[i - 1] + node_sizes[i - 1] * sendcnt;
+                            sendcnts[i] = node_sizes[i] * sendcnt;
+                        }
+                    }
+                    smpi_mpi_scatterv(sendbuf, sendcnts, displs,
+                                              sendtype, tmp_buf,
+                                              nbytes * local_size, MPI_BYTE,
+                                              leader_root, leader_comm);
+                }
+                if (leader_comm_rank == leader_root) { /* the arrays were allocated on this leader only */
+                    xbt_free(displs);
+                    xbt_free(sendcnts);
+                }
+            } else { /* uniform case: one regular scatter among the leaders */
+                if (leader_of_root != root) {
+                    mpi_errno =
+                        MPIR_Scatter_MV2_Direct(leader_scatter_buf,
+                                                nbytes * local_size, MPI_BYTE,
+                                                tmp_buf, nbytes * local_size,
+                                                MPI_BYTE, leader_root,
+                                                leader_comm);
+                } else {
+                    mpi_errno =
+                        MPIR_Scatter_MV2_Direct(sendbuf, sendcnt * local_size,
+                                                sendtype, tmp_buf,
+                                                nbytes * local_size, MPI_BYTE,
+                                                leader_root, leader_comm);
+
+                }
+            }
+        }
+        /* The leaders are now done with the inter-leader part. Scatter the data
+         * within the nodes, rooted at local rank 0 of shmem_comm. An in-place
+         * root receives its chunk into sendbuf with the send count and type. */
+
+        if (rank == root && recvbuf == MPI_IN_PLACE) {
+            mpi_errno = MV2_Scatter_intra_function(tmp_buf, nbytes, MPI_BYTE,
+                                                (void *)sendbuf, sendcnt, sendtype,
+                                                0, shmem_comm);
+        } else {
+            mpi_errno = MV2_Scatter_intra_function(tmp_buf, nbytes, MPI_BYTE,
+                                                recvbuf, recvcnt, recvtype,
+                                                0, shmem_comm);
+        }
+    }
+
+    /* Release the temporary buffers that node leaders allocated on the
+     * two-level path */
+    if (comm_size != local_size && local_rank == 0) {
+        smpi_free_tmp_buffer(tmp_buf);
+        if (leader_of_root == rank && root != rank) {
+            smpi_free_tmp_buffer(leader_scatter_buf);
+        }
+    }
+    return (mpi_errno);
+}
+
+/*
+ * Two-level scatter in the MVAPICH2 style: the data first travels between
+ * node leaders (local rank 0 of each intra-node communicator), then every
+ * leader scatters the share of its node inside the intra-node communicator.
+ * This variant uses MPIR_Scatter_MV2_Binomial for the inter-leader step when
+ * comm->is_uniform() holds, and smpi_mpi_scatterv otherwise. The purely
+ * intra-node case still goes through MPIR_Scatter_MV2_Direct.
+ *
+ * sendbuf, sendcnt, sendtype: send side; sendcnt is the per-rank element
+ *     count (a non-leader root forwards sendcnt * comm_size elements to the
+ *     leader of its node).
+ * recvbuf, recvcnt, recvtype: receive side of the calling rank; recvbuf may
+ *     be MPI_IN_PLACE on the root.
+ * root: rank of the root in comm.
+ * comm: communicator; init_smp() is called on it when get_leaders_comm()
+ *     still returns MPI_COMM_NULL.
+ *
+ * Returns MPI_SUCCESS on the early-exit path, otherwise the value stored in
+ * mpi_errno by the last scatter call that assigns it.
+ *
+ * NOTE(review): the displacement computation of the non-uniform path adds up
+ * node sizes in leader order, which presumably assumes that ranks are laid
+ * out node after node in the send buffer -- TODO confirm.
+ */
+int smpi_coll_tuned_scatter_mvapich2_two_level_binomial(void *sendbuf,
+                                        int sendcnt,
+                                        MPI_Datatype sendtype,
+                                        void *recvbuf,
+                                        int recvcnt,
+                                        MPI_Datatype recvtype,
+                                        int root, MPI_Comm comm)
+{
+    int comm_size, rank;
+    int local_rank, local_size;
+    int leader_comm_rank = -1, leader_comm_size = -1; /* only set on node leaders */
+    int mpi_errno = MPI_SUCCESS;
+    int recvtype_size, sendtype_size, nbytes; /* nbytes: byte size of one per-rank chunk */
+    void *tmp_buf = NULL; /* leaders only: chunks of all ranks of this node */
+    void *leader_scatter_buf = NULL; /* leader of the root only: copy of the whole send buffer */
+    MPI_Status status;
+    int leader_root = -1, leader_of_root = -1;
+    MPI_Comm shmem_comm, leader_comm;
+
+
+    /* When no intra-node scatter function has been installed (algorithm used
+     * directly, without the mvapich2 selector), default to the mpich one. */
+    if(MV2_Scatter_intra_function==NULL)
+      MV2_Scatter_intra_function=smpi_coll_tuned_scatter_mpich;
+    /* Call init_smp() when the leaders communicator is still MPI_COMM_NULL. */
+    if(comm->get_leaders_comm()==MPI_COMM_NULL){
+      comm->init_smp();
+    }
+    comm_size = comm->size();
+    rank = comm->rank();
+    /* Early exit: a root with recvcnt == 0 or a non-root with sendcnt == 0
+     * returns at once. NOTE(review): non-root ranks are tested on sendcnt,
+     * not recvcnt -- confirm this is intended. */
+    if (((rank == root) && (recvcnt == 0))
+        || ((rank != root) && (sendcnt == 0))) {
+        return MPI_SUCCESS;
+    }
+
+    /* Rank and size of the calling process in the intra-node
+     * communicator */
+    shmem_comm = comm->get_intra_comm();
+    local_rank = shmem_comm->rank();
+    local_size = shmem_comm->size();
+
+    if (local_rank == 0) {
+        /* Node leader. Extract the rank, size information for the leader
+         * communicator */
+        leader_comm = comm->get_leaders_comm();
+        leader_comm_size = leader_comm->size();
+        leader_comm_rank = leader_comm->rank();
+    }
+
+    if (local_size == comm_size) {
+        /* purely intra-node scatter. Just use the direct algorithm and we are done */
+        mpi_errno = MPIR_Scatter_MV2_Direct(sendbuf, sendcnt, sendtype,
+                                            recvbuf, recvcnt, recvtype,
+                                            root, comm);
+
+    } else {
+        recvtype_size=recvtype->size();
+        sendtype_size=sendtype->size();
+        /* Size of one chunk: taken from the send arguments on the root and
+         * from the receive arguments on every other rank. */
+        if (rank == root) {
+            nbytes = sendcnt * sendtype_size;
+        } else {
+            nbytes = recvcnt * recvtype_size;
+        }
+
+        if (local_rank == 0) {
+            /* Node leader: staging buffer sized for the local_size chunks of this node */
+            tmp_buf = smpi_get_tmp_sendbuffer(nbytes * local_size);
+        }
+        leader_comm = comm->get_leaders_comm();
+        int* leaders_map = comm->get_leaders_map();
+        leader_of_root = comm->group()->rank(leaders_map[root]);
+        leader_root = leader_comm->group()->rank(leaders_map[root]);
+        /* leader_of_root is the rank, in comm, of the leader of the root.
+         * leader_root is the rank of that same leader in leader_comm and is
+         * used as the root of the inter-leader scatter operations.
+         */
+
+        if ((local_rank == 0) && (root != rank)
+            && (leader_of_root == rank)) {
+            /* This rank leads the node of the root but is not the root itself:
+             * receive the whole send buffer (comm_size chunks of nbytes bytes)
+             * from the root */
+            leader_scatter_buf = smpi_get_tmp_sendbuffer(nbytes * comm_size);
+            Request::recv(leader_scatter_buf, nbytes * comm_size, MPI_BYTE,
+                             root, COLL_TAG_SCATTER, comm, &status);
+        }
+
+        if (rank == root && local_rank != 0) {
+            /* The root of the scatter operation is not the node leader. Send
+             * the whole send buffer to the node leader (matches the receive
+             * posted just above on that leader) */
+            Request::send(sendbuf, sendcnt * comm_size, sendtype,
+                                     leader_of_root, COLL_TAG_SCATTER, comm);
+        }
+        /* Inter-leader phase: run by node leaders only, and only when the
+         * leader communicator holds more than one process. */
+        if (leader_comm_size > 1 && local_rank == 0) {
+            if (!comm->is_uniform()) { /* per-leader counts come from get_non_uniform_map() */
+                int *displs = NULL;
+                int *sendcnts = NULL;
+                int *node_sizes;
+                int i = 0;
+                node_sizes = comm->get_non_uniform_map();
+
+                if (root != leader_of_root) { /* data was staged as bytes in leader_scatter_buf */
+                    if (leader_comm_rank == leader_root) {
+                        displs = static_cast<int*>(xbt_malloc(sizeof (int) * leader_comm_size));
+                        sendcnts = static_cast<int*>(xbt_malloc(sizeof (int) * leader_comm_size));
+                        sendcnts[0] = node_sizes[0] * nbytes;
+                        displs[0] = 0;
+                        /* counts and running displacements, expressed in bytes */
+                        for (i = 1; i < leader_comm_size; i++) {
+                            displs[i] =
+                                displs[i - 1] + node_sizes[i - 1] * nbytes;
+                            sendcnts[i] = node_sizes[i] * nbytes;
+                        }
+                    }
+                        smpi_mpi_scatterv(leader_scatter_buf, sendcnts, displs,
+                                      MPI_BYTE, tmp_buf, nbytes * local_size,
+                                      MPI_BYTE, leader_root, leader_comm);
+                } else { /* the root is itself a node leader: scatter straight from sendbuf */
+                    if (leader_comm_rank == leader_root) {
+                        displs = static_cast<int*>(xbt_malloc(sizeof (int) * leader_comm_size));
+                        sendcnts = static_cast<int*>(xbt_malloc(sizeof (int) * leader_comm_size));
+                        sendcnts[0] = node_sizes[0] * sendcnt;
+                        displs[0] = 0;
+                        /* counts and running displacements, expressed in sendtype elements */
+                        for (i = 1; i < leader_comm_size; i++) {
+                            displs[i] =
+                                displs[i - 1] + node_sizes[i - 1] * sendcnt;
+                            sendcnts[i] = node_sizes[i] * sendcnt;
+                        }
+                    }
+                    smpi_mpi_scatterv(sendbuf, sendcnts, displs,
+                                              sendtype, tmp_buf,
+                                              nbytes * local_size, MPI_BYTE,
+                                              leader_root, leader_comm);
+                }
+                if (leader_comm_rank == leader_root) { /* the arrays were allocated on this leader only */
+                    xbt_free(displs);
+                    xbt_free(sendcnts);
+                }
+            } else { /* uniform case: one regular scatter among the leaders */
+                if (leader_of_root != root) {
+                    mpi_errno =
+                        MPIR_Scatter_MV2_Binomial(leader_scatter_buf,
+                                                  nbytes * local_size, MPI_BYTE,
+                                                  tmp_buf, nbytes * local_size,
+                                                  MPI_BYTE, leader_root,
+                                                  leader_comm);
+                } else {
+                    mpi_errno =
+                        MPIR_Scatter_MV2_Binomial(sendbuf, sendcnt * local_size,
+                                                  sendtype, tmp_buf,
+                                                  nbytes * local_size, MPI_BYTE,
+                                                  leader_root, leader_comm);
+
+                }
+            }
+        }
+        /* The leaders are now done with the inter-leader part. Scatter the data
+         * within the nodes, rooted at local rank 0 of shmem_comm. An in-place
+         * root receives its chunk into sendbuf with the send count and type. */
+
+        if (rank == root && recvbuf == MPI_IN_PLACE) {
+            mpi_errno = MV2_Scatter_intra_function(tmp_buf, nbytes, MPI_BYTE,
+                                                (void *)sendbuf, sendcnt, sendtype,
+                                                0, shmem_comm);
+        } else {
+            mpi_errno = MV2_Scatter_intra_function(tmp_buf, nbytes, MPI_BYTE,
+                                                recvbuf, recvcnt, recvtype,
+                                                0, shmem_comm);
+        }
+
+    }
+
+
+    /* Release the temporary buffers that node leaders allocated on the
+     * two-level path */
+    if (comm_size != local_size && local_rank == 0) {
+        smpi_free_tmp_buffer(tmp_buf);
+        if (leader_of_root == rank && root != rank) {
+            smpi_free_tmp_buffer(leader_scatter_buf);
+        }
+    }
+
+    return (mpi_errno);
+}
+