X-Git-Url: http://info.iut-bm.univ-fcomte.fr/pub/gitweb/simgrid.git/blobdiff_plain/8a9af89e44c0f7f2f648d402e89a26799910ee31..40616078da72e823931c1fb884949054699ec39d:/src/smpi/colls/scatter/scatter-mvapich-two-level.cpp

diff --git a/src/smpi/colls/scatter/scatter-mvapich-two-level.cpp b/src/smpi/colls/scatter/scatter-mvapich-two-level.cpp
new file mode 100644
index 0000000000..e76d75625d
--- /dev/null
+++ b/src/smpi/colls/scatter/scatter-mvapich-two-level.cpp
@@ -0,0 +1,410 @@
+/* Copyright (c) 2013-2014. The SimGrid Team.
+ * All rights reserved. */
+
+/* This program is free software; you can redistribute it and/or modify it
+ * under the terms of the license (GNU LGPL) which comes with this package. */
+
+/*
+ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
+ *                         University Research and Technology
+ *                         Corporation. All rights reserved.
+ * Copyright (c) 2004-2009 The University of Tennessee and The University
+ *                         of Tennessee Research Foundation. All rights
+ *                         reserved.
+ * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
+ *                         University of Stuttgart. All rights reserved.
+ * Copyright (c) 2004-2005 The Regents of the University of California.
+ *                         All rights reserved.
+ *
+ * Additional copyrights may follow
+ */
+ /* -*- Mode: C; c-basic-offset:4 ; -*- */
+/* Copyright (c) 2001-2014, The Ohio State University. All rights
+ * reserved.
+ *
+ * This file is part of the MVAPICH2 software package developed by the
+ * team members of The Ohio State University's Network-Based Computing
+ * Laboratory (NBCL), headed by Professor Dhabaleswar K. (DK) Panda.
+ *
+ * For detailed copyright and licensing information, please refer to the
+ * copyright file COPYRIGHT in the top level MVAPICH2 directory.
+ */
+/*
+ *
+ *  (C) 2001 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+#include "../colls_private.h"
+
+#define MPIR_Scatter_MV2_Binomial smpi_coll_tuned_scatter_ompi_binomial
+#define MPIR_Scatter_MV2_Direct smpi_coll_tuned_scatter_ompi_basic_linear
+
+extern int (*MV2_Scatter_intra_function) (void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                                          void *recvbuf, int recvcount, MPI_Datatype recvtype,
+                                          int root, MPI_Comm comm);
+
+int smpi_coll_tuned_scatter_mvapich2_two_level_direct(void *sendbuf,
+                                                      int sendcnt,
+                                                      MPI_Datatype sendtype,
+                                                      void *recvbuf,
+                                                      int recvcnt,
+                                                      MPI_Datatype recvtype,
+                                                      int root, MPI_Comm comm)
+{
+    int comm_size, rank;
+    int local_rank, local_size;
+    int leader_comm_rank = -1, leader_comm_size = -1;
+    int mpi_errno = MPI_SUCCESS;
+    int recvtype_size, sendtype_size, nbytes;
+    void *tmp_buf = NULL;
+    void *leader_scatter_buf = NULL;
+    MPI_Status status;
+    int leader_root, leader_of_root = -1;
+    MPI_Comm shmem_comm, leader_comm;
+    //if not set (use of the algo directly, without mvapich2 selector)
+    if(MV2_Scatter_intra_function==NULL)
+      MV2_Scatter_intra_function=smpi_coll_tuned_scatter_mpich;
+
+    if(comm->get_leaders_comm()==MPI_COMM_NULL){
+      comm->init_smp();
+    }
+    comm_size = comm->size();
+    rank = comm->rank();
+
+    if (((rank == root) && (recvcnt == 0))
+        || ((rank != root) && (sendcnt == 0))) {
+        return MPI_SUCCESS;
+    }
+
+    /* extract the rank,size information for the intra-node
+     * communicator */
+    shmem_comm = comm->get_intra_comm();
+    local_rank = shmem_comm->rank();
+    local_size = shmem_comm->size();
+
+    if (local_rank == 0) {
+        /* Node leader. Extract the rank, size information for the leader
+         * communicator */
+        leader_comm = comm->get_leaders_comm();
+        leader_comm_size = leader_comm->size();
+        leader_comm_rank = leader_comm->rank();
+    }
+
+    if (local_size == comm_size) {
+        /* purely intra-node scatter. Just use the direct algorithm and we are done */
+        mpi_errno = MPIR_Scatter_MV2_Direct(sendbuf, sendcnt, sendtype,
+                                            recvbuf, recvcnt, recvtype,
+                                            root, comm);
+
+    } else {
+        recvtype_size=recvtype->size();
+        sendtype_size=sendtype->size();
+
+        if (rank == root) {
+            nbytes = sendcnt * sendtype_size;
+        } else {
+            nbytes = recvcnt * recvtype_size;
+        }
+
+        if (local_rank == 0) {
+            /* Node leader, allocate tmp_buffer */
+            tmp_buf = smpi_get_tmp_sendbuffer(nbytes * local_size);
+        }
+
+        leader_comm = comm->get_leaders_comm();
+        int* leaders_map = comm->get_leaders_map();
+        leader_of_root = comm->group()->rank(leaders_map[root]);
+        leader_root = leader_comm->group()->rank(leaders_map[root]);
+        /* leader_root is the rank of the leader of the root in leader_comm.
+         * leader_root is to be used as the root of the inter-leader gather ops
+         */
+
+        if ((local_rank == 0) && (root != rank)
+            && (leader_of_root == rank)) {
+            /* The root of the scatter operation is not the node leader. Recv
+             * data from the node leader */
+            leader_scatter_buf = smpi_get_tmp_sendbuffer(nbytes * comm_size);
+            Request::recv(leader_scatter_buf, nbytes * comm_size, MPI_BYTE,
+                          root, COLL_TAG_SCATTER, comm, &status);
+
+        }
+
+        if (rank == root && local_rank != 0) {
+            /* The root of the scatter operation is not the node leader. Send
+             * data to the node leader */
+            Request::send(sendbuf, sendcnt * comm_size, sendtype,
+                          leader_of_root, COLL_TAG_SCATTER, comm
+                          );
+        }
+
+        if (leader_comm_size > 1 && local_rank == 0) {
+            if (!comm->is_uniform()) {
+                int *displs = NULL;
+                int *sendcnts = NULL;
+                int *node_sizes;
+                int i = 0;
+                node_sizes = comm->get_non_uniform_map();
+
+                if (root != leader_of_root) {
+                    if (leader_comm_rank == leader_root) {
+                        displs = static_cast<int *>(xbt_malloc(sizeof (int) * leader_comm_size));
+                        sendcnts = static_cast<int *>(xbt_malloc(sizeof (int) * leader_comm_size));
+                        sendcnts[0] = node_sizes[0] * nbytes;
+                        displs[0] = 0;
+
+                        for (i = 1; i < leader_comm_size; i++) {
+                            displs[i] =
+                                displs[i - 1] + node_sizes[i - 1] * nbytes;
+                            sendcnts[i] = node_sizes[i] * nbytes;
+                        }
+                    }
+                    smpi_mpi_scatterv(leader_scatter_buf, sendcnts, displs,
+                                      MPI_BYTE, tmp_buf, nbytes * local_size,
+                                      MPI_BYTE, leader_root, leader_comm);
+                } else {
+                    if (leader_comm_rank == leader_root) {
+                        displs = static_cast<int *>(xbt_malloc(sizeof (int) * leader_comm_size));
+                        sendcnts = static_cast<int *>(xbt_malloc(sizeof (int) * leader_comm_size));
+                        sendcnts[0] = node_sizes[0] * sendcnt;
+                        displs[0] = 0;
+
+                        for (i = 1; i < leader_comm_size; i++) {
+                            displs[i] =
+                                displs[i - 1] + node_sizes[i - 1] * sendcnt;
+                            sendcnts[i] = node_sizes[i] * sendcnt;
+                        }
+                    }
+                    smpi_mpi_scatterv(sendbuf, sendcnts, displs,
+                                      sendtype, tmp_buf,
+                                      nbytes * local_size, MPI_BYTE,
+                                      leader_root, leader_comm);
+                }
+                if (leader_comm_rank == leader_root) {
+                    xbt_free(displs);
+                    xbt_free(sendcnts);
+                }
+            } else {
+                if (leader_of_root != root) {
+                    mpi_errno =
+                        MPIR_Scatter_MV2_Direct(leader_scatter_buf,
+                                                nbytes * local_size, MPI_BYTE,
+                                                tmp_buf, nbytes * local_size,
+                                                MPI_BYTE, leader_root,
+                                                leader_comm);
+                } else {
+                    mpi_errno =
+                        MPIR_Scatter_MV2_Direct(sendbuf, sendcnt * local_size,
+                                                sendtype, tmp_buf,
+                                                nbytes * local_size, MPI_BYTE,
+                                                leader_root, leader_comm);
+
+                }
+            }
+        }
+        /* The leaders are now done with the inter-leader part. Scatter the data within the nodes */
+
+        if (rank == root && recvbuf == MPI_IN_PLACE) {
+            mpi_errno = MV2_Scatter_intra_function(tmp_buf, nbytes, MPI_BYTE,
+                                                   (void *)sendbuf, sendcnt, sendtype,
+                                                   0, shmem_comm);
+        } else {
+            mpi_errno = MV2_Scatter_intra_function(tmp_buf, nbytes, MPI_BYTE,
+                                                   recvbuf, recvcnt, recvtype,
+                                                   0, shmem_comm);
+        }
+    }
+
+    /* check if multiple threads are calling this collective function */
+    if (comm_size != local_size && local_rank == 0) {
+        smpi_free_tmp_buffer(tmp_buf);
+        if (leader_of_root == rank && root != rank) {
+            smpi_free_tmp_buffer(leader_scatter_buf);
+        }
+    }
+    return (mpi_errno);
+}
+
+
+int smpi_coll_tuned_scatter_mvapich2_two_level_binomial(void *sendbuf,
+                                                        int sendcnt,
+                                                        MPI_Datatype sendtype,
+                                                        void *recvbuf,
+                                                        int recvcnt,
+                                                        MPI_Datatype recvtype,
+                                                        int root, MPI_Comm comm)
+{
+    int comm_size, rank;
+    int local_rank, local_size;
+    int leader_comm_rank = -1, leader_comm_size = -1;
+    int mpi_errno = MPI_SUCCESS;
+    int recvtype_size, sendtype_size, nbytes;
+    void *tmp_buf = NULL;
+    void *leader_scatter_buf = NULL;
+    MPI_Status status;
+    int leader_root = -1, leader_of_root = -1;
+    MPI_Comm shmem_comm, leader_comm;
+
+
+    //if not set (use of the algo directly, without mvapich2 selector)
+    if(MV2_Scatter_intra_function==NULL)
+      MV2_Scatter_intra_function=smpi_coll_tuned_scatter_mpich;
+
+    if(comm->get_leaders_comm()==MPI_COMM_NULL){
+      comm->init_smp();
+    }
+    comm_size = comm->size();
+    rank = comm->rank();
+
+    if (((rank == root) && (recvcnt == 0))
+        || ((rank != root) && (sendcnt == 0))) {
+        return MPI_SUCCESS;
+    }
+
+    /* extract the rank,size information for the intra-node
+     * communicator */
+    shmem_comm = comm->get_intra_comm();
+    local_rank = shmem_comm->rank();
+    local_size = shmem_comm->size();
+
+    if (local_rank == 0) {
+        /* Node leader. Extract the rank, size information for the leader
+         * communicator */
+        leader_comm = comm->get_leaders_comm();
+        leader_comm_size = leader_comm->size();
+        leader_comm_rank = leader_comm->rank();
+    }
+
+    if (local_size == comm_size) {
+        /* purely intra-node scatter. Just use the direct algorithm and we are done */
+        mpi_errno = MPIR_Scatter_MV2_Direct(sendbuf, sendcnt, sendtype,
+                                            recvbuf, recvcnt, recvtype,
+                                            root, comm);
+
+    } else {
+        recvtype_size=recvtype->size();
+        sendtype_size=sendtype->size();
+
+        if (rank == root) {
+            nbytes = sendcnt * sendtype_size;
+        } else {
+            nbytes = recvcnt * recvtype_size;
+        }
+
+        if (local_rank == 0) {
+            /* Node leader, allocate tmp_buffer */
+            tmp_buf = smpi_get_tmp_sendbuffer(nbytes * local_size);
+        }
+        leader_comm = comm->get_leaders_comm();
+        int* leaders_map = comm->get_leaders_map();
+        leader_of_root = comm->group()->rank(leaders_map[root]);
+        leader_root = leader_comm->group()->rank(leaders_map[root]);
+        /* leader_root is the rank of the leader of the root in leader_comm.
+         * leader_root is to be used as the root of the inter-leader gather ops
+         */
+
+        if ((local_rank == 0) && (root != rank)
+            && (leader_of_root == rank)) {
+            /* The root of the scatter operation is not the node leader. Recv
+             * data from the node leader */
+            leader_scatter_buf = smpi_get_tmp_sendbuffer(nbytes * comm_size);
+            Request::recv(leader_scatter_buf, nbytes * comm_size, MPI_BYTE,
+                          root, COLL_TAG_SCATTER, comm, &status);
+        }
+
+        if (rank == root && local_rank != 0) {
+            /* The root of the scatter operation is not the node leader. Send
+             * data to the node leader */
+            Request::send(sendbuf, sendcnt * comm_size, sendtype,
+                          leader_of_root, COLL_TAG_SCATTER, comm);
+        }
+
+        if (leader_comm_size > 1 && local_rank == 0) {
+            if (!comm->is_uniform()) {
+                int *displs = NULL;
+                int *sendcnts = NULL;
+                int *node_sizes;
+                int i = 0;
+                node_sizes = comm->get_non_uniform_map();
+
+                if (root != leader_of_root) {
+                    if (leader_comm_rank == leader_root) {
+                        displs = static_cast<int *>(xbt_malloc(sizeof (int) * leader_comm_size));
+                        sendcnts = static_cast<int *>(xbt_malloc(sizeof (int) * leader_comm_size));
+                        sendcnts[0] = node_sizes[0] * nbytes;
+                        displs[0] = 0;
+
+                        for (i = 1; i < leader_comm_size; i++) {
+                            displs[i] =
+                                displs[i - 1] + node_sizes[i - 1] * nbytes;
+                            sendcnts[i] = node_sizes[i] * nbytes;
+                        }
+                    }
+                    smpi_mpi_scatterv(leader_scatter_buf, sendcnts, displs,
+                                      MPI_BYTE, tmp_buf, nbytes * local_size,
+                                      MPI_BYTE, leader_root, leader_comm);
+                } else {
+                    if (leader_comm_rank == leader_root) {
+                        displs = static_cast<int *>(xbt_malloc(sizeof (int) * leader_comm_size));
+                        sendcnts = static_cast<int *>(xbt_malloc(sizeof (int) * leader_comm_size));
+                        sendcnts[0] = node_sizes[0] * sendcnt;
+                        displs[0] = 0;
+
+                        for (i = 1; i < leader_comm_size; i++) {
+                            displs[i] =
+                                displs[i - 1] + node_sizes[i - 1] * sendcnt;
+                            sendcnts[i] = node_sizes[i] * sendcnt;
+                        }
+                    }
+                    smpi_mpi_scatterv(sendbuf, sendcnts, displs,
+                                      sendtype, tmp_buf,
+                                      nbytes * local_size, MPI_BYTE,
+                                      leader_root, leader_comm);
+                }
+                if (leader_comm_rank == leader_root) {
+                    xbt_free(displs);
+                    xbt_free(sendcnts);
+                }
+            } else {
+                if (leader_of_root != root) {
+                    mpi_errno =
+                        MPIR_Scatter_MV2_Binomial(leader_scatter_buf,
+                                                  nbytes * local_size, MPI_BYTE,
+                                                  tmp_buf, nbytes * local_size,
+                                                  MPI_BYTE, leader_root,
+                                                  leader_comm);
+                } else {
+                    mpi_errno =
+                        MPIR_Scatter_MV2_Binomial(sendbuf, sendcnt * local_size,
+                                                  sendtype, tmp_buf,
+                                                  nbytes * local_size, MPI_BYTE,
+                                                  leader_root, leader_comm);
+
+                }
+            }
+        }
+        /* The leaders are now done with the inter-leader part. Scatter the data within the nodes */
+
+        if (rank == root && recvbuf == MPI_IN_PLACE) {
+            mpi_errno = MV2_Scatter_intra_function(tmp_buf, nbytes, MPI_BYTE,
+                                                   (void *)sendbuf, sendcnt, sendtype,
+                                                   0, shmem_comm);
+        } else {
+            mpi_errno = MV2_Scatter_intra_function(tmp_buf, nbytes, MPI_BYTE,
+                                                   recvbuf, recvcnt, recvtype,
+                                                   0, shmem_comm);
+        }
+
+    }
+
+
+    /* check if multiple threads are calling this collective function */
+    if (comm_size != local_size && local_rank == 0) {
+        smpi_free_tmp_buffer(tmp_buf);
+        if (leader_of_root == rank && root != rank) {
+            smpi_free_tmp_buffer(leader_scatter_buf);
+        }
+    }
+
+    return (mpi_errno);
+}
+