src/smpi/smpi_mpi_dt.c
src/smpi/smpi_pmpi.c
src/smpi/smpi_replay.c
+ #src/smpi/colls/allgather-2dmesh.c
+ #src/smpi/colls/allgather-3dmesh.c
+ #src/smpi/colls/allgather-bruck.c
+ src/smpi/colls/allgather-GB.c
+ src/smpi/colls/allgather-loosely-lr.c
+ src/smpi/colls/allgather-lr.c
+ src/smpi/colls/allgather-NTSLR.c
+ src/smpi/colls/allgather-NTSLR-NB.c
+ src/smpi/colls/allgather-pair.c
+ src/smpi/colls/allgather-rdb.c
+ src/smpi/colls/allgather-RDB.c
+ src/smpi/colls/allgather-rhv.c
+ src/smpi/colls/allgather-ring.c
+ src/smpi/colls/allgather-SMP-NTS.c
+ src/smpi/colls/allgather-smp-simple.c
+ src/smpi/colls/allgather-SMP-simple.c
+ src/smpi/colls/allgather-spreading-simple.c
+ src/smpi/colls/allreduce-lr.c
+ src/smpi/colls/allreduce-NTS.c
+ src/smpi/colls/allreduce-rab1.c
+ src/smpi/colls/allreduce-rab2.c
+ #src/smpi/colls/allreduce-rab-rdb.c
+ #src/smpi/colls/allreduce-rab-reduce-scatter.c
+ src/smpi/colls/allreduce-rab-rsag.c
+ src/smpi/colls/allreduce-rdb.c
+ src/smpi/colls/allreduce-redbcast.c
+ src/smpi/colls/allreduce-smp-binomial.c
+ #src/smpi/colls/allreduce-smp-binomial-pipeline.c
+ src/smpi/colls/allreduce-smp-rdb.c
+ src/smpi/colls/allreduce-smp-rsag.c
+ src/smpi/colls/allreduce-smp-rsag-lr.c
+ src/smpi/colls/allreduce-smp-rsag-rab.c
src/smpi/colls/alltoall-2dmesh.c
src/smpi/colls/alltoall-3dmesh.c
- # src/smpi/colls/alltoall-bruck.c
+ #src/smpi/colls/alltoall-bruck.c
src/smpi/colls/alltoall-pair.c
src/smpi/colls/alltoall-pair-light-barrier.c
src/smpi/colls/alltoall-pair-mpi-barrier.c
src/smpi/colls/alltoall-ring-mpi-barrier.c
src/smpi/colls/alltoall-ring-one-barrier.c
src/smpi/colls/alltoall-simple.c
+ src/smpi/colls/bcast-arrival-nb.c
+ src/smpi/colls/bcast-arrival-pattern-aware.c
+ src/smpi/colls/bcast-arrival-pattern-aware-wait.c
+ src/smpi/colls/bcast-arrival-scatter.c
+ src/smpi/colls/bcast-binomial-tree.c
+ src/smpi/colls/bcast-flattree.c
+ src/smpi/colls/bcast-flattree-pipeline.c
+ src/smpi/colls/bcast-NTSB.c
+ src/smpi/colls/bcast-NTSL.c
+ src/smpi/colls/bcast-NTSL-Isend.c
+ src/smpi/colls/bcast-scatter-LR-allgather.c
+ src/smpi/colls/bcast-scatter-rdb-allgather.c
+ src/smpi/colls/bcast-SMP-binary.c
+ src/smpi/colls/bcast-SMP-binomial.c
+ src/smpi/colls/bcast-SMP-linear.c
+ src/smpi/colls/bcast-TSB.c
+ src/smpi/colls/reduce-arrival-pattern-aware.c
+ src/smpi/colls/reduce-binomial.c
+ src/smpi/colls/reduce-flat-tree.c
+ src/smpi/colls/reduce-NTSL.c
+ src/smpi/colls/reduce-scatter-gather.c
+ src/smpi/colls/star-reduction.c
)
if(SMPI_F2C)
typedef struct mpi_coll_description {
const char *name;
const char *description;
- void* coll;
+ void *coll;
} s_mpi_coll_description_t, *mpi_coll_description_t;
+/** \ingroup MPI allgather
+ * \brief The list of all available allgather collectives
+ */
+XBT_PUBLIC_DATA(s_mpi_coll_description_t) mpi_coll_allgather_description[];
+XBT_PUBLIC_DATA(int (*mpi_coll_allgather_fun)
+ (void *, int, MPI_Datatype, void *, int, MPI_Datatype,
+ MPI_Comm));
+
+
+/** \ingroup MPI allreduce
+ * \brief The list of all available allreduce collectives
+ */
+XBT_PUBLIC_DATA(s_mpi_coll_description_t) mpi_coll_allreduce_description[];
+XBT_PUBLIC_DATA(int (*mpi_coll_allreduce_fun)
+ (void *sbuf, void *rbuf, int rcount, MPI_Datatype dtype,
+ MPI_Op op, MPI_Comm comm));
+
+
/** \ingroup MPI alltoallcollectives
* \brief The list of all available alltoall collectives
*/
XBT_PUBLIC_DATA(s_mpi_coll_description_t) mpi_coll_alltoall_description[];
-XBT_PUBLIC_DATA(int (*mpi_coll_alltoall_fun)(void *, int, MPI_Datatype, void*, int, MPI_Datatype, MPI_Comm));
+XBT_PUBLIC_DATA(int (*mpi_coll_alltoall_fun)
+ (void *, int, MPI_Datatype, void *, int, MPI_Datatype,
+ MPI_Comm));
-/** \ingroup MPI allgather
- * \brief The list of all available allgather collectives
+
+/** \ingroup MPI bcast
+ * \brief The list of all available bcast collectives
*/
-XBT_PUBLIC_DATA(s_mpi_coll_description_t) mpi_coll_allgather_description[];
-XBT_PUBLIC_DATA(int (*mpi_coll_allgather_fun)(void *, int, MPI_Datatype, void*, int, MPI_Datatype, MPI_Comm));
+XBT_PUBLIC_DATA(s_mpi_coll_description_t) mpi_coll_bcast_description[];
+XBT_PUBLIC_DATA(int (*mpi_coll_bcast_fun)
+ (void *buf, int count, MPI_Datatype datatype, int root,
+ MPI_Comm com));
+
-XBT_PUBLIC(void) coll_help(const char *category, s_mpi_coll_description_t * table);
+/** \ingroup MPI reduce
+ * \brief The list of all available reduce collectives
+ */
+XBT_PUBLIC_DATA(s_mpi_coll_description_t) mpi_coll_reduce_description[];
+XBT_PUBLIC_DATA(int (*mpi_coll_reduce_fun)
+ (void *buf, void *rbuf, int count, MPI_Datatype datatype,
+ MPI_Op op, int root, MPI_Comm comm));
+
+
+XBT_PUBLIC(void) coll_help(const char *category,
+ s_mpi_coll_description_t * table);
XBT_PUBLIC(int) find_coll_description(s_mpi_coll_description_t * table,
- const char *name);
+ const char *name);
+
#endif /* _SMPI_INTERFAC_H */
/* New Module missing */
find_coll_description(table, val);
}
+static void _sg_cfg_cb__coll_allgather(const char *name, int pos){ /* callback registered for the "smpi/allgather" option: hands name/pos to _sg_cfg_cb__coll together with the allgather description table */
+  _sg_cfg_cb__coll("allgather", mpi_coll_allgather_description, name, pos);
+}
+static void _sg_cfg_cb__coll_allreduce(const char *name, int pos)
+{ /* callback registered for the "smpi/allreduce" option: hands name/pos to _sg_cfg_cb__coll together with the allreduce description table */
+  _sg_cfg_cb__coll("allreduce", mpi_coll_allreduce_description, name, pos);
+}
static void _sg_cfg_cb__coll_alltoall(const char *name, int pos)
{ /* callback registered for the "smpi/alltoall" option: hands name/pos to _sg_cfg_cb__coll together with the alltoall description table */
  _sg_cfg_cb__coll("alltoall", mpi_coll_alltoall_description, name, pos);
}
-static void _sg_cfg_cb__coll_allgather(const char *name, int pos){
- _sg_cfg_cb__coll("allgather", mpi_coll_allgather_description, name, pos);
+static void _sg_cfg_cb__coll_bcast(const char *name, int pos)
+{ /* callback registered for the "smpi/bcast" option: hands name/pos to _sg_cfg_cb__coll together with the bcast description table */
+  _sg_cfg_cb__coll("bcast", mpi_coll_bcast_description, name, pos);
+}
+static void _sg_cfg_cb__coll_reduce(const char *name, int pos)
+{ /* callback registered for the "smpi/reduce" option: hands name/pos to _sg_cfg_cb__coll together with the reduce description table */
+  _sg_cfg_cb__coll("reduce", mpi_coll_reduce_description, name, pos);
}
NULL);
xbt_cfg_setdefault_string(_sg_cfg_set, "smpi/or", "1:0:0:0:0");
- default_value = xbt_strdup("ompi");
+ default_value = xbt_strdup("default");
+ xbt_cfg_register(&_sg_cfg_set, "smpi/allgather",
+ "Which collective to use for allgather",
+ xbt_cfgelm_string, &default_value, 1, 1, &_sg_cfg_cb__coll_allgather,
+ NULL);
+
+ default_value = xbt_strdup("default");
+ xbt_cfg_register(&_sg_cfg_set, "smpi/allreduce",
+ "Which collective to use for allreduce",
+ xbt_cfgelm_string, &default_value, 1, 1, &_sg_cfg_cb__coll_allreduce,
+ NULL);
+
+ default_value = xbt_strdup("ompi");
xbt_cfg_register(&_sg_cfg_set, "smpi/alltoall",
"Which collective to use for alltoall",
xbt_cfgelm_string, &default_value, 1, 1, &_sg_cfg_cb__coll_alltoall,
NULL);
- default_value = xbt_strdup("default");
- xbt_cfg_register(&_sg_cfg_set, "smpi/allgather",
- "Which collective to use for allgather",
- xbt_cfgelm_string, &default_value, 1, 1, &_sg_cfg_cb__coll_allgather,
+
+ default_value = xbt_strdup("default");
+ xbt_cfg_register(&_sg_cfg_set, "smpi/bcast",
+ "Which collective to use for bcast",
+ xbt_cfgelm_string, &default_value, 1, 1, &_sg_cfg_cb__coll_bcast,
+ NULL);
+
+
+ default_value = xbt_strdup("default");
+ xbt_cfg_register(&_sg_cfg_set, "smpi/reduce",
+ "Which collective to use for reduce",
+ xbt_cfgelm_string, &default_value, 1, 1, &_sg_cfg_cb__coll_reduce,
NULL);
//END SMPI
-Copyright (c) 2006, Ahmad Faraj & Xin Yuan,
-All rights reserved.
+Copyright(c) 2006, Ahmad Faraj & Xin Yuan, All rights reserved.Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
- * Redistributions of source code must retain the above copyright notice,
+*Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
-
- * Redistributions in binary form must reproduce the above copyright notice,
+ * Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
- and/or other materials provided with the distribution.
-
- * Neither the name of the Florida State University nor the names of its
+ and / or other materials provided with the distribution.
+ * Neither the name of the Florida State University nor the names of its
contributors may be used to endorse or promote products derived from this
- software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
-ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
-ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
- ***************************************************************************
- * Any results obtained from executing this software require the *
- * acknowledgment and citation of the software and its owners. *
- * The full citation is given below: *
- * *
- * A. Faraj, X. Yuan, and D. Lowenthal. "STAR-MPI: Self Tuned Adaptive *
- * Routines for MPI Collective Operations." The 20th ACM International *
- * Conference on Supercomputing (ICS), Queensland, Australia *
- * June 28-July 1, 2006. *
- ***************************************************************************
-
+ software without specific prior written permission.THIS SOFTWARE IS PROVIDED
+ BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+ WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ PURPOSE ARE DISCLAIMED.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY
+ DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
+ OR CONSEQUENTIAL DAMAGES(INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS;
+ OR BUSINESS INTERRUPTION)
+HOWEVER CAUSED AND ON
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ ***************************************************************************
+ *Any results obtained from executing this software require the *
+ *acknowledgment and citation of the software and its owners. *
+ *The full citation is given below:****A.Faraj, X.Yuan, and D.Lowenthal."STAR-MPI: Self Tuned Adaptive *
+ * Routines for MPI Collective Operations." The 20 th ACM International * *Conference on Supercomputing(ICS), Queensland, Australia * *June 28 - July 1, 2006. * ***************************************************************************
--- /dev/null
+#include "colls.h"
+
+/*****************************************************************************
+
+Copyright (c) 2006, Ahmad Faraj & Xin Yuan,
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+
+ * Neither the name of the Florida State University nor the names of its
+ contributors may be used to endorse or promote products derived from this
+ software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ *************************************************************************
+ * Any results obtained from executing this software require the *
+ * acknowledgment and citation of the software and its owners. *
+ * The full citation is given below: *
+ * *
+ * A. Faraj and X. Yuan. "Automatic Generation and Tuning of MPI *
+ * Collective Communication Routines." The 19th ACM International *
+ * Conference on Supercomputing (ICS), Cambridge, Massachusetts, *
+ * June 20-22, 2005. *
+ *************************************************************************
+
+*****************************************************************************/
+
+/*****************************************************************************
+
+ * Function: is_2dmesh
+
+ * Return: int
+
+ * Inputs:
+ num: the number of processors in a communicator
+ i: x dimension
+ j: y dimension
+
+ * Descp: takes a number and tries to find a factoring of x, y mesh out of it
+
+ * Author: Ahmad Faraj
+ ****************************************************************************/
+#ifndef TWOD
+#define TWOD
+static int is_2dmesh(int num, int *i, int *j)
+{
+  /* Looks for a factoring num == (*i) * (*j) with *i <= *j.
+   * Returns 1 and fills *i / *j when one is found; returns 0 and leaves
+   * *i / *j untouched otherwise. */
+  int x, max;
+
+  /* Nothing to factor for an empty or negative size; this guard also keeps
+     the modulo below from ever dividing by zero. */
+  if (num < 1)
+    return 0;
+
+  max = num / 2;
+
+  /* x = floor(sqrt(num)), computed with integers only so the starting point
+     does not depend on floating-point rounding or on <math.h>. */
+  x = 1;
+  while (x + 1 <= num / (x + 1))
+    x++;
+
+  /* Scan upwards from the square root; the first divisor found is the one
+     closest to a square mesh. */
+  while (x <= max) {
+    if ((num % x) == 0) {
+      *i = x;
+      *j = num / x;
+
+      /* report the dimensions in ascending order */
+      if (*i > *j) {
+        x = *i;
+        *i = *j;
+        *j = x;
+      }
+
+      return 1;
+    }
+    x++;
+  }
+  return 0;
+}
+#endif
+/*****************************************************************************
+ * Function: smpi_coll_tuned_allgather_2dmesh
+ * return: int
+ * send_buff: send input buffer
+ * send_count: number of elements to send
+ * send_type: data type of elements being sent
+ * recv_buff: receive output buffer
+ * recv_count: number of elements to be received
+ * recv_type: data type of elements being received
+ * comm: communicator
+ * Descrp: Function realizes the allgather operation using the 2dmesh
+ * algorithm. Allgather communication occurs first in the x dimension then in
+ * the y dimension.  The communication in each dimension follows
+ * "simple"
+ * Author: Ahmad Faraj
+****************************************************************************/
+int
+smpi_coll_tuned_allgather_2dmesh(void *send_buff, int send_count, MPI_Datatype
+                                 send_type, void *recv_buff, int recv_count,
+                                 MPI_Datatype recv_type, MPI_Comm comm)
+{
+  /* Two-phase allgather on a logical X x Y mesh (ranks laid out row by row,
+   * Y ranks per row): every rank first exchanges its block with the other
+   * ranks of its row, then exchanges the whole gathered row with the other
+   * ranks of its column.  Returns 0; exits the process if the request array
+   * cannot be allocated. */
+  MPI_Request *req, *req_ptr;
+  MPI_Aint extent;
+
+  int i, src, dst, rank, num_procs;
+  int X, Y, send_offset, recv_offset;
+  int my_row_base, my_col_base, src_row_base, block_size, num_reqs;
+  int success = 0;
+  int failure = 1;
+  int tag = 1;
+
+  MPI_Comm_rank(comm, &rank);
+  MPI_Comm_size(comm, &num_procs);
+
+  MPI_Type_extent(send_type, &extent);
+
+  // size in bytes of one rank's contribution
+  block_size = extent * send_count;
+
+  // is_2dmesh() leaves X and Y untouched when it finds no factoring (e.g.
+  // for a prime number of processes).  Fall back to a single row of
+  // num_procs ranks, the same degenerate 1 x N shape is_2dmesh() already
+  // reports for 2 and 3 processes, instead of using uninitialized values.
+  if (!is_2dmesh(num_procs, &X, &Y)) {
+    X = 1;
+    Y = num_procs;
+  }
+
+  my_row_base = (rank / Y) * Y;   // first rank of my row
+  my_col_base = rank % Y;         // my column index, i.e. first rank of my column
+
+  // one request array is reused by both phases: size it for the larger one
+  num_reqs = X;
+  if (Y > X)
+    num_reqs = Y;
+
+  req = (MPI_Request *) malloc(num_reqs * sizeof(MPI_Request));
+  if (!req) {
+    printf("allgather-2dmesh-shoot.c:85: cannot allocate memory\n");
+    MPI_Finalize();
+    exit(failure);
+  }
+
+  req_ptr = req;
+
+  // do local allgather/local copy 
+  recv_offset = rank * block_size;
+  MPIR_Localcopy(send_buff, send_count, send_type, (char *)recv_buff + recv_offset,
+                 recv_count, recv_type);
+
+  // do row-wise comm: post every receive first, then send to each row peer
+  for (i = 0; i < Y; i++) {
+    src = i + my_row_base;
+    if (src == rank)
+      continue;
+    recv_offset = src * block_size;
+    MPIC_Irecv((char *)recv_buff + recv_offset, recv_count, recv_type, src, tag,
+               comm, req_ptr++);
+  }
+
+
+  for (i = 0; i < Y; i++) {
+    dst = i + my_row_base;
+    if (dst == rank)
+      continue;
+    MPIC_Send(send_buff, send_count, send_type, dst, tag, comm);
+  }
+
+  MPI_Waitall(Y - 1, req, MPI_STATUSES_IGNORE);
+
+  req_ptr = req;
+
+  // do colwise comm: each message now carries a complete row (Y blocks)
+  for (i = 0; i < X; i++) {
+    src = (i * Y + my_col_base);
+    if (src == rank)
+      continue;
+    src_row_base = (src / Y) * Y;
+    recv_offset = src_row_base * block_size;
+    MPIC_Irecv((char *)recv_buff + recv_offset, recv_count * Y, recv_type, src, tag,
+               comm, req_ptr++);
+  }
+
+  for (i = 0; i < X; i++) {
+    dst = (i * Y + my_col_base);
+    if (dst == rank)
+      continue;
+    send_offset = my_row_base * block_size;
+    MPIC_Send((char *)recv_buff + send_offset, send_count * Y, send_type, dst, tag,
+              comm);
+  }
+
+  MPI_Waitall(X - 1, req, MPI_STATUSES_IGNORE);
+
+  free(req);
+
+  return success;
+}
--- /dev/null
+#include "colls.h"
+
+/*****************************************************************************
+
+Copyright (c) 2006, Ahmad Faraj & Xin Yuan,
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+
+ * Neither the name of the Florida State University nor the names of its
+ contributors may be used to endorse or promote products derived from this
+ software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ *************************************************************************
+ * Any results obtained from executing this software require the *
+ * acknowledgment and citation of the software and its owners. *
+ * The full citation is given below: *
+ * *
+ * A. Faraj and X. Yuan. "Automatic Generation and Tuning of MPI *
+ * Collective Communication Routines." The 19th ACM International *
+ * Conference on Supercomputing (ICS), Cambridge, Massachusetts, *
+ * June 20-22, 2005. *
+ *************************************************************************
+
+*****************************************************************************/
+
+/*****************************************************************************
+ * Function: is_3dmesh
+ * return: int
+ * num: the number of processors in a communicator
+ * i: x dimension
+ * j: y dimension
+ * k: z dimension
+ * descp: takes a number and tries to find a factoring of x*y*z mesh out of it
+ ****************************************************************************/
+#ifndef THREED
+#define THREED
+static int is_3dmesh(int num, int *i, int *j, int *k)
+{
+  /* Looks for a factoring num == x * x * z.  On success returns 1 with
+   * *i == *j == x and *k == z; on failure returns 0 with all three outputs
+   * set to 0. */
+  int x, max;
+
+  *i = *j = *k = 0;
+
+  /* Nothing to factor for an empty or negative size; this guard also keeps
+     the modulo below from ever dividing by zero. */
+  if (num < 1)
+    return 0;
+
+  max = num / 3;
+
+  /* x = floor(cbrt(num)), computed with integers only: a cbrt() result that
+     is a hair below the exact root would otherwise truncate to the wrong
+     starting point (e.g. 8 processes reported as 1 x 1 x 8). */
+  x = 1;
+  while ((x + 1) * (x + 1) <= num / (x + 1))
+    x++;
+
+  /* Once x * x exceeds num no further candidate can divide it, so stopping
+     at x <= num / x changes no result and keeps x * x from overflowing. */
+  while (x <= max && x <= num / x) {
+    if ((num % (x * x)) == 0) {
+      *i = *j = x;
+      *k = num / (x * x);
+      return 1;
+    }
+    x++;
+  }
+  return 0;
+}
+#endif
+/*****************************************************************************
+ * Function: smpi_coll_tuned_allgather_3dmesh
+ * return: int
+ * send_buff: send input buffer
+ * send_count: number of elements to send
+ * send_type: data type of elements being sent
+ * recv_buff: receive output buffer
+ * recv_count: number of elements to be received
+ * recv_type: data type of elements being received
+ * comm: communicator
+ * Descrp: Function realizes the allgather operation using the 3dmesh
+ * algorithm. Allgather communication occurs first in the x dimension, y
+ * dimension, and then in the z dimension. Communication in each dimension
+ * follows "simple"
+ * Author: Ahmad Faraj
+****************************************************************************/
+int smpi_coll_tuned_allgather_3dmesh(void *send_buff, int send_count,
+                                     MPI_Datatype send_type, void *recv_buff,
+                                     int recv_count, MPI_Datatype recv_type,
+                                     MPI_Comm comm)
+{
+  /* Three-phase allgather on a logical X x Y x Z mesh (X == Y): exchange
+   * single blocks along the row, whole rows along the column, then whole
+   * X*Y planes along the z dimension.  Returns 0; exits the process if the
+   * request array cannot be allocated. */
+  MPI_Request *req, *req_ptr;
+  MPI_Aint extent;
+
+  int i, src, dst, rank, num_procs, block_size, my_z_base;
+  int my_z, X, Y, Z, send_offset, recv_offset;
+  int two_dsize, my_row_base, my_col_base, src_row_base, src_z_base, num_reqs;
+  int success = 0;
+  int failure = 1;
+  int tag = 1;
+
+  MPI_Comm_rank(comm, &rank);
+  MPI_Comm_size(comm, &num_procs);
+  MPI_Type_extent(send_type, &extent);
+
+  // is_3dmesh() reports X = Y = Z = 0 when it finds no factoring, which
+  // would make two_dsize 0 and the divisions below fault.  Fall back to a
+  // single column of num_procs planes, the same degenerate 1 x 1 x N shape
+  // is_3dmesh() already reports for small process counts.
+  if (!is_3dmesh(num_procs, &X, &Y, &Z)) {
+    X = Y = 1;
+    Z = num_procs;
+  }
+
+  // one request array is reused by the three phases: size it for the
+  // largest dimension
+  num_reqs = X;
+
+  if (Y > num_reqs)
+    num_reqs = Y;
+  if (Z > num_reqs)
+    num_reqs = Z;
+
+  two_dsize = X * Y;              // number of ranks in one z plane
+  my_z = rank / two_dsize;        // index of my plane
+
+  my_row_base = (rank / X) * X;   // first rank of my row
+  my_col_base = (rank % Y) + (my_z * two_dsize);  // first rank of my column
+  my_z_base = my_z * two_dsize;   // first rank of my plane
+
+  // size in bytes of one rank's contribution
+  block_size = extent * send_count;
+
+  req = (MPI_Request *) malloc(num_reqs * sizeof(MPI_Request));
+  if (!req) {
+    printf("allgather-3dmesh-shoot.c:85: cannot allocate memory\n");
+    MPI_Finalize();
+    exit(failure);
+  }
+
+  req_ptr = req;
+
+  // do local allgather/local copy 
+  recv_offset = rank * block_size;
+  MPIR_Localcopy(send_buff, send_count, send_type, (char *)recv_buff + recv_offset,
+                 recv_count, recv_type);
+
+  // do rowwise comm: post every receive first, then send to each row peer.
+  // The receive is described with recv_count/recv_type like every other
+  // receive in this function.
+  for (i = 0; i < Y; i++) {
+    src = i + my_row_base;
+    if (src == rank)
+      continue;
+    recv_offset = src * block_size;
+    MPIC_Irecv((char *)recv_buff + recv_offset, recv_count, recv_type, src, tag,
+               comm, req_ptr++);
+  }
+
+  for (i = 0; i < Y; i++) {
+    dst = i + my_row_base;
+    if (dst == rank)
+      continue;
+    MPIC_Send(send_buff, send_count, send_type, dst, tag, comm);
+  }
+
+  MPI_Waitall(Y - 1, req, MPI_STATUSES_IGNORE);
+  req_ptr = req;
+
+  // do colwise comm, it does not matter here if i*X or i *Y since X == Y
+  // each message now carries a complete row (Y blocks)
+
+  for (i = 0; i < X; i++) {
+    src = (i * Y + my_col_base);
+    if (src == rank)
+      continue;
+
+    src_row_base = (src / X) * X;
+    recv_offset = src_row_base * block_size;
+    MPIC_Irecv((char *)recv_buff + recv_offset, recv_count * Y, recv_type, src, tag,
+               comm, req_ptr++);
+  }
+
+  send_offset = my_row_base * block_size;
+
+  for (i = 0; i < X; i++) {
+    dst = (i * Y + my_col_base);
+    if (dst == rank)
+      continue;
+    MPIC_Send((char *)recv_buff + send_offset, send_count * Y, send_type, dst, tag,
+              comm);
+  }
+
+  MPI_Waitall(X - 1, req, MPI_STATUSES_IGNORE);
+  req_ptr = req;
+
+  // do z-wise comm: each message carries a complete plane (two_dsize blocks)
+  for (i = 1; i < Z; i++) {
+    src = (rank + i * two_dsize) % num_procs;
+    src_z_base = (src / two_dsize) * two_dsize;
+
+    recv_offset = (src_z_base * block_size);
+
+    MPIC_Irecv((char *)recv_buff + recv_offset, recv_count * two_dsize, recv_type,
+               src, tag, comm, req_ptr++);
+  }
+
+  for (i = 1; i < Z; i++) {
+    dst = (rank + i * two_dsize) % num_procs;
+    send_offset = my_z_base * block_size;
+    MPIC_Send((char *)recv_buff + send_offset, send_count * two_dsize, send_type,
+              dst, tag, comm);
+  }
+  MPI_Waitall(Z - 1, req, MPI_STATUSES_IGNORE);
+
+  free(req);
+
+  return success;
+}
--- /dev/null
+#include "colls.h"
+
+// Allgather - gather/bcast algorithm
+int smpi_coll_tuned_allgather_GB(void *send_buff, int send_count,
+                                 MPI_Datatype send_type, void *recv_buff,
+                                 int recv_count, MPI_Datatype recv_type,
+                                 MPI_Comm comm)
+{
+  /* Two collective calls on rank 0: gather every contribution there, then
+   * broadcast the assembled buffer to everybody.  Always returns
+   * MPI_SUCCESS. */
+  const int root = 0;
+  int comm_size;
+
+  MPI_Comm_size(comm, &comm_size);
+
+  /* phase 1: the root collects one block per rank into recv_buff */
+  MPI_Gather(send_buff, send_count, send_type,
+             recv_buff, recv_count, recv_type, root, comm);
+
+  /* phase 2: the root ships all comm_size blocks back out */
+  int total_count = recv_count * comm_size;
+  MPI_Bcast(recv_buff, total_count, recv_type, root, comm);
+
+  return MPI_SUCCESS;
+}
--- /dev/null
+#include "colls.h"
+
+// Allgather-Non-Topology-Specific-Logical-Ring algorithm (non-blocking variant)
+int
+smpi_coll_tuned_allgather_NTSLR_NB(void *sbuf, int scount, MPI_Datatype stype,
+                                   void *rbuf, int rcount, MPI_Datatype rtype,
+                                   MPI_Comm comm)
+{
+  /* Logical-ring allgather built on non-blocking calls: every rank posts
+   * all its receives from its left neighbour up front, then in step i
+   * forwards to its right neighbour the block it obtained in step i - 1
+   * (its own block in step 0).  Returns MPI_SUCCESS, or the result of
+   * MPI_Allgather when the send and receive sizes differ. */
+  MPI_Aint rextent, sextent;
+  MPI_Status status, status2;
+  int i, to, from, rank, size;
+  int send_offset, recv_offset;
+  int tag = 500;
+  MPI_Request *rrequest_array;
+  MPI_Request *srequest_array;
+
+  MPI_Comm_rank(comm, &rank);
+  MPI_Comm_size(comm, &size);
+  MPI_Type_extent(rtype, &rextent);
+  MPI_Type_extent(stype, &sextent);
+
+  // irregular case: use the default MPI function and stop there.  Running
+  // the ring exchange afterwards would redo the whole operation with
+  // offsets that assume equal send and receive block sizes.
+  if (scount * sextent != rcount * rextent)
+    return MPI_Allgather(sbuf, scount, stype, rbuf, rcount, rtype, comm);
+
+  // allocated only once we know the ring is used, so the early return above
+  // cannot leak them
+  rrequest_array = (MPI_Request *) malloc(size * sizeof(MPI_Request));
+  srequest_array = (MPI_Request *) malloc(size * sizeof(MPI_Request));
+
+  // topo non-specific
+  to = (rank + 1) % size;
+  from = (rank + size - 1) % size;
+
+  //copy a single segment from sbuf to rbuf
+  send_offset = rank * scount * sextent;
+
+  MPI_Sendrecv(sbuf, scount, stype, rank, tag,
+               (char *)rbuf + send_offset, rcount, rtype, rank, tag, comm, &status);
+
+
+  //start sending logical ring message
+  int increment = scount * sextent;
+
+  //post all irecv first
+  for (i = 0; i < size - 1; i++) {
+    recv_offset = ((rank - i - 1 + size) % size) * increment;
+    MPI_Irecv((char *)rbuf + recv_offset, rcount, rtype, from, tag + i, comm,
+              &rrequest_array[i]);
+  }
+
+
+  // step i sends the block received in step i - 1, so its receive must be
+  // complete before the next iteration starts
+  for (i = 0; i < size - 1; i++) {
+    send_offset = ((rank - i + size) % size) * increment;
+    MPI_Isend((char *)rbuf + send_offset, scount, stype, to, tag + i, comm,
+              &srequest_array[i]);
+    MPI_Wait(&rrequest_array[i], &status);
+    MPI_Wait(&srequest_array[i], &status2);
+  }
+
+  free(rrequest_array);
+  free(srequest_array);
+
+  return MPI_SUCCESS;
+}
--- /dev/null
+#include "colls.h"
+
+// Allgather-Non-Topology-Specific-Logical-Ring algorithm
+int
+smpi_coll_tuned_allgather_NTSLR(void *sbuf, int scount, MPI_Datatype stype,
+                                void *rbuf, int rcount, MPI_Datatype rtype,
+                                MPI_Comm comm)
+{
+  /* Logical-ring allgather: in step i every rank sends to its right
+   * neighbour the block it obtained in step i - 1 (its own block in step 0)
+   * while receiving the next block from its left neighbour.  Returns
+   * MPI_SUCCESS, or the result of MPI_Allgather when the send and receive
+   * sizes differ. */
+  MPI_Aint rextent, sextent;
+  MPI_Status status;
+  int i, to, from, rank, size;
+  int send_offset, recv_offset;
+  int tag = 500;
+
+  MPI_Comm_rank(comm, &rank);
+  MPI_Comm_size(comm, &size);
+  MPI_Type_extent(rtype, &rextent);
+  MPI_Type_extent(stype, &sextent);
+
+  // irregular case: use the default MPI function and stop there.  Running
+  // the ring exchange afterwards would redo the whole operation with
+  // offsets that assume equal send and receive block sizes.
+  if (scount * sextent != rcount * rextent)
+    return MPI_Allgather(sbuf, scount, stype, rbuf, rcount, rtype, comm);
+
+  // topo non-specific
+  to = (rank + 1) % size;
+  from = (rank + size - 1) % size;
+
+  //copy a single segment from sbuf to rbuf
+  send_offset = rank * scount * sextent;
+
+  MPI_Sendrecv(sbuf, scount, stype, rank, tag,
+               (char *)rbuf + send_offset, rcount, rtype, rank, tag,
+               comm, &status);
+
+
+  //start sending logical ring message
+  int increment = scount * sextent;
+  for (i = 0; i < size - 1; i++) {
+    send_offset = ((rank - i + size) % size) * increment;
+    recv_offset = ((rank - i - 1 + size) % size) * increment;
+    MPI_Sendrecv((char *) rbuf + send_offset, scount, stype, to, tag + i,
+                 (char *) rbuf + recv_offset, rcount, rtype, from, tag + i,
+                 comm, &status);
+  }
+
+  return MPI_SUCCESS;
+}
--- /dev/null
+#include "colls.h"
+
+int
+smpi_coll_tuned_allgather_RDB(void *sbuf, int send_count,
+                              MPI_Datatype send_type,
+                              void *rbuf, int recv_count,
+                              MPI_Datatype recv_type,
+                              MPI_Comm comm)
+{
+  /* Allgather by pairwise exchange: in round i each rank swaps what it has
+   * gathered so far with partner rank ^ (1 << i) through MPI_Sendrecv, so
+   * the amount of gathered data grows every round (RDB presumably stands
+   * for recursive doubling -- TODO confirm).  When a partner's block range
+   * reaches past num_procs, the inner while loop passes the missing data on
+   * with plain MPI_Send / MPI_Recv.  Everything is assembled in rbuf; sbuf
+   * is only read for the initial local copy.  Returns 0.
+   *
+   * MPI variables */
+  MPI_Status status;
+  MPI_Aint send_chunk, recv_chunk;
+
+  // local int variables
+  int i, j, k, dst, rank, num_procs, send_offset, recv_offset, tree_root;
+  int dst_tree_root, rank_tree_root, last_recv_count, num_procs_completed;
+  int offset, tmp_mask;
+  int tag = 1;
+  int mask = 1;
+  int success = 0;
+  int curr_count = recv_count;  // elements gathered so far, starts with our own block
+
+  // byte-wise views of the two buffers, used for pointer arithmetic
+  char *send_ptr = (char *) sbuf;
+  char *recv_ptr = (char *) rbuf;
+
+  // get size of the communicator, followed by rank 
+  MPI_Comm_size(comm, &num_procs);
+  MPI_Comm_rank(comm, &rank);
+
+  // get size of single element's type for send buffer and recv buffer
+  MPI_Type_extent(send_type, &send_chunk);
+  MPI_Type_extent(recv_type, &recv_chunk);
+
+  // multiply size of each element by number of elements to send or recv
+  send_chunk *= send_count;
+  recv_chunk *= recv_count;
+
+  // perform a local copy: a send-to-self places our block at slot `rank` of rbuf
+  MPI_Sendrecv(send_ptr, send_count, send_type, rank, tag,
+               recv_ptr + rank * recv_chunk, recv_count, recv_type, rank, tag,
+               comm, &status);
+
+  i = 0;  // round counter; mask is shifted left once per round, so mask == 1 << i
+  while (mask < num_procs) {
+    dst = rank ^ mask;  // partner of this round: rank with bit i flipped
+    dst_tree_root = dst >> i;  // clearing the low i bits gives the first rank of dst's group
+    dst_tree_root <<= i;
+    rank_tree_root = rank >> i;  // same for our own group
+    rank_tree_root <<= i;
+    send_offset = rank_tree_root * send_chunk;  // NOTE(review): offset into recv_ptr but scaled by send_chunk -- assumes send_chunk == recv_chunk, confirm
+    recv_offset = dst_tree_root * recv_chunk;
+
+    if (dst < num_procs) {
+      MPI_Sendrecv(recv_ptr + send_offset, curr_count, send_type, dst,
+                   tag, recv_ptr + recv_offset, mask * recv_count,
+                   recv_type, dst, tag, comm, &status);
+      MPI_Get_count(&status, recv_type, &last_recv_count);  // what the partner actually delivered
+      curr_count += last_recv_count;
+    }
+
+    if (dst_tree_root + mask > num_procs) {  // partner group reaches past the last rank
+      num_procs_completed = num_procs - rank_tree_root - mask;
+      /* num_procs_completed is the number of processes in this
+         subtree that have all the data. Send data to others
+         in a tree fashion. First find root of current tree
+         that is being divided into two. k is the number of
+         least-significant bits in this process's rank that
+         must be zeroed out to find the rank of the root */
+
+      j = mask;
+      k = 0;
+      while (j) {
+        j >>= 1;
+        k++;
+      }
+      k--;
+
+      offset = recv_chunk * (rank_tree_root + mask);
+      tmp_mask = mask >> 1;
+
+      while (tmp_mask) {
+        dst = rank ^ tmp_mask;
+
+        tree_root = rank >> k;
+        tree_root <<= k;
+
+        /* send only if this proc has data and destination
+           doesn't have data. at any step, multiple processes
+           can send if they have the data */
+        if ((dst > rank)
+            && (rank < tree_root + num_procs_completed)
+            && (dst >= tree_root + num_procs_completed)) {
+          MPI_Send(recv_ptr + offset, last_recv_count, recv_type, dst,
+                   tag, comm);  // NOTE(review): last_recv_count has no initializer; verify a receive always precedes this send
+
+          /* last_recv_cnt was set in the previous
+             receive. that's the amount of data to be
+             sent now. */
+        }
+        /* recv only if this proc. doesn't have data and sender
+           has data */
+        else if ((dst < rank)
+                 && (dst < tree_root + num_procs_completed)
+                 && (rank >= tree_root + num_procs_completed)) {
+          MPI_Recv(recv_ptr + offset,
+                   recv_count * num_procs_completed,
+                   recv_type, dst, tag, comm, &status);
+          // num_procs_completed is also equal to the no. of processes
+          // whose data we don't have
+          MPI_Get_count(&status, recv_type, &last_recv_count);
+          curr_count += last_recv_count;
+        }
+        tmp_mask >>= 1;
+        k--;
+      }
+    }
+
+    mask <<= 1;
+    i++;
+  }
+
+  return success;
+}
--- /dev/null
+#include "colls.h"
+#ifndef NUM_CORE
+#define NUM_CORE 8
+#endif
+
+/* Number of ranks hosted by SMP node `node` when `comm_size` ranks are packed
+ * NUM_CORE per node; only the last node can be partially filled. */
+static int smp_nts_node_size(int node, int comm_size)
+{
+  int remaining = comm_size - node * NUM_CORE;
+  return (remaining < NUM_CORE) ? remaining : NUM_CORE;
+}
+
+/*
+ * Allgather for clusters of SMP nodes, logical-ring variant.
+ *
+ * Ranks are grouped into nodes of NUM_CORE consecutive ranks.  Phase 1
+ * exchanges the contributions between all ranks of a node.  In phase 2 the
+ * first rank of each node (its root) passes whole node blocks around a
+ * logical ring of roots, and every block a root receives is relayed inside
+ * the node along the chain rank -> rank + 1.
+ *
+ * sbuf/scount/stype: local contribution.
+ * rbuf/rcount/rtype: result buffer, one chunk of rcount elements per rank.
+ * Returns MPI_SUCCESS, or the result of MPI_Allgather when the default
+ * implementation is used instead.
+ */
+int smpi_coll_tuned_allgather_SMP_NTS(void *sbuf, int scount,
+                                      MPI_Datatype stype, void *rbuf,
+                                      int rcount, MPI_Datatype rtype,
+                                      MPI_Comm comm)
+{
+  /* capacity of the on-stack request arrays below */
+  enum { MAX_PENDING_REQUESTS = 128 };
+
+  int src, dst, comm_size, rank;
+  MPI_Comm_size(comm, &comm_size);
+  MPI_Comm_rank(comm, &rank);
+  /* rbuf is laid out in units of rtype, so only its extent is needed */
+  MPI_Aint rextent;
+  MPI_Type_extent(rtype, &rextent);
+  int tag = 50;
+  MPI_Request request;
+  MPI_Request rrequest_array[MAX_PENDING_REQUESTS];
+  MPI_Request srequest_array[MAX_PENDING_REQUESTS];
+  /* intra-node forwards (rank -> rank + 1) that are still in flight */
+  MPI_Request fwd_request_array[MAX_PENDING_REQUESTS];
+  int fwd_count = 0;
+
+  MPI_Status status;
+  int i, block, block_count, send_offset, recv_offset;
+  int intra_rank = rank % NUM_CORE;
+  int inter_rank = rank / NUM_CORE;
+  int inter_comm_size = (comm_size + NUM_CORE - 1) / NUM_CORE;
+  /* every rank obtains one block per other node during phase 2 */
+  int num_remote_blocks = inter_comm_size - 1;
+  // the last SMP node may have fewer number of running processes than all others
+  int num_core_in_current_smp = smp_nts_node_size(inter_rank, comm_size);
+
+  /* for too small number of processes, or for more nodes than the request
+     arrays can track, use default implementation */
+  if (comm_size <= NUM_CORE || num_remote_blocks > MAX_PENDING_REQUESTS) {
+    return MPI_Allgather(sbuf, scount, stype, rbuf, rcount, rtype, comm);
+  }
+
+  //copy corresponding message from sbuf to rbuf
+  recv_offset = rank * rextent * rcount;
+  MPI_Sendrecv(sbuf, scount, stype, rank, tag,
+               ((char *) rbuf + recv_offset), rcount, rtype, rank, tag, comm,
+               &status);
+
+  // INTRA-SMP-ALLGATHER: exchange the contribution with every node-mate
+  for (i = 1; i < num_core_in_current_smp; i++) {
+    dst = (inter_rank * NUM_CORE) +
+        (intra_rank + i) % (num_core_in_current_smp);
+    src = (inter_rank * NUM_CORE) +
+        (intra_rank - i + num_core_in_current_smp) % (num_core_in_current_smp);
+    recv_offset = src * rextent * rcount;
+
+    MPI_Sendrecv(sbuf, scount, stype, dst, tag,
+                 ((char *) rbuf + recv_offset), rcount, rtype, src, tag, comm,
+                 &status);
+  }
+
+  // INTER-SMP-ALLGATHER
+  // Every root of each SMP node post INTER-Sendrecv, then do INTRA-Bcast for each receiving message
+  // Use logical ring algorithm
+
+  // root of each SMP
+  if (intra_rank == 0) {
+    src = ((inter_rank - 1 + inter_comm_size) % inter_comm_size) * NUM_CORE;
+    dst = ((inter_rank + 1) % inter_comm_size) * NUM_CORE;
+
+    // post all inter Irecv; block sizes follow the node each block comes from
+    for (i = 0; i < num_remote_blocks; i++) {
+      block = (inter_rank - i - 1 + inter_comm_size) % inter_comm_size;
+      block_count = rcount * smp_nts_node_size(block, comm_size);
+      recv_offset = block * NUM_CORE * rextent * rcount;
+      MPI_Irecv((char *) rbuf + recv_offset, block_count, rtype, src,
+                tag + i, comm, &rrequest_array[i]);
+    }
+
+    // send first message: the block of this node
+    send_offset = inter_rank * NUM_CORE * rextent * rcount;
+    MPI_Isend((char *) rbuf + send_offset, rcount * num_core_in_current_smp,
+              rtype, dst, tag, comm, &srequest_array[0]);
+
+    // loop : recv-inter , send-inter, send-intra (linear-bcast)
+    for (i = 0; i < num_remote_blocks; i++) {
+      block = (inter_rank - i - 1 + inter_comm_size) % inter_comm_size;
+      block_count = rcount * smp_nts_node_size(block, comm_size);
+      recv_offset = block * NUM_CORE * rextent * rcount;
+      MPI_Wait(&rrequest_array[i], &status);
+      // the last block received has already visited every other node
+      if (i < num_remote_blocks - 1) {
+        MPI_Isend((char *) rbuf + recv_offset, block_count, rtype, dst,
+                  tag + i + 1, comm, &srequest_array[i + 1]);
+      }
+      if (num_core_in_current_smp > 1) {
+        MPI_Isend((char *) rbuf + recv_offset, block_count, rtype,
+                  (rank + 1), tag + i + 1, comm,
+                  &fwd_request_array[fwd_count++]);
+      }
+    }
+
+    // exactly num_remote_blocks ring sends were posted (slots 0 .. n-1)
+    MPI_Waitall(num_remote_blocks, srequest_array, MPI_STATUSES_IGNORE);
+  }
+  // other ranks of each SMP: receive from rank - 1, relay to rank + 1
+  else {
+    // the last rank of the node has nobody left to relay to
+    int is_last_in_smp = (intra_rank == (num_core_in_current_smp - 1));
+
+    for (i = 0; i < num_remote_blocks; i++) {
+      block = (inter_rank - i - 1 + inter_comm_size) % inter_comm_size;
+      block_count = rcount * smp_nts_node_size(block, comm_size);
+      recv_offset = block * NUM_CORE * rextent * rcount;
+      MPI_Irecv((char *) rbuf + recv_offset, block_count, rtype,
+                rank - 1, tag + i + 1, comm, &request);
+      MPI_Wait(&request, &status);
+      if (!is_last_in_smp) {
+        MPI_Isend((char *) rbuf + recv_offset, block_count, rtype,
+                  (rank + 1), tag + i + 1, comm,
+                  &fwd_request_array[fwd_count++]);
+      }
+    }
+  }
+
+  /* complete the intra-node forwards so that no request is abandoned and
+     rbuf is no longer read by a pending send when we return */
+  if (fwd_count > 0) {
+    MPI_Waitall(fwd_count, fwd_request_array, MPI_STATUSES_IGNORE);
+  }
+
+  return MPI_SUCCESS;
+}
--- /dev/null
+#include "colls.h"
+#ifndef NUM_CORE
+#define NUM_CORE 8
+#endif
+
+/*
+ * Allgather for clusters of SMP nodes, three-phase variant.
+ *
+ * Ranks are grouped into nodes of NUM_CORE consecutive ranks.
+ *   1. all ranks of a node exchange their contributions;
+ *   2. the first rank of each node (its root) exchanges whole node blocks
+ *      with every other root;
+ *   3. each root sends the complete result to the other ranks of its node.
+ *
+ * send_buf/scount/stype: local contribution.
+ * recv_buf/rcount/rtype: result buffer, one chunk of rcount elements per
+ * rank.  Always returns MPI_SUCCESS.
+ */
+int smpi_coll_tuned_allgather_SMP_simple(void *send_buf, int scount,
+                                         MPI_Datatype stype, void *recv_buf,
+                                         int rcount, MPI_Datatype rtype,
+                                         MPI_Comm comm)
+{
+  int src, dst, comm_size, rank;
+  MPI_Comm_size(comm, &comm_size);
+  MPI_Comm_rank(comm, &rank);
+  /* recv_buf is laid out in units of rtype, so only its extent is needed */
+  MPI_Aint rextent;
+  MPI_Type_extent(rtype, &rextent);
+  int tag = 50;
+  MPI_Status status;
+  int i, send_offset, recv_offset;
+  int num_core = NUM_CORE;
+  int intra_rank = rank % num_core;
+  int inter_rank = rank / num_core;
+  int inter_comm_size = (comm_size + num_core - 1) / num_core;
+  int last_node = inter_comm_size - 1;
+  // the last SMP node may have fewer number of running processes than all others
+  int num_core_in_last_smp = comm_size - last_node * num_core;
+  int num_core_in_current_smp =
+      (inter_rank == last_node) ? num_core_in_last_smp : num_core;
+
+  //INTRA-SMP-ALLGATHER
+  recv_offset = rank * rextent * rcount;
+  MPI_Sendrecv(send_buf, scount, stype, rank, tag,
+               ((char *) recv_buf + recv_offset), rcount, rtype, rank, tag,
+               comm, &status);
+  for (i = 1; i < num_core_in_current_smp; i++) {
+    dst = (inter_rank * num_core) +
+        (intra_rank + i) % (num_core_in_current_smp);
+    src = (inter_rank * num_core) +
+        (intra_rank - i + num_core_in_current_smp) % (num_core_in_current_smp);
+    recv_offset = src * rextent * rcount;
+
+    MPI_Sendrecv(send_buf, scount, stype, dst, tag,
+                 ((char *) recv_buf + recv_offset), rcount, rtype, src, tag,
+                 comm, &status);
+  }
+
+  // INTER-SMP-ALLGATHER
+  // Every root of each SMP node exchanges its node block with all other roots
+  if (intra_rank == 0 && inter_comm_size > 1) {
+    int num_req = (inter_comm_size - 1) * 2;
+    MPI_Request *reqs = (MPI_Request *) malloc(num_req * sizeof(MPI_Request));
+    /* a root only owns the chunks of the ranks actually present on its node */
+    int own_count = rcount * num_core_in_current_smp;
+    int src_node, src_count;
+
+    send_offset = rank * rextent * rcount;
+
+    if (reqs != NULL) {
+      MPI_Request *req_ptr = reqs;
+
+      for (i = 1; i < inter_comm_size; i++) {
+        src_node = (inter_rank - i + inter_comm_size) % inter_comm_size;
+        src_count =
+            rcount * ((src_node == last_node) ? num_core_in_last_smp : num_core);
+        src = src_node * num_core;
+        recv_offset = src * rextent * rcount;
+        MPI_Irecv(((char *) recv_buf + recv_offset), src_count, rtype,
+                  src, tag, comm, req_ptr++);
+      }
+      for (i = 1; i < inter_comm_size; i++) {
+        dst = ((inter_rank + i) % inter_comm_size) * num_core;
+        MPI_Isend(((char *) recv_buf + send_offset), own_count, rtype,
+                  dst, tag, comm, req_ptr++);
+      }
+      MPI_Waitall(num_req, reqs, MPI_STATUSES_IGNORE);
+      free(reqs);
+    } else {
+      /* no memory for the request array: perform the same exchange one
+         pair at a time with blocking calls instead of failing */
+      for (i = 1; i < inter_comm_size; i++) {
+        src_node = (inter_rank - i + inter_comm_size) % inter_comm_size;
+        src_count =
+            rcount * ((src_node == last_node) ? num_core_in_last_smp : num_core);
+        src = src_node * num_core;
+        dst = ((inter_rank + i) % inter_comm_size) * num_core;
+        recv_offset = src * rextent * rcount;
+        MPI_Sendrecv(((char *) recv_buf + send_offset), own_count, rtype,
+                     dst, tag,
+                     ((char *) recv_buf + recv_offset), src_count, rtype,
+                     src, tag, comm, &status);
+      }
+    }
+  }
+
+  //INTRA-BCAST (use flat tree)
+  if (intra_rank == 0) {
+    for (i = 1; i < num_core_in_current_smp; i++) {
+      MPI_Send(recv_buf, (rcount * comm_size), rtype, (rank + i), tag, comm);
+    }
+  } else {
+    MPI_Recv(recv_buf, (rcount * comm_size), rtype, (inter_rank * num_core),
+             tag, comm, &status);
+  }
+
+  return MPI_SUCCESS;
+}
--- /dev/null
+#include "colls.h"
+
+/*****************************************************************************
+
+Copyright (c) 2006, Ahmad Faraj & Xin Yuan,
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+
+ * Neither the name of the Florida State University nor the names of its
+ contributors may be used to endorse or promote products derived from this
+ software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ *************************************************************************
+ * Any results obtained from executing this software require the *
+ * acknowledgment and citation of the software and its owners. *
+ * The full citation is given below: *
+ * *
+ * A. Faraj and X. Yuan. "Automatic Generation and Tuning of MPI *
+ * Collective Communication Routines." The 19th ACM International *
+ * Conference on Supercomputing (ICS), Cambridge, Massachusetts, *
+ * June 20-22, 2005. *
+ *************************************************************************
+
+*****************************************************************************/
+
+
+/*****************************************************************************
+ * Function: allgather_bruck
+ * return: int
+ * inputs:
+ * send_buff: send input buffer
+ * send_count: number of elements to send
+ * send_type: data type of elements being sent
+ * recv_buff: receive output buffer
+ * recv_count: number of elements to receive
+ * recv_type: data type of elements being received
+ * comm: communication
+ * Descrp: Function realizes the allgather operation using the bruck
+ * algorithm.
+ * Author: MPICH
+ * Comment: Original bruck algorithm from MPICH is slightly modified by
+ * Ahmad Faraj.
+ ****************************************************************************/
+int smpi_coll_tuned_allgather_bruck(void *send_buff, int send_count,
+                                    MPI_Datatype send_type, void *recv_buff,
+                                    int recv_count, MPI_Datatype recv_type,
+                                    MPI_Comm comm)
+{
+  // MPI variables
+  MPI_Status status;
+  MPI_Aint recv_extent;
+
+  // local int variables
+  int src, dst, rank, num_procs, count, remainder;
+  int tag = 1;
+  int pof2 = 1;
+  int success = 0;
+
+  // local string variables
+  char *tmp_buff;
+  char *send_ptr = (char *) send_buff;
+  char *recv_ptr = (char *) recv_buff;
+
+  // get size of the communicator, followed by rank
+  MPI_Comm_size(comm, &num_procs);
+  MPI_Comm_rank(comm, &rank);
+
+  // get size of single element's type for recv buffer
+  MPI_Type_extent(recv_type, &recv_extent);
+
+  count = recv_count;
+
+  // staging buffer able to hold the contribution of every process
+  tmp_buff = (char *) malloc(num_procs * recv_count * recv_extent);
+  if (!tmp_buff) {
+    // report on stderr and leave with a failure status, not exit(0)
+    fprintf(stderr, "allgather-bruck:54: cannot allocate memory\n");
+    MPI_Finalize();
+    exit(1);
+  }
+  // perform a local copy: own contribution goes to the front of tmp_buff
+  MPIR_Localcopy(send_ptr, send_count, send_type, tmp_buff, recv_count,
+                 recv_type);
+
+  // doubling steps: send everything gathered so far to rank - pof2 and
+  // append what rank + pof2 has gathered
+  while (pof2 <= (num_procs / 2)) {
+    src = (rank + pof2) % num_procs;
+    dst = (rank - pof2 + num_procs) % num_procs;
+
+    MPIC_Sendrecv(tmp_buff, count, recv_type, dst, tag,
+                  tmp_buff + count * recv_extent, count, recv_type,
+                  src, tag, comm, &status);
+    count *= 2;
+    pof2 *= 2;
+  }
+
+  // one partial step for the processes beyond the largest power of two
+  remainder = num_procs - pof2;
+  if (remainder) {
+    src = (rank + pof2) % num_procs;
+    dst = (rank - pof2 + num_procs) % num_procs;
+
+    MPIC_Sendrecv(tmp_buff, remainder * recv_count, recv_type, dst, tag,
+                  tmp_buff + count * recv_extent, remainder * recv_count,
+                  recv_type, src, tag, comm, &status);
+  }
+
+  // tmp_buff starts with this rank's own chunk: rotate while copying out.
+  // The first (num_procs - rank) chunks land at chunk index rank ...
+  MPIC_Sendrecv(tmp_buff, (num_procs - rank) * recv_count, recv_type, rank,
+                tag, recv_ptr + rank * recv_count * recv_extent,
+                (num_procs - rank) * recv_count, recv_type, rank, tag, comm,
+                &status);
+
+  // ... and the remaining rank chunks go to the start of recv_buff
+  if (rank) {
+    MPIC_Sendrecv(tmp_buff + (num_procs - rank) * recv_count * recv_extent,
+                  rank * recv_count, recv_type, rank, tag, recv_ptr,
+                  rank * recv_count, recv_type, rank, tag, comm, &status);
+  }
+  free(tmp_buff);
+  return success;
+}
+
+/*#include "ompi_bindings.h"
+
+int ompi_coll_tuned_alltoall_intra_pairwise(void *sbuf, int scount,
+ MPI_Datatype sdtype,
+ void* rbuf, int rcount,
+ MPI_Datatype rdtype,
+ MPI_Comm comm)
+{
+ int line = -1, err = 0;
+ int rank, size, step;
+ int sendto, recvfrom;
+ void * tmpsend, *tmprecv;
+ ptrdiff_t lb, sext, rext;
+
+ size = ompi_comm_size(comm);
+ rank = ompi_comm_rank(comm);
+
+ OPAL_OUTPUT((ompi_coll_tuned_stream,
+ "coll:tuned:alltoall_intra_pairwise rank %d", rank));
+
+ err = ompi_datatype_get_extent (sdtype, &lb, &sext);
+ if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
+ err = ompi_datatype_get_extent (rdtype, &lb, &rext);
+ if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
+
+
+ // Perform pairwise exchange - starting from 1 so the local copy is last
+ for (step = 1; step < size + 1; step++) {
+
+ // Determine sender and receiver for this step.
+ sendto = (rank + step) % size;
+ recvfrom = (rank + size - step) % size;
+
+ // Determine sending and receiving locations
+ tmpsend = (char*)sbuf + sendto * sext * scount;
+ tmprecv = (char*)rbuf + recvfrom * rext * rcount;
+
+ // send and receive
+ err = ompi_coll_tuned_sendrecv( tmpsend, scount, sdtype, sendto,
+ MCA_COLL_BASE_TAG_ALLTOALL,
+ tmprecv, rcount, rdtype, recvfrom,
+ MCA_COLL_BASE_TAG_ALLTOALL,
+ comm, MPI_STATUS_IGNORE, rank);
+ if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
+ }
+
+ return MPI_SUCCESS;
+
+ err_hndl:
+ OPAL_OUTPUT((ompi_coll_tuned_stream,
+ "%s:%4d\tError occurred %d, rank %2d", __FILE__, line,
+ err, rank));
+ return err;
+}
+*/
--- /dev/null
+#include "colls.h"
+
+#ifndef NUM_CORE
+#define NUM_CORE 4
+#endif
+
+/*
+ * Allgather for clusters of SMP nodes, "loosely coupled logical ring".
+ *
+ * Ranks are grouped into nodes of NUM_CORE consecutive ranks.  In round i
+ * every rank
+ *   - sends to rank + NUM_CORE the chunk it holds for its own intra-node
+ *     position in node (inter_rank - i), and receives from rank - NUM_CORE
+ *     the corresponding chunk of node (inter_rank - 1 - i);
+ *   - exchanges the chunks of node (inter_rank - i) with every other rank
+ *     of its own node.
+ * Only the inter-node receive is waited for inside a round; all other
+ * requests are completed at the end.
+ *
+ * sbuf/scount/stype: local contribution.
+ * rbuf/rcount/rtype: result buffer, one chunk of rcount elements per rank.
+ * Returns MPI_SUCCESS, or the result of MPI_Allgather when the default
+ * implementation is used instead.
+ */
+int smpi_coll_tuned_allgather_loosely_lr(void *sbuf, int scount,
+                                         MPI_Datatype stype, void *rbuf,
+                                         int rcount, MPI_Datatype rtype,
+                                         MPI_Comm comm)
+{
+  /* capacity of the on-stack request arrays below */
+  enum { MAX_PENDING_REQUESTS = 128 };
+
+  int comm_size, rank;
+  int tag = 50;
+  int i, j, send_offset, recv_offset;
+  int intra_rank, inter_rank, inter_comm_size, intra_comm_size;
+  int inter_dst, inter_src;
+
+  MPI_Comm_size(comm, &comm_size);
+  MPI_Comm_rank(comm, &rank);
+  /* rbuf is laid out in units of rtype, so only its extent is needed */
+  MPI_Aint rextent;
+  MPI_Type_extent(rtype, &rextent);
+  MPI_Request inter_rrequest;
+  MPI_Request rrequest_array[MAX_PENDING_REQUESTS];
+  MPI_Request srequest_array[MAX_PENDING_REQUESTS];
+  MPI_Request inter_srequest_array[MAX_PENDING_REQUESTS];
+
+  int rrequest_count = 0;
+  int srequest_count = 0;
+  int inter_srequest_count = 0;
+
+  MPI_Status status;
+
+  intra_rank = rank % NUM_CORE;
+  inter_rank = rank / NUM_CORE;
+  inter_comm_size = (comm_size + NUM_CORE - 1) / NUM_CORE;
+  intra_comm_size = NUM_CORE;
+
+  /* The peer computation below assumes that every node hosts exactly
+     NUM_CORE ranks, and each round queues NUM_CORE - 1 intra-node requests
+     per array: use the default implementation when either the layout is
+     irregular or the request arrays would overflow. */
+  if (comm_size % NUM_CORE != 0
+      || inter_comm_size * (NUM_CORE - 1) > MAX_PENDING_REQUESTS
+      || inter_comm_size - 1 > MAX_PENDING_REQUESTS) {
+    return MPI_Allgather(sbuf, scount, stype, rbuf, rcount, rtype, comm);
+  }
+
+  int src_seg, dst_seg;
+
+  //copy corresponding message from sbuf to rbuf
+  recv_offset = rank * rextent * rcount;
+  MPI_Sendrecv(sbuf, scount, stype, rank, tag,
+               (char *)rbuf + recv_offset, rcount, rtype, rank, tag, comm, &status);
+
+  int dst, src;
+  int inter_send_offset, inter_recv_offset;
+
+  // ring neighbours: same intra-node position on the next / previous node
+  inter_dst = (rank + intra_comm_size) % comm_size;
+  inter_src = (rank - intra_comm_size + comm_size) % comm_size;
+
+  for (i = 0; i < inter_comm_size; i++) {
+
+    // inter_communication
+    src_seg =
+        ((inter_rank - 1 - i +
+          inter_comm_size) % inter_comm_size) * intra_comm_size + intra_rank;
+    dst_seg =
+        ((inter_rank - i +
+          inter_comm_size) % inter_comm_size) * intra_comm_size + intra_rank;
+
+    inter_send_offset = dst_seg * rextent * rcount;
+    inter_recv_offset = src_seg * rextent * rcount;
+
+    for (j = 0; j < intra_comm_size; j++) {
+
+      // inter communication, posted when the intra loop reaches our own slot;
+      // nothing is left to exchange between nodes in the last round
+      if (intra_rank == j) {
+        if (i != inter_comm_size - 1) {
+
+          MPI_Irecv((char *)rbuf + inter_recv_offset, rcount, rtype, inter_src, tag,
+                    comm, &inter_rrequest);
+          MPI_Isend((char *)rbuf + inter_send_offset, rcount, rtype, inter_dst, tag,
+                    comm, &inter_srequest_array[inter_srequest_count++]);
+
+        }
+      }
+      //intra_communication
+      src = inter_rank * intra_comm_size + j;
+      dst = inter_rank * intra_comm_size + j;
+
+      src_seg =
+          ((inter_rank - i +
+            inter_comm_size) % inter_comm_size) * intra_comm_size + j;
+      dst_seg =
+          ((inter_rank - i +
+            inter_comm_size) % inter_comm_size) * intra_comm_size + intra_rank;
+
+      send_offset = dst_seg * rextent * rcount;
+      recv_offset = src_seg * rextent * rcount;
+
+      if (j != intra_rank) {
+
+        MPI_Irecv((char *)rbuf + recv_offset, rcount, rtype, src, tag, comm,
+                  &rrequest_array[rrequest_count++]);
+        MPI_Isend((char *)rbuf + send_offset, rcount, rtype, dst, tag, comm,
+                  &srequest_array[srequest_count++]);
+
+      }
+    }                           // intra loop
+
+    // the chunk received from the previous node is sent on in the next
+    // round, so it must have arrived before that round starts
+    if (i != inter_comm_size - 1) {
+      MPI_Wait(&inter_rrequest, &status);
+    }
+
+  }                             //inter loop
+
+  MPI_Waitall(rrequest_count, rrequest_array, MPI_STATUSES_IGNORE);
+  MPI_Waitall(srequest_count, srequest_array, MPI_STATUSES_IGNORE);
+  MPI_Waitall(inter_srequest_count, inter_srequest_array, MPI_STATUSES_IGNORE);
+
+  return MPI_SUCCESS;
+}
--- /dev/null
+#include "colls.h"
+
+// Allgather-Non-Topology-Specific-Logical-Ring algorithm
+/*
+ * Allgather on a logical ring: in each of the size - 1 steps every rank
+ * sends one chunk to rank + 1 and receives one chunk from rank - 1.
+ *
+ * sbuf/scount/stype: local contribution.
+ * rbuf/rcount/rtype: result buffer, one chunk of rcount elements per rank.
+ * Returns MPI_SUCCESS, or the result of MPI_Allgather when send and receive
+ * chunks do not span the same number of bytes.
+ */
+int
+smpi_coll_tuned_allgather_lr(void *sbuf, int scount, MPI_Datatype stype,
+                             void *rbuf, int rcount, MPI_Datatype rtype,
+                             MPI_Comm comm)
+{
+  MPI_Aint rextent, sextent;
+  MPI_Status status;
+  int i, to, from, rank, size;
+  int send_offset, recv_offset;
+  int tag = 500;
+
+  MPI_Comm_rank(comm, &rank);
+  MPI_Comm_size(comm, &size);
+  MPI_Type_extent(rtype, &rextent);
+  MPI_Type_extent(stype, &sextent);
+
+  // irregular case: use the default MPI function and stop there, otherwise
+  // the ring below would run a second time over the same buffers
+  if (scount * sextent != rcount * rextent) {
+    return MPI_Allgather(sbuf, scount, stype, rbuf, rcount, rtype, comm);
+  }
+
+  // topo non-specific
+  to = (rank + 1) % size;
+  from = (rank + size - 1) % size;
+
+  //copy a single segment from sbuf to rbuf
+  send_offset = rank * scount * sextent;
+  MPI_Sendrecv(sbuf, scount, stype, rank, tag,
+               (char *) rbuf + send_offset, rcount, rtype, rank, tag,
+               comm, &status);
+
+  //start sending logical ring message: step i forwards the chunk obtained in
+  //step i - 1 (the own chunk for i == 0)
+  int increment = scount * sextent;
+  for (i = 0; i < size - 1; i++) {
+    send_offset = ((rank - i + size) % size) * increment;
+    recv_offset = ((rank - i - 1 + size) % size) * increment;
+    MPI_Sendrecv((char *) rbuf + send_offset, scount, stype, to, tag + i,
+                 (char *) rbuf + recv_offset, rcount, rtype, from, tag + i,
+                 comm, &status);
+  }
+
+  return MPI_SUCCESS;
+}
--- /dev/null
+#include "colls.h"
+
+/*****************************************************************************
+
+Copyright (c) 2006, Ahmad Faraj & Xin Yuan,
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+
+ * Neither the name of the Florida State University nor the names of its
+ contributors may be used to endorse or promote products derived from this
+ software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ *************************************************************************
+ * Any results obtained from executing this software require the *
+ * acknowledgment and citation of the software and its owners. *
+ * The full citation is given below: *
+ * *
+ * A. Faraj and X. Yuan. "Automatic Generation and Tuning of MPI *
+ * Collective Communication Routines." The 19th ACM International *
+ * Conference on Supercomputing (ICS), Cambridge, Massachusetts, *
+ * June 20-22, 2005. *
+ *************************************************************************
+
+*****************************************************************************/
+
+/*****************************************************************************
+ * Function: allgather_pair
+ * return: int
+ * inputs:
+ * send_buff: send input buffer
+ * send_count: number of elements to send
+ * send_type: data type of elements being sent
+ * recv_buff: receive output buffer
+ * recv_count: number of elements to receive
+ * recv_type: data type of elements being received
+ * comm: communication
+ * Descrp: Function works when P is power of two. In each phase of P - 1
+ * phases, nodes in pair communicate their data.
+ * Author: Ahmad Faraj
+ ****************************************************************************/
+int
+smpi_coll_tuned_allgather_pair(void *send_buff, int send_count,
+                               MPI_Datatype send_type, void *recv_buff,
+                               int recv_count, MPI_Datatype recv_type,
+                               MPI_Comm comm)
+{
+
+  MPI_Aint extent;
+  int i, src, dst, rank, num_procs;
+  int tag = 1;
+  MPI_Status status;
+
+  char *send_ptr = (char *) send_buff;
+  char *recv_ptr = (char *) recv_buff;
+
+  MPI_Comm_rank(comm, &rank);
+  MPI_Comm_size(comm, &num_procs);
+  // chunks are stored in recv_buff, so the stride is given by recv_type
+  MPI_Type_extent(recv_type, &extent);
+
+  // the partner rank ^ i only exists for every i when the number of
+  // processes is a power of two; otherwise use the default implementation
+  if ((num_procs & (num_procs - 1)) != 0) {
+    return MPI_Allgather(send_buff, send_count, send_type, recv_buff,
+                         recv_count, recv_type, comm);
+  }
+
+  // local send/recv
+  MPI_Sendrecv(send_ptr, send_count, send_type, rank, tag,
+               recv_ptr + rank * recv_count * extent,
+               recv_count, recv_type, rank, tag, comm, &status);
+
+  // in phase i, exchange the own contribution with partner rank ^ i
+  for (i = 1; i < num_procs; i++) {
+    src = dst = rank ^ i;
+    MPI_Sendrecv(send_ptr, send_count, send_type, dst, tag,
+                 recv_ptr + src * recv_count * extent, recv_count, recv_type,
+                 src, tag, comm, &status);
+  }
+
+  return MPI_SUCCESS;
+}
--- /dev/null
+#include "colls.h"
+
+/*
+ * Recursive-doubling allgather.
+ *
+ * In the step with distance `mask`, each rank exchanges everything it has
+ * gathered so far with rank ^ mask.  When that partner does not exist (the
+ * number of processes is not a power of two), the ranks of the subtree that
+ * did receive the data pass it on to the others in a tree fashion.
+ *
+ * sbuf/send_count/send_type: local contribution.
+ * rbuf/recv_count/recv_type: result buffer, one chunk of recv_count elements
+ * per rank.  Returns 0.
+ */
+int
+smpi_coll_tuned_allgather_rdb(void *sbuf, int send_count,
+                              MPI_Datatype send_type, void *rbuf,
+                              int recv_count, MPI_Datatype recv_type,
+                              MPI_Comm comm)
+{
+  // MPI variables
+  MPI_Status status;
+  MPI_Aint recv_chunk;
+
+  // local int variables
+  int i, j, k, dst, rank, num_procs, send_offset, recv_offset, tree_root;
+  int dst_tree_root, rank_tree_root, num_procs_completed;
+  int offset, tmp_mask;
+  int tag = 1;
+  int mask = 1;
+  int success = 0;
+  int curr_count = recv_count;
+  // starts at 0 so that it is never read uninitialised by the MPI_Send below
+  int last_recv_count = 0;
+
+  // local string variables
+  char *send_ptr = (char *) sbuf;
+  char *recv_ptr = (char *) rbuf;
+
+  // get size of the communicator, followed by rank
+  MPI_Comm_size(comm, &num_procs);
+  MPI_Comm_rank(comm, &rank);
+
+  // size in bytes of one chunk of the recv buffer; after the local copy all
+  // data is exchanged out of rbuf, so only the recv type is relevant
+  MPI_Type_extent(recv_type, &recv_chunk);
+  recv_chunk *= recv_count;
+
+  // perform a local copy
+  MPI_Sendrecv(send_ptr, send_count, send_type, rank, tag,
+               recv_ptr + rank * recv_chunk, recv_count, recv_type, rank, tag,
+               comm, &status);
+
+  i = 0;
+  while (mask < num_procs) {
+    dst = rank ^ mask;
+    // clear the i low bits: first rank of the block each side has gathered
+    dst_tree_root = dst >> i;
+    dst_tree_root <<= i;
+    rank_tree_root = rank >> i;
+    rank_tree_root <<= i;
+    send_offset = rank_tree_root * recv_chunk;
+    recv_offset = dst_tree_root * recv_chunk;
+
+    if (dst < num_procs) {
+      // curr_count is counted in recv_type elements (see MPI_Get_count)
+      MPI_Sendrecv(recv_ptr + send_offset, curr_count, recv_type, dst,
+                   tag, recv_ptr + recv_offset, mask * recv_count,
+                   recv_type, dst, tag, comm, &status);
+      MPI_Get_count(&status, recv_type, &last_recv_count);
+      curr_count += last_recv_count;
+    }
+
+    if (dst_tree_root + mask > num_procs) {
+      num_procs_completed = num_procs - rank_tree_root - mask;
+      /* num_procs_completed is the number of processes in this
+         subtree that have all the data. Send data to others
+         in a tree fashion. First find root of current tree
+         that is being divided into two. k is the number of
+         least-significant bits in this process's rank that
+         must be zeroed out to find the rank of the root */
+
+      j = mask;
+      k = 0;
+      while (j) {
+        j >>= 1;
+        k++;
+      }
+      k--;
+
+      offset = recv_chunk * (rank_tree_root + mask);
+      tmp_mask = mask >> 1;
+
+      while (tmp_mask) {
+        dst = rank ^ tmp_mask;
+
+        tree_root = rank >> k;
+        tree_root <<= k;
+
+        /* send only if this proc has data and destination
+           doesn't have data. at any step, multiple processes
+           can send if they have the data */
+        if ((dst > rank)
+            && (rank < tree_root + num_procs_completed)
+            && (dst >= tree_root + num_procs_completed)) {
+          /* last_recv_count was set in the previous
+             receive. that's the amount of data to be
+             sent now. */
+          MPI_Send(recv_ptr + offset, last_recv_count, recv_type, dst,
+                   tag, comm);
+        }
+        /* recv only if this proc. doesn't have data and sender
+           has data */
+        else if ((dst < rank)
+                 && (dst < tree_root + num_procs_completed)
+                 && (rank >= tree_root + num_procs_completed)) {
+          MPI_Recv(recv_ptr + offset,
+                   recv_count * num_procs_completed,
+                   recv_type, dst, tag, comm, &status);
+          // num_procs_completed is also equal to the no. of processes
+          // whose data we don't have
+          MPI_Get_count(&status, recv_type, &last_recv_count);
+          curr_count += last_recv_count;
+        }
+        tmp_mask >>= 1;
+        k--;
+      }
+    }
+
+    mask <<= 1;
+    i++;
+  }
+
+  return success;
+}
--- /dev/null
+#include "colls.h"
+
+// The exchange pattern below only works with a power-of-two number of
+// processes; any other size is handed to the default implementation.
+
+/*
+ * Allgather by pairwise exchanges over decreasing distances.
+ *
+ * Every rank first swaps its contribution with the rank whose number is its
+ * own with the bits reversed (base_offset) and stores the chunk it gets at
+ * chunk index base_offset.  Then, for mask = num_procs / 2 down to 1, it
+ * exchanges the contiguous run of chunks gathered so far with rank ^ mask;
+ * the run doubles in length at every step.
+ *
+ * sbuf/send_count/send_type: local contribution.
+ * rbuf/recv_count/recv_type: result buffer, one chunk of recv_count elements
+ * per rank.  Returns MPI_SUCCESS, or the result of MPI_Allgather when the
+ * default implementation is used instead.
+ */
+int
+smpi_coll_tuned_allgather_rhv(void *sbuf, int send_count,
+                              MPI_Datatype send_type, void *rbuf,
+                              int recv_count, MPI_Datatype recv_type,
+                              MPI_Comm comm)
+{
+  MPI_Status status;
+  MPI_Aint s_extent, r_extent;
+
+  // local int variables
+  int i, dst, send_base_offset, recv_base_offset, send_chunk, recv_chunk,
+      send_offset, recv_offset;
+  int rank, num_procs;
+  int tag = 50;
+  int mask;
+  int curr_count;
+
+  // get size of the communicator, followed by rank
+  MPI_Comm_size(comm, &num_procs);
+  MPI_Comm_rank(comm, &rank);
+
+  // get size of single element's type for send buffer and recv buffer
+  MPI_Type_extent(send_type, &s_extent);
+  MPI_Type_extent(recv_type, &r_extent);
+
+  // multiply size of each element by number of elements to send or recv
+  send_chunk = s_extent * send_count;
+  recv_chunk = r_extent * recv_count;
+
+  // irregular chunk sizes, or a process count that is not a power of two
+  // (rank ^ mask and base_offset would then name ranks that do not exist)
+  if (send_chunk != recv_chunk || (num_procs & (num_procs - 1)) != 0) {
+    return MPI_Allgather(sbuf, send_count, send_type, rbuf, recv_count,
+                         recv_type, comm);
+  }
+
+  // compute starting offset location to perform local copy:
+  // base_offset is rank with its bits in reverse order
+  int size = num_procs / 2;
+  int base_offset = 0;
+  mask = 1;
+  while (mask < num_procs) {
+    if (rank & mask) {
+      base_offset += size;
+    }
+    mask <<= 1;
+    size /= 2;
+  }
+
+  //perform a remote copy
+  dst = base_offset;
+  MPI_Sendrecv(sbuf, send_count, send_type, dst, tag,
+               (char *)rbuf + base_offset * recv_chunk, recv_count, recv_type, dst, tag,
+               comm, &status);
+
+  // mask left the loop above equal to num_procs; start at half of it
+  mask >>= 1;
+  i = 1;
+  curr_count = recv_count;
+  while (mask >= 1) {
+    // destination pair for both send and recv
+    dst = rank ^ mask;
+
+    // compute offsets: the partner's run sits right below ours when our
+    // mask bit is set, right above otherwise
+    send_base_offset = base_offset;
+    if (rank & mask) {
+      recv_base_offset = base_offset - i;
+      base_offset -= i;
+    } else {
+      recv_base_offset = base_offset + i;
+    }
+    send_offset = send_base_offset * recv_chunk;
+    recv_offset = recv_base_offset * recv_chunk;
+
+    MPI_Sendrecv((char *)rbuf + send_offset, curr_count, recv_type, dst, tag,
+                 (char *)rbuf + recv_offset, curr_count, recv_type, dst, tag,
+                 comm, &status);
+
+    curr_count *= 2;
+    i *= 2;
+    mask >>= 1;
+  }
+
+  return MPI_SUCCESS;
+}
--- /dev/null
+#include "colls.h"
+
+/*****************************************************************************
+
+Copyright (c) 2006, Ahmad Faraj & Xin Yuan,
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+
+ * Neither the name of the Florida State University nor the names of its
+ contributors may be used to endorse or promote products derived from this
+ software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ *************************************************************************
+ * Any results obtained from executing this software require the *
+ * acknowledgment and citation of the software and its owners. *
+ * The full citation is given below: *
+ * *
+ * A. Faraj and X. Yuan. "Automatic Generation and Tuning of MPI *
+ * Collective Communication Routines." The 19th ACM International *
+ * Conference on Supercomputing (ICS), Cambridge, Massachusetts, *
+ * June 20-22, 2005. *
+ *************************************************************************
+
+*****************************************************************************/
+
+/*****************************************************************************
+ * Function: allgather_ring
+ * return: int
+ * inputs:
+ * send_buff: send input buffer
+ * send_count: number of elements to send
+ * send_type: data type of elements being sent
+ * recv_buff: receive output buffer
+ * recv_count: number of elements to receive
+ * recv_type: data type of elements being received
+ * comm: communication
+ * Descrp: Function works in P - 1 steps. In step i, node j - i -> j -> j+ i.
+ * Author: Ahmad Faraj
+ ****************************************************************************/
+int
+smpi_coll_tuned_allgather_ring(void *send_buff, int send_count,
+                               MPI_Datatype send_type, void *recv_buff,
+                               int recv_count, MPI_Datatype recv_type,
+                               MPI_Comm comm)
+{
+
+  MPI_Aint extent;
+  int i, src, dst, rank, num_procs;
+  int tag = 1;
+  MPI_Status status;
+
+  char *sendptr = (char *) send_buff;
+  char *recvptr = (char *) recv_buff;
+
+  MPI_Comm_rank(comm, &rank);
+  MPI_Comm_size(comm, &num_procs);
+  // chunks are stored in recv_buff, so the stride is given by recv_type
+  MPI_Type_extent(recv_type, &extent);
+
+  // local send/recv
+  MPI_Sendrecv(sendptr, send_count, send_type, rank, tag,
+               recvptr + rank * recv_count * extent,
+               recv_count, recv_type, rank, tag, comm, &status);
+
+  // in step i, send the own contribution to rank + i and store the one
+  // received from rank - i at chunk index src
+  for (i = 1; i < num_procs; i++) {
+    src = (rank - i + num_procs) % num_procs;
+    dst = (rank + i) % num_procs;
+    MPI_Sendrecv(sendptr, send_count, send_type, dst, tag,
+                 recvptr + src * recv_count * extent, recv_count, recv_type,
+                 src, tag, comm, &status);
+  }
+
+  return MPI_SUCCESS;
+}
--- /dev/null
+#include "colls.h"
+#ifndef NUM_CORE
+#define NUM_CORE 8
+#endif
+
+int smpi_coll_tuned_allgather_smp_simple(void *send_buf, int scount,
+                                         MPI_Datatype stype, void *recv_buf,
+                                         int rcount, MPI_Datatype rtype,
+                                         MPI_Comm comm)
+{
+  /*
+   * Allgather for clusters of SMP nodes; ranks are grouped NUM_CORE per node
+   * (node index = rank / NUM_CORE). Three phases:
+   *   1. pairwise allgather among the ranks of one node,
+   *   2. the first rank of every node exchanges its node's blocks with the
+   *      first rank of every other node,
+   *   3. the first rank of every node sends the complete result to the other
+   *      ranks of its node (flat tree).
+   *
+   * Returns MPI_SUCCESS, or 1 if the request array cannot be allocated.
+   * Return codes of the MPI calls are not checked.
+   */
+  int src, dst, comm_size, rank;
+  MPI_Comm_size(comm, &comm_size);
+  MPI_Comm_rank(comm, &rank);
+  MPI_Aint rextent;
+  MPI_Type_extent(rtype, &rextent);
+  int tag = 50;
+  MPI_Status status;
+  int i;
+  int num_core = NUM_CORE;
+  int intra_rank = rank % num_core;
+  int inter_rank = rank / num_core;
+  int inter_comm_size = (comm_size + num_core - 1) / num_core;
+
+  // the last SMP node may have fewer number of running processes than all others
+  int last_smp_cores = comm_size - (inter_comm_size - 1) * num_core;
+  int num_core_in_current_smp =
+      (inter_rank == inter_comm_size - 1) ? last_smp_cores : num_core;
+
+  // Everything stored in recv_buf is laid out with rtype, so every offset into
+  // it is a multiple of one rtype block. MPI_Aint avoids int overflow.
+  MPI_Aint block_size = rextent * rcount;
+  char *recv_ptr = (char *) recv_buf;
+
+  //INTRA-SMP-ALLGATHER
+  MPI_Sendrecv(send_buf, scount, stype, rank, tag,
+               recv_ptr + rank * block_size, rcount, rtype, rank, tag,
+               comm, &status);
+  for (i = 1; i < num_core_in_current_smp; i++) {
+    dst = (inter_rank * num_core) +
+        (intra_rank + i) % num_core_in_current_smp;
+    src = (inter_rank * num_core) +
+        (intra_rank - i + num_core_in_current_smp) % num_core_in_current_smp;
+
+    MPI_Sendrecv(send_buf, scount, stype, dst, tag,
+                 recv_ptr + src * block_size, rcount, rtype, src, tag,
+                 comm, &status);
+  }
+
+  // INTER-SMP-ALLGATHER
+  // The root of each SMP node posts all receives, then all sends, and waits.
+  if (intra_rank == 0 && inter_comm_size > 1) {
+    int num_req = (inter_comm_size - 1) * 2;
+    MPI_Request *reqs = (MPI_Request *) malloc(num_req * sizeof(MPI_Request));
+    MPI_Request *req_ptr = reqs;
+
+    if (!reqs) {
+      // NOTE: peers that are already waiting for this rank are not notified.
+      return 1;
+    }
+
+    for (i = 1; i < inter_comm_size; i++) {
+      int src_node = (inter_rank - i + inter_comm_size) % inter_comm_size;
+      // A partially populated last node contributes fewer blocks; receiving
+      // num_core blocks from it would write past the end of recv_buf.
+      int src_cores =
+          (src_node == inter_comm_size - 1) ? last_smp_cores : num_core;
+
+      src = src_node * num_core;
+      MPI_Irecv(recv_ptr + src * block_size, rcount * src_cores, rtype,
+                src, tag, comm, req_ptr++);
+    }
+    for (i = 1; i < inter_comm_size; i++) {
+      dst = ((inter_rank + i) % inter_comm_size) * num_core;
+      // Send exactly the blocks gathered on this node in phase 1.
+      MPI_Isend(recv_ptr + rank * block_size,
+                rcount * num_core_in_current_smp, rtype,
+                dst, tag, comm, req_ptr++);
+    }
+    MPI_Waitall(num_req, reqs, MPI_STATUSES_IGNORE);
+    free(reqs);
+  }
+
+  //INTRA-BCAST (use flat tree)
+  if (intra_rank == 0) {
+    for (i = 1; i < num_core_in_current_smp; i++) {
+      MPI_Send(recv_buf, (rcount * comm_size), rtype, (rank + i), tag, comm);
+    }
+  } else {
+    MPI_Recv(recv_buf, (rcount * comm_size), rtype, (inter_rank * num_core),
+             tag, comm, &status);
+  }
+
+  return MPI_SUCCESS;
+}
--- /dev/null
+#include "colls.h"
+
+/*****************************************************************************
+
+Copyright (c) 2006, Ahmad Faraj & Xin Yuan,
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+
+ * Neither the name of the Florida State University nor the names of its
+ contributors may be used to endorse or promote products derived from this
+ software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ *************************************************************************
+ * Any results obtained from executing this software require the *
+ * acknowledgment and citation of the software and its owners. *
+ * The full citation is given below: *
+ * *
+ * A. Faraj and X. Yuan. "Automatic Generation and Tuning of MPI *
+ * Collective Communication Routines." The 19th ACM International *
+ * Conference on Supercomputing (ICS), Cambridge, Massachusetts, *
+ * June 20-22, 2005. *
+ *************************************************************************
+
+*****************************************************************************/
+
+/*****************************************************************************
+ * Function: allgather_spreading_simple
+ * return: int
+ * inputs:
+ * send_buff: send input buffer
+ * send_count: number of elements to send
+ * send_type: data type of elements being sent
+ * recv_buff: receive output buffer
+ * recv_count: number of elements to receive
+ * recv_type: data type of elements being received
+ * comm: communicator
+ * Descrp: Let i -> j denote the communication from node i to node j. The
+ * order of communications for node i is i -> (i + 1) % P,
+ * i -> (i + 2) % P, ..., i -> (i + P - 1) % P.
+ *
+ * Author: Ahmad Faraj
+ ****************************************************************************/
+int
+smpi_coll_tuned_allgather_spreading_simple(void *send_buff, int send_count,
+                                           MPI_Datatype send_type,
+                                           void *recv_buff, int recv_count,
+                                           MPI_Datatype recv_type,
+                                           MPI_Comm comm)
+{
+  /*
+   * Allgather that posts one non-blocking receive from, and one non-blocking
+   * send to, every other rank (in the order rank + 1, rank + 2, ... modulo P)
+   * and then waits for all of them.
+   *
+   * Returns MPI_SUCCESS. If the request array cannot be allocated the
+   * function prints a message, calls MPI_Finalize() and exits the process.
+   */
+  MPI_Request *reqs, *req_ptr;
+  MPI_Aint recv_extent, block_size;
+  int i, src, dst, rank, num_procs, num_reqs;
+  int tag = 1;
+  MPI_Status status;
+  char *recv_ptr = (char *) recv_buff;
+
+  MPI_Comm_rank(comm, &rank);
+  MPI_Comm_size(comm, &num_procs);
+
+  // Slots of recv_buff are laid out with recv_type, so displacements must be
+  // computed from its extent (not send_type's); MPI_Aint avoids int overflow.
+  MPI_Type_extent(recv_type, &recv_extent);
+  block_size = recv_count * recv_extent;
+
+  // copy our own contribution into our own slot
+  MPI_Sendrecv(send_buff, send_count, send_type, rank, tag,
+               recv_ptr + rank * block_size, recv_count,
+               recv_type, rank, tag, comm, &status);
+
+  num_reqs = (2 * num_procs) - 2;
+  if (num_reqs == 0) {
+    // Single-process communicator: nothing to exchange. Returning here also
+    // avoids treating a NULL result of malloc(0) as an out-of-memory error.
+    return MPI_SUCCESS;
+  }
+
+  reqs = (MPI_Request *) malloc(num_reqs * sizeof(MPI_Request));
+  if (!reqs) {
+    printf("allgather-spreading-simple.c:40: cannot allocate memory\n");
+    MPI_Finalize();
+    // report the failure to the environment instead of exiting with 0
+    exit(EXIT_FAILURE);
+  }
+
+  req_ptr = reqs;
+
+  for (i = 1; i < num_procs; i++) {
+    src = (rank + i) % num_procs;
+    MPI_Irecv(recv_ptr + src * block_size, recv_count, recv_type,
+              src, tag, comm, req_ptr++);
+  }
+
+  for (i = 1; i < num_procs; i++) {
+    dst = (rank + i) % num_procs;
+    MPI_Isend(send_buff, send_count, send_type, dst, tag, comm, req_ptr++);
+  }
+
+  MPI_Waitall(num_reqs, reqs, MPI_STATUSES_IGNORE);
+  free(reqs);
+
+  return MPI_SUCCESS;
+}
--- /dev/null
+#include "colls.h"
+/* IMPLEMENTED BY PITCH PATARASUK
+ Non-topology-specific all-reduce operation designed to be bandwidth optimal */
+
+/* ** NOTE **
+ Use -DMPICH2_REDUCTION if this code does not compile.
+ The MPICH1 code also works on MPICH2 on our cluster and the performance is similar.
+ This code assumes a commutative and associative reduce operator (MPI_SUM, MPI_MAX, etc).
+*/
+
+//#include <star-reduction.c>
+
+int
+smpi_coll_tuned_allreduce_NTS(void *sbuf, void *rbuf, int rcount,
+                              MPI_Datatype dtype, MPI_Op op, MPI_Comm comm)
+{
+  /*
+   * Allreduce over a logical ring:
+   *   1. copy one chunk of sbuf into rbuf,
+   *   2. ring reduce-scatter (size - 1 steps),
+   *   3. ring all-gather (size - 1 steps).
+   * The buffer is split into `size` chunks of rcount / size elements; the
+   * trailing rcount % size elements, and the case rcount < size, are
+   * delegated to MPI_Allreduce.
+   *
+   * Returns MPI_SUCCESS, or the return value of the delegated MPI_Allreduce.
+   */
+  int tag = 5000;
+  MPI_Status status;
+  int rank, i, size, count, remainder;
+  int left, right;
+  MPI_Aint extent, chunk_bytes;
+  MPI_Aint send_offset, recv_offset, remainder_offset;
+
+  /* Rank and size must be taken from the communicator passed by the caller:
+     with MPI_COMM_WORLD the neighbours and chunk offsets are wrong on any
+     other communicator. */
+  MPI_Comm_rank(comm, &rank);
+  MPI_Comm_size(comm, &size);
+
+  /* make it compatible with all data type */
+  MPI_Type_extent(dtype, &extent);
+
+  /* when communication size is smaller than number of process (not support) */
+  if (rcount < size) {
+    return MPI_Allreduce(sbuf, rbuf, rcount, dtype, op, comm);
+  }
+
+  /* size of each point-to-point communication is equal to the size of the
+     whole message divided by number of processes; byte offsets are kept in
+     MPI_Aint so that large buffers do not overflow int */
+  count = rcount / size;
+  remainder = rcount % size;
+  chunk_bytes = (MPI_Aint) count * extent;
+  remainder_offset = (MPI_Aint) size * chunk_bytes;
+
+  right = (rank + 1) % size;
+  left = (rank + size - 1) % size;
+
+  // copy partial data
+  send_offset = ((rank - 1 + size) % size) * chunk_bytes;
+  recv_offset = send_offset;
+  MPI_Sendrecv((char *) sbuf + send_offset, count, dtype, rank, tag - 1,
+               (char *) rbuf + recv_offset, count, dtype, rank, tag - 1, comm,
+               &status);
+
+  // reduce-scatter
+  for (i = 0; i < (size - 1); i++) {
+    send_offset = ((rank - 1 - i + size) % size) * chunk_bytes;
+    recv_offset = ((rank - 2 - i + size) % size) * chunk_bytes;
+    MPI_Sendrecv((char *) rbuf + send_offset, count, dtype, right,
+                 tag + i, (char *) rbuf + recv_offset, count, dtype,
+                 left, tag + i, comm, &status);
+
+    // compute result to rbuf+recv_offset
+    star_reduction(op, (char *) sbuf + recv_offset,
+                   (char *) rbuf + recv_offset, &count, &dtype);
+  }
+
+  // all-gather
+  for (i = 0; i < (size - 1); i++) {
+    send_offset = ((rank - i + size) % size) * chunk_bytes;
+    recv_offset = ((rank - 1 - i + size) % size) * chunk_bytes;
+    MPI_Sendrecv((char *) rbuf + send_offset, count, dtype, right,
+                 tag + i, (char *) rbuf + recv_offset, count, dtype,
+                 left, tag + i, comm, &status);
+  }
+
+  /* when communication size is not divisible by number of process:
+     call the native implementation for the remain chunk at the end of the operation */
+  if (remainder != 0) {
+    return MPI_Allreduce((char *) sbuf + remainder_offset,
+                         (char *) rbuf + remainder_offset, remainder, dtype,
+                         op, comm);
+  }
+
+  return MPI_SUCCESS;
+}
--- /dev/null
+#include "colls.h"
+
+/* IMPLEMENTED BY PITCH PATARASUK
+ Non-topology-specific all-reduce operation designed to be bandwidth optimal
+ Bug fixing by Xin Yuan, 04/04/2008
+*/
+
+/* ** NOTE **
+ Use -DMPICH2_REDUCTION if this code does not compile.
+ The MPICH1 code also works on MPICH2 on our cluster and the performance is similar.
+ This code assumes a commutative and associative reduce operator (MPI_SUM, MPI_MAX, etc).
+*/
+
+//#include <star-reduction.c>
+
+int
+smpi_coll_tuned_allreduce_lr(void *sbuf, void *rbuf, int rcount,
+                             MPI_Datatype dtype, MPI_Op op, MPI_Comm comm)
+{
+  /*
+   * Allreduce over a logical ring:
+   *   1. copy one chunk of sbuf into rbuf,
+   *   2. ring reduce-scatter (size - 1 steps),
+   *   3. ring all-gather (size - 1 steps).
+   * The buffer is split into `size` chunks of rcount / size elements; the
+   * trailing rcount % size elements, and the case rcount < size, are
+   * delegated to MPI_Allreduce.
+   *
+   * Returns MPI_SUCCESS, or the return value of the delegated MPI_Allreduce.
+   */
+  int tag = 5000;
+  MPI_Status status;
+  int rank, i, size, count, remainder;
+  int left, right;
+  MPI_Aint extent, chunk_bytes;
+  MPI_Aint send_offset, recv_offset, remainder_offset;
+
+  /* Rank and size must be taken from the communicator passed by the caller:
+     with MPI_COMM_WORLD the neighbours and chunk offsets are wrong on any
+     other communicator. */
+  MPI_Comm_rank(comm, &rank);
+  MPI_Comm_size(comm, &size);
+
+  /* make it compatible with all data type */
+  MPI_Type_extent(dtype, &extent);
+
+  /* when communication size is smaller than number of process (not support) */
+  if (rcount < size) {
+    return MPI_Allreduce(sbuf, rbuf, rcount, dtype, op, comm);
+  }
+
+  /* size of each point-to-point communication is equal to the size of the
+     whole message divided by number of processes; byte offsets are kept in
+     MPI_Aint so that large buffers do not overflow int */
+  count = rcount / size;
+  remainder = rcount % size;
+  chunk_bytes = (MPI_Aint) count * extent;
+  remainder_offset = (MPI_Aint) size * chunk_bytes;
+
+  right = (rank + 1) % size;
+  left = (rank + size - 1) % size;
+
+  // copy partial data
+  send_offset = ((rank - 1 + size) % size) * chunk_bytes;
+  recv_offset = send_offset;
+  MPI_Sendrecv((char *) sbuf + send_offset, count, dtype, rank, tag - 1,
+               (char *) rbuf + recv_offset, count, dtype, rank, tag - 1, comm,
+               &status);
+
+  // reduce-scatter
+  for (i = 0; i < (size - 1); i++) {
+    send_offset = ((rank - 1 - i + 2 * size) % size) * chunk_bytes;
+    recv_offset = ((rank - 2 - i + 2 * size) % size) * chunk_bytes;
+    MPI_Sendrecv((char *) rbuf + send_offset, count, dtype, right,
+                 tag + i, (char *) rbuf + recv_offset, count, dtype,
+                 left, tag + i, comm, &status);
+
+    // compute result to rbuf+recv_offset
+    star_reduction(op, (char *) sbuf + recv_offset,
+                   (char *) rbuf + recv_offset, &count, &dtype);
+  }
+
+  // all-gather
+  for (i = 0; i < (size - 1); i++) {
+    send_offset = ((rank - i + 2 * size) % size) * chunk_bytes;
+    recv_offset = ((rank - 1 - i + 2 * size) % size) * chunk_bytes;
+    MPI_Sendrecv((char *) rbuf + send_offset, count, dtype, right,
+                 tag + i, (char *) rbuf + recv_offset, count, dtype,
+                 left, tag + i, comm, &status);
+  }
+
+  /* when communication size is not divisible by number of process:
+     call the native implementation for the remain chunk at the end of the operation */
+  if (remainder != 0) {
+    return MPI_Allreduce((char *) sbuf + remainder_offset,
+                         (char *) rbuf + remainder_offset, remainder, dtype,
+                         op, comm);
+  }
+
+  return MPI_SUCCESS;
+}
--- /dev/null
+#include "colls.h"
+
+/*
+ * Allreduce implemented as a reduce-scatter phase followed by an allgather
+ * phase over the largest power-of-two subset of the ranks in comm. In both
+ * phases a rank exchanges data with the partner whose (renumbered) rank
+ * differs in one bit (newrank ^ mask).
+ *
+ * sbuff: input buffer, copied into rbuff before any communication
+ * rbuff: output buffer, reduced in place
+ * count: number of elements of dtype in each buffer
+ * dtype: element datatype
+ * op:    reduction operator, resolved to a function pointer (uop)
+ * comm:  communicator
+ *
+ * Returns 0 on completion and 1 if the temporary buffer cannot be allocated.
+ *
+ * NOTE(review): MPIR_Op_table, MPIR_ToPointer, struct MPIR_OP and
+ * MPIR_Localcopy are not declared in the visible part of this file;
+ * presumably MPICH internals -- confirm they are available before building.
+ */
+int smpi_coll_tuned_allreduce_rab_rdb(void *sbuff, void *rbuff, int count,
+ MPI_Datatype dtype, MPI_Op op,
+ MPI_Comm comm)
+{
+ // NOTE(review): type_size is written by MPI_Type_size() below but never
+ // read, and lb is never used.
+ int nprocs, rank, type_size, tag = 543;
+ int mask, dst, pof2, newrank, rem, newdst, i,
+ send_idx, recv_idx, last_idx, send_cnt, recv_cnt, *cnts, *disps;
+ MPI_Aint lb, extent;
+ MPI_Status status;
+ void *tmp_buf = NULL;
+
+#ifdef MPICH2_REDUCTION
+ MPI_User_function *uop = MPIR_Op_table[op % 16 - 1];
+#else
+ MPI_User_function *uop;
+ struct MPIR_OP *op_ptr;
+ // NOTE(review): the result is cast to (MPI_User_function *) but stored in a
+ // struct MPIR_OP *; the cast looks wrong -- confirm the intended type.
+ op_ptr = (MPI_User_function *) MPIR_ToPointer(op);
+ uop = op_ptr->op;
+#endif
+
+ MPI_Comm_size(comm, &nprocs);
+ MPI_Comm_rank(comm, &rank);
+
+ MPI_Type_extent(dtype, &extent);
+ // scratch buffer large enough for one full message
+ tmp_buf = (void *) malloc(count * extent);
+ if (!tmp_buf) {
+ printf("Could not allocate memory for tmp_buf\n");
+ return 1;
+ }
+
+ MPIR_Localcopy(sbuff, count, dtype, rbuff, count, dtype);
+
+ MPI_Type_size(dtype, &type_size);
+
+ // find nearest power-of-two less than or equal to comm_size
+ pof2 = 1;
+ while (pof2 <= nprocs)
+ pof2 <<= 1;
+ pof2 >>= 1;
+
+ // number of ranks in excess of the power of two
+ rem = nprocs - pof2;
+
+ // In the non-power-of-two case, all even-numbered
+ // processes of rank < 2*rem send their data to
+ // (rank+1). These even-numbered processes no longer
+ // participate in the algorithm until the very end. The
+ // remaining processes form a nice power-of-two.
+
+ if (rank < 2 * rem) {
+ // even
+ if (rank % 2 == 0) {
+
+ MPI_Send(rbuff, count, dtype, rank + 1, tag, comm);
+
+ // temporarily set the rank to -1 so that this
+ // process does not participate in recursive
+ // doubling
+ newrank = -1;
+ } else // odd
+ {
+ MPI_Recv(tmp_buf, count, dtype, rank - 1, tag, comm, &status);
+ // do the reduction on received data. since the
+ // ordering is right, it doesn't matter whether
+ // the operation is commutative or not.
+ (*uop) (tmp_buf, rbuff, &count, &dtype);
+
+ // change the rank
+ newrank = rank / 2;
+ }
+ }
+
+ else // rank >= 2 * rem
+ newrank = rank - rem;
+
+ // If op is user-defined or count is less than pof2, use
+ // recursive doubling algorithm. Otherwise do a reduce-scatter
+ // followed by allgather. (If op is user-defined,
+ // derived datatypes are allowed and the user could pass basic
+ // datatypes on one process and derived on another as long as
+ // the type maps are the same. Breaking up derived
+ // datatypes to do the reduce-scatter is tricky, therefore
+ // using recursive doubling in that case.)
+ // NOTE(review): no recursive-doubling branch exists in this function; every
+ // participating rank takes the reduce-scatter + allgather path below.
+
+ if (newrank != -1) {
+ // do a reduce-scatter followed by allgather. for the
+ // reduce-scatter, calculate the count that each process receives
+ // and the displacement within the buffer
+
+ // NOTE(review): neither allocation is checked for NULL.
+ cnts = (int *) malloc(pof2 * sizeof(int));
+ disps = (int *) malloc(pof2 * sizeof(int));
+
+ // every chunk gets count / pof2 elements; the last chunk takes what is left
+ for (i = 0; i < (pof2 - 1); i++)
+ cnts[i] = count / pof2;
+ cnts[pof2 - 1] = count - (count / pof2) * (pof2 - 1);
+
+ // disps[i] is the element offset of chunk i (prefix sum of cnts)
+ disps[0] = 0;
+ for (i = 1; i < pof2; i++)
+ disps[i] = disps[i - 1] + cnts[i - 1];
+
+ mask = 0x1;
+ send_idx = recv_idx = 0;
+ last_idx = pof2;
+ while (mask < pof2) {
+ newdst = newrank ^ mask;
+ // find real rank of dest
+ dst = (newdst < rem) ? newdst * 2 + 1 : newdst + rem;
+
+ send_cnt = recv_cnt = 0;
+ if (newrank < newdst) {
+ send_idx = recv_idx + pof2 / (mask * 2);
+ for (i = send_idx; i < last_idx; i++)
+ send_cnt += cnts[i];
+ for (i = recv_idx; i < send_idx; i++)
+ recv_cnt += cnts[i];
+ } else {
+ recv_idx = send_idx + pof2 / (mask * 2);
+ for (i = send_idx; i < recv_idx; i++)
+ send_cnt += cnts[i];
+ for (i = recv_idx; i < last_idx; i++)
+ recv_cnt += cnts[i];
+ }
+
+ // Send data from recvbuf. Recv into tmp_buf
+ MPI_Sendrecv((char *) rbuff + disps[send_idx] * extent, send_cnt,
+ dtype, dst, tag,
+ (char *) tmp_buf + disps[recv_idx] * extent, recv_cnt,
+ dtype, dst, tag, comm, &status);
+
+ // tmp_buf contains data received in this step.
+ // recvbuf contains data accumulated so far
+
+ // This algorithm is used only for predefined ops
+ // and predefined ops are always commutative.
+ (*uop) ((char *) tmp_buf + disps[recv_idx] * extent,
+ (char *) rbuff + disps[recv_idx] * extent, &recv_cnt, &dtype);
+
+ // update send_idx for next iteration
+ send_idx = recv_idx;
+ mask <<= 1;
+
+ // update last_idx, but not in last iteration because the value
+ // is needed in the allgather step below.
+ if (mask < pof2)
+ last_idx = recv_idx + pof2 / mask;
+ }
+
+ // now do the allgather
+
+ // mask left the loop above equal to pof2; start again from pof2 / 2
+ mask >>= 1;
+ while (mask > 0) {
+ newdst = newrank ^ mask;
+ // find real rank of dest
+ dst = (newdst < rem) ? newdst * 2 + 1 : newdst + rem;
+
+ send_cnt = recv_cnt = 0;
+ if (newrank < newdst) {
+ // update last_idx except on first iteration
+ if (mask != pof2 / 2)
+ last_idx = last_idx + pof2 / (mask * 2);
+
+ recv_idx = send_idx + pof2 / (mask * 2);
+ for (i = send_idx; i < recv_idx; i++)
+ send_cnt += cnts[i];
+ for (i = recv_idx; i < last_idx; i++)
+ recv_cnt += cnts[i];
+ } else {
+ recv_idx = send_idx - pof2 / (mask * 2);
+ for (i = send_idx; i < last_idx; i++)
+ send_cnt += cnts[i];
+ for (i = recv_idx; i < send_idx; i++)
+ recv_cnt += cnts[i];
+ }
+
+ // both directions operate on rbuff here: no reduction in this phase
+ MPI_Sendrecv((char *) rbuff + disps[send_idx] * extent, send_cnt,
+ dtype, dst, tag,
+ (char *) rbuff + disps[recv_idx] * extent, recv_cnt,
+ dtype, dst, tag, comm, &status);
+
+ if (newrank > newdst)
+ send_idx = recv_idx;
+
+ mask >>= 1;
+ }
+
+ free(cnts);
+ free(disps);
+
+ }
+ // In the non-power-of-two case, all odd-numbered processes of
+ // rank < 2 * rem send the result to (rank-1), the ranks who didn't
+ // participate above.
+
+ if (rank < 2 * rem) {
+ if (rank % 2) // odd
+ MPI_Send(rbuff, count, dtype, rank - 1, tag, comm);
+ else // even
+ MPI_Recv(rbuff, count, dtype, rank + 1, tag, comm, &status);
+ }
+
+ free(tmp_buf);
+ return 0;
+}
--- /dev/null
+#include "colls.h"
+#ifndef REDUCE_STUFF
+#define REDUCE_STUFF
+/*****************************************************************************
+
+Copyright (c) 2006, Ahmad Faraj & Xin Yuan,
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+
+ * Neither the name of the Florida State University nor the names of its
+ contributors may be used to endorse or promote products derived from this
+ software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ *************************************************************************
+ * Any results obtained from executing this software require the *
+ * acknowledgment and citation of the software and its owners. *
+ * The full citation is given below: *
+ * *
+ * A. Faraj and X. Yuan. "Automatic Generation and Tuning of MPI *
+ * Collective Communication Routines." The 19th ACM International *
+ * Conference on Supercomputing (ICS), Cambridge, Massachusetts, *
+ * June 20-22, 2005. *
+ *************************************************************************
+
+*****************************************************************************/
+
+extern MPI_User_function *MPIR_Op_table[];
+
+
+/* -*- Mode: C; c-basic-offset:4 ; -*- */
+/* $Id: mpich-stuff.h,v 1.1 2005/08/22 19:50:21 faraj Exp $
+ *
+ * (C) 2001 by Argonne National Laboratory.
+ * See COPYRIGHT in top-level directory.
+ */
+#ifndef _MPICH_STUFF_H
+#define _MPICH_STUFF_H
+
+/*TOpaqOverview.tex
+ MPI Opaque Objects:
+
+ MPI Opaque objects such as 'MPI_Comm' or 'MPI_Datatype' are specified by
+ integers (in the MPICH2 implementation); the MPI standard calls these
+ handles.
+ Out of range values are invalid; the value 0 is reserved.
+ For most (with the possible exception of
+ 'MPI_Request' for performance reasons) MPI Opaque objects, the integer
+ encodes both the kind of object (allowing runtime tests to detect a datatype
+ passed where a communicator is expected) and important properties of the
+ object. Even the 'MPI_xxx_NULL' values should be encoded so that
+ different null handles can be distinguished. The details of the encoding
+ of the handles is covered in more detail in the MPICH2 Design Document.
+ For the most part, the ADI uses pointers to the underlying structures
+ rather than the handles themselves. However, each structure contains an
+ 'handle' field that is the corresponding integer handle for the MPI object.
+
+ MPID objects (objects used within the implementation of MPI) are not opaque.
+
+ T*/
+
+/* Known MPI object types. These are used for both the error handlers
+ and for the handles. This is a 4 bit value. 0 is reserved for so
+ that all-zero handles can be flagged as an error. */
+/*E
+ MPID_Object_kind - Object kind (communicator, window, or file)
+
+ Notes:
+ This enum is used by keyvals and errhandlers to indicate the type of
+ object for which MPI opaque types the data is valid. These are defined
+ as bits to allow future expansion to the case where an object is value for
+ multiple types (for example, we may want a universal error handler for
+ errors return). This is also used to indicate the type of MPI object a
+ MPI handle represents. It is an enum because only this applies only the
+ the MPI objects.
+
+ Module:
+ Attribute-DS
+ E*/
+/* Numeric tag for each kind of MPI object; 0 is deliberately not assigned
+ and every value listed here fits in 4 bits. */
+typedef enum MPID_Object_kind {
+ MPID_COMM = 0x1,
+ MPID_GROUP = 0x2,
+ MPID_DATATYPE = 0x3,
+ MPID_FILE = 0x4,
+ MPID_ERRHANDLER = 0x5,
+ MPID_OP = 0x6,
+ MPID_INFO = 0x7,
+ MPID_WIN = 0x8,
+ MPID_KEYVAL = 0x9,
+ MPID_ATTR = 0xa,
+ MPID_REQUEST = 0xb
+} MPID_Object_kind;
+/* The above objects should correspond to MPI objects only. */
+/* The object kind occupies bits 26..29 of a handle (mask 0x3c000000). */
+#define HANDLE_MPI_KIND_SHIFT 26
+#define HANDLE_GET_MPI_KIND(a) ( ((a)&0x3c000000) >> HANDLE_MPI_KIND_SHIFT )
+
+/* Handle types. These are really 2 bits */
+#define HANDLE_KIND_INVALID 0x0
+#define HANDLE_KIND_BUILTIN 0x1
+#define HANDLE_KIND_DIRECT 0x2
+#define HANDLE_KIND_INDIRECT 0x3
+/* Mask assumes that ints are at least 4 bytes */
+/* The handle type occupies the top two bits (30..31) of a handle. */
+#define HANDLE_KIND_MASK 0xc0000000
+#define HANDLE_KIND_SHIFT 30
+#define HANDLE_GET_KIND(a) (((a)&HANDLE_KIND_MASK)>>HANDLE_KIND_SHIFT)
+#define HANDLE_SET_KIND(a,kind) ((a)|((kind)<<HANDLE_KIND_SHIFT))
+
+/* For indirect, the remainder of the handle has a block and index */
+/* Block number: bits 16..25; index within the block: bits 0..15. */
+#define HANDLE_INDIRECT_SHIFT 16
+#define HANDLE_BLOCK(a) (((a)& 0x03FF0000) >> HANDLE_INDIRECT_SHIFT)
+#define HANDLE_BLOCK_INDEX(a) ((a) & 0x0000FFFF)
+
+/* Handle block is between 1 and 1024 *elements* */
+#define HANDLE_BLOCK_SIZE 256
+/* Index size is between 1 and 65536 *elements* */
+#define HANDLE_BLOCK_INDEX_SIZE 1024
+
+/* For direct, the remainder of the handle is the index into a predefined
+ block */
+/* HANDLE_MASK keeps bits 0..25, i.e. everything below the object kind. */
+#define HANDLE_MASK 0x03FFFFFF
+#define HANDLE_INDEX(a) ((a)& HANDLE_MASK)
+
+/* ALL objects have the handle as the first value. */
+/* Inactive (unused and stored on the appropriate avail list) objects
+ have MPIU_Handle_common as the head */
+/* Header of an object sitting on an avail list: same first two fields as
+ MPIU_Handle_head, followed by the link to the next free object. */
+typedef struct MPIU_Handle_common {
+ int handle;
+ volatile int ref_count; /* This field is used to indicate that the
+ object is not in use (see, e.g.,
+ MPID_Comm_valid_ptr) */
+ void *next; /* Free handles use this field to point to the next
+ free object */
+} MPIU_Handle_common;
+
+/* All *active* (in use) objects have the handle as the first value; objects
+ with reference counts have the reference count as the second value.
+ See MPIU_Object_add_ref and MPIU_Object_release_ref. */
+/* Common prefix of an in-use object: the integer handle, then the
+ reference count. */
+typedef struct MPIU_Handle_head {
+ int handle;
+ volatile int ref_count;
+} MPIU_Handle_head;
+
+/* This type contains all of the data, except for the direct array,
+ used by the object allocators. */
+/* Bookkeeping for one object allocator: the free list, the table of indirect
+ blocks, and the location and size of the direct block. */
+typedef struct MPIU_Object_alloc_t {
+ MPIU_Handle_common *avail; /* Next available object */
+ int initialized; /* */
+ void *(*indirect)[]; /* Pointer to indirect object blocks */
+ int indirect_size; /* Number of allocated indirect blocks */
+ MPID_Object_kind kind; /* Kind of object this is for */
+ int size; /* Size of an individual object */
+ void *direct; /* Pointer to direct block, used
+ for allocation */
+ int direct_size; /* Size of direct block */
+} MPIU_Object_alloc_t;
+/* Allocator entry points; they are only declared here, their definitions are
+ not part of this file. */
+extern void *MPIU_Handle_obj_alloc(MPIU_Object_alloc_t *);
+extern void MPIU_Handle_obj_alloc_start(MPIU_Object_alloc_t *);
+extern void MPIU_Handle_obj_alloc_complete(MPIU_Object_alloc_t *, int init);
+extern void MPIU_Handle_obj_free(MPIU_Object_alloc_t *, void *);
+void *MPIU_Handle_get_ptr_indirect(int, MPIU_Object_alloc_t *);
+extern void *MPIU_Handle_direct_init(void *direct, int direct_size,
+ int obj_size, int handle_type);
+#endif
+/* MPID_Getb_ptr(kind, a, bmsk, ptr): set ptr to the MPID_<kind> object that
+ handle a refers to, selected by HANDLE_GET_KIND(a):
+ builtin  -> MPID_<kind>_builtin + (a & bmsk)
+ direct   -> MPID_<kind>_direct + HANDLE_INDEX(a)
+ indirect -> MPIU_Handle_get_ptr_indirect(a, &MPID_<kind>_mem)
+ invalid or anything else -> 0
+ The macro expands to a bare brace block (not do { } while (0)) and mentions
+ a more than once, so a should be free of side effects. */
+#define MPID_Getb_ptr(kind,a,bmsk,ptr) \
+{ \
+ switch (HANDLE_GET_KIND(a)) { \
+ case HANDLE_KIND_BUILTIN: \
+ ptr=MPID_##kind##_builtin+((a)&(bmsk)); \
+ break; \
+ case HANDLE_KIND_DIRECT: \
+ ptr=MPID_##kind##_direct+HANDLE_INDEX(a); \
+ break; \
+ case HANDLE_KIND_INDIRECT: \
+ ptr=((MPID_##kind*) \
+ MPIU_Handle_get_ptr_indirect(a,&MPID_##kind##_mem)); \
+ break; \
+ case HANDLE_KIND_INVALID: \
+ default: \
+ ptr=0; \
+ break; \
+ } \
+}
+
+
+
+/* Op flavour of MPID_Getb_ptr: builtin ops are indexed by the low 8 bits of
+ the handle. */
+#define MPID_Op_get_ptr(a,ptr) MPID_Getb_ptr(Op,a,0x000000ff,ptr)
+/* Language tag stored in MPID_Op. Only MPID_LANG_C is always present; the
+ other enumerators exist only when the corresponding HAVE_*_BINDING macro
+ is defined. */
+typedef enum MPID_Lang_t { MPID_LANG_C
+#ifdef HAVE_FORTRAN_BINDING
+ , MPID_LANG_FORTRAN, MPID_LANG_FORTRAN90
+#endif
+#ifdef HAVE_CXX_BINDING
+ , MPID_LANG_CXX
+#endif
+} MPID_Lang_t;
+/* Reduction and accumulate operations */
+/*E
+ MPID_Op_kind - Enumerates types of MPI_Op types
+
+ Notes:
+ These are needed for implementing 'MPI_Accumulate', since only predefined
+ operations are allowed for that operation.
+
+ A gap in the enum values was made allow additional predefined operations
+ to be inserted. This might include future additions to MPI or experimental
+ extensions (such as a Read-Modify-Write operation).
+
+ Module:
+ Collective-DS
+ E*/
+/* Predefined operations are numbered 1..13; values 14..31 are unassigned and
+ the two user-defined kinds are 32 and 33. */
+typedef enum MPID_Op_kind { MPID_OP_MAX = 1, MPID_OP_MIN = 2,
+ MPID_OP_SUM = 3, MPID_OP_PROD = 4,
+ MPID_OP_LAND = 5, MPID_OP_BAND = 6, MPID_OP_LOR = 7, MPID_OP_BOR = 8,
+ MPID_OP_LXOR = 9, MPID_OP_BXOR = 10, MPID_OP_MAXLOC = 11,
+ MPID_OP_MINLOC = 12, MPID_OP_REPLACE = 13,
+ MPID_OP_USER_NONCOMMUTE = 32, MPID_OP_USER = 33
+} MPID_Op_kind;
+
+/*S
+ MPID_User_function - Definition of a user function for MPI_Op types.
+
+ Notes:
+ This includes a 'const' to make clear which is the 'in' argument and
+ which the 'inout' argument, and to indicate that the 'count' and 'datatype'
+ arguments are unchanged (they are addresses in an attempt to allow
+ interoperation with Fortran). It includes 'restrict' to emphasize that
+ no overlapping operations are allowed.
+
+ We need to include a Fortran version, since those arguments will
+ have type 'MPI_Fint *' instead. We also need to add a test to the
+ test suite for this case; in fact, we need tests for each of the handle
+ types to ensure that the transfered handle works correctly.
+
+ This is part of the collective module because user-defined operations
+ are valid only for the collective computation routines and not for
+ RMA accumulate.
+
+ Yes, the 'restrict' is in the correct location. C compilers that
+ support 'restrict' should be able to generate code that is as good as a
+ Fortran compiler would for these functions.
+
+ We should note on the manual pages for user-defined operations that
+ 'restrict' should be used when available, and that a cast may be
+ required when passing such a function to 'MPI_Op_create'.
+
+ Question:
+ Should each of these function types have an associated typedef?
+
+ Should there be a C++ function here?
+
+ Module:
+ Collective-DS
+ S*/
+/* One reduction callback, stored either with the C signature (int count,
+ MPI_Datatype) or the Fortran 77 signature (MPI_Fint count and datatype).
+ NOTE(review): presumably MPID_Op.language tells which member is valid --
+ confirm against the code that invokes the callback. */
+typedef union MPID_User_function {
+ void (*c_function) (const void *, void *, const int *, const MPI_Datatype *);
+ void (*f77_function) (const void *, void *,
+ const MPI_Fint *, const MPI_Fint *);
+} MPID_User_function;
+/* FIXME: Should there be "restrict" in the definitions above, e.g.,
+ (*c_function)( const void restrict * , void restrict *, ... )? */
+
+/*S
+ MPID_Op - MPI_Op structure
+
+ Notes:
+ All of the predefined functions are commutative. Only user functions may
+ be noncummutative, so there are two separate op types for commutative and
+ non-commutative user-defined operations.
+
+ Operations do not require reference counts because there are no nonblocking
+ operations that accept user-defined operations. Thus, there is no way that
+ a valid program can free an 'MPI_Op' while it is in use.
+
+ Module:
+ Collective-DS
+ S*/
+/* Descriptor of one MPI_Op: handle and reference count first (the same
+ leading layout as MPIU_Handle_head), then the kind, language and callback. */
+typedef struct MPID_Op {
+ int handle; /* value of MPI_Op for this structure */
+ volatile int ref_count;
+ MPID_Op_kind kind;
+ MPID_Lang_t language;
+ MPID_User_function function;
+} MPID_Op;
+/* NOTE(review): 14 entries while the predefined kinds are numbered 1..13;
+ presumably entry 0 is unused -- confirm. */
+#define MPID_OP_N_BUILTIN 14
+/* Storage referenced by MPID_Op_get_ptr; defined outside this file. */
+extern MPID_Op MPID_Op_builtin[MPID_OP_N_BUILTIN];
+extern MPID_Op MPID_Op_direct[];
+extern MPIU_Object_alloc_t MPID_Op_mem;
+
+/*****************************************************************************
+
+* Function: get_op_func
+
+* return: Pointer to MPI_User_function
+
+* inputs:
+ op: operator (max, min, etc)
+
+ * Description: Function returns the function associated with current operator
+ * op.
+
+ * Author: AHMAD FARAJ
+
+****************************************************************************/
+/* Look up the reduction routine of a built-in operator.
+ * Returns the MPIR_Op_table entry at index (op % 16 - 1) when the handle
+ * kind of op is HANDLE_KIND_BUILTIN, and NULL for every other handle kind. */
+MPI_User_function *get_op_func(MPI_Op op)
+{
+  const int is_builtin = (HANDLE_GET_KIND(op) == HANDLE_KIND_BUILTIN);
+
+  return is_builtin ? MPIR_Op_table[op % 16 - 1] : NULL;
+}
+
+#endif
+
+
+/*
+ * Allreduce for built-in operators: a reduce-scatter followed by an
+ * allgather among the largest power-of-two group of ranks. Ranks in excess
+ * of that power of two first fold their data into a neighbour and receive
+ * the final result from it at the end.
+ *
+ * sbuff  local contribution (count elements of dtype)
+ * rbuff  receives the reduced vector (count elements of dtype)
+ * op     must be a built-in operator: get_op_func() returns NULL otherwise
+ *
+ * Returns 0 on success, 1 when op has no built-in reduction function or
+ * when a work buffer cannot be allocated.
+ */
+int smpi_coll_tuned_allreduce_rab_reduce_scatter(void *sbuff, void *rbuff,
+                                                 int count, MPI_Datatype dtype,
+                                                 MPI_Op op, MPI_Comm comm)
+{
+  int nprocs, rank, type_size, tag = 543;
+  int mask, dst, pof2, newrank, rem, newdst, i,
+      send_idx, recv_idx, last_idx, send_cnt, recv_cnt, *cnts, *disps;
+  MPI_Aint extent;
+  MPI_Status status;
+  void *tmp_buf = NULL;
+  MPI_User_function *func = get_op_func(op);
+
+  // get_op_func() returns NULL for anything but a built-in operator;
+  // calling through it would crash, so refuse before communicating.
+  if (!func) {
+    printf("Reduction operator is not a built-in operation\n");
+    return 1;
+  }
+
+  MPI_Comm_size(comm, &nprocs);
+  MPI_Comm_rank(comm, &rank);
+
+  MPI_Type_extent(dtype, &extent);
+  tmp_buf = (void *) malloc(count * extent);
+  if (!tmp_buf) {
+    printf("Could not allocate memory for tmp_buf\n");
+    return 1;
+  }
+
+  // start from the local contribution
+  MPIR_Localcopy(sbuff, count, dtype, rbuff, count, dtype);
+
+  MPI_Type_size(dtype, &type_size);
+
+  // find nearest power-of-two less than or equal to comm_size
+  pof2 = 1;
+  while (pof2 <= nprocs)
+    pof2 <<= 1;
+  pof2 >>= 1;
+
+  rem = nprocs - pof2;
+
+  // In the non-power-of-two case, all even-numbered
+  // processes of rank < 2*rem send their data to
+  // (rank+1). These even-numbered processes no longer
+  // participate in the algorithm until the very end. The
+  // remaining processes form a nice power-of-two.
+
+  if (rank < 2 * rem) {
+    if (rank % 2 == 0) {
+      // even
+      MPIC_Send(rbuff, count, dtype, rank + 1, tag, comm);
+
+      // temporarily set the rank to -1 so that this
+      // process does not participate in the exchange below
+      newrank = -1;
+    } else {
+      // odd
+      MPIC_Recv(tmp_buf, count, dtype, rank - 1, tag, comm, &status);
+      // do the reduction on received data. since the
+      // ordering is right, it doesn't matter whether
+      // the operation is commutative or not.
+      (*func) (tmp_buf, rbuff, &count, &dtype);
+
+      // change the rank
+      newrank = rank / 2;
+    }
+  } else {
+    // rank >= 2 * rem
+    newrank = rank - rem;
+  }
+
+  if (newrank != -1) {
+    // do a reduce-scatter followed by allgather. for the
+    // reduce-scatter, calculate the count that each process receives
+    // and the displacement within the buffer
+
+    cnts = (int *) malloc(pof2 * sizeof(int));
+    disps = (int *) malloc(pof2 * sizeof(int));
+    if (!cnts || !disps) {
+      printf("Could not allocate memory for cnts/disps\n");
+      free(cnts);
+      free(disps);
+      free(tmp_buf);
+      return 1;
+    }
+
+    // every block gets count / pof2 elements, the last one also takes
+    // the remainder
+    for (i = 0; i < (pof2 - 1); i++)
+      cnts[i] = count / pof2;
+    cnts[pof2 - 1] = count - (count / pof2) * (pof2 - 1);
+
+    disps[0] = 0;
+    for (i = 1; i < pof2; i++)
+      disps[i] = disps[i - 1] + cnts[i - 1];
+
+    mask = 0x1;
+    send_idx = recv_idx = 0;
+    last_idx = pof2;
+    while (mask < pof2) {
+      newdst = newrank ^ mask;
+      // find real rank of dest
+      dst = (newdst < rem) ? newdst * 2 + 1 : newdst + rem;
+
+      send_cnt = recv_cnt = 0;
+      if (newrank < newdst) {
+        send_idx = recv_idx + pof2 / (mask * 2);
+        for (i = send_idx; i < last_idx; i++)
+          send_cnt += cnts[i];
+        for (i = recv_idx; i < send_idx; i++)
+          recv_cnt += cnts[i];
+      } else {
+        recv_idx = send_idx + pof2 / (mask * 2);
+        for (i = send_idx; i < recv_idx; i++)
+          send_cnt += cnts[i];
+        for (i = recv_idx; i < last_idx; i++)
+          recv_cnt += cnts[i];
+      }
+
+      // Send data from recvbuf. Recv into tmp_buf
+      MPIC_Sendrecv((char *) rbuff + disps[send_idx] * extent, send_cnt,
+                    dtype, dst, tag,
+                    (char *) tmp_buf + disps[recv_idx] * extent, recv_cnt,
+                    dtype, dst, tag, comm, &status);
+
+      // tmp_buf contains data received in this step.
+      // recvbuf contains data accumulated so far
+
+      // This algorithm is used only for predefined ops
+      // and predefined ops are always commutative.
+      (*func) ((char *) tmp_buf + disps[recv_idx] * extent,
+               (char *) rbuff + disps[recv_idx] * extent, &recv_cnt, &dtype);
+
+      // update send_idx for next iteration
+      send_idx = recv_idx;
+      mask <<= 1;
+
+      // update last_idx, but not in last iteration because the value
+      // is needed in the allgather step below.
+      if (mask < pof2)
+        last_idx = recv_idx + pof2 / mask;
+    }
+
+    // now do the allgather
+
+    mask >>= 1;
+    while (mask > 0) {
+      newdst = newrank ^ mask;
+      // find real rank of dest
+      dst = (newdst < rem) ? newdst * 2 + 1 : newdst + rem;
+
+      send_cnt = recv_cnt = 0;
+      if (newrank < newdst) {
+        // update last_idx except on first iteration
+        if (mask != pof2 / 2)
+          last_idx = last_idx + pof2 / (mask * 2);
+
+        recv_idx = send_idx + pof2 / (mask * 2);
+        for (i = send_idx; i < recv_idx; i++)
+          send_cnt += cnts[i];
+        for (i = recv_idx; i < last_idx; i++)
+          recv_cnt += cnts[i];
+      } else {
+        recv_idx = send_idx - pof2 / (mask * 2);
+        for (i = send_idx; i < last_idx; i++)
+          send_cnt += cnts[i];
+        for (i = recv_idx; i < send_idx; i++)
+          recv_cnt += cnts[i];
+      }
+
+      MPIC_Sendrecv((char *) rbuff + disps[send_idx] * extent, send_cnt,
+                    dtype, dst, tag,
+                    (char *) rbuff + disps[recv_idx] * extent, recv_cnt,
+                    dtype, dst, tag, comm, &status);
+
+      if (newrank > newdst)
+        send_idx = recv_idx;
+
+      mask >>= 1;
+    }
+
+    free(cnts);
+    free(disps);
+  }
+
+  // In the non-power-of-two case, all odd-numbered processes of
+  // rank < 2 * rem send the result to (rank-1), the ranks who didn't
+  // participate above.
+
+  if (rank < 2 * rem) {
+    if (rank % 2)               // odd
+      MPIC_Send(rbuff, count, dtype, rank - 1, tag, comm);
+    else                        // even
+      MPIC_Recv(rbuff, count, dtype, rank + 1, tag, comm, &status);
+  }
+
+  free(tmp_buf);
+  return 0;
+}
--- /dev/null
+#include "colls.h"
+//#include <star-reduction.c>
+
+/*
+ * Allreduce implemented as a reduce-scatter followed by an allgather among
+ * the largest power-of-two group of ranks. Ranks in excess of that power of
+ * two first hand their data to a neighbour and get the result back at the
+ * end.
+ *
+ * sbuff: local contribution, rbuff: receives the result (count elements of
+ * dtype each). The reduction itself is delegated to star_reduction().
+ * Returns 0 on success, 1 if the temporary buffer cannot be allocated.
+ */
+int smpi_coll_tuned_allreduce_rab_rsag(void *sbuff, void *rbuff, int count,
+ MPI_Datatype dtype, MPI_Op op,
+ MPI_Comm comm)
+{
+ int nprocs, rank, type_size, tag = 543;
+ int mask, dst, pof2, newrank, rem, newdst, i,
+ send_idx, recv_idx, last_idx, send_cnt, recv_cnt, *cnts, *disps;
+ MPI_Aint extent;
+ MPI_Status status;
+ void *tmp_buf = NULL;
+ MPI_Comm_size(comm, &nprocs);
+ MPI_Comm_rank(comm, &rank);
+
+ MPI_Type_extent(dtype, &extent);
+ // scratch space large enough for one full vector
+ tmp_buf = (void *) malloc(count * extent);
+ if (!tmp_buf) {
+ printf("Could not allocate memory for tmp_buf\n");
+ return 1;
+ }
+
+ // copy sbuff into rbuff by sending to, and receiving from, ourselves
+ MPI_Sendrecv(sbuff, count, dtype, rank, tag, rbuff, count, dtype, rank, tag,
+ comm, &status);
+
+ // type_size is queried here but not read anywhere below
+ MPI_Type_size(dtype, &type_size);
+
+ // find nearest power-of-two less than or equal to comm_size
+ pof2 = 1;
+ while (pof2 <= nprocs)
+ pof2 <<= 1;
+ pof2 >>= 1;
+
+ // number of ranks in excess of the power of two
+ rem = nprocs - pof2;
+
+ // In the non-power-of-two case, all even-numbered
+ // processes of rank < 2*rem send their data to
+ // (rank+1). These even-numbered processes no longer
+ // participate in the algorithm until the very end. The
+ // remaining processes form a nice power-of-two.
+
+ if (rank < 2 * rem) {
+ // even
+ if (rank % 2 == 0) {
+
+ MPI_Send(rbuff, count, dtype, rank + 1, tag, comm);
+
+ // temporarily set the rank to -1 so that this
+ // process does not pariticipate in recursive
+ // doubling
+ newrank = -1;
+ } else // odd
+ {
+ MPI_Recv(tmp_buf, count, dtype, rank - 1, tag, comm, &status);
+ // do the reduction on received data. since the
+ // ordering is right, it doesn't matter whether
+ // the operation is commutative or not.
+ star_reduction(op, tmp_buf, rbuff, &count, &dtype);
+
+ // change the rank
+ newrank = rank / 2;
+ }
+ }
+
+ else // rank >= 2 * rem
+ newrank = rank - rem;
+
+ // If op is user-defined or count is less than pof2, use
+ // recursive doubling algorithm. Otherwise do a reduce-scatter
+ // followed by allgather. (If op is user-defined,
+ // derived datatypes are allowed and the user could pass basic
+ // datatypes on one process and derived on another as long as
+ // the type maps are the same. Breaking up derived
+ // datatypes to do the reduce-scatter is tricky, therefore
+ // using recursive doubling in that case.)
+ //
+ // NOTE(review): no such choice is made in this function; every
+ // participating rank takes the reduce-scatter + allgather path below,
+ // whatever op and count are.
+
+ if (newrank != -1) {
+ // do a reduce-scatter followed by allgather. for the
+ // reduce-scatter, calculate the count that each process receives
+ // and the displacement within the buffer
+
+ // NOTE(review): neither allocation is checked for failure
+ cnts = (int *) malloc(pof2 * sizeof(int));
+ disps = (int *) malloc(pof2 * sizeof(int));
+
+ // every block holds count / pof2 elements; the last block also takes
+ // the remainder
+ for (i = 0; i < (pof2 - 1); i++)
+ cnts[i] = count / pof2;
+ cnts[pof2 - 1] = count - (count / pof2) * (pof2 - 1);
+
+ // disps[i] = element offset of block i (prefix sum of cnts)
+ disps[0] = 0;
+ for (i = 1; i < pof2; i++)
+ disps[i] = disps[i - 1] + cnts[i - 1];
+
+ // reduce-scatter: partner distance doubles each step (mask = 1, 2, ...)
+ // while the range of blocks exchanged shrinks
+ mask = 0x1;
+ send_idx = recv_idx = 0;
+ last_idx = pof2;
+ while (mask < pof2) {
+ newdst = newrank ^ mask;
+ // find real rank of dest
+ dst = (newdst < rem) ? newdst * 2 + 1 : newdst + rem;
+
+ send_cnt = recv_cnt = 0;
+ if (newrank < newdst) {
+ // lower partner keeps blocks [recv_idx, send_idx) and sends the rest
+ send_idx = recv_idx + pof2 / (mask * 2);
+ for (i = send_idx; i < last_idx; i++)
+ send_cnt += cnts[i];
+ for (i = recv_idx; i < send_idx; i++)
+ recv_cnt += cnts[i];
+ } else {
+ // upper partner keeps blocks [recv_idx, last_idx) and sends the rest
+ recv_idx = send_idx + pof2 / (mask * 2);
+ for (i = send_idx; i < recv_idx; i++)
+ send_cnt += cnts[i];
+ for (i = recv_idx; i < last_idx; i++)
+ recv_cnt += cnts[i];
+ }
+
+ // Send data from recvbuf. Recv into tmp_buf
+ MPI_Sendrecv((char *) rbuff + disps[send_idx] * extent, send_cnt,
+ dtype, dst, tag,
+ (char *) tmp_buf + disps[recv_idx] * extent, recv_cnt,
+ dtype, dst, tag, comm, &status);
+
+ // tmp_buf contains data received in this step.
+ // recvbuf contains data accumulated so far
+
+ // This algorithm is used only for predefined ops
+ // and predefined ops are always commutative.
+ star_reduction(op, (char *) tmp_buf + disps[recv_idx] * extent,
+ (char *) rbuff + disps[recv_idx] * extent,
+ &recv_cnt, &dtype);
+
+ // update send_idx for next iteration
+ send_idx = recv_idx;
+ mask <<= 1;
+
+ // update last_idx, but not in last iteration because the value
+ // is needed in the allgather step below.
+ if (mask < pof2)
+ last_idx = recv_idx + pof2 / mask;
+ }
+
+ // now do the allgather
+
+ // mask left the loop above equal to pof2; start again from pof2 / 2
+ // and walk the partners in reverse order
+ mask >>= 1;
+ while (mask > 0) {
+ newdst = newrank ^ mask;
+ // find real rank of dest
+ dst = (newdst < rem) ? newdst * 2 + 1 : newdst + rem;
+
+ send_cnt = recv_cnt = 0;
+ if (newrank < newdst) {
+ // update last_idx except on first iteration
+ if (mask != pof2 / 2)
+ last_idx = last_idx + pof2 / (mask * 2);
+
+ recv_idx = send_idx + pof2 / (mask * 2);
+ for (i = send_idx; i < recv_idx; i++)
+ send_cnt += cnts[i];
+ for (i = recv_idx; i < last_idx; i++)
+ recv_cnt += cnts[i];
+ } else {
+ recv_idx = send_idx - pof2 / (mask * 2);
+ for (i = send_idx; i < last_idx; i++)
+ send_cnt += cnts[i];
+ for (i = recv_idx; i < send_idx; i++)
+ recv_cnt += cnts[i];
+ }
+
+ // both the outgoing and the incoming blocks live in rbuff here
+ MPI_Sendrecv((char *) rbuff + disps[send_idx] * extent, send_cnt,
+ dtype, dst, tag,
+ (char *) rbuff + disps[recv_idx] * extent, recv_cnt,
+ dtype, dst, tag, comm, &status);
+
+ if (newrank > newdst)
+ send_idx = recv_idx;
+
+ mask >>= 1;
+ }
+
+ free(cnts);
+ free(disps);
+
+ }
+ // In the non-power-of-two case, all odd-numbered processes of
+ // rank < 2 * rem send the result to (rank-1), the ranks who didn't
+ // participate above.
+
+ if (rank < 2 * rem) {
+ if (rank % 2) // odd
+ MPI_Send(rbuff, count, dtype, rank - 1, tag, comm);
+ else // even
+ MPI_Recv(rbuff, count, dtype, rank + 1, tag, comm, &status);
+ }
+
+ free(tmp_buf);
+ return 0;
+}
--- /dev/null
+#include "colls.h"
+//#include <star-reduction.c>
+
+/*
+ * Allreduce: pairwise-exchange reduce-scatter (the exchanged range is
+ * halved at every step) followed by MPI_Allgather.
+ *
+ * NP pow of 2 for now: partners are computed as rank ^ mask, so the
+ * communicator size must be a power of two.
+ *
+ * When count is not a multiple of the communicator size the data is padded
+ * to the next multiple in a zero-filled scratch vector and only the first
+ * count elements of the result are copied to rbuff.
+ *
+ * Always returns 0.
+ */
+int smpi_coll_tuned_allreduce_rab1(void *sbuff, void *rbuff,
+                                   int count, MPI_Datatype dtype,
+                                   MPI_Op op, MPI_Comm comm)
+{
+  MPI_Status status;
+  MPI_Aint extent;
+  int tag = 4321, rank, nprocs, send_size, newcnt, share;
+  int pof2 = 1, mask, send_idx, recv_idx, dst, send_cnt, recv_cnt;
+
+  void *recv, *tmp_buf;
+
+  MPI_Comm_rank(comm, &rank);
+  MPI_Comm_size(comm, &nprocs);
+
+  MPI_Type_extent(dtype, &extent);
+
+  // largest power of two not exceeding nprocs
+  pof2 = 1;
+  while (pof2 <= nprocs)
+    pof2 <<= 1;
+  pof2 >>= 1;
+
+  mask = 1;
+  send_idx = recv_idx = 0;
+
+  // uneven count
+  if ((count % nprocs)) {
+    send_size = (count + nprocs) / nprocs;
+    newcnt = send_size * nprocs;
+
+    // zero-filled so that the padding beyond count is never read
+    // uninitialised by the reduction
+    recv = calloc(newcnt, extent);
+    tmp_buf = (void *) malloc(extent * newcnt);
+    memcpy(recv, sbuff, extent * count);
+
+    mask = pof2 / 2;
+    share = newcnt / pof2;
+    // the loop does not run for a single rank: the whole vector is ours
+    recv_cnt = newcnt;
+    while (mask > 0) {
+      dst = rank ^ mask;
+      send_cnt = recv_cnt = newcnt / (pof2 / mask);
+
+      if (rank < dst)
+        send_idx = recv_idx + (mask * share);
+      else
+        recv_idx = send_idx + (mask * share);
+
+      MPI_Sendrecv((char *) recv + send_idx * extent, send_cnt, dtype, dst, tag,
+                   tmp_buf, recv_cnt, dtype, dst, tag, comm, &status);
+
+      star_reduction(op, tmp_buf, (char *) recv + recv_idx * extent, &recv_cnt,
+                     &dtype);
+
+      // update send_idx for next iteration
+      send_idx = recv_idx;
+      mask >>= 1;
+    }
+
+    // our fully reduced share goes through tmp_buf so that the allgather
+    // does not read from and write to the same buffer
+    memcpy(tmp_buf, (char *) recv + recv_idx * extent, recv_cnt * extent);
+    MPI_Allgather(tmp_buf, recv_cnt, dtype, recv, recv_cnt, dtype, comm);
+
+    memcpy(rbuff, recv, count * extent);
+    free(recv);
+    free(tmp_buf);
+  } else {
+    tmp_buf = (void *) malloc(extent * count);
+    memcpy(rbuff, sbuff, count * extent);
+    mask = pof2 / 2;
+    share = count / pof2;
+    // with a single rank the loop below never runs; without this
+    // initialisation recv_cnt would be read uninitialised afterwards
+    recv_cnt = count;
+    while (mask > 0) {
+      dst = rank ^ mask;
+      send_cnt = recv_cnt = count / (pof2 / mask);
+
+      if (rank < dst)
+        send_idx = recv_idx + (mask * share);
+      else
+        recv_idx = send_idx + (mask * share);
+
+      MPI_Sendrecv((char *) rbuff + send_idx * extent, send_cnt, dtype, dst,
+                   tag, tmp_buf, recv_cnt, dtype, dst, tag, comm, &status);
+
+      star_reduction(op, tmp_buf, (char *) rbuff + recv_idx * extent, &recv_cnt,
+                     &dtype);
+
+      // update send_idx for next iteration
+      send_idx = recv_idx;
+      mask >>= 1;
+    }
+
+    memcpy(tmp_buf, (char *) rbuff + recv_idx * extent, recv_cnt * extent);
+    MPI_Allgather(tmp_buf, recv_cnt, dtype, rbuff, recv_cnt, dtype, comm);
+    free(tmp_buf);
+  }
+
+  return 0;
+}
--- /dev/null
+#include "colls.h"
+//#include <star-reduction.c>
+
+/*
+ * Allreduce built from MPI_Alltoall + local reduction + MPI_Allgather:
+ * every rank receives one slice of every other rank's vector, reduces the
+ * slices it owns, and the reduced slices are then gathered everywhere.
+ *
+ * this requires that count >= NP
+ *
+ * When count is not a multiple of the communicator size the input is
+ * copied into a zero-padded scratch vector and only the first count
+ * elements of the result are copied to rbuff.
+ *
+ * Always returns 0.
+ */
+int smpi_coll_tuned_allreduce_rab2(void *sbuff, void *rbuff,
+                                   int count, MPI_Datatype dtype,
+                                   MPI_Op op, MPI_Comm comm)
+{
+  MPI_Aint s_extent;
+  int i, rank, nprocs;
+  int nbytes, send_size, s_offset, r_offset;
+  void *recv, *send, *tmp;
+
+  MPI_Comm_rank(comm, &rank);
+  MPI_Comm_size(comm, &nprocs);
+
+  MPI_Type_extent(dtype, &s_extent);
+
+  // uneven count
+  if (count % nprocs) {
+    if (count < nprocs)
+      send_size = nprocs;
+    else
+      send_size = (count + nprocs) / nprocs;
+    nbytes = send_size * s_extent;
+
+    // zero-filled: the padding beyond count is shipped by the alltoall
+    // and fed to the reduction, so it must not be left uninitialised
+    send = calloc((size_t) send_size * nprocs, s_extent);
+    recv = (void *) malloc(s_extent * send_size * nprocs);
+    tmp = (void *) malloc(nbytes);
+
+    memcpy(send, sbuff, s_extent * count);
+
+    MPI_Alltoall(send, send_size, dtype, recv, send_size, dtype, comm);
+
+    // seed the accumulator with the slice from rank 0, then fold in the
+    // slices received from ranks 1 .. nprocs-1
+    memcpy(tmp, recv, nbytes);
+
+    for (i = 1, s_offset = nbytes; i < nprocs; i++, s_offset = i * nbytes)
+      star_reduction(op, (char *) recv + s_offset, tmp, &send_size, &dtype);
+
+    MPI_Allgather(tmp, send_size, dtype, recv, send_size, dtype, comm);
+    memcpy(rbuff, recv, count * s_extent);
+
+    free(recv);
+    free(tmp);
+    free(send);
+  } else {
+    send = sbuff;
+    send_size = count / nprocs;
+    nbytes = send_size * s_extent;
+    // byte offset of the slice this rank is responsible for
+    r_offset = rank * nbytes;
+
+    recv = (void *) malloc(s_extent * send_size * nprocs);
+
+    MPI_Alltoall(send, send_size, dtype, recv, send_size, dtype, comm);
+
+    // accumulate directly in this rank's slice of rbuff
+    memcpy((char *) rbuff + r_offset, recv, nbytes);
+
+    for (i = 1, s_offset = nbytes; i < nprocs; i++, s_offset = i * nbytes)
+      star_reduction(op, (char *) recv + s_offset, (char *) rbuff + r_offset,
+                     &send_size, &dtype);
+
+    MPI_Allgather((char *) rbuff + r_offset, send_size, dtype, rbuff, send_size,
+                  dtype, comm);
+    free(recv);
+  }
+
+  return 0;
+}
--- /dev/null
+#include "colls.h"
+//#include <star-reduction.c>
+
+/*
+ * Allreduce in which every participating rank exchanges its full vector
+ * with a partner at distance 1, 2, 4, ... (in a virtual rank space) and
+ * reduces after each exchange.
+ *
+ * Ranks in excess of the largest power of two hand their data to a
+ * neighbour before the exchanges and receive the result from it afterwards.
+ *
+ * Returns 0 on success, 1 if the scratch buffer cannot be allocated.
+ */
+int smpi_coll_tuned_allreduce_rdb(void *sbuff, void *rbuff, int count,
+                                  MPI_Datatype dtype, MPI_Op op, MPI_Comm comm)
+{
+  int comm_size, me, elem_size, tag = 543;
+  int step, peer, pow2, vrank, extra, vpeer;
+  MPI_Aint ext;
+  MPI_Status st;
+  void *scratch = NULL;
+
+  MPI_Comm_size(comm, &comm_size);
+  MPI_Comm_rank(comm, &me);
+
+  MPI_Type_extent(dtype, &ext);
+  scratch = (void *) malloc(count * ext);
+  if (scratch == NULL) {
+    printf("Could not allocate memory for tmp_buf\n");
+    return 1;
+  }
+
+  /* seed the result buffer with the local contribution (send to self) */
+  MPI_Sendrecv(sbuff, count, dtype, me, 500,
+               rbuff, count, dtype, me, 500, comm, &st);
+
+  MPI_Type_size(dtype, &elem_size);
+
+  /* largest power of two that does not exceed the communicator size */
+  for (pow2 = 1; pow2 <= comm_size; pow2 <<= 1) {
+  }
+  pow2 >>= 1;
+
+  extra = comm_size - pow2;
+
+  /* Fold the surplus ranks: below 2*extra, each even rank gives its data
+     to the odd rank above it and sits out (virtual rank -1); the odd rank
+     reduces what it got. All other ranks are simply renumbered. */
+  if (me >= 2 * extra) {
+    vrank = me - extra;
+  } else if (me % 2 != 0) {
+    MPI_Recv(scratch, count, dtype, me - 1, tag, comm, &st);
+    /* the operands are already in rank order here */
+    star_reduction(op, scratch, rbuff, &count, &dtype);
+    vrank = me / 2;
+  } else {
+    MPI_Send(rbuff, count, dtype, me + 1, tag, comm);
+    vrank = -1;
+  }
+
+  if (vrank != -1) {
+    for (step = 0x1; step < pow2; step <<= 1) {
+      vpeer = vrank ^ step;
+      /* translate the virtual partner back to a real rank */
+      if (vpeer < extra)
+        peer = vpeer * 2 + 1;
+      else
+        peer = vpeer + extra;
+
+      /* ship the current partial result, fetch the partner's */
+      MPI_Sendrecv(rbuff, count, dtype, peer, tag, scratch, count, dtype,
+                   peer, tag, comm, &st);
+
+      if (peer < me) {
+        /* partner's data comes first: accumulate straight into rbuff */
+        star_reduction(op, scratch, rbuff, &count, &dtype);
+      } else {
+        /* our data comes first: accumulate into scratch ... */
+        star_reduction(op, rbuff, scratch, &count, &dtype);
+
+        /* ... and move the result back into rbuff via a send to self */
+        MPI_Sendrecv(scratch, count, dtype, me, tag, rbuff, count,
+                     dtype, me, tag, comm, &st);
+      }
+    }
+  }
+
+  /* hand the result back to the even ranks that sat out */
+  if (me < 2 * extra) {
+    if (me % 2 != 0)
+      MPI_Send(rbuff, count, dtype, me - 1, tag, comm);
+    else
+      MPI_Recv(rbuff, count, dtype, me + 1, tag, comm, &st);
+  }
+
+  free(scratch);
+  return 0;
+}
--- /dev/null
+#include "colls.h"
+
+/*
+ * Allreduce composed of two library collectives: MPI_Reduce onto rank 0,
+ * then MPI_Bcast of the result from rank 0.
+ * buf is the input, buf2 receives the result. The return codes of both
+ * calls are ignored; the function always returns 0.
+ */
+int smpi_coll_tuned_allreduce_redbcast(void *buf, void *buf2, int count,
+                                       MPI_Datatype datatype, MPI_Op op,
+                                       MPI_Comm comm)
+{
+  const int root = 0;
+
+  MPI_Reduce(buf, buf2, count, datatype, op, root, comm);
+  MPI_Bcast(buf2, count, datatype, root, comm);
+
+  return 0;
+}
--- /dev/null
+#include "colls.h"
+/* IMPLEMENTED BY PITCH PATARASUK
+ Non-topology-specific (however, the number of cores per node needs to be changed)
+ all-reduce operation designed for smp clusters
+ It uses 2-layer communication: binomial for both intra-communication
+ and inter-communication
+ The communication is done in a pipelined fashion */
+
+/* Number of cores per SMP node; override at build time with -DNUM_CORE=<n>.
+ Ranks are grouped into nodes of NUM_CORE consecutive ranks, and
+ we assume that number of core per process will be the same for all implementations */
+#ifndef NUM_CORE
+#define NUM_CORE 8
+#endif
+
+/* this is a default segment size for pipelining,
+ but it is typically passed as a command line argument.
+ It is used as an element count (number of dtype elements per segment),
+ not as a byte count. */
+int allreduce_smp_binomial_pipeline_segment_size = 4096;
+
+/* ** NOTE **
+ This code is modified from allreduce-smp-binomial.c by wrapping the code with pipeline effect as follow
+ for (loop over pipelength) {
+ smp-binomial main code;
+ }
+*/
+
+/* ** NOTE **
+ Use -DMPICH2 if this code does not compile.
+ MPICH1 code also work on MPICH2 on our cluster and the performance are similar.
+ This code assume commutative and associative reduce operator (MPI_SUM, MPI_MAX, etc).
+*/
+
+/* How the reduction function is obtained from an MPI_Op depends on the MPI
+ implementation this file is built against. */
+#ifndef MPICH2
+/* Without MPICH2 defined: MPIR_ToPointer() is used to turn the op handle
+ into a struct MPIR_OP whose 'op' member is the reduction function. */
+extern void *MPIR_ToPointer();
+
+struct MPIR_OP {
+ MPI_User_function *op;
+ int commute, permanent;
+};
+
+#else
+/* With MPICH2 defined: reduction functions are looked up in this table,
+ indexed by (op % 16 - 1). */
+extern MPI_User_function *MPIR_Op_table[];
+#endif
+
+/*
+This function performs all-reduce operation as follows. ** in a pipeline fashion **
+1) binomial_tree reduce inside each SMP node
+2) binomial_tree reduce inter-communication between root of each SMP node
+3) binomial_tree bcast inter-communication between root of each SMP node
+4) binomial_tree bcast inside each SMP node
+*/
+/*
+ * Pipelined two-level all-reduce for SMP clusters.
+ *
+ * The vector is cut into segments of at most
+ * allreduce_smp_binomial_pipeline_segment_size elements. Each segment goes
+ * through four stages, one stage per loop iteration ("phase"), so that up
+ * to four segments are in flight at once:
+ *   1) binomial reduce inside each SMP node (NUM_CORE consecutive ranks)
+ *   2) binomial reduce between the first ranks of the nodes
+ *   3) binomial broadcast between the first ranks of the nodes
+ *   4) binomial broadcast inside each SMP node
+ *
+ * send_buf is the input, recv_buf receives the result (count elements of
+ * dtype each). The operator is assumed commutative and associative.
+ *
+ * The last segment may be shorter than the others, so count need not be a
+ * multiple of the segment size. A count of zero, or a non-positive segment
+ * size, no longer causes a division by zero.
+ *
+ * Returns MPI_SUCCESS.
+ */
+int smpi_coll_tuned_allreduce_smp_binomial_pipeline(void *send_buf,
+                                                    void *recv_buf, int count,
+                                                    MPI_Datatype dtype,
+                                                    MPI_Op op, MPI_Comm comm)
+{
+  int comm_size, rank;
+  void *tmp_buf;
+  int tag = 50;
+  int mask, src, dst;
+  MPI_Status status;
+  int num_core = NUM_CORE;
+
+  MPI_User_function *uop;
+#ifndef MPICH2
+  struct MPIR_OP *op_ptr = MPIR_ToPointer(op);
+  uop = (MPI_User_function *) op_ptr->op;
+#else
+  uop = MPIR_Op_table[op % 16 - 1];
+#endif
+
+  /* nothing to reduce; every rank passes the same count, so all of them
+     leave here together */
+  if (count <= 0)
+    return MPI_SUCCESS;
+
+  MPI_Comm_size(comm, &comm_size);
+  MPI_Comm_rank(comm, &rank);
+  MPI_Aint extent;
+  MPI_Type_extent(dtype, &extent);
+  tmp_buf = (void *) malloc(count * extent);
+
+  /* position inside the node / index of the node */
+  int intra_rank, inter_rank;
+  intra_rank = rank % num_core;
+  inter_rank = rank / num_core;
+
+  int phase;
+  /* elements per full segment; fall back to a single segment when the
+     configured size is unusable or larger than the vector */
+  int pcount = allreduce_smp_binomial_pipeline_segment_size;
+  if (pcount <= 0 || pcount > count) {
+    pcount = count;
+  }
+
+  /* size of processes participate in intra communications =>
+     should be equal to number of machines */
+  int inter_comm_size = (comm_size + num_core - 1) / num_core;
+
+  /* copy input buffer to output buffer */
+  MPI_Sendrecv(send_buf, count, dtype, rank, tag,
+               recv_buf, count, dtype, rank, tag, comm, &status);
+
+  /* compute pipe length: number of segments, rounding up so that a
+     trailing partial segment is processed as well */
+  int pipelength = count / pcount + ((count % pcount) ? 1 : 0);
+  /* element count of the final (possibly shorter) segment */
+  int last_count = count - (pipelength - 1) * pcount;
+
+  /* pipelining over pipelength (+3 is because we have 4 stages:
+     reduce-intra, reduce-inter, bcast-inter, bcast-intra */
+  for (phase = 0; phase < pipelength + 3; phase++) {
+
+    /* start binomial reduce intra communication inside each SMP node */
+    if (phase < pipelength) {
+      int seg = phase;
+      int seg_count = (seg == pipelength - 1) ? last_count : pcount;
+      MPI_Aint offset = (MPI_Aint) seg * pcount * extent;
+
+      mask = 1;
+      while (mask < num_core) {
+        if ((mask & intra_rank) == 0) {
+          src = (inter_rank * num_core) + (intra_rank | mask);
+          if (src < comm_size) {
+            MPI_Recv(tmp_buf, seg_count, dtype, src, tag, comm, &status);
+            (*uop) (tmp_buf, (char *) recv_buf + offset, &seg_count, &dtype);
+          }
+        } else {
+          dst = (inter_rank * num_core) + (intra_rank & (~mask));
+          MPI_Send((char *) recv_buf + offset, seg_count, dtype, dst, tag,
+                   comm);
+          break;
+        }
+        mask <<= 1;
+      }
+    }
+
+    /* start binomial reduce inter-communication between each SMP nodes:
+       each node only have one process that can communicate to other nodes */
+    if ((phase > 0) && (phase < (pipelength + 1))) {
+      if (intra_rank == 0) {
+        int seg = phase - 1;
+        int seg_count = (seg == pipelength - 1) ? last_count : pcount;
+        MPI_Aint offset = (MPI_Aint) seg * pcount * extent;
+
+        mask = 1;
+        while (mask < inter_comm_size) {
+          if ((mask & inter_rank) == 0) {
+            src = (inter_rank | mask) * num_core;
+            if (src < comm_size) {
+              MPI_Recv(tmp_buf, seg_count, dtype, src, tag, comm, &status);
+              (*uop) (tmp_buf, (char *) recv_buf + offset, &seg_count,
+                      &dtype);
+            }
+          } else {
+            dst = (inter_rank & (~mask)) * num_core;
+            MPI_Send((char *) recv_buf + offset, seg_count, dtype, dst, tag,
+                     comm);
+            break;
+          }
+          mask <<= 1;
+        }
+      }
+    }
+
+    /* start binomial broadcast inter-communication between each SMP nodes:
+       each node only have one process that can communicate to other nodes */
+    if ((phase > 1) && (phase < (pipelength + 2))) {
+      if (intra_rank == 0) {
+        int seg = phase - 2;
+        int seg_count = (seg == pipelength - 1) ? last_count : pcount;
+        MPI_Aint offset = (MPI_Aint) seg * pcount * extent;
+
+        /* receive from the parent (lowest set bit of inter_rank) ... */
+        mask = 1;
+        while (mask < inter_comm_size) {
+          if (inter_rank & mask) {
+            src = (inter_rank - mask) * num_core;
+            MPI_Recv((char *) recv_buf + offset, seg_count, dtype, src, tag,
+                     comm, &status);
+            break;
+          }
+          mask <<= 1;
+        }
+        mask >>= 1;
+
+        /* ... then forward to the children, farthest first */
+        while (mask > 0) {
+          if (inter_rank < inter_comm_size) {
+            dst = (inter_rank + mask) * num_core;
+            if (dst < comm_size) {
+              MPI_Send((char *) recv_buf + offset, seg_count, dtype, dst,
+                       tag, comm);
+            }
+          }
+          mask >>= 1;
+        }
+      }
+    }
+
+    /* start binomial broadcast intra-communication inside each SMP nodes */
+    if (phase > 2) {
+      int seg = phase - 3;
+      int seg_count = (seg == pipelength - 1) ? last_count : pcount;
+      MPI_Aint offset = (MPI_Aint) seg * pcount * extent;
+
+      /* the last node may hold fewer than num_core ranks */
+      int num_core_in_current_smp = num_core;
+      if (inter_rank == (inter_comm_size - 1)) {
+        num_core_in_current_smp = comm_size - (inter_rank * num_core);
+      }
+      mask = 1;
+      while (mask < num_core_in_current_smp) {
+        if (intra_rank & mask) {
+          src = (inter_rank * num_core) + (intra_rank - mask);
+          MPI_Recv((char *) recv_buf + offset, seg_count, dtype, src, tag,
+                   comm, &status);
+          break;
+        }
+        mask <<= 1;
+      }
+      mask >>= 1;
+
+      while (mask > 0) {
+        dst = (inter_rank * num_core) + (intra_rank + mask);
+        if (dst < comm_size) {
+          MPI_Send((char *) recv_buf + offset, seg_count, dtype, dst, tag,
+                   comm);
+        }
+        mask >>= 1;
+      }
+    }
+  }                             // for phase
+
+  free(tmp_buf);
+  return MPI_SUCCESS;
+}
--- /dev/null
+#include "colls.h"
+/* IMPLEMENTED BY PITCH PATARASUK
+ Non-topology-specific (however, the number of cores per node needs to be changed)
+ all-reduce operation designed for smp clusters
+ It uses 2-layer communication: binomial for both intra-communication
+ and inter-communication */
+
+/* Number of cores per SMP node; override at build time with -DNUM_CORE=<n>.
+ Ranks are grouped into nodes of NUM_CORE consecutive ranks, and
+ we assume that number of core per process will be the same for all implementations */
+#ifndef NUM_CORE
+#define NUM_CORE 8
+#endif
+
+/* ** NOTE **
+ Use -DMPICH2 if this code does not compile.
+ MPICH1 code also work on MPICH2 on our cluster and the performance are similar.
+ This code assume commutative and associative reduce operator (MPI_SUM, MPI_MAX, etc).
+*/
+
+//#include <star-reduction.c>
+
+/*
+This function performs all-reduce operation as follows.
+1) binomial_tree reduce inside each SMP node
+2) binomial_tree reduce inter-communication between root of each SMP node
+3) binomial_tree bcast inter-communication between root of each SMP node
+4) binomial_tree bcast inside each SMP node
+*/
+/*
+ * Two-level all-reduce for SMP clusters: reduce inside each group of
+ * NUM_CORE consecutive ranks, reduce across the first ranks of the groups,
+ * then broadcast back across the groups and inside each group.
+ *
+ * send_buf is the input, recv_buf receives the result (count elements of
+ * dtype each). Reductions are delegated to star_reduction().
+ * Returns MPI_SUCCESS.
+ */
+int smpi_coll_tuned_allreduce_smp_binomial(void *send_buf, void *recv_buf,
+ int count, MPI_Datatype dtype,
+ MPI_Op op, MPI_Comm comm)
+{
+ int comm_size, rank;
+ void *tmp_buf;
+ int tag = 50;
+ int mask, src, dst;
+ int num_core = NUM_CORE;
+ MPI_Status status;
+ /*
+ #ifdef MPICH2_REDUCTION
+ MPI_User_function * uop = MPIR_Op_table[op % 16 - 1];
+ #else
+ MPI_User_function *uop;
+ struct MPIR_OP *op_ptr;
+ op_ptr = MPIR_ToPointer(op);
+ uop = op_ptr->op;
+ #endif
+ */
+
+ MPI_Comm_size(comm, &comm_size);
+ MPI_Comm_rank(comm, &rank);
+ MPI_Aint extent;
+ MPI_Type_extent(dtype, &extent);
+ /* scratch space for one incoming vector; NOTE(review): the allocation is
+ not checked for failure */
+ tmp_buf = (void *) malloc(count * extent);
+
+ /* compute intra and inter ranking */
+ int intra_rank, inter_rank;
+ intra_rank = rank % num_core;
+ inter_rank = rank / num_core;
+
+ /* size of processes participate in intra communications =>
+ should be equal to number of machines */
+ int inter_comm_size = (comm_size + num_core - 1) / num_core;
+
+ /* copy input buffer to output buffer */
+ MPI_Sendrecv(send_buf, count, dtype, rank, tag,
+ recv_buf, count, dtype, rank, tag, comm, &status);
+
+ /* start binomial reduce intra communication inside each SMP node */
+ mask = 1;
+ while (mask < num_core) {
+ if ((mask & intra_rank) == 0) {
+ /* bit clear: receive from the rank with this bit set, if it exists */
+ src = (inter_rank * num_core) + (intra_rank | mask);
+ if (src < comm_size) {
+ MPI_Recv(tmp_buf, count, dtype, src, tag, comm, &status);
+ star_reduction(op, tmp_buf, recv_buf, &count, &dtype);
+ }
+ } else {
+ /* bit set: send the partial result to the rank with this bit cleared
+ and leave the reduce stage */
+ dst = (inter_rank * num_core) + (intra_rank & (~mask));
+ MPI_Send(recv_buf, count, dtype, dst, tag, comm);
+ break;
+ }
+ mask <<= 1;
+ }
+
+ /* start binomial reduce inter-communication between each SMP nodes:
+ each node only have one process that can communicate to other nodes */
+ if (intra_rank == 0) {
+ mask = 1;
+ while (mask < inter_comm_size) {
+ if ((mask & inter_rank) == 0) {
+ src = (inter_rank | mask) * num_core;
+ if (src < comm_size) {
+ MPI_Recv(tmp_buf, count, dtype, src, tag, comm, &status);
+ star_reduction(op, tmp_buf, recv_buf, &count, &dtype);
+ }
+ } else {
+ dst = (inter_rank & (~mask)) * num_core;
+ MPI_Send(recv_buf, count, dtype, dst, tag, comm);
+ break;
+ }
+ mask <<= 1;
+ }
+ }
+
+ /* start binomial broadcast inter-communication between each SMP nodes:
+ each node only have one process that can communicate to other nodes */
+ if (intra_rank == 0) {
+ /* receive from the parent: the node whose index has our lowest set bit
+ cleared (node 0 has no parent and receives nothing) */
+ mask = 1;
+ while (mask < inter_comm_size) {
+ if (inter_rank & mask) {
+ src = (inter_rank - mask) * num_core;
+ MPI_Recv(recv_buf, count, dtype, src, tag, comm, &status);
+ break;
+ }
+ mask <<= 1;
+ }
+ mask >>= 1;
+
+ /* forward to the children, walking the remaining bits downwards */
+ while (mask > 0) {
+ if (inter_rank < inter_comm_size) {
+ dst = (inter_rank + mask) * num_core;
+ if (dst < comm_size) {
+ MPI_Send(recv_buf, count, dtype, dst, tag, comm);
+ }
+ }
+ mask >>= 1;
+ }
+ }
+
+ /* start binomial broadcast intra-communication inside each SMP nodes */
+ /* the last node may hold fewer than num_core ranks */
+ int num_core_in_current_smp = num_core;
+ if (inter_rank == (inter_comm_size - 1)) {
+ num_core_in_current_smp = comm_size - (inter_rank * num_core);
+ }
+ mask = 1;
+ while (mask < num_core_in_current_smp) {
+ if (intra_rank & mask) {
+ src = (inter_rank * num_core) + (intra_rank - mask);
+ MPI_Recv(recv_buf, count, dtype, src, tag, comm, &status);
+ break;
+ }
+ mask <<= 1;
+ }
+ mask >>= 1;
+
+ while (mask > 0) {
+ dst = (inter_rank * num_core) + (intra_rank + mask);
+ if (dst < comm_size) {
+ MPI_Send(recv_buf, count, dtype, dst, tag, comm);
+ }
+ mask >>= 1;
+ }
+
+ free(tmp_buf);
+ return MPI_SUCCESS;
+}
--- /dev/null
+#include "colls.h"
+/* IMPLEMENTED BY PITCH PATARASUK
+ Non-topology-specific (however, the number of cores per node needs to be changed)
+ all-reduce operation designed for smp clusters
+ It uses 2-layer communication: binomial for intra-communication
+ and rdb for inter-communication */
+
+/* Number of cores per SMP node; override at build time with -DNUM_CORE=<n>.
+ Ranks are grouped into nodes of NUM_CORE consecutive ranks, and
+ we assume that number of core per process will be the same for all implementations */
+#ifndef NUM_CORE
+#define NUM_CORE 8
+#endif
+
+/* ** NOTE **
+ Use -DMPICH2 if this code does not compile.
+ MPICH1 code also work on MPICH2 on our cluster and the performance are similar.
+ This code assume commutative and associative reduce operator (MPI_SUM, MPI_MAX, etc).
+*/
+
+//#include <star-reduction.c>
+
+/*
+This function performs all-reduce operation as follows.
+1) binomial_tree reduce inside each SMP node
+2) Recursive doubling inter-communication between root of each SMP node
+3) binomial_tree bcast inside each SMP node
+*/
+int smpi_coll_tuned_allreduce_smp_rdb(void *send_buf, void *recv_buf, int count,
+ MPI_Datatype dtype, MPI_Op op,
+ MPI_Comm comm)
+{
+ int comm_size, rank;
+ void *tmp_buf;
+ int tag = 50;
+ int mask, src, dst;
+ MPI_Status status;
+ int num_core = NUM_CORE;
+ /*
+ #ifdef MPICH2_REDUCTION
+ MPI_User_function * uop = MPIR_Op_table[op % 16 - 1];
+ #else
+ MPI_User_function *uop;
+ struct MPIR_OP *op_ptr;
+ op_ptr = MPIR_ToPointer(op);
+ uop = op_ptr->op;
+ #endif
+ */
+ MPI_Comm_size(comm, &comm_size);
+ MPI_Comm_rank(comm, &rank);
+ MPI_Aint extent;
+ MPI_Type_extent(dtype, &extent);
+ tmp_buf = (void *) malloc(count * extent);
+
+ /* compute intra and inter ranking */
+ int intra_rank, inter_rank;
+ intra_rank = rank % num_core;
+ inter_rank = rank / num_core;
+
+ /* size of processes participate in intra communications =>
+ should be equal to number of machines */
+ int inter_comm_size = (comm_size + num_core - 1) / num_core;
+
+ /* copy input buffer to output buffer */
+ MPI_Sendrecv(send_buf, count, dtype, rank, tag,
+ recv_buf, count, dtype, rank, tag, comm, &status);
+
+ /* start binomial reduce intra communication inside each SMP node */
+ mask = 1;
+ while (mask < num_core) {
+ if ((mask & intra_rank) == 0) {
+ src = (inter_rank * num_core) + (intra_rank | mask);
+ if (src < comm_size) {
+ MPI_Recv(tmp_buf, count, dtype, src, tag, comm, &status);
+ star_reduction(op, tmp_buf, recv_buf, &count, &dtype);
+ }
+ } else {
+ dst = (inter_rank * num_core) + (intra_rank & (~mask));
+ MPI_Send(recv_buf, count, dtype, dst, tag, comm);
+ break;
+ }
+ mask <<= 1;
+ } /* end binomial reduce intra-communication */
+
+
+ /* start rdb (recursive doubling) all-reduce inter-communication
+ between each SMP nodes : each node only have one process that can communicate
+ to other nodes */
+ if (intra_rank == 0) {
+
+ /* find nearest power-of-two less than or equal to inter_comm_size */
+ int pof2, rem, newrank, newdst;
+ pof2 = 1;
+ while (pof2 <= inter_comm_size)
+ pof2 <<= 1;
+ pof2 >>= 1;
+ rem = inter_comm_size - pof2;
+
+ /* In the non-power-of-two case, all even-numbered
+ processes of rank < 2*rem send their data to
+ (rank+1). These even-numbered processes no longer
+ participate in the algorithm until the very end.
+ */
+ if (inter_rank < 2 * rem) {
+ if (inter_rank % 2 == 0) {
+ dst = rank + num_core;
+ MPI_Send(recv_buf, count, dtype, dst, tag, comm);
+ newrank = -1;
+ } else {
+ src = rank - num_core;
+ MPI_Recv(tmp_buf, count, dtype, src, tag, comm, &status);
+ star_reduction(op, tmp_buf, recv_buf, &count, &dtype);
+ newrank = inter_rank / 2;
+ }
+ } else {
+ newrank = inter_rank - rem;
+ }
+
+ /* example inter-communication RDB rank change algorithm
+ 0,4,8,12..36 <= true rank (assume 4 core per SMP)
+ 0123 4567 89 <= inter_rank
+ 1 3 4567 89 (1,3 got data from 0,2 : 0,2 will be idle until the end)
+ 0 1 4567 89
+ 0 1 2345 67 => newrank
+ */
+
+ if (newrank != -1) {
+ mask = 1;
+ while (mask < pof2) {
+ newdst = newrank ^ mask;
+ /* find real rank of dest */
+ dst = (newdst < rem) ? newdst * 2 + 1 : newdst + rem;
+ dst *= num_core;
+
+ /* exchange data in rdb manner */
+ MPI_Sendrecv(recv_buf, count, dtype, dst, tag, tmp_buf, count, dtype,
+ dst, tag, comm, &status);
+ star_reduction(op, tmp_buf, recv_buf, &count, &dtype);
+ mask <<= 1;
+ }
+ }
+
+ /* non pof2 case
+ left-over processes (all even ranks: < 2 * rem) get the result
+ */
+ if (inter_rank < 2 * rem) {
+ if (inter_rank % 2) {
+ MPI_Send(recv_buf, count, dtype, rank - num_core, tag, comm);
+ } else {
+ MPI_Recv(recv_buf, count, dtype, rank + num_core, tag, comm, &status);
+ }
+ }
+ }
+
+ /* start binomial broadcast intra-communication inside each SMP nodes */
+ int num_core_in_current_smp = num_core;
+ if (inter_rank == (inter_comm_size - 1)) {
+ num_core_in_current_smp = comm_size - (inter_rank * num_core);
+ }
+ mask = 1;
+ while (mask < num_core_in_current_smp) {
+ if (intra_rank & mask) {
+ src = (inter_rank * num_core) + (intra_rank - mask);
+ MPI_Recv(recv_buf, count, dtype, src, tag, comm, &status);
+ break;
+ }
+ mask <<= 1;
+ }
+ mask >>= 1;
+
+ while (mask > 0) {
+ dst = (inter_rank * num_core) + (intra_rank + mask);
+ if (dst < comm_size) {
+ MPI_Send(recv_buf, count, dtype, dst, tag, comm);
+ }
+ mask >>= 1;
+ }
+
+ free(tmp_buf);
+ return MPI_SUCCESS;
+}
--- /dev/null
+#include "colls.h"
+//#include <star-reduction.c>
+
+/* Change the number of cores per SMP node here.
+ We assume that this number is the same for all implementations. */
+#ifndef NUM_CORE
+#define NUM_CORE 8
+#endif
+
+/*
+ * All-reduce for SMP clusters, done in four phases:
+ *   1) binomial-tree reduce inside each SMP node, towards the node root
+ *   2) ring reduce-scatter between the roots of the SMP nodes
+ *   3) ring allgather between the roots of the SMP nodes
+ *   4) binomial-tree broadcast inside each SMP node, from the node root
+ *
+ * A node is a group of NUM_CORE consecutive ranks and its root is the rank
+ * whose position inside the group is 0; the last node may hold fewer ranks.
+ *
+ * For the inter-node phases the buffer is cut into one segment per node of
+ * count / nodes elements each; the last segment additionally carries the
+ * count % nodes leftover elements. Segments are identified by index, so the
+ * split stays correct when count is smaller than the number of nodes (all
+ * segments but the last are then empty).
+ *
+ * Always returns MPI_SUCCESS.
+ */
+int smpi_coll_tuned_allreduce_smp_rsag_lr(void *send_buf, void *recv_buf,
+                                          int count, MPI_Datatype dtype,
+                                          MPI_Op op, MPI_Comm comm)
+{
+  int comm_size, rank;
+  void *tmp_buf;
+  int tag = 50;
+  int mask, src, dst;
+  MPI_Status status;
+  int num_core = NUM_CORE;
+
+  MPI_Comm_size(comm, &comm_size);
+  MPI_Comm_rank(comm, &rank);
+  MPI_Aint extent;
+  MPI_Type_extent(dtype, &extent);
+  /* scratch space for incoming data; any single message fits in it */
+  tmp_buf = (void *) malloc(count * extent);
+
+  /* position of this rank inside its node, and index of the node */
+  int intra_rank = rank % num_core;
+  int inter_rank = rank / num_core;
+
+  /* number of nodes, counting a partially filled last node */
+  int inter_comm_size = (comm_size + num_core - 1) / num_core;
+
+  /* copy the input into recv_buf; every later step works in place on it */
+  MPI_Sendrecv(send_buf, count, dtype, rank, tag,
+               recv_buf, count, dtype, rank, tag, comm, &status);
+
+  /* phase 1: binomial reduce inside the node */
+  mask = 1;
+  while (mask < num_core) {
+    if ((mask & intra_rank) == 0) {
+      src = (inter_rank * num_core) + (intra_rank | mask);
+      /* the partner may not exist in a partially filled last node */
+      if (src < comm_size) {
+        MPI_Recv(tmp_buf, count, dtype, src, tag, comm, &status);
+        star_reduction(op, tmp_buf, recv_buf, &count, &dtype);
+      }
+    } else {
+      dst = (inter_rank * num_core) + (intra_rank & (~mask));
+      MPI_Send(recv_buf, count, dtype, dst, tag, comm);
+      break;
+    }
+    mask <<= 1;
+  }
+
+  /* phases 2 and 3 involve the node roots only */
+  if (intra_rank == 0) {
+    int send_seg, recv_seg;
+    int send_count, recv_count;
+    MPI_Aint send_offset, recv_offset;
+    int curr_size = count / inter_comm_size;
+    int curr_remainder = count % inter_comm_size;
+    /* the last segment is the one that also holds the remainder */
+    int last_seg = inter_comm_size - 1;
+
+    /* ring neighbours: roots of the next and of the previous node */
+    int to = ((inter_rank + 1) % inter_comm_size) * num_core;
+    int from =
+        ((inter_rank + inter_comm_size - 1) % inter_comm_size) * num_core;
+    int i;
+
+    /* phase 2: ring reduce-scatter; afterwards this root owns the fully
+       reduced segment number inter_rank */
+    for (i = 0; i < (inter_comm_size - 1); i++) {
+      send_seg = (inter_rank - 1 - i + inter_comm_size) % inter_comm_size;
+      recv_seg = (inter_rank - 2 - i + inter_comm_size) % inter_comm_size;
+
+      /* byte offsets are computed in MPI_Aint to avoid int overflow */
+      send_offset = (MPI_Aint) send_seg * curr_size * extent;
+      recv_offset = (MPI_Aint) recv_seg * curr_size * extent;
+
+      /* compare segment indices, not byte offsets: offsets all collapse to
+         0 when curr_size is 0 and can no longer tell segments apart */
+      send_count = curr_size + ((send_seg == last_seg) ? curr_remainder : 0);
+      recv_count = curr_size + ((recv_seg == last_seg) ? curr_remainder : 0);
+
+      MPI_Sendrecv((char *) recv_buf + send_offset, send_count, dtype, to,
+                   tag + i, tmp_buf, recv_count, dtype, from, tag + i, comm,
+                   &status);
+
+      /* accumulate the received segment into recv_buf */
+      star_reduction(op, tmp_buf, (char *) recv_buf + recv_offset, &recv_count,
+                     &dtype);
+    }
+
+    /* phase 3: ring allgather of the reduced segments */
+    for (i = 0; i < (inter_comm_size - 1); i++) {
+      send_seg = (inter_rank - i + inter_comm_size) % inter_comm_size;
+      recv_seg = (inter_rank - 1 - i + inter_comm_size) % inter_comm_size;
+
+      send_offset = (MPI_Aint) send_seg * curr_size * extent;
+      recv_offset = (MPI_Aint) recv_seg * curr_size * extent;
+
+      send_count = curr_size + ((send_seg == last_seg) ? curr_remainder : 0);
+      recv_count = curr_size + ((recv_seg == last_seg) ? curr_remainder : 0);
+
+      MPI_Sendrecv((char *) recv_buf + send_offset, send_count, dtype, to,
+                   tag + i, (char *) recv_buf + recv_offset, recv_count, dtype,
+                   from, tag + i, comm, &status);
+    }
+  }
+
+  /* phase 4: binomial broadcast inside the node */
+  int num_core_in_current_smp = num_core;
+  if (inter_rank == (inter_comm_size - 1)) {
+    /* the last node may hold fewer than num_core ranks */
+    num_core_in_current_smp = comm_size - (inter_rank * num_core);
+  }
+
+  /* wait for the data from the rank with our lowest set bit cleared */
+  mask = 1;
+  while (mask < num_core_in_current_smp) {
+    if (intra_rank & mask) {
+      src = (inter_rank * num_core) + (intra_rank - mask);
+      MPI_Recv(recv_buf, count, dtype, src, tag, comm, &status);
+      break;
+    }
+    mask <<= 1;
+  }
+
+  mask >>= 1;
+
+  /* then forward it down the tree, largest stride first */
+  while (mask > 0) {
+    dst = (inter_rank * num_core) + (intra_rank + mask);
+    if (dst < comm_size) {
+      MPI_Send(recv_buf, count, dtype, dst, tag, comm);
+    }
+    mask >>= 1;
+  }
+
+  free(tmp_buf);
+  return MPI_SUCCESS;
+}
--- /dev/null
+#include "colls.h"
+/*
+ * implemented by Pitch Patarasuk, 07/01/2007
+ */
+//#include <star-reduction.c>
+
+/* Change the number of cores per SMP node here.
+ We assume that this number is the same for all implementations. */
+#ifndef NUM_CORE
+#define NUM_CORE 8
+#endif
+
+/*
+This function performs an all-reduce operation as follows:
+1) binomial-tree reduce inside each SMP node
+2) inter-node reduce-scatter between the roots of the SMP nodes
+ (pairwise exchange with partner inter_rank ^ mask; the element count
+ exchanged is halved at every step)
+3) inter-node allgather between the roots of the SMP nodes
+ (same partners in reverse order; the element count is doubled at every step)
+4) binomial-tree bcast inside each SMP node
+
+A node is a group of NUM_CORE consecutive ranks; its root is the rank with
+intra_rank == 0. The input is first copied from sbuf to rbuf and all
+phases then work in place on rbuf. Always returns MPI_SUCCESS.
+
+NOTE(review): phases 2 and 3 use comm_size / num_core (rounded down) as the
+node count, pick partners with XOR and halve count at every step, so they
+appear to assume that the number of nodes is a power of two, that comm_size
+is a multiple of num_core and that count divides evenly -- TODO confirm
+against the callers.
+*/
+int smpi_coll_tuned_allreduce_smp_rsag_rab(void *sbuf, void *rbuf, int count,
+ MPI_Datatype dtype, MPI_Op op,
+ MPI_Comm comm)
+{
+ int comm_size, rank;
+ void *tmp_buf;
+ int tag = 50;
+ int mask, src, dst;
+ MPI_Status status;
+ int num_core = NUM_CORE;
+
+ MPI_Comm_size(comm, &comm_size);
+ MPI_Comm_rank(comm, &rank);
+ MPI_Aint extent;
+ MPI_Type_extent(dtype, &extent);
+ tmp_buf = (void *) malloc(count * extent);
+ /* intra_rank: position of this rank inside its node; inter_rank: index of the node */
+ int intra_rank, inter_rank;
+ intra_rank = rank % num_core;
+ inter_rank = rank / num_core;
+
+ //printf("node %d intra_rank = %d, inter_rank = %d\n", rank, intra_rank, inter_rank);
+ /* number of nodes, rounded up so that a partially filled last node is counted */
+ int inter_comm_size = (comm_size + num_core - 1) / num_core;
+ /* copy sbuf into rbuf through a self send/receive; all later steps work on rbuf */
+ MPI_Sendrecv(sbuf, count, dtype, rank, tag,
+ rbuf, count, dtype, rank, tag, comm, &status);
+
+ /* SMP_binomial_reduce: in each round a rank whose intra_rank has the mask
+ bit set sends its partial result to the rank with that bit cleared and
+ leaves the loop; the other ranks receive and reduce into rbuf */
+ mask = 1;
+ while (mask < num_core) {
+ if ((mask & intra_rank) == 0) {
+ src = (inter_rank * num_core) + (intra_rank | mask);
+ // if (src < ((inter_rank + 1) * num_core)) {
+ if (src < comm_size) {
+ MPI_Recv(tmp_buf, count, dtype, src, tag, comm, &status);
+ star_reduction(op, tmp_buf, rbuf, &count, &dtype);
+ //printf("Node %d recv from node %d when mask is %d\n", rank, src, mask);
+ }
+ } else {
+
+ dst = (inter_rank * num_core) + (intra_rank & (~mask));
+ MPI_Send(rbuf, count, dtype, dst, tag, comm);
+ //printf("Node %d send to node %d when mask is %d\n", rank, dst, mask);
+ break;
+ }
+ mask <<= 1;
+ }
+
+
+ /* INTER: reduce-scatter, executed only by the first rank of each node */
+ if (intra_rank == 0) {
+ /* note: the dst declared here shadows the function-level dst inside this block */
+ int dst, base_offset, send_base_offset, recv_base_offset, recv_chunk;
+ int curr_count, i, recv_offset, send_offset;
+
+ // reduce-scatter
+ /* recv_chunk: size in bytes of count / (comm_size / num_core) elements; it is the offset unit of the allgather below */
+ recv_chunk = extent * count / (comm_size / num_core);
+
+ mask = 1;
+ i = 0;
+ curr_count = count / 2;
+ int phase = 0; // only referenced by the commented-out debug printf
+ base_offset = 0;
+ send_base_offset = 0;
+ recv_base_offset = 0;
+
+ while (mask < (comm_size / num_core)) {
+ dst = inter_rank ^ mask;
+
+ // compute offsets (in elements; converted to bytes below)
+ send_base_offset = base_offset;
+
+ // right-hand side: keep the upper half of the current range, send the lower half
+ if (inter_rank & mask) {
+ recv_base_offset = base_offset + curr_count;
+ send_base_offset = base_offset;
+ base_offset = recv_base_offset;
+ }
+ // left-hand side: keep the lower half of the current range, send the upper half
+ else {
+ recv_base_offset = base_offset;
+ send_base_offset = base_offset + curr_count;
+ }
+ send_offset = send_base_offset * extent;
+ recv_offset = recv_base_offset * extent;
+
+ // if (rank==7)
+ // printf("node %d send to %d in phase %d s_offset = %d r_offset = %d count = %d\n",rank,dst,phase, send_offset, recv_offset, curr_count);
+
+ MPI_Sendrecv((char *)rbuf + send_offset, curr_count, dtype, (dst * num_core), tag,
+ tmp_buf, curr_count, dtype, (dst * num_core), tag,
+ comm, &status);
+
+ star_reduction(op, tmp_buf, (char *)rbuf + recv_offset, &curr_count, &dtype);
+
+ mask *= 2;
+ curr_count /= 2;
+ phase++;
+ }
+
+
+ // INTER: allgather
+ /* base_offset is rebuilt in units of recv_chunk from the bits of inter_rank;
+ presumably it is the index of the segment this node holds after the
+ reduce-scatter -- TODO confirm */
+ int size = (comm_size / num_core) / 2;
+ base_offset = 0;
+ mask = 1;
+ while (mask < (comm_size / num_core)) {
+ if (inter_rank & mask) {
+ base_offset += size;
+ }
+ mask <<= 1;
+ size /= 2;
+ }
+
+ curr_count *= 2;
+ mask >>= 1;
+ i = 1;
+ phase = 0;
+ while (mask >= 1) {
+ // destination pair for both send and recv
+ dst = inter_rank ^ mask;
+
+ // compute offsets (in units of recv_chunk; i is the number of chunks exchanged in this step)
+ send_base_offset = base_offset;
+ if (inter_rank & mask) {
+ recv_base_offset = base_offset - i;
+ base_offset -= i;
+ } else {
+ recv_base_offset = base_offset + i;
+ }
+ send_offset = send_base_offset * recv_chunk;
+ recv_offset = recv_base_offset * recv_chunk;
+
+ // if (rank==7)
+ //printf("node %d send to %d in phase %d s_offset = %d r_offset = %d count = %d\n",rank,dst,phase, send_offset, recv_offset, curr_count);
+
+ MPI_Sendrecv((char *)rbuf + send_offset, curr_count, dtype, (dst * num_core), tag,
+ (char *)rbuf + recv_offset, curr_count, dtype, (dst * num_core), tag,
+ comm, &status);
+
+
+ curr_count *= 2;
+ i *= 2;
+ mask >>= 1;
+ phase++;
+ }
+
+
+ } // INTER
+
+ /* intra SMP binomial bcast: a rank first waits for the data from the rank
+ with its lowest set bit cleared, then forwards it to the ranks below it
+ in the tree */
+ /* the last node may hold fewer than num_core ranks */
+ int num_core_in_current_smp = num_core;
+ if (inter_rank == (inter_comm_size - 1)) {
+ num_core_in_current_smp = comm_size - (inter_rank * num_core);
+ }
+ // printf("Node %d num_core = %d\n",rank, num_core_in_current_smp);
+ mask = 1;
+ while (mask < num_core_in_current_smp) {
+ if (intra_rank & mask) {
+ src = (inter_rank * num_core) + (intra_rank - mask);
+ //printf("Node %d recv from node %d when mask is %d\n", rank, src, mask);
+ MPI_Recv(rbuf, count, dtype, src, tag, comm, &status);
+ break;
+ }
+ mask <<= 1;
+ }
+
+ mask >>= 1;
+ //printf("My rank = %d my mask = %d\n", rank,mask);
+
+ while (mask > 0) {
+ dst = (inter_rank * num_core) + (intra_rank + mask);
+ if (dst < comm_size) {
+ //printf("Node %d send to node %d when mask is %d\n", rank, dst, mask);
+ MPI_Send(rbuf, count, dtype, dst, tag, comm);
+ }
+ mask >>= 1;
+ }
+
+
+ free(tmp_buf);
+ return MPI_SUCCESS;
+}
--- /dev/null
+#include "colls.h"
+
+/* Change the number of cores per SMP node here.
+ We assume that this number is the same for all implementations. */
+#ifndef NUM_CORE
+#define NUM_CORE 8
+#endif
+
+/*
+ * All-reduce for SMP clusters, done in four phases:
+ *   1) binomial-tree reduce inside each SMP node, towards the node root
+ *   2) ring reduce-scatter between the roots of the SMP nodes
+ *   3) ring allgather between the roots of the SMP nodes
+ *   4) binomial-tree broadcast inside each SMP node, from the node root
+ *
+ * A node is a group of NUM_CORE consecutive ranks and its root is the rank
+ * whose position inside the group is 0; the last node may hold fewer ranks.
+ *
+ * For the inter-node phases the buffer is cut into one segment per node of
+ * count / nodes elements each. The last segment additionally carries the
+ * count % nodes leftover elements, so a count that is not a multiple of the
+ * number of nodes is still reduced completely.
+ *
+ * Always returns MPI_SUCCESS.
+ */
+int smpi_coll_tuned_allreduce_smp_rsag(void *send_buf, void *recv_buf,
+                                       int count, MPI_Datatype dtype, MPI_Op op,
+                                       MPI_Comm comm)
+{
+  int comm_size, rank;
+  void *tmp_buf;
+  int tag = 50;
+  int mask, src, dst;
+  MPI_Status status;
+  int num_core = NUM_CORE;
+
+  MPI_Comm_size(comm, &comm_size);
+  MPI_Comm_rank(comm, &rank);
+  MPI_Aint extent;
+  MPI_Type_extent(dtype, &extent);
+  /* scratch space for incoming data; any single message fits in it */
+  tmp_buf = (void *) malloc(count * extent);
+
+  /* position of this rank inside its node, and index of the node */
+  int intra_rank = rank % num_core;
+  int inter_rank = rank / num_core;
+
+  /* number of nodes, counting a partially filled last node */
+  int inter_comm_size = (comm_size + num_core - 1) / num_core;
+
+  /* copy the input into recv_buf; every later step works in place on it */
+  MPI_Sendrecv(send_buf, count, dtype, rank, tag,
+               recv_buf, count, dtype, rank, tag, comm, &status);
+
+  /* phase 1: binomial reduce inside the node */
+  mask = 1;
+  while (mask < num_core) {
+    if ((mask & intra_rank) == 0) {
+      src = (inter_rank * num_core) + (intra_rank | mask);
+      /* the partner may not exist in a partially filled last node */
+      if (src < comm_size) {
+        MPI_Recv(tmp_buf, count, dtype, src, tag, comm, &status);
+        star_reduction(op, tmp_buf, recv_buf, &count, &dtype);
+      }
+    } else {
+      dst = (inter_rank * num_core) + (intra_rank & (~mask));
+      MPI_Send(recv_buf, count, dtype, dst, tag, comm);
+      break;
+    }
+    mask <<= 1;
+  }
+
+  /* phases 2 and 3 involve the node roots only */
+  if (intra_rank == 0) {
+    int send_seg, recv_seg;
+    int send_count, recv_count;
+    MPI_Aint send_offset, recv_offset;
+    int seg_count = count / inter_comm_size;
+    /* leftover elements, appended to the last segment so none is skipped */
+    int seg_remainder = count % inter_comm_size;
+    int last_seg = inter_comm_size - 1;
+
+    /* ring neighbours: roots of the next and of the previous node */
+    int to = ((inter_rank + 1) % inter_comm_size) * num_core;
+    int from =
+        ((inter_rank + inter_comm_size - 1) % inter_comm_size) * num_core;
+    int i;
+
+    /* phase 2: ring reduce-scatter; afterwards this root owns the fully
+       reduced segment number inter_rank */
+    for (i = 0; i < (inter_comm_size - 1); i++) {
+      send_seg = (inter_rank - 1 - i + inter_comm_size) % inter_comm_size;
+      recv_seg = (inter_rank - 2 - i + inter_comm_size) % inter_comm_size;
+
+      /* byte offsets are computed in MPI_Aint to avoid int overflow */
+      send_offset = (MPI_Aint) send_seg * seg_count * extent;
+      recv_offset = (MPI_Aint) recv_seg * seg_count * extent;
+
+      send_count = seg_count + ((send_seg == last_seg) ? seg_remainder : 0);
+      recv_count = seg_count + ((recv_seg == last_seg) ? seg_remainder : 0);
+
+      MPI_Sendrecv((char *) recv_buf + send_offset, send_count, dtype, to,
+                   tag + i, tmp_buf, recv_count, dtype, from, tag + i, comm,
+                   &status);
+
+      /* accumulate the received segment into recv_buf */
+      star_reduction(op, tmp_buf, (char *) recv_buf + recv_offset, &recv_count,
+                     &dtype);
+    }
+
+    /* phase 3: ring allgather of the reduced segments */
+    for (i = 0; i < (inter_comm_size - 1); i++) {
+      send_seg = (inter_rank - i + inter_comm_size) % inter_comm_size;
+      recv_seg = (inter_rank - 1 - i + inter_comm_size) % inter_comm_size;
+
+      send_offset = (MPI_Aint) send_seg * seg_count * extent;
+      recv_offset = (MPI_Aint) recv_seg * seg_count * extent;
+
+      send_count = seg_count + ((send_seg == last_seg) ? seg_remainder : 0);
+      recv_count = seg_count + ((recv_seg == last_seg) ? seg_remainder : 0);
+
+      MPI_Sendrecv((char *) recv_buf + send_offset, send_count, dtype, to,
+                   tag + i, (char *) recv_buf + recv_offset, recv_count, dtype,
+                   from, tag + i, comm, &status);
+    }
+  }
+
+  /* phase 4: binomial broadcast inside the node */
+  int num_core_in_current_smp = num_core;
+  if (inter_rank == (inter_comm_size - 1)) {
+    /* the last node may hold fewer than num_core ranks */
+    num_core_in_current_smp = comm_size - (inter_rank * num_core);
+  }
+
+  /* wait for the data from the rank with our lowest set bit cleared */
+  mask = 1;
+  while (mask < num_core_in_current_smp) {
+    if (intra_rank & mask) {
+      src = (inter_rank * num_core) + (intra_rank - mask);
+      MPI_Recv(recv_buf, count, dtype, src, tag, comm, &status);
+      break;
+    }
+    mask <<= 1;
+  }
+
+  mask >>= 1;
+
+  /* then forward it down the tree, largest stride first */
+  while (mask > 0) {
+    dst = (inter_rank * num_core) + (intra_rank + mask);
+    if (dst < comm_size) {
+      MPI_Send(recv_buf, count, dtype, dst, tag, comm);
+    }
+    mask >>= 1;
+  }
+
+  free(tmp_buf);
+  return MPI_SUCCESS;
+}
int smpi_coll_tuned_alltoall_2dmesh(void *send_buff, int send_count,
MPI_Datatype send_type,
void *recv_buff, int recv_count,
- MPI_Datatype recv_type,
- MPI_Comm comm)
+ MPI_Datatype recv_type, MPI_Comm comm)
{
MPI_Status *statuses, s;
MPI_Request *reqs, *req_ptr;;
recv_offset = (my_row_base * block_size) + (i * block_size);
if (i + my_row_base == rank)
- MPI_Sendrecv((char *)send_buff + recv_offset, send_count, send_type,
+ MPI_Sendrecv((char *) send_buff + recv_offset, send_count, send_type,
rank, tag,
- (char*)recv_buff + recv_offset, recv_count, recv_type,
+ (char *) recv_buff + recv_offset, recv_count, recv_type,
rank, tag, comm, &s);
else
MPI_Sendrecv(tmp_buff1 + send_offset, send_count, send_type,
rank, tag,
- (char *)recv_buff + recv_offset, recv_count, recv_type,
+ (char *) recv_buff + recv_offset, recv_count, recv_type,
rank, tag, comm, &s);
}
continue;
src_row_base = (src / Y) * Y;
- MPI_Irecv((char *)recv_buff + src_row_base * block_size, recv_count * Y,
+ MPI_Irecv((char *) recv_buff + src_row_base * block_size, recv_count * Y,
recv_type, src, tag, comm, req_ptr++);
}
send_offset = (dst + j * num_procs) * block_size;
if (j + my_row_base == rank)
- MPI_Sendrecv((char *)send_buff + dst * block_size, send_count, send_type,
- rank, tag,
- tmp_buff2 + recv_offset, recv_count, recv_type,
- rank, tag, comm, &s);
+ MPI_Sendrecv((char *) send_buff + dst * block_size, send_count,
+ send_type, rank, tag, tmp_buff2 + recv_offset, recv_count,
+ recv_type, rank, tag, comm, &s);
else
MPI_Sendrecv(tmp_buff1 + send_offset, send_count, send_type,
rank, tag,
int smpi_coll_tuned_alltoall_3dmesh(void *send_buff, int send_count,
MPI_Datatype send_type,
void *recv_buff, int recv_count,
- MPI_Datatype recv_type,
- MPI_Comm comm)
+ MPI_Datatype recv_type, MPI_Comm comm)
{
MPI_Request *reqs, *req_ptr;
MPI_Aint extent;
send_offset = (rank * block_size) + (i * block_size * num_procs);
recv_offset = (my_z_base * block_size) + (i * block_size);
MPI_Sendrecv(tmp_buff1 + send_offset, send_count, send_type, rank, tag,
- (char *)recv_buff + recv_offset, recv_count, recv_type,
+ (char *) recv_buff + recv_offset, recv_count, recv_type,
rank, tag, comm, &status);
}
recv_offset = (src_z_base * block_size);
- MPI_Irecv((char *)recv_buff + recv_offset, recv_count * two_dsize, recv_type,
- src, tag, comm, req_ptr++);
+ MPI_Irecv((char *) recv_buff + recv_offset, recv_count * two_dsize,
+ recv_type, src, tag, comm, req_ptr++);
}
for (i = 1; i < Z; i++) {
****************************************************************************/
int
-smpi_coll_tuned_alltoall_bruck(void * send_buff, int send_count, MPI_Datatype send_type,
- void * recv_buff, int recv_count, MPI_Datatype recv_type,
- MPI_Comm comm)
+smpi_coll_tuned_alltoall_bruck(void *send_buff, int send_count,
+ MPI_Datatype send_type, void *recv_buff,
+ int recv_count, MPI_Datatype recv_type,
+ MPI_Comm comm)
{
MPI_Status status;
MPI_Aint extent;
MPI_Datatype new_type;
-
- int * blocks_length, * disps;
+
+ int *blocks_length, *disps;
int i, src, dst, rank, num_procs, count, remainder, block, position;
int pack_size, tag = 1, pof2 = 1, success = 1, failure = 0;
-
- char * tmp_buff;
- char * send_ptr = (char *) send_buff;
- char * recv_ptr = (char *) recv_buff;
+
+ char *tmp_buff;
+ char *send_ptr = (char *) send_buff;
+ char *recv_ptr = (char *) recv_buff;
MPI_Comm_size(comm, &num_procs);
MPI_Comm_rank(comm, &rank);
MPI_Type_extent(recv_type, &extent);
- tmp_buff = (char *) malloc (num_procs * recv_count * extent);
- if (!tmp_buff)
- {
- printf("alltoall-bruck:53: cannot allocate memory\n");
- MPI_Finalize();
- exit(failure);
- }
+ tmp_buff = (char *) malloc(num_procs * recv_count * extent);
+ if (!tmp_buff) {
+ printf("alltoall-bruck:53: cannot allocate memory\n");
+ MPI_Finalize();
+ exit(failure);
+ }
disps = (int *) malloc(sizeof(int) * num_procs);
- if (!disps)
- {
- printf("alltoall-bruck:61: cannot allocate memory\n");
- MPI_Finalize();
- exit(failure);
- }
-
- blocks_length = (int *) malloc(sizeof(int) * num_procs);
- if (!blocks_length)
- {
- printf("alltoall-bruck:69: cannot allocate memory\n");
- MPI_Finalize();
- exit(failure);
- }
-
-
+ if (!disps) {
+ printf("alltoall-bruck:61: cannot allocate memory\n");
+ MPI_Finalize();
+ exit(failure);
+ }
+
+ blocks_length = (int *) malloc(sizeof(int) * num_procs);
+ if (!blocks_length) {
+ printf("alltoall-bruck:69: cannot allocate memory\n");
+ MPI_Finalize();
+ exit(failure);
+ }
+
+
MPI_Sendrecv(send_ptr + rank * send_count * extent,
- (num_procs - rank) * send_count, send_type, rank, tag,
- recv_ptr, (num_procs - rank) * recv_count, recv_type, rank,
- tag, comm, &status);
+ (num_procs - rank) * send_count, send_type, rank, tag,
+ recv_ptr, (num_procs - rank) * recv_count, recv_type, rank,
+ tag, comm, &status);
MPI_Sendrecv(send_ptr, rank * send_count, send_type, rank, tag,
- recv_ptr + (num_procs - rank) * recv_count * extent,
- rank * recv_count, recv_type, rank, tag, comm, &status);
-
-
-
- MPI_Pack_size(send_count * num_procs, send_type, comm, &pack_size);
-
- while (pof2 < num_procs)
- {
- dst = (rank + pof2) % num_procs;
- src = (rank - pof2 + num_procs) % num_procs;
-
-
- count = 0;
- for (block = 1; block < num_procs; block++)
- if (block & pof2)
- {
- blocks_length[count] = send_count;
- disps[count] = block * send_count;
- count++;
- }
-
- MPI_Type_indexed(count, blocks_length, disps, recv_type, &new_type);
- MPI_Type_commit(&new_type);
-
- position = 0;
- MPI_Pack(recv_buff, 1, new_type, tmp_buff, pack_size, &position, comm);
-
- MPI_Sendrecv(tmp_buff, position, MPI_PACKED, dst, tag, recv_buff, 1,
- new_type, src, tag, comm, &status);
- MPI_Type_free(&new_type);
-
- pof2 *= 2;
- }
+ recv_ptr + (num_procs - rank) * recv_count * extent,
+ rank * recv_count, recv_type, rank, tag, comm, &status);
+
+
+
+ MPI_Pack_size(send_count * num_procs, send_type, comm, &pack_size);
+
+ while (pof2 < num_procs) {
+ dst = (rank + pof2) % num_procs;
+ src = (rank - pof2 + num_procs) % num_procs;
+
+
+ count = 0;
+ for (block = 1; block < num_procs; block++)
+ if (block & pof2) {
+ blocks_length[count] = send_count;
+ disps[count] = block * send_count;
+ count++;
+ }
+
+ MPI_Type_indexed(count, blocks_length, disps, recv_type, &new_type);
+ MPI_Type_commit(&new_type);
+
+ position = 0;
+ MPI_Pack(recv_buff, 1, new_type, tmp_buff, pack_size, &position, comm);
+
+ MPI_Sendrecv(tmp_buff, position, MPI_PACKED, dst, tag, recv_buff, 1,
+ new_type, src, tag, comm, &status);
+ MPI_Type_free(&new_type);
+
+ pof2 *= 2;
+ }
free(disps);
free(blocks_length);
-
+
MPI_Sendrecv(recv_ptr + (rank + 1) * recv_count * extent,
- (num_procs - rank - 1) * recv_count, send_type,
- rank, tag, tmp_buff, (num_procs - rank - 1) * recv_count,
- recv_type, rank, tag, comm, &status);
-
+ (num_procs - rank - 1) * recv_count, send_type,
+ rank, tag, tmp_buff, (num_procs - rank - 1) * recv_count,
+ recv_type, rank, tag, comm, &status);
+
MPI_Sendrecv(recv_ptr, (rank + 1) * recv_count, send_type, rank, tag,
- tmp_buff + (num_procs - rank - 1) * recv_count * extent,
- (rank + 1) * recv_count, recv_type, rank, tag, comm, &status);
+ tmp_buff + (num_procs - rank - 1) * recv_count * extent,
+ (rank + 1) * recv_count, recv_type, rank, tag, comm, &status);
+
-
- for (i = 0; i < num_procs; i++)
+ for (i = 0; i < num_procs; i++)
MPI_Sendrecv(tmp_buff + i * recv_count * extent, recv_count, send_type,
- rank, tag,
- recv_ptr + (num_procs - i - 1) * recv_count * extent,
- recv_count, recv_type, rank, tag, comm, &status);
+ rank, tag,
+ recv_ptr + (num_procs - i - 1) * recv_count * extent,
+ recv_count, recv_type, rank, tag, comm, &status);
free(tmp_buff);
return success;
+++ /dev/null
-int
-alltoall_native(void * send_buff, int send_count,
- MPI_Datatype send_type, void * recv_buff,
- int recv_count, MPI_Datatype recv_type,
- MPI_Comm comm)
-{
- return MPI_Alltoall(send_buff, send_count, send_type, recv_buff, recv_count,
- recv_type, comm);
-}
****************************************************************************/
int
-smpi_coll_tuned_alltoall_pair_light_barrier(void * send_buff, int send_count,
- MPI_Datatype send_type, void * recv_buff,
- int recv_count, MPI_Datatype recv_type,
- MPI_Comm comm)
+smpi_coll_tuned_alltoall_pair_light_barrier(void *send_buff, int send_count,
+ MPI_Datatype send_type,
+ void *recv_buff, int recv_count,
+ MPI_Datatype recv_type,
+ MPI_Comm comm)
{
MPI_Aint send_chunk, recv_chunk;
MPI_Status s;
int i, src, dst, rank, num_procs, next_partner;
- int tag = 1, success = 1; /*, failure = 0;*/
+ int tag = 1, success = 1; /*, failure = 0; */
char send_sync = 'a', recv_sync = 'b';
- char * send_ptr = (char *) send_buff;
- char * recv_ptr = (char *) recv_buff;
-
+ char *send_ptr = (char *) send_buff;
+ char *recv_ptr = (char *) recv_buff;
+
MPI_Comm_rank(comm, &rank);
MPI_Comm_size(comm, &num_procs);
MPI_Type_extent(send_type, &send_chunk);
recv_chunk *= recv_count;
MPI_Sendrecv(send_ptr + rank * send_chunk, send_count, send_type, rank, tag,
- recv_ptr + rank * recv_chunk, recv_count, recv_type, rank, tag,
- comm, &s);
+ recv_ptr + rank * recv_chunk, recv_count, recv_type, rank, tag,
+ comm, &s);
+
+ for (i = 1; i < num_procs; i++) {
+ src = dst = rank ^ i;
- for (i = 1; i < num_procs; i++)
- {
- src = dst = rank ^ i;
-
- MPI_Sendrecv(send_ptr + dst * send_chunk, send_count, send_type,
- dst, tag, recv_ptr + src * recv_chunk, recv_count,
- recv_type, src, tag, comm, &s);
+ MPI_Sendrecv(send_ptr + dst * send_chunk, send_count, send_type,
+ dst, tag, recv_ptr + src * recv_chunk, recv_count,
+ recv_type, src, tag, comm, &s);
- if ((i + 1) < num_procs)
- {
- next_partner = rank ^ (i + 1);
- MPI_Sendrecv(&send_sync, 1, MPI_CHAR, next_partner, tag,
- &recv_sync, 1, MPI_CHAR, next_partner, tag,
- comm, &s);
- }
+ if ((i + 1) < num_procs) {
+ next_partner = rank ^ (i + 1);
+ MPI_Sendrecv(&send_sync, 1, MPI_CHAR, next_partner, tag,
+ &recv_sync, 1, MPI_CHAR, next_partner, tag, comm, &s);
}
+ }
return success;
}
****************************************************************************/
int
-smpi_coll_tuned_alltoall_pair_mpi_barrier(void * send_buff, int send_count,
- MPI_Datatype send_type, void * recv_buff,
- int recv_count, MPI_Datatype recv_type,
- MPI_Comm comm)
+smpi_coll_tuned_alltoall_pair_mpi_barrier(void *send_buff, int send_count,
+ MPI_Datatype send_type,
+ void *recv_buff, int recv_count,
+ MPI_Datatype recv_type, MPI_Comm comm)
{
MPI_Status s;
MPI_Aint send_chunk, recv_chunk;
int i, src, dst, rank, num_procs;
- int tag = 101, success = 1; /*, failure = 0, pof2 = 1;*/
- char * send_ptr = (char *) send_buff;
- char * recv_ptr = (char *) recv_buff;
-
+ int tag = 101, success = 1; /*, failure = 0, pof2 = 1; */
+ char *send_ptr = (char *) send_buff;
+ char *recv_ptr = (char *) recv_buff;
+
MPI_Comm_rank(comm, &rank);
MPI_Comm_size(comm, &num_procs);
MPI_Type_extent(send_type, &send_chunk);
MPI_Type_extent(recv_type, &recv_chunk);
send_chunk *= send_count;
- recv_chunk *= recv_count;
-
- for (i = 0; i < num_procs; i++)
- {
- src = dst = rank ^ i;
- MPI_Barrier(comm);
- MPI_Sendrecv(send_ptr + dst * send_chunk, send_count, send_type, dst,
- tag, recv_ptr + src * recv_chunk, recv_count, recv_type,
- src, tag, comm, &s);
- }
+ recv_chunk *= recv_count;
+
+ for (i = 0; i < num_procs; i++) {
+ src = dst = rank ^ i;
+ MPI_Barrier(comm);
+ MPI_Sendrecv(send_ptr + dst * send_chunk, send_count, send_type, dst,
+ tag, recv_ptr + src * recv_chunk, recv_count, recv_type,
+ src, tag, comm, &s);
+ }
return success;
}
****************************************************************************/
int
-smpi_coll_tuned_alltoall_pair_one_barrier(void * send_buff, int send_count, MPI_Datatype send_type,
- void * recv_buff, int recv_count, MPI_Datatype recv_type,
- MPI_Comm comm)
+smpi_coll_tuned_alltoall_pair_one_barrier(void *send_buff, int send_count,
+ MPI_Datatype send_type,
+ void *recv_buff, int recv_count,
+ MPI_Datatype recv_type, MPI_Comm comm)
{
MPI_Aint send_chunk, recv_chunk;
MPI_Status s;
int i, src, dst, rank, num_procs;
- int tag = 1, success = 1; /*, failure = 0, pof2 = 1; */
+ int tag = 1, success = 1; /*, failure = 0, pof2 = 1; */
+
+ char *send_ptr = (char *) send_buff;
+ char *recv_ptr = (char *) recv_buff;
- char * send_ptr = (char *) send_buff;
- char * recv_ptr = (char *) recv_buff;
-
MPI_Comm_rank(comm, &rank);
MPI_Comm_size(comm, &num_procs);
MPI_Type_extent(send_type, &send_chunk);
MPI_Type_extent(recv_type, &recv_chunk);
send_chunk *= send_count;
- recv_chunk *= recv_count;
+ recv_chunk *= recv_count;
MPI_Barrier(comm);
- for (i = 0; i < num_procs; i++)
- {
- src = dst = rank ^ i;
- MPI_Sendrecv(send_ptr + dst * send_chunk, send_count, send_type, dst,
- tag, recv_ptr + src * recv_chunk, recv_count, recv_type,
- src, tag, comm, &s);
- }
+ for (i = 0; i < num_procs; i++) {
+ src = dst = rank ^ i;
+ MPI_Sendrecv(send_ptr + dst * send_chunk, send_count, send_type, dst,
+ tag, recv_ptr + src * recv_chunk, recv_count, recv_type,
+ src, tag, comm, &s);
+ }
return success;
}
int smpi_coll_tuned_alltoall_pair(void *send_buff, int send_count,
MPI_Datatype send_type,
void *recv_buff, int recv_count,
- MPI_Datatype recv_type,
- MPI_Comm comm)
+ MPI_Datatype recv_type, MPI_Comm comm)
{
MPI_Aint send_chunk, recv_chunk;
int smpi_coll_tuned_alltoall_rdb(void *send_buff, int send_count,
MPI_Datatype send_type,
void *recv_buff, int recv_count,
- MPI_Datatype recv_type,
- MPI_Comm comm)
+ MPI_Datatype recv_type, MPI_Comm comm)
{
/* MPI variables */
MPI_Status status;
****************************************************************************/
int
-smpi_coll_tuned_alltoall_ring_light_barrier(void * send_buff, int send_count,
- MPI_Datatype send_type, void * recv_buff,
- int recv_count, MPI_Datatype recv_type,
- MPI_Comm comm)
+smpi_coll_tuned_alltoall_ring_light_barrier(void *send_buff, int send_count,
+ MPI_Datatype send_type,
+ void *recv_buff, int recv_count,
+ MPI_Datatype recv_type,
+ MPI_Comm comm)
{
MPI_Aint send_chunk, recv_chunk;
MPI_Status s;
int i, src, dst, rank, num_procs, next_dst, next_src;
- int tag = 1, success = 1; /*, failure = 0;*/
+ int tag = 1, success = 1; /*, failure = 0; */
char send_sync = 'a', recv_sync = 'b';
- char * send_ptr = (char *) send_buff;
- char * recv_ptr = (char *) recv_buff;
-
+ char *send_ptr = (char *) send_buff;
+ char *recv_ptr = (char *) recv_buff;
+
MPI_Comm_rank(comm, &rank);
MPI_Comm_size(comm, &num_procs);
MPI_Type_extent(send_type, &send_chunk);
recv_chunk *= recv_count;
MPI_Sendrecv(send_ptr + rank * send_chunk, send_count, send_type, rank, tag,
- recv_ptr + rank * recv_chunk, recv_count, recv_type, rank, tag,
- comm, &s);
+ recv_ptr + rank * recv_chunk, recv_count, recv_type, rank, tag,
+ comm, &s);
+
+ for (i = 1; i < num_procs; i++) {
+ src = (rank - i + num_procs) % num_procs;
+ dst = (rank + i) % num_procs;
+
+ MPI_Sendrecv(send_ptr + dst * send_chunk, send_count, send_type,
+ dst, tag, recv_ptr + src * recv_chunk, recv_count,
+ recv_type, src, tag, comm, &s);
- for (i = 1; i < num_procs; i++)
- {
- src = (rank - i + num_procs) % num_procs;
- dst = (rank + i) % num_procs;
-
- MPI_Sendrecv(send_ptr + dst * send_chunk, send_count, send_type,
- dst, tag, recv_ptr + src * recv_chunk, recv_count,
- recv_type, src, tag, comm, &s);
+ if ((i + 1) < num_procs) {
+ next_src = (rank - (i + 1) + num_procs) % num_procs;
+ next_dst = (rank + (i + 1) + num_procs) % num_procs;
+ MPI_Sendrecv(&send_sync, 1, MPI_CHAR, next_src, tag,
+ &recv_sync, 1, MPI_CHAR, next_dst, tag, comm, &s);
- if ((i + 1) < num_procs)
- {
- next_src = (rank - (i + 1) + num_procs) % num_procs;
- next_dst = (rank + (i + 1) + num_procs) % num_procs;
- MPI_Sendrecv(&send_sync, 1, MPI_CHAR, next_src, tag,
- &recv_sync, 1, MPI_CHAR, next_dst, tag,
- comm, &s);
-
- }
}
+ }
return success;
}
****************************************************************************/
int
-smpi_coll_tuned_alltoall_ring_mpi_barrier(void * send_buff, int send_count,
- MPI_Datatype send_type, void * recv_buff,
- int recv_count, MPI_Datatype recv_type,
- MPI_Comm comm)
+smpi_coll_tuned_alltoall_ring_mpi_barrier(void *send_buff, int send_count,
+ MPI_Datatype send_type,
+ void *recv_buff, int recv_count,
+ MPI_Datatype recv_type, MPI_Comm comm)
{
MPI_Status s;
MPI_Aint send_chunk, recv_chunk;
int i, src, dst, rank, num_procs;
- int tag = 1, success = 1; /*, failure = 0, pof2 = 1;*/
+ int tag = 1, success = 1; /*, failure = 0, pof2 = 1; */
+
+ char *send_ptr = (char *) send_buff;
+ char *recv_ptr = (char *) recv_buff;
- char * send_ptr = (char *) send_buff;
- char * recv_ptr = (char *) recv_buff;
-
MPI_Comm_rank(comm, &rank);
MPI_Comm_size(comm, &num_procs);
MPI_Type_extent(send_type, &send_chunk);
MPI_Type_extent(recv_type, &recv_chunk);
send_chunk *= send_count;
- recv_chunk *= recv_count;
-
- for (i = 0; i < num_procs; i++)
- {
- src = (rank - i + num_procs) % num_procs;
- dst = (rank + i) % num_procs;
-
- MPI_Barrier(comm);
- MPI_Sendrecv(send_ptr + dst * send_chunk, send_count, send_type, dst,
- tag, recv_ptr + src * recv_chunk, recv_count, recv_type,
- src, tag, comm, &s);
- }
+ recv_chunk *= recv_count;
+
+ for (i = 0; i < num_procs; i++) {
+ src = (rank - i + num_procs) % num_procs;
+ dst = (rank + i) % num_procs;
+
+ MPI_Barrier(comm);
+ MPI_Sendrecv(send_ptr + dst * send_chunk, send_count, send_type, dst,
+ tag, recv_ptr + src * recv_chunk, recv_count, recv_type,
+ src, tag, comm, &s);
+ }
return success;
}
****************************************************************************/
int
-smpi_coll_tuned_alltoall_ring_one_barrier(void * send_buff, int send_count, MPI_Datatype send_type,
- void * recv_buff, int recv_count, MPI_Datatype recv_type,
- MPI_Comm comm)
+smpi_coll_tuned_alltoall_ring_one_barrier(void *send_buff, int send_count,
+ MPI_Datatype send_type,
+ void *recv_buff, int recv_count,
+ MPI_Datatype recv_type, MPI_Comm comm)
{
MPI_Status s;
MPI_Aint send_chunk, recv_chunk;
int i, src, dst, rank, num_procs;
- int tag = 1, success = 1; /*, failure = 0, pof2 = 1; */
+ int tag = 1, success = 1; /*, failure = 0, pof2 = 1; */
+
+ char *send_ptr = (char *) send_buff;
+ char *recv_ptr = (char *) recv_buff;
- char * send_ptr = (char *) send_buff;
- char * recv_ptr = (char *) recv_buff;
-
MPI_Comm_rank(comm, &rank);
MPI_Comm_size(comm, &num_procs);
MPI_Type_extent(send_type, &send_chunk);
MPI_Type_extent(recv_type, &recv_chunk);
send_chunk *= send_count;
- recv_chunk *= recv_count;
+ recv_chunk *= recv_count;
MPI_Barrier(comm);
- for (i = 0; i < num_procs; i++)
- {
- src = (rank - i + num_procs) % num_procs;
- dst = (rank + i) % num_procs;
-
- MPI_Sendrecv(send_ptr + dst * send_chunk, send_count, send_type, dst,
- tag, recv_ptr + src * recv_chunk, recv_count, recv_type,
- src, tag, comm, &s);
- }
+ for (i = 0; i < num_procs; i++) {
+ src = (rank - i + num_procs) % num_procs;
+ dst = (rank + i) % num_procs;
+
+ MPI_Sendrecv(send_ptr + dst * send_chunk, send_count, send_type, dst,
+ tag, recv_ptr + src * recv_chunk, recv_count, recv_type,
+ src, tag, comm, &s);
+ }
return success;
}
****************************************************************************/
int
-smpi_coll_tuned_alltoall_ring(void * send_buff, int send_count, MPI_Datatype send_type,
- void * recv_buff, int recv_count, MPI_Datatype recv_type,
- MPI_Comm comm)
+smpi_coll_tuned_alltoall_ring(void *send_buff, int send_count,
+ MPI_Datatype send_type, void *recv_buff,
+ int recv_count, MPI_Datatype recv_type,
+ MPI_Comm comm)
{
MPI_Status s;
MPI_Aint send_chunk, recv_chunk;
int i, src, dst, rank, num_procs;
- int tag = 1, success = 1; /*, failure = 0, pof2 = 1*/;
+ int tag = 1, success = 1; /*, failure = 0, pof2 = 1 */ ;
+
+ char *send_ptr = (char *) send_buff;
+ char *recv_ptr = (char *) recv_buff;
- char * send_ptr = (char *) send_buff;
- char * recv_ptr = (char *) recv_buff;
-
MPI_Comm_rank(comm, &rank);
MPI_Comm_size(comm, &num_procs);
MPI_Type_extent(send_type, &send_chunk);
MPI_Type_extent(recv_type, &recv_chunk);
send_chunk *= send_count;
- recv_chunk *= recv_count;
-
- for (i = 0; i < num_procs; i++)
- {
- src = (rank - i + num_procs) % num_procs;
- dst = (rank + i) % num_procs;
-
- MPI_Sendrecv(send_ptr + dst * send_chunk, send_count, send_type, dst,
- tag, recv_ptr + src * recv_chunk, recv_count, recv_type,
- src, tag, comm, &s);
- }
+ recv_chunk *= recv_count;
+
+ for (i = 0; i < num_procs; i++) {
+ src = (rank - i + num_procs) % num_procs;
+ dst = (rank + i) % num_procs;
+
+ MPI_Sendrecv(send_ptr + dst * send_chunk, send_count, send_type, dst,
+ tag, recv_ptr + src * recv_chunk, recv_count, recv_type,
+ src, tag, comm, &s);
+ }
return success;
}
int smpi_coll_tuned_alltoall_simple(void *send_buff, int send_count,
MPI_Datatype send_type,
void *recv_buff, int recv_count,
- MPI_Datatype recv_type,
- MPI_Comm comm)
+ MPI_Datatype recv_type, MPI_Comm comm)
{
int i, rank, size, nreqs, err, src, dst, tag = 101;
char *psnd;
--- /dev/null
+#include "colls.h"
+
+int bcast_NTSB_segment_size_in_byte = 8192;
+
+/**
+ * Non-topology-specific pipelined broadcast over a binary tree of ranks.
+ *
+ * When root is not rank 0 the whole message is first sent from root to
+ * rank 0.  Rank r then receives from (r - 1) / 2 and forwards to 2r + 1 and
+ * 2r + 2 (when those ranks exist in comm).  A message of at most one segment
+ * is relayed with blocking calls.  A longer message is cut into segments of
+ * bcast_NTSB_segment_size_in_byte bytes which are relayed with non-blocking
+ * calls, one tag per segment; the elements left after the last full segment
+ * are sent with MPI_Bcast.
+ *
+ * @param buf      buffer holding (root) or receiving (others) the data
+ * @param count    number of elements in buf
+ * @param datatype type of one element
+ * @param root     rank that owns the data
+ * @param comm     communicator on which the broadcast takes place
+ * @return MPI_SUCCESS
+ */
+int smpi_coll_tuned_bcast_NTSB(void *buf, int count, MPI_Datatype datatype,
+                               int root, MPI_Comm comm)
+{
+  int tag = 5000;
+  MPI_Status status;
+  int rank, size;
+  int i;
+
+  MPI_Request *send_request_array;
+  MPI_Request *recv_request_array;
+  MPI_Status *send_status_array;
+  MPI_Status *recv_status_array;
+
+  MPI_Aint extent;
+  MPI_Type_extent(datatype, &extent);
+
+  /* rank and size must be those of the communicator used for the transfers,
+     otherwise the tree is wrong as soon as comm is not MPI_COMM_WORLD */
+  MPI_Comm_rank(comm, &rank);
+  MPI_Comm_size(comm, &size);
+
+  /* a lone process has no peer: without this the root would send to -1 */
+  if (size == 1)
+    return MPI_SUCCESS;
+
+  /* source node and destination nodes (same throughout the function) */
+  int from = (rank - 1) / 2;
+  int to_left = rank * 2 + 1;
+  int to_right = rank * 2 + 2;
+  if (to_left >= size)
+    to_left = -1;
+  if (to_right >= size)
+    to_right = -1;
+
+  /* number of existing children; a right child implies a left child */
+  int children = 0;
+  if (to_left != -1)
+    children++;
+  if (to_right != -1)
+    children++;
+
+  /* segment is the segment size in number of elements (not bytes); it is
+     kept at one element or more so that the divisions below are defined
+     even when one element is larger than the configured segment size */
+  int segment = 1;
+  if (extent > 0 && bcast_NTSB_segment_size_in_byte / extent >= 1)
+    segment = bcast_NTSB_segment_size_in_byte / extent;
+
+  /* pipeline length */
+  int pipe_length = count / segment;
+
+  /* buffer offset between two consecutive segments = segment size in bytes */
+  int increment = segment * extent;
+
+  /* if the input size is not divisible by the segment size =>
+     the small remainder is done with the native implementation */
+  int remainder = count % segment;
+
+  /* if root is not zero send to rank zero first */
+  if (root != 0) {
+    if (rank == root) {
+      MPI_Send(buf, count, datatype, 0, tag, comm);
+    } else if (rank == 0) {
+      MPI_Recv(buf, count, datatype, root, tag, comm, &status);
+    }
+  }
+
+  /* when a message is not larger than a segment => no pipeline */
+  if (count <= segment) {
+    /* everybody but the tree root receives from its parent first */
+    if (rank != 0) {
+      MPI_Recv(buf, count, datatype, from, tag, comm, &status);
+    }
+    /* then relay to the children that exist (none for a leaf) */
+    if (to_left != -1) {
+      MPI_Send(buf, count, datatype, to_left, tag, comm);
+    }
+    if (to_right != -1) {
+      MPI_Send(buf, count, datatype, to_right, tag, comm);
+    }
+    return MPI_SUCCESS;
+  }
+
+  /* pipelining */
+  send_request_array =
+      malloc(2 * (size + pipe_length) * sizeof *send_request_array);
+  recv_request_array =
+      malloc((size + pipe_length) * sizeof *recv_request_array);
+  send_status_array =
+      malloc(2 * (size + pipe_length) * sizeof *send_status_array);
+  recv_status_array =
+      malloc((size + pipe_length) * sizeof *recv_status_array);
+
+  /* every rank but the tree root posts one receive per segment up front */
+  if (rank != 0) {
+    for (i = 0; i < pipe_length; i++) {
+      MPI_Irecv((char *) buf + (i * increment), segment, datatype, from,
+                tag + i, comm, &recv_request_array[i]);
+    }
+  }
+
+  if (children == 0) {
+    /* leaf ==> receive only (rank 0 always has a child when size > 1) */
+    MPI_Waitall(pipe_length, recv_request_array, recv_status_array);
+  } else {
+    /* tree root or intermediate node ==> forward each segment as soon as it
+       is available; requests towards the right child are stored after the
+       pipe_length requests towards the left child */
+    for (i = 0; i < pipe_length; i++) {
+      if (rank != 0) {
+        MPI_Wait(&recv_request_array[i], &status);
+      }
+      MPI_Isend((char *) buf + (i * increment), segment, datatype, to_left,
+                tag + i, comm, &send_request_array[i]);
+      if (to_right != -1) {
+        MPI_Isend((char *) buf + (i * increment), segment, datatype,
+                  to_right, tag + i, comm,
+                  &send_request_array[i + pipe_length]);
+      }
+    }
+    MPI_Waitall(children * pipe_length, send_request_array,
+                send_status_array);
+  }
+
+  free(send_request_array);
+  free(recv_request_array);
+  free(send_status_array);
+  free(recv_status_array);
+
+  /* when count is not divisible by the segment size, use the default BCAST
+     for the remainder (count > segment is guaranteed at this point) */
+  if (remainder != 0) {
+    MPI_Bcast((char *) buf + (pipe_length * increment), remainder, datatype,
+              root, comm);
+  }
+
+  return MPI_SUCCESS;
+}
--- /dev/null
+#include "colls.h"
+
+static int bcast_NTSL_segment_size_in_byte = 8192;
+
+/**
+ * Non-topology-specific pipelined linear broadcast:
+ * 0->1, 1->2, 2->3, ..., ->last rank, in a pipeline fashion.
+ *
+ * When root is not rank 0 the whole message is first sent from root to
+ * rank 0.  A message of at most one segment travels along the chain as one
+ * piece.  A longer message is cut into segments of
+ * bcast_NTSL_segment_size_in_byte bytes that are relayed with non-blocking
+ * calls, one tag per segment; the elements left after the last full segment
+ * are sent with MPI_Bcast.
+ *
+ * @param buf      buffer holding (root) or receiving (others) the data
+ * @param count    number of elements in buf
+ * @param datatype type of one element
+ * @param root     rank that owns the data
+ * @param comm     communicator on which the broadcast takes place
+ * @return MPI_SUCCESS
+ */
+int smpi_coll_tuned_bcast_NTSL_Isend(void *buf, int count,
+                                     MPI_Datatype datatype, int root,
+                                     MPI_Comm comm)
+{
+  int tag = 50;
+  MPI_Status status;
+  MPI_Request request;
+  MPI_Request *send_request_array;
+  MPI_Request *recv_request_array;
+  MPI_Status *send_status_array;
+  MPI_Status *recv_status_array;
+  int rank, size;
+  int i;
+  MPI_Aint extent;
+  MPI_Type_extent(datatype, &extent);
+
+  /* rank and size must be those of the communicator used for the transfers,
+     otherwise the chain is wrong as soon as comm is not MPI_COMM_WORLD */
+  MPI_Comm_rank(comm, &rank);
+  MPI_Comm_size(comm, &size);
+
+  /* a lone process has no peer: without this rank 0 would do a blocking
+     send to itself that no receive ever matches */
+  if (size == 1)
+    return MPI_SUCCESS;
+
+  /* source node and destination node (same throughout the function) */
+  int to = (rank + 1) % size;
+  int from = (rank + size - 1) % size;
+
+  /* segment is the segment size in number of elements (not bytes); it is
+     kept at one element or more so that the divisions below are defined
+     even when one element is larger than the configured segment size */
+  int segment = 1;
+  if (extent > 0 && bcast_NTSL_segment_size_in_byte / extent >= 1)
+    segment = bcast_NTSL_segment_size_in_byte / extent;
+
+  /* pipeline length */
+  int pipe_length = count / segment;
+
+  /* buffer offset between two consecutive segments = segment size in bytes */
+  int increment = segment * extent;
+
+  /* if the input size is not divisible by the segment size =>
+     the small remainder is done with the native implementation */
+  int remainder = count % segment;
+
+  /* if root is not zero send to rank zero first;
+     this can be made faster by using logical src, dst */
+  if (root != 0) {
+    if (rank == root) {
+      MPI_Send(buf, count, datatype, 0, tag, comm);
+    } else if (rank == 0) {
+      MPI_Recv(buf, count, datatype, root, tag, comm, &status);
+    }
+  }
+
+  /* when a message is not larger than a segment => no pipeline */
+  if (count <= segment) {
+    /* everybody but the head of the chain receives first */
+    if (rank != 0) {
+      MPI_Irecv(buf, count, datatype, from, tag, comm, &request);
+      MPI_Wait(&request, &status);
+    }
+    /* everybody but the tail of the chain passes the message on */
+    if (rank != (size - 1)) {
+      MPI_Send(buf, count, datatype, to, tag, comm);
+    }
+    return MPI_SUCCESS;
+  }
+
+  /* pipeline bcast */
+  send_request_array =
+      malloc((size + pipe_length) * sizeof *send_request_array);
+  recv_request_array =
+      malloc((size + pipe_length) * sizeof *recv_request_array);
+  send_status_array =
+      malloc((size + pipe_length) * sizeof *send_status_array);
+  recv_status_array =
+      malloc((size + pipe_length) * sizeof *recv_status_array);
+
+  /* every rank but the head posts one receive per segment up front */
+  if (rank != 0) {
+    for (i = 0; i < pipe_length; i++) {
+      MPI_Irecv((char *) buf + (i * increment), segment, datatype, from,
+                (tag + i), comm, &recv_request_array[i]);
+    }
+  }
+
+  if (rank == (size - 1)) {
+    /* last node only receives data */
+    MPI_Waitall(pipe_length, recv_request_array, recv_status_array);
+  } else {
+    /* head sends, intermediate nodes relay (receive, then send) */
+    for (i = 0; i < pipe_length; i++) {
+      if (rank != 0) {
+        MPI_Wait(&recv_request_array[i], &status);
+      }
+      MPI_Isend((char *) buf + (i * increment), segment, datatype, to,
+                (tag + i), comm, &send_request_array[i]);
+    }
+    MPI_Waitall(pipe_length, send_request_array, send_status_array);
+  }
+
+  free(send_request_array);
+  free(recv_request_array);
+  free(send_status_array);
+  free(recv_status_array);
+
+  /* when count is not divisible by the segment size, use the default BCAST
+     for the remainder (count > segment is guaranteed at this point) */
+  if (remainder != 0) {
+    MPI_Bcast((char *) buf + (pipe_length * increment), remainder, datatype,
+              root, comm);
+  }
+
+  return MPI_SUCCESS;
+}
--- /dev/null
+#include "colls.h"
+
+static int bcast_NTSL_segment_size_in_byte = 8192;
+
+/**
+ * Non-topology-specific pipelined linear broadcast:
+ * 0->1, 1->2, 2->3, ..., ->last rank, in a pipeline fashion.
+ *
+ * When root is not rank 0 the whole message is first sent from root to
+ * rank 0.  A message of at most one segment travels along the chain as one
+ * piece.  A longer message is cut into segments of
+ * bcast_NTSL_segment_size_in_byte bytes that are relayed with non-blocking
+ * calls, one tag per segment; the elements left after the last full segment
+ * are sent with MPI_Bcast.
+ *
+ * @param buf      buffer holding (root) or receiving (others) the data
+ * @param count    number of elements in buf
+ * @param datatype type of one element
+ * @param root     rank that owns the data
+ * @param comm     communicator on which the broadcast takes place
+ * @return MPI_SUCCESS
+ */
+int smpi_coll_tuned_bcast_NTSL(void *buf, int count, MPI_Datatype datatype,
+                               int root, MPI_Comm comm)
+{
+  int tag = 50;
+  MPI_Status status;
+  MPI_Request request;
+  MPI_Request *send_request_array;
+  MPI_Request *recv_request_array;
+  MPI_Status *send_status_array;
+  MPI_Status *recv_status_array;
+  int rank, size;
+  int i;
+  MPI_Aint extent;
+  MPI_Type_extent(datatype, &extent);
+
+  /* rank and size must be those of the communicator used for the transfers,
+     otherwise the chain is wrong as soon as comm is not MPI_COMM_WORLD */
+  MPI_Comm_rank(comm, &rank);
+  MPI_Comm_size(comm, &size);
+
+  /* a lone process has no peer: without this rank 0 would do a blocking
+     send to itself that no receive ever matches */
+  if (size == 1)
+    return MPI_SUCCESS;
+
+  /* source node and destination node (same throughout the function) */
+  int to = (rank + 1) % size;
+  int from = (rank + size - 1) % size;
+
+  /* segment is the segment size in number of elements (not bytes); it is
+     kept at one element or more so that the divisions below are defined
+     even when one element is larger than the configured segment size */
+  int segment = 1;
+  if (extent > 0 && bcast_NTSL_segment_size_in_byte / extent >= 1)
+    segment = bcast_NTSL_segment_size_in_byte / extent;
+
+  /* pipeline length */
+  int pipe_length = count / segment;
+
+  /* buffer offset between two consecutive segments = segment size in bytes */
+  int increment = segment * extent;
+
+  /* if the input size is not divisible by the segment size =>
+     the small remainder is done with the native implementation */
+  int remainder = count % segment;
+
+  /* if root is not zero send to rank zero first;
+     this can be made faster by using logical src, dst */
+  if (root != 0) {
+    if (rank == root) {
+      MPI_Send(buf, count, datatype, 0, tag, comm);
+    } else if (rank == 0) {
+      MPI_Recv(buf, count, datatype, root, tag, comm, &status);
+    }
+  }
+
+  /* when a message is not larger than a segment => no pipeline */
+  if (count <= segment) {
+    /* everybody but the head of the chain receives first */
+    if (rank != 0) {
+      MPI_Irecv(buf, count, datatype, from, tag, comm, &request);
+      MPI_Wait(&request, &status);
+    }
+    /* everybody but the tail of the chain passes the message on */
+    if (rank != (size - 1)) {
+      MPI_Send(buf, count, datatype, to, tag, comm);
+    }
+    return MPI_SUCCESS;
+  }
+
+  /* pipeline bcast */
+  send_request_array =
+      malloc((size + pipe_length) * sizeof *send_request_array);
+  recv_request_array =
+      malloc((size + pipe_length) * sizeof *recv_request_array);
+  send_status_array =
+      malloc((size + pipe_length) * sizeof *send_status_array);
+  recv_status_array =
+      malloc((size + pipe_length) * sizeof *recv_status_array);
+
+  /* every rank but the head posts one receive per segment up front */
+  if (rank != 0) {
+    for (i = 0; i < pipe_length; i++) {
+      MPI_Irecv((char *) buf + (i * increment), segment, datatype, from,
+                (tag + i), comm, &recv_request_array[i]);
+    }
+  }
+
+  if (rank == (size - 1)) {
+    /* last node only receives data */
+    MPI_Waitall(pipe_length, recv_request_array, recv_status_array);
+  } else {
+    /* head sends, intermediate nodes relay (receive, then send) */
+    for (i = 0; i < pipe_length; i++) {
+      if (rank != 0) {
+        MPI_Wait(&recv_request_array[i], &status);
+      }
+      MPI_Isend((char *) buf + (i * increment), segment, datatype, to,
+                (tag + i), comm, &send_request_array[i]);
+    }
+    MPI_Waitall(pipe_length, send_request_array, send_status_array);
+  }
+
+  free(send_request_array);
+  free(recv_request_array);
+  free(send_status_array);
+  free(recv_status_array);
+
+  /* when count is not divisible by the segment size, use the default BCAST
+     for the remainder (count > segment is guaranteed at this point) */
+  if (remainder != 0) {
+    MPI_Bcast((char *) buf + (pipe_length * increment), remainder, datatype,
+              root, comm);
+  }
+
+  return MPI_SUCCESS;
+}
--- /dev/null
+#include "colls.h"
+#ifndef NUM_CORE
+#define NUM_CORE 8
+#endif
+
+int bcast_SMP_binary_segment_byte = 8192;
+
+/**
+ * Two-level binary-tree broadcast for groups of NUM_CORE consecutive ranks.
+ *
+ * When root is not rank 0 the whole message is first sent from root to
+ * rank 0.  The first rank of every group of NUM_CORE ranks takes part in an
+ * inter-group binary tree; inside a group the ranks form a second binary
+ * tree below that first rank.  A message of at most one segment is relayed
+ * as one piece.  A longer message is cut into segments of
+ * bcast_SMP_binary_segment_byte bytes, each sent with its own tag, and the
+ * elements left after the last full segment are sent with MPI_Bcast.
+ *
+ * @param buf      buffer holding (root) or receiving (others) the data
+ * @param count    number of elements in buf
+ * @param datatype type of one element
+ * @param root     rank that owns the data
+ * @param comm     communicator on which the broadcast takes place
+ * @return MPI_SUCCESS
+ */
+int smpi_coll_tuned_bcast_SMP_binary(void *buf, int count,
+                                     MPI_Datatype datatype, int root,
+                                     MPI_Comm comm)
+{
+  int tag = 5000;
+  MPI_Status status;
+  MPI_Request request;
+  MPI_Request *request_array;
+  MPI_Status *status_array;
+  int rank, size;
+  int i, c;
+  MPI_Aint extent;
+  MPI_Type_extent(datatype, &extent);
+
+  MPI_Comm_rank(comm, &rank);
+  MPI_Comm_size(comm, &size);
+
+  /* segment size in elements; kept at one element or more so that the
+     divisions below are defined even when one element is larger than the
+     configured segment size */
+  int segment = 1;
+  if (extent > 0 && bcast_SMP_binary_segment_byte / extent >= 1)
+    segment = bcast_SMP_binary_segment_byte / extent;
+  int pipe_length = count / segment;
+  int remainder = count % segment;
+  int increment = segment * extent;
+
+  /* first rank of this group, and number of ranks the group really holds
+     (the last group may be incomplete) */
+  int base = (rank / NUM_CORE) * NUM_CORE;
+  int num_core = NUM_CORE;
+  if (base == ((size / NUM_CORE) * NUM_CORE))
+    num_core = size - base;
+
+  int to_intra_left = base + (rank % NUM_CORE) * 2 + 1;
+  int to_intra_right = base + (rank % NUM_CORE) * 2 + 2;
+  int to_inter_left = ((rank / NUM_CORE) * 2 + 1) * NUM_CORE;
+  int to_inter_right = ((rank / NUM_CORE) * 2 + 2) * NUM_CORE;
+
+  /* parent (-1 for rank 0) and children in the order they are served:
+     inter-group left, inter-group right, intra-group left, intra-group
+     right */
+  int from = -1;
+  int children[4];
+  int num_children = 0;
+
+  if (rank % NUM_CORE == 0) {
+    /* first rank of a group: belongs to the inter-group tree */
+    if (rank != 0)
+      from = (((rank / NUM_CORE) - 1) / 2) * NUM_CORE;
+    if (to_inter_left < size)
+      children[num_children++] = to_inter_left;
+    if (to_inter_right < size)
+      children[num_children++] = to_inter_right;
+  } else {
+    /* other ranks only belong to the tree of their own group */
+    from = base + ((rank % NUM_CORE) - 1) / 2;
+  }
+  if ((to_intra_left - base) < num_core)
+    children[num_children++] = to_intra_left;
+  if ((to_intra_right - base) < num_core)
+    children[num_children++] = to_intra_right;
+
+  /* if root is not zero send to rank zero first */
+  if (root != 0) {
+    if (rank == root)
+      MPI_Send(buf, count, datatype, 0, tag, comm);
+    else if (rank == 0)
+      MPI_Recv(buf, count, datatype, root, tag, comm, &status);
+  }
+
+  /* when a message is not larger than a segment => no pipeline */
+  if (count <= segment) {
+    if (from != -1) {
+      MPI_Irecv(buf, count, datatype, from, tag, comm, &request);
+      MPI_Wait(&request, &status);
+    }
+    for (c = 0; c < num_children; c++)
+      MPI_Send(buf, count, datatype, children[c], tag, comm);
+
+    return MPI_SUCCESS;
+  }
+
+  /* pipeline bcast */
+  request_array = malloc((size + pipe_length) * sizeof *request_array);
+  status_array = malloc((size + pipe_length) * sizeof *status_array);
+
+  /* every rank but rank 0 posts one receive per segment up front */
+  if (from != -1) {
+    for (i = 0; i < pipe_length; i++) {
+      MPI_Irecv((char *) buf + (i * increment), segment, datatype, from,
+                (tag + i), comm, &request_array[i]);
+    }
+  }
+
+  if (from != -1 && num_children == 0) {
+    /* leaf: nothing to forward, just complete the receives */
+    MPI_Waitall(pipe_length, request_array, status_array);
+  } else {
+    /* forward every segment to all children as soon as it is available */
+    for (i = 0; i < pipe_length; i++) {
+      if (from != -1)
+        MPI_Wait(&request_array[i], &status);
+      for (c = 0; c < num_children; c++)
+        MPI_Send((char *) buf + (i * increment), segment, datatype,
+                 children[c], (tag + i), comm);
+    }
+  }
+
+  free(request_array);
+  free(status_array);
+
+  /* when count is not divisible by the segment size, use the default BCAST
+     for the remainder (count > segment is guaranteed at this point) */
+  if (remainder != 0) {
+    MPI_Bcast((char *) buf + (pipe_length * increment), remainder, datatype,
+              root, comm);
+  }
+
+  /* same status as the short-message path (this used to be a bare 1) */
+  return MPI_SUCCESS;
+}
--- /dev/null
+#include "colls.h"
+#ifndef NUM_CORE
+#define NUM_CORE 8
+#endif
+
+/**
+ * Two-level binomial-tree broadcast for groups of NUM_CORE consecutive
+ * ranks.
+ *
+ * When root is not rank 0 the message is first sent from root to rank 0.
+ * Step one spreads it with a binomial tree over the first rank of every
+ * group; step two spreads it with a binomial tree inside every group.
+ *
+ * @param buf      buffer holding (root) or receiving (others) the data
+ * @param count    number of elements in buf
+ * @param datatype type of one element
+ * @param root     rank that owns the data
+ * @param comm     communicator on which the broadcast takes place
+ * @return MPI_SUCCESS
+ */
+int smpi_coll_tuned_bcast_SMP_binomial(void *buf, int count,
+                                       MPI_Datatype datatype, int root,
+                                       MPI_Comm comm)
+{
+  const int tag = 50;
+  MPI_Status status;
+  int size;
+  int rank;
+  int bit;
+
+  MPI_Comm_size(comm, &size);
+  MPI_Comm_rank(comm, &rank);
+
+  /* position of this rank: which group, and which slot inside the group */
+  const int group = rank / NUM_CORE;
+  const int group_count = (size - 1) / NUM_CORE + 1;
+  const int slot = rank % NUM_CORE;
+  const int group_first = group * NUM_CORE;
+
+  /* the last group may hold fewer than NUM_CORE ranks */
+  int slot_count = NUM_CORE;
+  if (group_first == ((size / NUM_CORE) * NUM_CORE))
+    slot_count = size - group_first;
+
+  /* if root is not zero send to rank zero first */
+  if (root != 0) {
+    if (rank == root)
+      MPI_Send(buf, count, datatype, 0, tag, comm);
+    else if (rank == 0)
+      MPI_Recv(buf, count, datatype, root, tag, comm, &status);
+  }
+
+  /* FIRST STEP: rank 0 reaches the first rank of every group */
+  if (slot == 0) {
+    /* receive at the lowest set bit of the group number, if there is one */
+    for (bit = 1; bit < group_count; bit <<= 1) {
+      if (group & bit) {
+        int parent = (group - bit) * NUM_CORE;
+        MPI_Recv(buf, count, datatype, parent, tag, comm, &status);
+        break;
+      }
+    }
+
+    /* then serve every lower bit, highest first */
+    for (bit >>= 1; bit > 0; bit >>= 1) {
+      if (group < group_count) {
+        int child = (group + bit) * NUM_CORE;
+        if (child < size)
+          MPI_Send(buf, count, datatype, child, tag, comm);
+      }
+    }
+  }
+
+  /* SECOND STEP: the first rank of every group reaches the other ranks of
+     its group with the same scheme, applied to the slot number */
+  for (bit = 1; bit < slot_count; bit <<= 1) {
+    if (slot & bit) {
+      int parent = group_first + (slot - bit);
+      MPI_Recv(buf, count, datatype, parent, tag, comm, &status);
+      break;
+    }
+  }
+
+  for (bit >>= 1; bit > 0; bit >>= 1) {
+    if (slot < slot_count) {
+      int child = group_first + (slot + bit);
+      if (child < size)
+        MPI_Send(buf, count, datatype, child, tag, comm);
+    }
+  }
+
+  return MPI_SUCCESS;
+}
--- /dev/null
+#include "colls.h"
+#ifndef NUM_CORE  /* keep a value already supplied by the build, if any */
+#define NUM_CORE 8  /* default: ranks are grouped in blocks of 8 consecutive ranks, one block per SMP node */
+#endif
+
+int bcast_SMP_linear_segment_byte = 8192;  /* pipeline segment size in bytes; divided by the datatype extent to get the elements per segment */
+
+/*
+ * Pipelined linear (chain) broadcast for clusters of SMP nodes.
+ *
+ * Ranks are grouped in blocks of NUM_CORE consecutive ranks; the first rank
+ * of a block is its leader.  The payload travels along a chain of leaders
+ * (rank, rank + NUM_CORE, ...) and, inside every block, along a chain of
+ * consecutive ranks starting at the leader.  Payloads larger than one segment
+ * are cut into pipe_length segments that are forwarded one by one; the
+ * elements left over after the last full segment are sent with MPI_Bcast.
+ *
+ * buf, count, datatype: payload; root: rank owning it on entry (it is first
+ * copied to rank 0 when root != 0); comm: communicator.
+ * Returns MPI_SUCCESS, or the result of MPI_Bcast when the communicator has
+ * at most NUM_CORE ranks.
+ */
+int smpi_coll_tuned_bcast_SMP_linear(void *buf, int count,
+                                     MPI_Datatype datatype, int root,
+                                     MPI_Comm comm)
+{
+  int tag = 5000;
+  MPI_Status status;
+  MPI_Request *request_array;
+  int rank, size;
+  int i;
+  MPI_Aint extent;
+  MPI_Type_extent(datatype, &extent);
+
+  MPI_Comm_rank(comm, &rank);
+  MPI_Comm_size(comm, &size);
+
+  // call native when MPI communication size is too small
+  if (size <= NUM_CORE) {
+    return MPI_Bcast(buf, count, datatype, root, comm);
+  }
+
+  /* elements per segment; at least 1 so that a datatype larger than the
+     segment size cannot cause a division by zero below */
+  int segment =
+      (extent > 0) ? (int) (bcast_SMP_linear_segment_byte / extent) : 0;
+  if (segment < 1)
+    segment = 1;
+  int pipe_length = count / segment;
+  int remainder = count % segment;
+  int increment = segment * extent;
+
+  /* Position of this rank in the two chains.
+     - from:     predecessor (previous leader for a leader, previous rank
+                 otherwise); not used on rank 0, which owns the data.
+     - to_inter: next leader, or -1 for non-leaders and for the last leader.
+     - to_intra: next rank of the same block, or -1 for the last rank of a
+                 block.  The last rank of the communicator has no successor
+                 either, even when it is a leader alone in its block. */
+  int is_leader = (rank % NUM_CORE == 0);
+  int from = is_leader ? rank - NUM_CORE : rank - 1;
+  int to_inter = (is_leader && (rank + NUM_CORE < size)) ? rank + NUM_CORE : -1;
+  int to_intra =
+      ((rank + 1 < size) && ((rank + 1) % NUM_CORE != 0)) ? rank + 1 : -1;
+
+  // if root is not zero send to rank zero first
+  if (root != 0) {
+    if (rank == root)
+      MPI_Send(buf, count, datatype, 0, tag, comm);
+    else if (rank == 0)
+      MPI_Recv(buf, count, datatype, root, tag, comm, &status);
+  }
+
+  // when a message is smaller than a block size => no pipeline
+  if (count <= segment) {
+    if (rank != 0)
+      MPI_Recv(buf, count, datatype, from, tag, comm, &status);
+    if (to_inter != -1)
+      MPI_Send(buf, count, datatype, to_inter, tag, comm);
+    if (to_intra != -1)
+      MPI_Send(buf, count, datatype, to_intra, tag, comm);
+    return MPI_SUCCESS;
+  }
+
+  // pipeline bcast: post every receive up front, then relay segment by segment
+  request_array =
+      (MPI_Request *) malloc((size + pipe_length) * sizeof(MPI_Request));
+
+  if (rank != 0) {
+    for (i = 0; i < pipe_length; i++) {
+      MPI_Irecv((char *) buf + (i * increment), segment, datatype, from,
+                (tag + i), comm, &request_array[i]);
+    }
+  }
+
+  for (i = 0; i < pipe_length; i++) {
+    char *chunk = (char *) buf + (i * increment);
+
+    if (rank != 0)
+      MPI_Wait(&request_array[i], &status);
+    if (to_inter != -1)
+      MPI_Send(chunk, segment, datatype, to_inter, (tag + i), comm);
+    if (to_intra != -1)
+      MPI_Send(chunk, segment, datatype, to_intra, (tag + i), comm);
+  }
+
+  free(request_array);
+
+  // when count is not divisible by block size, use default BCAST for the remainder
+  if (remainder != 0) {
+    MPI_Bcast((char *) buf + (pipe_length * increment), remainder, datatype,
+              root, comm);
+  }
+
+  return MPI_SUCCESS;
+}
--- /dev/null
+#include "colls.h"
+int binary_pipeline_bcast_tree_height = 10;  /* presumably the height of the tree encoded below; not read by the code visible in this file - TODO confirm */
+
+int binary_pipeline_bcast_send_to[2][128] = {  /* [0][r] / [1][r]: left / right child of rank r in the broadcast tree, -1 = no child */
+ {1, 2, 3, -1, -1, 6, -1, -1, 9, 10, 11, -1, -1, 14, -1, 16, 17, 18, 19, -1,
+ -1, 22, -1, 24, 25, 26, 27, -1, -1, 30, -1, -1, 33, 34, 35, -1, -1, 38, -1,
+ -1, 41, 42, 43, -1, -1, 46, -1, -1, 49, 50, 51, -1, -1, 54, -1, -1, 57, 58,
+ 59, -1, -1, 62, -1, -1, 65, 66, 67, -1, -1, 70, -1, -1, 73, 74, 75, -1, -1,
+ 78, -1, 80, 81, 82, 83, -1, -1, 86, -1, -1, 89, 90, 91, -1, -1, 94, -1, -1,
+ 97, 98, 99, -1, -1, 102, -1, -1, 105, 106, 107, -1, -1, 110, -1, -1, 113,
+ 114, 115, -1, -1, 118, -1, -1, 121, 122, 123, -1, -1, 126, -1, -1},
+ {8, 5, 4, -1, -1, 7, -1, -1, 15, 13, 12, -1, -1, -1, -1, 72, 23, 21, 20, -1,
+ -1, -1, -1, 48, 32, 29, 28, -1, -1, 31, -1, -1, 40, 37, 36, -1, -1, 39, -1,
+ -1, -1, 45, 44, -1, -1, 47, -1, -1, 56, 53, 52, -1, -1, 55, -1, -1, 64, 61,
+ 60, -1, -1, 63, -1, -1, -1, 69, 68, -1, -1, 71, -1, -1, 79, 77, 76, -1, -1,
+ -1, -1, 104, 88, 85, 84, -1, -1, 87, -1, -1, 96, 93, 92, -1, -1, 95, -1, -1,
+ -1, 101, 100, -1, -1, 103, -1, -1, 112, 109, 108, -1, -1, 111, -1, -1, 120,
+ 117, 116, -1, -1, 119, -1, -1, -1, 125, 124, -1, -1, 127, -1, -1}
+};
+
+int binary_pipeline_bcast_recv_from[128] =  /* [r]: rank that rank r receives from (its parent in the tree); -1 for rank 0 */
+ { -1, 0, 1, 2, 2, 1, 5, 5, 0, 8, 9, 10, 10, 9, 13, 8, 15, 16, 17, 18, 18,
+17, 21, 16, 23, 24, 25, 26, 26, 25, 29, 29, 24, 32, 33, 34, 34, 33, 37, 37, 32, 40, 41, 42, 42, 41, 45, 45, 23,
+48, 49, 50, 50, 49, 53, 53, 48, 56, 57, 58, 58, 57, 61, 61, 56, 64, 65, 66, 66, 65, 69, 69, 15, 72, 73, 74, 74,
+73, 77, 72, 79, 80, 81, 82, 82, 81, 85, 85, 80, 88, 89, 90, 90, 89, 93, 93, 88, 96, 97, 98, 98, 97, 101, 101, 79,
+104, 105, 106, 106, 105, 109, 109, 104, 112, 113, 114, 114, 113, 117, 117, 112, 120, 121, 122, 122, 121, 125,
+125 };
+
+int binary_pipeline_bcast_sequence[128] =  /* one value per rank; not read by the code visible in this file - NOTE(review): looks like the step at which each rank is reached, confirm */
+ { 0, 1, 2, 3, 3, 2, 3, 3, 1, 2, 3, 4, 4, 3, 4, 2, 3, 4, 5, 6, 6, 5, 6, 4, 5,
+6, 7, 8, 8, 7, 8, 8, 6, 7, 8, 9, 9, 8, 9, 9, 7, 8, 9, 10, 10, 9, 10, 10, 5, 6, 7, 8, 8, 7, 8, 8, 6, 7, 8, 9, 9,
+8, 9, 9, 7, 8, 9, 10, 10, 9, 10, 10, 3, 4, 5, 6, 6, 5, 6, 4, 5, 6, 7, 8, 8, 7, 8, 8, 6, 7, 8, 9, 9, 8, 9, 9, 7,
+8, 9, 10, 10, 9, 10, 10, 5, 6, 7, 8, 8, 7, 8, 8, 6, 7, 8, 9, 9, 8, 9, 9, 7, 8, 9, 10, 10, 9, 10, 10 };
+
+
+int bcast_TSB_segment_size_in_byte = 8192;  /* pipeline segment size in bytes; divided by the datatype extent to get the elements per segment */
+
+/*
+ * Pipelined broadcast over the hard-coded binary tree stored in
+ * binary_pipeline_bcast_send_to / binary_pipeline_bcast_recv_from.
+ *
+ * Every rank receives from its table parent and relays to its left and then
+ * its right table child.  Payloads larger than one segment are cut into
+ * pipe_length segments relayed one by one; the elements left over after the
+ * last full segment are sent with MPI_Bcast.
+ *
+ * buf, count, datatype: payload; root: rank owning it on entry (it is first
+ * copied to rank 0 when root != 0); comm: communicator.
+ * Returns MPI_SUCCESS, or the result of MPI_Bcast when the communicator has
+ * more ranks than the tables describe.
+ */
+int smpi_coll_tuned_bcast_TSB(void *buf, int count, MPI_Datatype datatype,
+                              int root, MPI_Comm comm)
+{
+  int tag = 5000;
+  MPI_Status status;
+  int rank, size;
+  int i;
+
+  /* number of ranks described by the tree tables */
+  const int table_size = (int) (sizeof(binary_pipeline_bcast_recv_from) /
+                                sizeof(binary_pipeline_bcast_recv_from[0]));
+
+  MPI_Aint extent;
+  MPI_Type_extent(datatype, &extent);
+
+  /* ranks must be relative to comm: all traffic below goes through comm */
+  MPI_Comm_rank(comm, &rank);
+  MPI_Comm_size(comm, &size);
+
+  /* the tables cannot describe this communicator: use the native broadcast
+     instead of indexing past their end */
+  if (size > table_size) {
+    return MPI_Bcast(buf, count, datatype, root, comm);
+  }
+
+  /* source node and destination nodes (same through out the functions) */
+  int to_left = binary_pipeline_bcast_send_to[0][rank];
+  int to_right = binary_pipeline_bcast_send_to[1][rank];
+  int from = binary_pipeline_bcast_recv_from[rank];
+
+  /* Children that do not exist in this communicator are dropped.  In the
+     tables every parent is smaller than its children, so everything below a
+     dropped child lies outside the communicator as well. */
+  if (to_left >= size)
+    to_left = -1;
+  if (to_right >= size)
+    to_right = -1;
+
+  /* segment is segment size in number of elements (not bytes); at least 1 so
+     that a datatype larger than the segment size cannot cause a division by
+     zero below */
+  int segment =
+      (extent > 0) ? (int) (bcast_TSB_segment_size_in_byte / extent) : 0;
+  if (segment < 1)
+    segment = 1;
+
+  /* pipeline length */
+  int pipe_length = count / segment;
+
+  /* use for buffer offset for sending and receiving data = segment size in byte */
+  int increment = segment * extent;
+
+  /* if the input size is not divisible by segment size =>
+     the small remainder will be done with native implementation */
+  int remainder = count % segment;
+
+  /* if root is not zero send to rank zero first */
+  if (root != 0) {
+    if (rank == root) {
+      MPI_Send(buf, count, datatype, 0, tag, comm);
+    } else if (rank == 0) {
+      MPI_Recv(buf, count, datatype, root, tag, comm, &status);
+    }
+  }
+
+  /* when a message is smaller than a block size => no pipeline */
+  if (count <= segment) {
+    /* rank 0 owns the data; everybody else gets it from its parent */
+    if (rank != 0) {
+      MPI_Recv(buf, count, datatype, from, tag, comm, &status);
+    }
+    if (to_left != -1) {
+      MPI_Send(buf, count, datatype, to_left, tag, comm);
+    }
+    if (to_right != -1) {
+      MPI_Send(buf, count, datatype, to_right, tag, comm);
+    }
+    return MPI_SUCCESS;
+  }
+
+  /* pipelining: relay one segment at a time, left child before right child */
+  for (i = 0; i < pipe_length; i++) {
+    char *chunk = (char *) buf + (i * increment);
+
+    if (rank != 0) {
+      MPI_Recv(chunk, segment, datatype, from, tag + i, comm, &status);
+    }
+    if (to_left != -1) {
+      MPI_Send(chunk, segment, datatype, to_left, tag + i, comm);
+    }
+    if (to_right != -1) {
+      MPI_Send(chunk, segment, datatype, to_right, tag + i, comm);
+    }
+  }
+
+  /* when count is not divisible by block size, use default BCAST for the remainder */
+  if (remainder != 0) {
+    MPI_Bcast((char *) buf + (pipe_length * increment), remainder, datatype,
+              root, comm);
+  }
+
+  return MPI_SUCCESS;
+}
--- /dev/null
+#include "colls.h"
+
+static int bcast_NTSL_segment_size_in_byte = 8192;  /* pipeline segment size in bytes; divided by the datatype extent to get the elements per segment */
+
+#define HEADER_SIZE 1024  /* number of ints in the header message (list of ranks terminated by -1) */
+#define MAX_NODE 1024  /* capacity of the per-rank work arrays declared in the function below */
+
+/* Non-topology-specific pipelined linear-bcast function */
+/*
+ * Arrival-pattern-aware broadcast, pipelined with non-blocking relays.
+ *
+ * Every non-root rank announces itself to rank 0 with a 1-byte message.
+ * Rank 0 repeatedly collects the announcements that have arrived, writes the
+ * announcing ranks into header_buf as a list terminated by -1 and sends that
+ * header followed by the payload to the first rank of the list; every rank
+ * forwards header and payload to the rank that follows it in the list.  When
+ * no announcement is pending, rank 0 serves the first rank it has not served
+ * yet.  Payloads larger than one segment travel as pipe_length segments; the
+ * elements left over after the last full segment are sent with MPI_Bcast.
+ *
+ * buf, count, datatype: payload; root: rank owning it on entry (it is first
+ * copied to rank 0 when root != 0); comm: communicator.
+ * Returns MPI_SUCCESS, or the result of MPI_Bcast when the communicator has
+ * more than MAX_NODE ranks.
+ */
+int smpi_coll_tuned_bcast_arrival_nb(void *buf, int count,
+                                     MPI_Datatype datatype, int root,
+                                     MPI_Comm comm)
+{
+  int tag = 50;
+  MPI_Status status;
+  MPI_Request request;
+  MPI_Request *send_request_array;
+  MPI_Request *recv_request_array;
+  MPI_Status *send_status_array;
+  MPI_Status *recv_status_array;
+
+  MPI_Status temp_status_array[MAX_NODE];
+
+  int rank, size;
+  int i, j, k;
+
+  int sent_count;
+  int header_index;
+  int flag_array[MAX_NODE];
+  int already_sent[MAX_NODE];
+  int will_send[MAX_NODE];
+
+  int header_buf[HEADER_SIZE];
+  char temp_buf[MAX_NODE];
+
+  MPI_Aint extent;
+  MPI_Type_extent(datatype, &extent);
+
+  /* destination */
+  int to;
+
+  /* ranks must be relative to comm: all traffic below goes through comm */
+  MPI_Comm_rank(comm, &rank);
+  MPI_Comm_size(comm, &size);
+
+  /* the per-rank arrays above hold MAX_NODE entries: use the native
+     broadcast rather than overflowing them */
+  if (size > MAX_NODE) {
+    return MPI_Bcast(buf, count, datatype, root, comm);
+  }
+
+  /* segment is segment size in number of elements (not bytes); at least 1 so
+     that a datatype larger than the segment size cannot cause a division by
+     zero below */
+  int segment =
+      (extent > 0) ? (int) (bcast_NTSL_segment_size_in_byte / extent) : 0;
+  if (segment < 1)
+    segment = 1;
+
+  /* pipeline length */
+  int pipe_length = count / segment;
+
+  /* use for buffer offset for sending and receiving data = segment size in byte */
+  int increment = segment * extent;
+
+  /* if the input size is not divisible by segment size =>
+     the small remainder will be done with native implementation */
+  int remainder = count % segment;
+
+  /* if root is not zero send to rank zero first
+     this can be modified to make it faster by using logical src, dst.
+   */
+  if (root != 0) {
+    if (rank == root) {
+      MPI_Send(buf, count, datatype, 0, tag, comm);
+    } else if (rank == 0) {
+      MPI_Recv(buf, count, datatype, root, tag, comm, &status);
+    }
+  }
+
+  /* already_sent == 0 means root has not send data (or header) to the node yet;
+     will_send == 1 means the announcement of the node has been received */
+  for (i = 0; i < MAX_NODE; i++) {
+    already_sent[i] = 0;
+    will_send[i] = 0;
+  }
+
+  /* when a message is smaller than a block size => no pipeline */
+  if (count <= segment) {
+    if (rank == 0) {
+      sent_count = 0;
+
+      while (sent_count < (size - 1)) {
+
+        /* look for pending announcements of the ranks not served yet */
+        for (i = 1; i < size; i++) {
+          if (already_sent[i] == 0)
+            MPI_Iprobe(i, MPI_ANY_TAG, comm, &flag_array[i],
+                       MPI_STATUS_IGNORE);
+        }
+
+        header_index = 0;
+        /* recv 1-byte message */
+        for (i = 1; i < size; i++) {
+
+          /* message arrive */
+          if ((flag_array[i] == 1) && (already_sent[i] == 0)) {
+            MPI_Recv(temp_buf, 1, MPI_CHAR, i, tag, comm, &status);
+            header_buf[header_index] = i;
+            header_index++;
+            sent_count++;
+
+            /* will send in the next step */
+            already_sent[i] = 1;
+          }
+        }
+
+        /* send header followed by data */
+        if (header_index != 0) {
+          header_buf[header_index] = -1;
+          to = header_buf[0];
+          MPI_Send(header_buf, HEADER_SIZE, MPI_INT, to, tag, comm);
+          MPI_Send(buf, count, datatype, to, tag, comm);
+        }
+
+        /* nobody announced itself: serve one rank anyway */
+        else {
+          /* search for the first node that never received data before */
+          for (i = 1; i < size; i++) {
+            if (already_sent[i] == 0) {
+              header_buf[0] = i;
+              header_buf[1] = -1;
+              MPI_Send(header_buf, HEADER_SIZE, MPI_INT, i, tag, comm);
+              MPI_Send(buf, count, datatype, i, tag, comm);
+              already_sent[i] = 1;
+              sent_count++;
+              break;
+            }
+          }
+        }
+      }                         /* while loop */
+    }
+
+    /* non-root */
+    else {
+
+      /* send 1-byte message to root */
+      MPI_Send(temp_buf, 1, MPI_CHAR, 0, tag, comm);
+
+      /* wait for header and data, forward when required */
+      MPI_Recv(header_buf, HEADER_SIZE, MPI_INT, MPI_ANY_SOURCE, tag, comm,
+               &status);
+      MPI_Recv(buf, count, datatype, MPI_ANY_SOURCE, tag, comm, &status);
+
+      /* search for where it is */
+      int myordering = 0;
+      while (rank != header_buf[myordering]) {
+        myordering++;
+      }
+
+      /* send header followed by data */
+      if (header_buf[myordering + 1] != -1) {
+        MPI_Send(header_buf, HEADER_SIZE, MPI_INT, header_buf[myordering + 1],
+                 tag, comm);
+        MPI_Send(buf, count, datatype, header_buf[myordering + 1], tag, comm);
+      }
+    }
+  }
+  /* pipeline bcast */
+  else {
+    send_request_array =
+        (MPI_Request *) malloc((size + pipe_length) * sizeof(MPI_Request));
+    recv_request_array =
+        (MPI_Request *) malloc((size + pipe_length) * sizeof(MPI_Request));
+    send_status_array =
+        (MPI_Status *) malloc((size + pipe_length) * sizeof(MPI_Status));
+    recv_status_array =
+        (MPI_Status *) malloc((size + pipe_length) * sizeof(MPI_Status));
+
+    if (rank == 0) {
+      sent_count = 0;
+
+      while (sent_count < (size - 1)) {
+
+        /* scan a few times to let more processes arrive before sending */
+        for (k = 0; k < 3; k++) {
+          for (i = 1; i < size; i++) {
+            if ((already_sent[i] == 0) && (will_send[i] == 0)) {
+              MPI_Iprobe(i, MPI_ANY_TAG, comm, &flag_array[i],
+                         &temp_status_array[i]);
+              if (flag_array[i] == 1) {
+                will_send[i] = 1;
+                MPI_Recv(&temp_buf[i], 1, MPI_CHAR, i, tag, comm, &status);
+                /* restart the scan: the loop increment brings i back to 1 */
+                i = 0;
+              }
+            }
+          }
+        }
+
+        header_index = 0;
+
+        /* build the list of ranks whose announcement has been received */
+        for (i = 1; i < size; i++) {
+          if ((will_send[i] == 1) && (already_sent[i] == 0)) {
+            header_buf[header_index] = i;
+            header_index++;
+            sent_count++;
+
+            /* will send in the next step */
+            already_sent[i] = 1;
+          }
+        }
+
+        /* send header followed by data */
+        if (header_index != 0) {
+          header_buf[header_index] = -1;
+          to = header_buf[0];
+
+          /* send header */
+          MPI_Send(header_buf, HEADER_SIZE, MPI_INT, to, tag, comm);
+
+          /* send data - pipeline */
+          for (i = 0; i < pipe_length; i++) {
+            MPI_Send((char *) buf + (i * increment), segment, datatype, to,
+                     tag, comm);
+          }
+        }
+
+        /* nobody announced itself: serve one rank anyway */
+        else {
+          /* search for the first node that never received data before */
+          for (i = 1; i < size; i++) {
+            if (already_sent[i] == 0) {
+              header_buf[0] = i;
+              header_buf[1] = -1;
+              to = i;
+
+              MPI_Send(header_buf, HEADER_SIZE, MPI_INT, to, tag, comm);
+
+              /* still need to chop data so that we can use the same non-root code */
+              for (j = 0; j < pipe_length; j++) {
+                MPI_Send((char *) buf + (j * increment), segment, datatype,
+                         to, tag, comm);
+              }
+
+              already_sent[i] = 1;
+              sent_count++;
+              break;
+            }
+          }
+        }
+      }                         /* while loop */
+
+      /* probe before exit in case there are messages to recv */
+      for (i = 1; i < size; i++) {
+        MPI_Iprobe(i, MPI_ANY_TAG, comm, &flag_array[i],
+                   &temp_status_array[i]);
+        if (flag_array[i] == 1)
+          MPI_Recv(&temp_buf[i], 1, MPI_CHAR, i, tag, comm, &status);
+      }
+    }
+
+    /* none root */
+    else {
+
+      /* if root already send a message to this node, don't send one-byte message */
+      MPI_Iprobe(0, MPI_ANY_TAG, comm, &flag_array[0], &status);
+
+      /* send 1-byte message to root */
+      if (flag_array[0] == 0)
+        MPI_Send(temp_buf, 1, MPI_CHAR, 0, tag, comm);
+
+      /* wait for header forward when required */
+      MPI_Irecv(header_buf, HEADER_SIZE, MPI_INT, MPI_ANY_SOURCE, tag, comm,
+                &request);
+      MPI_Wait(&request, MPI_STATUS_IGNORE);
+
+      /* search for where it is */
+      int myordering = 0;
+      while (rank != header_buf[myordering]) {
+        myordering++;
+      }
+
+      /* send header when required */
+      if (header_buf[myordering + 1] != -1) {
+        MPI_Send(header_buf, HEADER_SIZE, MPI_INT, header_buf[myordering + 1],
+                 tag, comm);
+      }
+
+      /* receive data */
+      for (i = 0; i < pipe_length; i++) {
+        MPI_Irecv((char *) buf + (i * increment), segment, datatype,
+                  MPI_ANY_SOURCE, tag, comm, &recv_request_array[i]);
+      }
+
+      /* forward data */
+      if (header_buf[myordering + 1] != -1) {
+        for (i = 0; i < pipe_length; i++) {
+          MPI_Wait(&recv_request_array[i], MPI_STATUS_IGNORE);
+          MPI_Isend((char *) buf + (i * increment), segment, datatype,
+                    header_buf[myordering + 1], tag, comm,
+                    &send_request_array[i]);
+        }
+        MPI_Waitall((pipe_length), send_request_array, send_status_array);
+      }
+
+      /* last rank of the list: nothing to forward, but the receives must
+         still complete before buf is used and the requests are freed */
+      else {
+        MPI_Waitall((pipe_length), recv_request_array, recv_status_array);
+      }
+    }
+
+    free(send_request_array);
+    free(recv_request_array);
+    free(send_status_array);
+    free(recv_status_array);
+  }                             /* end pipeline */
+
+  /* when count is not divisible by block size, use default BCAST for the remainder */
+  if ((remainder != 0) && (count > segment)) {
+    MPI_Bcast((char *) buf + (pipe_length * increment), remainder, datatype,
+              root, comm);
+  }
+
+  return MPI_SUCCESS;
+}
--- /dev/null
+#include "colls.h"
+
+int bcast_arrival_pattern_aware_wait_segment_size_in_byte = 8192;  /* pipeline segment size in bytes; divided by the datatype extent to get the elements per segment */
+
+#ifndef BCAST_ARRIVAL_PATTERN_AWARE_HEADER_SIZE  /* keep a value already supplied by the build, if any */
+#define BCAST_ARRIVAL_PATTERN_AWARE_HEADER_SIZE 1024  /* number of ints in the header message (list of ranks terminated by -1) */
+#endif
+
+#ifndef BCAST_ARRIVAL_PATTERN_AWARE_MAX_NODE  /* keep a value already supplied by the build, if any */
+#define BCAST_ARRIVAL_PATTERN_AWARE_MAX_NODE 128  /* capacity of the per-rank work arrays declared in the function below */
+#endif
+
+/* Non-topology-specific pipelined linear-bcast function */
+/*
+ * Arrival-pattern-aware pipelined broadcast that waits for announcements.
+ *
+ * Every non-root rank announces itself to rank 0 with a 1-byte message.
+ * Rank 0 repeatedly collects the announcements that have arrived, writes the
+ * announcing ranks into header_buf as a list terminated by -1 and sends that
+ * header followed by the payload to the first rank of the list; every rank
+ * forwards the header and then the payload to the rank that follows it in
+ * the list.  Rank 0 only serves ranks whose announcement it has received.
+ * The payload travels as pipe_length segments (a single one when it fits in
+ * a segment); the elements left over after the last full segment are sent
+ * with MPI_Bcast.
+ *
+ * buf, count, datatype: payload; root: rank owning it on entry (it is first
+ * copied to rank 0 when root != 0); comm: communicator.
+ * Returns MPI_SUCCESS, or the result of MPI_Bcast when the communicator has
+ * more than BCAST_ARRIVAL_PATTERN_AWARE_MAX_NODE ranks.
+ */
+int smpi_coll_tuned_bcast_arrival_pattern_aware_wait(void *buf, int count,
+                                                     MPI_Datatype datatype,
+                                                     int root, MPI_Comm comm)
+{
+  MPI_Status status;
+  MPI_Request request;
+  MPI_Request *send_request_array;
+  MPI_Request *recv_request_array;
+  MPI_Status *send_status_array;
+  MPI_Status *recv_status_array;
+
+  MPI_Status temp_status_array[BCAST_ARRIVAL_PATTERN_AWARE_MAX_NODE];
+
+  int rank, size;
+  int i, k;
+  int tag = 50;
+  int will_send[BCAST_ARRIVAL_PATTERN_AWARE_MAX_NODE];
+
+  int sent_count;
+  int header_index;
+  int flag_array[BCAST_ARRIVAL_PATTERN_AWARE_MAX_NODE];
+  int already_sent[BCAST_ARRIVAL_PATTERN_AWARE_MAX_NODE];
+
+  int header_buf[BCAST_ARRIVAL_PATTERN_AWARE_HEADER_SIZE];
+  char temp_buf[BCAST_ARRIVAL_PATTERN_AWARE_MAX_NODE];
+
+  int max_node = BCAST_ARRIVAL_PATTERN_AWARE_MAX_NODE;
+  int header_size = BCAST_ARRIVAL_PATTERN_AWARE_HEADER_SIZE;
+
+  MPI_Aint extent;
+  MPI_Type_extent(datatype, &extent);
+
+  /* source and destination */
+  int to, from;
+
+  /* ranks must be relative to comm: all traffic below goes through comm */
+  MPI_Comm_rank(comm, &rank);
+  MPI_Comm_size(comm, &size);
+
+  /* the per-rank arrays above hold max_node entries: use the native
+     broadcast rather than overflowing them */
+  if (size > max_node) {
+    return MPI_Bcast(buf, count, datatype, root, comm);
+  }
+
+  /* segment is segment size in number of elements (not bytes); at least 1 so
+     that a datatype larger than the segment size cannot cause a division by
+     zero below */
+  int segment =
+      (extent > 0)
+          ? (int) (bcast_arrival_pattern_aware_wait_segment_size_in_byte / extent)
+          : 0;
+  if (segment < 1)
+    segment = 1;
+
+  /* pipeline length */
+  int pipe_length = count / segment;
+
+  /* use for buffer offset for sending and receiving data = segment size in byte */
+  int increment = segment * extent;
+
+  /* if the input size is not divisible by segment size =>
+     the small remainder will be done with native implementation */
+  int remainder = count % segment;
+
+  /* if root is not zero send to rank zero first
+     this can be modified to make it faster by using logical src, dst.
+   */
+  if (root != 0) {
+    if (rank == root) {
+      MPI_Send(buf, count, datatype, 0, tag, comm);
+    } else if (rank == 0) {
+      MPI_Recv(buf, count, datatype, root, tag, comm, &status);
+    }
+  }
+
+  /* already_sent == 0 means root has not send data (or header) to the node yet;
+     will_send == 1 means the announcement of the node has been received */
+  for (i = 0; i < max_node; i++) {
+    already_sent[i] = 0;
+    will_send[i] = 0;
+  }
+
+  /* when a message is smaller than a block size => a single segment */
+  if (count <= segment) {
+    segment = count;
+    pipe_length = 1;
+  }
+
+  /* start pipeline bcast */
+
+  send_request_array =
+      (MPI_Request *) malloc((size + pipe_length) * sizeof(MPI_Request));
+  recv_request_array =
+      (MPI_Request *) malloc((size + pipe_length) * sizeof(MPI_Request));
+  send_status_array =
+      (MPI_Status *) malloc((size + pipe_length) * sizeof(MPI_Status));
+  recv_status_array =
+      (MPI_Status *) malloc((size + pipe_length) * sizeof(MPI_Status));
+
+  /* root */
+  if (rank == 0) {
+    sent_count = 0;
+
+    while (sent_count < (size - 1)) {
+
+      /* loop k times to let more processes arrive before start sending data */
+      for (k = 0; k < 3; k++) {
+        for (i = 1; i < size; i++) {
+          if ((already_sent[i] == 0) && (will_send[i] == 0)) {
+            MPI_Iprobe(i, MPI_ANY_TAG, comm, &flag_array[i],
+                       &temp_status_array[i]);
+            if (flag_array[i] == 1) {
+              will_send[i] = 1;
+              MPI_Recv(&temp_buf[i], 1, MPI_CHAR, i, tag, comm, &status);
+              /* restart the scan: the loop increment brings i back to 1 */
+              i = 0;
+            }
+          }
+        }
+      }
+
+      header_index = 0;
+
+      /* build the list of ranks whose announcement has been received */
+      for (i = 1; i < size; i++) {
+        if ((will_send[i] == 1) && (already_sent[i] == 0)) {
+          header_buf[header_index] = i;
+          header_index++;
+          sent_count++;
+
+          /* will send in the next step */
+          already_sent[i] = 1;
+        }
+      }
+
+      /* send header followed by data; when nobody announced itself during
+         this round, simply probe again */
+      if (header_index != 0) {
+        header_buf[header_index] = -1;
+        to = header_buf[0];
+
+        /* send header */
+        MPI_Send(header_buf, header_size, MPI_INT, to, tag, comm);
+
+        /* send data - pipeline */
+        for (i = 0; i < pipe_length; i++) {
+          MPI_Isend((char *) buf + (i * increment), segment, datatype, to,
+                    tag, comm, &send_request_array[i]);
+        }
+        MPI_Waitall((pipe_length), send_request_array, send_status_array);
+      }
+    }                           /* end - while (send_count < size-1) loop */
+  }
+
+  /* end - root */
+  /* none root */
+  else {
+
+    /* send 1-byte message to root */
+    MPI_Send(temp_buf, 1, MPI_CHAR, 0, tag, comm);
+
+    /* wait for header forward when required */
+    MPI_Irecv(header_buf, header_size, MPI_INT, MPI_ANY_SOURCE, tag, comm,
+              &request);
+    MPI_Wait(&request, MPI_STATUS_IGNORE);
+
+    /* search for where it is */
+    int myordering = 0;
+    while (rank != header_buf[myordering]) {
+      myordering++;
+    }
+
+    /* successor and predecessor in the list; the first entry is fed by rank 0 */
+    to = header_buf[myordering + 1];
+    if (myordering == 0) {
+      from = 0;
+    } else {
+      from = header_buf[myordering - 1];
+    }
+
+    /* send header when required */
+    if (to != -1) {
+      MPI_Send(header_buf, header_size, MPI_INT, to, tag, comm);
+    }
+
+    /* receive data */
+    for (i = 0; i < pipe_length; i++) {
+      MPI_Irecv((char *) buf + (i * increment), segment, datatype, from, tag,
+                comm, &recv_request_array[i]);
+    }
+
+    /* forward data */
+    if (to != -1) {
+      for (i = 0; i < pipe_length; i++) {
+        MPI_Wait(&recv_request_array[i], MPI_STATUS_IGNORE);
+        MPI_Isend((char *) buf + (i * increment), segment, datatype, to, tag,
+                  comm, &send_request_array[i]);
+      }
+      MPI_Waitall((pipe_length), send_request_array, send_status_array);
+    }
+
+    /* recv only */
+    else {
+      MPI_Waitall((pipe_length), recv_request_array, recv_status_array);
+    }
+  }
+
+  free(send_request_array);
+  free(recv_request_array);
+  free(send_status_array);
+  free(recv_status_array);
+  /* end pipeline */
+
+  /* when count is not divisible by block size, use default BCAST for the remainder */
+  if ((remainder != 0) && (count > segment)) {
+    MPI_Bcast((char *) buf + (pipe_length * increment), remainder, datatype,
+              root, comm);
+  }
+
+  return MPI_SUCCESS;
+}
--- /dev/null
+#include "colls.h"
+
+static int bcast_NTSL_segment_size_in_byte = 8192;  /* pipeline segment size in bytes; divided by the datatype extent to get the elements per segment */
+
+#define HEADER_SIZE 1024  /* number of ints in the header message (list of ranks terminated by -1) */
+#define MAX_NODE 1024  /* capacity of the per-rank work arrays declared in the function below */
+
+/* Non-topology-specific pipelined linear-bcast function */
+int smpi_coll_tuned_bcast_arrival_pattern_aware(void *buf, int count,
+ MPI_Datatype datatype, int root,
+ MPI_Comm comm)
+{
+ int tag = 50;
+ MPI_Status status;
+ MPI_Request request;
+ MPI_Request *send_request_array;
+ MPI_Request *recv_request_array;
+ MPI_Status *send_status_array;
+ MPI_Status *recv_status_array;
+
+ MPI_Status temp_status_array[MAX_NODE];
+
+ int rank, size;
+ int i, j;
+
+ int sent_count;
+ int header_index;
+ int flag_array[MAX_NODE];
+ int already_sent[MAX_NODE];
+
+ int header_buf[HEADER_SIZE];
+ char temp_buf[MAX_NODE];
+
+ MPI_Aint extent;
+ MPI_Type_extent(datatype, &extent);
+
+ /* destination */
+ int to;
+
+
+
+ MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+ MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+
+ /* segment is segment size in number of elements (not bytes) */
+ int segment = bcast_NTSL_segment_size_in_byte / extent;
+
+ /* pipeline length */
+ int pipe_length = count / segment;
+
+ /* use for buffer offset for sending and receiving data = segment size in byte */
+ int increment = segment * extent;
+
+ /* if the input size is not divisible by segment size =>
+ the small remainder will be done with native implementation */
+ int remainder = count % segment;
+
+ /* if root is not zero send to rank zero first
+ this can be modified to make it faster by using logical src, dst.
+ */
+ if (root != 0) {
+ if (rank == root) {
+ MPI_Send(buf, count, datatype, 0, tag, comm);
+ } else if (rank == 0) {
+ MPI_Recv(buf, count, datatype, root, tag, comm, &status);
+ }
+ }
+
+ /* value == 0 means root has not send data (or header) to the node yet */
+ for (i = 0; i < MAX_NODE; i++) {
+ already_sent[i] = 0;
+ }
+
+ /* when a message is smaller than a block size => no pipeline */
+ if (count <= segment) {
+ if (rank == 0) {
+ sent_count = 0;
+
+ while (sent_count < (size - 1)) {
+ for (i = 1; i < size; i++) {
+ MPI_Iprobe(i, MPI_ANY_TAG, MPI_COMM_WORLD, &flag_array[i],
+ MPI_STATUSES_IGNORE);
+ }
+
+ header_index = 0;
+ /* recv 1-byte message */
+ for (i = 1; i < size; i++) {
+
+ /* message arrive */
+ if ((flag_array[i] == 1) && (already_sent[i] == 0)) {
+ MPI_Recv(temp_buf, 1, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status);
+ header_buf[header_index] = i;
+ header_index++;
+ sent_count++;
+
+ /* will send in the next step */
+ already_sent[i] = 1;
+ }
+ }
+
+ /* send header followed by data */
+ if (header_index != 0) {
+ header_buf[header_index] = -1;
+ to = header_buf[0];
+ MPI_Send(header_buf, HEADER_SIZE, MPI_INT, to, tag, comm);
+ MPI_Send(buf, count, datatype, to, tag, comm);
+ }
+
+ /* randomly MPI_Send to one */
+ else {
+ /* search for the first node that never received data before */
+ for (i = 1; i < size; i++) {
+ if (already_sent[i] == 0) {
+ header_buf[0] = i;
+ header_buf[1] = -1;
+ MPI_Send(header_buf, HEADER_SIZE, MPI_INT, i, tag, comm);
+ MPI_Send(buf, count, datatype, i, tag, comm);
+ already_sent[i] = 1;
+ sent_count++;
+ break;
+ }
+ }
+ }
+
+
+ } /* while loop */
+ }
+
+ /* non-root */
+ else {
+
+ /* send 1-byte message to root */
+ MPI_Send(temp_buf, 1, MPI_CHAR, 0, tag, comm);
+
+ /* wait for header and data, forward when required */
+ MPI_Recv(header_buf, HEADER_SIZE, MPI_INT, MPI_ANY_SOURCE, tag, comm,
+ &status);
+ MPI_Recv(buf, count, datatype, MPI_ANY_SOURCE, tag, comm, &status);
+
+ /* search for where it is */
+ int myordering = 0;
+ while (rank != header_buf[myordering]) {
+ myordering++;
+ }
+
+ /* send header followed by data */
+ if (header_buf[myordering + 1] != -1) {
+ MPI_Send(header_buf, HEADER_SIZE, MPI_INT, header_buf[myordering + 1],
+ tag, comm);
+ MPI_Send(buf, count, datatype, header_buf[myordering + 1], tag, comm);
+ }
+ }
+ }
+ /* pipeline bcast */
+ else {
+ send_request_array =
+ (MPI_Request *) malloc((size + pipe_length) * sizeof(MPI_Request));
+ recv_request_array =
+ (MPI_Request *) malloc((size + pipe_length) * sizeof(MPI_Request));
+ send_status_array =
+ (MPI_Status *) malloc((size + pipe_length) * sizeof(MPI_Status));
+ recv_status_array =
+ (MPI_Status *) malloc((size + pipe_length) * sizeof(MPI_Status));
+
+ if (rank == 0) {
+ //double start2 = MPI_Wtime();
+ sent_count = 0;
+ //int iteration = 0;
+ while (sent_count < (size - 1)) {
+ //iteration++;
+ //start = MPI_Wtime();
+ for (i = 1; i < size; i++) {
+ MPI_Iprobe(i, MPI_ANY_TAG, MPI_COMM_WORLD, &flag_array[i],
+ &temp_status_array[i]);
+ }
+ //total = MPI_Wtime() - start;
+ //total *= 1000;
+ //printf("Iprobe time = %.2f\n",total);
+ header_index = 0;
+
+ MPI_Wtime();
+ /* recv 1-byte message */
+ for (i = 1; i < size; i++) {
+ /* message arrive */
+ if ((flag_array[i] == 1) && (already_sent[i] == 0)) {
+ MPI_Recv(&temp_buf[i], 1, MPI_CHAR, i, tag, MPI_COMM_WORLD,
+ &status);
+ header_buf[header_index] = i;
+ header_index++;
+ sent_count++;
+
+ /* will send in the next step */
+ already_sent[i] = 1;
+ }
+ }
+ //total = MPI_Wtime() - start;
+ //total *= 1000;
+ //printf("Recv 1-byte time = %.2f\n",total);
+
+ /*
+ if (header_index != 0) {
+ printf("header index = %d node = ",header_index);
+ for (i=0;i<header_index;i++) {
+ printf("%d ",header_buf[i]);
+ }
+ printf("\n");
+ }
+ */
+
+ /* send header followed by data */
+ if (header_index != 0) {
+ header_buf[header_index] = -1;
+ to = header_buf[0];
+
+ //start = MPI_Wtime();
+
+ /* send header */
+ MPI_Send(header_buf, HEADER_SIZE, MPI_INT, to, tag, comm);
+
+ //total = MPI_Wtime() - start;
+ //total *= 1000;
+ //printf("\tSend header to %d time = %.2f\n",to,total);
+
+ //start = MPI_Wtime();
+
+ /* send data - non-pipeline case */
+
+ if (0 == 1) {
+ //if (header_index == 1) {
+ MPI_Send(buf, count, datatype, to, tag, comm);
+ }
+
+
+ /* send data - pipeline */
+ else {
+ for (i = 0; i < pipe_length; i++) {
+ MPI_Send((char *)buf + (i * increment), segment, datatype, to, tag, comm);
+ }
+ //MPI_Waitall((pipe_length), send_request_array, send_status_array);
+ }
+ //total = MPI_Wtime() - start;
+ //total *= 1000;
+ //printf("\tSend data to %d time = %.2f\n",to,total);
+
+ }
+
+
+
+ /* randomly MPI_Send to one node */
+ else {
+ /* search for the first node that never received data before */
+ for (i = 1; i < size; i++) {
+ if (already_sent[i] == 0) {
+ header_buf[0] = i;
+ header_buf[1] = -1;
+ to = i;
+
+ //start = MPI_Wtime();
+ MPI_Send(header_buf, HEADER_SIZE, MPI_INT, to, tag, comm);
+
+ /* still need to chop data so that we can use the same non-root code */
+ for (j = 0; j < pipe_length; j++) {
+ MPI_Send((char *)buf + (j * increment), segment, datatype, to, tag,
+ comm);
+ }
+
+ //MPI_Send(buf,count,datatype,to,tag,comm);
+ //MPI_Wait(&request,MPI_STATUS_IGNORE);
+
+ //total = MPI_Wtime() - start;
+ //total *= 1000;
+ //printf("SEND TO SINGLE node %d time = %.2f\n",i,total);
+
+
+ already_sent[i] = 1;
+ sent_count++;
+ break;
+ }
+ }
+ }
+
+ } /* while loop */
+
+ //total = MPI_Wtime() - start2;
+ //total *= 1000;
+ //printf("Node zero iter = %d time = %.2f\n",iteration,total);
+ }
+
+ /* rank 0 */
+ /* none root */
+ else {
+ /* send 1-byte message to root */
+ MPI_Send(temp_buf, 1, MPI_CHAR, 0, tag, comm);
+
+ /* wait for header forward when required */
+ MPI_Irecv(header_buf, HEADER_SIZE, MPI_INT, MPI_ANY_SOURCE, tag, comm,
+ &request);
+ MPI_Wait(&request, MPI_STATUS_IGNORE);
+
+ /* search for where it is */
+ int myordering = 0;
+ while (rank != header_buf[myordering]) {
+ myordering++;
+ }
+
+ /* send header when required */
+ if (header_buf[myordering + 1] != -1) {
+ MPI_Send(header_buf, HEADER_SIZE, MPI_INT, header_buf[myordering + 1],
+ tag, comm);
+ }
+
+ /* receive data */
+
+ if (0 == -1) {
+ //if (header_buf[1] == -1) {
+ MPI_Irecv(buf, count, datatype, 0, tag, comm, &request);
+ MPI_Wait(&request, MPI_STATUS_IGNORE);
+ //printf("\t\tnode %d ordering = %d receive data from root\n",rank,myordering);
+ } else {
+ for (i = 0; i < pipe_length; i++) {
+ MPI_Irecv((char *)buf + (i * increment), segment, datatype, MPI_ANY_SOURCE,
+ tag, comm, &recv_request_array[i]);
+ }
+ }
+
+ /* send data */
+ if (header_buf[myordering + 1] != -1) {
+ for (i = 0; i < pipe_length; i++) {
+ MPI_Wait(&recv_request_array[i], MPI_STATUS_IGNORE);
+ MPI_Isend((char *)buf + (i * increment), segment, datatype,
+ header_buf[myordering + 1], tag, comm,
+ &send_request_array[i]);
+ }
+ MPI_Waitall((pipe_length), send_request_array, send_status_array);
+ }
+
+ }
+
+ free(send_request_array);
+ free(recv_request_array);
+ free(send_status_array);
+ free(recv_status_array);
+ } /* end pipeline */
+
+ /* when count is not divisible by block size, use default BCAST for the remainder */
+ if ((remainder != 0) && (count > segment)) {
+ MPI_Bcast((char *)buf + (pipe_length * increment), remainder, datatype, root, comm);
+ }
+
+ return MPI_SUCCESS;
+}
--- /dev/null
+#include "colls.h"
+
+#ifndef BCAST_ARRIVAL_PATTERN_AWARE_HEADER_SIZE
+#define BCAST_ARRIVAL_PATTERN_AWARE_HEADER_SIZE 128
+#endif
+
+#ifndef BCAST_ARRIVAL_PATTERN_AWARE_MAX_NODE
+#define BCAST_ARRIVAL_PATTERN_AWARE_MAX_NODE 128
+#endif
+
+/* Arrival-pattern-aware broadcast: scatter from rank 0 followed by a ring
+ * allgather among the ranks that arrived together.
+ *
+ * Rank 0 coordinates the operation (when root != 0 the root first ships the
+ * whole buffer to rank 0).  Every other rank announces its arrival with a
+ * 1-byte message.  Rank 0 collects the ranks that have arrived so far into a
+ * -1 terminated header, sends that header to each of them, and splits the
+ * buffer into one chunk per listed rank (the last chunk absorbs the
+ * remainder).  The listed ranks then pass the chunks around a ring until each
+ * of them holds the complete buffer.
+ *
+ * buf      - data to broadcast on the root, receive buffer elsewhere
+ * count    - number of elements of datatype in buf
+ * datatype - element type
+ * root     - rank (in comm) that owns the data
+ * comm     - communicator to broadcast over
+ *
+ * Returns MPI_SUCCESS, or the result of MPI_Bcast when the call is delegated
+ * (fewer elements than ranks, or more ranks than the fixed-size bookkeeping
+ * arrays can describe).
+ */
+int smpi_coll_tuned_bcast_arrival_scatter(void *buf, int count,
+                                          MPI_Datatype datatype, int root,
+                                          MPI_Comm comm)
+{
+  int tag = 50;
+  int header_tag = 10;
+  MPI_Status status;
+
+  int curr_remainder;
+  int curr_size;
+  int curr_increment;
+  int send_offset;
+  int recv_offset;
+  int send_count;
+  int recv_count;
+
+  MPI_Status temp_status_array[BCAST_ARRIVAL_PATTERN_AWARE_MAX_NODE];
+
+  int rank, size;
+  int i, k;
+
+  int sent_count;
+  int header_index;
+  int flag_array[BCAST_ARRIVAL_PATTERN_AWARE_MAX_NODE];
+  int already_sent[BCAST_ARRIVAL_PATTERN_AWARE_MAX_NODE];
+  int header_buf[BCAST_ARRIVAL_PATTERN_AWARE_HEADER_SIZE];
+  char temp_buf[BCAST_ARRIVAL_PATTERN_AWARE_MAX_NODE];
+  int will_send[BCAST_ARRIVAL_PATTERN_AWARE_MAX_NODE];
+  int max_node = BCAST_ARRIVAL_PATTERN_AWARE_MAX_NODE;
+  int header_size = BCAST_ARRIVAL_PATTERN_AWARE_HEADER_SIZE;
+
+  MPI_Aint extent;
+  MPI_Type_extent(datatype, &extent);
+
+  /* source and destination */
+  int to, from;
+
+  /* rank and size are relative to comm: all the point-to-point traffic
+     below is addressed with ranks of comm */
+  MPI_Comm_rank(comm, &rank);
+  MPI_Comm_size(comm, &size);
+
+  /* message too small to give every rank a chunk, or too many ranks for the
+     fixed-size arrays (indexed by rank) and the header (one slot per
+     non-zero rank plus the -1 terminator): use the default broadcast.  The
+     test depends only on values that are identical on every rank, so all
+     ranks take the same path. */
+  if ((count < size) || (size > max_node) || (size > header_size)) {
+    return MPI_Bcast(buf, count, datatype, root, comm);
+  }
+
+  /* if root is not zero send to rank zero first
+     this can be modified to make it faster by using logical src, dst.
+   */
+  if (root != 0) {
+    if (rank == root) {
+      MPI_Send(buf, count, datatype, 0, tag - 1, comm);
+    } else if (rank == 0) {
+      MPI_Recv(buf, count, datatype, root, tag - 1, comm, &status);
+    }
+  }
+
+  /* value == 0 means root has not sent data (or header) to the node yet */
+  for (i = 0; i < max_node; i++) {
+    already_sent[i] = 0;
+  }
+
+  /* start bcast */
+
+  /* coordinator (rank 0) */
+  if (rank == 0) {
+
+    for (i = 0; i < max_node; i++)
+      will_send[i] = 0;
+
+    sent_count = 0;
+    while (sent_count < (size - 1)) {
+
+      /* up to three probing passes to collect the ranks that have arrived;
+         the arrival message is sent on comm, so it is probed and received
+         on comm as well */
+      for (k = 0; k < 3; k++) {
+        for (i = 1; i < size; i++) {
+          if ((already_sent[i] == 0) && (will_send[i] == 0)) {
+            MPI_Iprobe(i, MPI_ANY_TAG, comm, &flag_array[i],
+                       &temp_status_array[i]);
+            if (flag_array[i] == 1) {
+              will_send[i] = 1;
+              MPI_Recv(&temp_buf[i], 1, MPI_CHAR, i, tag, comm, &status);
+              /* restart the scan at rank 1 (the loop increment follows) */
+              i = 0;
+            }
+          }
+        }
+      }
+      header_index = 0;
+
+      /* list the ranks whose 1-byte message arrived in this round */
+      for (i = 1; i < size; i++) {
+        /* message arrived */
+        if ((will_send[i] == 1) && (already_sent[i] == 0)) {
+          header_buf[header_index] = i;
+          header_index++;
+          sent_count++;
+
+          /* will send in the next step */
+          already_sent[i] = 1;
+        }
+      }
+
+      /* send header followed by data */
+      if (header_index != 0) {
+        header_buf[header_index] = -1;
+
+        /* send header */
+        for (i = 0; i < header_index; i++) {
+          to = header_buf[i];
+          MPI_Send(header_buf, header_size, MPI_INT, to, header_tag, comm);
+        }
+
+        curr_remainder = count % header_index;
+        curr_size = (count / header_index);
+        curr_increment = curr_size * extent;
+
+        /* send data: chunk i goes to the i-th rank of the header, the last
+           chunk also carries the remainder */
+        for (i = 0; i < header_index; i++) {
+          to = header_buf[i];
+          if ((i == (header_index - 1)) || (curr_size == 0))
+            curr_size += curr_remainder;
+          MPI_Send((char *) buf + (i * curr_increment), curr_size, datatype, to,
+                   tag, comm);
+        }
+      }
+    }                           /* while (sent_count < size-1) */
+  }
+
+  /* every rank other than 0 */
+  else {
+    /* send 1-byte message to rank 0 */
+    MPI_Send(temp_buf, 1, MPI_CHAR, 0, tag, comm);
+
+    /* wait for the header describing this round's set of ranks */
+    MPI_Recv(header_buf, header_size, MPI_INT, 0, header_tag, comm, &status);
+
+    /* search for where it is */
+    int myordering = 0;
+    while (rank != header_buf[myordering]) {
+      myordering++;
+    }
+
+    int total_nodes = 0;
+    while (header_buf[total_nodes] != -1) {
+      total_nodes++;
+    }
+
+    curr_remainder = count % total_nodes;
+    curr_size = (count / total_nodes);
+    curr_increment = curr_size * extent;
+    int recv_size = curr_size;
+
+    /* receive data */
+    if (myordering == (total_nodes - 1))
+      recv_size += curr_remainder;
+    MPI_Recv((char *) buf + (myordering * curr_increment), recv_size, datatype,
+             0, tag, comm, &status);
+
+    /* at this point all nodes in this set perform all-gather operation;
+       ring neighbours wrap around the ends of the header, and the wrap is
+       decided before indexing so header_buf[-1] is never read */
+    if (myordering == (total_nodes - 1))
+      to = header_buf[0];
+    else
+      to = header_buf[myordering + 1];
+
+    if (myordering == 0)
+      from = header_buf[total_nodes - 1];
+    else
+      from = header_buf[myordering - 1];
+
+    /* last segment may have a larger size since it also include the remainder */
+    int last_segment_ptr = (total_nodes - 1) * (count / total_nodes) * extent;
+
+    /* allgather */
+    for (i = 0; i < total_nodes - 1; i++) {
+      send_offset =
+          ((myordering - i + total_nodes) % total_nodes) * curr_increment;
+      recv_offset =
+          ((myordering - i - 1 + total_nodes) % total_nodes) * curr_increment;
+
+      /* adjust size */
+      if (send_offset != last_segment_ptr)
+        send_count = curr_size;
+      else
+        send_count = curr_size + curr_remainder;
+
+      if (recv_offset != last_segment_ptr)
+        recv_count = curr_size;
+      else
+        recv_count = curr_size + curr_remainder;
+
+      MPI_Sendrecv((char *) buf + send_offset, send_count, datatype, to,
+                   tag + i, (char *) buf + recv_offset, recv_count, datatype,
+                   from, tag + i, comm, &status);
+    }
+  }                             /* rank != 0 */
+
+  return MPI_SUCCESS;
+}
--- /dev/null
+#include "colls.h"
+
+/*****************************************************************************
+
+Copyright (c) 2006, Ahmad Faraj & Xin Yuan,
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+
+ * Neither the name of the Florida State University nor the names of its
+ contributors may be used to endorse or promote products derived from this
+ software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ *************************************************************************
+ * Any results obtained from executing this software require the *
+ * acknowledgment and citation of the software and its owners. *
+ * The full citation is given below: *
+ * *
+ * A. Faraj and X. Yuan. "Automatic Generation and Tuning of MPI *
+ * Collective Communication Routines." The 19th ACM International *
+ * Conference on Supercomputing (ICS), Cambridge, Massachusetts, *
+ * June 20-22, 2005. *
+ *************************************************************************
+
+*****************************************************************************/
+
+/*****************************************************************************
+
+ * Function: bcast_binomial_tree
+
+ * Return: int
+
+ * Inputs:
+ buff: send input buffer
+ count: number of elements to send
+ data_type: data type of elements being sent
+ root: source of data
+ comm: communicator
+
+ * Descrp: broadcasts using a binomial tree.
+
+ * Author: MPICH / modified by Ahmad Faraj
+
+ ****************************************************************************/
+
+int
+smpi_coll_tuned_bcast_binomial_tree(void *buff, int count,
+                                    MPI_Datatype data_type, int root,
+                                    MPI_Comm comm)
+{
+  /* Binomial-tree broadcast: ranks are renumbered so that root becomes 0;
+     a rank receives from the peer that differs in its lowest set bit and
+     then forwards to the peers reachable through every lower bit. */
+  const int msg_tag = 1;
+  int me, nprocs, vrank, bit;
+
+  MPI_Comm_rank(comm, &me);
+  MPI_Comm_size(comm, &nprocs);
+
+  /* position of this rank relative to root, wrapped into [0, nprocs) */
+  vrank = me - root;
+  if (me < root)
+    vrank += nprocs;
+
+  /* receive phase: stop at the lowest set bit of vrank (root has none) */
+  for (bit = 0x1; bit < nprocs; bit <<= 1) {
+    if (vrank & bit) {
+      int parent = me - bit;
+      if (parent < 0)
+        parent += nprocs;
+      MPI_Recv(buff, count, data_type, parent, msg_tag, comm,
+               MPI_STATUS_IGNORE);
+      break;
+    }
+  }
+
+  /* send phase: walk the bits below the one found above, highest first */
+  for (bit >>= 1; bit > 0; bit >>= 1) {
+    if (vrank + bit < nprocs) {
+      int child = me + bit;
+      if (child >= nprocs)
+        child -= nprocs;
+      MPI_Send(buff, count, data_type, child, msg_tag, comm);
+    }
+  }
+
+  return 0;
+}
--- /dev/null
+#include "colls.h"
+
+int flattree_segment_in_byte = 8192;
+
+/* Flat-tree pipelined broadcast: the root sends the buffer directly to every
+ * other rank, cut into segments of about flattree_segment_in_byte bytes.
+ *
+ * buff      - data to broadcast on the root, receive buffer elsewhere
+ * count     - number of elements of data_type in buff
+ * data_type - element type
+ * root      - rank (in comm) that owns the data
+ * comm      - communicator to broadcast over
+ *
+ * The buffer is transferred as count / segment full segments followed by one
+ * shorter message holding the count % segment leftover elements, so the
+ * whole buffer is delivered whatever the value of count.
+ *
+ * Returns MPI_SUCCESS.
+ */
+int
+smpi_coll_tuned_bcast_flattree_pipeline(void *buff, int count,
+                                        MPI_Datatype data_type, int root,
+                                        MPI_Comm comm)
+{
+  int i, j, rank, num_procs;
+  int tag = 1;
+  int segment, pipe_length, remainder, num_msgs;
+  MPI_Aint extent, increment;
+  MPI_Request *request_array;
+
+  MPI_Type_extent(data_type, &extent);
+  MPI_Comm_rank(comm, &rank);
+  MPI_Comm_size(comm, &num_procs);
+
+  /* elements per segment; at least one, so that an element larger than the
+     segment size (or a zero extent) cannot cause a division by zero */
+  segment = (extent > 0) ? (int) (flattree_segment_in_byte / extent) : count;
+  if (segment < 1)
+    segment = 1;
+
+  pipe_length = count / segment;
+  remainder = count % segment;
+  /* full segments plus one trailing message for the leftover elements */
+  num_msgs = pipe_length + ((remainder != 0) ? 1 : 0);
+  /* byte distance between the starts of two consecutive segments */
+  increment = (MPI_Aint) segment * extent;
+
+  if (rank != root) {
+    request_array = (MPI_Request *) malloc(num_msgs * sizeof(MPI_Request));
+
+    if (request_array == NULL) {
+      /* no request storage (allocation failed, or there is nothing to
+         receive): fall back to blocking receives in the same order */
+      for (i = 0; i < num_msgs; i++) {
+        MPI_Recv((char *) buff + (i * increment),
+                 (i < pipe_length) ? segment : remainder, data_type, root,
+                 tag, comm, MPI_STATUS_IGNORE);
+      }
+    } else {
+      /* pre-post every receive; messages from the root with the same tag
+         are matched in the order they were sent */
+      for (i = 0; i < num_msgs; i++) {
+        MPI_Irecv((char *) buff + (i * increment),
+                  (i < pipe_length) ? segment : remainder, data_type, root,
+                  tag, comm, &request_array[i]);
+      }
+      MPI_Waitall(num_msgs, request_array, MPI_STATUSES_IGNORE);
+      free(request_array);
+    }
+  }
+
+  else {
+    // Root sends data to all others
+    for (j = 0; j < num_procs; j++) {
+      if (j == rank)
+        continue;
+      for (i = 0; i < num_msgs; i++) {
+        MPI_Send((char *) buff + (i * increment),
+                 (i < pipe_length) ? segment : remainder, data_type, j, tag,
+                 comm);
+      }
+    }
+  }
+
+  return MPI_SUCCESS;
+}
--- /dev/null
+#include "colls.h"
+
+int
+smpi_coll_tuned_bcast_flattree(void *buff, int count, MPI_Datatype data_type,
+                               int root, MPI_Comm comm)
+{
+  /* Flat-tree broadcast: the root posts one non-blocking send of the whole
+     buffer to every other rank and waits for all of them; every other rank
+     performs a single blocking receive from the root. */
+  const int msg_tag = 1;
+  int me, nprocs, peer, posted;
+  MPI_Request *pending;
+
+  MPI_Comm_rank(comm, &me);
+  MPI_Comm_size(comm, &nprocs);
+
+  /* receivers are done after one message */
+  if (me != root) {
+    MPI_Recv(buff, count, data_type, root, msg_tag, comm, MPI_STATUS_IGNORE);
+    return MPI_SUCCESS;
+  }
+
+  /* one request slot per destination */
+  pending = (MPI_Request *) malloc((nprocs - 1) * sizeof(MPI_Request));
+
+  posted = 0;
+  for (peer = 0; peer < nprocs; peer++) {
+    if (peer != me) {
+      MPI_Isend(buff, count, data_type, peer, msg_tag, comm,
+                &pending[posted]);
+      posted++;
+    }
+  }
+
+  /* block until every send has completed */
+  MPI_Waitall(nprocs - 1, pending, MPI_STATUSES_IGNORE);
+
+  free(pending);
+  return MPI_SUCCESS;
+}
--- /dev/null
+#include "colls.h"
+
+/*****************************************************************************
+
+Copyright (c) 2006, Ahmad Faraj & Xin Yuan,
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+
+ * Neither the name of the Florida State University nor the names of its
+ contributors may be used to endorse or promote products derived from this
+ software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ *************************************************************************
+ * Any results obtained from executing this software require the *
+ * acknowledgment and citation of the software and its owners. *
+ * The full citation is given below: *
+ * *
+ * A. Faraj and X. Yuan. "Automatic Generation and Tuning of MPI *
+ * Collective Communication Routines." The 19th ACM International *
+ * Conference on Supercomputing (ICS), Cambridge, Massachusetts, *
+ * June 20-22, 2005. *
+ *************************************************************************
+
+*****************************************************************************/
+
+/*****************************************************************************
+
+ * Function: bcast_scatter_LR_allgather
+
+ * Return: int
+
+ * Inputs:
+ buff: send input buffer
+ count: number of elements to send
+ data_type: data type of elements being sent
+ root: source of data
+ comm: communicator
+
+ * Descrp: broadcasts using a scatter followed by LR allgather.
+
+ * Author: MPICH / modified by Ahmad Faraj
+
+ ****************************************************************************/
+int
+smpi_coll_tuned_bcast_scatter_LR_allgather(void *buff, int count,
+ MPI_Datatype data_type, int root,
+ MPI_Comm comm)
+{ /* Broadcast count elements of data_type from root in two phases:
+   *   1. scatter: the buffer, viewed as nbytes raw bytes, is spread down a
+   *      binomial tree so that the rank at relative position p ends up with
+   *      the scatter_size-byte chunk starting at p * scatter_size;
+   *   2. allgather: the chunks travel around a ring (receive from left,
+   *      send to right) for num_procs - 1 steps.
+   * Returns 0.  Prints a message, calls MPI_Finalize and exits with status 1
+   * if the allgather bookkeeping arrays cannot be allocated. */
+ MPI_Aint extent;
+ MPI_Status status;
+ int i, src, dst, rank, num_procs;
+ int mask, relative_rank, curr_size, recv_size, send_size, nbytes;
+ int scatter_size, left, right, next_src, *recv_counts, *disps;
+ int tag = 1, success = 0, failure = 1;
+
+ MPI_Comm_rank(comm, &rank);
+ MPI_Comm_size(comm, &num_procs);
+ MPI_Type_extent(data_type, &extent);
+
+
+ nbytes = extent * count; /* payload size in bytes */
+ scatter_size = (nbytes + num_procs - 1) / num_procs; // ceiling division
+ curr_size = (rank == root) ? nbytes : 0; // root starts with all the data
+ relative_rank = (rank >= root) ? rank - root : rank - root + num_procs; /* rank renumbered so that root is 0 */
+
+ mask = 0x1; /* scatter, receive side: stop at the lowest set bit of relative_rank */
+ while (mask < num_procs) {
+ if (relative_rank & mask) {
+ src = rank - mask; /* parent is mask ranks below, wrapping around */
+ if (src < 0)
+ src += num_procs;
+ recv_size = nbytes - relative_rank * scatter_size;
+ // recv_size is larger than what might actually be sent by the
+ // sender. We don't need compute the exact value because MPI
+ // allows you to post a larger recv.
+ if (recv_size <= 0)
+ curr_size = 0; // this process doesn't receive any data
+ // because of uneven division
+ else {
+ MPI_Recv((char *) buff + relative_rank * scatter_size, recv_size,
+ MPI_BYTE, src, tag, comm, &status);
+ MPI_Get_count(&status, MPI_BYTE, &curr_size); /* bytes actually received */
+ }
+ break;
+ }
+ mask <<= 1;
+ }
+
+ // This process is responsible for all processes that have bits
+ // set from the LSB upto (but not including) mask. Because of
+ // the "not including", we start by shifting mask back down
+ // one.
+
+ mask >>= 1;
+ while (mask > 0) {
+ if (relative_rank + mask < num_procs) {
+ send_size = curr_size - scatter_size * mask; /* keep the first mask chunks, hand over the rest */
+ // mask is also the size of this process's subtree
+
+ if (send_size > 0) {
+ dst = rank + mask;
+ if (dst >= num_procs)
+ dst -= num_procs;
+ MPI_Send((char *) buff + scatter_size * (relative_rank + mask),
+ send_size, MPI_BYTE, dst, tag, comm);
+
+ curr_size -= send_size;
+ }
+ }
+ mask >>= 1;
+ }
+
+ // done scatter now do allgather
+ recv_counts = (int *) malloc(sizeof(int) * num_procs); /* byte count of each chunk */
+ if (!recv_counts) {
+ printf("bcast-scatter-LR-allgather:95: cannot allocate memory\n");
+ MPI_Finalize();
+ exit(failure);
+ }
+
+ disps = (int *) malloc(sizeof(int) * num_procs); /* byte offset of each chunk in buff */
+ if (!disps) {
+ printf("bcast-scatter-LR-allgather:103: cannot allocate memory\n");
+ MPI_Finalize();
+ exit(failure);
+ }
+
+ for (i = 0; i < num_procs; i++) {
+ recv_counts[i] = nbytes - i * scatter_size; /* bytes from chunk i to the end, clamped to [0, scatter_size] below */
+ if (recv_counts[i] > scatter_size)
+ recv_counts[i] = scatter_size;
+ if (recv_counts[i] < 0)
+ recv_counts[i] = 0;
+ }
+
+ disps[0] = 0; /* offsets are the running sum of the counts */
+ for (i = 1; i < num_procs; i++)
+ disps[i] = disps[i - 1] + recv_counts[i - 1];
+
+ left = (num_procs + rank - 1) % num_procs; /* ring neighbour data is received from */
+ right = (rank + 1) % num_procs; /* ring neighbour data is sent to */
+
+ src = rank; /* rank whose chunk is forwarded in the next step */
+ next_src = left; /* rank whose chunk arrives in the next step */
+
+ for (i = 1; i < num_procs; i++) { /* chunk index of a rank r is (r - root + num_procs) % num_procs */
+ MPI_Sendrecv((char *) buff + disps[(src - root + num_procs) % num_procs],
+ recv_counts[(src - root + num_procs) % num_procs],
+ MPI_BYTE, right, tag,
+ (char *) buff +
+ disps[(next_src - root + num_procs) % num_procs],
+ recv_counts[(next_src - root + num_procs) % num_procs],
+ MPI_BYTE, left, tag, comm, &status);
+ src = next_src; /* the chunk just received is the one forwarded next */
+ next_src = (num_procs + next_src - 1) % num_procs;
+ }
+
+
+ free(recv_counts);
+ free(disps);
+
+ return success;
+}
--- /dev/null
+#include "colls.h"
+
+/*****************************************************************************
+
+Copyright (c) 2006, Ahmad Faraj & Xin Yuan,
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+
+ * Neither the name of the Florida State University nor the names of its
+ contributors may be used to endorse or promote products derived from this
+ software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ *************************************************************************
+ * Any results obtained from executing this software require the *
+ * acknowledgment and citation of the software and its owners. *
+ * The full citation is given below: *
+ * *
+ * A. Faraj and X. Yuan. "Automatic Generation and Tuning of MPI *
+ * Collective Communication Routines." The 19th ACM International *
+ * Conference on Supercomputing (ICS), Cambridge, Massachusetts, *
+ * June 20-22, 2005. *
+ *************************************************************************
+
+*****************************************************************************/
+
+/*****************************************************************************
+
+ * Function: bcast_scatter_rdb_allgather
+
+ * Return: int
+
+ * Inputs:
+ buff: send input buffer
+ count: number of elements to send
+ data_type: data type of elements being sent
+ root: source of data
+ comm: communicator
+
+ * Descrp: broadcasts using a scatter followed by rdb allgather.
+
+ * Author: MPICH / modified by Ahmad Faraj
+
+ ****************************************************************************/
+
+int
+smpi_coll_tuned_bcast_scatter_rdb_allgather(void *buff, int count, MPI_Datatype
+ data_type, int root, MPI_Comm comm)
+{ /* Broadcast count elements of data_type from root in two phases:
+   *   1. scatter: the buffer, viewed as nbytes raw bytes, is spread down a
+   *      binomial tree in chunks of scatter_size bytes;
+   *   2. allgather: the chunks are exchanged with partners at distance
+   *      mask = 1, 2, 4, ... (in root-relative numbering), with an extra
+   *      tree-shaped pass for the ranks left without a partner when
+   *      num_procs is not a power of two.
+   * Returns 0. */
+ MPI_Aint extent;
+ MPI_Status status;
+
+ int i, j, k, src, dst, rank, num_procs, send_offset, recv_offset;
+ int mask, relative_rank, curr_size, recv_size, send_size, nbytes;
+ int scatter_size, tree_root, relative_dst, dst_tree_root;
+ int my_tree_root, offset, tmp_mask, num_procs_completed;
+ int tag = 1, success = 0;
+
+ MPI_Comm_rank(comm, &rank);
+ MPI_Comm_size(comm, &num_procs);
+ MPI_Type_extent(data_type, &extent);
+
+ nbytes = extent * count; /* payload size in bytes */
+ scatter_size = (nbytes + num_procs - 1) / num_procs; // ceiling division
+ curr_size = (rank == root) ? nbytes : 0; // root starts with all the data
+ relative_rank = (rank >= root) ? rank - root : rank - root + num_procs; /* rank renumbered so that root is 0 */
+
+ mask = 0x1; /* scatter, receive side: stop at the lowest set bit of relative_rank */
+ while (mask < num_procs) {
+ if (relative_rank & mask) {
+ src = rank - mask; /* parent is mask ranks below, wrapping around */
+ if (src < 0)
+ src += num_procs;
+ recv_size = nbytes - relative_rank * scatter_size;
+ // recv_size is larger than what might actually be sent by the
+ // sender. We don't need compute the exact value because MPI
+ // allows you to post a larger recv.
+ if (recv_size <= 0)
+ curr_size = 0; // this process doesn't receive any data
+ // because of uneven division
+ else {
+ MPI_Recv((char *)buff + relative_rank * scatter_size, recv_size,
+ MPI_BYTE, src, tag, comm, &status);
+ MPI_Get_count(&status, MPI_BYTE, &curr_size); /* bytes actually received */
+ }
+ break;
+ }
+ mask <<= 1;
+ }
+
+ // This process is responsible for all processes that have bits
+ // set from the LSB upto (but not including) mask. Because of
+ // the "not including", we start by shifting mask back down
+ // one.
+
+ mask >>= 1;
+ while (mask > 0) {
+ if (relative_rank + mask < num_procs) {
+ send_size = curr_size - scatter_size * mask; /* keep the first mask chunks, hand over the rest */
+ // mask is also the size of this process's subtree
+
+ if (send_size > 0) {
+ dst = rank + mask;
+ if (dst >= num_procs)
+ dst -= num_procs;
+ MPI_Send((char *)buff + scatter_size * (relative_rank + mask),
+ send_size, MPI_BYTE, dst, tag, comm);
+
+ curr_size -= send_size;
+ }
+ }
+ mask >>= 1;
+ }
+
+ // done scatter now do allgather
+
+
+ mask = 0x1; /* partner distance, doubled after every step */
+ i = 0; /* step counter: mask == 1 << i */
+ while (mask < num_procs) {
+ relative_dst = relative_rank ^ mask; /* partner in root-relative numbering */
+
+ dst = (relative_dst + root) % num_procs; /* partner's rank in comm */
+
+ /* find offset into send and recv buffers.
+ zero out the least significant "i" bits of relative_rank and
+ relative_dst to find root of src and dst
+ subtrees. Use ranks of roots as index to send from
+ and recv into buffer */
+
+ dst_tree_root = relative_dst >> i;
+ dst_tree_root <<= i;
+
+ my_tree_root = relative_rank >> i;
+ my_tree_root <<= i;
+
+ send_offset = my_tree_root * scatter_size;
+ recv_offset = dst_tree_root * scatter_size;
+
+ if (relative_dst < num_procs) { /* the partner exists: swap everything gathered so far */
+ MPI_Sendrecv((char *)buff + send_offset, curr_size, MPI_BYTE, dst, tag,
+ (char *)buff + recv_offset, scatter_size * mask, MPI_BYTE, dst,
+ tag, comm, &status);
+ MPI_Get_count(&status, MPI_BYTE, &recv_size);
+ curr_size += recv_size;
+ }
+
+ /* if some processes in this process's subtree in this step
+ did not have any destination process to communicate with
+ because of non-power-of-two, we need to send them the
+ data that they would normally have received from those
+ processes. That is, the haves in this subtree must send to
+ the havenots. We use a logarithmic recursive-halfing algorithm
+ for this. */
+
+ if (dst_tree_root + mask > num_procs) {
+ num_procs_completed = num_procs - my_tree_root - mask;
+ /* num_procs_completed is the number of processes in this
+ subtree that have all the data. Send data to others
+ in a tree fashion. First find root of current tree
+ that is being divided into two. k is the number of
+ least-significant bits in this process's rank that
+ must be zeroed out to find the rank of the root */
+ j = mask;
+ k = 0;
+ while (j) {
+ j >>= 1;
+ k++;
+ }
+ k--; /* k == log2(mask) after this adjustment */
+
+ offset = scatter_size * (my_tree_root + mask); /* where the partner subtree's data lives in buff */
+ tmp_mask = mask >> 1;
+
+ while (tmp_mask) {
+ relative_dst = relative_rank ^ tmp_mask;
+ dst = (relative_dst + root) % num_procs;
+
+ tree_root = relative_rank >> k;
+ tree_root <<= k;
+
+ /* send only if this proc has data and destination
+ doesn't have data. */
+
+ if ((relative_dst > relative_rank)
+ && (relative_rank < tree_root + num_procs_completed)
+ && (relative_dst >= tree_root + num_procs_completed)) {
+ MPI_Send((char *)buff + offset, recv_size, MPI_BYTE, dst, tag, comm);
+
+ /* recv_size was set in the previous
+ receive. that's the amount of data to be
+ sent now.
+ NOTE(review): a rank that skipped the MPI_Sendrecv of this
+ step still holds an older recv_size (or none at all) --
+ confirm that such a rank can never reach this send. */
+ }
+ /* recv only if this proc. doesn't have data and sender
+ has data */
+ else if ((relative_dst < relative_rank)
+ && (relative_dst < tree_root + num_procs_completed)
+ && (relative_rank >= tree_root + num_procs_completed)) {
+
+ MPI_Recv((char *)buff + offset, scatter_size * num_procs_completed,
+ MPI_BYTE, dst, tag, comm, &status);
+
+ /* num_procs_completed is also equal to the no. of processes
+ whose data we don't have */
+ MPI_Get_count(&status, MPI_BYTE, &recv_size);
+ curr_size += recv_size;
+ }
+ tmp_mask >>= 1;
+ k--;
+ }
+ }
+ mask <<= 1;
+ i++;
+ }
+
+ return success;
+}
#ifndef SMPI_COLLS_H
#define SMPI_COLLS_H
+#include <math.h>
#include "smpi/mpi.h"
#include "xbt.h"
-int smpi_coll_tuned_alltoall_2dmesh(
- void * send_buff, int send_count, MPI_Datatype send_type,
- void * recv_buff, int recv_count, MPI_Datatype recv_type,
- MPI_Comm comm);
-int smpi_coll_tuned_alltoall_3dmesh(
- void * send_buff, int send_count, MPI_Datatype send_type,
- void * recv_buff, int recv_count, MPI_Datatype recv_type,
- MPI_Comm comm);
-/*int smpi_coll_tuned_alltoall_bruck(
- void * send_buff, int send_count, MPI_Datatype send_type,
- void * recv_buff, int recv_count, MPI_Datatype recv_type,
- MPI_Comm comm);*/
-int smpi_coll_tuned_alltoall_pair(
- void * send_buff, int send_count, MPI_Datatype send_type,
- void * recv_buff, int recv_count, MPI_Datatype recv_type,
- MPI_Comm comm);
-int smpi_coll_tuned_alltoall_pair_light_barrier(
- void * send_buff, int send_count, MPI_Datatype send_type,
- void * recv_buff, int recv_count, MPI_Datatype recv_type,
- MPI_Comm comm);
-int smpi_coll_tuned_alltoall_pair_mpi_barrier(
- void * send_buff, int send_count, MPI_Datatype send_type,
- void * recv_buff, int recv_count, MPI_Datatype recv_type,
- MPI_Comm comm);
-int smpi_coll_tuned_alltoall_pair_one_barrier(
- void * send_buff, int send_count, MPI_Datatype send_type,
- void * recv_buff, int recv_count, MPI_Datatype recv_type,
- MPI_Comm comm);
-int smpi_coll_tuned_alltoall_rdb(
- void * send_buff, int send_count, MPI_Datatype send_type,
- void * recv_buff, int recv_count, MPI_Datatype recv_type,
- MPI_Comm comm);
-int smpi_coll_tuned_alltoall_ring(
- void * send_buff, int send_count, MPI_Datatype send_type,
- void * recv_buff, int recv_count, MPI_Datatype recv_type,
- MPI_Comm comm);
-int smpi_coll_tuned_alltoall_ring_light_barrier(
- void * send_buff, int send_count, MPI_Datatype send_type,
- void * recv_buff, int recv_count, MPI_Datatype recv_type,
- MPI_Comm comm);
-int smpi_coll_tuned_alltoall_ring_mpi_barrier(
- void * send_buff, int send_count, MPI_Datatype send_type,
- void * recv_buff, int recv_count, MPI_Datatype recv_type,
- MPI_Comm comm);
-int smpi_coll_tuned_alltoall_ring_one_barrier(
- void * send_buff, int send_count, MPI_Datatype send_type,
- void * recv_buff, int recv_count, MPI_Datatype recv_type,
- MPI_Comm comm);
-int smpi_coll_tuned_alltoall_simple(
- void * send_buff, int send_count, MPI_Datatype send_type,
- void * recv_buff, int recv_count, MPI_Datatype recv_type,
- MPI_Comm comm);
-
-
-int smpi_coll_tuned_allgather_2dmesh(
- void * send_buff, int send_count, MPI_Datatype send_type,
- void * recv_buff, int recv_count, MPI_Datatype recv_type,
- MPI_Comm comm);
+/* Reduction helper shared by the collective implementations. It is only
+ * declared here; presumably it applies `op` to combine `src` into `target`
+ * over `*count` elements of type `*dtype` -- confirm against its definition. */
+void star_reduction(MPI_Op op, void *src, void *target, int *count, MPI_Datatype *dtype);
+
+/* Expands to a brace initializer made of: the stringified algorithm name, a
+ * "<cat> <name> collective" description string, and the implementing function
+ * smpi_coll_tuned_<cat>_<name>. */
+#define COLL_DESCRIPTION(cat, ret, args, name) \
+ {# name,\
+ # cat " " # name " collective",\
+ smpi_coll_tuned_ ## cat ## _ ## name}
+
+/* Expands to the prototype `ret smpi_coll_tuned_<cat>_<name>(<args>);`. */
+#define COLL_PROTO(cat, ret, args, name) \
+ ret smpi_coll_tuned_ ## cat ## _ ## name(COLL_UNPAREN args);
+/* Strips the parentheses that wrap a signature's parameter list. */
+#define COLL_UNPAREN(...) __VA_ARGS__
+
+/* Extra level of expansion: `sig` is one argument here, but it expands to
+ * "cat, ret, (args)" so that `action` ends up invoked with four arguments. */
+#define COLL_APPLY(action, sig, name) action(sig, name)
+/* Separators that can be passed as COLL_sep to the per-collective list macros. */
+#define COLL_COMMA ,
+#define COLL_NOsep
+/* Swallows its arguments; wrapping a list entry in it disables that entry. */
+#define COLL_NOTHING(...)
+
+
+/*************
+ * ALLGATHER *
+ *************/
+/* Signature shared by every allgather algorithm: category name, return type
+ * and parenthesized parameter list. */
+#define COLL_ALLGATHER_SIG allgather, int, \
+ (void *send_buff, int send_count, MPI_Datatype send_type, \
+ void *recv_buff, int recv_count, MPI_Datatype recv_type, \
+ MPI_Comm comm)
+
+/* List of allgather algorithms: applies `action` to each entry and puts
+ * COLL_sep between entries. The 2dmesh, 3dmesh and bruck entries are wrapped
+ * in COLL_NOTHING and therefore expand to nothing. */
+#define COLL_ALLGATHERS(action, COLL_sep) \
+COLL_NOTHING(COLL_APPLY(action, COLL_ALLGATHER_SIG, 2dmesh) COLL_sep) \
+COLL_NOTHING(COLL_APPLY(action, COLL_ALLGATHER_SIG, 3dmesh) COLL_sep) \
+COLL_NOTHING(COLL_APPLY(action, COLL_ALLGATHER_SIG, bruck) COLL_sep) \
+COLL_APPLY(action, COLL_ALLGATHER_SIG, GB) COLL_sep \
+COLL_APPLY(action, COLL_ALLGATHER_SIG, loosely_lr) COLL_sep \
+COLL_APPLY(action, COLL_ALLGATHER_SIG, lr) COLL_sep \
+COLL_APPLY(action, COLL_ALLGATHER_SIG, NTSLR) COLL_sep \
+COLL_APPLY(action, COLL_ALLGATHER_SIG, NTSLR_NB) COLL_sep \
+COLL_APPLY(action, COLL_ALLGATHER_SIG, pair) COLL_sep \
+COLL_APPLY(action, COLL_ALLGATHER_SIG, rdb) COLL_sep \
+COLL_APPLY(action, COLL_ALLGATHER_SIG, RDB) COLL_sep \
+COLL_APPLY(action, COLL_ALLGATHER_SIG, rhv) COLL_sep \
+COLL_APPLY(action, COLL_ALLGATHER_SIG, ring) COLL_sep \
+COLL_APPLY(action, COLL_ALLGATHER_SIG, SMP_NTS) COLL_sep \
+COLL_APPLY(action, COLL_ALLGATHER_SIG, smp_simple) COLL_sep \
+COLL_APPLY(action, COLL_ALLGATHER_SIG, SMP_simple) COLL_sep \
+COLL_APPLY(action, COLL_ALLGATHER_SIG, spreading_simple)
+
+/* Emit one prototype per enabled allgather algorithm. */
+COLL_ALLGATHERS(COLL_PROTO, COLL_NOsep)
+
+
+/*************
+ * ALLREDUCE *
+ *************/
+/* Signature shared by every allreduce algorithm: category name, return type
+ * and parenthesized parameter list. */
+#define COLL_ALLREDUCE_SIG allreduce, int, \
+ (void *sbuf, void *rbuf, int rcount, \
+ MPI_Datatype dtype, MPI_Op op, MPI_Comm comm)
+
+/* List of allreduce algorithms: applies `action` to each entry and puts
+ * COLL_sep between entries. The rab_rdb, rab_reduce_scatter and
+ * smp_binomial_pipeline entries are wrapped in COLL_NOTHING and therefore
+ * expand to nothing. */
+#define COLL_ALLREDUCES(action, COLL_sep) \
+COLL_APPLY(action, COLL_ALLREDUCE_SIG, lr) COLL_sep \
+COLL_APPLY(action, COLL_ALLREDUCE_SIG, NTS) COLL_sep \
+COLL_APPLY(action, COLL_ALLREDUCE_SIG, rab1) COLL_sep \
+COLL_APPLY(action, COLL_ALLREDUCE_SIG, rab2) COLL_sep \
+COLL_NOTHING(COLL_APPLY(action, COLL_ALLREDUCE_SIG, rab_rdb) COLL_sep) \
+COLL_NOTHING(COLL_APPLY(action, COLL_ALLREDUCE_SIG, rab_reduce_scatter) COLL_sep) \
+COLL_APPLY(action, COLL_ALLREDUCE_SIG, rab_rsag) COLL_sep \
+COLL_APPLY(action, COLL_ALLREDUCE_SIG, rdb) COLL_sep \
+COLL_APPLY(action, COLL_ALLREDUCE_SIG, smp_binomial) COLL_sep \
+COLL_NOTHING(COLL_APPLY(action, COLL_ALLREDUCE_SIG, smp_binomial_pipeline) COLL_sep) \
+COLL_APPLY(action, COLL_ALLREDUCE_SIG, smp_rdb) COLL_sep \
+COLL_APPLY(action, COLL_ALLREDUCE_SIG, smp_rsag) COLL_sep \
+COLL_APPLY(action, COLL_ALLREDUCE_SIG, smp_rsag_lr) COLL_sep \
+COLL_APPLY(action, COLL_ALLREDUCE_SIG, smp_rsag_rab) COLL_sep \
+COLL_APPLY(action, COLL_ALLREDUCE_SIG, redbcast)
+
+/* Emit one prototype per enabled allreduce algorithm. */
+COLL_ALLREDUCES(COLL_PROTO, COLL_NOsep)
+
+
+/************
+ * ALLTOALL *
+ ************/
+/* Signature shared by every alltoall algorithm: category name, return type
+ * and parenthesized parameter list. The communicator parameter is spelled
+ * `comm`, like in every other signature of this header. */
+#define COLL_ALLTOALL_SIG alltoall, int, \
+ (void *send_buff, int send_count, MPI_Datatype send_type, \
+ void *recv_buff, int recv_count, MPI_Datatype recv_type, \
+ MPI_Comm comm)
+
+/* List of alltoall algorithms: applies `action` to each entry and puts
+ * COLL_sep between entries. The bruck entry is wrapped in COLL_NOTHING and
+ * therefore expands to nothing. */
+#define COLL_ALLTOALLS(action, COLL_sep) \
+COLL_APPLY(action, COLL_ALLTOALL_SIG, 2dmesh) COLL_sep \
+COLL_APPLY(action, COLL_ALLTOALL_SIG, 3dmesh) COLL_sep \
+COLL_NOTHING(COLL_APPLY(action, COLL_ALLTOALL_SIG, bruck) COLL_sep) \
+COLL_APPLY(action, COLL_ALLTOALL_SIG, pair) COLL_sep \
+COLL_APPLY(action, COLL_ALLTOALL_SIG, pair_light_barrier) COLL_sep \
+COLL_APPLY(action, COLL_ALLTOALL_SIG, pair_mpi_barrier) COLL_sep \
+COLL_APPLY(action, COLL_ALLTOALL_SIG, pair_one_barrier) COLL_sep \
+COLL_APPLY(action, COLL_ALLTOALL_SIG, rdb) COLL_sep \
+COLL_APPLY(action, COLL_ALLTOALL_SIG, ring) COLL_sep \
+COLL_APPLY(action, COLL_ALLTOALL_SIG, ring_light_barrier) COLL_sep \
+COLL_APPLY(action, COLL_ALLTOALL_SIG, ring_mpi_barrier) COLL_sep \
+COLL_APPLY(action, COLL_ALLTOALL_SIG, ring_one_barrier) COLL_sep \
+COLL_APPLY(action, COLL_ALLTOALL_SIG, simple)
+
+/* Emit one prototype per enabled alltoall algorithm. */
+COLL_ALLTOALLS(COLL_PROTO, COLL_NOsep)
+
+
+/*********
+ * BCAST *
+ *********/
+/* Signature shared by every bcast algorithm: category name, return type and
+ * parenthesized parameter list. */
+#define COLL_BCAST_SIG bcast, int, \
+ (void *buf, int count, MPI_Datatype datatype, \
+ int root, MPI_Comm comm)
+
+/* List of bcast algorithms: applies `action` to each entry and puts COLL_sep
+ * between entries. */
+#define COLL_BCASTS(action, COLL_sep) \
+COLL_APPLY(action, COLL_BCAST_SIG, arrival_nb) COLL_sep \
+COLL_APPLY(action, COLL_BCAST_SIG, arrival_pattern_aware) COLL_sep \
+COLL_APPLY(action, COLL_BCAST_SIG, arrival_pattern_aware_wait) COLL_sep \
+COLL_APPLY(action, COLL_BCAST_SIG, arrival_scatter) COLL_sep \
+COLL_APPLY(action, COLL_BCAST_SIG, binomial_tree) COLL_sep \
+COLL_APPLY(action, COLL_BCAST_SIG, flattree) COLL_sep \
+COLL_APPLY(action, COLL_BCAST_SIG, flattree_pipeline) COLL_sep \
+COLL_APPLY(action, COLL_BCAST_SIG, NTSB) COLL_sep \
+COLL_APPLY(action, COLL_BCAST_SIG, NTSL) COLL_sep \
+COLL_APPLY(action, COLL_BCAST_SIG, NTSL_Isend) COLL_sep \
+COLL_APPLY(action, COLL_BCAST_SIG, scatter_LR_allgather) COLL_sep \
+COLL_APPLY(action, COLL_BCAST_SIG, scatter_rdb_allgather) COLL_sep \
+COLL_APPLY(action, COLL_BCAST_SIG, SMP_binary) COLL_sep \
+COLL_APPLY(action, COLL_BCAST_SIG, SMP_binomial) COLL_sep \
+COLL_APPLY(action, COLL_BCAST_SIG, SMP_linear) COLL_sep \
+COLL_APPLY(action, COLL_BCAST_SIG, TSB)
+
+/* Emit one prototype per bcast algorithm. */
+COLL_BCASTS(COLL_PROTO, COLL_NOsep)
+
+
+/**********
+ * REDUCE *
+ **********/
+/* Signature shared by every reduce algorithm: category name, return type and
+ * parenthesized parameter list. */
+#define COLL_REDUCE_SIG reduce, int, \
+ (void *buf, void *rbuf, int count, MPI_Datatype datatype, \
+ MPI_Op op, int root, MPI_Comm comm)
+
+/* List of reduce algorithms: applies `action` to each entry and puts COLL_sep
+ * between entries. */
+#define COLL_REDUCES(action, COLL_sep) \
+COLL_APPLY(action, COLL_REDUCE_SIG, arrival_pattern_aware) COLL_sep \
+COLL_APPLY(action, COLL_REDUCE_SIG, binomial) COLL_sep \
+COLL_APPLY(action, COLL_REDUCE_SIG, flat_tree) COLL_sep \
+COLL_APPLY(action, COLL_REDUCE_SIG, NTSL) COLL_sep \
+COLL_APPLY(action, COLL_REDUCE_SIG, scatter_gather)
+
+/* Emit one prototype per reduce algorithm. */
+COLL_REDUCES(COLL_PROTO, COLL_NOsep)
#endif
--- /dev/null
+#include "colls.h"
+//#include <star-reduction.c>
+
+/* Segment size, in bytes, used to cut the message into pipeline stages. */
+int reduce_NTSL_segment_size_in_byte = 8192;
+
+/* Non-topology-specific pipelined linear reduce.
+ *
+ * The ranks of `comm` form a chain that ends at `root`: rank (root - 1) starts
+ * it, every other rank receives partial results from (rank + 1) % size,
+ * combines them with its own data through star_reduction() and forwards the
+ * result to (rank - 1 + size) % size. Messages larger than one segment are cut
+ * into `segment`-element pieces that travel the chain one after the other;
+ * the elements left over after the last full segment are handled by a regular
+ * MPI_Reduce call.
+ *
+ *   buf      : this rank's contribution (count elements of datatype)
+ *   rbuf     : result buffer; it is written on every rank because it also
+ *              serves as the local accumulator
+ *   count    : number of elements to reduce
+ *   datatype : element type
+ *   op       : reduction operation handed to star_reduction()
+ *   root     : rank that ends up with the final result
+ *   comm     : communicator the reduction runs on
+ *
+ * Always returns MPI_SUCCESS.
+ *
+ * NOTE(review): rbuf is used as scratch space on non-root ranks, which callers
+ * written against the MPI_Reduce contract may not expect -- confirm.
+ */
+int smpi_coll_tuned_reduce_NTSL(void *buf, void *rbuf, int count,
+                                MPI_Datatype datatype, MPI_Op op, int root,
+                                MPI_Comm comm)
+{
+  int tag = 50;
+  MPI_Status status;
+  MPI_Request *send_request_array;
+  MPI_Request *recv_request_array;
+  MPI_Status *send_status_array;
+  int rank, size;
+  int i;
+  MPI_Aint extent;
+  MPI_Type_extent(datatype, &extent);
+
+  /* rank and size must come from the communicator the messages travel on */
+  MPI_Comm_rank(comm, &rank);
+  MPI_Comm_size(comm, &size);
+
+  /* source node and destination nodes (same throughout the function) */
+  int to = (rank - 1 + size) % size;
+  int from = (rank + 1) % size;
+
+  /* segment is the segment size in number of elements (not bytes); keep it at
+     one element at least so that the divisions below are always defined */
+  int segment = 0;
+  if (extent > 0)
+    segment = (int) (reduce_NTSL_segment_size_in_byte / extent);
+  if (segment < 1)
+    segment = 1;
+
+  /* pipeline length */
+  int pipe_length = count / segment;
+
+  /* buffer offset between two consecutive segments = segment size in bytes */
+  int increment = segment * extent;
+
+  /* if the input size is not divisible by the segment size =>
+     the small remainder will be done with the native implementation */
+  int remainder = count % segment;
+
+  /* every rank starts from a private copy of its contribution in rbuf */
+  MPI_Sendrecv(buf, count, datatype, rank, tag, rbuf, count, datatype, rank,
+               tag, comm, &status);
+
+  /* a lone rank has nobody to receive from: the copy above is the result */
+  if (size == 1)
+    return MPI_SUCCESS;
+
+  char *tmp_buf;
+  tmp_buf = (char *) malloc(count * extent);
+
+  /* when a message is smaller than a block size => no pipeline */
+  if (count <= segment) {
+    if (rank == root) {
+      MPI_Recv(tmp_buf, count, datatype, from, tag, comm, &status);
+      star_reduction(op, tmp_buf, rbuf, &count, &datatype);
+    } else if (rank == ((root - 1 + size) % size)) {
+      /* head of the chain: nothing to receive */
+      MPI_Send(rbuf, count, datatype, to, tag, comm);
+    } else {
+      MPI_Recv(tmp_buf, count, datatype, from, tag, comm, &status);
+      star_reduction(op, tmp_buf, rbuf, &count, &datatype);
+      MPI_Send(rbuf, count, datatype, to, tag, comm);
+    }
+    free(tmp_buf);
+    return MPI_SUCCESS;
+  }
+
+  /* pipeline */
+  else {
+    send_request_array =
+        (MPI_Request *) malloc((size + pipe_length) * sizeof(MPI_Request));
+    recv_request_array =
+        (MPI_Request *) malloc((size + pipe_length) * sizeof(MPI_Request));
+    send_status_array =
+        (MPI_Status *) malloc((size + pipe_length) * sizeof(MPI_Status));
+
+    /* root only receives and reduces; segment i travels with tag + i */
+    if (rank == root) {
+      for (i = 0; i < pipe_length; i++) {
+        MPI_Irecv((char *) tmp_buf + (i * increment), segment, datatype, from,
+                  (tag + i), comm, &recv_request_array[i]);
+      }
+      for (i = 0; i < pipe_length; i++) {
+        MPI_Wait(&recv_request_array[i], &status);
+        star_reduction(op, tmp_buf + (i * increment),
+                       (char *) rbuf + (i * increment), &segment, &datatype);
+      }
+    }
+
+    /* head of the chain only sends data */
+    else if (rank == ((root - 1 + size) % size)) {
+      for (i = 0; i < pipe_length; i++) {
+        MPI_Isend((char *) rbuf + (i * increment), segment, datatype, to,
+                  (tag + i), comm, &send_request_array[i]);
+      }
+      MPI_Waitall((pipe_length), send_request_array, send_status_array);
+    }
+
+    /* intermediate nodes relay (receive, reduce, then send) data */
+    else {
+      for (i = 0; i < pipe_length; i++) {
+        MPI_Irecv((char *) tmp_buf + (i * increment), segment, datatype, from,
+                  (tag + i), comm, &recv_request_array[i]);
+      }
+      for (i = 0; i < pipe_length; i++) {
+        MPI_Wait(&recv_request_array[i], &status);
+        star_reduction(op, tmp_buf + (i * increment),
+                       (char *) rbuf + (i * increment), &segment, &datatype);
+        MPI_Isend((char *) rbuf + (i * increment), segment, datatype, to,
+                  (tag + i), comm, &send_request_array[i]);
+      }
+      MPI_Waitall((pipe_length), send_request_array, send_status_array);
+    }
+
+    free(send_request_array);
+    free(recv_request_array);
+    free(send_status_array);
+  }                             /* end pipeline */
+
+  /* when count is not divisible by the block size, use the default reduce
+     for the remainder */
+  if ((remainder != 0) && (count > segment)) {
+    MPI_Reduce((char *) buf + (pipe_length * increment),
+               (char *) rbuf + (pipe_length * increment), remainder, datatype,
+               op, root, comm);
+  }
+
+  free(tmp_buf);
+  return MPI_SUCCESS;
+}
--- /dev/null
+#include "colls.h"
+//#include <star-reduction.c>
+
+/* Segment size, in bytes, used to cut the message into pipeline stages. */
+int reduce_arrival_pattern_aware_segment_size_in_byte = 8192;
+
+/* Number of ints in the header message that carries a chain ordering. */
+#ifndef HEADER_SIZE
+#define HEADER_SIZE 1024
+#endif
+
+/* Capacity of the fixed-size per-rank bookkeeping arrays. */
+#ifndef MAX_NODE
+#define MAX_NODE 1024
+#endif
+
+/* Arrival-pattern-aware pipelined linear reduce.
+ *
+ * Every rank other than 0 announces itself to rank 0 with a 1-byte message.
+ * Rank 0 probes for announcements and lists the ranks that have arrived in a
+ * header terminated by -1. The header is sent to the first listed rank and
+ * forwarded along the list; the data follows the same path, being reduced at
+ * every hop, and the last listed rank sends its result to rank 0. Rank 0
+ * repeats this until all other ranks have been served. When root is not 0,
+ * rank 0 finally forwards the result to root. Elements left over after the
+ * last full segment are handled by a regular MPI_Reduce call.
+ *
+ *   buf      : this rank's contribution (count elements of datatype)
+ *   rbuf     : result buffer; it is written on every rank because it also
+ *              serves as the local accumulator
+ *   count    : number of elements to reduce
+ *   datatype : element type
+ *   op       : reduction operation handed to star_reduction()
+ *   root     : rank that ends up with the final result
+ *   comm     : communicator the reduction runs on
+ *
+ * Always returns MPI_SUCCESS.
+ *
+ * NOTE(review): the bookkeeping arrays hold MAX_NODE entries and the header
+ * HEADER_SIZE ints; nothing here checks the communicator size against them.
+ */
+int smpi_coll_tuned_reduce_arrival_pattern_aware(void *buf, void *rbuf,
+                                                 int count,
+                                                 MPI_Datatype datatype,
+                                                 MPI_Op op, int root,
+                                                 MPI_Comm comm)
+{
+  int rank;
+  int size;
+
+  /* rank and size must come from the communicator the messages travel on */
+  MPI_Comm_rank(comm, &rank);
+  MPI_Comm_size(comm, &size);
+
+  int tag = 50;
+  MPI_Status status;
+  MPI_Request request;
+  MPI_Request *send_request_array;
+  MPI_Request *recv_request_array;
+  MPI_Status *send_status_array;
+
+  MPI_Status temp_status_array[MAX_NODE];
+
+  int i;
+
+  int sent_count;
+  int header_index;
+  int flag_array[MAX_NODE];
+  int already_received[MAX_NODE];
+
+  int header_buf[HEADER_SIZE];
+  char temp_buf[MAX_NODE];
+
+  MPI_Aint extent;
+  MPI_Type_extent(datatype, &extent);
+
+  /* source and destination */
+  int to, from;
+
+  /* segment is the segment size in number of elements (not bytes); keep it at
+     one element at least so that the divisions below are always defined */
+  int segment = 0;
+  if (extent > 0)
+    segment = (int) (reduce_arrival_pattern_aware_segment_size_in_byte / extent);
+  if (segment < 1)
+    segment = 1;
+
+  /* pipeline length */
+  int pipe_length = count / segment;
+
+  /* buffer offset between two consecutive segments = segment size in bytes */
+  int increment = segment * extent;
+
+  /* if the input size is not divisible by the segment size =>
+     the small remainder will be done with the native implementation */
+  int remainder = count % segment;
+
+  /* value == 0 means rank 0 has not put that node in a header yet */
+  for (i = 0; i < MAX_NODE; i++) {
+    already_received[i] = 0;
+  }
+
+  char *tmp_buf;
+  tmp_buf = (char *) malloc(count * extent);
+
+  /* every rank starts from a private copy of its contribution in rbuf */
+  MPI_Sendrecv(buf, count, datatype, rank, tag, rbuf, count, datatype, rank,
+               tag, comm, &status);
+
+  /* when a message is smaller than a block size => no pipeline */
+  if (count <= segment) {
+
+    if (rank == 0) {
+      sent_count = 0;
+
+      while (sent_count < (size - 1)) {
+
+        /* look for announcements from the nodes not served yet */
+        for (i = 1; i < size; i++) {
+          if (already_received[i] == 0)
+            MPI_Iprobe(i, MPI_ANY_TAG, comm, &flag_array[i],
+                       MPI_STATUS_IGNORE);
+        }
+
+        header_index = 0;
+        /* recv 1-byte message */
+        for (i = 0; i < size; i++) {
+          if (i == rank)
+            continue;
+
+          /* 1-byte message arrived */
+          if ((flag_array[i] == 1) && (already_received[i] == 0)) {
+            MPI_Recv(temp_buf, 1, MPI_CHAR, i, tag, comm, &status);
+            header_buf[header_index] = i;
+            header_index++;
+            sent_count++;
+
+            /* its data will be received in the next step */
+            already_received[i] = 1;
+          }
+        }
+
+        /* send header followed by receive and reduce data */
+        if (header_index != 0) {
+          header_buf[header_index] = -1;
+          to = header_buf[0];
+          from = header_buf[header_index - 1];
+
+          MPI_Send(header_buf, HEADER_SIZE, MPI_INT, to, tag, comm);
+          MPI_Recv(tmp_buf, count, datatype, from, tag, comm, &status);
+          star_reduction(op, tmp_buf, rbuf, &count, &datatype);
+        }
+      }                         /* while loop */
+    }
+
+    /* rank 0 */
+    /* other ranks */
+    else {
+
+      /* send 1-byte message to rank 0 */
+      MPI_Send(temp_buf, 1, MPI_CHAR, 0, tag, comm);
+
+      /* wait for header and data, forward when required */
+      MPI_Recv(header_buf, HEADER_SIZE, MPI_INT, MPI_ANY_SOURCE, tag, comm,
+               &status);
+
+      /* search for this rank's position in the header */
+      int myordering = 0;
+      while (rank != header_buf[myordering]) {
+        myordering++;
+      }
+
+      /* forward header */
+      if (header_buf[myordering + 1] != -1) {
+        MPI_Send(header_buf, HEADER_SIZE, MPI_INT, header_buf[myordering + 1],
+                 tag, comm);
+      }
+
+      /* the last rank of the header sends its result back to rank 0 */
+      if (header_buf[myordering + 1] == -1) {
+        to = 0;
+      } else {
+        to = header_buf[myordering + 1];
+      }
+
+      /* send only */
+      if (myordering == 0) {
+        MPI_Send(rbuf, count, datatype, to, tag, comm);
+      }
+
+      /* recv, reduce, send */
+      else {
+        from = header_buf[myordering - 1];
+        MPI_Recv(tmp_buf, count, datatype, from, tag, comm, &status);
+        star_reduction(op, tmp_buf, rbuf, &count, &datatype);
+        MPI_Send(rbuf, count, datatype, to, tag, comm);
+      }
+    }                           /* other ranks */
+  }
+  /* pipelined reduce */
+  else {
+    send_request_array =
+        (MPI_Request *) malloc((size + pipe_length) * sizeof(MPI_Request));
+    recv_request_array =
+        (MPI_Request *) malloc((size + pipe_length) * sizeof(MPI_Request));
+    send_status_array =
+        (MPI_Status *) malloc((size + pipe_length) * sizeof(MPI_Status));
+
+    if (rank == 0) {
+      sent_count = 0;
+
+      int will_send[MAX_NODE];
+      for (i = 0; i < MAX_NODE; i++)
+        will_send[i] = 0;
+
+      /* loop until all data are received */
+      while (sent_count < (size - 1)) {
+        for (i = 1; i < size; i++) {
+          if ((already_received[i] == 0) && (will_send[i] == 0)) {
+            MPI_Iprobe(i, MPI_ANY_TAG, comm, &flag_array[i],
+                       &temp_status_array[i]);
+            if (flag_array[i] == 1) {
+              will_send[i] = 1;
+              MPI_Recv(&temp_buf[i], 1, MPI_CHAR, i, tag, comm, &status);
+              /* rescan after a hit; with the loop increment the scan
+                 resumes at rank 2 */
+              i = 1;
+            }
+          }
+        }                       /* end of probing */
+
+        header_index = 0;
+
+        /* build the header from the announcements that arrived this round */
+        for (i = 1; i < size; i++) {
+          if ((will_send[i] == 1) && (already_received[i] == 0)) {
+            header_buf[header_index] = i;
+            header_index++;
+            sent_count++;
+
+            /* its data will be received in the next step */
+            already_received[i] = 1;
+          }
+        }
+
+        /* send header followed by data */
+        if (header_index != 0) {
+          header_buf[header_index] = -1;
+          to = header_buf[0];
+
+          /* send header */
+          MPI_Send(header_buf, HEADER_SIZE, MPI_INT, to, tag, comm);
+
+          /* recv data - pipeline */
+          from = header_buf[header_index - 1];
+          for (i = 0; i < pipe_length; i++) {
+            MPI_Recv(tmp_buf + (i * increment), segment, datatype, from, tag,
+                     comm, &status);
+            star_reduction(op, tmp_buf + (i * increment),
+                           (char *) rbuf + (i * increment), &segment,
+                           &datatype);
+          }
+        }
+      }                         /* while loop (sent_count < size-1 ) */
+    }
+
+    /* rank 0 */
+    /* other ranks */
+    else {
+      /* send 1-byte message to rank 0 */
+      MPI_Send(temp_buf, 1, MPI_CHAR, 0, tag, comm);
+
+      /* wait for header, forward when required */
+      MPI_Irecv(header_buf, HEADER_SIZE, MPI_INT, MPI_ANY_SOURCE, tag, comm,
+                &request);
+      MPI_Wait(&request, MPI_STATUS_IGNORE);
+
+      /* search for this rank's position in the header */
+      int myordering = 0;
+
+      while (rank != header_buf[myordering]) {
+        myordering++;
+      }
+
+      /* send header when required */
+      if (header_buf[myordering + 1] != -1) {
+        MPI_Send(header_buf, HEADER_SIZE, MPI_INT, header_buf[myordering + 1],
+                 tag, comm);
+      }
+
+      /* the last rank of the header sends its result back to rank 0 */
+      if (header_buf[myordering + 1] == -1) {
+        to = 0;
+      } else {
+        to = header_buf[myordering + 1];
+      }
+
+      /* send only */
+      if (myordering == 0) {
+        for (i = 0; i < pipe_length; i++) {
+          MPI_Isend((char *) rbuf + (i * increment), segment, datatype, to,
+                    tag, comm, &send_request_array[i]);
+        }
+        MPI_Waitall((pipe_length), send_request_array, send_status_array);
+      }
+
+      /* receive, reduce, and send */
+      else {
+        from = header_buf[myordering - 1];
+        for (i = 0; i < pipe_length; i++) {
+          MPI_Irecv(tmp_buf + (i * increment), segment, datatype, from, tag,
+                    comm, &recv_request_array[i]);
+        }
+        for (i = 0; i < pipe_length; i++) {
+          MPI_Wait(&recv_request_array[i], MPI_STATUS_IGNORE);
+          star_reduction(op, tmp_buf + (i * increment),
+                         (char *) rbuf + (i * increment), &segment, &datatype);
+          MPI_Isend((char *) rbuf + (i * increment), segment, datatype, to,
+                    tag, comm, &send_request_array[i]);
+        }
+        MPI_Waitall((pipe_length), send_request_array, send_status_array);
+      }
+    }                           /* other ranks */
+
+    free(send_request_array);
+    free(recv_request_array);
+    free(send_status_array);
+  }                             /* end pipeline */
+
+  /* if root is not zero, rank 0 hands the result over to root once finished;
+     this can be modified to make it faster by using logical src, dst. */
+  if (root != 0) {
+    if (rank == 0) {
+      MPI_Send(rbuf, count, datatype, root, tag, comm);
+    } else if (rank == root) {
+      MPI_Recv(rbuf, count, datatype, 0, tag, comm, &status);
+    }
+  }
+
+  /* when count is not divisible by the block size, use the default reduce
+     for the remainder */
+  if ((remainder != 0) && (count > segment)) {
+    MPI_Reduce((char *) buf + (pipe_length * increment),
+               (char *) rbuf + (pipe_length * increment), remainder, datatype,
+               op, root, comm);
+  }
+
+  free(tmp_buf);
+
+  return MPI_SUCCESS;
+}
--- /dev/null
+#include "colls.h"
+
+//#include <star-reduction.c>
+
+/* Binomial-tree reduce.
+ *
+ * Ranks are renumbered relative to root. At each step, identified by one bit
+ * of the relative rank, a rank whose bit is clear receives from the partner
+ * that has the bit set (when that partner exists) and folds the data into
+ * recvbuf with star_reduction(); a rank whose bit is set sends its
+ * accumulated data to the partner with the bit cleared and is done.
+ *
+ * Returns 0; does nothing at all when count is 0.
+ */
+int smpi_coll_tuned_reduce_binomial(void *sendbuf, void *recvbuf, int count,
+                                    MPI_Datatype datatype, MPI_Op op, int root,
+                                    MPI_Comm comm)
+{
+  int tag = 4321;
+  int nprocs, rank;
+  int bit, vrank, peer;
+  MPI_Aint extent;
+  MPI_Status status;
+  void *scratch;
+
+  if (count == 0)
+    return 0;
+
+  MPI_Comm_rank(comm, &rank);
+  MPI_Comm_size(comm, &nprocs);
+  MPI_Type_extent(datatype, &extent);
+
+  scratch = (void *) malloc(count * extent);
+
+  /* start from a private copy of the local contribution */
+  MPI_Sendrecv(sendbuf, count, datatype, rank, tag,
+               recvbuf, count, datatype, rank, tag, comm, &status);
+
+  /* rank relative to root, so that root plays the role of rank 0 */
+  vrank = (rank - root + nprocs) % nprocs;
+
+  for (bit = 1; bit < nprocs; bit <<= 1) {
+    if ((bit & vrank) != 0) {
+      /* hand the partial result over to the parent and leave */
+      peer = ((vrank & (~bit)) + root) % nprocs;
+      MPI_Send(recvbuf, count, datatype, peer, tag, comm);
+      break;
+    }
+
+    /* child for this step, if the communicator is large enough */
+    peer = (vrank | bit);
+    if (peer >= nprocs)
+      continue;
+
+    peer = (peer + root) % nprocs;
+    MPI_Recv(scratch, count, datatype, peer, tag, comm, &status);
+    star_reduction(op, scratch, recvbuf, &count, &datatype);
+  }
+
+  free(scratch);
+
+  return 0;
+}
--- /dev/null
+#include "colls.h"
+//#include <star-reduction.c>
+
+/* Flat-tree reduce: every non-root rank sends its data straight to root,
+ * which seeds rbuf with the contribution of the highest rank and then folds
+ * in the others one by one, from rank size - 2 down to rank 0.
+ *
+ * Returns 0.
+ */
+int
+smpi_coll_tuned_reduce_flat_tree(void *sbuf, void *rbuf, int count,
+                                 MPI_Datatype dtype, MPI_Op op,
+                                 int root, MPI_Comm comm)
+{
+  int tag = 4321;
+  int peer;
+  int nprocs;
+  int me;
+  MPI_Aint extent;
+  MPI_Status status;
+  char *scratch = NULL;
+  char *operand;
+
+  MPI_Comm_rank(comm, &me);
+  MPI_Comm_size(comm, &nprocs);
+  MPI_Type_extent(dtype, &extent);
+
+  /* Everybody but root just ships its data to root. */
+  if (me != root) {
+    MPI_Send(sbuf, count, dtype, root, tag, comm);
+    return 0;
+  }
+
+  /* From here on we are root; a receive buffer is only needed when there is
+     somebody else to receive from. */
+  if (nprocs > 1)
+    scratch = (char *) malloc(count * extent);
+
+  /* Seed rbuf with the contribution of the last rank. */
+  if (me == (nprocs - 1)) {
+    MPI_Sendrecv(sbuf, count, dtype, me, tag,
+                 rbuf, count, dtype, me, tag, comm, &status);
+  } else {
+    MPI_Recv(rbuf, count, dtype, nprocs - 1, tag, comm, &status);
+  }
+
+  /* Fold in the remaining contributions, highest rank first. */
+  peer = nprocs - 2;
+  while (peer >= 0) {
+    if (me == peer) {
+      operand = sbuf;
+    } else {
+      MPI_Recv(scratch, count, dtype, peer, tag, comm, &status);
+      operand = scratch;
+    }
+
+    star_reduction(op, operand, rbuf, &count, &dtype);
+    --peer;
+  }
+
+  /* free(NULL) is a no-op, so no guard is needed */
+  free(scratch);
+
+  return 0;
+}
--- /dev/null
+#include "colls.h"
+
+/*
+ reduce
+ Author: MPICH
+ */
+
+int smpi_coll_tuned_reduce_scatter_gather(void *sendbuf, void *recvbuf,
+ int count, MPI_Datatype datatype,
+ MPI_Op op, int root, MPI_Comm comm)
+{
+ MPI_Status status;
+ int comm_size, rank, type_size, pof2, rem, newrank;
+ int mask, *cnts, *disps, i, j, send_idx = 0;
+ int recv_idx, last_idx = 0, newdst;
+ int dst, send_cnt, recv_cnt, newroot, newdst_tree_root;
+ int newroot_tree_root, new_count;
+ int tag = 4321;
+ void *send_ptr, *recv_ptr, *tmp_buf;
+
+ cnts = NULL;
+ disps = NULL;
+
+ MPI_Aint extent;
+
+ if (count == 0)
+ return 0;
+ MPI_Comm_rank(comm, &rank);
+ MPI_Comm_size(comm, &comm_size);
+
+ MPI_Type_extent(datatype, &extent);
+ MPI_Type_size(datatype, &type_size);
+
+ /* find nearest power-of-two less than or equal to comm_size */
+ pof2 = 1;
+ while (pof2 <= comm_size)
+ pof2 <<= 1;
+ pof2 >>= 1;
+
+ if (count < comm_size) {
+ new_count = comm_size;
+ send_ptr = (void *) malloc(new_count * extent);
+ recv_ptr = (void *) malloc(new_count * extent);
+ tmp_buf = (void *) malloc(new_count * extent);
+ memcpy(send_ptr, sendbuf, extent * new_count);
+
+ //if ((rank != root))
+ MPI_Sendrecv(send_ptr, new_count, datatype, rank, tag,
+ recv_ptr, new_count, datatype, rank, tag, comm, &status);
+
+ rem = comm_size - pof2;
+ if (rank < 2 * rem) {
+ if (rank % 2 != 0) {
+ /* odd */
+ MPI_Send(recv_ptr, new_count, datatype, rank - 1, tag, comm);
+ newrank = -1;
+ } else {
+ MPI_Recv(tmp_buf, count, datatype, rank + 1, tag, comm, &status);
+ star_reduction(op, tmp_buf, recv_ptr, &new_count, &datatype);
+ newrank = rank / 2;
+ }
+ } else /* rank >= 2*rem */
+ newrank = rank - rem;
+
+ cnts = (int *) malloc(pof2 * sizeof(int));
+ disps = (int *) malloc(pof2 * sizeof(int));
+
+ if (newrank != -1) {
+ for (i = 0; i < (pof2 - 1); i++)
+ cnts[i] = new_count / pof2;
+ cnts[pof2 - 1] = new_count - (new_count / pof2) * (pof2 - 1);
+
+ disps[0] = 0;
+ for (i = 1; i < pof2; i++)
+ disps[i] = disps[i - 1] + cnts[i - 1];
+
+ mask = 0x1;
+ send_idx = recv_idx = 0;
+ last_idx = pof2;
+ while (mask < pof2) {
+ newdst = newrank ^ mask;
+ /* find real rank of dest */
+ dst = (newdst < rem) ? newdst * 2 : newdst + rem;
+
+ send_cnt = recv_cnt = 0;
+ if (newrank < newdst) {
+ send_idx = recv_idx + pof2 / (mask * 2);
+ for (i = send_idx; i < last_idx; i++)
+ send_cnt += cnts[i];
+ for (i = recv_idx; i < send_idx; i++)
+ recv_cnt += cnts[i];
+ } else {
+ recv_idx = send_idx + pof2 / (mask * 2);
+ for (i = send_idx; i < recv_idx; i++)
+ send_cnt += cnts[i];
+ for (i = recv_idx; i < last_idx; i++)
+ recv_cnt += cnts[i];
+ }
+
+ /* Send data from recvbuf. Recv into tmp_buf */
+ MPI_Sendrecv((char *) recv_ptr +
+ disps[send_idx] * extent,
+ send_cnt, datatype,
+ dst, tag,
+ (char *) tmp_buf +
+ disps[recv_idx] * extent,
+ recv_cnt, datatype, dst, tag, comm, &status);
+
+ /* tmp_buf contains data received in this step.
+ recvbuf contains data accumulated so far */
+
+ star_reduction(op, (char *) tmp_buf + disps[recv_idx] * extent,
+ (char *) recv_ptr + disps[recv_idx] * extent,
+ &recv_cnt, &datatype);
+
+ /* update send_idx for next iteration */
+ send_idx = recv_idx;
+ mask <<= 1;
+
+ if (mask < pof2)
+ last_idx = recv_idx + pof2 / mask;
+ }
+ }
+
+ /* now do the gather to root */
+
+ if (root < 2 * rem) {
+ if (root % 2 != 0) {
+ if (rank == root) {
+ /* recv */
+ for (i = 0; i < (pof2 - 1); i++)
+ cnts[i] = new_count / pof2;
+ cnts[pof2 - 1] = new_count - (new_count / pof2) * (pof2 - 1);
+
+ disps[0] = 0;
+ for (i = 1; i < pof2; i++)
+ disps[i] = disps[i - 1] + cnts[i - 1];
+
+ MPI_Recv(recv_ptr, cnts[0], datatype, 0, tag, comm, &status);
+
+ newrank = 0;
+ send_idx = 0;
+ last_idx = 2;
+ } else if (newrank == 0) {
+ MPI_Send(recv_ptr, cnts[0], datatype, root, tag, comm);
+ newrank = -1;
+ }
+ newroot = 0;
+ } else
+ newroot = root / 2;
+ } else
+ newroot = root - rem;
+
+ if (newrank != -1) {
+ j = 0;
+ mask = 0x1;
+ while (mask < pof2) {
+ mask <<= 1;
+ j++;
+ }
+ mask >>= 1;
+ j--;
+ while (mask > 0) {
+ newdst = newrank ^ mask;
+
+ /* find real rank of dest */
+ dst = (newdst < rem) ? newdst * 2 : newdst + rem;
+
+ if ((newdst == 0) && (root < 2 * rem) && (root % 2 != 0))
+ dst = root;
+ newdst_tree_root = newdst >> j;
+ newdst_tree_root <<= j;
+
+ newroot_tree_root = newroot >> j;
+ newroot_tree_root <<= j;
+
+ send_cnt = recv_cnt = 0;
+ if (newrank < newdst) {
+ /* update last_idx except on first iteration */
+ if (mask != pof2 / 2)
+ last_idx = last_idx + pof2 / (mask * 2);
+
+ recv_idx = send_idx + pof2 / (mask * 2);
+ for (i = send_idx; i < recv_idx; i++)
+ send_cnt += cnts[i];
+ for (i = recv_idx; i < last_idx; i++)
+ recv_cnt += cnts[i];
+ } else {
+ recv_idx = send_idx - pof2 / (mask * 2);
+ for (i = send_idx; i < last_idx; i++)
+ send_cnt += cnts[i];
+ for (i = recv_idx; i < send_idx; i++)
+ recv_cnt += cnts[i];
+ }
+
+ if (newdst_tree_root == newroot_tree_root) {
+ MPI_Send((char *) recv_ptr +
+ disps[send_idx] * extent,
+ send_cnt, datatype, dst, tag, comm);
+ break;
+ } else {
+ MPI_Recv((char *) recv_ptr +
+ disps[recv_idx] * extent,
+ recv_cnt, datatype, dst, tag, comm, &status);
+ }
+
+ if (newrank > newdst)
+ send_idx = recv_idx;
+
+ mask >>= 1;
+ j--;
+ }
+ }
+ memcpy(recvbuf, recv_ptr, extent * count);
+ free(send_ptr);
+ free(recv_ptr);
+ }
+
+
+ else if (count >= comm_size) {
+ tmp_buf = (void *) malloc(count * extent);
+
+ //if ((rank != root))
+ MPI_Sendrecv(sendbuf, count, datatype, rank, tag,
+ recvbuf, count, datatype, rank, tag, comm, &status);
+
+ rem = comm_size - pof2;
+ if (rank < 2 * rem) {
+ if (rank % 2 != 0) { /* odd */
+ MPI_Send(recvbuf, count, datatype, rank - 1, tag, comm);
+ newrank = -1;
+ }
+
+ else {
+ MPI_Recv(tmp_buf, count, datatype, rank + 1, tag, comm, &status);
+ star_reduction(op, tmp_buf, recvbuf, &count, &datatype);
+ newrank = rank / 2;
+ }
+ } else /* rank >= 2*rem */
+ newrank = rank - rem;
+
+ cnts = (int *) malloc(pof2 * sizeof(int));
+ disps = (int *) malloc(pof2 * sizeof(int));
+
+ if (newrank != -1) {
+ for (i = 0; i < (pof2 - 1); i++)
+ cnts[i] = count / pof2;
+ cnts[pof2 - 1] = count - (count / pof2) * (pof2 - 1);
+
+ disps[0] = 0;
+ for (i = 1; i < pof2; i++)
+ disps[i] = disps[i - 1] + cnts[i - 1];
+
+ mask = 0x1;
+ send_idx = recv_idx = 0;
+ last_idx = pof2;
+ while (mask < pof2) {
+ newdst = newrank ^ mask;
+ /* find real rank of dest */
+ dst = (newdst < rem) ? newdst * 2 : newdst + rem;
+
+ send_cnt = recv_cnt = 0;
+ if (newrank < newdst) {
+ send_idx = recv_idx + pof2 / (mask * 2);
+ for (i = send_idx; i < last_idx; i++)
+ send_cnt += cnts[i];
+ for (i = recv_idx; i < send_idx; i++)
+ recv_cnt += cnts[i];
+ } else {
+ recv_idx = send_idx + pof2 / (mask * 2);
+ for (i = send_idx; i < recv_idx; i++)
+ send_cnt += cnts[i];
+ for (i = recv_idx; i < last_idx; i++)
+ recv_cnt += cnts[i];
+ }
+
+ /* Send data from recvbuf. Recv into tmp_buf */
+ MPI_Sendrecv((char *) recvbuf +
+ disps[send_idx] * extent,
+ send_cnt, datatype,
+ dst, tag,
+ (char *) tmp_buf +
+ disps[recv_idx] * extent,
+ recv_cnt, datatype, dst, tag, comm, &status);
+
+ /* tmp_buf contains data received in this step.
+ recvbuf contains data accumulated so far */
+
+ star_reduction(op, (char *) tmp_buf + disps[recv_idx] * extent,
+ (char *) recvbuf + disps[recv_idx] * extent,
+ &recv_cnt, &datatype);
+
+ /* update send_idx for next iteration */
+ send_idx = recv_idx;
+ mask <<= 1;
+
+ if (mask < pof2)
+ last_idx = recv_idx + pof2 / mask;
+ }
+ }
+
+ /* now do the gather to root */
+
+ if (root < 2 * rem) {
+ if (root % 2 != 0) {
+ if (rank == root) { /* recv */
+ for (i = 0; i < (pof2 - 1); i++)
+ cnts[i] = count / pof2;
+ cnts[pof2 - 1] = count - (count / pof2) * (pof2 - 1);
+
+ disps[0] = 0;
+ for (i = 1; i < pof2; i++)
+ disps[i] = disps[i - 1] + cnts[i - 1];
+
+ MPI_Recv(recvbuf, cnts[0], datatype, 0, tag, comm, &status);
+
+ newrank = 0;
+ send_idx = 0;
+ last_idx = 2;
+ } else if (newrank == 0) {
+ MPI_Send(recvbuf, cnts[0], datatype, root, tag, comm);
+ newrank = -1;
+ }
+ newroot = 0;
+ } else
+ newroot = root / 2;
+ } else
+ newroot = root - rem;
+
+ if (newrank != -1) {
+ j = 0;
+ mask = 0x1;
+ while (mask < pof2) {
+ mask <<= 1;
+ j++;
+ }
+ mask >>= 1;
+ j--;
+ while (mask > 0) {
+ newdst = newrank ^ mask;
+
+ /* find real rank of dest */
+ dst = (newdst < rem) ? newdst * 2 : newdst + rem;
+
+ if ((newdst == 0) && (root < 2 * rem) && (root % 2 != 0))
+ dst = root;
+ newdst_tree_root = newdst >> j;
+ newdst_tree_root <<= j;
+
+ newroot_tree_root = newroot >> j;
+ newroot_tree_root <<= j;
+
+ send_cnt = recv_cnt = 0;
+ if (newrank < newdst) {
+ /* update last_idx except on first iteration */
+ if (mask != pof2 / 2)
+ last_idx = last_idx + pof2 / (mask * 2);
+
+ recv_idx = send_idx + pof2 / (mask * 2);
+ for (i = send_idx; i < recv_idx; i++)
+ send_cnt += cnts[i];
+ for (i = recv_idx; i < last_idx; i++)
+ recv_cnt += cnts[i];
+ } else {
+ recv_idx = send_idx - pof2 / (mask * 2);
+ for (i = send_idx; i < last_idx; i++)
+ send_cnt += cnts[i];
+ for (i = recv_idx; i < send_idx; i++)
+ recv_cnt += cnts[i];
+ }
+
+ if (newdst_tree_root == newroot_tree_root) {
+ MPI_Send((char *) recvbuf +
+ disps[send_idx] * extent,
+ send_cnt, datatype, dst, tag, comm);
+ break;
+ } else {
+ MPI_Recv((char *) recvbuf +
+ disps[recv_idx] * extent,
+ recv_cnt, datatype, dst, tag, comm, &status);
+ }
+
+ if (newrank > newdst)
+ send_idx = recv_idx;
+
+ mask >>= 1;
+ j--;
+ }
+ }
+ }
+ if (cnts)
+ free(cnts);
+ if (disps)
+ free(disps);
+
+ return 0;
+}
--- /dev/null
+#include "colls.h"
+
+/*
+ * created by Pitch Patarasuk
+ * Modified by Xin Yuan
+ *
+ * Implements a subset of the MPI predefined reduction operators:
+ * MPI_LAND, MPI_BAND: C integer, Fortran integer, Byte
+ * MPI_LOR, MPI_BOR: C integer, Fortran integer, Byte
+ * MPI_LXOR, MPI_BXOR: C integer, Fortran integer, Byte
+ * MPI_SUM, MPI_PROD: C integer, Fortran integer, Floating point
+ * MPI_MIN, MPI_MAX: C integer, Fortran integer, Floating point, Byte
+ *
+ * Types not implemented: MPI_LONG_DOUBLE, MPI_LOGICAL, MPI_COMPLEX
+ */
+
+#ifndef STAR_REDUCTION
+#define STAR_REDUCTION
+
+
+/*
+ * Declarations needed by the MPICH2_REDUCTION and MVAPICH_REDUCTION code
+ * paths of star_reduction().  The macro names tested here must be spelled
+ * exactly like the ones tested in star_reduction(), otherwise that path is
+ * compiled without the declarations it relies on.
+ */
+#ifdef MPICH2_REDUCTION
+extern MPI_User_function * MPIR_Op_table[];
+#elif defined MVAPICH_REDUCTION
+extern void *MPIR_ToPointer();
+struct MPIR_OP
+{
+  MPI_User_function *op;
+  int commute;
+  int permanent;
+};
+#endif
+
+/*
+ * STAR_CASE: when the datatype equals MPI_TYPE and no earlier case matched,
+ * view both buffers as arrays of C_TYPE and store EXPR into every target
+ * element.  Inside EXPR, `a` is the current target element and `b` the
+ * matching source element.  The `handled` flag guarantees that the data is
+ * combined at most once even if two datatype handles compare equal.
+ */
+#define STAR_CASE(MPI_TYPE, C_TYPE, EXPR)                   \
+  if (!handled && type == (MPI_TYPE)) {                     \
+    C_TYPE *dst_elts = (C_TYPE *) target;                   \
+    const C_TYPE *src_elts = (const C_TYPE *) src;          \
+    for (i = 0; i < n; i++) {                               \
+      const C_TYPE a = dst_elts[i];                         \
+      const C_TYPE b = src_elts[i];                         \
+      dst_elts[i] = (C_TYPE) (EXPR);                        \
+    }                                                       \
+    handled = 1;                                            \
+  }
+
+/* Byte-sized datatypes, both processed as plain char. */
+#define STAR_BYTE_CASES(EXPR)                               \
+  STAR_CASE(MPI_BYTE, char, EXPR)                           \
+  STAR_CASE(MPI_CHAR, char, EXPR)
+
+/* Integer datatypes, each accessed through its own C type. */
+#define STAR_INTEGER_CASES(EXPR)                            \
+  STAR_CASE(MPI_INT, int, EXPR)                             \
+  STAR_CASE(MPI_LONG, long, EXPR)                           \
+  STAR_CASE(MPI_UNSIGNED, unsigned int, EXPR)               \
+  STAR_CASE(MPI_UNSIGNED_LONG, unsigned long, EXPR)         \
+  STAR_CASE(MPI_SHORT, short, EXPR)                         \
+  STAR_CASE(MPI_UNSIGNED_SHORT, unsigned short, EXPR)
+
+/* Floating point datatypes. */
+#define STAR_FLOATING_CASES(EXPR)                           \
+  STAR_CASE(MPI_DOUBLE, double, EXPR)                       \
+  STAR_CASE(MPI_FLOAT, float, EXPR)
+
+/*
+ * Built-in implementation of the predefined reduction operators.
+ *
+ * Combines the *count elements of `src` into `target` in place
+ * (target[i] = target[i] <op> src[i]), interpreting both buffers according
+ * to *dtype.
+ *
+ * Supported combinations:
+ *  - MPI_BOR, MPI_BAND, MPI_BXOR (bitwise) and MPI_LOR, MPI_LAND, MPI_LXOR
+ *    (logical, the result is 0 or 1): byte/char and integer types;
+ *  - MPI_MAX, MPI_MIN: integer, floating point and byte/char types;
+ *  - MPI_SUM, MPI_PROD: integer and floating point types.
+ * Any other operator or datatype prints "reduction operation not supported"
+ * and leaves `target` untouched.
+ *
+ * MPI_LONG / MPI_UNSIGNED_LONG buffers are accessed as long / unsigned long;
+ * going through an int pointer only covers the buffer correctly where int
+ * and long have the same size.
+ * MPI_BYTE and MPI_CHAR elements are handled as plain `char`, so MAX/MIN on
+ * them follow the platform's char signedness.
+ */
+static void star_generic_reduction(MPI_Op op, void *src, void *target, int *count, MPI_Datatype *dtype){
+  const MPI_Datatype type = *dtype;
+  const int n = *count;
+  int handled = 0;   /* set once a datatype case has combined the data */
+  int i;
+
+  if (op == MPI_BOR) {
+    STAR_BYTE_CASES(a | b)
+    STAR_INTEGER_CASES(a | b)
+  }
+  else if (op == MPI_LOR) {
+    STAR_BYTE_CASES(a || b)
+    STAR_INTEGER_CASES(a || b)
+  }
+  else if (op == MPI_BAND) {
+    STAR_BYTE_CASES(a & b)
+    STAR_INTEGER_CASES(a & b)
+  }
+  else if (op == MPI_LAND) {
+    STAR_BYTE_CASES(a && b)
+    STAR_INTEGER_CASES(a && b)
+  }
+  else if (op == MPI_BXOR) {
+    STAR_BYTE_CASES(a ^ b)
+    STAR_INTEGER_CASES(a ^ b)
+  }
+  else if (op == MPI_LXOR) {
+    /* logical xor: true when exactly one operand is non-zero */
+    STAR_BYTE_CASES((!a) != (!b))
+    STAR_INTEGER_CASES((!a) != (!b))
+  }
+  else if (op == MPI_MAX) {
+    STAR_INTEGER_CASES((b > a) ? b : a)
+    STAR_FLOATING_CASES((b > a) ? b : a)
+    STAR_BYTE_CASES((b > a) ? b : a)
+  }
+  else if (op == MPI_MIN) {
+    STAR_INTEGER_CASES((b < a) ? b : a)
+    STAR_FLOATING_CASES((b < a) ? b : a)
+    STAR_BYTE_CASES((b < a) ? b : a)
+  }
+  else if (op == MPI_SUM) {
+    STAR_INTEGER_CASES(a + b)
+    STAR_FLOATING_CASES(a + b)
+  }
+  else if (op == MPI_PROD) {
+    STAR_INTEGER_CASES(a * b)
+    STAR_FLOATING_CASES(a * b)
+  }
+
+  /* Unknown operator, or an operator/datatype pair that is not implemented. */
+  if (!handled) {
+    printf("reduction operation not supported\n");
+  }
+}
+
+#undef STAR_FLOATING_CASES
+#undef STAR_INTEGER_CASES
+#undef STAR_BYTE_CASES
+#undef STAR_CASE
+
+/*
+ * Combine `src` into `target` with the reduction operator `op`.
+ *
+ * src, target: buffers holding *count elements of type *dtype; the result
+ *              is written to `target`.
+ *
+ * The implementation is chosen at compile time: with MPICH2_REDUCTION the
+ * function is fetched from MPIR_Op_table, with MVAPICH_REDUCTION it is
+ * obtained through MPIR_ToPointer(), otherwise star_generic_reduction()
+ * is used.
+ *
+ * The function returns void, so the selected routine is simply called:
+ * C does not allow `return <expression>;` in a void function.
+ */
+void star_reduction(MPI_Op op, void *src, void *target, int *count, MPI_Datatype *dtype){
+#ifdef MPICH2_REDUCTION
+  MPI_User_function *uop = MPIR_Op_table[op % 16 - 1];
+  (*uop) (src, target, count, dtype);
+#elif defined MVAPICH_REDUCTION
+  struct MPIR_OP *op_ptr = MPIR_ToPointer(op);
+  MPI_User_function *uop = op_ptr->op;
+  (*uop) (src, target, count, dtype);
+#else
+  star_generic_reduction(op, src, target, count, dtype);
+#endif
+}
+
+
+#endif
#include "private.h"
#include "colls/colls.h"
+s_mpi_coll_description_t mpi_coll_allgather_description[] = { /* selectable allgather implementations; each entry is {name, description, function} */
+ {"default",
+ "allgather default collective",
+ smpi_mpi_allgather},
+COLL_ALLGATHERS(COLL_DESCRIPTION, COLL_COMMA), /* further entries come from this macro expansion; its definition is outside this view */
+ {NULL, NULL, NULL} /* this array must be NULL terminated */
+};
+
+s_mpi_coll_description_t mpi_coll_allreduce_description[] = { /* selectable allreduce implementations; each entry is {name, description, function} */
+ {"default",
+ "allreduce default collective",
+ smpi_mpi_allreduce},
+COLL_ALLREDUCES(COLL_DESCRIPTION, COLL_COMMA), /* further entries come from this macro expansion; its definition is outside this view */
+ {NULL, NULL, NULL} /* this array must be NULL terminated */
+};
+
s_mpi_coll_description_t mpi_coll_alltoall_description[] = {
{"ompi",
"Ompi alltoall default collective",
smpi_coll_tuned_alltoall_ompi},
-
- {"2dmesh",
- "Alltoall 2dmesh collective",
- smpi_coll_tuned_alltoall_2dmesh},
- {"3dmesh",
- "Alltoall 3dmesh collective",
- smpi_coll_tuned_alltoall_3dmesh},
- /*{"bruck",
- "Alltoall Bruck collective",
- smpi_coll_tuned_alltoall_bruck},*/
- {"pair",
- "Alltoall pair collective",
- smpi_coll_tuned_alltoall_pair},
- {"pair_light_barrier",
- "Alltoall pair_light_barrier collective",
- smpi_coll_tuned_alltoall_pair_light_barrier},
- {"pair_mpi_barrier",
- "Alltoall pair_mpi_barrier collective",
- smpi_coll_tuned_alltoall_pair_mpi_barrier},
- {"rdb",
- "Alltoall rdb collective",
- smpi_coll_tuned_alltoall_rdb},
- {"ring",
- "Alltoall ring collective",
- smpi_coll_tuned_alltoall_ring},
- {"ring_light_barrier",
- "Alltoall ring_light_barrier collective",
- smpi_coll_tuned_alltoall_ring_light_barrier},
- {"ring_light_barrier",
- "Alltoall ring_light_barrier collective",
- smpi_coll_tuned_alltoall_ring_light_barrier},
- {"ring_mpi_barrier",
- "Alltoall ring_mpi_barrier collective",
- smpi_coll_tuned_alltoall_ring_mpi_barrier},
- {"ring_one_barrier",
- "Alltoall ring_one_barrier collective",
- smpi_coll_tuned_alltoall_ring_one_barrier},
- {"simple",
- "Alltoall simple collective",
- smpi_coll_tuned_alltoall_simple},
-
+COLL_ALLTOALLS(COLL_DESCRIPTION, COLL_COMMA), /* alltoall entries produced by this macro expansion; its definition is outside this view */
{"bruck",
"Alltoall Bruck (SG) collective",
smpi_coll_tuned_alltoall_bruck},
{"pairwise",
"Alltoall pairwise (SG) collective",
smpi_coll_tuned_alltoall_pairwise},
-
{NULL, NULL, NULL} /* this array must be NULL terminated */
};
-s_mpi_coll_description_t mpi_coll_allgather_description[] = {
+/*
+ * Selectable bcast implementations; each entry is {name, description,
+ * function}.  The description text names the bcast collective (it used to
+ * be a copy of the allgather text).
+ */
+s_mpi_coll_description_t mpi_coll_bcast_description[] = {
{"default",
- "allgather default collective",
- smpi_mpi_gather},
+ "bcast default collective",
+ smpi_mpi_bcast},
+COLL_BCASTS(COLL_DESCRIPTION, COLL_COMMA),
+ {NULL, NULL, NULL} /* this array must be NULL terminated */
+};
+/*
+ * Selectable reduce implementations; each entry is {name, description,
+ * function}.  The description text names the reduce collective (it used to
+ * be a copy of the allgather text).
+ */
+s_mpi_coll_description_t mpi_coll_reduce_description[] = {
+ {"default",
+ "reduce default collective",
+ smpi_mpi_reduce},
+COLL_REDUCES(COLL_DESCRIPTION, COLL_COMMA),
{NULL, NULL, NULL} /* this array must be NULL terminated */
};
+
/** Displays the long description of all registered models, and quit */
void coll_help(const char *category, s_mpi_coll_description_t * table)
{
XBT_LOG_NEW_DEFAULT_SUBCATEGORY(smpi_coll, smpi,
"Logging specific to SMPI (coll)");
-int (*mpi_coll_alltoall_fun)(void *, int, MPI_Datatype, void*, int, MPI_Datatype, MPI_Comm);
int (*mpi_coll_allgather_fun)(void *, int, MPI_Datatype, void*, int, MPI_Datatype, MPI_Comm);
+int (*mpi_coll_allreduce_fun)(void *sbuf, void *rbuf, int rcount, MPI_Datatype dtype, MPI_Op op, MPI_Comm comm); /* allreduce implementation in use; assigned in MPI_Init */
+int (*mpi_coll_alltoall_fun)(void *, int, MPI_Datatype, void*, int, MPI_Datatype, MPI_Comm); /* alltoall implementation in use; assigned in MPI_Init */
+int (*mpi_coll_bcast_fun)(void *buf, int count, MPI_Datatype datatype, int root, MPI_Comm com); /* bcast implementation in use; assigned in MPI_Init */
+int (*mpi_coll_reduce_fun)(void *buf, void *rbuf, int count, MPI_Datatype datatype, MPI_Op op, int root, MPI_Comm comm); /* reduce implementation in use; assigned in MPI_Init */
struct s_proc_tree {
int PROCTREE_A;
int MPI_Init(int *argc, char ***argv)
{
+ int allgather_id = find_coll_description(mpi_coll_allgather_description,
+ sg_cfg_get_string("smpi/allgather")); /* value of the "smpi/allgather" setting -> position in the allgather table (presumably matched on the entry name; find_coll_description is outside this view) */
+ mpi_coll_allgather_fun = (int (*)(void *, int, MPI_Datatype,
+ void*, int, MPI_Datatype, MPI_Comm))
+ mpi_coll_allgather_description[allgather_id].coll; /* the entry's .coll member is cast to the allgather function signature */
+ /* Same selection for allreduce, driven by the "smpi/allreduce" setting. */
+ int allreduce_id = find_coll_description(mpi_coll_allreduce_description,
+ sg_cfg_get_string("smpi/allreduce"));
+ mpi_coll_allreduce_fun = (int (*)(void *sbuf, void *rbuf, int rcount, \
+ MPI_Datatype dtype, MPI_Op op, MPI_Comm comm))
+ mpi_coll_allreduce_description[allreduce_id].coll;
+ /* Same selection for alltoall, driven by the "smpi/alltoall" setting. */
int alltoall_id = find_coll_description(mpi_coll_alltoall_description,
sg_cfg_get_string("smpi/alltoall"));
mpi_coll_alltoall_fun = (int (*)(void *, int, MPI_Datatype,
void*, int, MPI_Datatype, MPI_Comm))
mpi_coll_alltoall_description[alltoall_id].coll;
- int allgather_id = find_coll_description(mpi_coll_allgather_description,
- sg_cfg_get_string("smpi/allgather"));
- mpi_coll_allgather_fun = (int (*)(void *, int, MPI_Datatype,
- void*, int, MPI_Datatype, MPI_Comm))
- mpi_coll_allgather_description[allgather_id].coll;
+ int bcast_id = find_coll_description(mpi_coll_bcast_description,
+ sg_cfg_get_string("smpi/bcast"));
+ mpi_coll_bcast_fun = (int (*)(void *buf, int count, MPI_Datatype datatype, \
+ int root, MPI_Comm com))
+ mpi_coll_bcast_description[bcast_id].coll; /* bcast implementation chosen by the "smpi/bcast" setting */
+ /* Same selection for reduce, driven by the "smpi/reduce" setting. */
+ int reduce_id = find_coll_description(mpi_coll_reduce_description,
+ sg_cfg_get_string("smpi/reduce"));
+ mpi_coll_reduce_fun = (int (*)(void *buf, void *rbuf, int count, MPI_Datatype datatype, \
+ MPI_Op op, int root, MPI_Comm comm))
+ mpi_coll_reduce_description[reduce_id].coll;
return PMPI_Init(argc, argv);
}