From 8ec703af1cf974ed8097c2e96cc051f1f0d38ac7 Mon Sep 17 00:00:00 2001 From: Augustin Degomme Date: Thu, 24 Jul 2014 15:20:25 +0200 Subject: [PATCH] cleanup a bit the code, ensure tests do pass --- src/smpi/colls/smpi_mvapich2_selector.c | 1519 +--------------- .../colls/smpi_mvapich2_selector_stampede.h | 1600 ++++++++++++++++- 2 files changed, 1553 insertions(+), 1566 deletions(-) diff --git a/src/smpi/colls/smpi_mvapich2_selector.c b/src/smpi/colls/smpi_mvapich2_selector.c index 49906afab8..32408edf52 100644 --- a/src/smpi/colls/smpi_mvapich2_selector.c +++ b/src/smpi/colls/smpi_mvapich2_selector.c @@ -3,7 +3,7 @@ /* Copyright (c) 2009-2010, 2013-2014. The SimGrid Team. * All rights reserved. */ -/* This program is free software; you can redistribute it and/or modify it +/* This program is free software; you can redistribute it and/or modify it * under the terms of the license (GNU LGPL) which comes with this package. */ #include "colls_private.h" @@ -11,249 +11,6 @@ #include "smpi_mvapich2_selector_stampede.h" -static void init_mv2_alltoall_tables_stampede(){ -int i; - int agg_table_sum = 0; -mv2_alltoall_tuning_table **table_ptrs = NULL; - mv2_alltoall_num_ppn_conf = 3; - mv2_alltoall_thresholds_table - = malloc(sizeof(mv2_alltoall_tuning_table *) - * mv2_alltoall_num_ppn_conf); - table_ptrs = malloc(sizeof(mv2_alltoall_tuning_table *) - * mv2_alltoall_num_ppn_conf); - mv2_size_alltoall_tuning_table = malloc(sizeof(int) * - mv2_alltoall_num_ppn_conf); - mv2_alltoall_table_ppn_conf =malloc(mv2_alltoall_num_ppn_conf * sizeof(int)); - mv2_alltoall_table_ppn_conf[0] = 1; - mv2_size_alltoall_tuning_table[0] = 6; - mv2_alltoall_tuning_table mv2_tmp_alltoall_thresholds_table_1ppn[] = { - {2, - 1, - {{0, -1, &MPIR_Alltoall_pairwise_MV2}, - }, - - {{0, -1, &MPIR_Alltoall_inplace_MV2}, - }, - }, - - {4, - 2, - {{0, 262144, &MPIR_Alltoall_Scatter_dest_MV2}, - {262144, -1, &MPIR_Alltoall_pairwise_MV2}, - }, - - {{0, -1, &MPIR_Alltoall_inplace_MV2}, - }, - }, - - {8, - 
2, - {{0, 8, &MPIR_Alltoall_RD_MV2}, - {8, -1, &MPIR_Alltoall_Scatter_dest_MV2}, - }, - - {{0, -1, &MPIR_Alltoall_inplace_MV2}, - }, - }, - - {16, - 3, - {{0, 64, &MPIR_Alltoall_RD_MV2}, - {64, 512, &MPIR_Alltoall_bruck_MV2}, - {512, -1, &MPIR_Alltoall_Scatter_dest_MV2}, - }, - - {{0,-1, &MPIR_Alltoall_inplace_MV2}, - }, - }, - - {32, - 3, - {{0, 32, &MPIR_Alltoall_RD_MV2}, - {32, 2048, &MPIR_Alltoall_bruck_MV2}, - {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2}, - }, - - {{0, -1, &MPIR_Alltoall_inplace_MV2}, - }, - }, - - {64, - 3, - {{0, 8, &MPIR_Alltoall_RD_MV2}, - {8, 1024, &MPIR_Alltoall_bruck_MV2}, - {1024, -1, &MPIR_Alltoall_Scatter_dest_MV2}, - }, - - {{0, -1, &MPIR_Alltoall_inplace_MV2}, - }, - }, - }; - table_ptrs[0] = mv2_tmp_alltoall_thresholds_table_1ppn; - mv2_alltoall_table_ppn_conf[1] = 2; - mv2_size_alltoall_tuning_table[1] = 6; - mv2_alltoall_tuning_table mv2_tmp_alltoall_thresholds_table_2ppn[] = { - {4, - 2, - {{0, 32, &MPIR_Alltoall_RD_MV2}, - {32, -1, &MPIR_Alltoall_Scatter_dest_MV2}, - }, - - {{0, -1, &MPIR_Alltoall_inplace_MV2}, - }, - }, - - {8, - 2, - {{0, 64, &MPIR_Alltoall_RD_MV2}, - {64, -1, &MPIR_Alltoall_Scatter_dest_MV2}, - }, - - {{0, -1, &MPIR_Alltoall_inplace_MV2}, - }, - }, - - {16, - 3, - {{0, 64, &MPIR_Alltoall_RD_MV2}, - {64, 2048, &MPIR_Alltoall_bruck_MV2}, - {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2}, - }, - - {{0,-1, &MPIR_Alltoall_inplace_MV2}, - }, - }, - - {32, - 3, - {{0, 16, &MPIR_Alltoall_RD_MV2}, - {16, 2048, &MPIR_Alltoall_bruck_MV2}, - {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2}, - }, - - {{0, -1, &MPIR_Alltoall_inplace_MV2}, - }, - }, - - {64, - 3, - {{0, 8, &MPIR_Alltoall_RD_MV2}, - {8, 1024, &MPIR_Alltoall_bruck_MV2}, - {1024, -1, &MPIR_Alltoall_Scatter_dest_MV2}, - }, - - {{0, -1, &MPIR_Alltoall_inplace_MV2}, - }, - }, - - {128, - 3, - {{0, 4, &MPIR_Alltoall_RD_MV2}, - {4, 2048, &MPIR_Alltoall_bruck_MV2}, - {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2}, - }, - - {{0, -1, &MPIR_Alltoall_inplace_MV2}, - }, - }, - }; - 
table_ptrs[1] = mv2_tmp_alltoall_thresholds_table_2ppn; - mv2_alltoall_table_ppn_conf[2] = 16; - mv2_size_alltoall_tuning_table[2] = 7; - mv2_alltoall_tuning_table mv2_tmp_alltoall_thresholds_table_16ppn[] = { - {16, - 2, - {{0, 2048, &MPIR_Alltoall_bruck_MV2}, - {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2}, - }, - - {{32768, -1, &MPIR_Alltoall_inplace_MV2}, - }, - }, - - {32, - 2, - {{0, 2048, &MPIR_Alltoall_bruck_MV2}, - {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2}, - }, - - {{16384, -1, &MPIR_Alltoall_inplace_MV2}, - }, - }, - - {64, - 3, - {{0, 2048, &MPIR_Alltoall_bruck_MV2}, - {2048, 16384, &MPIR_Alltoall_Scatter_dest_MV2}, - {16384, -1, &MPIR_Alltoall_pairwise_MV2}, - }, - - {{32768, 131072, &MPIR_Alltoall_inplace_MV2}, - }, - }, - - {128, - 2, - {{0, 2048, &MPIR_Alltoall_bruck_MV2}, - {2048, -1, &MPIR_Alltoall_pairwise_MV2}, - }, - - {{16384,65536, &MPIR_Alltoall_inplace_MV2}, - }, - }, - - {256, - 2, - {{0, 1024, &MPIR_Alltoall_bruck_MV2}, - {1024, -1, &MPIR_Alltoall_pairwise_MV2}, - }, - - {{16384, 65536, &MPIR_Alltoall_inplace_MV2}, - }, - }, - - {512, - 2, - {{0, 1024, &MPIR_Alltoall_bruck_MV2}, - {1024, -1, &MPIR_Alltoall_pairwise_MV2}, - }, - - {{16384, 65536, &MPIR_Alltoall_inplace_MV2}, - }, - }, - {1024, - 2, - {{0, 1024, &MPIR_Alltoall_bruck_MV2}, - {1024, -1, &MPIR_Alltoall_pairwise_MV2}, - }, - - {{16384, 65536, &MPIR_Alltoall_inplace_MV2}, - }, - }, - - }; - table_ptrs[2] = mv2_tmp_alltoall_thresholds_table_16ppn; - agg_table_sum = 0; - for (i = 0; i < mv2_alltoall_num_ppn_conf; i++) { - agg_table_sum += mv2_size_alltoall_tuning_table[i]; - } - mv2_alltoall_thresholds_table[0] = - malloc(agg_table_sum * sizeof (mv2_alltoall_tuning_table)); - memcpy(mv2_alltoall_thresholds_table[0], table_ptrs[0], - (sizeof(mv2_alltoall_tuning_table) - * mv2_size_alltoall_tuning_table[0])); - for (i = 1; i < mv2_alltoall_num_ppn_conf; i++) { - mv2_alltoall_thresholds_table[i] = - mv2_alltoall_thresholds_table[i - 1] - + mv2_size_alltoall_tuning_table[i - 1]; - 
memcpy(mv2_alltoall_thresholds_table[i], table_ptrs[i], - (sizeof(mv2_alltoall_tuning_table) - * mv2_size_alltoall_tuning_table[i])); - } - free(table_ptrs); - - -} int smpi_coll_tuned_alltoall_mvapich2( void *sendbuf, int sendcount, MPI_Datatype sendtype, @@ -304,7 +61,7 @@ int smpi_coll_tuned_alltoall_mvapich2( void *sendbuf, int sendcount, mv2_alltoall_thresholds_table[conf_index][range].in_place_algo_table[range_threshold].min ||nbytes > mv2_alltoall_thresholds_table[conf_index][range].in_place_algo_table[range_threshold].max ) { - tmp_buf = (char *)malloc( comm_size * recvcount * recvtype_size ); + tmp_buf = (char *)xbt_malloc( comm_size * recvcount * recvtype_size ); mpi_errno = smpi_datatype_copy((char *)recvbuf, comm_size*recvcount, recvtype, (char *)tmp_buf, @@ -313,7 +70,7 @@ int smpi_coll_tuned_alltoall_mvapich2( void *sendbuf, int sendcount, mpi_errno = MV2_Alltoall_function(tmp_buf, recvcount, recvtype, recvbuf, recvcount, recvtype, comm ); - free(tmp_buf); + xbt_free(tmp_buf); } else { mpi_errno = MPIR_Alltoall_inplace_MV2(sendbuf, sendcount, sendtype, recvbuf, recvcount, recvtype, @@ -326,221 +83,6 @@ int smpi_coll_tuned_alltoall_mvapich2( void *sendbuf, int sendcount, } -static void init_mv2_allgather_tables_stampede(){ -int i; - int agg_table_sum = 0; -mv2_allgather_tuning_table **table_ptrs = NULL; - mv2_allgather_num_ppn_conf = 3; - mv2_allgather_thresholds_table - = malloc(sizeof(mv2_allgather_tuning_table *) - * mv2_allgather_num_ppn_conf); - table_ptrs = malloc(sizeof(mv2_allgather_tuning_table *) - * mv2_allgather_num_ppn_conf); - mv2_size_allgather_tuning_table = malloc(sizeof(int) * - mv2_allgather_num_ppn_conf); - mv2_allgather_table_ppn_conf - = malloc(mv2_allgather_num_ppn_conf * sizeof(int)); - mv2_allgather_table_ppn_conf[0] = 1; - mv2_size_allgather_tuning_table[0] = 6; - mv2_allgather_tuning_table mv2_tmp_allgather_thresholds_table_1ppn[] = { - { - 2, - {0}, - 1, - { - {0, -1, &MPIR_Allgather_Ring_MV2}, - }, - }, - { - 4, - {0,0}, - 
2, - { - {0, 262144, &MPIR_Allgather_RD_MV2}, - {262144, -1, &MPIR_Allgather_Ring_MV2}, - }, - }, - { - 8, - {0,0}, - 2, - { - {0, 131072, &MPIR_Allgather_RD_MV2}, - {131072, -1, &MPIR_Allgather_Ring_MV2}, - }, - }, - { - 16, - {0,0}, - 2, - { - {0, 131072, &MPIR_Allgather_RD_MV2}, - {131072, -1, &MPIR_Allgather_Ring_MV2}, - }, - }, - { - 32, - {0,0}, - 2, - { - {0, 65536, &MPIR_Allgather_RD_MV2}, - {65536, -1, &MPIR_Allgather_Ring_MV2}, - }, - }, - { - 64, - {0,0}, - 2, - { - {0, 32768, &MPIR_Allgather_RD_MV2}, - {32768, -1, &MPIR_Allgather_Ring_MV2}, - }, - }, - }; - table_ptrs[0] = mv2_tmp_allgather_thresholds_table_1ppn; - mv2_allgather_table_ppn_conf[1] = 2; - mv2_size_allgather_tuning_table[1] = 6; - mv2_allgather_tuning_table mv2_tmp_allgather_thresholds_table_2ppn[] = { - { - 4, - {0,0}, - 2, - { - {0, 524288, &MPIR_Allgather_RD_MV2}, - {524288, -1, &MPIR_Allgather_Ring_MV2}, - }, - }, - { - 8, - {0,1,0}, - 2, - { - {0, 32768, &MPIR_Allgather_RD_MV2}, - {32768, 524288, &MPIR_Allgather_Ring_MV2}, - {524288, -1, &MPIR_Allgather_Ring_MV2}, - }, - }, - { - 16, - {0,1,0}, - 2, - { - {0, 16384, &MPIR_Allgather_RD_MV2}, - {16384, 524288, &MPIR_Allgather_Ring_MV2}, - {524288, -1, &MPIR_Allgather_Ring_MV2}, - }, - }, - { - 32, - {1,1,0}, - 2, - { - {0, 65536, &MPIR_Allgather_RD_MV2}, - {65536, 524288, &MPIR_Allgather_Ring_MV2}, - {524288, -1, &MPIR_Allgather_Ring_MV2}, - }, - }, - { - 64, - {1,1,0}, - 2, - { - {0, 32768, &MPIR_Allgather_RD_MV2}, - {32768, 524288, &MPIR_Allgather_Ring_MV2}, - {524288, -1, &MPIR_Allgather_Ring_MV2}, - }, - }, - { - 128, - {1,1,0}, - 2, - { - {0, 65536, &MPIR_Allgather_RD_MV2}, - {65536, 524288, &MPIR_Allgather_Ring_MV2}, - {524288, -1, &MPIR_Allgather_Ring_MV2}, - }, - }, - }; - table_ptrs[1] = mv2_tmp_allgather_thresholds_table_2ppn; - mv2_allgather_table_ppn_conf[2] = 16; - mv2_size_allgather_tuning_table[2] = 6; - mv2_allgather_tuning_table mv2_tmp_allgather_thresholds_table_16ppn[] = { - { - 16, - {0,0}, - 2, - { - {0, 1024, 
&MPIR_Allgather_RD_MV2}, - {1024, -1, &MPIR_Allgather_Ring_MV2}, - }, - }, - { - 32, - {0,0}, - 2, - { - {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2}, - {1024, -1, &MPIR_Allgather_Ring_MV2}, - }, - }, - { - 64, - {0,0}, - 2, - { - {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2}, - {1024, -1, &MPIR_Allgather_Ring_MV2}, - }, - }, - { - 128, - {0,0}, - 2, - { - {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2}, - {1024, -1, &MPIR_Allgather_Ring_MV2}, - }, - }, - { - 256, - {0,0}, - 2, - { - {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2}, - {1024, -1, &MPIR_Allgather_Ring_MV2}, - }, - }, - { - 512, - {0,0}, - 2, - { - {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2}, - {1024, -1, &MPIR_Allgather_Ring_MV2}, - }, - }, - - }; - table_ptrs[2] = mv2_tmp_allgather_thresholds_table_16ppn; - agg_table_sum = 0; - for (i = 0; i < mv2_allgather_num_ppn_conf; i++) { - agg_table_sum += mv2_size_allgather_tuning_table[i]; - } - mv2_allgather_thresholds_table[0] = - malloc(agg_table_sum * sizeof (mv2_allgather_tuning_table)); - memcpy(mv2_allgather_thresholds_table[0], table_ptrs[0], - (sizeof(mv2_allgather_tuning_table) - * mv2_size_allgather_tuning_table[0])); - for (i = 1; i < mv2_allgather_num_ppn_conf; i++) { - mv2_allgather_thresholds_table[i] = - mv2_allgather_thresholds_table[i - 1] - + mv2_size_allgather_tuning_table[i - 1]; - memcpy(mv2_allgather_thresholds_table[i], table_ptrs[i], - (sizeof(mv2_allgather_tuning_table) - * mv2_size_allgather_tuning_table[i])); - } - free(table_ptrs); -} int smpi_coll_tuned_allgather_mvapich2(void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype, @@ -647,53 +189,6 @@ int smpi_coll_tuned_allgather_mvapich2(void *sendbuf, int sendcount, MPI_Datatyp return mpi_errno; } -static void init_mv2_gather_tables_stampede(){ - - mv2_size_gather_tuning_table=7; - mv2_gather_thresholds_table = malloc(mv2_size_gather_tuning_table* - sizeof (mv2_gather_tuning_table)); - mv2_gather_tuning_table 
mv2_tmp_gather_thresholds_table[]={ - {16, - 2,{{0, 524288, &MPIR_Gather_MV2_Direct}, - {524288, -1, &MPIR_Gather_intra}}, - 1,{{0, -1, &MPIR_Gather_MV2_Direct}}}, - {32, - 3,{{0, 16384, &MPIR_Gather_MV2_Direct}, - {16384, 131072, &MPIR_Gather_intra}, - {131072, -1, &MPIR_Gather_MV2_two_level_Direct}}, - 1,{{0, -1, &MPIR_Gather_intra}}}, - {64, - 3,{{0, 256, &MPIR_Gather_MV2_two_level_Direct}, - {256, 16384, &MPIR_Gather_MV2_Direct}, - {256, -1, &MPIR_Gather_MV2_two_level_Direct}}, - 1,{{0, -1, &MPIR_Gather_intra}}}, - {128, - 3,{{0, 512, &MPIR_Gather_MV2_two_level_Direct}, - {512, 16384, &MPIR_Gather_MV2_Direct}, - {16384, -1, &MPIR_Gather_MV2_two_level_Direct}}, - 1,{{0, -1, &MPIR_Gather_intra}}}, - {256, - 3,{{0, 512, &MPIR_Gather_MV2_two_level_Direct}, - {512, 16384, &MPIR_Gather_MV2_Direct}, - {16384, -1, &MPIR_Gather_MV2_two_level_Direct}}, - 1,{{0, -1, &MPIR_Gather_intra}}}, - {512, - 3,{{0, 512, &MPIR_Gather_MV2_two_level_Direct}, - {512, 16384, &MPIR_Gather_MV2_Direct}, - {8196, -1, &MPIR_Gather_MV2_two_level_Direct}}, - 1,{{0, -1, &MPIR_Gather_intra}}}, - {1024, - 3,{{0, 512, &MPIR_Gather_MV2_two_level_Direct}, - {512, 16384, &MPIR_Gather_MV2_Direct}, - {8196, -1, &MPIR_Gather_MV2_two_level_Direct}}, - 1,{{0, -1, &MPIR_Gather_intra}}}, - }; - - memcpy(mv2_gather_thresholds_table, mv2_tmp_gather_thresholds_table, - mv2_size_gather_tuning_table * sizeof (mv2_gather_tuning_table)); - -} - int smpi_coll_tuned_gather_mvapich2(void *sendbuf, int sendcnt, @@ -774,72 +269,6 @@ int smpi_coll_tuned_gather_mvapich2(void *sendbuf, } - -static void init_mv2_allgatherv_tables_stampede(){ - mv2_size_allgatherv_tuning_table = 6; - mv2_allgatherv_thresholds_table = malloc(mv2_size_allgatherv_tuning_table * - sizeof (mv2_allgatherv_tuning_table)); - mv2_allgatherv_tuning_table mv2_tmp_allgatherv_thresholds_table[] = { - { - 16, - 2, - { - {0, 512, &MPIR_Allgatherv_Rec_Doubling_MV2}, - {512, -1, &MPIR_Allgatherv_Ring_MV2}, - }, - }, - { - 32, - 2, - { - {0, 512, 
&MPIR_Allgatherv_Rec_Doubling_MV2}, - {512, -1, &MPIR_Allgatherv_Ring_MV2}, - }, - }, - { - 64, - 2, - { - {0, 256, &MPIR_Allgatherv_Rec_Doubling_MV2}, - {256, -1, &MPIR_Allgatherv_Ring_MV2}, - }, - }, - { - 128, - 2, - { - {0, 256, &MPIR_Allgatherv_Rec_Doubling_MV2}, - {256, -1, &MPIR_Allgatherv_Ring_MV2}, - }, - }, - { - 256, - 2, - { - {0, 256, &MPIR_Allgatherv_Rec_Doubling_MV2}, - {256, -1, &MPIR_Allgatherv_Ring_MV2}, - }, - }, - { - 512, - 2, - { - {0, 256, &MPIR_Allgatherv_Rec_Doubling_MV2}, - {256, -1, &MPIR_Allgatherv_Ring_MV2}, - }, - }, - - }; - memcpy(mv2_allgatherv_thresholds_table, mv2_tmp_allgatherv_thresholds_table, - mv2_size_allgatherv_tuning_table * sizeof (mv2_allgatherv_tuning_table)); -} - - - - - - - int smpi_coll_tuned_allgatherv_mvapich2(void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, int *recvcounts, int *displs, MPI_Datatype recvtype, MPI_Comm comm ) @@ -905,147 +334,6 @@ int smpi_coll_tuned_allgatherv_mvapich2(void *sendbuf, int sendcount, MPI_Dataty } -static void init_mv2_allreduce_tables_stampede(){ -mv2_size_allreduce_tuning_table = 8; - mv2_allreduce_thresholds_table = malloc(mv2_size_allreduce_tuning_table * - sizeof (mv2_allreduce_tuning_table)); - mv2_allreduce_tuning_table mv2_tmp_allreduce_thresholds_table[] = { - { - 16, - 0, - {1, 0}, - 2, - { - {0, 1024, &MPIR_Allreduce_pt2pt_rd_MV2}, - {1024, -1, &MPIR_Allreduce_pt2pt_rs_MV2}, - }, - 2, - { - {0, 1024, &MPIR_Allreduce_reduce_shmem_MV2}, - {1024, -1, &MPIR_Allreduce_reduce_p2p_MV2}, - }, - }, - { - 32, - 0, - {1, 1, 0}, - 3, - { - {0, 1024, &MPIR_Allreduce_pt2pt_rd_MV2}, - {1024, 16384, &MPIR_Allreduce_pt2pt_rd_MV2}, - {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2}, - }, - 2, - { - {0, 1024, &MPIR_Allreduce_reduce_shmem_MV2}, - {1024, 16384, &MPIR_Allreduce_reduce_p2p_MV2}, - }, - }, - { - 64, - 0, - {1, 1, 0}, - 3, - { - {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2}, - {512, 16384, &MPIR_Allreduce_pt2pt_rd_MV2}, - {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2}, - }, - 2, 
- { - {0, 512, &MPIR_Allreduce_reduce_shmem_MV2}, - {512, 16384, &MPIR_Allreduce_reduce_p2p_MV2}, - }, - }, - { - 128, - 0, - {1, 1, 0}, - 3, - { - {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2}, - {512, 16384, &MPIR_Allreduce_pt2pt_rd_MV2}, - {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2}, - }, - 2, - { - {0, 512, &MPIR_Allreduce_reduce_shmem_MV2}, - {512, 16384, &MPIR_Allreduce_reduce_p2p_MV2}, - }, - }, - { - 256, - 0, - {1, 1, 0}, - 3, - { - {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2}, - {512, 16384, &MPIR_Allreduce_pt2pt_rd_MV2}, - {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2}, - }, - 2, - { - {0, 512, &MPIR_Allreduce_reduce_shmem_MV2}, - {512, -1, &MPIR_Allreduce_reduce_p2p_MV2}, - }, - }, - { - 512, - 0, - {1, 1, 0}, - 3, - { - {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2}, - {512, 16384, &MPIR_Allreduce_pt2pt_rd_MV2}, - {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2}, - }, - 2, - { - {0, 512, &MPIR_Allreduce_reduce_shmem_MV2}, - {512, 16384, &MPIR_Allreduce_reduce_p2p_MV2}, - }, - }, - { - 1024, - 0, - {1, 1, 1, 0}, - 4, - { - {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2}, - {512, 8192, &MPIR_Allreduce_pt2pt_rd_MV2}, - {8192, 65536, &MPIR_Allreduce_pt2pt_rs_MV2}, - {65536, -1, &MPIR_Allreduce_pt2pt_rs_MV2}, - }, - 2, - { - {0, 512, &MPIR_Allreduce_reduce_shmem_MV2}, - {512, -1, &MPIR_Allreduce_reduce_p2p_MV2}, - }, - }, - { - 2048, - 0, - {1, 1, 1, 0}, - 4, - { - {0, 64, &MPIR_Allreduce_pt2pt_rd_MV2}, - {64, 512, &MPIR_Allreduce_reduce_p2p_MV2}, - {512, 4096, &MPIR_Allreduce_mcst_reduce_two_level_helper_MV2}, - {4096, 16384, &MPIR_Allreduce_pt2pt_rs_MV2}, - {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2}, - }, - 2, - { - {0, 512, &MPIR_Allreduce_reduce_shmem_MV2}, - {512, -1, &MPIR_Allreduce_reduce_p2p_MV2}, - }, - }, - - }; - memcpy(mv2_allreduce_thresholds_table, mv2_tmp_allreduce_thresholds_table, - mv2_size_allreduce_tuning_table * sizeof (mv2_allreduce_tuning_table)); -} - int smpi_coll_tuned_allreduce_mvapich2(void *sendbuf, void *recvbuf, @@ -1179,7 +467,7 @@ if (sbuf == MPI_IN_PLACE) { rbuf, 
rcounts, rdisps,rdtype, comm); } else /* For starters, just keep the original algorithm. */ - return smpi_coll_tuned_alltoallv_pair(sbuf, scounts, sdisps, sdtype, + return smpi_coll_tuned_alltoallv_ring(sbuf, scounts, sdisps, sdtype, rbuf, rcounts, rdisps,rdtype, comm); } @@ -1191,215 +479,6 @@ int smpi_coll_tuned_barrier_mvapich2(MPI_Comm comm) } -/* -static void init_mv2_bcast_tables_stampede(){ - //Stampede, - mv2_size_bcast_tuning_table=8; - mv2_bcast_thresholds_table = malloc(mv2_size_bcast_tuning_table * - sizeof (mv2_bcast_tuning_table)); - - mv2_bcast_tuning_table mv2_tmp_bcast_thresholds_table[]={ - { - 16, - 8192, 4, 4, - {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}, - 11, - { - {0, 8, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2}, - {8, 16, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4}, - {16, 1024, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2}, - {1024, 8192, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4}, - {8192, 16384, &MPIR_Bcast_binomial_MV2, -1}, - {16384, 32768, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4}, - {32768, 65536, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2}, - {65536, 131072, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1}, - {131072, 262144, &MPIR_Bcast_scatter_ring_allgather_MV2, -1}, - {262144, 524288, &MPIR_Bcast_scatter_doubling_allgather_MV2, -1}, - {524288, -1, &MPIR_Bcast_scatter_ring_allgather_MV2, -1} - }, - 11, - { - {0, 8, &MPIR_Shmem_Bcast_MV2, 2}, - {8, 16, &MPIR_Shmem_Bcast_MV2, 4}, - {16, 1024, &MPIR_Shmem_Bcast_MV2, 2}, - {1024, 8192, &MPIR_Shmem_Bcast_MV2, 4}, - {8192, 16384, &MPIR_Shmem_Bcast_MV2, -1}, - {16384, 32768, &MPIR_Shmem_Bcast_MV2, 4}, - {32768, 65536, &MPIR_Shmem_Bcast_MV2, 2}, - {65536, 131072, &MPIR_Shmem_Bcast_MV2, -1}, - {131072, 262144, &MPIR_Shmem_Bcast_MV2, -1}, - {262144, 524288, &MPIR_Shmem_Bcast_MV2, -1}, - {524288, -1, &MPIR_Shmem_Bcast_MV2, -1} - } - }, - { - 32, - 8192, 4, 4, - {1, 1, 1, 1, 1, 1, 1, 1}, - 8, - { - {0, 128, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2}, - {128, 256, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4}, - {256, 32768, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2}, - {32768, 
65536, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4}, - {65536, 131072, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2}, - {131072, 262144, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8}, - {262144, 524288, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2}, - {524288, -1, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8} - }, - 8, - { - {0, 128, &MPIR_Shmem_Bcast_MV2, 2}, - {128, 256, &MPIR_Shmem_Bcast_MV2, 4}, - {256, 32768, &MPIR_Shmem_Bcast_MV2, 2}, - {32768, 65536, &MPIR_Shmem_Bcast_MV2, 4}, - {65536, 131072, &MPIR_Shmem_Bcast_MV2, 2}, - {131072, 262144, &MPIR_Shmem_Bcast_MV2, 8}, - {262144, 524288, &MPIR_Shmem_Bcast_MV2, 2}, - {524288, -1, &MPIR_Shmem_Bcast_MV2, 8} - } - }, - { - 64, - 8192, 4, 4, - {1, 1, 1, 1, 1, 1, 1, 1, 1}, - 9, - { - {0, 2, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4}, - {2, 4, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8}, - {4, 16, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4}, - {16, 32, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8}, - {32, 128, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4}, - {128, 256, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8}, - {256, 4096, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4}, - {4096, 32768, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8}, - {32768, -1, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2} - }, - 9, - { - {0, 2, &MPIR_Shmem_Bcast_MV2, 4}, - {2, 4, &MPIR_Shmem_Bcast_MV2, 8}, - {4, 16, &MPIR_Shmem_Bcast_MV2, 4}, - {16, 32, &MPIR_Shmem_Bcast_MV2, 8}, - {32, 128, &MPIR_Shmem_Bcast_MV2, 4}, - {128, 256, &MPIR_Shmem_Bcast_MV2, 8}, - {256, 4096, &MPIR_Shmem_Bcast_MV2, 4}, - {4096, 32768, &MPIR_Shmem_Bcast_MV2, 8}, - {32768, -1, &MPIR_Shmem_Bcast_MV2, 2} - } - }, - { - 128, - 8192, 4, 4, - {1, 1, 1, 0}, - 4, - { - {0, 8192, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8}, - {8192, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4}, - {16384, 524288, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2}, - {524288, -1, &MPIR_Bcast_scatter_ring_allgather_MV2, -1} - }, - 4, - { - {0, 8192, &MPIR_Shmem_Bcast_MV2, 8}, - {8192, 16384, &MPIR_Shmem_Bcast_MV2, 4}, - {16384, 524288, &MPIR_Shmem_Bcast_MV2, 2}, - {524288, -1, NULL, -1} - } - }, - { - 256, - 8192, 4, 4, - {1, 1, 1, 1, 1}, - 5, - { - {0, 16384, 
&MPIR_Pipelined_Bcast_Zcpy_MV2, 4}, - {16384, 131072, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2}, - {131072, 262144, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1}, - {262144, 524288, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2}, - {524288, -1, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1} - }, - 5, - { - {0, 16384, &MPIR_Shmem_Bcast_MV2, 4}, - {16384, 131072, &MPIR_Shmem_Bcast_MV2, 2}, - {131072, 262144, &MPIR_Shmem_Bcast_MV2, -1}, - {262144, 524288, &MPIR_Shmem_Bcast_MV2, 2}, - {524288, -1, &MPIR_Shmem_Bcast_MV2, -1} - } - }, - { - 512, - 8192, 4, 4, - {1, 1, 1, 1, 1}, - 5, - { - {0, 4096, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8}, - {4096, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4}, - {16384, 131072, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2}, - {131072, 262144, &MPIR_Pipelined_Bcast_MV2, -1}, - {262144, -1, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1} - }, - 5, - { - {0, 4096, &MPIR_Shmem_Bcast_MV2, 8}, - {4096, 16384, &MPIR_Shmem_Bcast_MV2, 4}, - {16384, 131072, &MPIR_Shmem_Bcast_MV2, 2}, - {131072, 262144, &MPIR_Shmem_Bcast_MV2, -1}, - {262144, -1, &MPIR_Shmem_Bcast_MV2, -1} - } - }, - { - 1024, - 8192, 4, 4, - {1, 1, 1, 1, 1}, - 5, - { - {0, 8192, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8}, - {8192, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4}, - {16384, 65536, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2}, - {65536, 524288, &MPIR_Pipelined_Bcast_MV2, -1}, - {524288, -1, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1} - }, - 5, - { - {0, 8192, &MPIR_Shmem_Bcast_MV2, 8}, - {8192, 16384, &MPIR_Shmem_Bcast_MV2, 4}, - {16384, 65536, &MPIR_Shmem_Bcast_MV2, 2}, - {65536, 524288, &MPIR_Shmem_Bcast_MV2, -1}, - {524288, -1, &MPIR_Shmem_Bcast_MV2, -1} - } - }, - { - 2048, - 8192, 4, 4, - {1, 1, 1, 1, 1, 1, 1}, - 7, - { - {0, 16, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8}, - {16, 32, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4}, - {32, 4096, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8}, - {4096, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4}, - {16384, 32768, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2}, - {32768, 524288, &MPIR_Pipelined_Bcast_MV2, -1}, - 
{524288, -1, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1} - }, - 7, - { - {0, 16, &MPIR_Shmem_Bcast_MV2, 8}, - {16, 32, &MPIR_Shmem_Bcast_MV2, 4}, - {32, 4096, &MPIR_Shmem_Bcast_MV2, 8}, - {4096, 16384, &MPIR_Shmem_Bcast_MV2, 4}, - {16384, 32768, &MPIR_Shmem_Bcast_MV2, 2}, - {32768, 524288, &MPIR_Shmem_Bcast_MV2, -1}, - {524288, -1, &MPIR_Shmem_Bcast_MV2, -1} - } - } - }; - - memcpy(mv2_bcast_thresholds_table, mv2_tmp_bcast_thresholds_table, - mv2_size_bcast_tuning_table * sizeof (mv2_bcast_tuning_table)); -}*/ int smpi_coll_tuned_bcast_mvapich2(void *buffer, @@ -1413,199 +492,6 @@ int smpi_coll_tuned_bcast_mvapich2(void *buffer, } -static void init_mv2_reduce_tables_stampede(){ - /*Stampede*/ - mv2_size_reduce_tuning_table = 8; - mv2_reduce_thresholds_table = malloc(mv2_size_reduce_tuning_table * - sizeof (mv2_reduce_tuning_table)); - mv2_reduce_tuning_table mv2_tmp_reduce_thresholds_table[] = { - { - 16, - 4, - 4, - {1, 0, 0}, - 3, - { - {0, 262144, &MPIR_Reduce_inter_knomial_wrapper_MV2}, - {262144, 1048576, &MPIR_Reduce_binomial_MV2}, - {1048576, -1, &MPIR_Reduce_redscat_gather_MV2}, - }, - 2, - { - {0, 65536, &MPIR_Reduce_shmem_MV2}, - {65536,-1, &MPIR_Reduce_binomial_MV2}, - }, - }, - { - 32, - 4, - 4, - {1, 1, 1, 1, 0, 0, 0}, - 7, - { - {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2}, - {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2}, - {16384, 32768, &MPIR_Reduce_inter_knomial_wrapper_MV2}, - {32768, 65536, &MPIR_Reduce_binomial_MV2}, - {65536, 262144, &MPIR_Reduce_inter_knomial_wrapper_MV2}, - {262144, 1048576, &MPIR_Reduce_binomial_MV2}, - {1048576, -1, &MPIR_Reduce_redscat_gather_MV2}, - }, - 6, - { - {0, 8192, &MPIR_Reduce_shmem_MV2}, - {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2}, - {16384, 32768, &MPIR_Reduce_shmem_MV2}, - {32768, 65536, &MPIR_Reduce_shmem_MV2}, - {65536, 262144, &MPIR_Reduce_shmem_MV2}, - {262144,-1, &MPIR_Reduce_binomial_MV2}, - }, - }, - { - 64, - 4, - 4, - {1, 1, 1, 1, 0}, - 5, - { - {0, 8192, 
&MPIR_Reduce_inter_knomial_wrapper_MV2}, - {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2}, - {16384, 65536, &MPIR_Reduce_binomial_MV2}, - {65536, 262144, &MPIR_Reduce_inter_knomial_wrapper_MV2}, - {262144, -1, &MPIR_Reduce_redscat_gather_MV2}, - }, - 5, - { - {0, 8192, &MPIR_Reduce_shmem_MV2}, - {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2}, - {16384, 65536, &MPIR_Reduce_shmem_MV2}, - {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2}, - {262144, -1, &MPIR_Reduce_binomial_MV2}, - }, - }, - { - 128, - 4, - 4, - {1, 0, 1, 0, 1, 0}, - 6, - { - {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2}, - {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2}, - {16384, 65536, &MPIR_Reduce_binomial_MV2}, - {65536, 262144, &MPIR_Reduce_inter_knomial_wrapper_MV2}, - {262144, 1048576, &MPIR_Reduce_binomial_MV2}, - {1048576, -1, &MPIR_Reduce_redscat_gather_MV2}, - }, - 5, - { - {0, 8192, &MPIR_Reduce_shmem_MV2}, - {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2}, - {16384, 65536, &MPIR_Reduce_shmem_MV2}, - {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2}, - {262144, -1, &MPIR_Reduce_binomial_MV2}, - }, - }, - { - 256, - 4, - 4, - {1, 1, 1, 0, 1, 1, 0}, - 7, - { - {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2}, - {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2}, - {16384, 32768, &MPIR_Reduce_binomial_MV2}, - {32768, 65536, &MPIR_Reduce_binomial_MV2}, - {65536, 262144, &MPIR_Reduce_binomial_MV2}, - {262144, 1048576, &MPIR_Reduce_binomial_MV2}, - {1048576, -1, &MPIR_Reduce_redscat_gather_MV2}, - }, - 6, - { - {0, 8192, &MPIR_Reduce_shmem_MV2}, - {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2}, - {16384, 32768, &MPIR_Reduce_shmem_MV2}, - {32768, 65536, &MPIR_Reduce_shmem_MV2}, - {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2}, - {262144, -1, &MPIR_Reduce_binomial_MV2}, - }, - }, - { - 512, - 4, - 4, - {1, 0, 1, 1, 1, 0}, - 6, - { - {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2}, - {8192, 16384, 
&MPIR_Reduce_inter_knomial_wrapper_MV2}, - {16384, 65536, &MPIR_Reduce_binomial_MV2}, - {65536, 262144, &MPIR_Reduce_binomial_MV2}, - {262144, 1048576, &MPIR_Reduce_binomial_MV2}, - {1048576, -1, &MPIR_Reduce_redscat_gather_MV2}, - }, - 5, - { - {0, 8192, &MPIR_Reduce_shmem_MV2}, - {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2}, - {16384, 65536, &MPIR_Reduce_shmem_MV2}, - {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2}, - {262144, -1, &MPIR_Reduce_binomial_MV2}, - }, - }, - { - 1024, - 4, - 4, - {1, 0, 1, 1, 1}, - 5, - { - {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2}, - {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2}, - {16384, 65536, &MPIR_Reduce_binomial_MV2}, - {65536, 262144, &MPIR_Reduce_binomial_MV2}, - {262144, -1, &MPIR_Reduce_binomial_MV2}, - }, - 5, - { - {0, 8192, &MPIR_Reduce_shmem_MV2}, - {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2}, - {16384, 65536, &MPIR_Reduce_shmem_MV2}, - {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2}, - {262144, -1, &MPIR_Reduce_binomial_MV2}, - }, - }, - { - 2048, - 4, - 4, - {1, 0, 1, 1, 1,1}, - 6, - { - {0, 2048, &MPIR_Reduce_inter_knomial_wrapper_MV2}, - {2048, 4096, &MPIR_Reduce_inter_knomial_wrapper_MV2}, - {4096, 16384, &MPIR_Reduce_binomial_MV2}, - {16384, 65536, &MPIR_Reduce_binomial_MV2}, - {65536, 131072, &MPIR_Reduce_binomial_MV2}, - {131072, -1, &MPIR_Reduce_binomial_MV2}, - }, - 6, - { - {0, 2048, &MPIR_Reduce_shmem_MV2}, - {2048, 4096, &MPIR_Reduce_shmem_MV2}, - {4096, 16384, &MPIR_Reduce_shmem_MV2}, - {16384, 65536, &MPIR_Reduce_intra_knomial_wrapper_MV2}, - {65536, 131072, &MPIR_Reduce_binomial_MV2}, - {131072, -1, &MPIR_Reduce_shmem_MV2}, - }, - }, - - }; - memcpy(mv2_reduce_thresholds_table, mv2_tmp_reduce_thresholds_table, - mv2_size_reduce_tuning_table * sizeof (mv2_reduce_tuning_table)); -} - int smpi_coll_tuned_reduce_mvapich2( void *sendbuf, @@ -1724,69 +610,6 @@ int smpi_coll_tuned_reduce_mvapich2( void *sendbuf, } - -static void 
init_mv2_reduce_scatter_tables_stampede(){ - mv2_size_red_scat_tuning_table = 6; - mv2_red_scat_thresholds_table = malloc(mv2_size_red_scat_tuning_table * - sizeof (mv2_red_scat_tuning_table)); - mv2_red_scat_tuning_table mv2_tmp_red_scat_thresholds_table[] = { - { - 16, - 3, - { - {0, 64, &MPIR_Reduce_Scatter_Basic_MV2}, - {64, 65536, &MPIR_Reduce_scatter_Rec_Halving_MV2}, - {65536, -1, &MPIR_Reduce_scatter_Pair_Wise_MV2}, - }, - }, - { - 32, - 3, - { - {0, 64, &MPIR_Reduce_Scatter_Basic_MV2}, - {64, 131072, &MPIR_Reduce_scatter_Rec_Halving_MV2}, - {131072, -1, &MPIR_Reduce_scatter_Pair_Wise_MV2}, - }, - }, - { - 64, - 3, - { - {0, 1024, &MPIR_Reduce_Scatter_Basic_MV2}, - {1024, 262144, &MPIR_Reduce_scatter_Rec_Halving_MV2}, - {262144, -1, &MPIR_Reduce_scatter_Pair_Wise_MV2}, - }, - }, - { - 128, - 2, - { - {0, 128, &MPIR_Reduce_Scatter_Basic_MV2}, - {128, -1, &MPIR_Reduce_scatter_Rec_Halving_MV2}, - }, - }, - { - 256, - 2, - { - {0, 128, &MPIR_Reduce_Scatter_Basic_MV2}, - {128, -1, &MPIR_Reduce_scatter_Rec_Halving_MV2}, - }, - }, - { - 512, - 2, - { - {0, 256, &MPIR_Reduce_Scatter_Basic_MV2}, - {256, -1, &MPIR_Reduce_scatter_Rec_Halving_MV2}, - }, - }, - - }; - memcpy(mv2_red_scat_thresholds_table, mv2_tmp_red_scat_thresholds_table, - mv2_size_red_scat_tuning_table * sizeof (mv2_red_scat_tuning_table)); -} - int smpi_coll_tuned_reduce_scatter_mvapich2(void *sendbuf, void *recvbuf, int *recvcnts, MPI_Datatype datatype, MPI_Op op, MPI_Comm comm) @@ -1797,7 +620,7 @@ int smpi_coll_tuned_reduce_scatter_mvapich2(void *sendbuf, void *recvbuf, int *r int range = 0; int range_threshold = 0; int is_commutative = 0; - int *disps = malloc(comm_size * sizeof (int)); + int *disps = xbt_malloc(comm_size * sizeof (int)); if(mv2_red_scat_thresholds_table==NULL) init_mv2_reduce_scatter_tables_stampede(); @@ -1836,9 +659,24 @@ int smpi_coll_tuned_reduce_scatter_mvapich2(void *sendbuf, void *recvbuf, int *r recvcnts, datatype, op, comm); } else { - mpi_errno = 
MPIR_Reduce_scatter_non_comm_MV2(sendbuf, recvbuf, - recvcnts, datatype, - op, comm); + int is_block_regular = 1; + for (i = 0; i < (comm_size - 1); ++i) { + if (recvcnts[i] != recvcnts[i+1]) { + is_block_regular = 0; + break; + } + } + int pof2 = 1; + while (pof2 < comm_size) pof2 <<= 1; + if (pof2 == comm_size && is_block_regular) { + /* noncommutative, pof2 size, and block regular */ + mpi_errno = MPIR_Reduce_scatter_non_comm_MV2(sendbuf, recvbuf, + recvcnts, datatype, + op, comm); + } + mpi_errno = smpi_coll_tuned_reduce_scatter_mpich_rdb(sendbuf, recvbuf, + recvcnts, datatype, + op, comm); } return mpi_errno; @@ -1847,317 +685,6 @@ int smpi_coll_tuned_reduce_scatter_mvapich2(void *sendbuf, void *recvbuf, int *r -static void init_mv2_scatter_tables_stampede(){ -{ - int agg_table_sum = 0; - int i; - mv2_scatter_tuning_table **table_ptrs = NULL; - mv2_scatter_num_ppn_conf = 3; - mv2_scatter_thresholds_table - = malloc(sizeof(mv2_scatter_tuning_table *) - * mv2_scatter_num_ppn_conf); - table_ptrs = malloc(sizeof(mv2_scatter_tuning_table *) - * mv2_scatter_num_ppn_conf); - mv2_size_scatter_tuning_table = malloc(sizeof(int) * - mv2_scatter_num_ppn_conf); - mv2_scatter_table_ppn_conf - = malloc(mv2_scatter_num_ppn_conf * sizeof(int)); - mv2_scatter_table_ppn_conf[0] = 1; - mv2_size_scatter_tuning_table[0] = 6; - mv2_scatter_tuning_table mv2_tmp_scatter_thresholds_table_1ppn[] = { - {2, - 1, - { - {0, -1, &MPIR_Scatter_MV2_Binomial}, - }, - 1, - { - {0, -1, &MPIR_Scatter_MV2_Binomial}, - }, - }, - - {4, - 1, - { - {0, -1, &MPIR_Scatter_MV2_Direct}, - }, - 1, - { - {0, -1, &MPIR_Scatter_MV2_Direct}, - }, - }, - - {8, - 1, - { - {0, -1, &MPIR_Scatter_MV2_Direct}, - }, - 1, - { - {0, -1, &MPIR_Scatter_MV2_Direct}, - }, - }, - - {16, - 1, - { - {0, -1, &MPIR_Scatter_MV2_Direct}, - }, - 1, - { - {0, -1, &MPIR_Scatter_MV2_Direct}, - }, - }, - - {32, - 1, - { - {0, -1, &MPIR_Scatter_MV2_Direct}, - }, - 1, - { - {0, -1, &MPIR_Scatter_MV2_Direct}, - }, - }, - - {64, - 2, - { - 
{0, 32, &MPIR_Scatter_MV2_Binomial}, - {32, -1, &MPIR_Scatter_MV2_Direct}, - }, - 1, - { - {0, -1, &MPIR_Scatter_MV2_Binomial}, - }, - }, - }; - table_ptrs[0] = mv2_tmp_scatter_thresholds_table_1ppn; - mv2_scatter_table_ppn_conf[1] = 2; - mv2_size_scatter_tuning_table[1] = 6; - mv2_scatter_tuning_table mv2_tmp_scatter_thresholds_table_2ppn[] = { - {4, - 2, - { - {0, 4096, &MPIR_Scatter_MV2_Binomial}, - {4096, -1, &MPIR_Scatter_MV2_Direct}, - }, - 1, - { - {0, -1, &MPIR_Scatter_MV2_Direct}, - }, - }, - - {8, - 2, - { - {0, 512, &MPIR_Scatter_MV2_two_level_Direct}, - {512, -1, &MPIR_Scatter_MV2_Direct}, - }, - 1, - { - {0, -1, &MPIR_Scatter_MV2_Binomial}, - }, - }, - - {16, - 2, - { - {0, 2048, &MPIR_Scatter_MV2_two_level_Direct}, - {2048, -1, &MPIR_Scatter_MV2_Direct}, - }, - 1, - { - {0, -1, &MPIR_Scatter_MV2_Binomial}, - }, - }, - - {32, - 2, - { - {0, 2048, &MPIR_Scatter_MV2_two_level_Direct}, - {2048, -1, &MPIR_Scatter_MV2_Direct}, - }, - 1, - { - {0, -1, &MPIR_Scatter_MV2_Binomial}, - }, - }, - - {64, - 2, - { - {0, 8192, &MPIR_Scatter_MV2_two_level_Direct}, - {8192, -1, &MPIR_Scatter_MV2_Direct}, - }, - 1, - { - {0, -1, &MPIR_Scatter_MV2_Binomial}, - }, - }, - - {128, - 4, - { - {0, 16, &MPIR_Scatter_MV2_Binomial}, - {16, 128, &MPIR_Scatter_MV2_two_level_Binomial}, - {128, 16384, &MPIR_Scatter_MV2_two_level_Direct}, - {16384, -1, &MPIR_Scatter_MV2_Direct}, - }, - 1, - { - {0, 128, &MPIR_Scatter_MV2_Direct}, - {128, -1, &MPIR_Scatter_MV2_Binomial}, - }, - }, - }; - table_ptrs[1] = mv2_tmp_scatter_thresholds_table_2ppn; - mv2_scatter_table_ppn_conf[2] = 16; - mv2_size_scatter_tuning_table[2] = 8; - mv2_scatter_tuning_table mv2_tmp_scatter_thresholds_table_16ppn[] = { - { - 16, - 2, - { - {0, 256, &MPIR_Scatter_MV2_Binomial}, - {256, -1, &MPIR_Scatter_MV2_Direct}, - }, - 1, - { - { 0, -1, &MPIR_Scatter_MV2_Direct}, - }, - }, - - { - 32, - 2, - { - {0, 512, &MPIR_Scatter_MV2_Binomial}, - {512, -1, &MPIR_Scatter_MV2_Direct}, - }, - 1, - { - { 0, -1, 
&MPIR_Scatter_MV2_Direct}, - }, - }, - - { - 64, - 2, - { - {0, 1024, &MPIR_Scatter_MV2_two_level_Direct}, - {1024, -1, &MPIR_Scatter_MV2_Direct}, - }, - 1, - { - { 0, -1, &MPIR_Scatter_MV2_Direct}, - }, - }, - - { - 128, - 4, - { - {0, 16, &MPIR_Scatter_mcst_wrap_MV2}, - {0, 16, &MPIR_Scatter_MV2_two_level_Direct}, - {16, 2048, &MPIR_Scatter_MV2_two_level_Direct}, - {2048, -1, &MPIR_Scatter_MV2_Direct}, - }, - 1, - { - { 0, -1, &MPIR_Scatter_MV2_Direct}, - }, - }, - - { - 256, - 4, - { - {0, 16, &MPIR_Scatter_mcst_wrap_MV2}, - {0, 16, &MPIR_Scatter_MV2_two_level_Direct}, - {16, 2048, &MPIR_Scatter_MV2_two_level_Direct}, - {2048, -1, &MPIR_Scatter_MV2_Direct}, - }, - 1, - { - { 0, -1, &MPIR_Scatter_MV2_Direct}, - }, - }, - - { - 512, - 4, - { - {0, 16, &MPIR_Scatter_mcst_wrap_MV2}, - {16, 16, &MPIR_Scatter_MV2_two_level_Direct}, - {16, 4096, &MPIR_Scatter_MV2_two_level_Direct}, - {4096, -1, &MPIR_Scatter_MV2_Direct}, - }, - 1, - { - { 0, -1, &MPIR_Scatter_MV2_Binomial}, - }, - }, - { - 1024, - 5, - { - {0, 16, &MPIR_Scatter_mcst_wrap_MV2}, - {0, 16, &MPIR_Scatter_MV2_Binomial}, - {16, 32, &MPIR_Scatter_MV2_Binomial}, - {32, 4096, &MPIR_Scatter_MV2_two_level_Direct}, - {4096, -1, &MPIR_Scatter_MV2_Direct}, - }, - 1, - { - { 0, -1, &MPIR_Scatter_MV2_Binomial}, - }, - }, - { - 2048, - 7, - { - {0, 16, &MPIR_Scatter_mcst_wrap_MV2}, - {0, 16, &MPIR_Scatter_MV2_two_level_Binomial}, - {16, 128, &MPIR_Scatter_MV2_two_level_Binomial}, - {128, 1024, &MPIR_Scatter_MV2_two_level_Direct}, - {1024, 16384, &MPIR_Scatter_MV2_two_level_Direct}, - {16384, 65536, &MPIR_Scatter_MV2_Direct}, - {65536, -1, &MPIR_Scatter_MV2_two_level_Direct}, - }, - 6, - { - {0, 16, &MPIR_Scatter_MV2_Binomial}, - {16, 128, &MPIR_Scatter_MV2_Binomial}, - {128, 1024, &MPIR_Scatter_MV2_Binomial}, - {1024, 16384, &MPIR_Scatter_MV2_Direct}, - {16384, 65536, &MPIR_Scatter_MV2_Direct}, - {65536, -1, &MPIR_Scatter_MV2_Direct}, - }, - }, - }; - table_ptrs[2] = mv2_tmp_scatter_thresholds_table_16ppn; - 
agg_table_sum = 0; - for (i = 0; i < mv2_scatter_num_ppn_conf; i++) { - agg_table_sum += mv2_size_scatter_tuning_table[i]; - } - mv2_scatter_thresholds_table[0] = - malloc(agg_table_sum * sizeof (mv2_scatter_tuning_table)); - memcpy(mv2_scatter_thresholds_table[0], table_ptrs[0], - (sizeof(mv2_scatter_tuning_table) - * mv2_size_scatter_tuning_table[0])); - for (i = 1; i < mv2_scatter_num_ppn_conf; i++) { - mv2_scatter_thresholds_table[i] = - mv2_scatter_thresholds_table[i - 1] - + mv2_size_scatter_tuning_table[i - 1]; - memcpy(mv2_scatter_thresholds_table[i], table_ptrs[i], - (sizeof(mv2_scatter_tuning_table) - * mv2_size_scatter_tuning_table[i])); - } - free(table_ptrs); - } -} - int smpi_coll_tuned_scatter_mvapich2(void *sendbuf, int sendcnt, MPI_Datatype sendtype, diff --git a/src/smpi/colls/smpi_mvapich2_selector_stampede.h b/src/smpi/colls/smpi_mvapich2_selector_stampede.h index abfc786c11..58104ec8a9 100644 --- a/src/smpi/colls/smpi_mvapich2_selector_stampede.h +++ b/src/smpi/colls/smpi_mvapich2_selector_stampede.h @@ -1,14 +1,14 @@ -/* selector for collective algorithms based on mvapich decision logic */ +/* selector for collective algorithms based on mvapich decision logic, with calibration from Stampede cluster at TACC*/ /* Copyright (c) 2009-2010, 2013-2014. The SimGrid Team. * All rights reserved. 
*/ /* This is the tuning used by MVAPICH for Stampede platform based on (MV2_ARCH_INTEL_XEON_E5_2680_16, MV2_HCA_MLX_CX_FDR) */ -/* Indicates number of processes per node */ -extern int *mv2_alltoall_table_ppn_conf; -/* Indicates total number of configurations */ -extern int mv2_alltoall_num_ppn_conf; + + +/************ Alltoall variables and initializers */ + #define MV2_MAX_NB_THRESHOLDS 32 typedef struct { int min; @@ -25,24 +25,16 @@ typedef struct { mv2_alltoall_tuning_element in_place_algo_table[MV2_MAX_NB_THRESHOLDS]; } mv2_alltoall_tuning_table; -extern int *mv2_size_alltoall_tuning_table; -extern mv2_alltoall_tuning_table **mv2_alltoall_thresholds_table; -extern int mv2_use_old_alltoall; +int (*MV2_Alltoall_function) (void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype, MPI_Comm comm_ptr)=NULL; - -int (*MV2_Alltoall_function) (void *sendbuf, int sendcount, MPI_Datatype sendtype, - void *recvbuf, int recvcount, MPI_Datatype recvtype, - MPI_Comm comm_ptr)=NULL; - - +/* Indicates number of processes per node */ int *mv2_alltoall_table_ppn_conf = NULL; +/* Indicates total number of configurations */ int mv2_alltoall_num_ppn_conf = 1; int *mv2_size_alltoall_tuning_table = NULL; mv2_alltoall_tuning_table **mv2_alltoall_thresholds_table = NULL; - - #define MPIR_Alltoall_bruck_MV2 smpi_coll_tuned_alltoall_bruck #define MPIR_Alltoall_RD_MV2 smpi_coll_tuned_alltoall_rdb #define MPIR_Alltoall_Scatter_dest_MV2 smpi_coll_tuned_alltoall_ring @@ -50,11 +42,251 @@ mv2_alltoall_tuning_table **mv2_alltoall_thresholds_table = NULL; #define MPIR_Alltoall_inplace_MV2 smpi_coll_tuned_alltoall_ring +static void init_mv2_alltoall_tables_stampede(){ +int i; + int agg_table_sum = 0; + mv2_alltoall_tuning_table **table_ptrs = NULL; + mv2_alltoall_num_ppn_conf = 3; + mv2_alltoall_thresholds_table = xbt_malloc(sizeof(mv2_alltoall_tuning_table *) + * mv2_alltoall_num_ppn_conf); + table_ptrs = 
xbt_malloc(sizeof(mv2_alltoall_tuning_table *) + * mv2_alltoall_num_ppn_conf); + mv2_size_alltoall_tuning_table = xbt_malloc(sizeof(int) * + mv2_alltoall_num_ppn_conf); + mv2_alltoall_table_ppn_conf = xbt_malloc(mv2_alltoall_num_ppn_conf * sizeof(int)); + mv2_alltoall_table_ppn_conf[0] = 1; + mv2_size_alltoall_tuning_table[0] = 6; + mv2_alltoall_tuning_table mv2_tmp_alltoall_thresholds_table_1ppn[] = { + {2, + 1, + {{0, -1, &MPIR_Alltoall_pairwise_MV2}, + }, + + {{0, -1, &MPIR_Alltoall_inplace_MV2}, + }, + }, + + {4, + 2, + {{0, 262144, &MPIR_Alltoall_Scatter_dest_MV2}, + {262144, -1, &MPIR_Alltoall_pairwise_MV2}, + }, + + {{0, -1, &MPIR_Alltoall_inplace_MV2}, + }, + }, + + {8, + 2, + {{0, 8, &MPIR_Alltoall_RD_MV2}, + {8, -1, &MPIR_Alltoall_Scatter_dest_MV2}, + }, + + {{0, -1, &MPIR_Alltoall_inplace_MV2}, + }, + }, + + {16, + 3, + {{0, 64, &MPIR_Alltoall_RD_MV2}, + {64, 512, &MPIR_Alltoall_bruck_MV2}, + {512, -1, &MPIR_Alltoall_Scatter_dest_MV2}, + }, + + {{0,-1, &MPIR_Alltoall_inplace_MV2}, + }, + }, + + {32, + 3, + {{0, 32, &MPIR_Alltoall_RD_MV2}, + {32, 2048, &MPIR_Alltoall_bruck_MV2}, + {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2}, + }, + + {{0, -1, &MPIR_Alltoall_inplace_MV2}, + }, + }, + + {64, + 3, + {{0, 8, &MPIR_Alltoall_RD_MV2}, + {8, 1024, &MPIR_Alltoall_bruck_MV2}, + {1024, -1, &MPIR_Alltoall_Scatter_dest_MV2}, + }, + + {{0, -1, &MPIR_Alltoall_inplace_MV2}, + }, + }, + }; + table_ptrs[0] = mv2_tmp_alltoall_thresholds_table_1ppn; + mv2_alltoall_table_ppn_conf[1] = 2; + mv2_size_alltoall_tuning_table[1] = 6; + mv2_alltoall_tuning_table mv2_tmp_alltoall_thresholds_table_2ppn[] = { + {4, + 2, + {{0, 32, &MPIR_Alltoall_RD_MV2}, + {32, -1, &MPIR_Alltoall_Scatter_dest_MV2}, + }, + + {{0, -1, &MPIR_Alltoall_inplace_MV2}, + }, + }, + + {8, + 2, + {{0, 64, &MPIR_Alltoall_RD_MV2}, + {64, -1, &MPIR_Alltoall_Scatter_dest_MV2}, + }, + + {{0, -1, &MPIR_Alltoall_inplace_MV2}, + }, + }, + + {16, + 3, + {{0, 64, &MPIR_Alltoall_RD_MV2}, + {64, 2048, 
&MPIR_Alltoall_bruck_MV2}, + {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2}, + }, + + {{0,-1, &MPIR_Alltoall_inplace_MV2}, + }, + }, + + {32, + 3, + {{0, 16, &MPIR_Alltoall_RD_MV2}, + {16, 2048, &MPIR_Alltoall_bruck_MV2}, + {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2}, + }, + + {{0, -1, &MPIR_Alltoall_inplace_MV2}, + }, + }, + + {64, + 3, + {{0, 8, &MPIR_Alltoall_RD_MV2}, + {8, 1024, &MPIR_Alltoall_bruck_MV2}, + {1024, -1, &MPIR_Alltoall_Scatter_dest_MV2}, + }, + + {{0, -1, &MPIR_Alltoall_inplace_MV2}, + }, + }, + + {128, + 3, + {{0, 4, &MPIR_Alltoall_RD_MV2}, + {4, 2048, &MPIR_Alltoall_bruck_MV2}, + {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2}, + }, + + {{0, -1, &MPIR_Alltoall_inplace_MV2}, + }, + }, + }; + table_ptrs[1] = mv2_tmp_alltoall_thresholds_table_2ppn; + mv2_alltoall_table_ppn_conf[2] = 16; + mv2_size_alltoall_tuning_table[2] = 7; + mv2_alltoall_tuning_table mv2_tmp_alltoall_thresholds_table_16ppn[] = { + {16, + 2, + {{0, 2048, &MPIR_Alltoall_bruck_MV2}, + {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2}, + }, + + {{32768, -1, &MPIR_Alltoall_inplace_MV2}, + }, + }, + + {32, + 2, + {{0, 2048, &MPIR_Alltoall_bruck_MV2}, + {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2}, + }, + + {{16384, -1, &MPIR_Alltoall_inplace_MV2}, + }, + }, + + {64, + 3, + {{0, 2048, &MPIR_Alltoall_bruck_MV2}, + {2048, 16384, &MPIR_Alltoall_Scatter_dest_MV2}, + {16384, -1, &MPIR_Alltoall_pairwise_MV2}, + }, + + {{32768, 131072, &MPIR_Alltoall_inplace_MV2}, + }, + }, + + {128, + 2, + {{0, 2048, &MPIR_Alltoall_bruck_MV2}, + {2048, -1, &MPIR_Alltoall_pairwise_MV2}, + }, + + {{16384,65536, &MPIR_Alltoall_inplace_MV2}, + }, + }, + + {256, + 2, + {{0, 1024, &MPIR_Alltoall_bruck_MV2}, + {1024, -1, &MPIR_Alltoall_pairwise_MV2}, + }, + + {{16384, 65536, &MPIR_Alltoall_inplace_MV2}, + }, + }, + + {512, + 2, + {{0, 1024, &MPIR_Alltoall_bruck_MV2}, + {1024, -1, &MPIR_Alltoall_pairwise_MV2}, + }, + + {{16384, 65536, &MPIR_Alltoall_inplace_MV2}, + }, + }, + {1024, + 2, + {{0, 1024, &MPIR_Alltoall_bruck_MV2}, + 
{1024, -1, &MPIR_Alltoall_pairwise_MV2}, + }, + + {{16384, 65536, &MPIR_Alltoall_inplace_MV2}, + }, + }, + + }; + table_ptrs[2] = mv2_tmp_alltoall_thresholds_table_16ppn; + agg_table_sum = 0; + for (i = 0; i < mv2_alltoall_num_ppn_conf; i++) { + agg_table_sum += mv2_size_alltoall_tuning_table[i]; + } + mv2_alltoall_thresholds_table[0] = + xbt_malloc(agg_table_sum * sizeof (mv2_alltoall_tuning_table)); + memcpy(mv2_alltoall_thresholds_table[0], table_ptrs[0], + (sizeof(mv2_alltoall_tuning_table) + * mv2_size_alltoall_tuning_table[0])); + for (i = 1; i < mv2_alltoall_num_ppn_conf; i++) { + mv2_alltoall_thresholds_table[i] = + mv2_alltoall_thresholds_table[i - 1] + + mv2_size_alltoall_tuning_table[i - 1]; + memcpy(mv2_alltoall_thresholds_table[i], table_ptrs[i], + (sizeof(mv2_alltoall_tuning_table) + * mv2_size_alltoall_tuning_table[i])); + } + xbt_free(table_ptrs); + + +} + -/* Indicates number of processes per node */ -extern int *mv2_allgather_table_ppn_conf; -/* Indicates total number of configurations */ -extern int mv2_allgather_num_ppn_conf; +/************ Allgather variables and initializers */ typedef struct { int min; @@ -74,10 +306,6 @@ typedef struct { mv2_allgather_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS]; } mv2_allgather_tuning_table; -extern int *mv2_size_allgather_tuning_table; -extern mv2_allgather_tuning_table **mv2_allgather_thresholds_table; -extern int mv2_use_old_allgather; - int (*MV2_Allgather_function)(void *sendbuf, int sendcount, MPI_Datatype sendtype, @@ -96,6 +324,225 @@ mv2_allgather_tuning_table **mv2_allgather_thresholds_table = NULL; #define MPIR_Allgather_Ring_MV2 smpi_coll_tuned_allgather_ring +static void init_mv2_allgather_tables_stampede(){ +int i; + int agg_table_sum = 0; +mv2_allgather_tuning_table **table_ptrs = NULL; + mv2_allgather_num_ppn_conf = 3; + mv2_allgather_thresholds_table + = xbt_malloc(sizeof(mv2_allgather_tuning_table *) + * mv2_allgather_num_ppn_conf); + table_ptrs = 
xbt_malloc(sizeof(mv2_allgather_tuning_table *) + * mv2_allgather_num_ppn_conf); + mv2_size_allgather_tuning_table = xbt_malloc(sizeof(int) * + mv2_allgather_num_ppn_conf); + mv2_allgather_table_ppn_conf + = xbt_malloc(mv2_allgather_num_ppn_conf * sizeof(int)); + mv2_allgather_table_ppn_conf[0] = 1; + mv2_size_allgather_tuning_table[0] = 6; + mv2_allgather_tuning_table mv2_tmp_allgather_thresholds_table_1ppn[] = { + { + 2, + {0}, + 1, + { + {0, -1, &MPIR_Allgather_Ring_MV2}, + }, + }, + { + 4, + {0,0}, + 2, + { + {0, 262144, &MPIR_Allgather_RD_MV2}, + {262144, -1, &MPIR_Allgather_Ring_MV2}, + }, + }, + { + 8, + {0,0}, + 2, + { + {0, 131072, &MPIR_Allgather_RD_MV2}, + {131072, -1, &MPIR_Allgather_Ring_MV2}, + }, + }, + { + 16, + {0,0}, + 2, + { + {0, 131072, &MPIR_Allgather_RD_MV2}, + {131072, -1, &MPIR_Allgather_Ring_MV2}, + }, + }, + { + 32, + {0,0}, + 2, + { + {0, 65536, &MPIR_Allgather_RD_MV2}, + {65536, -1, &MPIR_Allgather_Ring_MV2}, + }, + }, + { + 64, + {0,0}, + 2, + { + {0, 32768, &MPIR_Allgather_RD_MV2}, + {32768, -1, &MPIR_Allgather_Ring_MV2}, + }, + }, + }; + table_ptrs[0] = mv2_tmp_allgather_thresholds_table_1ppn; + mv2_allgather_table_ppn_conf[1] = 2; + mv2_size_allgather_tuning_table[1] = 6; + mv2_allgather_tuning_table mv2_tmp_allgather_thresholds_table_2ppn[] = { + { + 4, + {0,0}, + 2, + { + {0, 524288, &MPIR_Allgather_RD_MV2}, + {524288, -1, &MPIR_Allgather_Ring_MV2}, + }, + }, + { + 8, + {0,1,0}, + 2, + { + {0, 32768, &MPIR_Allgather_RD_MV2}, + {32768, 524288, &MPIR_Allgather_Ring_MV2}, + {524288, -1, &MPIR_Allgather_Ring_MV2}, + }, + }, + { + 16, + {0,1,0}, + 2, + { + {0, 16384, &MPIR_Allgather_RD_MV2}, + {16384, 524288, &MPIR_Allgather_Ring_MV2}, + {524288, -1, &MPIR_Allgather_Ring_MV2}, + }, + }, + { + 32, + {1,1,0}, + 2, + { + {0, 65536, &MPIR_Allgather_RD_MV2}, + {65536, 524288, &MPIR_Allgather_Ring_MV2}, + {524288, -1, &MPIR_Allgather_Ring_MV2}, + }, + }, + { + 64, + {1,1,0}, + 2, + { + {0, 32768, &MPIR_Allgather_RD_MV2}, + {32768, 524288, 
&MPIR_Allgather_Ring_MV2}, + {524288, -1, &MPIR_Allgather_Ring_MV2}, + }, + }, + { + 128, + {1,1,0}, + 2, + { + {0, 65536, &MPIR_Allgather_RD_MV2}, + {65536, 524288, &MPIR_Allgather_Ring_MV2}, + {524288, -1, &MPIR_Allgather_Ring_MV2}, + }, + }, + }; + table_ptrs[1] = mv2_tmp_allgather_thresholds_table_2ppn; + mv2_allgather_table_ppn_conf[2] = 16; + mv2_size_allgather_tuning_table[2] = 6; + mv2_allgather_tuning_table mv2_tmp_allgather_thresholds_table_16ppn[] = { + { + 16, + {0,0}, + 2, + { + {0, 1024, &MPIR_Allgather_RD_MV2}, + {1024, -1, &MPIR_Allgather_Ring_MV2}, + }, + }, + { + 32, + {0,0}, + 2, + { + {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2}, + {1024, -1, &MPIR_Allgather_Ring_MV2}, + }, + }, + { + 64, + {0,0}, + 2, + { + {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2}, + {1024, -1, &MPIR_Allgather_Ring_MV2}, + }, + }, + { + 128, + {0,0}, + 2, + { + {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2}, + {1024, -1, &MPIR_Allgather_Ring_MV2}, + }, + }, + { + 256, + {0,0}, + 2, + { + {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2}, + {1024, -1, &MPIR_Allgather_Ring_MV2}, + }, + }, + { + 512, + {0,0}, + 2, + { + {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2}, + {1024, -1, &MPIR_Allgather_Ring_MV2}, + }, + }, + + }; + table_ptrs[2] = mv2_tmp_allgather_thresholds_table_16ppn; + agg_table_sum = 0; + for (i = 0; i < mv2_allgather_num_ppn_conf; i++) { + agg_table_sum += mv2_size_allgather_tuning_table[i]; + } + mv2_allgather_thresholds_table[0] = + xbt_malloc(agg_table_sum * sizeof (mv2_allgather_tuning_table)); + memcpy(mv2_allgather_thresholds_table[0], table_ptrs[0], + (sizeof(mv2_allgather_tuning_table) + * mv2_size_allgather_tuning_table[0])); + for (i = 1; i < mv2_allgather_num_ppn_conf; i++) { + mv2_allgather_thresholds_table[i] = + mv2_allgather_thresholds_table[i - 1] + + mv2_size_allgather_tuning_table[i - 1]; + memcpy(mv2_allgather_thresholds_table[i], table_ptrs[i], + (sizeof(mv2_allgather_tuning_table) + * mv2_size_allgather_tuning_table[i])); + } + 
xbt_free(table_ptrs); +} + + +/************ Gather variables and initializers */ + typedef struct { int min; int max; @@ -113,15 +560,6 @@ typedef struct { mv2_gather_tuning_element intra_node[MV2_MAX_NB_THRESHOLDS]; } mv2_gather_tuning_table; -extern int mv2_size_gather_tuning_table; -extern mv2_gather_tuning_table * mv2_gather_thresholds_table; - -extern int mv2_user_gather_switch_point; -extern int mv2_use_two_level_gather; -extern int mv2_gather_direct_system_size_small; -extern int mv2_gather_direct_system_size_medium; -extern int mv2_use_direct_gather; - int mv2_size_gather_tuning_table=7; mv2_gather_tuning_table * mv2_gather_thresholds_table=NULL; @@ -142,6 +580,55 @@ MV2_Gather_function_ptr MV2_Gather_intra_node_function = NULL; #define MPIR_Gather_intra smpi_coll_tuned_gather_mpich +static void init_mv2_gather_tables_stampede(){ + + mv2_size_gather_tuning_table=7; + mv2_gather_thresholds_table = xbt_malloc(mv2_size_gather_tuning_table* + sizeof (mv2_gather_tuning_table)); + mv2_gather_tuning_table mv2_tmp_gather_thresholds_table[]={ + {16, + 2,{{0, 524288, &MPIR_Gather_MV2_Direct}, + {524288, -1, &MPIR_Gather_intra}}, + 1,{{0, -1, &MPIR_Gather_MV2_Direct}}}, + {32, + 3,{{0, 16384, &MPIR_Gather_MV2_Direct}, + {16384, 131072, &MPIR_Gather_intra}, + {131072, -1, &MPIR_Gather_MV2_two_level_Direct}}, + 1,{{0, -1, &MPIR_Gather_intra}}}, + {64, + 3,{{0, 256, &MPIR_Gather_MV2_two_level_Direct}, + {256, 16384, &MPIR_Gather_MV2_Direct}, + {256, -1, &MPIR_Gather_MV2_two_level_Direct}}, + 1,{{0, -1, &MPIR_Gather_intra}}}, + {128, + 3,{{0, 512, &MPIR_Gather_MV2_two_level_Direct}, + {512, 16384, &MPIR_Gather_MV2_Direct}, + {16384, -1, &MPIR_Gather_MV2_two_level_Direct}}, + 1,{{0, -1, &MPIR_Gather_intra}}}, + {256, + 3,{{0, 512, &MPIR_Gather_MV2_two_level_Direct}, + {512, 16384, &MPIR_Gather_MV2_Direct}, + {16384, -1, &MPIR_Gather_MV2_two_level_Direct}}, + 1,{{0, -1, &MPIR_Gather_intra}}}, + {512, + 3,{{0, 512, &MPIR_Gather_MV2_two_level_Direct}, + {512, 16384, 
&MPIR_Gather_MV2_Direct}, + {8196, -1, &MPIR_Gather_MV2_two_level_Direct}}, + 1,{{0, -1, &MPIR_Gather_intra}}}, + {1024, + 3,{{0, 512, &MPIR_Gather_MV2_two_level_Direct}, + {512, 16384, &MPIR_Gather_MV2_Direct}, + {8196, -1, &MPIR_Gather_MV2_two_level_Direct}}, + 1,{{0, -1, &MPIR_Gather_intra}}}, + }; + + memcpy(mv2_gather_thresholds_table, mv2_tmp_gather_thresholds_table, + mv2_size_gather_tuning_table * sizeof (mv2_gather_tuning_table)); + +} + + +/************ Allgatherv variables and initializers */ typedef struct { int min; @@ -162,9 +649,6 @@ typedef struct { mv2_allgatherv_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS]; } mv2_allgatherv_tuning_table; -extern int mv2_size_allgatherv_tuning_table; -extern mv2_allgatherv_tuning_table *mv2_allgatherv_thresholds_table; - int (*MV2_Allgatherv_function)(void *sendbuf, int sendcount, MPI_Datatype sendtype, @@ -182,6 +666,68 @@ mv2_allgatherv_tuning_table *mv2_allgatherv_thresholds_table = NULL; #define MPIR_Allgatherv_Ring_MV2 smpi_coll_tuned_allgatherv_mpich_ring +static void init_mv2_allgatherv_tables_stampede(){ + mv2_size_allgatherv_tuning_table = 6; + mv2_allgatherv_thresholds_table = xbt_malloc(mv2_size_allgatherv_tuning_table * + sizeof (mv2_allgatherv_tuning_table)); + mv2_allgatherv_tuning_table mv2_tmp_allgatherv_thresholds_table[] = { + { + 16, + 2, + { + {0, 512, &MPIR_Allgatherv_Rec_Doubling_MV2}, + {512, -1, &MPIR_Allgatherv_Ring_MV2}, + }, + }, + { + 32, + 2, + { + {0, 512, &MPIR_Allgatherv_Rec_Doubling_MV2}, + {512, -1, &MPIR_Allgatherv_Ring_MV2}, + }, + }, + { + 64, + 2, + { + {0, 256, &MPIR_Allgatherv_Rec_Doubling_MV2}, + {256, -1, &MPIR_Allgatherv_Ring_MV2}, + }, + }, + { + 128, + 2, + { + {0, 256, &MPIR_Allgatherv_Rec_Doubling_MV2}, + {256, -1, &MPIR_Allgatherv_Ring_MV2}, + }, + }, + { + 256, + 2, + { + {0, 256, &MPIR_Allgatherv_Rec_Doubling_MV2}, + {256, -1, &MPIR_Allgatherv_Ring_MV2}, + }, + }, + { + 512, + 2, + { + {0, 256, &MPIR_Allgatherv_Rec_Doubling_MV2}, + {256, -1, 
&MPIR_Allgatherv_Ring_MV2}, + }, + }, + + }; + memcpy(mv2_allgatherv_thresholds_table, mv2_tmp_allgatherv_thresholds_table, + mv2_size_allgatherv_tuning_table * sizeof (mv2_allgatherv_tuning_table)); +} + + +/************ Allreduce variables and initializers */ + typedef struct { int min; int max; @@ -202,10 +748,6 @@ typedef struct { mv2_allreduce_tuning_element intra_node[MV2_MAX_NB_THRESHOLDS]; } mv2_allreduce_tuning_table; -extern int mv2_size_allreduce_tuning_table; -extern mv2_allreduce_tuning_table *mv2_allreduce_thresholds_table; -extern int mv2_use_old_allreduce; - int (*MV2_Allreduce_function)(void *sendbuf, void *recvbuf, @@ -266,8 +808,150 @@ static int MPIR_Allreduce_reduce_shmem_MV2( void *sendbuf, } #define MPIR_Allreduce_pt2pt_rd_MV2 smpi_coll_tuned_allreduce_rdb -#define MPIR_Allreduce_pt2pt_rs_MV2 smpi_coll_tuned_allreduce_rab1 - +#define MPIR_Allreduce_pt2pt_rs_MV2 smpi_coll_tuned_allreduce_mvapich2_rs + + + +static void init_mv2_allreduce_tables_stampede(){ +mv2_size_allreduce_tuning_table = 8; + mv2_allreduce_thresholds_table = xbt_malloc(mv2_size_allreduce_tuning_table * + sizeof (mv2_allreduce_tuning_table)); + mv2_allreduce_tuning_table mv2_tmp_allreduce_thresholds_table[] = { + { + 16, + 0, + {1, 0}, + 2, + { + {0, 1024, &MPIR_Allreduce_pt2pt_rd_MV2}, + {1024, -1, &MPIR_Allreduce_pt2pt_rs_MV2}, + }, + 2, + { + {0, 1024, &MPIR_Allreduce_reduce_shmem_MV2}, + {1024, -1, &MPIR_Allreduce_reduce_p2p_MV2}, + }, + }, + { + 32, + 0, + {1, 1, 0}, + 3, + { + {0, 1024, &MPIR_Allreduce_pt2pt_rd_MV2}, + {1024, 16384, &MPIR_Allreduce_pt2pt_rd_MV2}, + {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2}, + }, + 2, + { + {0, 1024, &MPIR_Allreduce_reduce_shmem_MV2}, + {1024, 16384, &MPIR_Allreduce_reduce_p2p_MV2}, + }, + }, + { + 64, + 0, + {1, 1, 0}, + 3, + { + {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2}, + {512, 16384, &MPIR_Allreduce_pt2pt_rd_MV2}, + {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2}, + }, + 2, + { + {0, 512, &MPIR_Allreduce_reduce_shmem_MV2}, + {512, 16384, 
&MPIR_Allreduce_reduce_p2p_MV2}, + }, + }, + { + 128, + 0, + {1, 1, 0}, + 3, + { + {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2}, + {512, 16384, &MPIR_Allreduce_pt2pt_rd_MV2}, + {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2}, + }, + 2, + { + {0, 512, &MPIR_Allreduce_reduce_shmem_MV2}, + {512, 16384, &MPIR_Allreduce_reduce_p2p_MV2}, + }, + }, + { + 256, + 0, + {1, 1, 0}, + 3, + { + {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2}, + {512, 16384, &MPIR_Allreduce_pt2pt_rd_MV2}, + {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2}, + }, + 2, + { + {0, 512, &MPIR_Allreduce_reduce_shmem_MV2}, + {512, -1, &MPIR_Allreduce_reduce_p2p_MV2}, + }, + }, + { + 512, + 0, + {1, 1, 0}, + 3, + { + {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2}, + {512, 16384, &MPIR_Allreduce_pt2pt_rd_MV2}, + {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2}, + }, + 2, + { + {0, 512, &MPIR_Allreduce_reduce_shmem_MV2}, + {512, 16384, &MPIR_Allreduce_reduce_p2p_MV2}, + }, + }, + { + 1024, + 0, + {1, 1, 1, 0}, + 4, + { + {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2}, + {512, 8192, &MPIR_Allreduce_pt2pt_rd_MV2}, + {8192, 65536, &MPIR_Allreduce_pt2pt_rs_MV2}, + {65536, -1, &MPIR_Allreduce_pt2pt_rs_MV2}, + }, + 2, + { + {0, 512, &MPIR_Allreduce_reduce_shmem_MV2}, + {512, -1, &MPIR_Allreduce_reduce_p2p_MV2}, + }, + }, + { + 2048, + 0, + {1, 1, 1, 0}, + 4, + { + {0, 64, &MPIR_Allreduce_pt2pt_rd_MV2}, + {64, 512, &MPIR_Allreduce_reduce_p2p_MV2}, + {512, 4096, &MPIR_Allreduce_mcst_reduce_two_level_helper_MV2}, + {4096, 16384, &MPIR_Allreduce_pt2pt_rs_MV2}, + {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2}, + }, + 2, + { + {0, 512, &MPIR_Allreduce_reduce_shmem_MV2}, + {512, -1, &MPIR_Allreduce_reduce_p2p_MV2}, + }, + }, + + }; + memcpy(mv2_allreduce_thresholds_table, mv2_tmp_allreduce_thresholds_table, + mv2_size_allreduce_tuning_table * sizeof (mv2_allreduce_tuning_table)); +} /* @@ -292,16 +976,6 @@ typedef struct { mv2_bcast_tuning_element intra_node[MV2_MAX_NB_THRESHOLDS]; } mv2_bcast_tuning_table; -extern int mv2_use_pipelined_bcast; -extern int 
mv2_pipelined_knomial_factor; -extern int mv2_pipelined_zcpy_knomial_factor; -extern int zcpy_knomial_factor; -extern int bcast_segment_size; - -extern int mv2_size_bcast_tuning_table; -extern mv2_bcast_tuning_table *mv2_bcast_thresholds_table; -extern int mv2_use_old_bcast; - int mv2_size_bcast_tuning_table = 0; mv2_bcast_tuning_table *mv2_bcast_thresholds_table = NULL; @@ -315,6 +989,220 @@ int (*MV2_Bcast_intra_node_function) (void *buffer, int count, MPI_Datatype data */ + +/* +static void init_mv2_bcast_tables_stampede(){ + //Stampede, + mv2_size_bcast_tuning_table=8; + mv2_bcast_thresholds_table = xbt_malloc(mv2_size_bcast_tuning_table * + sizeof (mv2_bcast_tuning_table)); + + mv2_bcast_tuning_table mv2_tmp_bcast_thresholds_table[]={ + { + 16, + 8192, 4, 4, + {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}, + 11, + { + {0, 8, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2}, + {8, 16, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4}, + {16, 1024, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2}, + {1024, 8192, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4}, + {8192, 16384, &MPIR_Bcast_binomial_MV2, -1}, + {16384, 32768, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4}, + {32768, 65536, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2}, + {65536, 131072, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1}, + {131072, 262144, &MPIR_Bcast_scatter_ring_allgather_MV2, -1}, + {262144, 524288, &MPIR_Bcast_scatter_doubling_allgather_MV2, -1}, + {524288, -1, &MPIR_Bcast_scatter_ring_allgather_MV2, -1} + }, + 11, + { + {0, 8, &MPIR_Shmem_Bcast_MV2, 2}, + {8, 16, &MPIR_Shmem_Bcast_MV2, 4}, + {16, 1024, &MPIR_Shmem_Bcast_MV2, 2}, + {1024, 8192, &MPIR_Shmem_Bcast_MV2, 4}, + {8192, 16384, &MPIR_Shmem_Bcast_MV2, -1}, + {16384, 32768, &MPIR_Shmem_Bcast_MV2, 4}, + {32768, 65536, &MPIR_Shmem_Bcast_MV2, 2}, + {65536, 131072, &MPIR_Shmem_Bcast_MV2, -1}, + {131072, 262144, &MPIR_Shmem_Bcast_MV2, -1}, + {262144, 524288, &MPIR_Shmem_Bcast_MV2, -1}, + {524288, -1, &MPIR_Shmem_Bcast_MV2, -1} + } + }, + { + 32, + 8192, 4, 4, + {1, 1, 1, 1, 1, 1, 1, 1}, + 8, + { + {0, 128, 
&MPIR_Pipelined_Bcast_Zcpy_MV2, 2}, + {128, 256, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4}, + {256, 32768, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2}, + {32768, 65536, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4}, + {65536, 131072, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2}, + {131072, 262144, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8}, + {262144, 524288, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2}, + {524288, -1, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8} + }, + 8, + { + {0, 128, &MPIR_Shmem_Bcast_MV2, 2}, + {128, 256, &MPIR_Shmem_Bcast_MV2, 4}, + {256, 32768, &MPIR_Shmem_Bcast_MV2, 2}, + {32768, 65536, &MPIR_Shmem_Bcast_MV2, 4}, + {65536, 131072, &MPIR_Shmem_Bcast_MV2, 2}, + {131072, 262144, &MPIR_Shmem_Bcast_MV2, 8}, + {262144, 524288, &MPIR_Shmem_Bcast_MV2, 2}, + {524288, -1, &MPIR_Shmem_Bcast_MV2, 8} + } + }, + { + 64, + 8192, 4, 4, + {1, 1, 1, 1, 1, 1, 1, 1, 1}, + 9, + { + {0, 2, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4}, + {2, 4, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8}, + {4, 16, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4}, + {16, 32, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8}, + {32, 128, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4}, + {128, 256, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8}, + {256, 4096, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4}, + {4096, 32768, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8}, + {32768, -1, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2} + }, + 9, + { + {0, 2, &MPIR_Shmem_Bcast_MV2, 4}, + {2, 4, &MPIR_Shmem_Bcast_MV2, 8}, + {4, 16, &MPIR_Shmem_Bcast_MV2, 4}, + {16, 32, &MPIR_Shmem_Bcast_MV2, 8}, + {32, 128, &MPIR_Shmem_Bcast_MV2, 4}, + {128, 256, &MPIR_Shmem_Bcast_MV2, 8}, + {256, 4096, &MPIR_Shmem_Bcast_MV2, 4}, + {4096, 32768, &MPIR_Shmem_Bcast_MV2, 8}, + {32768, -1, &MPIR_Shmem_Bcast_MV2, 2} + } + }, + { + 128, + 8192, 4, 4, + {1, 1, 1, 0}, + 4, + { + {0, 8192, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8}, + {8192, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4}, + {16384, 524288, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2}, + {524288, -1, &MPIR_Bcast_scatter_ring_allgather_MV2, -1} + }, + 4, + { + {0, 8192, &MPIR_Shmem_Bcast_MV2, 8}, + {8192, 16384, &MPIR_Shmem_Bcast_MV2, 4}, + 
{16384, 524288, &MPIR_Shmem_Bcast_MV2, 2}, + {524288, -1, NULL, -1} + } + }, + { + 256, + 8192, 4, 4, + {1, 1, 1, 1, 1}, + 5, + { + {0, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4}, + {16384, 131072, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2}, + {131072, 262144, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1}, + {262144, 524288, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2}, + {524288, -1, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1} + }, + 5, + { + {0, 16384, &MPIR_Shmem_Bcast_MV2, 4}, + {16384, 131072, &MPIR_Shmem_Bcast_MV2, 2}, + {131072, 262144, &MPIR_Shmem_Bcast_MV2, -1}, + {262144, 524288, &MPIR_Shmem_Bcast_MV2, 2}, + {524288, -1, &MPIR_Shmem_Bcast_MV2, -1} + } + }, + { + 512, + 8192, 4, 4, + {1, 1, 1, 1, 1}, + 5, + { + {0, 4096, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8}, + {4096, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4}, + {16384, 131072, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2}, + {131072, 262144, &MPIR_Pipelined_Bcast_MV2, -1}, + {262144, -1, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1} + }, + 5, + { + {0, 4096, &MPIR_Shmem_Bcast_MV2, 8}, + {4096, 16384, &MPIR_Shmem_Bcast_MV2, 4}, + {16384, 131072, &MPIR_Shmem_Bcast_MV2, 2}, + {131072, 262144, &MPIR_Shmem_Bcast_MV2, -1}, + {262144, -1, &MPIR_Shmem_Bcast_MV2, -1} + } + }, + { + 1024, + 8192, 4, 4, + {1, 1, 1, 1, 1}, + 5, + { + {0, 8192, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8}, + {8192, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4}, + {16384, 65536, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2}, + {65536, 524288, &MPIR_Pipelined_Bcast_MV2, -1}, + {524288, -1, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1} + }, + 5, + { + {0, 8192, &MPIR_Shmem_Bcast_MV2, 8}, + {8192, 16384, &MPIR_Shmem_Bcast_MV2, 4}, + {16384, 65536, &MPIR_Shmem_Bcast_MV2, 2}, + {65536, 524288, &MPIR_Shmem_Bcast_MV2, -1}, + {524288, -1, &MPIR_Shmem_Bcast_MV2, -1} + } + }, + { + 2048, + 8192, 4, 4, + {1, 1, 1, 1, 1, 1, 1}, + 7, + { + {0, 16, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8}, + {16, 32, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4}, + {32, 4096, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8}, + {4096, 16384, 
&MPIR_Pipelined_Bcast_Zcpy_MV2, 4}, + {16384, 32768, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2}, + {32768, 524288, &MPIR_Pipelined_Bcast_MV2, -1}, + {524288, -1, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1} + }, + 7, + { + {0, 16, &MPIR_Shmem_Bcast_MV2, 8}, + {16, 32, &MPIR_Shmem_Bcast_MV2, 4}, + {32, 4096, &MPIR_Shmem_Bcast_MV2, 8}, + {4096, 16384, &MPIR_Shmem_Bcast_MV2, 4}, + {16384, 32768, &MPIR_Shmem_Bcast_MV2, 2}, + {32768, 524288, &MPIR_Shmem_Bcast_MV2, -1}, + {524288, -1, &MPIR_Shmem_Bcast_MV2, -1} + } + } + }; + + memcpy(mv2_bcast_thresholds_table, mv2_tmp_bcast_thresholds_table, + mv2_size_bcast_tuning_table * sizeof (mv2_bcast_tuning_table)); +}*/ + + +/************ Reduce variables and initializers */ + typedef struct { int min; int max; @@ -338,10 +1226,6 @@ typedef struct { mv2_reduce_tuning_element intra_node[MV2_MAX_NB_THRESHOLDS]; } mv2_reduce_tuning_table; -extern int mv2_size_reduce_tuning_table; -extern mv2_reduce_tuning_table *mv2_reduce_thresholds_table; -extern int mv2_use_old_reduce; - int mv2_size_reduce_tuning_table = 0; mv2_reduce_tuning_table *mv2_reduce_thresholds_table = NULL; @@ -366,12 +1250,209 @@ int (*MV2_Reduce_intra_function)( void *sendbuf, MPI_Comm comm_ptr)=NULL; -#define MPIR_Reduce_inter_knomial_wrapper_MV2 smpi_coll_tuned_reduce_ompi_binomial -#define MPIR_Reduce_intra_knomial_wrapper_MV2 smpi_coll_tuned_reduce_ompi_binomial -#define MPIR_Reduce_binomial_MV2 smpi_coll_tuned_reduce_ompi_binomial +#define MPIR_Reduce_inter_knomial_wrapper_MV2 smpi_coll_tuned_reduce_binomial +#define MPIR_Reduce_intra_knomial_wrapper_MV2 smpi_coll_tuned_reduce_binomial +#define MPIR_Reduce_binomial_MV2 smpi_coll_tuned_reduce_binomial #define MPIR_Reduce_redscat_gather_MV2 smpi_coll_tuned_reduce_scatter_gather #define MPIR_Reduce_shmem_MV2 smpi_coll_tuned_reduce_ompi_basic_linear + + +static void init_mv2_reduce_tables_stampede(){ + /*Stampede*/ + mv2_size_reduce_tuning_table = 8; + mv2_reduce_thresholds_table = 
xbt_malloc(mv2_size_reduce_tuning_table * + sizeof (mv2_reduce_tuning_table)); + mv2_reduce_tuning_table mv2_tmp_reduce_thresholds_table[] = { + { + 16, + 4, + 4, + {1, 0, 0}, + 3, + { + {0, 262144, &MPIR_Reduce_inter_knomial_wrapper_MV2}, + {262144, 1048576, &MPIR_Reduce_binomial_MV2}, + {1048576, -1, &MPIR_Reduce_redscat_gather_MV2}, + }, + 2, + { + {0, 65536, &MPIR_Reduce_shmem_MV2}, + {65536,-1, &MPIR_Reduce_binomial_MV2}, + }, + }, + { + 32, + 4, + 4, + {1, 1, 1, 1, 0, 0, 0}, + 7, + { + {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2}, + {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2}, + {16384, 32768, &MPIR_Reduce_inter_knomial_wrapper_MV2}, + {32768, 65536, &MPIR_Reduce_binomial_MV2}, + {65536, 262144, &MPIR_Reduce_inter_knomial_wrapper_MV2}, + {262144, 1048576, &MPIR_Reduce_binomial_MV2}, + {1048576, -1, &MPIR_Reduce_redscat_gather_MV2}, + }, + 6, + { + {0, 8192, &MPIR_Reduce_shmem_MV2}, + {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2}, + {16384, 32768, &MPIR_Reduce_shmem_MV2}, + {32768, 65536, &MPIR_Reduce_shmem_MV2}, + {65536, 262144, &MPIR_Reduce_shmem_MV2}, + {262144,-1, &MPIR_Reduce_binomial_MV2}, + }, + }, + { + 64, + 4, + 4, + {1, 1, 1, 1, 0}, + 5, + { + {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2}, + {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2}, + {16384, 65536, &MPIR_Reduce_binomial_MV2}, + {65536, 262144, &MPIR_Reduce_inter_knomial_wrapper_MV2}, + {262144, -1, &MPIR_Reduce_redscat_gather_MV2}, + }, + 5, + { + {0, 8192, &MPIR_Reduce_shmem_MV2}, + {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2}, + {16384, 65536, &MPIR_Reduce_shmem_MV2}, + {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2}, + {262144, -1, &MPIR_Reduce_binomial_MV2}, + }, + }, + { + 128, + 4, + 4, + {1, 0, 1, 0, 1, 0}, + 6, + { + {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2}, + {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2}, + {16384, 65536, &MPIR_Reduce_binomial_MV2}, + {65536, 262144, &MPIR_Reduce_inter_knomial_wrapper_MV2}, + 
{262144, 1048576, &MPIR_Reduce_binomial_MV2}, + {1048576, -1, &MPIR_Reduce_redscat_gather_MV2}, + }, + 5, + { + {0, 8192, &MPIR_Reduce_shmem_MV2}, + {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2}, + {16384, 65536, &MPIR_Reduce_shmem_MV2}, + {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2}, + {262144, -1, &MPIR_Reduce_binomial_MV2}, + }, + }, + { + 256, + 4, + 4, + {1, 1, 1, 0, 1, 1, 0}, + 7, + { + {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2}, + {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2}, + {16384, 32768, &MPIR_Reduce_binomial_MV2}, + {32768, 65536, &MPIR_Reduce_binomial_MV2}, + {65536, 262144, &MPIR_Reduce_binomial_MV2}, + {262144, 1048576, &MPIR_Reduce_binomial_MV2}, + {1048576, -1, &MPIR_Reduce_redscat_gather_MV2}, + }, + 6, + { + {0, 8192, &MPIR_Reduce_shmem_MV2}, + {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2}, + {16384, 32768, &MPIR_Reduce_shmem_MV2}, + {32768, 65536, &MPIR_Reduce_shmem_MV2}, + {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2}, + {262144, -1, &MPIR_Reduce_binomial_MV2}, + }, + }, + { + 512, + 4, + 4, + {1, 0, 1, 1, 1, 0}, + 6, + { + {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2}, + {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2}, + {16384, 65536, &MPIR_Reduce_binomial_MV2}, + {65536, 262144, &MPIR_Reduce_binomial_MV2}, + {262144, 1048576, &MPIR_Reduce_binomial_MV2}, + {1048576, -1, &MPIR_Reduce_redscat_gather_MV2}, + }, + 5, + { + {0, 8192, &MPIR_Reduce_shmem_MV2}, + {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2}, + {16384, 65536, &MPIR_Reduce_shmem_MV2}, + {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2}, + {262144, -1, &MPIR_Reduce_binomial_MV2}, + }, + }, + { + 1024, + 4, + 4, + {1, 0, 1, 1, 1}, + 5, + { + {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2}, + {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2}, + {16384, 65536, &MPIR_Reduce_binomial_MV2}, + {65536, 262144, &MPIR_Reduce_binomial_MV2}, + {262144, -1, &MPIR_Reduce_binomial_MV2}, + }, + 5, + { + {0, 8192, 
&MPIR_Reduce_shmem_MV2}, + {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2}, + {16384, 65536, &MPIR_Reduce_shmem_MV2}, + {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2}, + {262144, -1, &MPIR_Reduce_binomial_MV2}, + }, + }, + { + 2048, + 4, + 4, + {1, 0, 1, 1, 1,1}, + 6, + { + {0, 2048, &MPIR_Reduce_inter_knomial_wrapper_MV2}, + {2048, 4096, &MPIR_Reduce_inter_knomial_wrapper_MV2}, + {4096, 16384, &MPIR_Reduce_binomial_MV2}, + {16384, 65536, &MPIR_Reduce_binomial_MV2}, + {65536, 131072, &MPIR_Reduce_binomial_MV2}, + {131072, -1, &MPIR_Reduce_binomial_MV2}, + }, + 6, + { + {0, 2048, &MPIR_Reduce_shmem_MV2}, + {2048, 4096, &MPIR_Reduce_shmem_MV2}, + {4096, 16384, &MPIR_Reduce_shmem_MV2}, + {16384, 65536, &MPIR_Reduce_intra_knomial_wrapper_MV2}, + {65536, 131072, &MPIR_Reduce_binomial_MV2}, + {131072, -1, &MPIR_Reduce_shmem_MV2}, + }, + }, + + }; + memcpy(mv2_reduce_thresholds_table, mv2_tmp_reduce_thresholds_table, + mv2_size_reduce_tuning_table * sizeof (mv2_reduce_tuning_table)); +} + +/************ Reduce scatter variables and initializers */ + typedef struct { int min; int max; @@ -389,9 +1470,6 @@ typedef struct { mv2_red_scat_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS]; } mv2_red_scat_tuning_table; -extern int mv2_size_red_scat_tuning_table; -extern mv2_red_scat_tuning_table *mv2_red_scat_thresholds_table; - int mv2_size_red_scat_tuning_table = 0; mv2_red_scat_tuning_table *mv2_red_scat_thresholds_table = NULL; @@ -402,18 +1480,89 @@ int (*MV2_Red_scat_function)(void *sendbuf, MPI_Datatype datatype, MPI_Op op, MPI_Comm comm_ptr); + + -#define MPIR_Reduce_Scatter_Basic_MV2 smpi_coll_tuned_reduce_scatter_mpich_noncomm +static int MPIR_Reduce_Scatter_Basic_MV2(void *sendbuf, + void *recvbuf, + int *recvcnts, + MPI_Datatype datatype, + MPI_Op op, + MPI_Comm comm) +{ + smpi_mpi_reduce_scatter(sendbuf,recvbuf,recvcnts,datatype,op,comm); + return MPI_SUCCESS; +} #define MPIR_Reduce_scatter_non_comm_MV2 smpi_coll_tuned_reduce_scatter_mpich_noncomm 
#define MPIR_Reduce_scatter_Rec_Halving_MV2 smpi_coll_tuned_reduce_scatter_ompi_basic_recursivehalving #define MPIR_Reduce_scatter_Pair_Wise_MV2 smpi_coll_tuned_reduce_scatter_mpich_pair -/* Indicates number of processes per node */ -extern int *mv2_scatter_table_ppn_conf; -/* Indicates total number of configurations */ -extern int mv2_scatter_num_ppn_conf; + +static void init_mv2_reduce_scatter_tables_stampede(){ + mv2_size_red_scat_tuning_table = 6; + mv2_red_scat_thresholds_table = xbt_malloc(mv2_size_red_scat_tuning_table * + sizeof (mv2_red_scat_tuning_table)); + mv2_red_scat_tuning_table mv2_tmp_red_scat_thresholds_table[] = { + { + 16, + 3, + { + {0, 64, &MPIR_Reduce_Scatter_Basic_MV2}, + {64, 65536, &MPIR_Reduce_scatter_Rec_Halving_MV2}, + {65536, -1, &MPIR_Reduce_scatter_Pair_Wise_MV2}, + }, + }, + { + 32, + 3, + { + {0, 64, &MPIR_Reduce_Scatter_Basic_MV2}, + {64, 131072, &MPIR_Reduce_scatter_Rec_Halving_MV2}, + {131072, -1, &MPIR_Reduce_scatter_Pair_Wise_MV2}, + }, + }, + { + 64, + 3, + { + {0, 1024, &MPIR_Reduce_Scatter_Basic_MV2}, + {1024, 262144, &MPIR_Reduce_scatter_Rec_Halving_MV2}, + {262144, -1, &MPIR_Reduce_scatter_Pair_Wise_MV2}, + }, + }, + { + 128, + 2, + { + {0, 128, &MPIR_Reduce_Scatter_Basic_MV2}, + {128, -1, &MPIR_Reduce_scatter_Rec_Halving_MV2}, + }, + }, + { + 256, + 2, + { + {0, 128, &MPIR_Reduce_Scatter_Basic_MV2}, + {128, -1, &MPIR_Reduce_scatter_Rec_Halving_MV2}, + }, + }, + { + 512, + 2, + { + {0, 256, &MPIR_Reduce_Scatter_Basic_MV2}, + {256, -1, &MPIR_Reduce_scatter_Rec_Halving_MV2}, + }, + }, + + }; + memcpy(mv2_red_scat_thresholds_table, mv2_tmp_red_scat_thresholds_table, + mv2_size_red_scat_tuning_table * sizeof (mv2_red_scat_tuning_table)); +} + +/************ Scatter variables and initializers */ typedef struct { int min; @@ -435,9 +1584,6 @@ typedef struct { mv2_scatter_tuning_element intra_node[MV2_MAX_NB_THRESHOLDS]; } mv2_scatter_tuning_table; -extern int *mv2_size_scatter_tuning_table; -extern mv2_scatter_tuning_table 
**mv2_scatter_thresholds_table; - int *mv2_scatter_table_ppn_conf = NULL; int mv2_scatter_num_ppn_conf = 1; @@ -475,3 +1621,317 @@ int MPIR_Scatter_mcst_wrap_MV2(void *sendbuf, #define MPIR_Scatter_MV2_two_level_Binomial smpi_coll_tuned_scatter_ompi_binomial #define MPIR_Scatter_MV2_two_level_Direct smpi_coll_tuned_scatter_ompi_basic_linear + + + +static void init_mv2_scatter_tables_stampede(){ +{ + int agg_table_sum = 0; + int i; + mv2_scatter_tuning_table **table_ptrs = NULL; + mv2_scatter_num_ppn_conf = 3; + mv2_scatter_thresholds_table + = xbt_malloc(sizeof(mv2_scatter_tuning_table *) + * mv2_scatter_num_ppn_conf); + table_ptrs = xbt_malloc(sizeof(mv2_scatter_tuning_table *) + * mv2_scatter_num_ppn_conf); + mv2_size_scatter_tuning_table = xbt_malloc(sizeof(int) * + mv2_scatter_num_ppn_conf); + mv2_scatter_table_ppn_conf + = xbt_malloc(mv2_scatter_num_ppn_conf * sizeof(int)); + mv2_scatter_table_ppn_conf[0] = 1; + mv2_size_scatter_tuning_table[0] = 6; + mv2_scatter_tuning_table mv2_tmp_scatter_thresholds_table_1ppn[] = { + {2, + 1, + { + {0, -1, &MPIR_Scatter_MV2_Binomial}, + }, + 1, + { + {0, -1, &MPIR_Scatter_MV2_Binomial}, + }, + }, + + {4, + 1, + { + {0, -1, &MPIR_Scatter_MV2_Direct}, + }, + 1, + { + {0, -1, &MPIR_Scatter_MV2_Direct}, + }, + }, + + {8, + 1, + { + {0, -1, &MPIR_Scatter_MV2_Direct}, + }, + 1, + { + {0, -1, &MPIR_Scatter_MV2_Direct}, + }, + }, + + {16, + 1, + { + {0, -1, &MPIR_Scatter_MV2_Direct}, + }, + 1, + { + {0, -1, &MPIR_Scatter_MV2_Direct}, + }, + }, + + {32, + 1, + { + {0, -1, &MPIR_Scatter_MV2_Direct}, + }, + 1, + { + {0, -1, &MPIR_Scatter_MV2_Direct}, + }, + }, + + {64, + 2, + { + {0, 32, &MPIR_Scatter_MV2_Binomial}, + {32, -1, &MPIR_Scatter_MV2_Direct}, + }, + 1, + { + {0, -1, &MPIR_Scatter_MV2_Binomial}, + }, + }, + }; + table_ptrs[0] = mv2_tmp_scatter_thresholds_table_1ppn; + mv2_scatter_table_ppn_conf[1] = 2; + mv2_size_scatter_tuning_table[1] = 6; + mv2_scatter_tuning_table mv2_tmp_scatter_thresholds_table_2ppn[] = { + {4, + 2, + 
{ + {0, 4096, &MPIR_Scatter_MV2_Binomial}, + {4096, -1, &MPIR_Scatter_MV2_Direct}, + }, + 1, + { + {0, -1, &MPIR_Scatter_MV2_Direct}, + }, + }, + + {8, + 2, + { + {0, 512, &MPIR_Scatter_MV2_two_level_Direct}, + {512, -1, &MPIR_Scatter_MV2_Direct}, + }, + 1, + { + {0, -1, &MPIR_Scatter_MV2_Binomial}, + }, + }, + + {16, + 2, + { + {0, 2048, &MPIR_Scatter_MV2_two_level_Direct}, + {2048, -1, &MPIR_Scatter_MV2_Direct}, + }, + 1, + { + {0, -1, &MPIR_Scatter_MV2_Binomial}, + }, + }, + + {32, + 2, + { + {0, 2048, &MPIR_Scatter_MV2_two_level_Direct}, + {2048, -1, &MPIR_Scatter_MV2_Direct}, + }, + 1, + { + {0, -1, &MPIR_Scatter_MV2_Binomial}, + }, + }, + + {64, + 2, + { + {0, 8192, &MPIR_Scatter_MV2_two_level_Direct}, + {8192, -1, &MPIR_Scatter_MV2_Direct}, + }, + 1, + { + {0, -1, &MPIR_Scatter_MV2_Binomial}, + }, + }, + + {128, + 4, + { + {0, 16, &MPIR_Scatter_MV2_Binomial}, + {16, 128, &MPIR_Scatter_MV2_two_level_Binomial}, + {128, 16384, &MPIR_Scatter_MV2_two_level_Direct}, + {16384, -1, &MPIR_Scatter_MV2_Direct}, + }, + 1, + { + {0, 128, &MPIR_Scatter_MV2_Direct}, + {128, -1, &MPIR_Scatter_MV2_Binomial}, + }, + }, + }; + table_ptrs[1] = mv2_tmp_scatter_thresholds_table_2ppn; + mv2_scatter_table_ppn_conf[2] = 16; + mv2_size_scatter_tuning_table[2] = 8; + mv2_scatter_tuning_table mv2_tmp_scatter_thresholds_table_16ppn[] = { + { + 16, + 2, + { + {0, 256, &MPIR_Scatter_MV2_Binomial}, + {256, -1, &MPIR_Scatter_MV2_Direct}, + }, + 1, + { + { 0, -1, &MPIR_Scatter_MV2_Direct}, + }, + }, + + { + 32, + 2, + { + {0, 512, &MPIR_Scatter_MV2_Binomial}, + {512, -1, &MPIR_Scatter_MV2_Direct}, + }, + 1, + { + { 0, -1, &MPIR_Scatter_MV2_Direct}, + }, + }, + + { + 64, + 2, + { + {0, 1024, &MPIR_Scatter_MV2_two_level_Direct}, + {1024, -1, &MPIR_Scatter_MV2_Direct}, + }, + 1, + { + { 0, -1, &MPIR_Scatter_MV2_Direct}, + }, + }, + + { + 128, + 4, + { + {0, 16, &MPIR_Scatter_mcst_wrap_MV2}, + {0, 16, &MPIR_Scatter_MV2_two_level_Direct}, + {16, 2048, &MPIR_Scatter_MV2_two_level_Direct}, + {2048, 
-1, &MPIR_Scatter_MV2_Direct}, + }, + 1, + { + { 0, -1, &MPIR_Scatter_MV2_Direct}, + }, + }, + + { + 256, + 4, + { + {0, 16, &MPIR_Scatter_mcst_wrap_MV2}, + {0, 16, &MPIR_Scatter_MV2_two_level_Direct}, + {16, 2048, &MPIR_Scatter_MV2_two_level_Direct}, + {2048, -1, &MPIR_Scatter_MV2_Direct}, + }, + 1, + { + { 0, -1, &MPIR_Scatter_MV2_Direct}, + }, + }, + + { + 512, + 4, + { + {0, 16, &MPIR_Scatter_mcst_wrap_MV2}, + {16, 16, &MPIR_Scatter_MV2_two_level_Direct}, + {16, 4096, &MPIR_Scatter_MV2_two_level_Direct}, + {4096, -1, &MPIR_Scatter_MV2_Direct}, + }, + 1, + { + { 0, -1, &MPIR_Scatter_MV2_Binomial}, + }, + }, + { + 1024, + 5, + { + {0, 16, &MPIR_Scatter_mcst_wrap_MV2}, + {0, 16, &MPIR_Scatter_MV2_Binomial}, + {16, 32, &MPIR_Scatter_MV2_Binomial}, + {32, 4096, &MPIR_Scatter_MV2_two_level_Direct}, + {4096, -1, &MPIR_Scatter_MV2_Direct}, + }, + 1, + { + { 0, -1, &MPIR_Scatter_MV2_Binomial}, + }, + }, + { + 2048, + 7, + { + {0, 16, &MPIR_Scatter_mcst_wrap_MV2}, + {0, 16, &MPIR_Scatter_MV2_two_level_Binomial}, + {16, 128, &MPIR_Scatter_MV2_two_level_Binomial}, + {128, 1024, &MPIR_Scatter_MV2_two_level_Direct}, + {1024, 16384, &MPIR_Scatter_MV2_two_level_Direct}, + {16384, 65536, &MPIR_Scatter_MV2_Direct}, + {65536, -1, &MPIR_Scatter_MV2_two_level_Direct}, + }, + 6, + { + {0, 16, &MPIR_Scatter_MV2_Binomial}, + {16, 128, &MPIR_Scatter_MV2_Binomial}, + {128, 1024, &MPIR_Scatter_MV2_Binomial}, + {1024, 16384, &MPIR_Scatter_MV2_Direct}, + {16384, 65536, &MPIR_Scatter_MV2_Direct}, + {65536, -1, &MPIR_Scatter_MV2_Direct}, + }, + }, + }; + table_ptrs[2] = mv2_tmp_scatter_thresholds_table_16ppn; + agg_table_sum = 0; + for (i = 0; i < mv2_scatter_num_ppn_conf; i++) { + agg_table_sum += mv2_size_scatter_tuning_table[i]; + } + mv2_scatter_thresholds_table[0] = + xbt_malloc(agg_table_sum * sizeof (mv2_scatter_tuning_table)); + memcpy(mv2_scatter_thresholds_table[0], table_ptrs[0], + (sizeof(mv2_scatter_tuning_table) + * mv2_size_scatter_tuning_table[0])); + for (i = 1; i < 
mv2_scatter_num_ppn_conf; i++) { + mv2_scatter_thresholds_table[i] = + mv2_scatter_thresholds_table[i - 1] + + mv2_size_scatter_tuning_table[i - 1]; + memcpy(mv2_scatter_thresholds_table[i], table_ptrs[i], + (sizeof(mv2_scatter_tuning_table) + * mv2_size_scatter_tuning_table[i])); + } + xbt_free(table_ptrs); + } +} + -- 2.20.1