From: Augustin Degomme Date: Wed, 24 Jul 2013 11:31:08 +0000 (+0200) Subject: Add simple autotuning selector for collectives X-Git-Tag: v3_9_90~128^2~49 X-Git-Url: http://info.iut-bm.univ-fcomte.fr/pub/gitweb/simgrid.git/commitdiff_plain/d367ff11bd2e88d47b9ba5e30f22c2b81473f08e Add simple autotuning selector for collectives For now, it loops over the existing algorithms and benchmarks each one, recording the time for each process and the maximum time across processes. It outputs the fastest collective found for each process, and also the globally fastest one. A rollback feature should be added to allow correct continuation and simulation. This is still experimental and tests are not generated; it can be called with --cfg=smpi/collname:automatic. For now we don't check input values, so some algorithms will fail (mainly because they need a power-of-2 or even number of processes). Checks should be added. --- diff --git a/buildtools/Cmake/DefinePackages.cmake b/buildtools/Cmake/DefinePackages.cmake index 7e5a7c51ef..ad79642ae8 100644 --- a/buildtools/Cmake/DefinePackages.cmake +++ b/buildtools/Cmake/DefinePackages.cmake @@ -204,6 +204,7 @@ set(SMPI_SRC src/smpi/colls/gather-ompi.c src/smpi/colls/reduce_scatter-ompi.c src/smpi/colls/reduce_scatter-mpich.c + src/smpi/colls/smpi_automatic_selector.c src/smpi/colls/scatter-ompi.c src/smpi/colls/barrier-ompi.c ) diff --git a/src/smpi/colls/colls.h b/src/smpi/colls/colls.h index 67b62a04ac..a2091b10e2 100644 --- a/src/smpi/colls/colls.h +++ b/src/smpi/colls/colls.h @@ -33,7 +33,8 @@ COLL_APPLY(action, COLL_GATHER_SIG, ompi) COLL_sep \ COLL_APPLY(action, COLL_GATHER_SIG, ompi_basic_linear) COLL_sep \ COLL_APPLY(action, COLL_GATHER_SIG, ompi_binomial) COLL_sep \ COLL_APPLY(action, COLL_GATHER_SIG, ompi_linear_sync) COLL_sep \ -COLL_APPLY(action, COLL_GATHER_SIG, mpich) \ +COLL_APPLY(action, COLL_GATHER_SIG, mpich) COLL_sep \ +COLL_APPLY(action, COLL_GATHER_SIG, automatic) @@ -65,7 +66,8 @@ COLL_APPLY(action, COLL_ALLGATHER_SIG, smp_simple) COLL_sep \ COLL_APPLY(action, 
COLL_ALLGATHER_SIG, spreading_simple) COLL_sep \ COLL_APPLY(action, COLL_ALLGATHER_SIG, ompi) COLL_sep \ COLL_APPLY(action, COLL_ALLGATHER_SIG, ompi_neighborexchange) COLL_sep \ -COLL_APPLY(action, COLL_ALLGATHER_SIG, mpich) +COLL_APPLY(action, COLL_ALLGATHER_SIG, mpich) COLL_sep \ +COLL_APPLY(action, COLL_ALLGATHER_SIG, automatic) COLL_ALLGATHERS(COLL_PROTO, COLL_NOsep) @@ -86,7 +88,8 @@ COLL_APPLY(action, COLL_ALLGATHERV_SIG, ompi) COLL_sep \ COLL_APPLY(action, COLL_ALLGATHERV_SIG, ompi_neighborexchange) COLL_sep \ COLL_APPLY(action, COLL_ALLGATHERV_SIG, ompi_bruck) COLL_sep \ COLL_APPLY(action, COLL_ALLGATHERV_SIG, mpich) COLL_sep \ -COLL_APPLY(action, COLL_ALLGATHERV_SIG, mpich_rdb) +COLL_APPLY(action, COLL_ALLGATHERV_SIG, mpich_rdb) COLL_sep \ +COLL_APPLY(action, COLL_ALLGATHERV_SIG, automatic) COLL_ALLGATHERVS(COLL_PROTO, COLL_NOsep) @@ -115,7 +118,8 @@ COLL_APPLY(action, COLL_ALLREDUCE_SIG, smp_rsag_rab) COLL_sep \ COLL_APPLY(action, COLL_ALLREDUCE_SIG, redbcast) COLL_sep \ COLL_APPLY(action, COLL_ALLREDUCE_SIG, ompi) COLL_sep \ COLL_APPLY(action, COLL_ALLREDUCE_SIG, ompi_ring_segmented) COLL_sep \ -COLL_APPLY(action, COLL_ALLREDUCE_SIG, mpich) +COLL_APPLY(action, COLL_ALLREDUCE_SIG, mpich) COLL_sep \ +COLL_APPLY(action, COLL_ALLREDUCE_SIG, automatic) COLL_ALLREDUCES(COLL_PROTO, COLL_NOsep) @@ -126,7 +130,7 @@ COLL_ALLREDUCES(COLL_PROTO, COLL_NOsep) #define COLL_ALLTOALL_SIG alltoall, int, \ (void *send_buff, int send_count, MPI_Datatype send_type, \ void *recv_buff, int recv_count, MPI_Datatype recv_type, \ - MPI_Comm com) + MPI_Comm comm) #define COLL_ALLTOALLS(action, COLL_sep) \ COLL_APPLY(action, COLL_ALLTOALL_SIG, 2dmesh) COLL_sep \ @@ -143,8 +147,9 @@ COLL_APPLY(action, COLL_ALLTOALL_SIG, ring_mpi_barrier) COLL_sep \ COLL_APPLY(action, COLL_ALLTOALL_SIG, ring_one_barrier) COLL_sep \ COLL_APPLY(action, COLL_ALLTOALL_SIG, simple) COLL_sep \ COLL_APPLY(action, COLL_ALLTOALL_SIG, ompi) COLL_sep \ -COLL_APPLY(action, COLL_ALLTOALL_SIG, mpich)COLL_sep \ 
-COLL_APPLY(action, COLL_ALLTOALL_SIG, ompi_pairwise) +COLL_APPLY(action, COLL_ALLTOALL_SIG, mpich) COLL_sep \ +COLL_APPLY(action, COLL_ALLTOALL_SIG, ompi_pairwise) COLL_sep \ +COLL_APPLY(action, COLL_ALLTOALL_SIG, automatic) COLL_ALLTOALLS(COLL_PROTO, COLL_NOsep) @@ -154,7 +159,7 @@ COLL_ALLTOALLS(COLL_PROTO, COLL_NOsep) #define COLL_ALLTOALLV_SIG alltoallv, int, \ (void *send_buff, int *send_counts, int *send_disps, MPI_Datatype send_type, \ void *recv_buff, int *recv_counts, int *recv_disps, MPI_Datatype recv_type, \ - MPI_Comm com) + MPI_Comm comm) #define COLL_ALLTOALLVS(action, COLL_sep) \ COLL_APPLY(action, COLL_ALLTOALLV_SIG, bruck) COLL_sep \ @@ -167,8 +172,9 @@ COLL_APPLY(action, COLL_ALLTOALLV_SIG, ring_light_barrier) COLL_sep \ COLL_APPLY(action, COLL_ALLTOALLV_SIG, ring_mpi_barrier) COLL_sep \ COLL_APPLY(action, COLL_ALLTOALLV_SIG, ring_one_barrier) COLL_sep \ COLL_APPLY(action, COLL_ALLTOALLV_SIG, ompi) COLL_sep \ -COLL_APPLY(action, COLL_ALLTOALLV_SIG, mpich)COLL_sep \ -COLL_APPLY(action, COLL_ALLTOALLV_SIG, ompi_basic_linear) +COLL_APPLY(action, COLL_ALLTOALLV_SIG, mpich) COLL_sep \ +COLL_APPLY(action, COLL_ALLTOALLV_SIG, ompi_basic_linear) COLL_sep \ +COLL_APPLY(action, COLL_ALLTOALLV_SIG, automatic) COLL_ALLTOALLVS(COLL_PROTO, COLL_NOsep) @@ -198,7 +204,8 @@ COLL_APPLY(action, COLL_BCAST_SIG, SMP_linear) COLL_sep \ COLL_APPLY(action, COLL_BCAST_SIG, ompi) COLL_sep \ COLL_APPLY(action, COLL_BCAST_SIG, ompi_split_bintree) COLL_sep \ COLL_APPLY(action, COLL_BCAST_SIG, ompi_pipeline) COLL_sep \ -COLL_APPLY(action, COLL_BCAST_SIG, mpich) +COLL_APPLY(action, COLL_BCAST_SIG, mpich) COLL_sep \ +COLL_APPLY(action, COLL_BCAST_SIG, automatic) COLL_BCASTS(COLL_PROTO, COLL_NOsep) @@ -223,7 +230,8 @@ COLL_APPLY(action, COLL_REDUCE_SIG, ompi_basic_linear) COLL_sep \ COLL_APPLY(action, COLL_REDUCE_SIG, ompi_in_order_binary) COLL_sep \ COLL_APPLY(action, COLL_REDUCE_SIG, ompi_binary) COLL_sep \ COLL_APPLY(action, COLL_REDUCE_SIG, ompi_binomial) COLL_sep \ 
-COLL_APPLY(action, COLL_REDUCE_SIG, mpich) +COLL_APPLY(action, COLL_REDUCE_SIG, mpich) COLL_sep \ +COLL_APPLY(action, COLL_REDUCE_SIG, automatic) COLL_REDUCES(COLL_PROTO, COLL_NOsep) @@ -241,7 +249,9 @@ COLL_APPLY(action, COLL_REDUCE_SCATTER_SIG, ompi_ring) COLL_sep \ COLL_APPLY(action, COLL_REDUCE_SCATTER_SIG, mpich) COLL_sep \ COLL_APPLY(action, COLL_REDUCE_SCATTER_SIG, mpich_pair) COLL_sep \ COLL_APPLY(action, COLL_REDUCE_SCATTER_SIG, mpich_rdb) COLL_sep \ -COLL_APPLY(action, COLL_REDUCE_SCATTER_SIG, mpich_noncomm) +COLL_APPLY(action, COLL_REDUCE_SCATTER_SIG, mpich_noncomm) COLL_sep \ +COLL_APPLY(action, COLL_REDUCE_SCATTER_SIG, automatic) + COLL_REDUCE_SCATTERS(COLL_PROTO, COLL_NOsep) @@ -259,7 +269,8 @@ COLL_REDUCE_SCATTERS(COLL_PROTO, COLL_NOsep) COLL_APPLY(action, COLL_SCATTER_SIG, ompi) COLL_sep \ COLL_APPLY(action, COLL_SCATTER_SIG, ompi_basic_linear) COLL_sep \ COLL_APPLY(action, COLL_SCATTER_SIG, ompi_binomial) COLL_sep \ -COLL_APPLY(action, COLL_SCATTER_SIG, mpich) +COLL_APPLY(action, COLL_SCATTER_SIG, mpich) COLL_sep \ +COLL_APPLY(action, COLL_SCATTER_SIG, automatic) COLL_SCATTERS(COLL_PROTO, COLL_NOsep) @@ -277,7 +288,8 @@ COLL_APPLY(action, COLL_BARRIER_SIG, ompi_tree) COLL_sep \ COLL_APPLY(action, COLL_BARRIER_SIG, ompi_bruck) COLL_sep \ COLL_APPLY(action, COLL_BARRIER_SIG, ompi_recursivedoubling) COLL_sep \ COLL_APPLY(action, COLL_BARRIER_SIG, ompi_doublering) COLL_sep \ -COLL_APPLY(action, COLL_BARRIER_SIG, mpich) +COLL_APPLY(action, COLL_BARRIER_SIG, mpich) COLL_sep \ +COLL_APPLY(action, COLL_BARRIER_SIG, automatic) COLL_BARRIERS(COLL_PROTO, COLL_NOsep) diff --git a/src/smpi/colls/smpi_automatic_selector.c b/src/smpi/colls/smpi_automatic_selector.c new file mode 100644 index 0000000000..192ac0f892 --- /dev/null +++ b/src/smpi/colls/smpi_automatic_selector.c @@ -0,0 +1,63 @@ +#include "colls_private.h" +#include +#include "mc/mc_private.h" + +//attempt to do a quick autotuning version of the collective, + +#define AUTOMATIC_COLL_BENCH(cat, ret, 
args, args2)\ + ret smpi_coll_tuned_ ## cat ## _ ## automatic(COLL_UNPAREN args)\ +{\ + double time1, time2, time_min=INT_MAX;\ + int min_coll=-1, global_coll=-1;\ + int i;\ + double buf_in, buf_out, max_min=INT_MAX;\ + for (i = 0; mpi_coll_##cat##_description[i].name; i++){\ + if(!strcmp(mpi_coll_##cat##_description[i].name, "automatic"))continue;\ + if(!strcmp(mpi_coll_##cat##_description[i].name, "default"))continue;\ + smpi_mpi_barrier(comm);\ + if (TRACE_is_enabled()){\ + type_t type = PJ_type_get_or_null (#cat, PJ_type_get_root());\ + if (!type){\ + type=PJ_type_event_new(#cat, PJ_type_get_root());\ + }\ + char* cont_name=malloc(25*sizeof(char*));\ + sprintf(cont_name, "rank-%d", smpi_process_index());\ + val_t value = PJ_value_get_or_new(mpi_coll_##cat##_description[i].name,"1.0 1.0 1.0", type);\ + new_pajeNewEvent (SIMIX_get_clock(), PJ_container_get(cont_name), type, value);\ + }\ + time1 = SIMIX_get_clock();\ + ((int (*) args)\ + mpi_coll_##cat##_description[i].coll) args2 ;\ + time2 = SIMIX_get_clock();\ + buf_out=time2-time1;\ + smpi_mpi_reduce((void*)&buf_out,(void*)&buf_in, 1, MPI_DOUBLE, MPI_MAX, 0,comm );\ + if(time2-time1