From 56b8ebe99bfcd666e373069b2072291ab5e04ee8 Mon Sep 17 00:00:00 2001 From: Augustin Degomme Date: Wed, 22 Jan 2014 17:46:13 +0100 Subject: [PATCH] protect (hopefully) collective communication algorithms from abuse. Prevent their use with a number of nodes they don't like. Allows the automatic selector to ignore those that fail, using the exception mechanism. --- src/smpi/colls/allgather-2dmesh.c | 2 +- src/smpi/colls/allgather-3dmesh.c | 4 +++- src/smpi/colls/allgather-SMP-NTS.c | 3 +++ src/smpi/colls/allgather-loosely-lr.c | 4 ++++ src/smpi/colls/allgather-pair.c | 4 ++++ src/smpi/colls/allgather-rhv.c | 4 ++++ src/smpi/colls/allgather-smp-simple.c | 4 ++++ src/smpi/colls/allgatherv-pair.c | 4 ++++ src/smpi/colls/allreduce-rab1.c | 3 +++ src/smpi/colls/allreduce-smp-rsag-rab.c | 4 ++++ src/smpi/colls/alltoall-pair-light-barrier.c | 4 ++++ src/smpi/colls/alltoall-pair-mpi-barrier.c | 4 ++++ src/smpi/colls/alltoall-pair-one-barrier.c | 4 ++++ src/smpi/colls/alltoall-pair.c | 4 ++++ src/smpi/colls/alltoallv-pair-light-barrier.c | 4 ++++ src/smpi/colls/alltoallv-pair-mpi-barrier.c | 4 ++++ src/smpi/colls/alltoallv-pair-one-barrier.c | 4 ++++ src/smpi/colls/alltoallv-pair.c | 4 ++++ src/smpi/colls/bcast-SMP-binary.c | 3 +++ src/smpi/colls/bcast-SMP-binomial.c | 3 +++ src/smpi/colls/bcast-SMP-linear.c | 3 +++ src/smpi/colls/colls.h | 1 + src/smpi/colls/smpi_automatic_selector.c | 6 ++++++ 23 files changed, 82 insertions(+), 2 deletions(-) diff --git a/src/smpi/colls/allgather-2dmesh.c b/src/smpi/colls/allgather-2dmesh.c index 602d540e11..8ebc8489b3 100644 --- a/src/smpi/colls/allgather-2dmesh.c +++ b/src/smpi/colls/allgather-2dmesh.c @@ -120,7 +120,7 @@ smpi_coll_tuned_allgather_2dmesh(void *send_buff, int send_count, MPI_Datatype block_size = extent * send_count; if (!is_2dmesh(num_procs, &X, &Y)) - return MPI_ERR_COMM; + THROWF(arg_error,0, "allgather_2dmesh algorithm can't be used with this number of processes! 
"); my_row_base = (rank / Y) * Y; my_col_base = rank % Y; diff --git a/src/smpi/colls/allgather-3dmesh.c b/src/smpi/colls/allgather-3dmesh.c index 5a224379c9..c83fdc6a3c 100644 --- a/src/smpi/colls/allgather-3dmesh.c +++ b/src/smpi/colls/allgather-3dmesh.c @@ -103,7 +103,9 @@ int smpi_coll_tuned_allgather_3dmesh(void *send_buff, int send_count, num_procs = smpi_comm_size(comm); extent = smpi_datatype_get_extent(send_type); - is_3dmesh(num_procs, &X, &Y, &Z); + if (!is_3dmesh(num_procs, &X, &Y, &Z)) + THROWF(arg_error,0, "allgather_3dmesh algorithm can't be used with this number of processes! "); + num_reqs = X; diff --git a/src/smpi/colls/allgather-SMP-NTS.c b/src/smpi/colls/allgather-SMP-NTS.c index 3bbad64f99..7ad059a316 100644 --- a/src/smpi/colls/allgather-SMP-NTS.c +++ b/src/smpi/colls/allgather-SMP-NTS.c @@ -23,6 +23,9 @@ int smpi_coll_tuned_allgather_SMP_NTS(void *sbuf, int scount, int inter_comm_size = (comm_size + NUM_CORE - 1) / NUM_CORE; int num_core_in_current_smp = NUM_CORE; + if(comm_size%NUM_CORE) + THROWF(arg_error,0, "allgather SMP NTS algorithm can't be used with non multiple of NUM_CORE=%d number of processes ! ", NUM_CORE); + /* for too small number of processes, use default implementation */ if (comm_size <= NUM_CORE) { XBT_WARN("MPI_allgather_SMP_NTS use default MPI_allgather."); diff --git a/src/smpi/colls/allgather-loosely-lr.c b/src/smpi/colls/allgather-loosely-lr.c index 04c7faff15..7abe6e0d2b 100644 --- a/src/smpi/colls/allgather-loosely-lr.c +++ b/src/smpi/colls/allgather-loosely-lr.c @@ -16,6 +16,10 @@ int smpi_coll_tuned_allgather_loosely_lr(void *sbuf, int scount, int inter_dst, inter_src; comm_size = smpi_comm_size(comm); + + if(comm_size%4) + THROWF(arg_error,0, "allgather loosely lr algorithm can't be used with non multiple of NUM_CORE=4 number of processes ! 
"); + rank = smpi_comm_rank(comm); MPI_Aint rextent, sextent; rextent = smpi_datatype_get_extent(rtype); diff --git a/src/smpi/colls/allgather-pair.c b/src/smpi/colls/allgather-pair.c index 6075723abc..50f75aa747 100644 --- a/src/smpi/colls/allgather-pair.c +++ b/src/smpi/colls/allgather-pair.c @@ -75,6 +75,10 @@ smpi_coll_tuned_allgather_pair(void *send_buff, int send_count, rank = smpi_comm_rank(comm); num_procs = smpi_comm_size(comm); + + if((num_procs&(num_procs-1))) + THROWF(arg_error,0, "allgather pair algorithm can't be used with non power of two number of processes ! "); + extent = smpi_datatype_get_extent(send_type); // local send/recv diff --git a/src/smpi/colls/allgather-rhv.c b/src/smpi/colls/allgather-rhv.c index aceb283c20..8179aff371 100644 --- a/src/smpi/colls/allgather-rhv.c +++ b/src/smpi/colls/allgather-rhv.c @@ -21,6 +21,10 @@ smpi_coll_tuned_allgather_rhv(void *sbuf, int send_count, // get size of the communicator, followed by rank num_procs = smpi_comm_size(comm); + + if((num_procs&(num_procs-1))) + THROWF(arg_error,0, "allgather rhv algorithm can't be used with non power of two number of processes ! "); + rank = smpi_comm_rank(comm); // get size of single element's type for send buffer and recv buffer diff --git a/src/smpi/colls/allgather-smp-simple.c b/src/smpi/colls/allgather-smp-simple.c index 143887052e..f1c25d0428 100644 --- a/src/smpi/colls/allgather-smp-simple.c +++ b/src/smpi/colls/allgather-smp-simple.c @@ -10,6 +10,10 @@ int smpi_coll_tuned_allgather_smp_simple(void *send_buf, int scount, { int src, dst, comm_size, rank; comm_size = smpi_comm_size(comm); + + if(comm_size%NUM_CORE) + THROWF(arg_error,0, "allgather SMP simple algorithm can't be used with non multiple of NUM_CORE=%d number of processes ! 
", NUM_CORE); + rank = smpi_comm_rank(comm); MPI_Aint rextent, sextent; rextent = smpi_datatype_get_extent(rtype); diff --git a/src/smpi/colls/allgatherv-pair.c b/src/smpi/colls/allgatherv-pair.c index c73366eeb7..e31fb6236c 100644 --- a/src/smpi/colls/allgatherv-pair.c +++ b/src/smpi/colls/allgatherv-pair.c @@ -75,6 +75,10 @@ smpi_coll_tuned_allgatherv_pair(void *send_buff, int send_count, rank = smpi_comm_rank(comm); num_procs = smpi_comm_size(comm); + + if((num_procs&(num_procs-1))) + THROWF(arg_error,0, "allgatherv pair algorithm can't be used with non power of two number of processes ! "); + extent = smpi_datatype_get_extent(send_type); // local send/recv diff --git a/src/smpi/colls/allreduce-rab1.c b/src/smpi/colls/allreduce-rab1.c index 66862da91b..e8633396b6 100644 --- a/src/smpi/colls/allreduce-rab1.c +++ b/src/smpi/colls/allreduce-rab1.c @@ -16,6 +16,9 @@ int smpi_coll_tuned_allreduce_rab1(void *sbuff, void *rbuff, rank = smpi_comm_rank(comm); nprocs = smpi_comm_size(comm); + if((nprocs&(nprocs-1))) + THROWF(arg_error,0, "allreduce rab1 algorithm can't be used with non power of two number of processes ! "); + extent = smpi_datatype_get_extent(dtype); pof2 = 1; diff --git a/src/smpi/colls/allreduce-smp-rsag-rab.c b/src/smpi/colls/allreduce-smp-rsag-rab.c index cec75216b1..ced01ab0c6 100644 --- a/src/smpi/colls/allreduce-smp-rsag-rab.c +++ b/src/smpi/colls/allreduce-smp-rsag-rab.c @@ -29,6 +29,10 @@ int smpi_coll_tuned_allreduce_smp_rsag_rab(void *sbuf, void *rbuf, int count, int num_core = NUM_CORE; comm_size = smpi_comm_size(comm); + + if((comm_size&(comm_size-1))) + THROWF(arg_error,0, "allreduce smp rsag rab algorithm can't be used with non power of two number of processes ! 
"); + rank = smpi_comm_rank(comm); MPI_Aint extent; extent = smpi_datatype_get_extent(dtype); diff --git a/src/smpi/colls/alltoall-pair-light-barrier.c b/src/smpi/colls/alltoall-pair-light-barrier.c index 65ae5f8488..9c6a6ded33 100644 --- a/src/smpi/colls/alltoall-pair-light-barrier.c +++ b/src/smpi/colls/alltoall-pair-light-barrier.c @@ -39,6 +39,10 @@ smpi_coll_tuned_alltoall_pair_light_barrier(void *send_buff, int send_count, rank = smpi_comm_rank(comm); num_procs = smpi_comm_size(comm); + + if((num_procs&(num_procs-1))) + THROWF(arg_error,0, "alltoall pair algorithm can't be used with non power of two number of processes ! "); + send_chunk = smpi_datatype_get_extent(send_type); recv_chunk = smpi_datatype_get_extent(recv_type); diff --git a/src/smpi/colls/alltoall-pair-mpi-barrier.c b/src/smpi/colls/alltoall-pair-mpi-barrier.c index 20dd573026..1816cff4d4 100644 --- a/src/smpi/colls/alltoall-pair-mpi-barrier.c +++ b/src/smpi/colls/alltoall-pair-mpi-barrier.c @@ -36,6 +36,10 @@ smpi_coll_tuned_alltoall_pair_mpi_barrier(void *send_buff, int send_count, rank = smpi_comm_rank(comm); num_procs = smpi_comm_size(comm); + + if((num_procs&(num_procs-1))) + THROWF(arg_error,0, "alltoall pair algorithm can't be used with non power of two number of processes ! "); + send_chunk = smpi_datatype_get_extent(send_type); recv_chunk = smpi_datatype_get_extent(recv_type); diff --git a/src/smpi/colls/alltoall-pair-one-barrier.c b/src/smpi/colls/alltoall-pair-one-barrier.c index 209fa81c29..b23040254a 100644 --- a/src/smpi/colls/alltoall-pair-one-barrier.c +++ b/src/smpi/colls/alltoall-pair-one-barrier.c @@ -37,6 +37,10 @@ smpi_coll_tuned_alltoall_pair_one_barrier(void *send_buff, int send_count, rank = smpi_comm_rank(comm); num_procs = smpi_comm_size(comm); + + if((num_procs&(num_procs-1))) + THROWF(arg_error,0, "alltoall pair algorithm can't be used with non power of two number of processes ! 
"); + send_chunk = smpi_datatype_get_extent(send_type); recv_chunk = smpi_datatype_get_extent(recv_type); diff --git a/src/smpi/colls/alltoall-pair.c b/src/smpi/colls/alltoall-pair.c index 66c47675ca..bac1909a84 100644 --- a/src/smpi/colls/alltoall-pair.c +++ b/src/smpi/colls/alltoall-pair.c @@ -74,6 +74,10 @@ int smpi_coll_tuned_alltoall_pair(void *send_buff, int send_count, rank = smpi_comm_rank(comm); num_procs = smpi_comm_size(comm); + + if((num_procs&(num_procs-1))) + THROWF(arg_error,0, "alltoall pair algorithm can't be used with non power of two number of processes ! "); + send_chunk = smpi_datatype_get_extent(send_type); recv_chunk = smpi_datatype_get_extent(recv_type); diff --git a/src/smpi/colls/alltoallv-pair-light-barrier.c b/src/smpi/colls/alltoallv-pair-light-barrier.c index 60e2647375..7843d9130c 100644 --- a/src/smpi/colls/alltoallv-pair-light-barrier.c +++ b/src/smpi/colls/alltoallv-pair-light-barrier.c @@ -39,6 +39,10 @@ smpi_coll_tuned_alltoallv_pair_light_barrier(void *send_buff, int *send_counts, rank = smpi_comm_rank(comm); num_procs = smpi_comm_size(comm); + + if((num_procs&(num_procs-1))) + THROWF(arg_error,0, "alltoallv pair algorithm can't be used with non power of two number of processes ! "); + send_chunk = smpi_datatype_get_extent(send_type); recv_chunk = smpi_datatype_get_extent(recv_type); diff --git a/src/smpi/colls/alltoallv-pair-mpi-barrier.c b/src/smpi/colls/alltoallv-pair-mpi-barrier.c index 62b0d71550..aba8f25827 100644 --- a/src/smpi/colls/alltoallv-pair-mpi-barrier.c +++ b/src/smpi/colls/alltoallv-pair-mpi-barrier.c @@ -36,6 +36,10 @@ smpi_coll_tuned_alltoallv_pair_mpi_barrier(void *send_buff, int *send_counts, in rank = smpi_comm_rank(comm); num_procs = smpi_comm_size(comm); + + if((num_procs&(num_procs-1))) + THROWF(arg_error,0, "alltoallv pair algorithm can't be used with non power of two number of processes ! 
"); + send_chunk = smpi_datatype_get_extent(send_type); recv_chunk = smpi_datatype_get_extent(recv_type); diff --git a/src/smpi/colls/alltoallv-pair-one-barrier.c b/src/smpi/colls/alltoallv-pair-one-barrier.c index 7bea7e7171..7227df81fb 100644 --- a/src/smpi/colls/alltoallv-pair-one-barrier.c +++ b/src/smpi/colls/alltoallv-pair-one-barrier.c @@ -36,6 +36,10 @@ smpi_coll_tuned_alltoallv_pair_one_barrier(void *send_buff, int *send_counts, in rank = smpi_comm_rank(comm); num_procs = smpi_comm_size(comm); + + if((num_procs&(num_procs-1))) + THROWF(arg_error,0, "alltoallv pair algorithm can't be used with non power of two number of processes ! "); + send_chunk = smpi_datatype_get_extent(send_type); recv_chunk = smpi_datatype_get_extent(recv_type); diff --git a/src/smpi/colls/alltoallv-pair.c b/src/smpi/colls/alltoallv-pair.c index 6692eeb30d..afce437755 100644 --- a/src/smpi/colls/alltoallv-pair.c +++ b/src/smpi/colls/alltoallv-pair.c @@ -36,6 +36,10 @@ int smpi_coll_tuned_alltoallv_pair(void *send_buff, int *send_counts, int *send_ rank = smpi_comm_rank(comm); num_procs = smpi_comm_size(comm); + + if((num_procs&(num_procs-1))) + THROWF(arg_error,0, "alltoallv pair algorithm can't be used with non power of two number of processes ! "); + send_chunk = smpi_datatype_get_extent(send_type); recv_chunk = smpi_datatype_get_extent(recv_type); diff --git a/src/smpi/colls/bcast-SMP-binary.c b/src/smpi/colls/bcast-SMP-binary.c index 1645e710dd..c09d703450 100644 --- a/src/smpi/colls/bcast-SMP-binary.c +++ b/src/smpi/colls/bcast-SMP-binary.c @@ -22,6 +22,9 @@ int smpi_coll_tuned_bcast_SMP_binary(void *buf, int count, rank = smpi_comm_rank(comm); size = smpi_comm_size(comm); + if(size%NUM_CORE) + THROWF(arg_error,0, "bcast SMP binary can't be used with non multiple of NUM_CORE=%d number of processes ! 
",NUM_CORE); + int segment = bcast_SMP_binary_segment_byte / extent; int pipe_length = count / segment; int remainder = count % segment; diff --git a/src/smpi/colls/bcast-SMP-binomial.c b/src/smpi/colls/bcast-SMP-binomial.c index c2b24a4d00..223996096b 100644 --- a/src/smpi/colls/bcast-SMP-binomial.c +++ b/src/smpi/colls/bcast-SMP-binomial.c @@ -16,6 +16,9 @@ int smpi_coll_tuned_bcast_SMP_binomial(void *buf, int count, size = smpi_comm_size(comm); rank = smpi_comm_rank(comm); + if(size%NUM_CORE) + THROWF(arg_error,0, "bcast SMP binomial can't be used with non multiple of NUM_CORE=%d number of processes ! ",NUM_CORE); + int to_intra, to_inter; int from_intra, from_inter; int inter_rank = rank / NUM_CORE; diff --git a/src/smpi/colls/bcast-SMP-linear.c b/src/smpi/colls/bcast-SMP-linear.c index b999b395d8..092ab26ed0 100644 --- a/src/smpi/colls/bcast-SMP-linear.c +++ b/src/smpi/colls/bcast-SMP-linear.c @@ -22,6 +22,9 @@ int smpi_coll_tuned_bcast_SMP_linear(void *buf, int count, rank = smpi_comm_rank(comm); size = smpi_comm_size(comm); + if(size%NUM_CORE) + THROWF(arg_error,0, "bcast SMP linear can't be used with non multiple of NUM_CORE=%d number of processes ! 
",NUM_CORE); + int segment = bcast_SMP_linear_segment_byte / extent; int pipe_length = count / segment; int remainder = count % segment; diff --git a/src/smpi/colls/colls.h b/src/smpi/colls/colls.h index b70a313966..9a2cd45370 100644 --- a/src/smpi/colls/colls.h +++ b/src/smpi/colls/colls.h @@ -4,6 +4,7 @@ #include #include "smpi/mpi.h" #include "smpi/private.h" +#include "xbt/ex.h" #include "xbt.h" #define COLL_DESCRIPTION(cat, ret, args, name) \ diff --git a/src/smpi/colls/smpi_automatic_selector.c b/src/smpi/colls/smpi_automatic_selector.c index ded62ec1b0..95a7ce470d 100644 --- a/src/smpi/colls/smpi_automatic_selector.c +++ b/src/smpi/colls/smpi_automatic_selector.c @@ -26,6 +26,7 @@ double time1, time2, time_min=DBL_MAX;\ int min_coll=-1, global_coll=-1;\ int i;\ + xbt_ex_t ex;\ double buf_in, buf_out, max_min=DBL_MAX;\ for (i = 0; mpi_coll_##cat##_description[i].name; i++){\ if(!strcmp(mpi_coll_##cat##_description[i].name, "automatic"))continue;\ @@ -33,8 +34,13 @@ smpi_mpi_barrier(comm);\ TRACE_AUTO_COLL(cat)\ time1 = SIMIX_get_clock();\ + TRY{\ ((int (*) args)\ mpi_coll_##cat##_description[i].coll) args2 ;\ + }\ + CATCH(ex) {\ + continue;\ + }\ time2 = SIMIX_get_clock();\ buf_out=time2-time1;\ smpi_mpi_reduce((void*)&buf_out,(void*)&buf_in, 1, MPI_DOUBLE, MPI_MAX, 0,comm );\ -- 2.20.1