Logo AND Algorithmique Numérique Distribuée

Public GIT Repository
Add simple autotuning selector for collectives
authorAugustin Degomme <degomme@idpann.imag.fr>
Wed, 24 Jul 2013 11:31:08 +0000 (13:31 +0200)
committerAugustin Degomme <degomme@idpann.imag.fr>
Wed, 24 Jul 2013 11:31:08 +0000 (13:31 +0200)
For now, it loops over the existing algorithms and benchmarks, for each one, the time taken on each process and the maximum time across processes.
It reports the fastest algorithm found for each process, and also the globally fastest one.
A rollback feature should be added to allow correct continuation and simulation.
This is still experimental and no tests are generated yet; it can be enabled with --cfg=smpi/collname:automatic
For now we don't check input values, so some algorithms will fail (mainly because they need a power-of-2 or an even number of processes). Checks should be added.

buildtools/Cmake/DefinePackages.cmake
src/smpi/colls/colls.h
src/smpi/colls/smpi_automatic_selector.c [new file with mode: 0644]

index 7e5a7c5..ad79642 100644 (file)
@@ -204,6 +204,7 @@ set(SMPI_SRC
   src/smpi/colls/gather-ompi.c
   src/smpi/colls/reduce_scatter-ompi.c
   src/smpi/colls/reduce_scatter-mpich.c
+  src/smpi/colls/smpi_automatic_selector.c
   src/smpi/colls/scatter-ompi.c
   src/smpi/colls/barrier-ompi.c
   )
index 67b62a0..a2091b1 100644 (file)
@@ -33,7 +33,8 @@ COLL_APPLY(action, COLL_GATHER_SIG, ompi) COLL_sep \
 COLL_APPLY(action, COLL_GATHER_SIG, ompi_basic_linear) COLL_sep \
 COLL_APPLY(action, COLL_GATHER_SIG, ompi_binomial) COLL_sep \
 COLL_APPLY(action, COLL_GATHER_SIG, ompi_linear_sync) COLL_sep \
-COLL_APPLY(action, COLL_GATHER_SIG, mpich) \
+COLL_APPLY(action, COLL_GATHER_SIG, mpich) COLL_sep \
+COLL_APPLY(action, COLL_GATHER_SIG, automatic)
 
 
 
@@ -65,7 +66,8 @@ COLL_APPLY(action, COLL_ALLGATHER_SIG, smp_simple) COLL_sep \
 COLL_APPLY(action, COLL_ALLGATHER_SIG, spreading_simple) COLL_sep \
 COLL_APPLY(action, COLL_ALLGATHER_SIG, ompi) COLL_sep \
 COLL_APPLY(action, COLL_ALLGATHER_SIG, ompi_neighborexchange) COLL_sep \
-COLL_APPLY(action, COLL_ALLGATHER_SIG, mpich) 
+COLL_APPLY(action, COLL_ALLGATHER_SIG, mpich) COLL_sep \
+COLL_APPLY(action, COLL_ALLGATHER_SIG, automatic)
 
 
 COLL_ALLGATHERS(COLL_PROTO, COLL_NOsep)
@@ -86,7 +88,8 @@ COLL_APPLY(action, COLL_ALLGATHERV_SIG, ompi) COLL_sep \
 COLL_APPLY(action, COLL_ALLGATHERV_SIG, ompi_neighborexchange) COLL_sep \
 COLL_APPLY(action, COLL_ALLGATHERV_SIG, ompi_bruck) COLL_sep \
 COLL_APPLY(action, COLL_ALLGATHERV_SIG, mpich) COLL_sep \
-COLL_APPLY(action, COLL_ALLGATHERV_SIG, mpich_rdb)
+COLL_APPLY(action, COLL_ALLGATHERV_SIG, mpich_rdb) COLL_sep \
+COLL_APPLY(action, COLL_ALLGATHERV_SIG, automatic)
 
 COLL_ALLGATHERVS(COLL_PROTO, COLL_NOsep)
 
@@ -115,7 +118,8 @@ COLL_APPLY(action, COLL_ALLREDUCE_SIG, smp_rsag_rab) COLL_sep \
 COLL_APPLY(action, COLL_ALLREDUCE_SIG, redbcast) COLL_sep \
 COLL_APPLY(action, COLL_ALLREDUCE_SIG, ompi) COLL_sep \
 COLL_APPLY(action, COLL_ALLREDUCE_SIG, ompi_ring_segmented) COLL_sep \
-COLL_APPLY(action, COLL_ALLREDUCE_SIG, mpich)
+COLL_APPLY(action, COLL_ALLREDUCE_SIG, mpich) COLL_sep \
+COLL_APPLY(action, COLL_ALLREDUCE_SIG, automatic)
 
 COLL_ALLREDUCES(COLL_PROTO, COLL_NOsep)
 
@@ -126,7 +130,7 @@ COLL_ALLREDUCES(COLL_PROTO, COLL_NOsep)
 #define COLL_ALLTOALL_SIG alltoall, int, \
                         (void *send_buff, int send_count, MPI_Datatype send_type, \
                          void *recv_buff, int recv_count, MPI_Datatype recv_type, \
-                          MPI_Comm com)
+                          MPI_Comm comm)
 
 #define COLL_ALLTOALLS(action, COLL_sep) \
 COLL_APPLY(action, COLL_ALLTOALL_SIG, 2dmesh) COLL_sep \
@@ -143,8 +147,9 @@ COLL_APPLY(action, COLL_ALLTOALL_SIG, ring_mpi_barrier) COLL_sep \
 COLL_APPLY(action, COLL_ALLTOALL_SIG, ring_one_barrier) COLL_sep \
 COLL_APPLY(action, COLL_ALLTOALL_SIG, simple) COLL_sep \
 COLL_APPLY(action, COLL_ALLTOALL_SIG, ompi) COLL_sep \
-COLL_APPLY(action, COLL_ALLTOALL_SIG, mpich)COLL_sep \
-COLL_APPLY(action, COLL_ALLTOALL_SIG, ompi_pairwise)
+COLL_APPLY(action, COLL_ALLTOALL_SIG, mpich) COLL_sep \
+COLL_APPLY(action, COLL_ALLTOALL_SIG, ompi_pairwise) COLL_sep \
+COLL_APPLY(action, COLL_ALLTOALL_SIG, automatic)
 
 COLL_ALLTOALLS(COLL_PROTO, COLL_NOsep)
 
@@ -154,7 +159,7 @@ COLL_ALLTOALLS(COLL_PROTO, COLL_NOsep)
 #define COLL_ALLTOALLV_SIG alltoallv, int, \
                         (void *send_buff, int *send_counts, int *send_disps, MPI_Datatype send_type, \
                          void *recv_buff, int *recv_counts, int *recv_disps, MPI_Datatype recv_type, \
-                          MPI_Comm com)
+                          MPI_Comm comm)
 
 #define COLL_ALLTOALLVS(action, COLL_sep) \
 COLL_APPLY(action, COLL_ALLTOALLV_SIG, bruck) COLL_sep \
@@ -167,8 +172,9 @@ COLL_APPLY(action, COLL_ALLTOALLV_SIG, ring_light_barrier) COLL_sep \
 COLL_APPLY(action, COLL_ALLTOALLV_SIG, ring_mpi_barrier) COLL_sep \
 COLL_APPLY(action, COLL_ALLTOALLV_SIG, ring_one_barrier) COLL_sep \
 COLL_APPLY(action, COLL_ALLTOALLV_SIG, ompi) COLL_sep \
-COLL_APPLY(action, COLL_ALLTOALLV_SIG, mpich)COLL_sep \
-COLL_APPLY(action, COLL_ALLTOALLV_SIG, ompi_basic_linear)
+COLL_APPLY(action, COLL_ALLTOALLV_SIG, mpich) COLL_sep \
+COLL_APPLY(action, COLL_ALLTOALLV_SIG, ompi_basic_linear) COLL_sep \
+COLL_APPLY(action, COLL_ALLTOALLV_SIG, automatic)
 
 COLL_ALLTOALLVS(COLL_PROTO, COLL_NOsep)
 
@@ -198,7 +204,8 @@ COLL_APPLY(action, COLL_BCAST_SIG, SMP_linear) COLL_sep \
 COLL_APPLY(action, COLL_BCAST_SIG, ompi) COLL_sep \
 COLL_APPLY(action, COLL_BCAST_SIG, ompi_split_bintree) COLL_sep \
 COLL_APPLY(action, COLL_BCAST_SIG, ompi_pipeline) COLL_sep \
-COLL_APPLY(action, COLL_BCAST_SIG, mpich)
+COLL_APPLY(action, COLL_BCAST_SIG, mpich) COLL_sep \
+COLL_APPLY(action, COLL_BCAST_SIG, automatic)
 
 COLL_BCASTS(COLL_PROTO, COLL_NOsep)
 
@@ -223,7 +230,8 @@ COLL_APPLY(action, COLL_REDUCE_SIG, ompi_basic_linear) COLL_sep \
 COLL_APPLY(action, COLL_REDUCE_SIG, ompi_in_order_binary) COLL_sep \
 COLL_APPLY(action, COLL_REDUCE_SIG, ompi_binary) COLL_sep \
 COLL_APPLY(action, COLL_REDUCE_SIG, ompi_binomial) COLL_sep \
-COLL_APPLY(action, COLL_REDUCE_SIG, mpich)
+COLL_APPLY(action, COLL_REDUCE_SIG, mpich) COLL_sep \
+COLL_APPLY(action, COLL_REDUCE_SIG, automatic)
 
 COLL_REDUCES(COLL_PROTO, COLL_NOsep)
 
@@ -241,7 +249,9 @@ COLL_APPLY(action, COLL_REDUCE_SCATTER_SIG, ompi_ring)  COLL_sep \
 COLL_APPLY(action, COLL_REDUCE_SCATTER_SIG, mpich) COLL_sep \
 COLL_APPLY(action, COLL_REDUCE_SCATTER_SIG, mpich_pair) COLL_sep \
 COLL_APPLY(action, COLL_REDUCE_SCATTER_SIG, mpich_rdb) COLL_sep \
-COLL_APPLY(action, COLL_REDUCE_SCATTER_SIG, mpich_noncomm) 
+COLL_APPLY(action, COLL_REDUCE_SCATTER_SIG, mpich_noncomm) COLL_sep \
+COLL_APPLY(action, COLL_REDUCE_SCATTER_SIG, automatic)
+
 
 
 COLL_REDUCE_SCATTERS(COLL_PROTO, COLL_NOsep)
@@ -259,7 +269,8 @@ COLL_REDUCE_SCATTERS(COLL_PROTO, COLL_NOsep)
 COLL_APPLY(action, COLL_SCATTER_SIG, ompi) COLL_sep \
 COLL_APPLY(action, COLL_SCATTER_SIG, ompi_basic_linear) COLL_sep \
 COLL_APPLY(action, COLL_SCATTER_SIG, ompi_binomial)  COLL_sep \
-COLL_APPLY(action, COLL_SCATTER_SIG, mpich) 
+COLL_APPLY(action, COLL_SCATTER_SIG, mpich)   COLL_sep \
+COLL_APPLY(action, COLL_SCATTER_SIG, automatic)
 
 COLL_SCATTERS(COLL_PROTO, COLL_NOsep)
 
@@ -277,7 +288,8 @@ COLL_APPLY(action, COLL_BARRIER_SIG, ompi_tree)  COLL_sep \
 COLL_APPLY(action, COLL_BARRIER_SIG, ompi_bruck)  COLL_sep \
 COLL_APPLY(action, COLL_BARRIER_SIG, ompi_recursivedoubling) COLL_sep \
 COLL_APPLY(action, COLL_BARRIER_SIG, ompi_doublering) COLL_sep \
-COLL_APPLY(action, COLL_BARRIER_SIG, mpich)  
+COLL_APPLY(action, COLL_BARRIER_SIG, mpich)   COLL_sep \
+COLL_APPLY(action, COLL_BARRIER_SIG, automatic)
 
 COLL_BARRIERS(COLL_PROTO, COLL_NOsep)
 
diff --git a/src/smpi/colls/smpi_automatic_selector.c b/src/smpi/colls/smpi_automatic_selector.c
new file mode 100644 (file)
index 0000000..192ac0f
--- /dev/null
@@ -0,0 +1,63 @@
+#include "colls_private.h"
+#include <limits.h>
+#include "mc/mc_private.h"
+
+//attempt at a quick autotuning version of each collective: time every registered algorithm and report the fastest
+
+#define AUTOMATIC_COLL_BENCH(cat, ret, args, args2)\
+    ret smpi_coll_tuned_ ## cat ## _ ## automatic(COLL_UNPAREN args)\
+{\
+  /* Runs every registered algorithm of this collective once, timing each; logs the fastest per rank and, on rank 0, the one with the smallest maximum time. */\
+  double start, elapsed, max_elapsed, time_min=INT_MAX, max_min=INT_MAX;\
+  int i, rank=smpi_comm_rank(comm), min_coll=-1, global_coll=-1;\
+  for (i = 0; mpi_coll_##cat##_description[i].name; i++){\
+      /* Skip the "automatic" and "default" entries: only concrete algorithms are timed. */\
+      if(!strcmp(mpi_coll_##cat##_description[i].name, "automatic"))continue;\
+      if(!strcmp(mpi_coll_##cat##_description[i].name, "default"))continue;\
+      smpi_mpi_barrier(comm); /* line all ranks up before starting the clock */\
+      if (TRACE_is_enabled()){\
+         type_t type = PJ_type_get_or_null (#cat, PJ_type_get_root());\
+         if (!type){\
+             type=PJ_type_event_new(#cat, PJ_type_get_root());\
+         }\
+         char cont_name[25]; /* stack buffer: avoids the per-iteration heap leak; only used for the lookup below */\
+         snprintf(cont_name, sizeof cont_name, "rank-%d", smpi_process_index());\
+         val_t value = PJ_value_get_or_new(mpi_coll_##cat##_description[i].name,"1.0 1.0 1.0", type);\
+         new_pajeNewEvent (SIMIX_get_clock(), PJ_container_get(cont_name), type, value);\
+      }\
+      start = SIMIX_get_clock();\
+      ((ret (*) args)\
+          mpi_coll_##cat##_description[i].coll) args2 ;\
+      elapsed = SIMIX_get_clock()-start;\
+      smpi_mpi_reduce((void*)&elapsed,(void*)&max_elapsed, 1, MPI_DOUBLE, MPI_MAX, 0,comm );\
+      if(elapsed<time_min){\
+          min_coll=i;\
+          time_min=elapsed;\
+      }\
+      if(rank==0 && max_elapsed<max_min){ /* the reduced maximum is only read on the reduce root */\
+          max_min=max_elapsed;\
+          global_coll=i;\
+      }\
+  }\
+  /* Nothing could be selected: bail out before indexing the description table with -1. */\
+  if(min_coll==-1 || (rank==0 && global_coll==-1)) return MPI_ERR_INTERN;\
+  if(rank==0){\
+      XBT_WARN("For rank 0, the quickest was %s : %lf , but global was %s : %lf at max",mpi_coll_##cat##_description[min_coll].name, time_min,mpi_coll_##cat##_description[global_coll].name, max_min);\
+  }else{\
+      XBT_WARN("The quickest " #cat " was %s on rank %d and took %lf",mpi_coll_##cat##_description[min_coll].name, rank, time_min);\
+  }\
+  return MPI_SUCCESS;\
+}\
+
+/* One "automatic" selector per collective: each line passes AUTOMATIC_COLL_BENCH, the collective's signature macro, and the argument list forwarded to every benchmarked algorithm. */
+COLL_APPLY(AUTOMATIC_COLL_BENCH, COLL_ALLGATHERV_SIG, (send_buff, send_count, send_type, recv_buff, recv_count, recv_disps, recv_type, comm));
+COLL_APPLY(AUTOMATIC_COLL_BENCH, COLL_ALLREDUCE_SIG, (sbuf, rbuf, rcount, dtype, op, comm));
+COLL_APPLY(AUTOMATIC_COLL_BENCH, COLL_GATHER_SIG, (send_buff, send_count, send_type, recv_buff, recv_count, recv_type, root, comm));
+COLL_APPLY(AUTOMATIC_COLL_BENCH, COLL_ALLGATHER_SIG, (send_buff,send_count,send_type,recv_buff,recv_count,recv_type,comm));
+COLL_APPLY(AUTOMATIC_COLL_BENCH, COLL_ALLTOALL_SIG,(send_buff, send_count, send_type, recv_buff, recv_count, recv_type,comm));
+COLL_APPLY(AUTOMATIC_COLL_BENCH, COLL_ALLTOALLV_SIG, (send_buff, send_counts, send_disps, send_type, recv_buff, recv_counts, recv_disps, recv_type, comm));
+COLL_APPLY(AUTOMATIC_COLL_BENCH, COLL_BCAST_SIG , (buf, count, datatype, root, comm));
+COLL_APPLY(AUTOMATIC_COLL_BENCH, COLL_REDUCE_SIG,(buf,rbuf, count, datatype, op, root, comm));
+COLL_APPLY(AUTOMATIC_COLL_BENCH, COLL_REDUCE_SCATTER_SIG ,(sbuf,rbuf, rcounts,dtype,op,comm));
+COLL_APPLY(AUTOMATIC_COLL_BENCH, COLL_SCATTER_SIG ,(sendbuf, sendcount, sendtype,recvbuf, recvcount, recvtype,root, comm));
+COLL_APPLY(AUTOMATIC_COLL_BENCH, COLL_BARRIER_SIG,(comm));