+#include "xbt/dict.h"
+#include "xbt/sysdep.h"
+#include "xbt/ex.h"
+#include "xbt/hash.h"
+#include "surf/surf.h"
+#include "simgrid/sg_config.h"
+
+#ifndef WIN32
+#include <sys/mman.h>
+#endif
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <math.h> // sqrt
+#include <unistd.h>
+#include <string.h>
+#include <stdio.h>
+
+XBT_LOG_NEW_DEFAULT_SUBCATEGORY(smpi_bench, smpi, /* log category used by the XBT_DEBUG/XBT_ERROR/XBT_CRITICAL calls of this file */
+ "Logging specific to SMPI (benchmarking)");
+
+/* Shared allocations are handled through shared memory segments.
+ * Associated data and metadata are used as follows:
+ *
+ *                                                                    mmap #1
+ *    `allocs' dict                                                     ---- -.
+ *    ----------      shared_data_t               shared_metadata_t   / |  |  |
+ * .->| <name> | ---> -------------------- <--.   -----------------   | |  |  |
+ * |  ----------      | fd of <name>     |    |   | size of mmap  | --| |  |  |
+ * |                  | count (2)        |    |-- | data          |   \ |  |  |
+ * `----------------- | <name>           |    |   -----------------     ----  |
+ *                    --------------------    |   ^                           |
+ *                                            |   |                           |
+ *                                            |   |   `allocs_metadata' dict  |
+ *                                            |   |   ----------------------  |
+ *                                            |   `-- | <addr of mmap #1>  |<-'
+ *                                            |   .-- | <addr of mmap #2>  |<-.
+ *                                            |   |   ----------------------  |
+ *                                            |   |                           |
+ *                                            |   |                           |
+ *                                            |   |                           |
+ *                                            |   |                   mmap #2 |
+ *                                            |   v                     ---- -'
+ *                                            |   shared_metadata_t   / |  |
+ *                                            |   -----------------   | |  |
+ *                                            |   | size of mmap  | --| |  |
+ *                                            `-- | data          |   | |  |
+ *                                                -----------------   | |  |
+ *                                                                    \ |  |
+ *                                                                      ----
+ */
+
+#define PTR_STRLEN (2 + 2 * sizeof(void*) + 1) /* size of the buffer receiving a "%p"-formatted pointer in shm_map(); presumably "0x" + two hex digits per byte + NUL */
+
+xbt_dict_t allocs = NULL; /* Allocated on first use */
+xbt_dict_t allocs_metadata = NULL; /* Allocated on first use; shm_map() stores one shared_metadata_t per mapping, keyed by the "%p" text of its address */
+xbt_dict_t samples = NULL; /* Allocated on first use; smpi_sample_1() stores one local_data_t per sample location string */
+xbt_dict_t calls = NULL; /* Allocated on first use */
+__thread int smpi_current_rank = 0; /* Updated after each MPI call (assigned in smpi_bench_begin()) */
+
+double smpi_cpu_threshold; /* smpi_execute() ignores durations that are not >= this value */
+double smpi_running_power; /* smpi_execute() multiplies a duration by this to obtain flops */
+
+typedef struct { /* Bookkeeping of one shared segment; per the diagram above, the value type of the `allocs' dict */
+ int fd; /* file descriptor of the segment ("fd of <name>" in the diagram) */
+ int count; /* presumably a reference count -- not touched in the visible code, TODO confirm */
+ char* loc; /* presumably the <name> key of this entry in `allocs' -- TODO confirm */
+} shared_data_t;
+
+typedef struct { /* Record created by shm_map() for each mapping and stored in `allocs_metadata' */
+ size_t size; /* size that was passed to mmap() for this mapping */
+ shared_data_t* data; /* shared_data_t handed to shm_map() for this mapping */
+} shared_metadata_t;
+
+/* Returns the current size, in bytes, of the file behind descriptor `fd'.
+ * Aborts through xbt_die() when fstat() fails. */
+static size_t shm_size(int fd) {
+  struct stat info;
+  const int rc = fstat(fd, &info);
+
+  if(rc < 0) {
+    xbt_die("Could not stat fd %d: %s", fd, strerror(errno));
+  }
+  return (size_t)info.st_size;
+}
+
+#ifndef WIN32
+/* Maps `size' bytes of `fd' as a shared read/write region and returns its
+ * address. The file is first grown to `size' when it is smaller. The mapping
+ * is recorded in `allocs_metadata' (created on first use) under the "%p"
+ * text of its address, together with `size' and `data'.
+ * Aborts through xbt_die() when ftruncate() or mmap() fails. */
+static void* shm_map(int fd, size_t size, shared_data_t* data) {
+  char key[PTR_STRLEN];
+  shared_metadata_t* record;
+  void* addr;
+
+  /* Only ever enlarge the backing file, never shrink it */
+  if(shm_size(fd) < size && ftruncate(fd, (off_t)size) < 0) {
+    xbt_die("Could not truncate fd %d to %zu: %s", fd, size, strerror(errno));
+  }
+
+  addr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+  if(addr == MAP_FAILED) {
+    xbt_die("Could not map fd %d: %s", fd, strerror(errno));
+  }
+
+  if(allocs_metadata == NULL) {
+    allocs_metadata = xbt_dict_new_homogeneous(xbt_free);
+  }
+
+  record = xbt_new(shared_metadata_t, 1);
+  record->size = size;
+  record->data = data;
+
+  snprintf(key, PTR_STRLEN, "%p", addr);
+  xbt_dict_set(allocs_metadata, key, record, NULL);
+  XBT_DEBUG("MMAP %zu to %p", size, addr);
+  return addr;
+}
+#endif
+
+/* Releases the dictionaries owned by this file, in the order
+ * allocs, allocs_metadata, samples, calls. */
+void smpi_bench_destroy(void)
+{
+  xbt_dict_t *const owned[] = { &allocs, &allocs_metadata, &samples, &calls };
+  size_t i;
+
+  for (i = 0; i < sizeof owned / sizeof owned[0]; i++) {
+    xbt_dict_free(owned[i]);
+  }
+}
+
+/* By-reference variant of smpi_execute_flops(): reads the amount through
+ * `flops' and forwards it. (The trailing underscore suggests a Fortran
+ * binding -- assumption, not shown here.) */
+XBT_PUBLIC(void) smpi_execute_flops_(double *flops);
+void smpi_execute_flops_(double *flops)
+{
+  const double amount = *flops;
+
+  smpi_execute_flops(amount);
+}
+
+/* By-reference variant of smpi_execute(): reads the duration through
+ * `duration' and forwards it. (The trailing underscore suggests a Fortran
+ * binding -- assumption, not shown here.) */
+XBT_PUBLIC(void) smpi_execute_(double *duration);
+void smpi_execute_(double *duration)
+{
+  const double elapsed = *duration;
+
+  smpi_execute(elapsed);
+}
+
+/* Runs a "computation" execution of `flops' on the calling process' host and
+ * blocks until it is over. With tracing enabled, the execution is tagged with
+ * the internal SMPI category. */
+void smpi_execute_flops(double flops) {
+  smx_host_t self = SIMIX_host_self();
+  smx_action_t exec;
+
+  XBT_DEBUG("Handle real computation time: %f flops", flops);
+  exec = simcall_host_execute("computation", self, flops, 1, 0, 0);
+#ifdef HAVE_TRACING
+  simcall_set_category (exec, TRACE_internal_smpi_get_category());
+#endif
+  simcall_host_execution_wait(exec);
+}
+
+/* Injects `duration' of measured computation into the simulation.
+ * A duration that is not >= smpi_cpu_threshold is only logged. Otherwise it
+ * is converted to flops with smpi_running_power and handed to
+ * smpi_execute_flops(), bracketed by computing-in/out trace events when
+ * tracing is compiled in. */
+void smpi_execute(double duration)
+{
+  double flops;
+#ifdef HAVE_TRACING
+  int rank;
+  instr_extra_data extra;
+#endif
+
+  /* Negated ">=" rather than "<": every value failing the ">=" test is skipped */
+  if (!(duration >= smpi_cpu_threshold)) {
+    XBT_DEBUG("Real computation took %g while option smpi/cpu_threshold is set to %g => ignore it",
+              duration, smpi_cpu_threshold);
+    return;
+  }
+
+  XBT_DEBUG("Sleep for %g to handle real computation time", duration);
+  flops = duration * smpi_running_power;
+#ifdef HAVE_TRACING
+  rank = smpi_process_index();
+  extra = xbt_new0(s_instr_extra_data_t,1);
+  extra->type=TRACING_COMPUTING;
+  extra->comp_size=flops;
+  TRACE_smpi_computing_in(rank, extra);
+#endif
+  smpi_execute_flops(flops);
+#ifdef HAVE_TRACING
+  TRACE_smpi_computing_out(rank);
+#endif
+}
+
+/* Starts the calling process' benchmark timer, then refreshes
+ * smpi_current_rank from smpi_process_index(). */
+void smpi_bench_begin(void)
+{
+  xbt_os_timer_t bench_timer = smpi_process_timer();
+
+  xbt_os_threadtimer_start(bench_timer);
+  smpi_current_rank = smpi_process_index();
+}
+
+/* Stops the calling process' benchmark timer and injects the elapsed time
+ * into the simulation through smpi_execute(). Aborts when the process is
+ * currently flagged as sampling, since benchmarks cannot be nested. */
+void smpi_bench_end(void)
+{
+  xbt_os_timer_t bench_timer = smpi_process_timer();
+  double elapsed;
+
+  xbt_os_threadtimer_stop(bench_timer);
+  if (smpi_process_get_sampling()) {
+    XBT_CRITICAL("Cannot do recursive benchmarks.");
+    XBT_CRITICAL("Are you trying to make a call to MPI within a SMPI_SAMPLE_ block?");
+    xbt_backtrace_display_current();
+    xbt_die("Aborting.");
+  }
+
+  elapsed = xbt_os_timer_elapsed(bench_timer);
+  smpi_execute(elapsed);
+}
+
+/* Simulated counterpart of sleep(): keeps the calling process busy for
+ * `secs' simulated seconds.
+ *
+ * Benchmarking is suspended around the call so that the pause itself is not
+ * measured as user computation. The pause is modelled as an execution of
+ * secs * <speed of the current host> flops, delegated to
+ * smpi_execute_flops() rather than duplicating its body.
+ *
+ * Returns the number of seconds left to sleep, following the sleep()
+ * convention. The execution is waited for without inspecting its outcome,
+ * so the sleep is treated as complete and the result is always 0
+ * (returning `secs' would tell callers that nothing was slept). */
+unsigned int smpi_sleep(unsigned int secs)
+{
+  double flops;
+
+  smpi_bench_end();
+
+  flops = (double) secs*simcall_host_get_speed(SIMIX_host_self());
+  XBT_DEBUG("Sleep for: %f flops", flops);
+  smpi_execute_flops(flops);
+
+  smpi_bench_begin();
+  return 0;
+}
+
+/* Simulated counterpart of gettimeofday(): fills `tv' (when non-NULL) from
+ * the simulation clock, split into whole seconds and microseconds.
+ * Benchmarking is suspended while the clock is read. Always returns 0. */
+int smpi_gettimeofday(struct timeval *tv)
+{
+  double sim_now;
+
+  smpi_bench_end();
+  sim_now = SIMIX_get_clock();
+  if (tv != NULL) {
+    tv->tv_sec = (time_t)sim_now;
+    /* Fractional part of the clock, scaled to microseconds */
+#ifdef WIN32
+    tv->tv_usec = (useconds_t)((sim_now - tv->tv_sec) * 1e6);
+#else
+    tv->tv_usec = (suseconds_t)((sim_now - tv->tv_sec) * 1e6);
+#endif
+  }
+  smpi_bench_begin();
+  return 0;
+}
+
+extern double sg_maxmin_precision;
+/* Returns 1/sg_maxmin_precision truncated to an integer, i.e. the number of
+ * timestamp ticks per simulated second used by smpi_rastro_timestamp().
+ * Benchmarking is suspended while the value is computed. */
+unsigned long long smpi_rastro_resolution (void)
+{
+  unsigned long long ticks;
+
+  smpi_bench_end();
+  ticks = (unsigned long long)(1/sg_maxmin_precision);
+  smpi_bench_begin();
+  return ticks;
+}
+
+/* Returns the simulation clock as an integer tick count:
+ * whole seconds * resolution + fractional part * resolution, where
+ * resolution is 1/sg_maxmin_precision truncated to an integer (the value
+ * smpi_rastro_resolution() reports).
+ *
+ * The resolution is computed inline instead of calling
+ * smpi_rastro_resolution(): that function does its own
+ * smpi_bench_end()/smpi_bench_begin() pair. Invoking it while benchmarking is
+ * already stopped here would stop the timer a second time, inject the
+ * already-accounted computation time into the simulation again, and restart
+ * the timer before this function is done. */
+unsigned long long smpi_rastro_timestamp (void)
+{
+  extern double sg_maxmin_precision;
+  unsigned long long resolution;
+  unsigned long long sec;
+  unsigned long long pre;
+  double now;
+
+  smpi_bench_end();
+  now = SIMIX_get_clock();
+  resolution = (unsigned long long)(1/sg_maxmin_precision);
+
+  sec = (unsigned long long)now;
+  pre = (now - sec) * resolution;
+  smpi_bench_begin();
+  return sec * resolution + pre;
+}
+
+/* ****************************** Functions related to the SMPI_SAMPLE_ macros ************************************/
+typedef struct { /* benchmarking state of one sample location, stored in the `samples' dict */
+ double threshold; /* maximal stderr requested (if positive); ignored by sample_enough_benchs() otherwise */
+ double relstderr; /* observed stderr so far; compared against threshold in sample_enough_benchs() */
+ double mean; /* mean of benched times, to be used if the block is disabled */
+ double sum; /* sum of benched times (to compute the mean and stderr) */
+ double sum_pow2; /* sum of the square of the benched times (to compute the stderr) */
+ int iters; /* amount of requested iterations */
+ int count; /* amount of iterations done so far */
+ int benching; /* 1: we are benchmarking; 0: we have enough data, no bench anymore */
+} local_data_t;
+
+/* Builds the key identifying a sample site in the `samples' dict:
+ * "file:line" when `global' is set, "file:line:<process index>" otherwise.
+ * The string comes from bprintf(); callers in this file release it with
+ * xbt_free(). */
+static char *sample_location(int global, const char *file, int line) {
+  return global ? bprintf("%s:%d", file, line)
+                : bprintf("%s:%d:%d", file, line, smpi_process_index());
+}
+/* Tells whether `data' holds enough measurements to stop benchmarking.
+ * The requested iteration count must be reached. When a positive threshold
+ * is set, at least two measurements are also required and the observed
+ * stderr must not exceed the threshold. Returns 1 when enough, 0 otherwise. */
+static int sample_enough_benchs(local_data_t *data) {
+  int enough = (data->iters <= data->count);
+
+  if (data->threshold > 0.0
+      && (data->count < 2 || data->relstderr > data->threshold)) {
+    enough = 0; // not enough data, or stderr too high yet
+  }
+  XBT_DEBUG("%s (count:%d iter:%d stderr:%f thres:%f mean:%fs)",
+            (enough?"enough benchs":"need more data"),
+            data->count, data->iters, data->relstderr, data->threshold, data->mean);
+  return enough;
+}
+
+/* Entry step of the SMPI_SAMPLE_ machinery for the block at file:line.
+ *
+ * global:    non-zero to share the measurements between all processes,
+ *            zero to keep them per process (see sample_location()).
+ * iters:     requested amount of benchmark iterations.
+ * threshold: maximal stderr requested; used only if positive.
+ * At least one of iters/threshold must be positive on the first visit.
+ *
+ * The computation preceding the block is accounted for, the process is
+ * flagged as sampling, and the per-location record is either created (first
+ * visit) or re-evaluated to decide whether more benchmarking is needed.
+ * Re-entering the same location with different settings is reported and
+ * raises THROW_IMPOSSIBLE. */
+void smpi_sample_1(int global, const char *file, int line, int iters, double threshold)
+{
+  char *loc = sample_location(global, file, line);
+  local_data_t *data;
+
+  smpi_bench_end(); /* Take time from previous, unrelated computation into account */
+  smpi_process_set_sampling(1);
+
+  if (!samples)
+    samples = xbt_dict_new_homogeneous(free);
+
+  data = xbt_dict_get_or_null(samples, loc);
+  if (!data) {
+    xbt_assert(threshold>0 || iters>0,
+        "You should provide either a positive amount of iterations to bench, or a positive maximal stderr (or both)");
+    data = xbt_new(local_data_t, 1);
+    data->count = 0;
+    data->sum = 0.0;
+    data->sum_pow2 = 0.0;
+    /* xbt_new() does not zero the record and sample_enough_benchs() reads
+     * relstderr on the next visit: give it a defined value */
+    data->relstderr = 0.0;
+    data->iters = iters;
+    data->threshold = threshold;
+    data->benching = 1; // If we have no data, we need at least one
+    data->mean = 0;
+    xbt_dict_set(samples, loc, data, NULL);
+    XBT_DEBUG("XXXXX First time ever on benched nest %s.",loc);
+  } else {
+    if (data->iters != iters || data->threshold != threshold) {
+      XBT_ERROR("Asked to bench block %s with different settings %d, %f is not %d, %f. How did you manage to give two numbers at the same line??",
+          loc, data->iters, data->threshold, iters,threshold);
+      xbt_free(loc); /* do not leak the key when bailing out */
+      THROW_IMPOSSIBLE;
+    }
+
+    // if we already have some data, check whether sample_2 should get one more bench or whether it should emulate the computation instead
+    data->benching = !sample_enough_benchs(data);
+    XBT_DEBUG("XXXX Re-entering the benched nest %s. %s",loc, (data->benching?"more benching needed":"we have enough data, skip computes"));
+  }
+  xbt_free(loc);
+}
+
+int smpi_sample_2(int global, const char *file, int line)
+{
+ char *loc = sample_location(global, file, line);
+ local_data_t *data;
+ int res;
+
+ xbt_assert(samples, "Y U NO use SMPI_SAMPLE_* macros? Stop messing directly with smpi_sample_* functions!");
+ data = xbt_dict_get(samples, loc);
+ XBT_DEBUG("sample2 %s",loc);
+ xbt_free(loc);