1 /* Copyright (c) 2007, 2009-2015. The SimGrid Team.
2 * All rights reserved. */
4 /* This program is free software; you can redistribute it and/or modify it
5 * under the terms of the license (GNU LGPL) which comes with this package. */
9 #include <unordered_map>
12 #include "src/internal_config.h"
15 #include "xbt/sysdep.h"
17 #include "surf/surf.h"
18 #include "simgrid/sg_config.h"
19 #include "simgrid/modelchecker.h"
20 #include "src/mc/mc_replay.h"
26 #include <sys/types.h>
29 #include <math.h> // sqrt
35 #define MAP_ANONYMOUS MAP_ANON
// Dedicated logging channel "smpi_bench" under the "smpi" category.
38 XBT_LOG_NEW_DEFAULT_SUBCATEGORY(smpi_bench, smpi,
39 "Logging specific to SMPI (benchmarking)");
/* Shared allocations are handled through shared memory segments.
 * Associated data and metadata are used as follows:
 *
 * - The `allocs' dict maps each allocation site ("<file>:<line>") to a
 *   shared_data_t record holding the file descriptor of the shared-memory
 *   segment backing that site and a reference count of the mappings using it.
 *
 * - The `allocs_metadata' dict maps each mmap'ed address back to a
 *   shared_metadata_t record holding the size of the mapping and a pointer
 *   to the corresponding `allocs' entry, so that smpi_shared_free() can
 *   decrement the reference count and drop the `allocs' entry once the last
 *   user of the segment is gone.
 */
// Length of the textual form of a pointer: "0x" + 2 hex digits per byte + NUL.
72 #define PTR_STRLEN (2 + 2 * sizeof(void*) + 1)
// Lazily-created dicts backing the SMPI_SAMPLE_* and SMPI_SHARED_*_CALL machinery.
74 xbt_dict_t samples = NULL; /* Allocated on first use */
75 xbt_dict_t calls = NULL; /* Allocated on first use */
// Tuning knobs, presumably filled from the smpi/* configuration options -- confirm against smpi_global.c.
77 double smpi_cpu_threshold;
78 double smpi_running_power;
// State for global-variable privatization (per-process data+bss segment swapping).
80 int smpi_loaded_page = -1; // index of the currently mapped process segment; -1 = none mapped yet
81 char* smpi_start_data_exe = NULL; // start of the executable's data segment
82 int smpi_size_data_exe = 0; // size of that segment; 0 disables segment switching
83 int smpi_privatize_global_variables;
84 double smpi_total_benched_time = 0; // cumulated host time spent in benchmarked sections
85 smpi_privatisation_region_t smpi_privatisation_regions; // one region per simulated process
/** Some location in the source code
 *
 * This information is used by SMPI_SHARED_MALLOC to allocate
 * some shared memory for all simulated processes.
 */
class smpi_source_location {
public:
  smpi_source_location(const char* filename, int line)
      : filename(filename), filename_length(strlen(filename)), line(line) {}

  /** Pointer to a static string containing the file name */
  const char* filename = nullptr;
  int filename_length  = 0;
  int line             = 0;

  /** Two locations are equal when both the file name and the line match. */
  bool operator==(smpi_source_location const& that) const
  {
    return filename_length == that.filename_length && line == that.line
        && std::memcmp(filename, that.filename, filename_length) == 0;
  }
  bool operator!=(smpi_source_location const& that) const
  {
    return !(*this == that);
  }
};
121 class hash<smpi_source_location> {
123 typedef smpi_source_location argument_type;
124 typedef std::size_t result_type;
125 result_type operator()(smpi_source_location const& loc) const
127 return xbt_str_hash_ext(loc.filename, loc.filename_length)
128 ^ xbt_str_hash_ext((const char*) &loc.line, sizeof(loc.line));
// Map from allocation site to its shared-segment bookkeeping (fd + refcount).
141 std::unordered_map<smpi_source_location, shared_data_t> allocs;
142 typedef std::unordered_map<smpi_source_location, shared_data_t>::value_type shared_data_key_type;
// Back-pointer from a mapping to its `allocs' entry (field of shared_metadata_t,
// whose surrounding struct definition is above -- not fully visible here).
146 shared_data_key_type* data;
// Map from mmap'ed address to its metadata, consulted by smpi_shared_free().
149 std::unordered_map<void*, shared_metadata_t> allocs_metadata;
153 static size_t shm_size(int fd) {
156 if(fstat(fd, &st) < 0) {
157 xbt_die("Could not stat fd %d: %s", fd, strerror(errno));
159 return (size_t)st.st_size;
163 static void* shm_map(int fd, size_t size, shared_data_key_type* data) {
165 char loc[PTR_STRLEN];
166 shared_metadata_t meta;
168 if(size > shm_size(fd)) {
169 if(ftruncate(fd, (off_t)size) < 0) {
170 xbt_die("Could not truncate fd %d to %zu: %s", fd, size, strerror(errno));
174 mem = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
175 if(mem == MAP_FAILED) {
176 xbt_die("Could not map fd %d: %s", fd, strerror(errno));
178 snprintf(loc, PTR_STRLEN, "%p", mem);
181 allocs_metadata[mem] = meta;
182 XBT_DEBUG("MMAP %zu to %p", size, mem);
187 void smpi_bench_destroy(void)
190 allocs_metadata.clear();
191 xbt_dict_free(&samples);
192 xbt_dict_free(&calls);
195 extern "C" XBT_PUBLIC(void) smpi_execute_flops_(double *flops);
196 void smpi_execute_flops_(double *flops)
198 smpi_execute_flops(*flops);
201 extern "C" XBT_PUBLIC(void) smpi_execute_(double *duration);
202 void smpi_execute_(double *duration)
204 smpi_execute(*duration);
207 void smpi_execute_flops(double flops) {
208 smx_synchro_t action;
209 XBT_DEBUG("Handle real computation time: %f flops", flops);
210 action = simcall_execution_start("computation", flops, 1, 0, 0);
211 simcall_set_category (action, TRACE_internal_smpi_get_category());
212 simcall_execution_wait(action);
213 smpi_switch_data_segment(smpi_process_index());
216 void smpi_execute(double duration)
218 if (duration >= smpi_cpu_threshold) {
219 XBT_DEBUG("Sleep for %g to handle real computation time", duration);
220 double flops = duration * smpi_running_power;
221 int rank = smpi_process_index();
222 instr_extra_data extra = xbt_new0(s_instr_extra_data_t,1);
223 extra->type=TRACING_COMPUTING;
224 extra->comp_size=flops;
225 TRACE_smpi_computing_in(rank, extra);
226 smpi_execute_flops(flops);
228 TRACE_smpi_computing_out(rank);
231 XBT_DEBUG("Real computation took %g while option smpi/cpu_threshold is set to %g => ignore it",
232 duration, smpi_cpu_threshold);
236 void smpi_switch_data_segment(int dest);
238 void smpi_bench_begin(void)
240 if (smpi_privatize_global_variables) {
241 smpi_switch_data_segment(smpi_process_index());
244 if (MC_is_active() || MC_record_replay_is_active())
247 xbt_os_threadtimer_start(smpi_process_timer());
250 void smpi_bench_end(void)
253 if (MC_is_active() || MC_record_replay_is_active())
256 xbt_os_timer_t timer = smpi_process_timer();
257 xbt_os_threadtimer_stop(timer);
258 // smpi_switch_data_segment(smpi_process_count());
259 if (smpi_process_get_sampling()) {
260 XBT_CRITICAL("Cannot do recursive benchmarks.");
261 XBT_CRITICAL("Are you trying to make a call to MPI within a SMPI_SAMPLE_ block?");
262 xbt_backtrace_display_current();
263 xbt_die("Aborting.");
265 // Simulate the benchmarked computation unless disabled via command-line argument
266 if (sg_cfg_get_boolean("smpi/simulate_computation")) {
267 smpi_execute(xbt_os_timer_elapsed(timer));
270 smpi_total_benched_time += xbt_os_timer_elapsed(timer);
273 /* Private sleep function used by smpi_sleep() and smpi_usleep() */
274 static unsigned int private_sleep(double secs)
278 XBT_DEBUG("Sleep for: %lf secs", secs);
279 int rank = smpi_comm_rank(MPI_COMM_WORLD);
280 instr_extra_data extra = xbt_new0(s_instr_extra_data_t,1);
281 extra->type=TRACING_SLEEPING;
282 extra->sleep_duration=secs;
283 TRACE_smpi_sleeping_in(rank, extra);
285 simcall_process_sleep(secs);
287 TRACE_smpi_sleeping_out(rank);
293 unsigned int smpi_sleep(unsigned int secs)
295 return private_sleep((double)secs);
298 int smpi_usleep(useconds_t usecs)
300 return (int)private_sleep((double)usecs / 1000000.0);
304 int smpi_gettimeofday(struct timeval *tv, void* tz)
308 now = SIMIX_get_clock();
310 tv->tv_sec = (time_t)now;
312 tv->tv_usec = (useconds_t)((now - tv->tv_sec) * 1e6);
314 tv->tv_usec = (suseconds_t)((now - tv->tv_sec) * 1e6);
321 extern double sg_surf_precision;
322 unsigned long long smpi_rastro_resolution (void)
325 double resolution = (1/sg_surf_precision);
327 return (unsigned long long)resolution;
330 unsigned long long smpi_rastro_timestamp (void)
333 double now = SIMIX_get_clock();
335 unsigned long long sec = (unsigned long long)now;
336 unsigned long long pre = (now - sec) * smpi_rastro_resolution();
338 return (unsigned long long)sec * smpi_rastro_resolution() + pre;
/* ****************************** Functions related to the SMPI_SAMPLE_ macros ************************************/

/** Per-sample-site statistics used to decide when enough measurements were taken. */
typedef struct {
  double threshold; /* maximal stderr requested (if positive) */
  double relstderr; /* observed stderr so far */
  double mean;      /* mean of benched times, to be used if the block is disabled */
  double sum;       /* sum of benched times (to compute the mean and stderr) */
  double sum_pow2;  /* sum of the square of the benched times (to compute the stderr) */
  int iters;        /* amount of requested iterations */
  int count;        /* amount of iterations done so far */
  int benching;     /* 1: we are benchmarking; 0: we have enough data, no bench anymore */
} local_data_t;
353 static char *sample_location(int global, const char *file, int line) {
355 return bprintf("%s:%d", file, line);
357 return bprintf("%s:%d:%d", file, line, smpi_process_index());
360 static int sample_enough_benchs(local_data_t *data) {
361 int res = data->count >= data->iters;
362 if (data->threshold>0.0) {
364 res = 0; // not enough data
365 if (data->relstderr > data->threshold)
366 res = 0; // stderr too high yet
368 XBT_DEBUG("%s (count:%d iter:%d stderr:%f thres:%f mean:%fs)",
369 (res?"enough benchs":"need more data"),
370 data->count, data->iters, data->relstderr, data->threshold, data->mean);
// Entry point of the SMPI_SAMPLE_* macros, called before the sampled block.
// Looks up (or creates) the bookkeeping of this sample site and decides
// whether the next execution must actually be benchmarked.
374 void smpi_sample_1(int global, const char *file, int line, int iters, double threshold)
376 char *loc = sample_location(global, file, line);
// Stop the host timer: time spent before this call belongs to the caller.
379 smpi_bench_end(); /* Take time from previous, unrelated computation into account */
380 smpi_process_set_sampling(1);
// Lazily create the dict of per-site sample data.
383 samples = xbt_dict_new_homogeneous(free);
385 data = static_cast<local_data_t *>(xbt_dict_get_or_null(samples, loc));
// First visit of this site: allocate and initialize its bookkeeping.
387 xbt_assert(threshold>0 || iters>0,
388 "You should provide either a positive amount of iterations to bench, or a positive maximal stderr (or both)");
389 data = (local_data_t *) xbt_new(local_data_t, 1);
392 data->sum_pow2 = 0.0;
394 data->threshold = threshold;
395 data->benching = 1; // If we have no data, we need at least one
397 xbt_dict_set(samples, loc, data, NULL);
398 XBT_DEBUG("XXXXX First time ever on benched nest %s.",loc);
// Subsequent visits: the settings must not change from one call to the next.
400 if (data->iters != iters || data->threshold != threshold) {
401 XBT_ERROR("Asked to bench block %s with different settings %d, %f is not %d, %f. How did you manage to give two numbers at the same line??",
402 loc, data->iters, data->threshold, iters,threshold);
406 // if we already have some data, check whether sample_2 should get one more bench or whether it should emulate the computation instead
407 data->benching = !sample_enough_benchs(data);
408 XBT_DEBUG("XXXX Re-entering the benched nest %s. %s",loc, (data->benching?"more benching needed":"we have enough data, skip computes"));
// Loop condition used by the SMPI_SAMPLE_* macros: returns whether the sampled
// block must really be executed (benchmarking) or be replaced by an injection
// of its recorded mean duration into the simulator.
413 int smpi_sample_2(int global, const char *file, int line)
415 char *loc = sample_location(global, file, line);
419 xbt_assert(samples, "Y U NO use SMPI_SAMPLE_* macros? Stop messing directly with smpi_sample_* functions!");
420 data = static_cast<local_data_t *>(xbt_dict_get(samples, loc));
421 XBT_DEBUG("sample2 %s",loc);
// smpi_sample_1() decided that one more real measurement is needed.
424 if (data->benching==1) {
425 // we need to run a new bench
426 XBT_DEBUG("benchmarking: count:%d iter:%d stderr:%f thres:%f; mean:%f",
427 data->count, data->iters, data->relstderr, data->threshold, data->mean);
430 // Enough data, no more bench (either we got enough data from previous visits to this benched nest, or we just ran one bench and need to bail out now that our job is done).
431 // Just sleep instead
432 XBT_DEBUG("No benchmark (either no need, or just ran one): count >= iter (%d >= %d) or stderr<thres (%f<=%f). apply the %fs delay instead",
433 data->count, data->iters, data->relstderr, data->threshold, data->mean);
// Inject the recorded mean duration and leave sampling mode.
434 smpi_execute(data->mean);
435 smpi_process_set_sampling(0);
436 res = 0; // prepare to capture future, unrelated computations
// Called after each execution of the sampled block: fold the measured duration
// into the running mean / relative-stderr statistics of this sample site.
443 void smpi_sample_3(int global, const char *file, int line)
445 char *loc = sample_location(global, file, line);
448 xbt_assert(samples, "Y U NO use SMPI_SAMPLE_* macros? Stop messing directly with smpi_sample_* functions!");
449 data = static_cast<local_data_t *>(xbt_dict_get(samples, loc));
450 XBT_DEBUG("sample3 %s",loc);
// Nothing to record when this iteration was not actually benchmarked.
453 if (data->benching==0) {
457 // ok, benchmarking this loop is over
458 xbt_os_threadtimer_stop(smpi_process_timer());
// Update the running statistics with this new sample.
463 sample = xbt_os_timer_elapsed(smpi_process_timer());
465 data->sum_pow2 += sample * sample;
466 n = (double)data->count;
467 data->mean = data->sum / n;
// Relative stderr of the mean: sqrt(variance / n) / mean.
468 data->relstderr = sqrt((data->sum_pow2 / n - data->mean * data->mean) / n) / data->mean;
469 if (!sample_enough_benchs(data)) {
470 data->mean = sample; // Still in benching process; We want sample_2 to simulate the exact time of this loop occurrence before leaving, not the mean over the history
472 XBT_DEBUG("Average mean after %d steps is %f, relative standard error is %f (sample was %f)", data->count,
473 data->mean, data->relstderr, sample);
475 // That's enough for now, prevent sample_2 to run the same code over and over
// Allocate `size' bytes. When the smpi/use_shared_malloc option is set, all
// allocations issued from the same source location (file:line) share a single
// shared-memory segment across simulated processes; otherwise fall back to a
// classical malloc. The (file, line) pair identifies the allocation site.
481 void *smpi_shared_malloc(size_t size, const char *file, int line)
484 if (sg_cfg_get_boolean("smpi/use_shared_malloc")){
486 smpi_source_location loc(file, line);
487 auto res = allocs.insert(std::make_pair(loc, shared_data_t()));
488 auto data = res.first;
// NOTE(review): this branch creates the backing segment, which suggests it runs
// when the insertion DID take place (first use of this site); the original
// comment below says the opposite -- confirm against the full source.
490 // The insertion did not take place.
491 // Generate a shared memory name from the address of the shared_data:
492 char shmname[32]; // cannot be longer than PSHMNAMLEN = 31 on Mac OS X (shm_open raises ENAMETOOLONG otherwise)
493 snprintf(shmname, 31, "/shmalloc%p", &*data);
494 fd = shm_open(shmname, O_RDWR | O_CREAT | O_EXCL,
495 S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
// A leftover segment from a previous (crashed?) run must be removed by hand.
499 xbt_die("Please cleanup /dev/shm/%s", shmname);
501 xbt_die("An unhandled error occured while opening %s. shm_open: %s", shmname, strerror(errno));
504 data->second.fd = fd;
505 data->second.count = 1;
506 mem = shm_map(fd, size, &*data);
// Early unlink: the name disappears from /dev/shm but the segment stays alive
// as long as it is mapped somewhere.
507 if (shm_unlink(shmname) < 0) {
508 XBT_WARN("Could not early unlink %s. shm_unlink: %s", shmname, strerror(errno));
510 XBT_DEBUG("Mapping %s at %p through %d", shmname, mem, fd);
// Site already known: remap the existing segment and bump its refcount.
512 mem = shm_map(data->second.fd, size, &*data);
513 data->second.count++;
515 XBT_DEBUG("Shared malloc %zu in %p (metadata at %p)", size, mem, &*data);
// Shared allocations disabled: plain malloc.
517 mem = xbt_malloc(size);
518 XBT_DEBUG("Classic malloc %zu in %p", size, mem);
// Free a block returned by smpi_shared_malloc(): unmap this view of the
// segment, decrement the per-site reference count and drop the bookkeeping
// once the last user is gone. Falls back to free() when sharing is disabled.
524 void smpi_shared_free(void *ptr)
526 char loc[PTR_STRLEN];
528 if (sg_cfg_get_boolean("smpi/use_shared_malloc")){
529 snprintf(loc, PTR_STRLEN, "%p", ptr);
// Only addresses recorded by shm_map() can be shared-freed.
530 auto meta = allocs_metadata.find(ptr);
531 if (meta == allocs_metadata.end()) {
532 XBT_WARN("Cannot free: %p was not shared-allocated by SMPI", ptr);
// Unmap this process' view of the segment.
535 shared_data_t* data = &meta->second.data->second;
536 if (munmap(ptr, meta->second.size) < 0) {
537 XBT_WARN("Unmapping of fd %d failed: %s", data->fd, strerror(errno));
540 XBT_DEBUG("Shared free - no removal - of %p, count = %d", ptr, data->count);
// Last user gone: forget the allocation site entirely.
541 if (data->count <= 0) {
543 allocs.erase(allocs.find(meta->second.data->first));
544 XBT_DEBUG("Shared free - with removal - of %p", ptr);
547 XBT_DEBUG("Classic free of %p", ptr);
// Tell whether smpi_shared_set_call() already registered a result for the
// (func, input) pair. The lookup relies on xbt_dict_get() throwing a
// not_found exception when the key is absent (exception handling lines are
// not fully visible here).
553 int smpi_shared_known_call(const char* func, const char* input)
555 char* loc = bprintf("%s:%s", func, input);
// Lazily create the dict of known calls.
560 calls = xbt_dict_new_homogeneous(NULL);
563 xbt_dict_get(calls, loc); /* Succeed or throw */
// Any exception other than not_found is propagated to the caller.
570 if (ex.category != not_found_error)
577 void* smpi_shared_get_call(const char* func, const char* input) {
578 char* loc = bprintf("%s:%s", func, input);
582 calls = xbt_dict_new_homogeneous(NULL);
584 data = xbt_dict_get(calls, loc);
589 void* smpi_shared_set_call(const char* func, const char* input, void* data) {
590 char* loc = bprintf("%s:%s", func, input);
593 calls = xbt_dict_new_homogeneous(NULL);
595 xbt_dict_set(calls, loc, data, NULL);
// Round `addr' down to the start of its memory page.
603 #define TOPAGE(addr) (void *)(((unsigned long)(addr) / xbt_pagesize) * xbt_pagesize)
606 /** Map a given SMPI privatization segment (make a SMPI process active)
608 void smpi_switch_data_segment(int dest){
610 if (smpi_loaded_page==dest)//no need to switch either
614 smpi_really_switch_data_segment(dest);
617 /** Map a given SMPI privatization segment (make a SMPI process active)
618 * even if SMPI thinks it is already active
620 * When doing a state restoration, the state of the restored variables
621 * might not be consistent with the state of the virtual memory.
622 * In this case, we have to change the data segment.
624 void smpi_really_switch_data_segment(int dest) {
// Privatization disabled or nothing to privatize: keep the original segment.
626 if(smpi_size_data_exe == 0)//no need to switch
629 #ifdef HAVE_PRIVATIZATION
// Very first switch: seed every per-process region with the current content
// of the executable's data+bss segment.
631 if(smpi_loaded_page==-1){//initial switch, do the copy from the real page here
632 for (i=0; i< smpi_process_count(); i++){
633 memcpy(smpi_privatisation_regions[i].address,
634 TOPAGE(smpi_start_data_exe), smpi_size_data_exe);
// Overlay the executable's data segment with the destination process' copy.
638 // FIXME, cross-process support (mmap across process when necessary)
639 int current = smpi_privatisation_regions[dest].file_descriptor;
640 XBT_DEBUG("Switching data frame to the one of process %d", dest);
641 void* tmp = mmap (TOPAGE(smpi_start_data_exe), smpi_size_data_exe,
642 PROT_READ | PROT_WRITE, MAP_FIXED | MAP_SHARED, current, 0);
// MAP_FIXED must land exactly on the requested address, or the remap failed.
643 if (tmp != TOPAGE(smpi_start_data_exe))
644 xbt_die("Couldn't map the new region");
645 smpi_loaded_page = dest;
/** Tell whether `file' is one of our privatization backing files in /dev/shm
 *  (i.e. its path starts with the "/dev/shm/my-buffer-" prefix). */
int smpi_is_privatisation_file(char* file)
{
  return strncmp("/dev/shm/my-buffer-", file, 19) == 0;
}
// Create one private copy of the executable's global data segment (data+bss)
// per simulated process, each backed by a /dev/shm temporary file, so that
// smpi_switch_data_segment() can later overlay them with mmap(MAP_FIXED).
654 void smpi_initialize_global_memory_segments(){
656 #ifndef HAVE_PRIVATIZATION
// Privatization was requested on a platform that cannot support it: bail out.
657 smpi_privatize_global_variables=0;
658 xbt_die("You are trying to use privatization on a system that does not support it. Don't.");
// Locate the data+bss segment of the running executable.
663 smpi_get_executable_global_size();
665 XBT_DEBUG ("bss+data segment found : size %d starting at %p",
666 smpi_size_data_exe, smpi_start_data_exe );
// Nothing to privatize: disable the feature entirely.
668 if (smpi_size_data_exe == 0){//no need to switch
669 smpi_privatize_global_variables=0;
// One region descriptor (address + fd) per simulated process.
673 smpi_privatisation_regions = (smpi_privatisation_region_t) malloc(
674 smpi_process_count() * sizeof(struct s_smpi_privatisation_region));
676 for (i=0; i< smpi_process_count(); i++){
677 //create SIMIX_process_count() mappings of this size with the same data inside
678 void *address = NULL;
679 char path[] = "/dev/shm/my-buffer-XXXXXX";
// Backing file for this process' copy of the globals.
682 int file_descriptor= mkstemp (path);
683 if (file_descriptor < 0) {
// EMFILE gets a detailed, actionable message; any other errno a generic one.
685 xbt_die("Impossible to create temporary file for memory mapping: %s\n\
686 The open() system call failed with the EMFILE error code (too many files). \n\n\
687 This means that you reached the system limits concerning the amount of files per process. \
688 This is not a surprise if you are trying to virtualize many processes on top of SMPI. \
689 Don't panic -- you should simply increase your system limits and try again. \n\n\
690 First, check what your limits are:\n\
691 cat /proc/sys/fs/file-max # Gives you the system-wide limit\n\
692 ulimit -Hn # Gives you the per process hard limit\n\
693 ulimit -Sn # Gives you the per process soft limit\n\
694 cat /proc/self/limits # Displays any per-process limitation (including the one given above)\n\n\
695 If one of these values is less than the amount of MPI processes that you try to run, then you got the explanation of this error. \
696 Ask the Internet about tutorials on how to increase the files limit such as: https://rtcamp.com/tutorials/linux/increase-open-files-limit/",
699 xbt_die("Impossible to create temporary file for memory mapping: %s",
// Early unlink: the file vanishes from /dev/shm but stays usable through the fd.
703 status = unlink (path);
705 xbt_die("Impossible to unlink temporary file for memory mapping");
// Size the backing file to hold the whole data+bss segment.
707 status = ftruncate(file_descriptor, smpi_size_data_exe);
709 xbt_die("Impossible to set the size of the temporary file for memory mapping");
711 /* Ask for a free region */
712 address = mmap (NULL, smpi_size_data_exe, PROT_READ | PROT_WRITE, MAP_SHARED, file_descriptor, 0);
713 if (address == MAP_FAILED)
714 xbt_die("Couldn't find a free region for memory mapping");
716 //initialize the values
717 memcpy(address, TOPAGE(smpi_start_data_exe), smpi_size_data_exe);
719 //store the address of the mapping for further switches
720 smpi_privatisation_regions[i].file_descriptor = file_descriptor;
721 smpi_privatisation_regions[i].address = address;
728 void smpi_destroy_global_memory_segments(){
729 if (smpi_size_data_exe == 0)//no need to switch
731 #ifdef HAVE_PRIVATIZATION
733 for (i=0; i< smpi_process_count(); i++){
734 if(munmap(smpi_privatisation_regions[i].address, smpi_size_data_exe) < 0) {
735 XBT_WARN("Unmapping of fd %d failed: %s",
736 smpi_privatisation_regions[i].file_descriptor, strerror(errno));
738 close(smpi_privatisation_regions[i].file_descriptor);
740 xbt_free(smpi_privatisation_regions);