1 /* Copyright (c) 2007, 2009-2015. The SimGrid Team.
2 * All rights reserved. */
4 /* This program is free software; you can redistribute it and/or modify it
5 * under the terms of the license (GNU LGPL) which comes with this package. */
9 #include <unordered_map>
12 #include "src/internal_config.h"
14 #include "private.hpp"
16 #include "xbt/sysdep.h"
18 #include "surf/surf.h"
19 #include "simgrid/sg_config.h"
20 #include "simgrid/modelchecker.h"
21 #include "src/mc/mc_replay.h"
27 #include <sys/types.h>
30 #include <math.h> // sqrt
36 #define MAP_ANONYMOUS MAP_ANON
/* Log channel for SMPI benchmarking/timing messages (child category of "smpi"). */
39 XBT_LOG_NEW_DEFAULT_SUBCATEGORY(smpi_bench, smpi, "Logging specific to SMPI (benchmarking)");
41 /* Shared allocations are handled through shared memory segments.
42 * Associated data and metadata are used as follows:
45 * `allocs' dict ---- -.
46 * ---------- shared_data_t shared_metadata_t / | | |
47 * .->| <name> | ---> -------------------- <--. ----------------- | | | |
48 * | ---------- | fd of <name> | | | size of mmap | --| | | |
49 * | | count (2) | |-- | data | \ | | |
50 * `----------------- | <name> | | ----------------- ---- |
51 * -------------------- | ^ |
53 * | | `allocs_metadata' dict |
54 * | | ---------------------- |
55 * | `-- | <addr of mmap #1> |<-'
56 * | .-- | <addr of mmap #2> |<-.
57 * | | ---------------------- |
63 * | shared_metadata_t / | |
64 * | ----------------- | | |
65 * | | size of mmap | --| | |
67 * ----------------- | | |
/* Size of a buffer large enough to print a pointer as "0x...": "0x" + 2 hex digits per byte + NUL. */
72 #define PTR_STRLEN (2 + 2 * sizeof(void*) + 1)
/* Per-call-site caches for SMPI_SAMPLE_ blocks and for the known-call mechanism. */
74 xbt_dict_t samples = NULL; /* Allocated on first use */
75 xbt_dict_t calls = NULL; /* Allocated on first use */
/* Configuration knobs: durations below smpi_cpu_threshold are ignored;
 * smpi_running_power converts benched seconds into simulated flops. */
77 double smpi_cpu_threshold;
78 double smpi_running_power;
/* Privatization state: which rank's data segment is currently mapped (-1: none yet),
 * and the location/size of the executable's data+bss segment. */
80 int smpi_loaded_page = -1;
81 char* smpi_start_data_exe = NULL;
82 int smpi_size_data_exe = 0;
83 int smpi_privatize_global_variables;
/* Total real time spent in benched application code, for end-of-run statistics. */
84 double smpi_total_benched_time = 0;
/* One privatization region (fd + mapped address) per simulated process. */
85 smpi_privatisation_region_t smpi_privatisation_regions;
89 /** Some location in the source code
91 * This information is used by SMPI_SHARED_MALLOC to allocate some shared memory for all simulated processes.
93 class smpi_source_location {
// Key type for the `allocs' map below: identifies one SMPI_SHARED_MALLOC call site.
95 smpi_source_location(const char* filename, int line)
96 : filename(filename), filename_length(strlen(filename)), line(line) {}
98 /** Pointer to a static string containing the file name */
99 const char* filename = nullptr;
// Cached strlen(filename) so equality can use memcmp instead of strcmp.
100 int filename_length = 0;
// NOTE(review): the declaration of the `line' member (initialized by the ctor above)
// is not visible in this chunk — confirm it exists in the full file.
103 bool operator==(smpi_source_location const& that) const
105 return filename_length == that.filename_length
107 && std::memcmp(filename, that.filename, filename_length) == 0;
// NOTE(review): a comparison of the `line' field is likely part of operator== in a
// line not visible here; as shown, only the filename is compared.
109 bool operator!=(smpi_source_location const& that) const
111 return !(*this == that);
// std::hash specialization so smpi_source_location can key the unordered_maps below.
120 class hash<smpi_source_location> {
122 typedef smpi_source_location argument_type;
123 typedef std::size_t result_type;
124 result_type operator()(smpi_source_location const& loc) const
// Combine a hash of the file name with a hash of the raw bytes of the line number.
126 return xbt_str_hash_ext(loc.filename, loc.filename_length)
127 ^ xbt_str_hash_ext((const char*) &loc.line, sizeof(loc.line));
// One shared-memory segment per SMPI_SHARED_MALLOC call site (keyed by file:line).
140 std::unordered_map<smpi_source_location, shared_data_t> allocs;
// The (key, value) pair type stored inside `allocs'; metadata entries point back to it.
141 typedef std::unordered_map<smpi_source_location, shared_data_t>::value_type shared_data_key_type;
// NOTE(review): this is a field of the shared_metadata_t struct, whose surrounding
// declaration lines are not visible in this chunk.
145 shared_data_key_type* data;
// Maps each address returned by shm_map() to its size and owning `allocs' entry,
// so smpi_shared_free() can find the segment from the pointer alone.
148 std::unordered_map<void*, shared_metadata_t> allocs_metadata;
152 static size_t shm_size(int fd) {
155 if(fstat(fd, &st) < 0) {
156 xbt_die("Could not stat fd %d: %s", fd, strerror(errno));
158 return (size_t)st.st_size;
// Map `size' bytes of the shared segment behind `fd' into this process, growing the
// backing file if needed, and register the mapping in `allocs_metadata'.
162 static void* shm_map(int fd, size_t size, shared_data_key_type* data) {
164 char loc[PTR_STRLEN];
165 shared_metadata_t meta;
// Grow the backing file before mapping if the requested size exceeds its current size.
167 if(size > shm_size(fd)) {
168 if(ftruncate(fd, (off_t)size) < 0) {
169 xbt_die("Could not truncate fd %d to %zu: %s", fd, size, strerror(errno));
// NOTE(review): the declaration of `mem' is in a line not visible in this chunk.
173 mem = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
174 if(mem == MAP_FAILED) {
175 xbt_die("Could not map fd %d: %s", fd, strerror(errno));
// Render the address as text (buffer sized by PTR_STRLEN above).
177 snprintf(loc, PTR_STRLEN, "%p", mem);
// NOTE(review): the assignments filling `meta' (size and back-pointer to `data')
// are not visible here; neither is the final `return mem;'.
180 allocs_metadata[mem] = meta;
181 XBT_DEBUG("MMAP %zu to %p", size, mem);
// Tear down the bookkeeping of shared allocations and the SMPI_SAMPLE_/known-call caches.
186 void smpi_bench_destroy(void)
189 allocs_metadata.clear();
190 xbt_dict_free(&samples);
191 xbt_dict_free(&calls);
194 extern "C" XBT_PUBLIC(void) smpi_execute_flops_(double *flops);
195 void smpi_execute_flops_(double *flops)
197 smpi_execute_flops(*flops);
200 extern "C" XBT_PUBLIC(void) smpi_execute_(double *duration);
201 void smpi_execute_(double *duration)
203 smpi_execute(*duration);
206 void smpi_execute_flops(double flops) {
207 smx_synchro_t action;
208 XBT_DEBUG("Handle real computation time: %f flops", flops);
209 action = simcall_execution_start("computation", flops, 1, 0, 0);
210 simcall_set_category (action, TRACE_internal_smpi_get_category());
211 simcall_execution_wait(action);
212 smpi_switch_data_segment(smpi_process_index());
215 void smpi_execute(double duration)
217 if (duration >= smpi_cpu_threshold) {
218 XBT_DEBUG("Sleep for %g to handle real computation time", duration);
219 double flops = duration * smpi_running_power;
220 int rank = smpi_process_index();
221 instr_extra_data extra = xbt_new0(s_instr_extra_data_t,1);
222 extra->type=TRACING_COMPUTING;
223 extra->comp_size=flops;
224 TRACE_smpi_computing_in(rank, extra);
225 smpi_execute_flops(flops);
227 TRACE_smpi_computing_out(rank);
230 XBT_DEBUG("Real computation took %g while option smpi/cpu_threshold is set to %g => ignore it",
231 duration, smpi_cpu_threshold);
235 void smpi_switch_data_segment(int dest);
237 void smpi_bench_begin(void)
239 if (smpi_privatize_global_variables) {
240 smpi_switch_data_segment(smpi_process_index());
243 if (MC_is_active() || MC_record_replay_is_active())
246 xbt_os_threadtimer_start(smpi_process_timer());
// Stop benchmarking: stop the thread timer and inject the measured real time into the
// simulation (possibly scaled by a per-location speedup from smpi/comp-adjustment-file).
249 void smpi_bench_end(void)
// Nothing to account for under model checking / replay.
252 if (MC_is_active() || MC_record_replay_is_active())
256 xbt_os_timer_t timer = smpi_process_timer();
257 xbt_os_threadtimer_stop(timer);
258 // smpi_switch_data_segment(smpi_process_count());
// Guard against MPI calls issued from inside a SMPI_SAMPLE_ block: that would nest
// benchmarks and corrupt the timings, so abort loudly with a backtrace.
259 if (smpi_process_get_sampling()) {
260 XBT_CRITICAL("Cannot do recursive benchmarks.");
261 XBT_CRITICAL("Are you trying to make a call to MPI within a SMPI_SAMPLE_ block?");
262 xbt_backtrace_display_current();
263 xbt_die("Aborting.");
266 if (xbt_cfg_get_string("smpi/comp-adjustment-file")[0] != '\0') { // Maybe we need to artificially speed up or slow
267 // down our computation based on our statistical analysis.
// NOTE(review): the declaration of `speedup' (presumably initialized to 1) is in a
// line not visible in this chunk.
269 smpi_trace_call_location_t* loc = smpi_process_get_call_location();
270 std::string key = loc->get_composed_key();
271 std::map<std::string, double>::const_iterator it = location2speedup.find(key);
272 if (it != location2speedup.end()) {
273 speedup = it->second;
277 // Simulate the benchmarked computation unless disabled via command-line argument
278 if (xbt_cfg_get_boolean("smpi/simulate-computation")) {
279 smpi_execute(xbt_os_timer_elapsed(timer)/speedup);
// Accumulate the raw (unscaled) benched time for end-of-run statistics.
282 smpi_total_benched_time += xbt_os_timer_elapsed(timer);
285 /* Private sleep function used by smpi_sleep() and smpi_usleep() */
// Simulates the delay through simcall_process_sleep() instead of really sleeping,
// wrapping it in SLEEPING tracing events for the current rank.
286 static unsigned int private_sleep(double secs)
290 XBT_DEBUG("Sleep for: %lf secs", secs);
291 int rank = smpi_comm_rank(MPI_COMM_WORLD);
292 instr_extra_data extra = xbt_new0(s_instr_extra_data_t,1);
293 extra->type=TRACING_SLEEPING;
294 extra->sleep_duration=secs;
295 TRACE_smpi_sleeping_in(rank, extra);
297 simcall_process_sleep(secs);
299 TRACE_smpi_sleeping_out(rank);
// NOTE(review): the bench_end/bench_begin bracketing and the return statement are in
// lines not visible in this chunk.
305 unsigned int smpi_sleep(unsigned int secs)
307 return private_sleep((double)secs);
310 int smpi_usleep(useconds_t usecs)
312 return (int)private_sleep((double)usecs / 1000000.0);
// Replacement for gettimeofday(): reports the simulated clock instead of the real one.
// The tz argument is accepted for API compatibility; its use is not visible here.
315 int smpi_gettimeofday(struct timeval *tv, void* tz)
319 now = SIMIX_get_clock();
// Split the simulated time into whole seconds and microseconds.
321 tv->tv_sec = (time_t)now;
// The two lines below are presumably the two branches of a platform #if (useconds_t
// vs suseconds_t for tv_usec) — the preprocessor lines are not visible in this chunk.
323 tv->tv_usec = (useconds_t)((now - tv->tv_sec) * 1e6);
325 tv->tv_usec = (suseconds_t)((now - tv->tv_sec) * 1e6);
332 extern double sg_surf_precision;
// Clock resolution advertised to the Rastro trace format: the inverse of surf's
// floating-point precision (ticks per second).
333 unsigned long long smpi_rastro_resolution (void)
336 double resolution = (1/sg_surf_precision);
338 return (unsigned long long)resolution;
// Current simulated time expressed in Rastro ticks: whole seconds scaled by the
// resolution, plus the fractional part converted to ticks.
341 unsigned long long smpi_rastro_timestamp (void)
344 double now = SIMIX_get_clock();
346 unsigned long long sec = (unsigned long long)now;
347 unsigned long long pre = (now - sec) * smpi_rastro_resolution();
349 return (unsigned long long)sec * smpi_rastro_resolution() + pre;
352 /* ****************************** Functions related to the SMPI_SAMPLE_ macros ************************************/
// Per-call-site statistics for one SMPI_SAMPLE_ block. NOTE(review): the surrounding
// `typedef struct { ... } local_data_t;' lines are not visible in this chunk.
354 double threshold; /* maximal stderr requested (if positive) */
355 double relstderr; /* observed stderr so far */
356 double mean; /* mean of benched times, to be used if the block is disabled */
357 double sum; /* sum of benched times (to compute the mean and stderr) */
358 double sum_pow2; /* sum of the square of the benched times (to compute the stderr) */
359 int iters; /* amount of requested iterations */
360 int count; /* amount of iterations done so far */
361 int benching; /* 1: we are benchmarking; 0: we have enough data, no bench anymore */
364 static char *sample_location(int global, const char *file, int line) {
366 return bprintf("%s:%d", file, line);
368 return bprintf("%s:%d:%d", file, line, smpi_process_index());
// Decide whether a SMPI_SAMPLE_ site has collected enough measurements: the requested
// iteration count must be reached and, if a threshold was given, the relative stderr
// must be below it.
372 static int sample_enough_benchs(local_data_t *data) {
373 int res = data->count >= data->iters;
374 if (data->threshold>0.0) {
// NOTE(review): the condition guarding this first reset (likely "too few samples to
// compute a meaningful stderr") is in a line not visible in this chunk.
376 res = 0; // not enough data
377 if (data->relstderr > data->threshold)
378 res = 0; // stderr too high yet
380 XBT_DEBUG("%s (count:%d iter:%d stderr:%f thres:%f mean:%fs)",
381 (res?"enough benchs":"need more data"), data->count, data->iters, data->relstderr, data->threshold, data->mean);
// Entry point of a SMPI_SAMPLE_ block: close the current bench, mark the process as
// sampling, and create or refresh the per-site statistics record.
385 void smpi_sample_1(int global, const char *file, int line, int iters, double threshold)
387 char *loc = sample_location(global, file, line);
390 smpi_bench_end(); /* Take time from previous, unrelated computation into account */
391 smpi_process_set_sampling(1);
// Lazy creation of the `samples' dict; values are freed with free() on removal.
394 samples = xbt_dict_new_homogeneous(free);
396 data = static_cast<local_data_t *>(xbt_dict_get_or_null(samples, loc));
// First visit ever to this site: validate the settings and build a fresh record.
398 xbt_assert(threshold>0 || iters>0,
399 "You should provide either a positive amount of iterations to bench, or a positive maximal stderr (or both)");
400 data = (local_data_t *) xbt_new(local_data_t, 1);
// NOTE(review): initialization of the other fields (count, sum, mean, iters, ...)
// is in lines not visible in this chunk.
403 data->sum_pow2 = 0.0;
405 data->threshold = threshold;
406 data->benching = 1; // If we have no data, we need at least one
408 xbt_dict_set(samples, loc, data, NULL);
409 XBT_DEBUG("XXXXX First time ever on benched nest %s.",loc);
// Re-visit: the same source line must always be sampled with the same settings.
411 if (data->iters != iters || data->threshold != threshold) {
412 XBT_ERROR("Asked to bench block %s with different settings %d, %f is not %d, %f. "
413 "How did you manage to give two numbers at the same line??",
414 loc, data->iters, data->threshold, iters,threshold);
418 // if we already have some data, check whether sample_2 should get one more bench or whether it should emulate
419 // the computation instead
420 data->benching = !sample_enough_benchs(data);
421 XBT_DEBUG("XXXX Re-entering the benched nest %s. %s",loc,
422 (data->benching?"more benching needed":"we have enough data, skip computes"));
// Loop condition of a SMPI_SAMPLE_ block: returns whether the benched body must
// actually run once more (1) or be replaced by simulating the recorded mean (0).
427 int smpi_sample_2(int global, const char *file, int line)
429 char *loc = sample_location(global, file, line);
433 xbt_assert(samples, "Y U NO use SMPI_SAMPLE_* macros? Stop messing directly with smpi_sample_* functions!");
434 data = static_cast<local_data_t *>(xbt_dict_get(samples, loc));
435 XBT_DEBUG("sample2 %s",loc);
438 if (data->benching==1) {
439 // we need to run a new bench
440 XBT_DEBUG("benchmarking: count:%d iter:%d stderr:%f thres:%f; mean:%f",
441 data->count, data->iters, data->relstderr, data->threshold, data->mean);
444 // Enough data, no more bench (either we got enough data from previous visits to this benched nest, or we just
445 //ran one bench and need to bail out now that our job is done). Just sleep instead
446 XBT_DEBUG("No benchmark (either no need, or just ran one): count >= iter (%d >= %d) or stderr<thres (%f<=%f)."
447 " apply the %fs delay instead", data->count, data->iters, data->relstderr, data->threshold, data->mean)
448 smpi_execute(data->mean);
449 smpi_process_set_sampling(0);
450 res = 0; // prepare to capture future, unrelated computations
// End of one iteration of a SMPI_SAMPLE_ block: record the benched duration and update
// the running mean / relative standard error for this site.
456 void smpi_sample_3(int global, const char *file, int line)
458 char *loc = sample_location(global, file, line);
461 xbt_assert(samples, "Y U NO use SMPI_SAMPLE_* macros? Stop messing directly with smpi_sample_* functions!");
462 data = static_cast<local_data_t *>(xbt_dict_get(samples, loc));
463 XBT_DEBUG("sample3 %s",loc);
// Nothing to record if sample_2 decided to emulate rather than bench this iteration.
466 if (data->benching==0) {
470 // ok, benchmarking this loop is over
471 xbt_os_threadtimer_stop(smpi_process_timer());
// NOTE(review): the declarations of `sample' and `n', and the updates of count/sum,
// are in lines not visible in this chunk.
476 sample = xbt_os_timer_elapsed(smpi_process_timer());
478 data->sum_pow2 += sample * sample;
479 n = (double)data->count;
480 data->mean = data->sum / n;
// Relative stderr of the mean: sqrt(variance / n) normalized by the mean.
481 data->relstderr = sqrt((data->sum_pow2 / n - data->mean * data->mean) / n) / data->mean;
482 if (!sample_enough_benchs(data)) {
483 data->mean = sample; // Still in benching process; We want sample_2 to simulate the exact time of this loop
484 // occurrence before leaving, not the mean over the history
486 XBT_DEBUG("Average mean after %d steps is %f, relative standard error is %f (sample was %f)", data->count,
487 data->mean, data->relstderr, sample);
489 // That's enough for now, prevent sample_2 to run the same code over and over
// Allocate memory shared by every simulated process issuing the same call site
// (file:line). Backed by a POSIX shared-memory object when smpi/use-shared-malloc is
// set; falls back to a plain malloc otherwise.
495 void *smpi_shared_malloc(size_t size, const char *file, int line)
498 if (xbt_cfg_get_boolean("smpi/use-shared-malloc")){
500 smpi_source_location loc(file, line);
// insert() tells us whether this call site already owns a segment (res.second).
501 auto res = allocs.insert(std::make_pair(loc, shared_data_t()));
502 auto data = res.first;
504 // The insertion did not take place.
505 // Generate a shared memory name from the address of the shared_data:
506 char shmname[32]; // cannot be longer than PSHMNAMLEN = 31 on Mac OS X (shm_open raises ENAMETOOLONG otherwise)
507 snprintf(shmname, 31, "/shmalloc%p", &*data);
508 fd = shm_open(shmname, O_RDWR | O_CREAT | O_EXCL, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
// NOTE(review): the errno dispatch (EEXIST vs other errors) around these two xbt_die
// calls is in lines not visible in this chunk.
512 xbt_die("Please cleanup /dev/shm/%s", shmname);
514 xbt_die("An unhandled error occurred while opening %s. shm_open: %s", shmname, strerror(errno));
517 data->second.fd = fd;
518 data->second.count = 1;
519 mem = shm_map(fd, size, &*data);
// Unlink immediately: the segment lives on through the fd and vanishes with it.
520 if (shm_unlink(shmname) < 0) {
521 XBT_WARN("Could not early unlink %s. shm_unlink: %s", shmname, strerror(errno));
523 XBT_DEBUG("Mapping %s at %p through %d", shmname, mem, fd);
// Existing segment: map it again for this caller and bump the reference count.
525 mem = shm_map(data->second.fd, size, &*data);
526 data->second.count++;
528 XBT_DEBUG("Shared malloc %zu in %p (metadata at %p)", size, mem, &*data);
530 mem = xbt_malloc(size);
531 XBT_DEBUG("Classic malloc %zu in %p", size, mem);
// Release memory obtained from smpi_shared_malloc(): unmap this caller's view and
// drop the whole `allocs' entry once the reference count reaches zero.
537 void smpi_shared_free(void *ptr)
539 char loc[PTR_STRLEN];
541 if (xbt_cfg_get_boolean("smpi/use-shared-malloc")){
542 snprintf(loc, PTR_STRLEN, "%p", ptr);
// Find the segment owning this address; warn (not die) on foreign pointers.
543 auto meta = allocs_metadata.find(ptr);
544 if (meta == allocs_metadata.end()) {
545 XBT_WARN("Cannot free: %p was not shared-allocated by SMPI", ptr);
548 shared_data_t* data = &meta->second.data->second;
549 if (munmap(ptr, meta->second.size) < 0) {
550 XBT_WARN("Unmapping of fd %d failed: %s", data->fd, strerror(errno));
// NOTE(review): the decrement of data->count is in a line not visible in this chunk.
553 XBT_DEBUG("Shared free - no removal - of %p, count = %d", ptr, data->count);
// Last reference: close the fd (in an invisible line) and erase the allocs entry.
554 if (data->count <= 0) {
556 allocs.erase(allocs.find(meta->second.data->first));
557 XBT_DEBUG("Shared free - with removal - of %p", ptr);
// Fallback path mirroring the classic malloc in smpi_shared_malloc().
560 XBT_DEBUG("Classic free of %p", ptr);
// Tell whether a result was already registered (via smpi_shared_set_call) for this
// (function, input) pair. A missing entry is detected through the dict exception.
566 int smpi_shared_known_call(const char* func, const char* input)
568 char* loc = bprintf("%s:%s", func, input);
// Lazy creation of the `calls' dict (entries are not owned by the dict: NULL free cb).
573 calls = xbt_dict_new_homogeneous(NULL);
// NOTE(review): the TRY/CATCH wrapping of this lookup is in lines not visible here.
576 xbt_dict_get(calls, loc); /* Succeed or throw */
// Only not_found_error means "unknown call"; anything else is rethrown.
583 if (ex.category != not_found_error)
// Return the data previously registered for this (function, input) pair.
// Throws (through xbt_dict_get) if the pair was never registered.
590 void* smpi_shared_get_call(const char* func, const char* input) {
591 char* loc = bprintf("%s:%s", func, input);
595 calls = xbt_dict_new_homogeneous(NULL);
597 data = xbt_dict_get(calls, loc);
// Register `data' as the shared result for this (function, input) pair, so that other
// ranks performing the same call can reuse it instead of recomputing.
602 void* smpi_shared_set_call(const char* func, const char* input, void* data) {
603 char* loc = bprintf("%s:%s", func, input);
// Lazy creation of the `calls' dict (guarding condition is in an invisible line).
606 calls = xbt_dict_new_homogeneous(NULL);
608 xbt_dict_set(calls, loc, data, NULL);
/* Round `addr' down to the start of its memory page (xbt_pagesize granularity),
 * as required by mmap(MAP_FIXED). */
613 #define TOPAGE(addr) (void *)(((unsigned long)(addr) / xbt_pagesize) * xbt_pagesize)
615 /** Map a given SMPI privatization segment (make a SMPI process active) */
616 void smpi_switch_data_segment(int dest){
617 if (smpi_loaded_page==dest)//no need to switch either
621 smpi_really_switch_data_segment(dest);
624 /** Map a given SMPI privatization segment (make a SMPI process active) even if SMPI thinks it is already active
626 * When doing a state restoration, the state of the restored variables might not be consistent with the state of the
627 * virtual memory. In this case, we to change the data segment.
// Unconditionally remap the executable's data+bss pages onto rank `dest''s private
// copy. Used by smpi_switch_data_segment() and by state restoration, where the cached
// smpi_loaded_page may disagree with the actual virtual memory contents.
629 void smpi_really_switch_data_segment(int dest) {
630 if(smpi_size_data_exe == 0)//no need to switch
633 #if HAVE_PRIVATIZATION
// Very first switch: seed every rank's region with the executable's original data.
634 if(smpi_loaded_page==-1){//initial switch, do the copy from the real page here
635 for (int i=0; i< smpi_process_count(); i++){
636 memcpy(smpi_privatisation_regions[i].address, TOPAGE(smpi_start_data_exe), smpi_size_data_exe);
640 // FIXME, cross-process support (mmap across process when necessary)
641 int current = smpi_privatisation_regions[dest].file_descriptor;
642 XBT_DEBUG("Switching data frame to the one of process %d", dest);
// MAP_FIXED overlays the live data segment with dest's backing file in place.
643 void* tmp = mmap (TOPAGE(smpi_start_data_exe), smpi_size_data_exe,
644 PROT_READ | PROT_WRITE, MAP_FIXED | MAP_SHARED, current, 0);
645 if (tmp != TOPAGE(smpi_start_data_exe))
646 xbt_die("Couldn't map the new region");
647 smpi_loaded_page = dest;
/** Tell whether @a file is one of the temporary files backing SMPI privatization
 *  segments, recognized by the "/dev/shm/my-buffer-" prefix used in
 *  smpi_initialize_global_memory_segments().
 *  @return non-zero when the path carries that prefix, 0 otherwise.
 */
int smpi_is_privatisation_file(char* file)
{
  return strncmp("/dev/shm/my-buffer-", file, 19) == 0;
}
// Create one private, file-backed copy of the executable's data+bss segment per
// simulated process, so each rank gets its own global variables.
656 void smpi_initialize_global_memory_segments(){
// Privatization requested on an unsupported platform: disable and abort loudly.
658 #if !HAVE_PRIVATIZATION
659 smpi_privatize_global_variables=0;
660 xbt_die("You are trying to use privatization on a system that does not support it. Don't.");
// Locate the data+bss segment of the executable (fills smpi_start/size_data_exe).
664 smpi_get_executable_global_size();
666 XBT_DEBUG ("bss+data segment found : size %d starting at %p", smpi_size_data_exe, smpi_start_data_exe );
// Nothing to privatize if the segment is empty.
668 if (smpi_size_data_exe == 0){//no need to switch
669 smpi_privatize_global_variables=0;
673 smpi_privatisation_regions =
674 (smpi_privatisation_region_t) malloc(smpi_process_count() * sizeof(struct s_smpi_privatisation_region));
676 for (int i=0; i< smpi_process_count(); i++){
677 //create SIMIX_process_count() mappings of this size with the same data inside
678 void *address = NULL;
679 char path[] = "/dev/shm/my-buffer-XXXXXX";
// Back each copy with an unlinked temp file in /dev/shm (i.e. anonymous shared memory
// that survives only through its file descriptor).
682 int file_descriptor= mkstemp (path);
683 if (file_descriptor < 0) {
685 xbt_die("Impossible to create temporary file for memory mapping: %s\n\
686 The open() system call failed with the EMFILE error code (too many files). \n\n\
687 This means that you reached the system limits concerning the amount of files per process. \
688 This is not a surprise if you are trying to virtualize many processes on top of SMPI. \
689 Don't panic -- you should simply increase your system limits and try again. \n\n\
690 First, check what your limits are:\n\
691 cat /proc/sys/fs/file-max # Gives you the system-wide limit\n\
692 ulimit -Hn # Gives you the per process hard limit\n\
693 ulimit -Sn # Gives you the per process soft limit\n\
694 cat /proc/self/limits # Displays any per-process limitation (including the one given above)\n\n\
695 If one of these values is less than the amount of MPI processes that you try to run, then you got the explanation of this error. \
696 Ask the Internet about tutorials on how to increase the files limit such as: https://rtcamp.com/tutorials/linux/increase-open-files-limit/",
699 xbt_die("Impossible to create temporary file for memory mapping: %s",
// Unlink right away; the mapping keeps the storage alive through the descriptor.
703 status = unlink (path);
705 xbt_die("Impossible to unlink temporary file for memory mapping");
// Size the backing file to match the data+bss segment.
707 status = ftruncate(file_descriptor, smpi_size_data_exe);
709 xbt_die("Impossible to set the size of the temporary file for memory mapping");
711 /* Ask for a free region */
712 address = mmap (NULL, smpi_size_data_exe, PROT_READ | PROT_WRITE, MAP_SHARED, file_descriptor, 0);
713 if (address == MAP_FAILED)
714 xbt_die("Couldn't find a free region for memory mapping");
716 //initialize the values
717 memcpy(address, TOPAGE(smpi_start_data_exe), smpi_size_data_exe);
719 //store the address of the mapping for further switches
720 smpi_privatisation_regions[i].file_descriptor = file_descriptor;
721 smpi_privatisation_regions[i].address = address;
// Unmap and close every per-rank privatization region, then free the region table.
726 void smpi_destroy_global_memory_segments(){
727 if (smpi_size_data_exe == 0)//no need to switch
729 #if HAVE_PRIVATIZATION
// NOTE(review): the declaration of `i' is in a line not visible in this chunk.
731 for (i=0; i< smpi_process_count(); i++){
732 if(munmap(smpi_privatisation_regions[i].address, smpi_size_data_exe) < 0) {
733 XBT_WARN("Unmapping of fd %d failed: %s", smpi_privatisation_regions[i].file_descriptor, strerror(errno));
// Closing the fd releases the unlinked /dev/shm backing storage.
735 close(smpi_privatisation_regions[i].file_descriptor);
737 xbt_free(smpi_privatisation_regions);
/* Storage for the current and previous source location of the traced MPI call
 * (accessed through smpi_process_get_call_location() below). */
743 smpi_trace_call_location_t trace_call_location;
745 smpi_trace_call_location_t* smpi_trace_get_call_location() {
746 return smpi_process_get_call_location();
749 void smpi_trace_set_call_location(const char* file, int line) {
750 smpi_trace_call_location_t* loc = smpi_process_get_call_location();
752 loc->previous_filename = loc->filename;
753 loc->previous_linenumber = loc->linenumber;
754 loc->filename = file;
755 loc->linenumber = line;
759 * Required for Fortran bindings
761 void smpi_trace_set_call_location_(const char* file, int* line) {
762 smpi_trace_set_call_location(file, *line);
766 * Required for Fortran if -fsecond-underscore is activated
// Fortran binding variant for compilers that append two underscores (-fsecond-underscore).
768 void smpi_trace_set_call_location__(const char* file, int* line) {
// Delegates to the generic setter; the closing brace lies beyond this view.
769 smpi_trace_set_call_location(file, *line);