1 /* Copyright (c) 2007, 2009-2015. The SimGrid Team.
2 * All rights reserved. */
4 /* This program is free software; you can redistribute it and/or modify it
5 * under the terms of the license (GNU LGPL) which comes with this package. */
9 #include <unordered_map>
12 #include "src/internal_config.h"
14 #include "private.hpp"
16 #include "xbt/sysdep.h"
18 #include "surf/surf.h"
19 #include "simgrid/sg_config.h"
20 #include "simgrid/modelchecker.h"
21 #include "src/mc/mc_replay.h"
27 #include <sys/types.h>
30 #include <math.h> // sqrt
36 #define MAP_ANONYMOUS MAP_ANON
39 XBT_LOG_NEW_DEFAULT_SUBCATEGORY(smpi_bench, smpi, "Logging specific to SMPI (benchmarking)");
/* Shared allocations are handled through shared memory segments.
 * Associated data and metadata are used as follows:
 *
 *                                                                    mmap #1
 *    `allocs' dict                                                     ---- -.
 *    ----------      shared_data_t               shared_metadata_t   / |  |  |
 * .->| <name> | ---> -------------------- <--.   -----------------   | |  |  |
 * |  ----------      | fd of <name>     |    |   | size of mmap  | --| |  |  |
 * |                  | count (2)        |    |-- | data          |   \ |  |  |
 * `----------------- | <name>           |    |   -----------------     ----  |
 *                    --------------------    |   ^                           |
 *                                            |   |                           |
 *                                            |   |   `allocs_metadata' dict  |
 *                                            |   |   ----------------------  |
 *                                            |   `-- | <addr of mmap #1>  |<-'
 *                                            |   .-- | <addr of mmap #2>  |<-.
 *                                            |   |   ----------------------  |
 *                                            |   |                           |
 *                                            |   |                   mmap #2 |
 *                                            |   v                     ---- -'
 *                                            |   shared_metadata_t   / |  |
 *                                            |   -----------------   | |  |
 *                                            |   | size of mmap  | --| |  |
 *                                            `-- | data          |   | |  |
 *                                                -----------------   | |  |
 *                                                                    \ |  |
 *                                                                      ----
 */
/* Size of the buffers that receive a pointer formatted with "%p" in this file: two characters, plus two
 * characters per byte of a pointer, plus one for the terminating NUL. */
72 #define PTR_STRLEN (2 + 2 * sizeof(void*) + 1)
/* Dictionary of benchmark statistics, keyed by the string built by sample_location(). */
74 xbt_dict_t samples = nullptr; /* Allocated on first use */
/* Dictionary of recorded call results, keyed by "<func>:<input>". */
75 xbt_dict_t calls = nullptr; /* Allocated on first use */
/* Durations below this value are not simulated by smpi_execute(). */
77 double smpi_cpu_threshold;
/* Factor applied to a duration by smpi_execute() to obtain an amount of flops. */
78 double smpi_running_power;
/* Index of the privatization segment currently mapped; -1 until the first switch. */
80 int smpi_loaded_page = -1;
/* Start address and size of the data segment that gets copied/remapped for privatization
 * (logged as "bss+data segment" by smpi_initialize_global_memory_segments()). */
81 char* smpi_start_data_exe = nullptr;
82 int smpi_size_data_exe = 0;
/* Whether smpi_bench_begin() must switch to the data segment of the current process. */
83 bool smpi_privatize_global_variables;
/* Sum of the timer values accumulated by smpi_bench_end(). */
84 double smpi_total_benched_time = 0;
/* Array of privatization regions, indexed by process (one address and one file descriptor each). */
85 smpi_privatisation_region_t smpi_privatisation_regions;
/** Some location in the source code
 *
 *  This information is used by SMPI_SHARED_MALLOC to allocate some shared memory for all simulated processes.
 *  Two locations are equal only when both the file name and the line number match; the line number
 *  also takes part in the hash, so equality has to account for it as well.
 */
class smpi_source_location {
public:
  /** @param filename NUL-terminated file name; only the pointer is stored, so it must outlive this object
   *  @param line     line number within that file */
  smpi_source_location(const char* filename, int line)
    : filename(filename), filename_length(static_cast<int>(std::strlen(filename))), line(line) {}

  /** Pointer to a static string containing the file name */
  const char* filename = nullptr;
  /** Length of filename, cached so that comparison and hashing do not recompute it */
  int filename_length = 0;
  /** Line number in the file */
  int line = 0;

  bool operator==(smpi_source_location const& that) const
  {
    // Cheap integer comparisons first; the byte comparison only runs for same-length names.
    return filename_length == that.filename_length
      && line == that.line
      && std::memcmp(filename, that.filename, filename_length) == 0;
  }
  bool operator!=(smpi_source_location const& that) const
  {
    return !(*this == that);
  }
};
120 class hash<smpi_source_location> {
122 typedef smpi_source_location argument_type;
123 typedef std::size_t result_type;
124 result_type operator()(smpi_source_location const& loc) const
126 return xbt_str_hash_ext(loc.filename, loc.filename_length)
127 ^ xbt_str_hash_ext((const char*) &loc.line, sizeof(loc.line));
/* Registry of shared allocations: one entry per source location calling smpi_shared_malloc().
 * The mapped value carries the file descriptor (`fd`) and a usage counter (`count`). */
140 std::unordered_map<smpi_source_location, shared_data_t> allocs;
/* An element (key + value) of `allocs`; the metadata below keeps pointers to such elements. */
141 typedef std::unordered_map<smpi_source_location, shared_data_t>::value_type shared_data_key_type;
/* NOTE(review): this looks like a member of shared_metadata_t (next to a `size` member read by
 * smpi_shared_free()); the enclosing struct declaration is not visible here -- confirm. */
145 shared_data_key_type* data;
/* Registry of live mappings, keyed by the address returned by mmap(). */
148 std::unordered_map<void*, shared_metadata_t> allocs_metadata;
152 static size_t shm_size(int fd) {
155 if(fstat(fd, &st) < 0) {
156 xbt_die("Could not stat fd %d: %s", fd, strerror(errno));
158 return static_cast<size_t>(st.st_size);
162 static void* shm_map(int fd, size_t size, shared_data_key_type* data) {
164 char loc[PTR_STRLEN];
165 shared_metadata_t meta;
167 if(size > shm_size(fd) && (ftruncate(fd, static_cast<off_t>(size)) < 0)) {
168 xbt_die("Could not truncate fd %d to %zu: %s", fd, size, strerror(errno));
171 mem = mmap(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
172 if(mem == MAP_FAILED) {
173 xbt_die("Could not map fd %d: %s", fd, strerror(errno));
175 snprintf(loc, PTR_STRLEN, "%p", mem);
178 allocs_metadata[mem] = meta;
179 XBT_DEBUG("MMAP %zu to %p", size, mem);
184 void smpi_bench_destroy(void)
187 allocs_metadata.clear();
188 xbt_dict_free(&samples);
189 xbt_dict_free(&calls);
192 extern "C" XBT_PUBLIC(void) smpi_execute_flops_(double *flops);
193 void smpi_execute_flops_(double *flops)
195 smpi_execute_flops(*flops);
198 extern "C" XBT_PUBLIC(void) smpi_execute_(double *duration);
199 void smpi_execute_(double *duration)
201 smpi_execute(*duration);
204 void smpi_execute_flops(double flops) {
205 smx_synchro_t action;
206 XBT_DEBUG("Handle real computation time: %f flops", flops);
207 action = simcall_execution_start("computation", flops, 1, 0, 0);
208 simcall_set_category (action, TRACE_internal_smpi_get_category());
209 simcall_execution_wait(action);
210 smpi_switch_data_segment(smpi_process_index());
213 void smpi_execute(double duration)
215 if (duration >= smpi_cpu_threshold) {
216 XBT_DEBUG("Sleep for %g to handle real computation time", duration);
217 double flops = duration * smpi_running_power;
218 int rank = smpi_process_index();
219 instr_extra_data extra = xbt_new0(s_instr_extra_data_t,1);
220 extra->type=TRACING_COMPUTING;
221 extra->comp_size=flops;
222 TRACE_smpi_computing_in(rank, extra);
223 smpi_execute_flops(flops);
225 TRACE_smpi_computing_out(rank);
228 XBT_DEBUG("Real computation took %g while option smpi/cpu_threshold is set to %g => ignore it",
229 duration, smpi_cpu_threshold);
233 void smpi_bench_begin(void)
235 if (smpi_privatize_global_variables) {
236 smpi_switch_data_segment(smpi_process_index());
239 if (MC_is_active() || MC_record_replay_is_active())
242 xbt_os_threadtimer_start(smpi_process_timer());
245 void smpi_bench_end(void)
248 if (MC_is_active() || MC_record_replay_is_active())
252 xbt_os_timer_t timer = smpi_process_timer();
253 xbt_os_threadtimer_stop(timer);
254 if (smpi_process_get_sampling()) {
255 XBT_CRITICAL("Cannot do recursive benchmarks.");
256 XBT_CRITICAL("Are you trying to make a call to MPI within a SMPI_SAMPLE_ block?");
257 xbt_backtrace_display_current();
258 xbt_die("Aborting.");
261 if (xbt_cfg_get_string("smpi/comp-adjustment-file")[0] != '\0') { // Maybe we need to artificially speed up or slow
262 // down our computation based on our statistical analysis.
264 smpi_trace_call_location_t* loc = smpi_process_get_call_location();
265 std::string key = loc->get_composed_key();
266 std::unordered_map<std::string, double>::const_iterator it = location2speedup.find(key);
267 if (it != location2speedup.end()) {
268 speedup = it->second;
272 // Simulate the benchmarked computation unless disabled via command-line argument
273 if (xbt_cfg_get_boolean("smpi/simulate-computation")) {
274 smpi_execute(xbt_os_timer_elapsed(timer)/speedup);
277 smpi_total_benched_time += xbt_os_timer_elapsed(timer);
280 /* Private sleep function used by smpi_sleep() and smpi_usleep() */
281 static unsigned int private_sleep(double secs)
285 XBT_DEBUG("Sleep for: %lf secs", secs);
286 int rank = smpi_comm_rank(MPI_COMM_WORLD);
287 instr_extra_data extra = xbt_new0(s_instr_extra_data_t,1);
288 extra->type=TRACING_SLEEPING;
289 extra->sleep_duration=secs;
290 TRACE_smpi_sleeping_in(rank, extra);
292 simcall_process_sleep(secs);
294 TRACE_smpi_sleeping_out(rank);
300 unsigned int smpi_sleep(unsigned int secs)
302 return private_sleep(static_cast<double>(secs));
305 int smpi_usleep(useconds_t usecs)
307 return static_cast<int>(private_sleep(static_cast<double>(usecs) / 1000000.0));
310 int smpi_gettimeofday(struct timeval *tv, void* tz)
314 now = SIMIX_get_clock();
316 tv->tv_sec = static_cast<time_t>(now);
318 tv->tv_usec = static_cast<useconds_t>((now - tv->tv_sec) * 1e6);
320 tv->tv_usec = static_cast<suseconds_t>((now - tv->tv_sec) * 1e6);
327 extern double sg_surf_precision;
328 unsigned long long smpi_rastro_resolution (void)
331 double resolution = (1/sg_surf_precision);
333 return static_cast<unsigned long long>(resolution);
336 unsigned long long smpi_rastro_timestamp (void)
339 double now = SIMIX_get_clock();
341 unsigned long long sec = (unsigned long long)now;
342 unsigned long long pre = (now - sec) * smpi_rastro_resolution();
344 return static_cast<unsigned long long>(sec) * smpi_rastro_resolution() + pre;
/* ****************************** Functions related to the SMPI_SAMPLE_ macros ************************************/
/** Benchmark statistics of one sampled block, stored in the `samples` dictionary. */
typedef struct {
  double threshold; /* maximal stderr requested (if positive) */
  double relstderr; /* observed stderr so far */
  double mean;      /* mean of benched times, to be used if the block is disabled */
  double sum;       /* sum of benched times (to compute the mean and stderr) */
  double sum_pow2;  /* sum of the square of the benched times (to compute the stderr) */
  int iters;        /* amount of requested iterations */
  int count;        /* amount of iterations done so far */
  int benching;     /* 1: we are benchmarking; 0: we have enough data, no bench anymore */
} local_data_t;
359 static char *sample_location(int global, const char *file, int line) {
361 return bprintf("%s:%d", file, line);
363 return bprintf("%s:%d:%d", file, line, smpi_process_index());
367 static int sample_enough_benchs(local_data_t *data) {
368 int res = data->count >= data->iters;
369 if (data->threshold>0.0) {
371 res = 0; // not enough data
372 if (data->relstderr > data->threshold)
373 res = 0; // stderr too high yet
375 XBT_DEBUG("%s (count:%d iter:%d stderr:%f thres:%f mean:%fs)",
376 (res?"enough benchs":"need more data"), data->count, data->iters, data->relstderr, data->threshold, data->mean);
380 void smpi_sample_1(int global, const char *file, int line, int iters, double threshold)
382 char *loc = sample_location(global, file, line);
385 smpi_bench_end(); /* Take time from previous, unrelated computation into account */
386 smpi_process_set_sampling(1);
388 if (samples==nullptr)
389 samples = xbt_dict_new_homogeneous(free);
391 data = static_cast<local_data_t *>(xbt_dict_get_or_null(samples, loc));
393 xbt_assert(threshold>0 || iters>0,
394 "You should provide either a positive amount of iterations to bench, or a positive maximal stderr (or both)");
395 data = static_cast<local_data_t *>( xbt_new(local_data_t, 1));
398 data->sum_pow2 = 0.0;
400 data->threshold = threshold;
401 data->benching = 1; // If we have no data, we need at least one
403 xbt_dict_set(samples, loc, data, nullptr);
404 XBT_DEBUG("XXXXX First time ever on benched nest %s.",loc);
406 if (data->iters != iters || data->threshold != threshold) {
407 XBT_ERROR("Asked to bench block %s with different settings %d, %f is not %d, %f. "
408 "How did you manage to give two numbers at the same line??",
409 loc, data->iters, data->threshold, iters,threshold);
413 // if we already have some data, check whether sample_2 should get one more bench or whether it should emulate
414 // the computation instead
415 data->benching = (sample_enough_benchs(data) == 0);
416 XBT_DEBUG("XXXX Re-entering the benched nest %s. %s",loc,
417 (data->benching?"more benching needed":"we have enough data, skip computes"));
422 int smpi_sample_2(int global, const char *file, int line)
424 char *loc = sample_location(global, file, line);
428 xbt_assert(samples, "Y U NO use SMPI_SAMPLE_* macros? Stop messing directly with smpi_sample_* functions!");
429 data = static_cast<local_data_t *>(xbt_dict_get(samples, loc));
430 XBT_DEBUG("sample2 %s",loc);
433 if (data->benching==1) {
434 // we need to run a new bench
435 XBT_DEBUG("benchmarking: count:%d iter:%d stderr:%f thres:%f; mean:%f",
436 data->count, data->iters, data->relstderr, data->threshold, data->mean);
439 // Enough data, no more bench (either we got enough data from previous visits to this benched nest, or we just
440 //ran one bench and need to bail out now that our job is done). Just sleep instead
441 XBT_DEBUG("No benchmark (either no need, or just ran one): count >= iter (%d >= %d) or stderr<thres (%f<=%f)."
442 " apply the %fs delay instead", data->count, data->iters, data->relstderr, data->threshold, data->mean);
443 smpi_execute(data->mean);
444 smpi_process_set_sampling(0);
445 res = 0; // prepare to capture future, unrelated computations
451 void smpi_sample_3(int global, const char *file, int line)
453 char *loc = sample_location(global, file, line);
456 xbt_assert(samples, "Y U NO use SMPI_SAMPLE_* macros? Stop messing directly with smpi_sample_* functions!");
457 data = static_cast<local_data_t *>(xbt_dict_get(samples, loc));
458 XBT_DEBUG("sample3 %s",loc);
461 if (data->benching==0) {
465 // ok, benchmarking this loop is over
466 xbt_os_threadtimer_stop(smpi_process_timer());
471 sample = xbt_os_timer_elapsed(smpi_process_timer());
473 data->sum_pow2 += sample * sample;
474 n = static_cast<double>(data->count);
475 data->mean = data->sum / n;
476 data->relstderr = sqrt((data->sum_pow2 / n - data->mean * data->mean) / n) / data->mean;
477 if (sample_enough_benchs(data)==0) {
478 data->mean = sample; // Still in benching process; We want sample_2 to simulate the exact time of this loop
479 // occurrence before leaving, not the mean over the history
481 XBT_DEBUG("Average mean after %d steps is %f, relative standard error is %f (sample was %f)", data->count,
482 data->mean, data->relstderr, sample);
484 // That's enough for now, prevent sample_2 to run the same code over and over
490 void *smpi_shared_malloc(size_t size, const char *file, int line)
493 if (xbt_cfg_get_boolean("smpi/use-shared-malloc")){
495 smpi_source_location loc(file, line);
496 auto res = allocs.insert(std::make_pair(loc, shared_data_t()));
497 auto data = res.first;
499 // The insertion did not take place.
500 // Generate a shared memory name from the address of the shared_data:
501 char shmname[32]; // cannot be longer than PSHMNAMLEN = 31 on Mac OS X (shm_open raises ENAMETOOLONG otherwise)
502 snprintf(shmname, 31, "/shmalloc%p", &*data);
503 fd = shm_open(shmname, O_RDWR | O_CREAT | O_EXCL, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
506 xbt_die("Please cleanup /dev/shm/%s", shmname);
508 xbt_die("An unhandled error occurred while opening %s. shm_open: %s", shmname, strerror(errno));
510 data->second.fd = fd;
511 data->second.count = 1;
512 mem = shm_map(fd, size, &*data);
513 if (shm_unlink(shmname) < 0) {
514 XBT_WARN("Could not early unlink %s. shm_unlink: %s", shmname, strerror(errno));
516 XBT_DEBUG("Mapping %s at %p through %d", shmname, mem, fd);
518 mem = shm_map(data->second.fd, size, &*data);
519 data->second.count++;
521 XBT_DEBUG("Shared malloc %zu in %p (metadata at %p)", size, mem, &*data);
523 mem = xbt_malloc(size);
524 XBT_DEBUG("Classic malloc %zu in %p", size, mem);
530 void smpi_shared_free(void *ptr)
532 char loc[PTR_STRLEN];
534 if (xbt_cfg_get_boolean("smpi/use-shared-malloc")){
535 snprintf(loc, PTR_STRLEN, "%p", ptr);
536 auto meta = allocs_metadata.find(ptr);
537 if (meta == allocs_metadata.end()) {
538 XBT_WARN("Cannot free: %p was not shared-allocated by SMPI", ptr);
541 shared_data_t* data = &meta->second.data->second;
542 if (munmap(ptr, meta->second.size) < 0) {
543 XBT_WARN("Unmapping of fd %d failed: %s", data->fd, strerror(errno));
546 XBT_DEBUG("Shared free - no removal - of %p, count = %d", ptr, data->count);
547 if (data->count <= 0) {
549 allocs.erase(allocs.find(meta->second.data->first));
550 XBT_DEBUG("Shared free - with removal - of %p", ptr);
553 XBT_DEBUG("Classic free of %p", ptr);
/** Look up the call identified by `func` and `input` in the `calls` dictionary.
 *  NOTE(review): several lines of this function (return value, exception handling structure,
 *  release of `loc`) are not visible here -- confirm the exact contract against the full file. */
559 int smpi_shared_known_call(const char* func, const char* input)
/* Dictionary key: "<func>:<input>", allocated by bprintf(). */
561 char* loc = bprintf("%s:%s", func, input);
/* The dictionary is created lazily, with no free function for its values. */
564 if (calls==nullptr) {
565 calls = xbt_dict_new_homogeneous(nullptr);
568 xbt_dict_get(calls, loc); /* Succeed or throw */
/* Only a "not found" failure is expected from the lookup above; presumably any other category is
 * propagated to the caller -- TODO confirm. */
574 if (ex.category != not_found_error)
584 void* smpi_shared_get_call(const char* func, const char* input) {
585 char* loc = bprintf("%s:%s", func, input);
589 calls = xbt_dict_new_homogeneous(nullptr);
591 data = xbt_dict_get(calls, loc);
596 void* smpi_shared_set_call(const char* func, const char* input, void* data) {
597 char* loc = bprintf("%s:%s", func, input);
600 calls = xbt_dict_new_homogeneous(nullptr);
602 xbt_dict_set(calls, loc, data, nullptr);
608 /** Map a given SMPI privatization segment (make a SMPI process active) */
609 void smpi_switch_data_segment(int dest) {
610 if (smpi_loaded_page == dest)//no need to switch, we've already loaded the one we want
614 smpi_really_switch_data_segment(dest);
617 /** Map a given SMPI privatization segment (make a SMPI process active) even if SMPI thinks it is already active
619 * When doing a state restoration, the state of the restored variables might not be consistent with the state of the
620 * virtual memory. In this case, we to change the data segment.
622 void smpi_really_switch_data_segment(int dest) {
623 if(smpi_size_data_exe == 0)//no need to switch
626 #if HAVE_PRIVATIZATION
627 if(smpi_loaded_page==-1){//initial switch, do the copy from the real page here
628 for (int i=0; i< smpi_process_count(); i++){
629 memcpy(smpi_privatisation_regions[i].address, TOPAGE(smpi_start_data_exe), smpi_size_data_exe);
633 // FIXME, cross-process support (mmap across process when necessary)
634 int current = smpi_privatisation_regions[dest].file_descriptor;
635 XBT_DEBUG("Switching data frame to the one of process %d", dest);
636 void* tmp = mmap (TOPAGE(smpi_start_data_exe), smpi_size_data_exe,
637 PROT_READ | PROT_WRITE, MAP_FIXED | MAP_SHARED, current, 0);
638 if (tmp != TOPAGE(smpi_start_data_exe))
639 xbt_die("Couldn't map the new region");
640 smpi_loaded_page = dest;
/** Tell whether `file` is the path of a privatization buffer, i.e. starts with "/dev/shm/my-buffer-".
 *
 *  @param file path to test; a null pointer is reported as "not a privatization file"
 *  @return 1 when the prefix matches, 0 otherwise
 */
int smpi_is_privatisation_file(char* file)
{
  static const char prefix[] = "/dev/shm/my-buffer-";
  return file != nullptr && strncmp(prefix, file, std::strlen(prefix)) == 0;
}
/** Create one private copy of the data segment per simulated process.
 *  Each copy is an unlinked temporary file under /dev/shm, sized and filled like the data segment
 *  and mapped in memory; its address and file descriptor are stored in smpi_privatisation_regions.
 *  NOTE(review): several structural lines (braces, preprocessor branches, returns) are not visible
 *  in this view -- confirm the control flow against the full file. */
649 void smpi_initialize_global_memory_segments(){
// Without privatization support, the feature is switched off and the simulation dies.
651 #if !HAVE_PRIVATIZATION
652 smpi_privatize_global_variables=false;
653 xbt_die("You are trying to use privatization on a system that does not support it. Don't.");
// Presumably fills smpi_start_data_exe and smpi_size_data_exe, which are logged right after -- confirm.
657 smpi_get_executable_global_size();
659 XBT_DEBUG ("bss+data segment found : size %d starting at %p", smpi_size_data_exe, smpi_start_data_exe );
// An empty data segment leaves nothing to privatize.
661 if (smpi_size_data_exe == 0){//no need to switch
662 smpi_privatize_global_variables=false;
// One region descriptor per simulated process.
666 smpi_privatisation_regions =
667 static_cast<smpi_privatisation_region_t>( xbt_malloc(smpi_process_count() * sizeof(struct s_smpi_privatisation_region)));
669 for (int i=0; i< smpi_process_count(); i++){
670 //create SIMIX_process_count() mappings of this size with the same data inside
671 void *address = nullptr;
// Template for mkstemp(): the trailing XXXXXX is replaced to form a unique name.
672 char path[] = "/dev/shm/my-buffer-XXXXXX";
675 int file_descriptor= mkstemp (path);
676 if (file_descriptor < 0) {
// Detailed help for the "too many open files" case; the generic message comes after it.
678 xbt_die("Impossible to create temporary file for memory mapping: %s\n\
679 The open() system call failed with the EMFILE error code (too many files). \n\n\
680 This means that you reached the system limits concerning the amount of files per process. \
681 This is not a surprise if you are trying to virtualize many processes on top of SMPI. \
682 Don't panic -- you should simply increase your system limits and try again. \n\n\
683 First, check what your limits are:\n\
684 cat /proc/sys/fs/file-max # Gives you the system-wide limit\n\
685 ulimit -Hn # Gives you the per process hard limit\n\
686 ulimit -Sn # Gives you the per process soft limit\n\
687 cat /proc/self/limits # Displays any per-process limitation (including the one given above)\n\n\
688 If one of these values is less than the amount of MPI processes that you try to run, then you got the explanation of this error. \
689 Ask the Internet about tutorials on how to increase the files limit such as: https://rtcamp.com/tutorials/linux/increase-open-files-limit/",
692 xbt_die("Impossible to create temporary file for memory mapping: %s",
// The file is unlinked right away; it is used through its descriptor only from now on.
696 status = unlink (path);
698 xbt_die("Impossible to unlink temporary file for memory mapping");
// Give the file the size of the data segment before mapping it.
700 status = ftruncate(file_descriptor, smpi_size_data_exe);
702 xbt_die("Impossible to set the size of the temporary file for memory mapping");
704 /* Ask for a free region */
705 address = mmap (nullptr, smpi_size_data_exe, PROT_READ | PROT_WRITE, MAP_SHARED, file_descriptor, 0);
706 if (address == MAP_FAILED)
707 xbt_die("Couldn't find a free region for memory mapping");
709 //initialize the values
710 memcpy(address, TOPAGE(smpi_start_data_exe), smpi_size_data_exe);
712 //store the address of the mapping for further switches
713 smpi_privatisation_regions[i].file_descriptor = file_descriptor;
714 smpi_privatisation_regions[i].address = address;
719 void smpi_destroy_global_memory_segments(){
720 if (smpi_size_data_exe == 0)//no need to switch
722 #if HAVE_PRIVATIZATION
724 for (i=0; i< smpi_process_count(); i++){
725 if(munmap(smpi_privatisation_regions[i].address, smpi_size_data_exe) < 0) {
726 XBT_WARN("Unmapping of fd %d failed: %s", smpi_privatisation_regions[i].file_descriptor, strerror(errno));
728 close(smpi_privatisation_regions[i].file_descriptor);
730 xbt_free(smpi_privatisation_regions);
734 extern "C" { /** These functions will be called from the user code **/
735 smpi_trace_call_location_t* smpi_trace_get_call_location() {
736 return smpi_process_get_call_location();
739 void smpi_trace_set_call_location(const char* file, const int line) {
740 smpi_trace_call_location_t* loc = smpi_process_get_call_location();
742 loc->previous_filename = loc->filename;
743 loc->previous_linenumber = loc->linenumber;
744 loc->filename = file;
745 loc->linenumber = line;
749 * Required for Fortran bindings
751 void smpi_trace_set_call_location_(const char* file, int* line) {
752 smpi_trace_set_call_location(file, *line);
756 * Required for Fortran if -fsecond-underscore is activated
758 void smpi_trace_set_call_location__(const char* file, int* line) {
759 smpi_trace_set_call_location(file, *line);