1 /* Copyright (c) 2007, 2009-2015. The SimGrid Team.
2 * All rights reserved. */
4 /* This program is free software; you can redistribute it and/or modify it
5 * under the terms of the license (GNU LGPL) which comes with this package. */
9 #include <unordered_map>
12 #include "src/internal_config.h"
14 #include "private.hpp"
16 #include "xbt/sysdep.h"
18 #include "surf/surf.h"
19 #include "simgrid/sg_config.h"
20 #include "simgrid/modelchecker.h"
21 #include "src/mc/mc_replay.h"
27 #include <sys/types.h>
30 #include <math.h> // sqrt
36 #define MAP_ANONYMOUS MAP_ANON
39 XBT_LOG_NEW_DEFAULT_SUBCATEGORY(smpi_bench, smpi, "Logging specific to SMPI (benchmarking)");
41 /* Shared allocations are handled through shared memory segments.
42 * Associated data and metadata are used as follows:
45 * `allocs' dict ---- -.
46 * ---------- shared_data_t shared_metadata_t / | | |
47 * .->| <name> | ---> -------------------- <--. ----------------- | | | |
48 * | ---------- | fd of <name> | | | size of mmap | --| | | |
49 * | | count (2) | |-- | data | \ | | |
50 * `----------------- | <name> | | ----------------- ---- |
51 * -------------------- | ^ |
53 * | | `allocs_metadata' dict |
54 * | | ---------------------- |
55 * | `-- | <addr of mmap #1> |<-'
56 * | .-- | <addr of mmap #2> |<-.
57 * | | ---------------------- |
63 * | shared_metadata_t / | |
64 * | ----------------- | | |
65 * | | size of mmap | --| | |
67 * ----------------- | | |
// Size of the buffers that receive a pointer formatted with "%p" (see the snprintf calls in this file).
// NOTE(review): presumably "0x" + two hex digits per pointer byte + terminating NUL -- confirm.
72 #define PTR_STRLEN (2 + 2 * sizeof(void*) + 1)
// String-keyed dictionaries; both are created lazily by the functions that use them.
74 xbt_dict_t samples = nullptr; /* Allocated on first use */
75 xbt_dict_t calls = nullptr; /* Allocated on first use */
// smpi_execute() only simulates durations that are >= this threshold.
77 double smpi_cpu_threshold;
// Factor by which smpi_execute() multiplies a duration to obtain an amount of flops.
78 double smpi_running_power;
// Index of the privatization segment currently mapped; -1 until the first switch has happened.
80 int smpi_loaded_page = -1;
// Start address and size of the region that gets remapped on a segment switch; a size of 0 disables switching.
81 char* smpi_start_data_exe = nullptr;
82 int smpi_size_data_exe = 0;
// When true, smpi_bench_begin() switches to the data segment of the current process.
83 bool smpi_privatize_global_variables;
// Sum of the elapsed times measured by smpi_bench_end().
84 double smpi_total_benched_time = 0;
// Array with one entry (mapping address + file descriptor) per simulated process.
85 smpi_privatisation_region_t smpi_privatisation_regions;
89 /** Some location in the source code
91 * This information is used by SMPI_SHARED_MALLOC to allocate some shared memory for all simulated processes.
// Identifies a call site by file name and line number; it is the key type of the `allocs` map.
93 class smpi_source_location {
// Stores the file-name pointer as given (the string itself is not copied), its length, and the line number.
95 smpi_source_location(const char* filename, int line)
96 : filename(filename), filename_length(strlen(filename)), line(line) {}
98 /** Pointer to a static string containing the file name */
99 const char* filename = nullptr;
// Cached strlen(filename); used by operator== and by the hash functor.
100 int filename_length = 0;
// Equality: the operands visible here compare the file-name length first, then the file-name bytes.
// NOTE(review): part of the expression is missing from this excerpt; a comparison of `line` may exist -- confirm.
103 bool operator==(smpi_source_location const& that) const
105 return filename_length == that.filename_length
107 && std::memcmp(filename, that.filename, filename_length) == 0;
// Inequality is defined as the negation of operator==.
109 bool operator!=(smpi_source_location const& that) const
111 return !(*this == that);
// Hash functor for smpi_source_location, so that it can be used as an unordered_map key.
120 class hash<smpi_source_location> {
122 typedef smpi_source_location argument_type;
123 typedef std::size_t result_type;
// XORs the hash of the file-name bytes with the hash of the raw bytes of the line number.
124 result_type operator()(smpi_source_location const& loc) const
126 return xbt_str_hash_ext(loc.filename, loc.filename_length)
127 ^ xbt_str_hash_ext((const char*) &loc.line, sizeof(loc.line));
// Shared-allocation table: one shared_data_t (fd + usage count) per allocating source location.
140 std::unordered_map<smpi_source_location, shared_data_t> allocs;
// Type of one (location, shared_data_t) entry of `allocs`.
141 typedef std::unordered_map<smpi_source_location, shared_data_t>::value_type shared_data_key_type;
// Pointer to the `allocs` entry a mapping belongs to (read as meta->second.data in smpi_shared_free).
// NOTE(review): the enclosing struct header is not visible in this excerpt.
145 shared_data_key_type* data;
// Per-mapping metadata, keyed by the address returned by mmap (see shm_map).
148 std::unordered_map<void*, shared_metadata_t> allocs_metadata;
// Returns the size in bytes (st_size reported by fstat) of the file behind `fd`.
// Aborts through xbt_die() when fstat() fails.
152 static size_t shm_size(int fd) {
155 if(fstat(fd, &st) < 0) {
156 xbt_die("Could not stat fd %d: %s", fd, strerror(errno));
158 return static_cast<size_t>(st.st_size);
// Maps `size` bytes of `fd` as shared read/write memory and registers the mapping in allocs_metadata.
//   fd   - descriptor of the backing file
//   size - number of bytes to map
//   data - `allocs` entry on whose behalf the mapping is created
// Aborts through xbt_die() if the file cannot be resized or mapped.
// NOTE(review): the lines that fill `meta` and the return statement are missing from this excerpt.
162 static void* shm_map(int fd, size_t size, shared_data_key_type* data) {
164 char loc[PTR_STRLEN];
165 shared_metadata_t meta;
// Grow the backing file only when it is currently smaller than the requested mapping.
167 if(size > shm_size(fd) && (ftruncate(fd, static_cast<off_t>(size)) < 0)) {
168 xbt_die("Could not truncate fd %d to %zu: %s", fd, size, strerror(errno));
171 mem = mmap(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
172 if(mem == MAP_FAILED) {
173 xbt_die("Could not map fd %d: %s", fd, strerror(errno));
// Textual form of the mapped address; no use of `loc` is visible in this excerpt.
175 snprintf(loc, PTR_STRLEN, "%p", mem);
// Remember the metadata under the mapped address so smpi_shared_free() can find it.
178 allocs_metadata[mem] = meta;
179 XBT_DEBUG("MMAP %zu to %p", size, mem);
// Releases the benchmarking state: empties allocs_metadata and frees the `samples` and `calls` dictionaries.
184 void smpi_bench_destroy(void)
187 allocs_metadata.clear();
188 xbt_dict_free(&samples);
189 xbt_dict_free(&calls);
// C-linkage wrapper taking its argument by pointer; forwards *flops to smpi_execute_flops().
// NOTE(review): the trailing underscore suggests a Fortran binding -- confirm.
192 extern "C" XBT_PUBLIC(void) smpi_execute_flops_(double *flops);
193 void smpi_execute_flops_(double *flops)
195 smpi_execute_flops(*flops);
// C-linkage wrapper taking its argument by pointer; forwards *duration to smpi_execute().
// NOTE(review): the trailing underscore suggests a Fortran binding -- confirm.
198 extern "C" XBT_PUBLIC(void) smpi_execute_(double *duration);
199 void smpi_execute_(double *duration)
201 smpi_execute(*duration);
// Simulates the execution of `flops` flops by the calling process: starts an execution named "computation",
// tags it with the SMPI tracing category, and waits for it to finish.
204 void smpi_execute_flops(double flops) {
205 smx_synchro_t action;
206 XBT_DEBUG("Handle real computation time: %f flops", flops);
207 action = simcall_execution_start("computation", flops, 1, 0, 0);
208 simcall_set_category (action, TRACE_internal_smpi_get_category());
209 simcall_execution_wait(action);
// After the wait, make sure the data segment of the current process is the one mapped.
210 smpi_switch_data_segment(smpi_process_index());
// Simulates `duration` of computation, provided it reaches smpi_cpu_threshold.
// The duration is converted to flops with smpi_running_power and executed between a pair of
// "computing" trace events; shorter durations are only logged.
213 void smpi_execute(double duration)
215 if (duration >= smpi_cpu_threshold) {
216 XBT_DEBUG("Sleep for %g to handle real computation time", duration);
217 double flops = duration * smpi_running_power;
218 int rank = smpi_process_index();
// Zero-initialised tracing record describing this computation.
219 instr_extra_data extra = xbt_new0(s_instr_extra_data_t,1);
220 extra->type=TRACING_COMPUTING;
221 extra->comp_size=flops;
222 TRACE_smpi_computing_in(rank, extra);
223 smpi_execute_flops(flops);
225 TRACE_smpi_computing_out(rank);
228 XBT_DEBUG("Real computation took %g while option smpi/cpu_threshold is set to %g => ignore it",
229 duration, smpi_cpu_threshold);
// Starts timing a stretch of user computation: switches to the data segment of the current process when
// privatization is enabled, then starts the per-process thread timer.
233 void smpi_bench_begin(void)
235 if (smpi_privatize_global_variables) {
236 smpi_switch_data_segment(smpi_process_index());
// NOTE(review): the statement guarded by this model-checker test is missing from this excerpt.
239 if (MC_is_active() || MC_record_replay_is_active())
242 xbt_os_threadtimer_start(smpi_process_timer());
// Stops the per-process timer, injects the measured time into the simulation (optionally scaled by a
// per-call-location speedup), and adds it to smpi_total_benched_time.
// Aborts if the process is currently inside a sampling block.
245 void smpi_bench_end(void)
// NOTE(review): the statement guarded by this model-checker test is missing from this excerpt.
248 if (MC_is_active() || MC_record_replay_is_active())
252 xbt_os_timer_t timer = smpi_process_timer();
253 xbt_os_threadtimer_stop(timer);
254 if (smpi_process_get_sampling()) {
255 XBT_CRITICAL("Cannot do recursive benchmarks.");
256 XBT_CRITICAL("Are you trying to make a call to MPI within a SMPI_SAMPLE_ block?");
257 xbt_backtrace_display_current();
258 xbt_die("Aborting.");
// An adjustment file was configured: look up the speedup recorded for the current call location.
// NOTE(review): the declaration and default value of `speedup` are missing from this excerpt.
261 if (xbt_cfg_get_string("smpi/comp-adjustment-file")[0] != '\0') { // Maybe we need to artificially speed up or slow
262 // down our computation based on our statistical analysis.
264 smpi_trace_call_location_t* loc = smpi_process_get_call_location();
265 std::string key = loc->get_composed_key();
266 std::unordered_map<std::string, double>::const_iterator it = location2speedup.find(key);
267 if (it != location2speedup.end()) {
268 speedup = it->second;
272 // Simulate the benchmarked computation unless disabled via command-line argument
273 if (xbt_cfg_get_boolean("smpi/simulate-computation")) {
274 smpi_execute(xbt_os_timer_elapsed(timer)/speedup);
// The unscaled elapsed time is what gets accumulated.
277 smpi_total_benched_time += xbt_os_timer_elapsed(timer);
280 /* Private sleep function used by smpi_sleep() and smpi_usleep() */
// Makes the calling process sleep for `secs` simulated seconds, between a pair of "sleeping" trace events.
// NOTE(review): the return statement is missing from this excerpt.
281 static unsigned int private_sleep(double secs)
285 XBT_DEBUG("Sleep for: %lf secs", secs);
286 int rank = smpi_comm_rank(MPI_COMM_WORLD);
// Zero-initialised tracing record describing this sleep.
287 instr_extra_data extra = xbt_new0(s_instr_extra_data_t,1);
288 extra->type=TRACING_SLEEPING;
289 extra->sleep_duration=secs;
290 TRACE_smpi_sleeping_in(rank, extra);
292 simcall_process_sleep(secs);
294 TRACE_smpi_sleeping_out(rank);
// Sleeps for `secs` simulated seconds; returns whatever private_sleep() returns.
300 unsigned int smpi_sleep(unsigned int secs)
302 return private_sleep(static_cast<double>(secs));
// Sleeps for `usecs` microseconds of simulated time (converted to seconds); returns private_sleep()'s result as int.
305 int smpi_usleep(useconds_t usecs)
307 return static_cast<int>(private_sleep(static_cast<double>(usecs) / 1000000.0));
// Sleeps for the simulated duration tp->tv_sec + tp->tv_nsec / 1e9 seconds; `t` is not used in the visible code.
311 int smpi_nanosleep(struct timespec *tp, void* t)
313 return static_cast<int>(private_sleep(static_cast<double>(tp->tv_sec + tp->tv_nsec / 1000000000.0)));
// Fills `tv` from the simulated clock: whole seconds in tv_sec, the fractional part in microseconds in tv_usec.
// `tz` is not used in the visible code.
317 int smpi_gettimeofday(struct timeval *tv, void* tz)
321 now = SIMIX_get_clock();
323 tv->tv_sec = static_cast<time_t>(now);
// NOTE(review): the two assignments below differ only in the cast type; presumably they sit in alternative
// preprocessor branches that are missing from this excerpt -- confirm.
325 tv->tv_usec = static_cast<useconds_t>((now - tv->tv_sec) * 1e6);
327 tv->tv_usec = static_cast<suseconds_t>((now - tv->tv_sec) * 1e6);
// Fills `tp` from the simulated clock: whole seconds in tv_sec, the fractional part in nanoseconds in tv_nsec.
335 int smpi_clock_gettime(clockid_t clk_id, struct timespec *tp)
337 //there is only one time in SMPI, so clk_id is ignored.
340 now = SIMIX_get_clock();
342 tp->tv_sec = static_cast<time_t>(now);
343 tp->tv_nsec = static_cast<long int>((now - tp->tv_sec) * 1e9);
// Returns the number of timestamp ticks per simulated second, computed as 1 / sg_surf_precision and
// truncated to an unsigned integer.
350 extern double sg_surf_precision;
351 unsigned long long smpi_rastro_resolution (void)
354 double resolution = (1/sg_surf_precision);
356 return static_cast<unsigned long long>(resolution);
// Returns the current simulated time expressed in ticks of smpi_rastro_resolution().
359 unsigned long long smpi_rastro_timestamp (void)
362 double now = SIMIX_get_clock();
// Split the clock into whole seconds and a fractional part scaled to ticks, then recombine.
364 unsigned long long sec = (unsigned long long)now;
365 unsigned long long pre = (now - sec) * smpi_rastro_resolution();
367 return static_cast<unsigned long long>(sec) * smpi_rastro_resolution() + pre;
370 /* ****************************** Functions related to the SMPI_SAMPLE_ macros ************************************/
// Fields of the per-location sampling record (local_data_t) stored in the `samples` dictionary.
// NOTE(review): the enclosing struct header is not visible in this excerpt.
372 double threshold; /* maximal stderr requested (if positive) */
373 double relstderr; /* observed stderr so far */
374 double mean; /* mean of benched times, to be used if the block is disabled */
375 double sum; /* sum of benched times (to compute the mean and stderr) */
376 double sum_pow2; /* sum of the square of the benched times (to compute the stderr) */
377 int iters; /* amount of requested iterations */
378 int count; /* amount of iterations done so far */
379 int benching; /* 1: we are benchmarking; 0: we have enough data, no bench anymore */
// Builds the dictionary key of a sampled block: either "file:line" or "file:line:<process index>".
// NOTE(review): the condition choosing between the two forms is missing from this excerpt; presumably it
// tests `global` (shared key when global, per-process key otherwise) -- confirm.
382 static char *sample_location(int global, const char *file, int line) {
384 return bprintf("%s:%d", file, line);
386 return bprintf("%s:%d:%d", file, line, smpi_process_index());
// Decides whether a sampled block has been benchmarked enough.
// Starts from "count >= iters"; when a positive threshold is set, the result is forced to 0 while the
// observed relative standard error is still above that threshold.
// NOTE(review): one condition and the return statement are missing from this excerpt.
390 static int sample_enough_benchs(local_data_t *data) {
391 int res = data->count >= data->iters;
392 if (data->threshold>0.0) {
394 res = 0; // not enough data
395 if (data->relstderr > data->threshold)
396 res = 0; // stderr too high yet
398 XBT_DEBUG("%s (count:%d iter:%d stderr:%f thres:%f mean:%fs)",
399 (res?"enough benchs":"need more data"), data->count, data->iters, data->relstderr, data->threshold, data->mean);
// Entry step of a sampled block. Accounts for the computation that preceded the block, marks the process as
// sampling, then creates or refreshes the record stored in `samples` under this block's key.
//   global, file, line - identify the block (see sample_location)
//   iters              - requested number of benchmark iterations
//   threshold          - requested maximal relative standard error
// NOTE(review): the branch heads separating "new record" from "existing record" and any release of `loc`
// are missing from this excerpt.
403 void smpi_sample_1(int global, const char *file, int line, int iters, double threshold)
405 char *loc = sample_location(global, file, line);
408 smpi_bench_end(); /* Take time from previous, unrelated computation into account */
409 smpi_process_set_sampling(1);
// Lazily create the dictionary; its values are released with free().
411 if (samples==nullptr)
412 samples = xbt_dict_new_homogeneous(free);
414 data = static_cast<local_data_t *>(xbt_dict_get_or_null(samples, loc));
// Creating a new record requires at least one stopping criterion.
416 xbt_assert(threshold>0 || iters>0,
417 "You should provide either a positive amount of iterations to bench, or a positive maximal stderr (or both)");
418 data = static_cast<local_data_t *>( xbt_new(local_data_t, 1));
421 data->sum_pow2 = 0.0;
423 data->threshold = threshold;
424 data->benching = 1; // If we have no data, we need at least one
426 xbt_dict_set(samples, loc, data, nullptr);
427 XBT_DEBUG("XXXXX First time ever on benched nest %s.",loc);
// The same location must always be sampled with the same settings.
429 if (data->iters != iters || data->threshold != threshold) {
430 XBT_ERROR("Asked to bench block %s with different settings %d, %f is not %d, %f. "
431 "How did you manage to give two numbers at the same line??",
432 loc, data->iters, data->threshold, iters,threshold);
436 // if we already have some data, check whether sample_2 should get one more bench or whether it should emulate
437 // the computation instead
438 data->benching = (sample_enough_benchs(data) == 0);
439 XBT_DEBUG("XXXX Re-entering the benched nest %s. %s",loc,
440 (data->benching?"more benching needed":"we have enough data, skip computes"));
// Loop-condition step of a sampled block. Looks up the record created by smpi_sample_1 and either asks for
// one more benchmark iteration (benching == 1), or injects the recorded mean with smpi_execute(), clears the
// sampling flag and sets `res` to 0.
// NOTE(review): the benching branch body, the else head and the return statement are missing from this
// excerpt; `res` is presumably the returned value -- confirm.
445 int smpi_sample_2(int global, const char *file, int line)
447 char *loc = sample_location(global, file, line);
451 xbt_assert(samples, "Y U NO use SMPI_SAMPLE_* macros? Stop messing directly with smpi_sample_* functions!");
452 data = static_cast<local_data_t *>(xbt_dict_get(samples, loc));
453 XBT_DEBUG("sample2 %s",loc);
456 if (data->benching==1) {
457 // we need to run a new bench
458 XBT_DEBUG("benchmarking: count:%d iter:%d stderr:%f thres:%f; mean:%f",
459 data->count, data->iters, data->relstderr, data->threshold, data->mean);
462 // Enough data, no more bench (either we got enough data from previous visits to this benched nest, or we just
463 //ran one bench and need to bail out now that our job is done). Just sleep instead
464 XBT_DEBUG("No benchmark (either no need, or just ran one): count >= iter (%d >= %d) or stderr<thres (%f<=%f)."
465 " apply the %fs delay instead", data->count, data->iters, data->relstderr, data->threshold, data->mean);
466 smpi_execute(data->mean);
467 smpi_process_set_sampling(0);
468 res = 0; // prepare to capture future, unrelated computations
// Closing step of one benchmark iteration. Stops the process timer, takes the elapsed time as a new sample,
// and recomputes the mean and the relative standard error of the record.
// NOTE(review): the body of the "benching == 0" branch and the updates of data->count and data->sum are
// missing from this excerpt.
474 void smpi_sample_3(int global, const char *file, int line)
476 char *loc = sample_location(global, file, line);
479 xbt_assert(samples, "Y U NO use SMPI_SAMPLE_* macros? Stop messing directly with smpi_sample_* functions!");
480 data = static_cast<local_data_t *>(xbt_dict_get(samples, loc));
481 XBT_DEBUG("sample3 %s",loc);
484 if (data->benching==0) {
488 // ok, benchmarking this loop is over
489 xbt_os_threadtimer_stop(smpi_process_timer());
494 sample = xbt_os_timer_elapsed(smpi_process_timer());
496 data->sum_pow2 += sample * sample;
497 n = static_cast<double>(data->count);
498 data->mean = data->sum / n;
// relstderr = sqrt((E[x^2] - mean^2) / n) / mean
499 data->relstderr = sqrt((data->sum_pow2 / n - data->mean * data->mean) / n) / data->mean;
500 if (sample_enough_benchs(data)==0) {
501 data->mean = sample; // Still in benching process; We want sample_2 to simulate the exact time of this loop
502 // occurrence before leaving, not the mean over the history
504 XBT_DEBUG("Average mean after %d steps is %f, relative standard error is %f (sample was %f)", data->count,
505 data->mean, data->relstderr, sample);
507 // That's enough for now, prevent sample_2 to run the same code over and over
// Allocates `size` bytes for the call site (file, line).
// With option "smpi/use-shared-malloc" enabled, all allocations from the same call site are mappings of one
// shared-memory object: it is created on first use, and later calls map the same fd again and increment its
// count. With the option disabled, this is a plain xbt_malloc().
// NOTE(review): branch heads, the shm_open error tests and the return statement are missing from this excerpt.
513 void *smpi_shared_malloc(size_t size, const char *file, int line)
516 if (xbt_cfg_get_boolean("smpi/use-shared-malloc")){
518 smpi_source_location loc(file, line);
// insert() leaves an existing entry untouched; `data` points to the entry for this location either way.
519 auto res = allocs.insert(std::make_pair(loc, shared_data_t()));
520 auto data = res.first;
// NOTE(review): the code below creates the object with O_CREAT | O_EXCL and sets count = 1, which looks like
// the first-allocation path; the wording "did not take place" seems inverted -- confirm against the
// condition that is missing from this excerpt.
522 // The insertion did not take place.
523 // Generate a shared memory name from the address of the shared_data:
524 char shmname[32]; // cannot be longer than PSHMNAMLEN = 31 on Mac OS X (shm_open raises ENAMETOOLONG otherwise)
525 snprintf(shmname, 31, "/shmalloc%p", &*data);
526 fd = shm_open(shmname, O_RDWR | O_CREAT | O_EXCL, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
529 xbt_die("Please cleanup /dev/shm/%s", shmname);
531 xbt_die("An unhandled error occurred while opening %s. shm_open: %s", shmname, strerror(errno));
533 data->second.fd = fd;
534 data->second.count = 1;
535 mem = shm_map(fd, size, &*data);
// The name is removed right after mapping; the mapping itself goes through the still-open fd.
536 if (shm_unlink(shmname) < 0) {
537 XBT_WARN("Could not early unlink %s. shm_unlink: %s", shmname, strerror(errno));
539 XBT_DEBUG("Mapping %s at %p through %d", shmname, mem, fd);
// Existing entry: map the recorded fd once more and count the additional user.
541 mem = shm_map(data->second.fd, size, &*data);
542 data->second.count++;
544 XBT_DEBUG("Shared malloc %zu in %p (metadata at %p)", size, mem, &*data);
546 mem = xbt_malloc(size);
547 XBT_DEBUG("Classic malloc %zu in %p", size, mem);
// Releases memory obtained from smpi_shared_malloc().
// With option "smpi/use-shared-malloc" enabled, the mapping is looked up by address in allocs_metadata and
// unmapped; once the owning entry's count is <= 0 the entry is erased from `allocs`.
// Pointers that are not known shared mappings only trigger a warning.
// NOTE(review): the count decrement, any close() of the fd, and the non-shared branch's free call are
// missing from this excerpt.
553 void smpi_shared_free(void *ptr)
555 char loc[PTR_STRLEN];
557 if (xbt_cfg_get_boolean("smpi/use-shared-malloc")){
// Textual form of the pointer; no use of `loc` is visible in this excerpt.
558 snprintf(loc, PTR_STRLEN, "%p", ptr);
559 auto meta = allocs_metadata.find(ptr);
560 if (meta == allocs_metadata.end()) {
561 XBT_WARN("Cannot free: %p was not shared-allocated by SMPI", ptr);
// Follow the metadata back to the shared_data_t of the owning `allocs` entry.
564 shared_data_t* data = &meta->second.data->second;
565 if (munmap(ptr, meta->second.size) < 0) {
566 XBT_WARN("Unmapping of fd %d failed: %s", data->fd, strerror(errno));
569 XBT_DEBUG("Shared free - no removal - of %p, count = %d", ptr, data->count);
570 if (data->count <= 0) {
572 allocs.erase(allocs.find(meta->second.data->first));
573 XBT_DEBUG("Shared free - with removal - of %p", ptr);
576 XBT_DEBUG("Classic free of %p", ptr);
// Tests whether a value is recorded in `calls` under the key "func:input".
// The dictionary is created lazily. The lookup relies on xbt_dict_get() throwing when the key is absent;
// exceptions whose category is not not_found_error are handled separately.
// NOTE(review): the try/catch structure, the returned values and any release of `loc` are missing from this
// excerpt.
582 int smpi_shared_known_call(const char* func, const char* input)
584 char* loc = bprintf("%s:%s", func, input);
587 if (calls==nullptr) {
588 calls = xbt_dict_new_homogeneous(nullptr);
591 xbt_dict_get(calls, loc); /* Succeed or throw */
597 if (ex.category != not_found_error)
// Fetches the value recorded in `calls` under the key "func:input".
// NOTE(review): the guard around the dictionary creation and the return statement are missing from this
// excerpt.
607 void* smpi_shared_get_call(const char* func, const char* input) {
608 char* loc = bprintf("%s:%s", func, input);
612 calls = xbt_dict_new_homogeneous(nullptr);
614 data = xbt_dict_get(calls, loc);
// Records `data` in `calls` under the key "func:input".
// NOTE(review): the guard around the dictionary creation and the return statement are missing from this
// excerpt.
619 void* smpi_shared_set_call(const char* func, const char* input, void* data) {
620 char* loc = bprintf("%s:%s", func, input);
623 calls = xbt_dict_new_homogeneous(nullptr);
625 xbt_dict_set(calls, loc, data, nullptr);
631 /** Map a given SMPI privatization segment (make a SMPI process active) */
// `dest` is the index of the process whose segment should become active. The request is forwarded to
// smpi_really_switch_data_segment(); smpi_loaded_page is checked first to detect an already-loaded segment.
632 void smpi_switch_data_segment(int dest) {
633 if (smpi_loaded_page == dest)//no need to switch, we've already loaded the one we want
637 smpi_really_switch_data_segment(dest);
640 /** Map a given SMPI privatization segment (make a SMPI process active) even if SMPI thinks it is already active
642 * When doing a state restoration, the state of the restored variables might not be consistent with the state of the
643 * virtual memory. In this case, we have to change the data segment.
// Maps the privatization region of process `dest` over the executable's data segment and records `dest` in
// smpi_loaded_page. Aborts through xbt_die() if mmap does not return the requested address.
645 void smpi_really_switch_data_segment(int dest) {
646 if(smpi_size_data_exe == 0)//no need to switch
649 #if HAVE_PRIVATIZATION
// Very first switch: seed every process's region with the current content of the real data segment.
650 if(smpi_loaded_page==-1){//initial switch, do the copy from the real page here
651 for (int i=0; i< smpi_process_count(); i++){
652 memcpy(smpi_privatisation_regions[i].address, TOPAGE(smpi_start_data_exe), smpi_size_data_exe);
656 // FIXME, cross-process support (mmap across process when necessary)
657 int current = smpi_privatisation_regions[dest].file_descriptor;
658 XBT_DEBUG("Switching data frame to the one of process %d", dest);
// MAP_FIXED places the mapping of dest's backing file at the data segment's own address.
659 void* tmp = mmap (TOPAGE(smpi_start_data_exe), smpi_size_data_exe,
660 PROT_READ | PROT_WRITE, MAP_FIXED | MAP_SHARED, current, 0);
661 if (tmp != TOPAGE(smpi_start_data_exe))
662 xbt_die("Couldn't map the new region");
663 smpi_loaded_page = dest;
// Returns non-zero when `file` starts with "/dev/shm/my-buffer-", the prefix of the mkstemp template used
// by smpi_initialize_global_memory_segments().
667 int smpi_is_privatisation_file(char* file)
669 return strncmp("/dev/shm/my-buffer-", file, std::strlen("/dev/shm/my-buffer-")) == 0;
// Sets up privatization: for each simulated process, creates an unlinked temporary file of the size of the
// data segment, maps it, copies the current data segment into it, and stores the mapping address and file
// descriptor in smpi_privatisation_regions.
// Privatization is turned off when the data segment size is 0. Every failing system call aborts through xbt_die().
// NOTE(review): several branch heads, status checks and preprocessor lines are missing from this excerpt.
672 void smpi_initialize_global_memory_segments(){
674 #if !HAVE_PRIVATIZATION
675 smpi_privatize_global_variables=false;
676 xbt_die("You are trying to use privatization on a system that does not support it. Don't.");
680 smpi_get_executable_global_size();
682 XBT_DEBUG ("bss+data segment found : size %d starting at %p", smpi_size_data_exe, smpi_start_data_exe );
684 if (smpi_size_data_exe == 0){//no need to switch
685 smpi_privatize_global_variables=false;
// One region descriptor per simulated process.
689 smpi_privatisation_regions =
690 static_cast<smpi_privatisation_region_t>( xbt_malloc(smpi_process_count() * sizeof(struct s_smpi_privatisation_region)));
692 for (int i=0; i< smpi_process_count(); i++){
693 //create SIMIX_process_count() mappings of this size with the same data inside
694 void *address = nullptr;
// mkstemp() replaces the trailing XXXXXX with a unique suffix and opens the file.
695 char path[] = "/dev/shm/my-buffer-XXXXXX";
698 int file_descriptor= mkstemp (path);
699 if (file_descriptor < 0) {
701 xbt_die("Impossible to create temporary file for memory mapping: %s\n\
702 The open() system call failed with the EMFILE error code (too many files). \n\n\
703 This means that you reached the system limits concerning the amount of files per process. \
704 This is not a surprise if you are trying to virtualize many processes on top of SMPI. \
705 Don't panic -- you should simply increase your system limits and try again. \n\n\
706 First, check what your limits are:\n\
707 cat /proc/sys/fs/file-max # Gives you the system-wide limit\n\
708 ulimit -Hn # Gives you the per process hard limit\n\
709 ulimit -Sn # Gives you the per process soft limit\n\
710 cat /proc/self/limits # Displays any per-process limitation (including the one given above)\n\n\
711 If one of these values is less than the amount of MPI processes that you try to run, then you got the explanation of this error. \
712 Ask the Internet about tutorials on how to increase the files limit such as: https://rtcamp.com/tutorials/linux/increase-open-files-limit/",
715 xbt_die("Impossible to create temporary file for memory mapping: %s",
// The name is removed immediately; the file stays reachable through the open descriptor.
719 status = unlink (path);
721 xbt_die("Impossible to unlink temporary file for memory mapping");
723 status = ftruncate(file_descriptor, smpi_size_data_exe);
725 xbt_die("Impossible to set the size of the temporary file for memory mapping");
727 /* Ask for a free region */
728 address = mmap (nullptr, smpi_size_data_exe, PROT_READ | PROT_WRITE, MAP_SHARED, file_descriptor, 0);
729 if (address == MAP_FAILED)
730 xbt_die("Couldn't find a free region for memory mapping");
732 //initialize the values
733 memcpy(address, TOPAGE(smpi_start_data_exe), smpi_size_data_exe);
735 //store the address of the mapping for further switches
736 smpi_privatisation_regions[i].file_descriptor = file_descriptor;
737 smpi_privatisation_regions[i].address = address;
// Tears down privatization: unmaps every per-process region, closes its file descriptor, then frees the
// region array. A failing munmap() only produces a warning.
742 void smpi_destroy_global_memory_segments(){
743 if (smpi_size_data_exe == 0)//no need to switch
745 #if HAVE_PRIVATIZATION
747 for (i=0; i< smpi_process_count(); i++){
748 if(munmap(smpi_privatisation_regions[i].address, smpi_size_data_exe) < 0) {
749 XBT_WARN("Unmapping of fd %d failed: %s", smpi_privatisation_regions[i].file_descriptor, strerror(errno));
751 close(smpi_privatisation_regions[i].file_descriptor);
753 xbt_free(smpi_privatisation_regions);
757 extern "C" { /** These functions will be called from the user code **/
// Returns the call-location record of the current process, as given by smpi_process_get_call_location().
758 smpi_trace_call_location_t* smpi_trace_get_call_location() {
759 return smpi_process_get_call_location();
// Records (file, line) as the current call location of the process, after saving the former location in the
// previous_* fields. The `file` pointer is stored as given; the string is not copied here.
762 void smpi_trace_set_call_location(const char* file, const int line) {
763 smpi_trace_call_location_t* loc = smpi_process_get_call_location();
765 loc->previous_filename = loc->filename;
766 loc->previous_linenumber = loc->linenumber;
767 loc->filename = file;
768 loc->linenumber = line;
772 * Required for Fortran bindings
// Variant taking the line number by pointer; forwards to smpi_trace_set_call_location(file, *line).
774 void smpi_trace_set_call_location_(const char* file, int* line) {
775 smpi_trace_set_call_location(file, *line);
779 * Required for Fortran if -fsecond-underscore is activated
// Double-underscore variant taking the line number by pointer; forwards to smpi_trace_set_call_location(file, *line).
781 void smpi_trace_set_call_location__(const char* file, int* line) {
782 smpi_trace_set_call_location(file, *line);