1 /* Copyright (c) 2007, 2009-2015. The SimGrid Team.
2 * All rights reserved. */
4 /* This program is free software; you can redistribute it and/or modify it
5 * under the terms of the license (GNU LGPL) which comes with this package. */
9 #include <unordered_map>
12 #include "src/internal_config.h"
14 #include "private.hpp"
16 #include "xbt/sysdep.h"
18 #include "surf/surf.h"
19 #include "simgrid/sg_config.h"
20 #include "simgrid/modelchecker.h"
21 #include "src/mc/mc_replay.h"
27 #include <sys/types.h>
30 #include <math.h> // sqrt
// Alias MAP_ANONYMOUS to MAP_ANON so that a single spelling can be used in this file.
// NOTE(review): presumably wrapped in a platform guard that is not visible here -- confirm.
36 #define MAP_ANONYMOUS MAP_ANON
// Declare "smpi_bench" as the default logging category of this file, as a sub-category of "smpi".
39 XBT_LOG_NEW_DEFAULT_SUBCATEGORY(smpi_bench, smpi, "Logging specific to SMPI (benchmarking)");
41 /* Shared allocations are handled through shared memory segments.
42 * Associated data and metadata are used as follows:
45 * `allocs' dict ---- -.
46 * ---------- shared_data_t shared_metadata_t / | | |
47 * .->| <name> | ---> -------------------- <--. ----------------- | | | |
48 * | ---------- | fd of <name> | | | size of mmap | --| | | |
49 * | | count (2) | |-- | data | \ | | |
50 * `----------------- | <name> | | ----------------- ---- |
51 * -------------------- | ^ |
53 * | | `allocs_metadata' dict |
54 * | | ---------------------- |
55 * | `-- | <addr of mmap #1> |<-'
56 * | .-- | <addr of mmap #2> |<-.
57 * | | ---------------------- |
63 * | shared_metadata_t / | |
64 * | ----------------- | | |
65 * | | size of mmap | --| | |
67 * ----------------- | | |
// Size of a buffer receiving a pointer formatted with "%p" (see shm_map() and smpi_shared_free()):
// 2 characters (presumably the "0x" prefix), two hexadecimal digits per byte of the pointer, and the final NUL.
72 #define PTR_STRLEN (2 + 2 * sizeof(void*) + 1)
// Benchmarking state of the SMPI_SAMPLE_ blocks, keyed by the string built by sample_location().
74 xbt_dict_t samples = NULL; /* Allocated on first use */
// Entries registered by smpi_shared_set_call(), keyed by "<func>:<input>".
75 xbt_dict_t calls = NULL; /* Allocated on first use */
// smpi_execute() only simulates computations lasting at least this long; shorter ones are ignored.
77 double smpi_cpu_threshold;
// Factor used by smpi_execute() to convert a benchmarked duration into an amount of flops.
78 double smpi_running_power;
// Index of the process whose privatized data segment is currently mapped; -1 until the first switch.
80 int smpi_loaded_page = -1;
// Start address and size of the bss+data segment of the executable (a size of 0 disables every switch).
81 char* smpi_start_data_exe = NULL;
82 int smpi_size_data_exe = 0;
// When set, smpi_bench_begin() maps the data segment of the current process before resuming user code.
83 bool smpi_privatize_global_variables;
// Sum of the wall-clock durations measured by smpi_bench_end().
84 double smpi_total_benched_time = 0;
// Array of per-process regions (file descriptor + address), see smpi_initialize_global_memory_segments().
85 smpi_privatisation_region_t smpi_privatisation_regions;
/** Some location in the source code
 *
 *  This information is used by SMPI_SHARED_MALLOC to allocate some shared memory for all simulated processes.
 *  It is the key of the `allocs' map: two locations are the same one if and only if both the file name (compared
 *  by content, not by address) and the line number match.
 */
class smpi_source_location {
public:
  /** @param filename NUL-terminated file name; only the pointer is stored, so it must outlive this object
   *  @param line     line number within that file
   */
  smpi_source_location(const char* filename, int line)
      : filename(filename), filename_length(static_cast<int>(strlen(filename))), line(line) {}

  /** Pointer to a static string containing the file name */
  const char* filename = nullptr;
  /** Length of @c filename, cached so that comparing and hashing do not have to recompute it */
  int filename_length = 0;
  /** Line number within @c filename */
  int line = 0;

  bool operator==(smpi_source_location const& that) const
  {
    // The line must take part in the comparison: the hash mixes it in, and two allocation sites of the same
    // file must not be considered as the same location.
    return filename_length == that.filename_length
        && line == that.line
        && std::memcmp(filename, that.filename, filename_length) == 0;
  }
  bool operator!=(smpi_source_location const& that) const
  {
    return !(*this == that);
  }
};
// Hash functor for smpi_source_location, which lets it be used as the key of the `allocs' unordered_map.
120 class hash<smpi_source_location> {
122 typedef smpi_source_location argument_type;
123 typedef std::size_t result_type;
124 result_type operator()(smpi_source_location const& loc) const
// XOR the hash of the file name bytes with the hash of the raw bytes of the line number.
126 return xbt_str_hash_ext(loc.filename, loc.filename_length)
127 ^ xbt_str_hash_ext((const char*) &loc.line, sizeof(loc.line));
// Shared allocations, keyed by the source location that requested them (see smpi_shared_malloc()).
140 std::unordered_map<smpi_source_location, shared_data_t> allocs;
// Type of one (location, shared_data_t) entry of `allocs'.
141 typedef std::unordered_map<smpi_source_location, shared_data_t>::value_type shared_data_key_type;
// Pointer from the metadata of a mapping to the `allocs' entry it belongs to; smpi_shared_free() follows it to
// reach the shared_data_t of the mapping.
145 shared_data_key_type* data;
// Metadata of the mappings, keyed by the address returned by mmap() (filled in by shm_map()).
148 std::unordered_map<void*, shared_metadata_t> allocs_metadata;
/** Return the size in bytes of the object behind @p fd, as reported by fstat(). Dies if fstat() fails. */
152 static size_t shm_size(int fd) {
155 if(fstat(fd, &st) < 0) {
156 xbt_die("Could not stat fd %d: %s", fd, strerror(errno));
158 return static_cast<size_t>(st.st_size);
/** Map @p size bytes of the shared memory object @p fd and register the mapping in `allocs_metadata'.
 *
 *  The object is first grown with ftruncate() when it is smaller than @p size. Any failure of ftruncate() or
 *  mmap() is fatal (xbt_die).
 *
 *  @param fd   file descriptor of the shared memory object
 *  @param size amount of bytes to map, starting at offset 0
 *  @param data `allocs' entry owning @p fd (its use is not visible in this excerpt; presumably stored in the
 *              metadata of the mapping -- confirm)
 */
162 static void* shm_map(int fd, size_t size, shared_data_key_type* data) {
164 char loc[PTR_STRLEN];
165 shared_metadata_t meta;
// ftruncate() is only attempted when the object is too small, thanks to the short-circuit evaluation.
167 if(size > shm_size(fd) && (ftruncate(fd, static_cast<off_t>(size)) < 0)) {
168 xbt_die("Could not truncate fd %d to %zu: %s", fd, size, strerror(errno));
// MAP_SHARED: every mapping of this fd sees the same bytes.
171 mem = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
172 if(mem == MAP_FAILED) {
173 xbt_die("Could not map fd %d: %s", fd, strerror(errno));
// NOTE(review): `loc' is formatted here but no use of it is visible in this function -- possibly dead code.
175 snprintf(loc, PTR_STRLEN, "%p", mem);
// Remember the metadata under the mapped address so that smpi_shared_free() can find it again.
178 allocs_metadata[mem] = meta;
179 XBT_DEBUG("MMAP %zu to %p", size, mem);
/** Release the bookkeeping of this file: forget the metadata of all mappings and free the `samples' and `calls'
 *  dictionaries. */
184 void smpi_bench_destroy(void)
187 allocs_metadata.clear();
188 xbt_dict_free(&samples);
189 xbt_dict_free(&calls);
// C-linkage wrapper around smpi_execute_flops() that takes its argument by address.
// NOTE(review): the trailing underscore and the pointer argument suggest a Fortran binding -- confirm.
192 extern "C" XBT_PUBLIC(void) smpi_execute_flops_(double *flops);
193 void smpi_execute_flops_(double *flops)
195 smpi_execute_flops(*flops);
// C-linkage wrapper around smpi_execute() that takes its argument by address.
// NOTE(review): the trailing underscore and the pointer argument suggest a Fortran binding -- confirm.
198 extern "C" XBT_PUBLIC(void) smpi_execute_(double *duration);
199 void smpi_execute_(double *duration)
201 smpi_execute(*duration);
/** Simulate the execution of @p flops flops by the calling process, blocking until the execution completes. */
204 void smpi_execute_flops(double flops) {
205 smx_synchro_t action;
206 XBT_DEBUG("Handle real computation time: %f flops", flops);
207 action = simcall_execution_start("computation", flops, 1, 0, 0);
// Tag the execution with the tracing category used internally by SMPI.
208 simcall_set_category (action, TRACE_internal_smpi_get_category());
209 simcall_execution_wait(action);
// Once the wait is over, make sure the data segment of the calling process is the one mapped.
210 smpi_switch_data_segment(smpi_process_index());
/** Simulate a computation that took @p duration on the real machine.
 *
 *  Durations below smpi_cpu_threshold are ignored (only logged). Otherwise the duration is converted into flops
 *  with smpi_running_power, and the execution is simulated between a pair of "computing" trace events.
 */
213 void smpi_execute(double duration)
215 if (duration >= smpi_cpu_threshold) {
216 XBT_DEBUG("Sleep for %g to handle real computation time", duration);
217 double flops = duration * smpi_running_power;
218 int rank = smpi_process_index();
// Extra data attached to the "computing in" trace event: the kind of event and the amount of flops.
219 instr_extra_data extra = xbt_new0(s_instr_extra_data_t,1);
220 extra->type=TRACING_COMPUTING;
221 extra->comp_size=flops;
222 TRACE_smpi_computing_in(rank, extra);
223 smpi_execute_flops(flops);
225 TRACE_smpi_computing_out(rank);
228 XBT_DEBUG("Real computation took %g while option smpi/cpu_threshold is set to %g => ignore it",
229 duration, smpi_cpu_threshold);
233 void smpi_switch_data_segment(int dest);
/** Start timing the user code that is about to run: map the data segment of the calling process if global
 *  variables are privatized, then start the per-process timer. */
235 void smpi_bench_begin(void)
237 if (smpi_privatize_global_variables) {
238 smpi_switch_data_segment(smpi_process_index());
// NOTE(review): what happens under the model checker or in record/replay mode is not visible here; presumably
// the function returns without starting the timer -- confirm.
241 if (MC_is_active() || MC_record_replay_is_active())
244 xbt_os_threadtimer_start(smpi_process_timer());
/** Stop timing the user code and inject the measured duration into the simulation.
 *
 *  The elapsed time of the per-process timer is divided by `speedup' before being simulated with smpi_execute()
 *  (only if option smpi/simulate-computation is set), and is added unscaled to smpi_total_benched_time.
 *  Calling this while a sampling block is active is a fatal error.
 */
247 void smpi_bench_end(void)
// NOTE(review): the statement guarded by this test is not visible here (presumably an early return) -- confirm.
250 if (MC_is_active() || MC_record_replay_is_active())
254 xbt_os_timer_t timer = smpi_process_timer();
255 xbt_os_threadtimer_stop(timer);
256 if (smpi_process_get_sampling()) {
257 XBT_CRITICAL("Cannot do recursive benchmarks.");
258 XBT_CRITICAL("Are you trying to make a call to MPI within a SMPI_SAMPLE_ block?");
259 xbt_backtrace_display_current();
260 xbt_die("Aborting.");
263 if (xbt_cfg_get_string("smpi/comp-adjustment-file")[0] != '\0') { // Maybe we need to artificially speed up or slow
264 // down our computation based on our statistical analysis.
// Look for a speedup factor registered for the current call location; `speedup' is left untouched if none is.
266 smpi_trace_call_location_t* loc = smpi_process_get_call_location();
267 std::string key = loc->get_composed_key();
268 std::unordered_map<std::string, double>::const_iterator it = location2speedup.find(key);
269 if (it != location2speedup.end()) {
270 speedup = it->second;
274 // Simulate the benchmarked computation unless disabled via command-line argument
275 if (xbt_cfg_get_boolean("smpi/simulate-computation")) {
276 smpi_execute(xbt_os_timer_elapsed(timer)/speedup);
279 smpi_total_benched_time += xbt_os_timer_elapsed(timer);
282 /* Private sleep function used by smpi_sleep() and smpi_usleep() */
// Makes the calling process sleep for `secs' simulated seconds, between a pair of "sleeping" trace events.
// NOTE(review): the returned value is not visible in this excerpt.
283 static unsigned int private_sleep(double secs)
287 XBT_DEBUG("Sleep for: %lf secs", secs);
288 int rank = smpi_comm_rank(MPI_COMM_WORLD);
// Extra data attached to the "sleeping in" trace event: the kind of event and the duration.
289 instr_extra_data extra = xbt_new0(s_instr_extra_data_t,1);
290 extra->type=TRACING_SLEEPING;
291 extra->sleep_duration=secs;
292 TRACE_smpi_sleeping_in(rank, extra);
294 simcall_process_sleep(secs);
296 TRACE_smpi_sleeping_out(rank);
/** Simulated counterpart of sleep(): sleep for @p secs simulated seconds and return what private_sleep() returns. */
302 unsigned int smpi_sleep(unsigned int secs)
304 return private_sleep(static_cast<double>(secs));
/** Simulated counterpart of usleep(): sleep for @p usecs simulated microseconds (converted into seconds) and
 *  return what private_sleep() returns, cast to int. */
307 int smpi_usleep(useconds_t usecs)
309 return static_cast<int>(private_sleep(static_cast<double>(usecs) / 1000000.0));
/** Simulated counterpart of gettimeofday(): fill @p tv with the simulated clock instead of the wall-clock time.
 *  NOTE(review): no use of @p tz is visible in this excerpt. */
312 int smpi_gettimeofday(struct timeval *tv, void* tz)
316 now = SIMIX_get_clock();
// Whole seconds first; the microseconds are what remains once these seconds are subtracted.
318 tv->tv_sec = static_cast<time_t>(now);
// NOTE(review): the two assignments below only differ by the type of the cast; they are presumably the two
// branches of a platform-specific #if that is not visible here -- confirm.
320 tv->tv_usec = static_cast<useconds_t>((now - tv->tv_sec) * 1e6);
322 tv->tv_usec = static_cast<suseconds_t>((now - tv->tv_sec) * 1e6);
329 extern double sg_surf_precision;
/** Return the number of timestamp ticks per simulated second, that is the inverse of sg_surf_precision truncated
 *  to an integer. */
330 unsigned long long smpi_rastro_resolution (void)
333 double resolution = (1/sg_surf_precision);
335 return static_cast<unsigned long long>(resolution);
/** Return the simulated clock as an integer number of ticks, with smpi_rastro_resolution() ticks per second. */
338 unsigned long long smpi_rastro_timestamp (void)
341 double now = SIMIX_get_clock();
// Convert the integer and fractional parts of the clock separately, then add them up.
343 unsigned long long sec = (unsigned long long)now;
344 unsigned long long pre = (now - sec) * smpi_rastro_resolution();
346 return static_cast<unsigned long long>(sec) * smpi_rastro_resolution() + pre;
349 /* ****************************** Functions related to the SMPI_SAMPLE_ macros ************************************/
// Fields of local_data_t, the benchmarking state of one sampled block. One such record is stored in the
// `samples' dictionary per key built by sample_location().
351 double threshold; /* maximal stderr requested (if positive) */
352 double relstderr; /* observed stderr so far */
353 double mean; /* mean of benched times, to be used if the block is disabled */
354 double sum; /* sum of benched times (to compute the mean and stderr) */
355 double sum_pow2; /* sum of the square of the benched times (to compute the stderr) */
356 int iters; /* amount of requested iterations */
357 int count; /* amount of iterations done so far */
358 int benching; /* 1: we are benchmarking; 0: we have enough data, no bench anymore */
/** Build the key identifying a sampled block in the `samples' dictionary.
 *
 *  The key is either "<file>:<line>" or "<file>:<line>:<process index>". NOTE(review): the test choosing between
 *  the two is not visible here; presumably the first form is used when @p global is set -- confirm.
 *
 *  @return a string allocated by bprintf()
 */
361 static char *sample_location(int global, const char *file, int line) {
363 return bprintf("%s:%d", file, line);
365 return bprintf("%s:%d:%d", file, line, smpi_process_index());
/** Tell whether enough benchmarks of this block were run already.
 *
 *  The starting point is "the requested amount of iterations is reached". When a positive threshold was requested,
 *  the answer is additionally forced to 0 as long as the relative standard error is above that threshold.
 */
369 static int sample_enough_benchs(local_data_t *data) {
370 int res = data->count >= data->iters;
371 if (data->threshold>0.0) {
// NOTE(review): the condition guarding the next assignment is not visible in this excerpt.
373 res = 0; // not enough data
374 if (data->relstderr > data->threshold)
375 res = 0; // stderr too high yet
377 XBT_DEBUG("%s (count:%d iter:%d stderr:%f thres:%f mean:%fs)",
378 (res?"enough benchs":"need more data"), data->count, data->iters, data->relstderr, data->threshold, data->mean);
/** First step of a sampled block: close the current benchmark and set up the state of the block.
 *
 *  On the first visit of a location, a local_data_t is created and stored in `samples' with benching enabled.
 *  On later visits, the settings are checked against the recorded ones, and benching is enabled only if
 *  sample_enough_benchs() says that more data is needed.
 *
 *  @param global    passed to sample_location() to build the key of the block
 *  @param file,line source location of the block
 *  @param iters     requested amount of iterations
 *  @param threshold requested maximal relative standard error
 */
382 void smpi_sample_1(int global, const char *file, int line, int iters, double threshold)
384 char *loc = sample_location(global, file, line);
387 smpi_bench_end(); /* Take time from previous, unrelated computation into account */
// From now on, smpi_bench_end() refuses to run until sampling is switched off again.
388 smpi_process_set_sampling(1);
// The values of the dictionary are released with free().
391 samples = xbt_dict_new_homogeneous(free);
393 data = static_cast<local_data_t *>(xbt_dict_get_or_null(samples, loc));
395 xbt_assert(threshold>0 || iters>0,
396 "You should provide either a positive amount of iterations to bench, or a positive maximal stderr (or both)");
397 data = static_cast<local_data_t *>( xbt_new(local_data_t, 1));
400 data->sum_pow2 = 0.0;
402 data->threshold = threshold;
403 data->benching = 1; // If we have no data, we need at least one
405 xbt_dict_set(samples, loc, data, NULL);
406 XBT_DEBUG("XXXXX First time ever on benched nest %s.",loc);
408 if (data->iters != iters || data->threshold != threshold) {
409 XBT_ERROR("Asked to bench block %s with different settings %d, %f is not %d, %f. "
410 "How did you manage to give two numbers at the same line??",
411 loc, data->iters, data->threshold, iters,threshold);
415 // if we already have some data, check whether sample_2 should get one more bench or whether it should emulate
416 // the computation instead
417 data->benching = (sample_enough_benchs(data) == 0);
418 XBT_DEBUG("XXXX Re-entering the benched nest %s. %s",loc,
419 (data->benching?"more benching needed":"we have enough data, skip computes"));
/** Second step of a sampled block: decide whether the block must actually be run once more.
 *
 *  When the block is still being benchmarked, a new bench is needed. Otherwise the recorded mean duration is
 *  simulated with smpi_execute(), sampling is switched off and the result is set to 0.
 *  NOTE(review): the value returned in the benchmarking case is not visible in this excerpt.
 */
424 int smpi_sample_2(int global, const char *file, int line)
426 char *loc = sample_location(global, file, line);
430 xbt_assert(samples, "Y U NO use SMPI_SAMPLE_* macros? Stop messing directly with smpi_sample_* functions!");
431 data = static_cast<local_data_t *>(xbt_dict_get(samples, loc));
432 XBT_DEBUG("sample2 %s",loc);
435 if (data->benching==1) {
436 // we need to run a new bench
437 XBT_DEBUG("benchmarking: count:%d iter:%d stderr:%f thres:%f; mean:%f",
438 data->count, data->iters, data->relstderr, data->threshold, data->mean);
441 // Enough data, no more bench (either we got enough data from previous visits to this benched nest, or we just
442 //ran one bench and need to bail out now that our job is done). Just sleep instead
443 XBT_DEBUG("No benchmark (either no need, or just ran one): count >= iter (%d >= %d) or stderr<thres (%f<=%f)."
444 " apply the %fs delay instead", data->count, data->iters, data->relstderr, data->threshold, data->mean);
445 smpi_execute(data->mean);
446 smpi_process_set_sampling(0);
447 res = 0; // prepare to capture future, unrelated computations
/** Third step of a sampled block: stop the timer and fold the new measurement into the statistics of the block.
 *
 *  The mean and the relative standard error are recomputed from the running sums. While more benchmarks are
 *  still needed, `mean' temporarily holds the last sample so that smpi_sample_2() simulates exactly that duration.
 */
453 void smpi_sample_3(int global, const char *file, int line)
455 char *loc = sample_location(global, file, line);
458 xbt_assert(samples, "Y U NO use SMPI_SAMPLE_* macros? Stop messing directly with smpi_sample_* functions!");
459 data = static_cast<local_data_t *>(xbt_dict_get(samples, loc));
460 XBT_DEBUG("sample3 %s",loc);
// NOTE(review): what is done when the block is not being benchmarked is not visible in this excerpt.
463 if (data->benching==0) {
467 // ok, benchmarking this loop is over
468 xbt_os_threadtimer_stop(smpi_process_timer());
473 sample = xbt_os_timer_elapsed(smpi_process_timer());
475 data->sum_pow2 += sample * sample;
476 n = static_cast<double>(data->count);
477 data->mean = data->sum / n;
// relstderr = sqrt((E[x^2] - E[x]^2) / n) / mean
478 data->relstderr = sqrt((data->sum_pow2 / n - data->mean * data->mean) / n) / data->mean;
479 if (sample_enough_benchs(data)==0) {
480 data->mean = sample; // Still in benching process; We want sample_2 to simulate the exact time of this loop
481 // occurrence before leaving, not the mean over the history
483 XBT_DEBUG("Average mean after %d steps is %f, relative standard error is %f (sample was %f)", data->count,
484 data->mean, data->relstderr, sample);
486 // That's enough for now, prevent sample_2 from running the same code over and over
/** Allocate @p size bytes that are shared between all the calls coming from the same source location.
 *
 *  When option smpi/use-shared-malloc is set, the first call from a given (file, line) creates a POSIX shared
 *  memory object, maps it, and unlinks its name right away. Later calls from the same location map the same
 *  object again and increment its reference count. When the option is unset, this is a plain xbt_malloc().
 *
 *  @param size       amount of bytes to allocate
 *  @param file, line source location of the caller, used as the key of the allocation
 */
492 void *smpi_shared_malloc(size_t size, const char *file, int line)
495 if (xbt_cfg_get_boolean("smpi/use-shared-malloc")){
497 smpi_source_location loc(file, line);
// insert() leaves an already existing entry untouched; res.first points to the entry of `loc' in both cases.
498 auto res = allocs.insert(std::make_pair(loc, shared_data_t()));
499 auto data = res.first;
501 // First allocation from this location: the entry is new and needs a shared memory object.
// NOTE(review): this comment used to read "The insertion did not take place", which contradicts the code below
// (creation of the object, count set to 1); the guarding test itself is not visible here -- confirm.
502 // Generate a shared memory name from the address of the shared_data:
503 char shmname[32]; // cannot be longer than PSHMNAMLEN = 31 on Mac OS X (shm_open raises ENAMETOOLONG otherwise)
504 snprintf(shmname, 31, "/shmalloc%p", &*data);
// O_EXCL makes shm_open() fail if an object of that name already exists.
505 fd = shm_open(shmname, O_RDWR | O_CREAT | O_EXCL, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
508 xbt_die("Please cleanup /dev/shm/%s", shmname);
510 xbt_die("An unhandled error occurred while opening %s. shm_open: %s", shmname, strerror(errno));
512 data->second.fd = fd;
513 data->second.count = 1;
514 mem = shm_map(fd, size, &*data);
// The name is removed right after the mapping; the object is still reached through the fd kept in the entry.
515 if (shm_unlink(shmname) < 0) {
516 XBT_WARN("Could not early unlink %s. shm_unlink: %s", shmname, strerror(errno));
518 XBT_DEBUG("Mapping %s at %p through %d", shmname, mem, fd);
// Known location: map the existing object once more and count the new user.
520 mem = shm_map(data->second.fd, size, &*data);
521 data->second.count++;
523 XBT_DEBUG("Shared malloc %zu in %p (metadata at %p)", size, mem, &*data);
525 mem = xbt_malloc(size);
526 XBT_DEBUG("Classic malloc %zu in %p", size, mem);
/** Release a block obtained from smpi_shared_malloc().
 *
 *  When option smpi/use-shared-malloc is set, the mapping is looked up by address in `allocs_metadata' and
 *  unmapped; the `allocs' entry is erased once its reference count drops to zero or below. A pointer that was not
 *  shared-allocated only triggers a warning.
 */
532 void smpi_shared_free(void *ptr)
534 char loc[PTR_STRLEN];
536 if (xbt_cfg_get_boolean("smpi/use-shared-malloc")){
// NOTE(review): `loc' is formatted here but no use of it is visible in this function -- possibly dead code.
537 snprintf(loc, PTR_STRLEN, "%p", ptr);
538 auto meta = allocs_metadata.find(ptr);
539 if (meta == allocs_metadata.end()) {
540 XBT_WARN("Cannot free: %p was not shared-allocated by SMPI", ptr);
// Follow the pointer stored in the metadata to reach the fd and the reference count of the allocation.
543 shared_data_t* data = &meta->second.data->second;
544 if (munmap(ptr, meta->second.size) < 0) {
545 XBT_WARN("Unmapping of fd %d failed: %s", data->fd, strerror(errno));
548 XBT_DEBUG("Shared free - no removal - of %p, count = %d", ptr, data->count);
549 if (data->count <= 0) {
551 allocs.erase(allocs.find(meta->second.data->first));
552 XBT_DEBUG("Shared free - with removal - of %p", ptr);
555 XBT_DEBUG("Classic free of %p", ptr);
/** Tell whether an entry was registered in `calls' for the key "<func>:<input>".
 *  NOTE(review): the returned values are not visible in this excerpt. */
561 int smpi_shared_known_call(const char* func, const char* input)
563 char* loc = bprintf("%s:%s", func, input);
// The dictionary is created without any function to free its values.
568 calls = xbt_dict_new_homogeneous(NULL);
571 xbt_dict_get(calls, loc); /* Succeed or throw */
// Only the "not found" category is treated specially here; what is done with the other categories is not visible.
578 if (ex.category != not_found_error)
/** Look up the data registered in `calls' under the key "<func>:<input>", using xbt_dict_get(). */
585 void* smpi_shared_get_call(const char* func, const char* input) {
586 char* loc = bprintf("%s:%s", func, input);
// The dictionary is created without any function to free its values.
590 calls = xbt_dict_new_homogeneous(NULL);
592 data = xbt_dict_get(calls, loc);
/** Register @p data in `calls' under the key "<func>:<input>".
 *  NOTE(review): the returned value is not visible in this excerpt. */
597 void* smpi_shared_set_call(const char* func, const char* input, void* data) {
598 char* loc = bprintf("%s:%s", func, input);
// The dictionary is created without any function to free its values: it does not own what it stores.
601 calls = xbt_dict_new_homogeneous(NULL);
603 xbt_dict_set(calls, loc, data, NULL);
// Round an address down to a multiple of xbt_pagesize, by integer division followed by multiplication.
608 #define TOPAGE(addr) (void *)(((unsigned long)(addr) / xbt_pagesize) * xbt_pagesize)
610 /** Map a given SMPI privatization segment (make a SMPI process active) */
// The actual work is delegated to smpi_really_switch_data_segment().
// NOTE(review): the statement guarded by the test below is not visible here; presumably an early return when
// the segment of `dest' is already the one loaded -- confirm.
611 void smpi_switch_data_segment(int dest){
612 if (smpi_loaded_page==dest)//no need to switch either
616 smpi_really_switch_data_segment(dest);
619 /** Map a given SMPI privatization segment (make a SMPI process active) even if SMPI thinks it is already active
621 * When doing a state restoration, the state of the restored variables might not be consistent with the state of the
622 * virtual memory. In this case, we have to change the data segment.
// Maps the privatized copy of the data segment of process `dest' over the data segment of the executable, and
// records `dest' as the loaded page. Nothing of this is compiled unless HAVE_PRIVATIZATION is set.
624 void smpi_really_switch_data_segment(int dest) {
625 if(smpi_size_data_exe == 0)//no need to switch
628 #if HAVE_PRIVATIZATION
629 if(smpi_loaded_page==-1){//initial switch, do the copy from the real page here
// Seed the region of every process with the current content of the real data segment.
630 for (int i=0; i< smpi_process_count(); i++){
631 memcpy(smpi_privatisation_regions[i].address, TOPAGE(smpi_start_data_exe), smpi_size_data_exe);
635 // FIXME, cross-process support (mmap across process when necessary)
636 int current = smpi_privatisation_regions[dest].file_descriptor;
637 XBT_DEBUG("Switching data frame to the one of process %d", dest);
// MAP_FIXED requests the mapping at exactly the page-aligned start of the data segment.
638 void* tmp = mmap (TOPAGE(smpi_start_data_exe), smpi_size_data_exe,
639 PROT_READ | PROT_WRITE, MAP_FIXED | MAP_SHARED, current, 0);
640 if (tmp != TOPAGE(smpi_start_data_exe))
641 xbt_die("Couldn't map the new region");
642 smpi_loaded_page = dest;
/** Tell whether @p file is the path of one of the temporary files backing a privatization segment.
 *
 *  These files are created by smpi_initialize_global_memory_segments() from the template
 *  "/dev/shm/my-buffer-XXXXXX", so the test is a plain prefix match.
 *
 *  @param file NUL-terminated path to test (must not be NULL)
 *  @return 1 if @p file starts with "/dev/shm/my-buffer-", 0 otherwise
 */
int smpi_is_privatisation_file(char* file)
{
  // The length is derived from the literal itself, so that the two cannot get out of sync.
  static const char prefix[] = "/dev/shm/my-buffer-";
  return strncmp(prefix, file, sizeof(prefix) - 1) == 0;
}
/** Create one private copy of the data segment of the executable per simulated process.
 *
 *  Each copy is a shared mapping of an unlinked temporary file of smpi_size_data_exe bytes, initialized with the
 *  current content of the data segment. Its file descriptor and address are recorded in
 *  smpi_privatisation_regions for later switches. Every failure is fatal (xbt_die).
 */
651 void smpi_initialize_global_memory_segments(){
// Without privatization support, the feature is switched off and the simulation is aborted.
653 #if !HAVE_PRIVATIZATION
654 smpi_privatize_global_variables=false;
655 xbt_die("You are trying to use privatization on a system that does not support it. Don't.");
659 smpi_get_executable_global_size();
661 XBT_DEBUG ("bss+data segment found : size %d starting at %p", smpi_size_data_exe, smpi_start_data_exe );
663 if (smpi_size_data_exe == 0){//no need to switch
664 smpi_privatize_global_variables=false;
668 smpi_privatisation_regions =
669 static_cast<smpi_privatisation_region_t>( xbt_malloc(smpi_process_count() * sizeof(struct s_smpi_privatisation_region)));
671 for (int i=0; i< smpi_process_count(); i++){
672 //create SIMIX_process_count() mappings of this size with the same data inside
673 void *address = NULL;
// Template for mkstemp(), which replaces the trailing XXXXXX in place.
674 char path[] = "/dev/shm/my-buffer-XXXXXX";
677 int file_descriptor= mkstemp (path);
678 if (file_descriptor < 0) {
// NOTE(review): two different messages follow; the test choosing the long one is not visible here (presumably
// errno == EMFILE, as the text of the message suggests) -- confirm.
680 xbt_die("Impossible to create temporary file for memory mapping: %s\n\
681 The open() system call failed with the EMFILE error code (too many files). \n\n\
682 This means that you reached the system limits concerning the amount of files per process. \
683 This is not a surprise if you are trying to virtualize many processes on top of SMPI. \
684 Don't panic -- you should simply increase your system limits and try again. \n\n\
685 First, check what your limits are:\n\
686 cat /proc/sys/fs/file-max # Gives you the system-wide limit\n\
687 ulimit -Hn # Gives you the per process hard limit\n\
688 ulimit -Sn # Gives you the per process soft limit\n\
689 cat /proc/self/limits # Displays any per-process limitation (including the one given above)\n\n\
690 If one of these values is less than the amount of MPI processes that you try to run, then you got the explanation of this error. \
691 Ask the Internet about tutorials on how to increase the files limit such as: https://rtcamp.com/tutorials/linux/increase-open-files-limit/",
694 xbt_die("Impossible to create temporary file for memory mapping: %s",
// The name is removed right away; the file is only reached through its descriptor from now on.
698 status = unlink (path);
700 xbt_die("Impossible to unlink temporary file for memory mapping");
702 status = ftruncate(file_descriptor, smpi_size_data_exe);
704 xbt_die("Impossible to set the size of the temporary file for memory mapping");
706 /* Ask for a free region */
707 address = mmap (NULL, smpi_size_data_exe, PROT_READ | PROT_WRITE, MAP_SHARED, file_descriptor, 0);
708 if (address == MAP_FAILED)
709 xbt_die("Couldn't find a free region for memory mapping");
711 //initialize the values
712 memcpy(address, TOPAGE(smpi_start_data_exe), smpi_size_data_exe);
714 //store the address of the mapping for further switches
715 smpi_privatisation_regions[i].file_descriptor = file_descriptor;
716 smpi_privatisation_regions[i].address = address;
/** Undo smpi_initialize_global_memory_segments(): unmap the region of every process, close its file descriptor,
 *  then free the array of regions. A failing munmap() only triggers a warning. */
721 void smpi_destroy_global_memory_segments(){
722 if (smpi_size_data_exe == 0)//no need to switch
724 #if HAVE_PRIVATIZATION
726 for (i=0; i< smpi_process_count(); i++){
727 if(munmap(smpi_privatisation_regions[i].address, smpi_size_data_exe) < 0) {
728 XBT_WARN("Unmapping of fd %d failed: %s", smpi_privatisation_regions[i].file_descriptor, strerror(errno));
730 close(smpi_privatisation_regions[i].file_descriptor);
732 xbt_free(smpi_privatisation_regions);
736 extern "C" { /** These functions will be called from the user code **/
// Return the call location record of the calling process, as given by smpi_process_get_call_location().
737 smpi_trace_call_location_t* smpi_trace_get_call_location() {
738 return smpi_process_get_call_location();
// Record (file, line) as the current call location of the calling process, after saving the former location in
// the previous_* fields. Only the `file' pointer is stored, not a copy of the string.
741 void smpi_trace_set_call_location(const char* file, const int line) {
742 smpi_trace_call_location_t* loc = smpi_process_get_call_location();
744 loc->previous_filename = loc->filename;
745 loc->previous_linenumber = loc->linenumber;
746 loc->filename = file;
747 loc->linenumber = line;
751 * Required for Fortran bindings
// Same as smpi_trace_set_call_location(), with the line number passed by address.
753 void smpi_trace_set_call_location_(const char* file, int* line) {
754 smpi_trace_set_call_location(file, *line);
758 * Required for Fortran if -fsecond-underscore is activated
// Same as smpi_trace_set_call_location(), with the line number passed by address (name with two underscores).
760 void smpi_trace_set_call_location__(const char* file, int* line) {
761 smpi_trace_set_call_location(file, *line);