1 /* Copyright (c) 2007, 2009-2015. The SimGrid Team.
2 * All rights reserved. */
4 /* This program is free software; you can redistribute it and/or modify it
5 * under the terms of the license (GNU LGPL) which comes with this package. */
#include <unordered_map>
#include <functional>

#include <cstring>
#include <cerrno>
#include <math.h> // sqrt

#include <sys/types.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <unistd.h>

#include "src/internal_config.h"
#include "xbt/sysdep.h"
#include "surf/surf.h"
#include "simgrid/sg_config.h"
#include "simgrid/modelchecker.h"
#include "src/mc/mc_replay.h"
36 #define MAP_ANONYMOUS MAP_ANON
39 XBT_LOG_NEW_DEFAULT_SUBCATEGORY(smpi_bench, smpi,
40 "Logging specific to SMPI (benchmarking)");
/* Shared allocations are handled through shared memory segments.
 * Associated data and metadata are used as follows:
 *
 * Each source location that performs a shared allocation owns one entry in
 * the `allocs' map: a shared_data_t holding the file descriptor of the
 * shared memory segment backing the allocation and the count of processes
 * currently mapping it.
 *
 * Every live mapping additionally has an entry in the `allocs_metadata'
 * map, keyed by the mmap'ed address: a shared_metadata_t recording the size
 * of the mapping and a back-pointer to the owning `allocs' entry, so that
 * smpi_shared_free() can find and decrement the right reference count and
 * drop the `allocs' entry once the count reaches zero.
 */
73 #define PTR_STRLEN (2 + 2 * sizeof(void*) + 1)
75 xbt_dict_t samples = NULL; /* Allocated on first use */
76 xbt_dict_t calls = NULL; /* Allocated on first use */
78 double smpi_cpu_threshold;
79 double smpi_running_power;
81 int smpi_loaded_page = -1;
82 char* smpi_start_data_exe = NULL;
83 int smpi_size_data_exe = 0;
84 int smpi_privatize_global_variables;
85 double smpi_total_benched_time = 0;
86 smpi_privatisation_region_t smpi_privatisation_regions;
/** Some location in the source code
 *
 * This information is used by SMPI_SHARED_MALLOC to allocate
 * some shared memory for all simulated processes.
 */
class smpi_source_location {
public:
  smpi_source_location(const char* filename, int line)
    : filename(filename), filename_length(strlen(filename)), line(line) {}

  /** Pointer to a static string containing the file name */
  const char* filename = nullptr;
  int filename_length = 0;
  int line = 0;

  /** Two locations are equal when both file name and line match. */
  bool operator==(smpi_source_location const& that) const
  {
    return filename_length == that.filename_length
      && line == that.line
      && std::memcmp(filename, that.filename, filename_length) == 0;
  }
  bool operator!=(smpi_source_location const& that) const
  {
    return !(*this == that);
  }
};
122 class hash<smpi_source_location> {
124 typedef smpi_source_location argument_type;
125 typedef std::size_t result_type;
126 result_type operator()(smpi_source_location const& loc) const
128 return xbt_str_hash_ext(loc.filename, loc.filename_length)
129 ^ xbt_str_hash_ext((const char*) &loc.line, sizeof(loc.line));
142 std::unordered_map<smpi_source_location, shared_data_t> allocs;
143 typedef std::unordered_map<smpi_source_location, shared_data_t>::value_type shared_data_key_type;
147 shared_data_key_type* data;
150 std::unordered_map<void*, shared_metadata_t> allocs_metadata;
154 static size_t shm_size(int fd) {
157 if(fstat(fd, &st) < 0) {
158 xbt_die("Could not stat fd %d: %s", fd, strerror(errno));
160 return (size_t)st.st_size;
164 static void* shm_map(int fd, size_t size, shared_data_key_type* data) {
166 char loc[PTR_STRLEN];
167 shared_metadata_t meta;
169 if(size > shm_size(fd)) {
170 if(ftruncate(fd, (off_t)size) < 0) {
171 xbt_die("Could not truncate fd %d to %zu: %s", fd, size, strerror(errno));
175 mem = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
176 if(mem == MAP_FAILED) {
177 xbt_die("Could not map fd %d: %s", fd, strerror(errno));
179 snprintf(loc, PTR_STRLEN, "%p", mem);
182 allocs_metadata[mem] = meta;
183 XBT_DEBUG("MMAP %zu to %p", size, mem);
188 void smpi_bench_destroy(void)
191 allocs_metadata.clear();
192 xbt_dict_free(&samples);
193 xbt_dict_free(&calls);
196 extern "C" XBT_PUBLIC(void) smpi_execute_flops_(double *flops);
197 void smpi_execute_flops_(double *flops)
199 smpi_execute_flops(*flops);
202 extern "C" XBT_PUBLIC(void) smpi_execute_(double *duration);
203 void smpi_execute_(double *duration)
205 smpi_execute(*duration);
208 void smpi_execute_flops(double flops) {
209 smx_synchro_t action;
210 XBT_DEBUG("Handle real computation time: %f flops", flops);
211 action = simcall_execution_start("computation", flops, 1, 0, 0);
212 simcall_set_category (action, TRACE_internal_smpi_get_category());
213 simcall_execution_wait(action);
214 smpi_switch_data_segment(smpi_process_index());
217 void smpi_execute(double duration)
219 if (duration >= smpi_cpu_threshold) {
220 XBT_DEBUG("Sleep for %g to handle real computation time", duration);
221 double flops = duration * smpi_running_power;
222 int rank = smpi_process_index();
223 instr_extra_data extra = xbt_new0(s_instr_extra_data_t,1);
224 extra->type=TRACING_COMPUTING;
225 extra->comp_size=flops;
226 TRACE_smpi_computing_in(rank, extra);
227 smpi_execute_flops(flops);
229 TRACE_smpi_computing_out(rank);
232 XBT_DEBUG("Real computation took %g while option smpi/cpu_threshold is set to %g => ignore it",
233 duration, smpi_cpu_threshold);
237 void smpi_switch_data_segment(int dest);
239 void smpi_bench_begin(void)
241 if (smpi_privatize_global_variables) {
242 smpi_switch_data_segment(smpi_process_index());
245 if (MC_is_active() || MC_record_replay_is_active())
248 xbt_os_threadtimer_start(smpi_process_timer());
251 void smpi_bench_end(void)
254 if (MC_is_active() || MC_record_replay_is_active())
257 xbt_os_timer_t timer = smpi_process_timer();
258 xbt_os_threadtimer_stop(timer);
259 // smpi_switch_data_segment(smpi_process_count());
260 if (smpi_process_get_sampling()) {
261 XBT_CRITICAL("Cannot do recursive benchmarks.");
262 XBT_CRITICAL("Are you trying to make a call to MPI within a SMPI_SAMPLE_ block?");
263 xbt_backtrace_display_current();
264 xbt_die("Aborting.");
266 // Simulate the benchmarked computation unless disabled via command-line argument
267 if (sg_cfg_get_boolean("smpi/simulate_computation")) {
268 smpi_execute(xbt_os_timer_elapsed(timer));
271 smpi_total_benched_time += xbt_os_timer_elapsed(timer);
274 /* Private sleep function used by smpi_sleep() and smpi_usleep() */
275 static unsigned int private_sleep(double secs)
279 XBT_DEBUG("Sleep for: %lf secs", secs);
280 int rank = smpi_comm_rank(MPI_COMM_WORLD);
281 instr_extra_data extra = xbt_new0(s_instr_extra_data_t,1);
282 extra->type=TRACING_SLEEPING;
283 extra->sleep_duration=secs;
284 TRACE_smpi_sleeping_in(rank, extra);
286 simcall_process_sleep(secs);
288 TRACE_smpi_sleeping_out(rank);
294 unsigned int smpi_sleep(unsigned int secs)
296 return private_sleep((double)secs);
299 int smpi_usleep(useconds_t usecs)
301 return (int)private_sleep((double)usecs / 1000000.0);
305 int smpi_gettimeofday(struct timeval *tv, void* tz)
309 now = SIMIX_get_clock();
311 tv->tv_sec = (time_t)now;
313 tv->tv_usec = (useconds_t)((now - tv->tv_sec) * 1e6);
315 tv->tv_usec = (suseconds_t)((now - tv->tv_sec) * 1e6);
322 extern double sg_surf_precision;
323 unsigned long long smpi_rastro_resolution (void)
326 double resolution = (1/sg_surf_precision);
328 return (unsigned long long)resolution;
331 unsigned long long smpi_rastro_timestamp (void)
334 double now = SIMIX_get_clock();
336 unsigned long long sec = (unsigned long long)now;
337 unsigned long long pre = (now - sec) * smpi_rastro_resolution();
339 return (unsigned long long)sec * smpi_rastro_resolution() + pre;
342 /* ****************************** Functions related to the SMPI_SAMPLE_ macros ************************************/
/* Per-location statistics of a SMPI_SAMPLE_ block. */
typedef struct {
  double threshold; /* maximal stderr requested (if positive) */
  double relstderr; /* observed stderr so far */
  double mean;      /* mean of benched times, to be used if the block is disabled */
  double sum;       /* sum of benched times (to compute the mean and stderr) */
  double sum_pow2;  /* sum of the square of the benched times (to compute the stderr) */
  int iters;        /* amount of requested iterations */
  int count;        /* amount of iterations done so far */
  int benching;     /* 1: we are benchmarking; 0: we have enough data, no bench anymore */
} local_data_t;
354 static char *sample_location(int global, const char *file, int line) {
356 return bprintf("%s:%d", file, line);
358 return bprintf("%s:%d:%d", file, line, smpi_process_index());
361 static int sample_enough_benchs(local_data_t *data) {
362 int res = data->count >= data->iters;
363 if (data->threshold>0.0) {
365 res = 0; // not enough data
366 if (data->relstderr > data->threshold)
367 res = 0; // stderr too high yet
369 XBT_DEBUG("%s (count:%d iter:%d stderr:%f thres:%f mean:%fs)",
370 (res?"enough benchs":"need more data"),
371 data->count, data->iters, data->relstderr, data->threshold, data->mean);
375 void smpi_sample_1(int global, const char *file, int line, int iters, double threshold)
377 char *loc = sample_location(global, file, line);
380 smpi_bench_end(); /* Take time from previous, unrelated computation into account */
381 smpi_process_set_sampling(1);
384 samples = xbt_dict_new_homogeneous(free);
386 data = static_cast<local_data_t *>(xbt_dict_get_or_null(samples, loc));
388 xbt_assert(threshold>0 || iters>0,
389 "You should provide either a positive amount of iterations to bench, or a positive maximal stderr (or both)");
390 data = (local_data_t *) xbt_new(local_data_t, 1);
393 data->sum_pow2 = 0.0;
395 data->threshold = threshold;
396 data->benching = 1; // If we have no data, we need at least one
398 xbt_dict_set(samples, loc, data, NULL);
399 XBT_DEBUG("XXXXX First time ever on benched nest %s.",loc);
401 if (data->iters != iters || data->threshold != threshold) {
402 XBT_ERROR("Asked to bench block %s with different settings %d, %f is not %d, %f. How did you manage to give two numbers at the same line??",
403 loc, data->iters, data->threshold, iters,threshold);
407 // if we already have some data, check whether sample_2 should get one more bench or whether it should emulate the computation instead
408 data->benching = !sample_enough_benchs(data);
409 XBT_DEBUG("XXXX Re-entering the benched nest %s. %s",loc, (data->benching?"more benching needed":"we have enough data, skip computes"));
414 int smpi_sample_2(int global, const char *file, int line)
416 char *loc = sample_location(global, file, line);
420 xbt_assert(samples, "Y U NO use SMPI_SAMPLE_* macros? Stop messing directly with smpi_sample_* functions!");
421 data = static_cast<local_data_t *>(xbt_dict_get(samples, loc));
422 XBT_DEBUG("sample2 %s",loc);
425 if (data->benching==1) {
426 // we need to run a new bench
427 XBT_DEBUG("benchmarking: count:%d iter:%d stderr:%f thres:%f; mean:%f",
428 data->count, data->iters, data->relstderr, data->threshold, data->mean);
431 // Enough data, no more bench (either we got enough data from previous visits to this benched nest, or we just ran one bench and need to bail out now that our job is done).
432 // Just sleep instead
433 XBT_DEBUG("No benchmark (either no need, or just ran one): count >= iter (%d >= %d) or stderr<thres (%f<=%f). apply the %fs delay instead",
434 data->count, data->iters, data->relstderr, data->threshold, data->mean);
435 smpi_execute(data->mean);
436 smpi_process_set_sampling(0);
437 res = 0; // prepare to capture future, unrelated computations
444 void smpi_sample_3(int global, const char *file, int line)
446 char *loc = sample_location(global, file, line);
449 xbt_assert(samples, "Y U NO use SMPI_SAMPLE_* macros? Stop messing directly with smpi_sample_* functions!");
450 data = static_cast<local_data_t *>(xbt_dict_get(samples, loc));
451 XBT_DEBUG("sample3 %s",loc);
454 if (data->benching==0) {
458 // ok, benchmarking this loop is over
459 xbt_os_threadtimer_stop(smpi_process_timer());
464 sample = xbt_os_timer_elapsed(smpi_process_timer());
466 data->sum_pow2 += sample * sample;
467 n = (double)data->count;
468 data->mean = data->sum / n;
469 data->relstderr = sqrt((data->sum_pow2 / n - data->mean * data->mean) / n) / data->mean;
470 if (!sample_enough_benchs(data)) {
471 data->mean = sample; // Still in benching process; We want sample_2 to simulate the exact time of this loop occurrence before leaving, not the mean over the history
473 XBT_DEBUG("Average mean after %d steps is %f, relative standard error is %f (sample was %f)", data->count,
474 data->mean, data->relstderr, sample);
476 // That's enough for now, prevent sample_2 to run the same code over and over
482 void *smpi_shared_malloc(size_t size, const char *file, int line)
485 if (sg_cfg_get_boolean("smpi/use_shared_malloc")){
487 smpi_source_location loc(file, line);
488 auto res = allocs.insert(std::make_pair(loc, shared_data_t()));
489 auto data = res.first;
491 // The insertion did not take place.
492 // Generate a shared memory name from the address of the shared_data:
494 sprintf(shmname, "smpi_shared_malloc_%p", &*data);
495 fd = shm_open(shmname, O_RDWR | O_CREAT | O_EXCL,
496 S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
500 xbt_die("Please cleanup /dev/shm/%s", shmname);
502 xbt_die("An unhandled error occured while opening %s. shm_open: %s", shmname, strerror(errno));
505 data->second.fd = fd;
506 data->second.count = 1;
507 mem = shm_map(fd, size, &*data);
508 if (shm_unlink(shmname) < 0) {
509 XBT_WARN("Could not early unlink %s. shm_unlink: %s", shmname, strerror(errno));
511 XBT_DEBUG("Mapping %s at %p through %d", shmname, mem, fd);
513 mem = shm_map(data->second.fd, size, &*data);
514 data->second.count++;
516 XBT_DEBUG("Shared malloc %zu in %p (metadata at %p)", size, mem, &*data);
518 mem = xbt_malloc(size);
519 XBT_DEBUG("Classic malloc %zu in %p", size, mem);
525 void smpi_shared_free(void *ptr)
527 char loc[PTR_STRLEN];
529 if (sg_cfg_get_boolean("smpi/use_shared_malloc")){
530 snprintf(loc, PTR_STRLEN, "%p", ptr);
531 auto meta = allocs_metadata.find(ptr);
532 if (meta == allocs_metadata.end()) {
533 XBT_WARN("Cannot free: %p was not shared-allocated by SMPI", ptr);
536 shared_data_t* data = &meta->second.data->second;
537 if (munmap(ptr, meta->second.size) < 0) {
538 XBT_WARN("Unmapping of fd %d failed: %s", data->fd, strerror(errno));
541 XBT_DEBUG("Shared free - no removal - of %p, count = %d", ptr, data->count);
542 if (data->count <= 0) {
544 allocs.erase(allocs.find(meta->second.data->first));
545 XBT_DEBUG("Shared free - with removal - of %p", ptr);
548 XBT_DEBUG("Classic free of %p", ptr);
554 int smpi_shared_known_call(const char* func, const char* input)
556 char* loc = bprintf("%s:%s", func, input);
561 calls = xbt_dict_new_homogeneous(NULL);
564 xbt_dict_get(calls, loc); /* Succeed or throw */
571 if (ex.category != not_found_error)
578 void* smpi_shared_get_call(const char* func, const char* input) {
579 char* loc = bprintf("%s:%s", func, input);
583 calls = xbt_dict_new_homogeneous(NULL);
585 data = xbt_dict_get(calls, loc);
590 void* smpi_shared_set_call(const char* func, const char* input, void* data) {
591 char* loc = bprintf("%s:%s", func, input);
594 calls = xbt_dict_new_homogeneous(NULL);
596 xbt_dict_set(calls, loc, data, NULL);
604 #define TOPAGE(addr) (void *)(((unsigned long)(addr) / xbt_pagesize) * xbt_pagesize)
607 /** Map a given SMPI privatization segment (make a SMPI process active)
609 void smpi_switch_data_segment(int dest){
611 if (smpi_loaded_page==dest)//no need to switch either
615 smpi_really_switch_data_segment(dest);
618 /** Map a given SMPI privatization segment (make a SMPI process active)
619 * even if SMPI thinks it is already active
621 * When doing a state restoration, the state of the restored variables
622 * might not be consistent with the state of the virtual memory.
623 * In this case, we to change the data segment.
625 void smpi_really_switch_data_segment(int dest) {
627 if(smpi_size_data_exe == 0)//no need to switch
630 #ifdef HAVE_PRIVATIZATION
632 if(smpi_loaded_page==-1){//initial switch, do the copy from the real page here
633 for (i=0; i< smpi_process_count(); i++){
634 memcpy(smpi_privatisation_regions[i].address,
635 TOPAGE(smpi_start_data_exe), smpi_size_data_exe);
639 // FIXME, cross-process support (mmap across process when necessary)
640 int current = smpi_privatisation_regions[dest].file_descriptor;
641 XBT_DEBUG("Switching data frame to the one of process %d", dest);
642 void* tmp = mmap (TOPAGE(smpi_start_data_exe), smpi_size_data_exe,
643 PROT_READ | PROT_WRITE, MAP_FIXED | MAP_SHARED, current, 0);
644 if (tmp != TOPAGE(smpi_start_data_exe))
645 xbt_die("Couldn't map the new region");
646 smpi_loaded_page = dest;
/** Tell whether `file' is one of our privatization backing files in /dev/shm. */
int smpi_is_privatisation_file(char* file)
{
  return strncmp("/dev/shm/my-buffer-", file, 19) == 0;
}
655 void smpi_initialize_global_memory_segments(){
657 #ifndef HAVE_PRIVATIZATION
658 smpi_privatize_global_variables=0;
659 xbt_die("You are trying to use privatization on a system that does not support it. Don't.");
664 smpi_get_executable_global_size();
666 XBT_DEBUG ("bss+data segment found : size %d starting at %p",
667 smpi_size_data_exe, smpi_start_data_exe );
669 if (smpi_size_data_exe == 0){//no need to switch
670 smpi_privatize_global_variables=0;
674 smpi_privatisation_regions = (smpi_privatisation_region_t) malloc(
675 smpi_process_count() * sizeof(struct s_smpi_privatisation_region));
677 for (i=0; i< smpi_process_count(); i++){
678 //create SIMIX_process_count() mappings of this size with the same data inside
679 void *address = NULL;
680 char path[] = "/dev/shm/my-buffer-XXXXXX";
683 int file_descriptor= mkstemp (path);
684 if (file_descriptor < 0) {
686 xbt_die("Impossible to create temporary file for memory mapping: %s\n\
687 The open() system call failed with the EMFILE error code (too many files). \n\n\
688 This means that you reached the system limits concerning the amount of files per process. \
689 This is not a surprise if you are trying to virtualize many processes on top of SMPI. \
690 Don't panic -- you should simply increase your system limits and try again. \n\n\
691 First, check what your limits are:\n\
692 cat /proc/sys/fs/file-max # Gives you the system-wide limit\n\
693 ulimit -Hn # Gives you the per process hard limit\n\
694 ulimit -Sn # Gives you the per process soft limit\n\
695 cat /proc/self/limits # Displays any per-process limitation (including the one given above)\n\n\
696 If one of these values is less than the amount of MPI processes that you try to run, then you got the explanation of this error. \
697 Ask the Internet about tutorials on how to increase the files limit such as: https://rtcamp.com/tutorials/linux/increase-open-files-limit/",
700 xbt_die("Impossible to create temporary file for memory mapping: %s",
704 status = unlink (path);
706 xbt_die("Impossible to unlink temporary file for memory mapping");
708 status = ftruncate(file_descriptor, smpi_size_data_exe);
710 xbt_die("Impossible to set the size of the temporary file for memory mapping");
712 /* Ask for a free region */
713 address = mmap (NULL, smpi_size_data_exe, PROT_READ | PROT_WRITE, MAP_SHARED, file_descriptor, 0);
714 if (address == MAP_FAILED)
715 xbt_die("Couldn't find a free region for memory mapping");
717 //initialize the values
718 memcpy(address, TOPAGE(smpi_start_data_exe), smpi_size_data_exe);
720 //store the address of the mapping for further switches
721 smpi_privatisation_regions[i].file_descriptor = file_descriptor;
722 smpi_privatisation_regions[i].address = address;
729 void smpi_destroy_global_memory_segments(){
730 if (smpi_size_data_exe == 0)//no need to switch
732 #ifdef HAVE_PRIVATIZATION
734 for (i=0; i< smpi_process_count(); i++){
735 if(munmap(smpi_privatisation_regions[i].address, smpi_size_data_exe) < 0) {
736 XBT_WARN("Unmapping of fd %d failed: %s",
737 smpi_privatisation_regions[i].file_descriptor, strerror(errno));
739 close(smpi_privatisation_regions[i].file_descriptor);
741 xbt_free(smpi_privatisation_regions);