1 /* Copyright (c) 2007, 2009-2015. The SimGrid Team.
2 * All rights reserved. */
4 /* This program is free software; you can redistribute it and/or modify it
5 * under the terms of the license (GNU LGPL) which comes with this package. */
9 #include <unordered_map>
12 #include "src/internal_config.h"
15 #include "xbt/sysdep.h"
17 #include "surf/surf.h"
18 #include "simgrid/sg_config.h"
19 #include "simgrid/modelchecker.h"
20 #include "src/mc/mc_replay.h"
26 #include <sys/types.h>
29 #include <math.h> // sqrt
// Alias MAP_ANONYMOUS to MAP_ANON.
// NOTE(review): presumably wrapped in a platform guard that is not visible in this listing -- confirm.
35 #define MAP_ANONYMOUS MAP_ANON
// Declare the "smpi_bench" logging sub-category of "smpi"; the XBT_DEBUG/XBT_WARN/... calls of this file log to it.
38 XBT_LOG_NEW_DEFAULT_SUBCATEGORY(smpi_bench, smpi, "Logging specific to SMPI (benchmarking)");
40 /* Shared allocations are handled through shared memory segments.
41 * Associated data and metadata are used as follows:
44 * `allocs' dict ---- -.
45 * ---------- shared_data_t shared_metadata_t / | | |
46 * .->| <name> | ---> -------------------- <--. ----------------- | | | |
47 * | ---------- | fd of <name> | | | size of mmap | --| | | |
48 * | | count (2) | |-- | data | \ | | |
49 * `----------------- | <name> | | ----------------- ---- |
50 * -------------------- | ^ |
52 * | | `allocs_metadata' dict |
53 * | | ---------------------- |
54 * | `-- | <addr of mmap #1> |<-'
55 * | .-- | <addr of mmap #2> |<-.
56 * | | ---------------------- |
62 * | shared_metadata_t / | |
63 * | ----------------- | | |
64 * | | size of mmap | --| | |
66 * ----------------- | | |
// Size of a buffer receiving a pointer printed with "%p": two characters of prefix, two hexadecimal digits per
// byte of the pointer, and the terminating NUL.
71 #define PTR_STRLEN (2 + 2 * sizeof(void*) + 1)
// Per-location benchmark statistics used by the smpi_sample_* functions, keyed by the string built in sample_location().
73 xbt_dict_t samples = NULL; /* Allocated on first use */
// Values recorded by smpi_shared_set_call(), keyed by "<func>:<input>".
74 xbt_dict_t calls = NULL; /* Allocated on first use */
// Minimal duration for smpi_execute() to simulate a computation; shorter durations are ignored.
76 double smpi_cpu_threshold;
// Factor by which smpi_execute() multiplies a duration to obtain an amount of flops.
77 double smpi_running_power;
// Index of the privatization segment currently mapped; -1 until the first switch has been done.
79 int smpi_loaded_page = -1;
// Start address and size of the executable's data+bss segment (see the debug message in
// smpi_initialize_global_memory_segments()); a size of 0 disables every segment switch.
80 char* smpi_start_data_exe = NULL;
81 int smpi_size_data_exe = 0;
// Non-zero when global variables are privatized; smpi_bench_begin() switches the data segment only in that case.
82 int smpi_privatize_global_variables;
// Sum of the timer values measured by smpi_bench_end().
83 double smpi_total_benched_time = 0;
// Array with one entry (file descriptor + mapped address) per simulated process, allocated in
// smpi_initialize_global_memory_segments().
84 smpi_privatisation_region_t smpi_privatisation_regions;
88 /** Some location in the source code
90 * This information is used by SMPI_SHARED_MALLOC to allocate some shared memory for all simulated processes.
// Value type identifying a (file name, line) pair; it is the key type of the `allocs` map.
// NOTE(review): the access specifier, the declaration of the `line` member and the braces are not part of the
// visible lines; `line` is nevertheless initialized by the constructor below.
92 class smpi_source_location {
// Store the file name pointer (no copy is made), cache its length and remember the line number.
94 smpi_source_location(const char* filename, int line)
95 : filename(filename), filename_length(strlen(filename)), line(line) {}
97 /** Pointer to a static string containing the file name */
98 const char* filename = nullptr;
// Length of `filename` without the terminating NUL, computed once by the constructor.
99 int filename_length = 0;
// Equality compares the cached lengths first, then the file name bytes.
// NOTE(review): one operand of this && chain falls on a line that is not visible -- presumably the line number
// comparison; confirm.
102 bool operator==(smpi_source_location const& that) const
104 return filename_length == that.filename_length
106 && std::memcmp(filename, that.filename, filename_length) == 0;
// Inequality is defined as the negation of operator==.
108 bool operator!=(smpi_source_location const& that) const
110 return !(*this == that);
// Hash functor for smpi_source_location, so that it can be used as an std::unordered_map key.
// NOTE(review): the enclosing namespace/template header and the braces are not part of the visible lines.
119 class hash<smpi_source_location> {
121 typedef smpi_source_location argument_type;
122 typedef std::size_t result_type;
// Combine, with XOR, the hash of the file name bytes and the hash of the raw bytes of the line number.
123 result_type operator()(smpi_source_location const& loc) const
125 return xbt_str_hash_ext(loc.filename, loc.filename_length)
126 ^ xbt_str_hash_ext((const char*) &loc.line, sizeof(loc.line));
// Shared allocations, one entry per source location calling smpi_shared_malloc(); the mapped value holds the file
// descriptor and the reference count of the shared segment.
139 std::unordered_map<smpi_source_location, shared_data_t> allocs;
// Type of the entries (key + value) of `allocs`.
140 typedef std::unordered_map<smpi_source_location, shared_data_t>::value_type shared_data_key_type;
// NOTE(review): the next line looks like a field of shared_metadata_t (smpi_shared_free() reads `data` and `size`
// from it); the enclosing declaration is not visible in this listing. It points back to the `allocs` entry.
144 shared_data_key_type* data;
// Metadata of every live mapping, keyed by the address returned by mmap().
147 std::unordered_map<void*, shared_metadata_t> allocs_metadata;
// Return the size in bytes, as reported by fstat(), of the object behind file descriptor `fd`.
// Aborts through xbt_die() when fstat() fails.
// NOTE(review): the declaration of `st` and the closing braces are not part of the visible lines.
151 static size_t shm_size(int fd) {
154 if(fstat(fd, &st) < 0) {
155 xbt_die("Could not stat fd %d: %s", fd, strerror(errno));
157 return (size_t)st.st_size;
// Map `size` bytes of the object behind `fd` as shared read/write memory and record the mapping in
// `allocs_metadata`. Any failure (ftruncate, mmap) aborts through xbt_die().
// NOTE(review): the declaration of `mem`, the initialization of `meta`, the return statement and the closing
// braces are not part of the visible lines.
161 static void* shm_map(int fd, size_t size, shared_data_key_type* data) {
163 char loc[PTR_STRLEN];
164 shared_metadata_t meta;
// Grow the object when it is smaller than the requested size; it is never shrunk here.
166 if(size > shm_size(fd)) {
167 if(ftruncate(fd, (off_t)size) < 0) {
168 xbt_die("Could not truncate fd %d to %zu: %s", fd, size, strerror(errno));
// Let the kernel choose the address; the mapping is shared and starts at offset 0 of the object.
172 mem = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
173 if(mem == MAP_FAILED) {
174 xbt_die("Could not map fd %d: %s", fd, strerror(errno));
// `loc` receives the textual form of the address; it is not read again in the visible lines.
176 snprintf(loc, PTR_STRLEN, "%p", mem);
// Register the mapping under its address so that smpi_shared_free() can find it again.
179 allocs_metadata[mem] = meta;
180 XBT_DEBUG("MMAP %zu to %p", size, mem);
// Release the bookkeeping structures of this file: forget every recorded mapping and free the `samples` and
// `calls` dictionaries.
// NOTE(review): the braces and possibly other statements are not part of the visible lines.
185 void smpi_bench_destroy(void)
188 allocs_metadata.clear();
189 xbt_dict_free(&samples);
190 xbt_dict_free(&calls);
// C-linkage wrapper around smpi_execute_flops() taking its argument by pointer.
// NOTE(review): the trailing underscore and the by-pointer argument suggest a Fortran binding -- confirm.
193 extern "C" XBT_PUBLIC(void) smpi_execute_flops_(double *flops);
194 void smpi_execute_flops_(double *flops)
196 smpi_execute_flops(*flops);
// C-linkage wrapper around smpi_execute() taking its argument by pointer.
// NOTE(review): the trailing underscore and the by-pointer argument suggest a Fortran binding -- confirm.
199 extern "C" XBT_PUBLIC(void) smpi_execute_(double *duration);
200 void smpi_execute_(double *duration)
202 smpi_execute(*duration);
// Simulate the execution of `flops` floating-point operations by the calling process and wait for its completion.
205 void smpi_execute_flops(double flops) {
206 smx_synchro_t action;
207 XBT_DEBUG("Handle real computation time: %f flops", flops);
// Start an execution named "computation", tag it with the current SMPI tracing category, then block until it ends.
208 action = simcall_execution_start("computation", flops, 1, 0, 0);
209 simcall_set_category (action, TRACE_internal_smpi_get_category());
210 simcall_execution_wait(action);
// Make sure the data segment of the calling process is the one mapped once the wait returns.
211 smpi_switch_data_segment(smpi_process_index());
// Simulate a computation that took `duration` (in the unit of smpi_cpu_threshold), unless it is shorter than the
// threshold, in which case it is only logged.
// NOTE(review): the braces and the `else` keyword are not part of the visible lines.
214 void smpi_execute(double duration)
216 if (duration >= smpi_cpu_threshold) {
217 XBT_DEBUG("Sleep for %g to handle real computation time", duration);
// Convert the duration into an amount of flops using the configured power.
218 double flops = duration * smpi_running_power;
219 int rank = smpi_process_index();
// Describe the computation for the tracing layer; the record is zero-initialized by xbt_new0.
220 instr_extra_data extra = xbt_new0(s_instr_extra_data_t,1);
221 extra->type=TRACING_COMPUTING;
222 extra->comp_size=flops;
// Bracket the simulated execution with the tracing in/out events of this rank.
223 TRACE_smpi_computing_in(rank, extra);
224 smpi_execute_flops(flops);
226 TRACE_smpi_computing_out(rank);
229 XBT_DEBUG("Real computation took %g while option smpi/cpu_threshold is set to %g => ignore it",
230 duration, smpi_cpu_threshold);
// Forward declaration: the function is defined further down in this file.
234 void smpi_switch_data_segment(int dest);
// Start measuring the computation time of the calling process.
// NOTE(review): the braces and the statement guarded by the model-checker test are not part of the visible lines.
236 void smpi_bench_begin(void)
// With privatization enabled, map the data segment of the calling process first.
238 if (smpi_privatize_global_variables) {
239 smpi_switch_data_segment(smpi_process_index());
// NOTE(review): presumably returns early (no timing) under the model checker or record/replay -- confirm.
242 if (MC_is_active() || MC_record_replay_is_active())
245 xbt_os_threadtimer_start(smpi_process_timer());
// Stop the timer of the calling process and account for the measured computation.
// NOTE(review): the braces and the statement guarded by the model-checker test are not part of the visible lines.
248 void smpi_bench_end(void)
// NOTE(review): presumably returns early under the model checker or record/replay -- confirm.
251 if (MC_is_active() || MC_record_replay_is_active())
254 xbt_os_timer_t timer = smpi_process_timer();
255 xbt_os_threadtimer_stop(timer);
256 // smpi_switch_data_segment(smpi_process_count());
// Reaching this point while the process is flagged as sampling is reported as a fatal error.
257 if (smpi_process_get_sampling()) {
258 XBT_CRITICAL("Cannot do recursive benchmarks.");
259 XBT_CRITICAL("Are you trying to make a call to MPI within a SMPI_SAMPLE_ block?");
260 xbt_backtrace_display_current();
261 xbt_die("Aborting.");
263 // Simulate the benchmarked computation unless disabled via command-line argument
264 if (xbt_cfg_get_boolean("smpi/simulate-computation"))
265 smpi_execute(xbt_os_timer_elapsed(timer));
// The measured time is accumulated whether or not it was simulated.
267 smpi_total_benched_time += xbt_os_timer_elapsed(timer);
270 /* Private sleep function used by smpi_sleep() and smpi_usleep() */
// Make the calling process sleep for `secs` simulated seconds, bracketed by the sleeping tracing events.
// NOTE(review): the braces, the return statement and possibly other statements are not part of the visible lines.
271 static unsigned int private_sleep(double secs)
275 XBT_DEBUG("Sleep for: %lf secs", secs);
276 int rank = smpi_comm_rank(MPI_COMM_WORLD);
// Describe the sleep for the tracing layer; the record is zero-initialized by xbt_new0.
277 instr_extra_data extra = xbt_new0(s_instr_extra_data_t,1);
278 extra->type=TRACING_SLEEPING;
279 extra->sleep_duration=secs;
280 TRACE_smpi_sleeping_in(rank, extra);
282 simcall_process_sleep(secs);
284 TRACE_smpi_sleeping_out(rank);
// Sleep for `secs` simulated seconds; forwards to private_sleep() and returns its result.
290 unsigned int smpi_sleep(unsigned int secs)
292 return private_sleep((double)secs);
// Sleep for `usecs` simulated microseconds: the value is converted to seconds before being forwarded to
// private_sleep(), whose result is returned as an int.
295 int smpi_usleep(useconds_t usecs)
297 return (int)private_sleep((double)usecs / 1000000.0);
// Fill `tv` from the simulated clock (SIMIX_get_clock()) instead of the wall clock; `tz` is not used in the visible
// lines.
// NOTE(review): the declaration of `now`, the return statement and the braces are not part of the visible lines.
300 int smpi_gettimeofday(struct timeval *tv, void* tz)
304 now = SIMIX_get_clock();
// Whole seconds of the simulated date.
306 tv->tv_sec = (time_t)now;
// Fractional part converted to microseconds.
// NOTE(review): the two assignments below differ only by the cast type; presumably a preprocessor conditional that
// is not visible here selects one of them -- confirm.
308 tv->tv_usec = (useconds_t)((now - tv->tv_sec) * 1e6);
310 tv->tv_usec = (suseconds_t)((now - tv->tv_sec) * 1e6);
// Defined outside of this file.
317 extern double sg_surf_precision;
// Return the number of timestamp ticks per simulated second, i.e. the inverse of sg_surf_precision truncated to an
// integer.
// NOTE(review): the braces and possibly other statements are not part of the visible lines.
318 unsigned long long smpi_rastro_resolution (void)
321 double resolution = (1/sg_surf_precision);
323 return (unsigned long long)resolution;
// Return the simulated date expressed in ticks of smpi_rastro_resolution().
// NOTE(review): the braces and possibly other statements are not part of the visible lines.
326 unsigned long long smpi_rastro_timestamp (void)
329 double now = SIMIX_get_clock();
// Split the date into whole seconds and a fractional part scaled to ticks (truncated by the conversion).
331 unsigned long long sec = (unsigned long long)now;
332 unsigned long long pre = (now - sec) * smpi_rastro_resolution();
334 return (unsigned long long)sec * smpi_rastro_resolution() + pre;
337 /* ****************************** Functions related to the SMPI_SAMPLE_ macros ************************************/
// Fields of the per-location benchmark record stored in the `samples` dictionary (accessed through local_data_t*
// in the smpi_sample_* functions).
// NOTE(review): the enclosing struct/typedef lines are not part of the visible lines.
339 double threshold; /* maximal stderr requested (if positive) */
340 double relstderr; /* observed stderr so far */
341 double mean; /* mean of benched times, to be used if the block is disabled */
342 double sum; /* sum of benched times (to compute the mean and stderr) */
343 double sum_pow2; /* sum of the square of the benched times (to compute the stderr) */
344 int iters; /* amount of requested iterations */
345 int count; /* amount of iterations done so far */
346 int benching; /* 1: we are benchmarking; 0: we have enough data, no bench anymore */
// Build the key under which a benchmarked block is stored in `samples`: either "<file>:<line>" or
// "<file>:<line>:<process index>". The string is allocated by bprintf(); callers in this file free it.
// NOTE(review): the condition choosing between the two forms is not part of the visible lines; presumably
// `global` selects the form without the process index -- confirm.
349 static char *sample_location(int global, const char *file, int line) {
351 return bprintf("%s:%d", file, line);
353 return bprintf("%s:%d:%d", file, line, smpi_process_index());
// Tell whether enough benchmark runs were collected for `data` (non-zero) or whether more are needed (0).
// NOTE(review): one condition, the return statement and the braces are not part of the visible lines.
357 static int sample_enough_benchs(local_data_t *data) {
// Base criterion: the requested number of iterations has been reached.
358 int res = data->count >= data->iters;
// When a threshold is requested, the observed relative standard error must additionally not exceed it.
359 if (data->threshold>0.0) {
361 res = 0; // not enough data
362 if (data->relstderr > data->threshold)
363 res = 0; // stderr too high yet
365 XBT_DEBUG("%s (count:%d iter:%d stderr:%f thres:%f mean:%fs)",
366 (res?"enough benchs":"need more data"), data->count, data->iters, data->relstderr, data->threshold, data->mean);
// First step of a sampled block: close the current benchmark, flag the process as sampling, and create or update
// the record of this location in `samples`.
// NOTE(review): several statements (declaration of `data`, the tests selecting each branch, part of the record
// initialization, the release of `loc`) and the braces are not part of the visible lines.
370 void smpi_sample_1(int global, const char *file, int line, int iters, double threshold)
372 char *loc = sample_location(global, file, line);
375 smpi_bench_end(); /* Take time from previous, unrelated computation into account */
376 smpi_process_set_sampling(1);
// The dictionary owns its values and releases them with free().
379 samples = xbt_dict_new_homogeneous(free);
381 data = static_cast<local_data_t *>(xbt_dict_get_or_null(samples, loc));
// Creation of a new record: at least one stopping criterion is required.
383 xbt_assert(threshold>0 || iters>0,
384 "You should provide either a positive amount of iterations to bench, or a positive maximal stderr (or both)");
385 data = (local_data_t *) xbt_new(local_data_t, 1);
388 data->sum_pow2 = 0.0;
390 data->threshold = threshold;
391 data->benching = 1; // If we have no data, we need at least one
393 xbt_dict_set(samples, loc, data, NULL);
394 XBT_DEBUG("XXXXX First time ever on benched nest %s.",loc);
// Existing record: the settings given now must match the ones it was created with.
396 if (data->iters != iters || data->threshold != threshold) {
397 XBT_ERROR("Asked to bench block %s with different settings %d, %f is not %d, %f. "
398 "How did you manage to give two numbers at the same line??",
399 loc, data->iters, data->threshold, iters,threshold);
403 // if we already have some data, check whether sample_2 should get one more bench or whether it should emulate
404 // the computation instead
405 data->benching = !sample_enough_benchs(data);
406 XBT_DEBUG("XXXX Re-entering the benched nest %s. %s",loc,
407 (data->benching?"more benching needed":"we have enough data, skip computes"));
// Second step of a sampled block: decide whether the block must really be executed (benchmark) or whether the
// recorded time is simulated instead.
// NOTE(review): the declarations of `data` and `res`, the statements of the benchmarking branch other than the log,
// the `else`, the release of `loc`, the return statement and the braces are not part of the visible lines.
412 int smpi_sample_2(int global, const char *file, int line)
414 char *loc = sample_location(global, file, line);
418 xbt_assert(samples, "Y U NO use SMPI_SAMPLE_* macros? Stop messing directly with smpi_sample_* functions!");
419 data = static_cast<local_data_t *>(xbt_dict_get(samples, loc));
420 XBT_DEBUG("sample2 %s",loc);
423 if (data->benching==1) {
424 // we need to run a new bench
425 XBT_DEBUG("benchmarking: count:%d iter:%d stderr:%f thres:%f; mean:%f",
426 data->count, data->iters, data->relstderr, data->threshold, data->mean);
429 // Enough data, no more bench (either we got enough data from previous visits to this benched nest, or we just
430 //ran one bench and need to bail out now that our job is done). Just sleep instead
431 XBT_DEBUG("No benchmark (either no need, or just ran one): count >= iter (%d >= %d) or stderr<thres (%f<=%f)."
432 " apply the %fs delay instead", data->count, data->iters, data->relstderr, data->threshold, data->mean);
// Simulate the recorded time and leave the sampling mode.
433 smpi_execute(data->mean);
434 smpi_process_set_sampling(0);
435 res = 0; // prepare to capture future, unrelated computations
// Third step of a sampled block: stop the timer and fold the measured time into the statistics of this location.
// NOTE(review): the declarations of `data`, `sample` and `n`, the body of the `benching==0` test, the updates of
// `count`, `sum` and `benching`, the release of `loc` and the braces are not part of the visible lines.
441 void smpi_sample_3(int global, const char *file, int line)
443 char *loc = sample_location(global, file, line);
446 xbt_assert(samples, "Y U NO use SMPI_SAMPLE_* macros? Stop messing directly with smpi_sample_* functions!");
447 data = static_cast<local_data_t *>(xbt_dict_get(samples, loc));
448 XBT_DEBUG("sample3 %s",loc);
451 if (data->benching==0) {
455 // ok, benchmarking this loop is over
456 xbt_os_threadtimer_stop(smpi_process_timer());
461 sample = xbt_os_timer_elapsed(smpi_process_timer());
463 data->sum_pow2 += sample * sample;
464 n = (double)data->count;
465 data->mean = data->sum / n;
// relstderr = sqrt((sum_pow2 / n - mean^2) / n) / mean
466 data->relstderr = sqrt((data->sum_pow2 / n - data->mean * data->mean) / n) / data->mean;
467 if (!sample_enough_benchs(data)) {
468 data->mean = sample; // Still in benching process; We want sample_2 to simulate the exact time of this loop
469 // occurrence before leaving, not the mean over the history
471 XBT_DEBUG("Average mean after %d steps is %f, relative standard error is %f (sample was %f)", data->count,
472 data->mean, data->relstderr, sample);
474 // That's enough for now, prevent sample_2 to run the same code over and over
// Allocate `size` bytes. With option "smpi/use-shared-malloc", every call coming from the same (file, line) maps
// the same shared-memory object; otherwise the memory comes from xbt_malloc().
// NOTE(review): the declarations of `mem` and `fd`, the tests selecting each branch, the return statement and the
// braces are not part of the visible lines.
480 void *smpi_shared_malloc(size_t size, const char *file, int line)
483 if (xbt_cfg_get_boolean("smpi/use-shared-malloc")){
// Look up the calling location, inserting a default entry when it is not known yet; `res.first` designates the
// entry in both cases.
485 smpi_source_location loc(file, line);
486 auto res = allocs.insert(std::make_pair(loc, shared_data_t()));
487 auto data = res.first;
// NOTE(review): the statements below create a brand-new object (O_CREAT | O_EXCL) and set count to 1, so this branch
// appears to handle a freshly inserted entry; the wording of the next comment looks inverted -- confirm against the
// condition, which is not visible here.
489 // The insertion did not take place.
490 // Generate a shared memory name from the address of the shared_data:
491 char shmname[32]; // cannot be longer than PSHMNAMLEN = 31 on Mac OS X (shm_open raises ENAMETOOLONG otherwise)
492 snprintf(shmname, 31, "/shmalloc%p", &*data);
493 fd = shm_open(shmname, O_RDWR | O_CREAT | O_EXCL, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
497 xbt_die("Please cleanup /dev/shm/%s", shmname);
499 xbt_die("An unhandled error occurred while opening %s. shm_open: %s", shmname, strerror(errno));
502 data->second.fd = fd;
503 data->second.count = 1;
504 mem = shm_map(fd, size, &*data);
// The name is removed right after mapping; a failure is only reported as a warning.
505 if (shm_unlink(shmname) < 0) {
506 XBT_WARN("Could not early unlink %s. shm_unlink: %s", shmname, strerror(errno));
508 XBT_DEBUG("Mapping %s at %p through %d", shmname, mem, fd);
// Other branch: map the already recorded file descriptor again and count one more user.
510 mem = shm_map(data->second.fd, size, &*data);
511 data->second.count++;
513 XBT_DEBUG("Shared malloc %zu in %p (metadata at %p)", size, mem, &*data);
// Option disabled: plain allocation.
515 mem = xbt_malloc(size);
516 XBT_DEBUG("Classic malloc %zu in %p", size, mem);
// Release memory obtained from smpi_shared_malloc(). With option "smpi/use-shared-malloc" the mapping is looked up
// by address and unmapped, and the `allocs` entry is erased once its count is no longer positive.
// NOTE(review): the early return after the warning, the update of `count`, the closing of the file descriptor, the
// non-shared free call and the braces are not part of the visible lines.
522 void smpi_shared_free(void *ptr)
524 char loc[PTR_STRLEN];
526 if (xbt_cfg_get_boolean("smpi/use-shared-malloc")){
// `loc` receives the textual form of the address; it is not read again in the visible lines.
527 snprintf(loc, PTR_STRLEN, "%p", ptr);
528 auto meta = allocs_metadata.find(ptr);
529 if (meta == allocs_metadata.end()) {
530 XBT_WARN("Cannot free: %p was not shared-allocated by SMPI", ptr);
// Follow the back-pointer of the metadata to the shared data (fd + count) of the originating location.
533 shared_data_t* data = &meta->second.data->second;
// An munmap() failure is only reported as a warning.
534 if (munmap(ptr, meta->second.size) < 0) {
535 XBT_WARN("Unmapping of fd %d failed: %s", data->fd, strerror(errno));
538 XBT_DEBUG("Shared free - no removal - of %p, count = %d", ptr, data->count);
539 if (data->count <= 0) {
541 allocs.erase(allocs.find(meta->second.data->first));
542 XBT_DEBUG("Shared free - with removal - of %p", ptr);
545 XBT_DEBUG("Classic free of %p", ptr);
// Tell whether a value was recorded in `calls` for the key "<func>:<input>".
// NOTE(review): the try/catch structure, the result variable, the release of `loc`, the return statement and the
// braces are not part of the visible lines.
551 int smpi_shared_known_call(const char* func, const char* input)
553 char* loc = bprintf("%s:%s", func, input);
// The dictionary does not own its values (no free function is given).
558 calls = xbt_dict_new_homogeneous(NULL);
561 xbt_dict_get(calls, loc); /* Succeed or throw */
// Exceptions other than "not found" are tested for here; presumably they are re-thrown -- confirm.
568 if (ex.category != not_found_error)
// Return the value recorded in `calls` for the key "<func>:<input>".
// NOTE(review): the declaration of `data`, the test guarding the dictionary creation, the release of `loc`, the
// return statement and the braces are not part of the visible lines.
575 void* smpi_shared_get_call(const char* func, const char* input) {
576 char* loc = bprintf("%s:%s", func, input);
// The dictionary does not own its values (no free function is given).
580 calls = xbt_dict_new_homogeneous(NULL);
582 data = xbt_dict_get(calls, loc);
// Record `data` in `calls` under the key "<func>:<input>".
// NOTE(review): the test guarding the dictionary creation, the release of `loc`, the return statement and the
// braces are not part of the visible lines.
587 void* smpi_shared_set_call(const char* func, const char* input, void* data) {
588 char* loc = bprintf("%s:%s", func, input);
// The dictionary does not own its values (no free function is given).
591 calls = xbt_dict_new_homogeneous(NULL);
593 xbt_dict_set(calls, loc, data, NULL);
// Round an address down to a multiple of xbt_pagesize (integer division then multiplication).
598 #define TOPAGE(addr) (void *)(((unsigned long)(addr) / xbt_pagesize) * xbt_pagesize)
600 /** Map a given SMPI privatization segment (make a SMPI process active) */
// Switch to the data segment of process `dest` unless it is already recorded as the loaded one.
// NOTE(review): the statement guarded by the test (presumably an early return) and the braces are not part of the
// visible lines.
601 void smpi_switch_data_segment(int dest){
602 if (smpi_loaded_page==dest)//no need to switch either
606 smpi_really_switch_data_segment(dest);
609 /** Map a given SMPI privatization segment (make a SMPI process active) even if SMPI thinks it is already active
611 * When doing a state restoration, the state of the restored variables might not be consistent with the state of the
612 * virtual memory. In this case, we have to change the data segment.
// Map the privatization region of process `dest` over the executable's data segment, whatever smpi_loaded_page
// says.
// NOTE(review): the statement guarded by the size test (presumably an early return), the matching #endif and the
// braces are not part of the visible lines.
614 void smpi_really_switch_data_segment(int dest) {
615 if(smpi_size_data_exe == 0)//no need to switch
618 #if HAVE_PRIVATIZATION
// First switch ever: seed every region with the current content of the real data segment.
619 if(smpi_loaded_page==-1){//initial switch, do the copy from the real page here
620 for (int i=0; i< smpi_process_count(); i++){
621 memcpy(smpi_privatisation_regions[i].address, TOPAGE(smpi_start_data_exe), smpi_size_data_exe);
625 // FIXME, cross-process support (mmap across process when necessary)
626 int current = smpi_privatisation_regions[dest].file_descriptor;
627 XBT_DEBUG("Switching data frame to the one of process %d", dest);
// MAP_FIXED places the mapping exactly at the page-aligned start of the data segment.
628 void* tmp = mmap (TOPAGE(smpi_start_data_exe), smpi_size_data_exe,
629 PROT_READ | PROT_WRITE, MAP_FIXED | MAP_SHARED, current, 0);
630 if (tmp != TOPAGE(smpi_start_data_exe))
631 xbt_die("Couldn't map the new region");
// Remember which region is mapped so that smpi_switch_data_segment() can skip redundant switches.
632 smpi_loaded_page = dest;
/** Tell whether a path designates one of the SMPI privatization backing files.
 *
 * These files are created from the template "/dev/shm/my-buffer-XXXXXX", so a path matches when it starts with
 * "/dev/shm/my-buffer-".
 *
 * @param file NUL-terminated path to test; a null pointer is treated as "no match"
 * @return 1 when the path starts with the privatization prefix, 0 otherwise
 */
int smpi_is_privatisation_file(char* file)
{
  static const char prefix[] = "/dev/shm/my-buffer-";

  if (file == nullptr)
    return 0;

  // sizeof counts the terminating NUL, which must not take part in the prefix comparison.
  return strncmp(prefix, file, sizeof(prefix) - 1) == 0;
}
// Create one privatization region per simulated process: an unlinked temporary file of the size of the
// executable's data segment, mapped shared and initialized with the current content of that segment.
// NOTE(review): the preprocessor branches (#else/#endif), the declaration of `status`, the tests on `status` and on
// errno, some arguments of the xbt_die() calls, the early return and the braces are not part of the visible lines.
641 void smpi_initialize_global_memory_segments(){
// Without privatization support, the feature is disabled and the simulation aborts.
643 #if !HAVE_PRIVATIZATION
644 smpi_privatize_global_variables=0;
645 xbt_die("You are trying to use privatization on a system that does not support it. Don't.");
// NOTE(review): presumably fills smpi_start_data_exe and smpi_size_data_exe, which are logged right after -- confirm.
649 smpi_get_executable_global_size();
651 XBT_DEBUG ("bss+data segment found : size %d starting at %p", smpi_size_data_exe, smpi_start_data_exe );
// Nothing to privatize when the segment is empty.
653 if (smpi_size_data_exe == 0){//no need to switch
654 smpi_privatize_global_variables=0;
// One descriptor (file descriptor + address) per simulated process.
658 smpi_privatisation_regions =
659 (smpi_privatisation_region_t) malloc(smpi_process_count() * sizeof(struct s_smpi_privatisation_region));
661 for (int i=0; i< smpi_process_count(); i++){
662 //create SIMIX_process_count() mappings of this size with the same data inside
663 void *address = NULL;
// Template for mkstemp(), which replaces the trailing XXXXXX in place.
664 char path[] = "/dev/shm/my-buffer-XXXXXX";
667 int file_descriptor= mkstemp (path);
668 if (file_descriptor < 0) {
670 xbt_die("Impossible to create temporary file for memory mapping: %s\n\
671 The open() system call failed with the EMFILE error code (too many files). \n\n\
672 This means that you reached the system limits concerning the amount of files per process. \
673 This is not a surprise if you are trying to virtualize many processes on top of SMPI. \
674 Don't panic -- you should simply increase your system limits and try again. \n\n\
675 First, check what your limits are:\n\
676 cat /proc/sys/fs/file-max # Gives you the system-wide limit\n\
677 ulimit -Hn # Gives you the per process hard limit\n\
678 ulimit -Sn # Gives you the per process soft limit\n\
679 cat /proc/self/limits # Displays any per-process limitation (including the one given above)\n\n\
680 If one of these values is less than the amount of MPI processes that you try to run, then you got the explanation of this error. \
681 Ask the Internet about tutorials on how to increase the files limit such as: https://rtcamp.com/tutorials/linux/increase-open-files-limit/",
684 xbt_die("Impossible to create temporary file for memory mapping: %s",
// Remove the name right away: the file stays reachable through its descriptor only.
688 status = unlink (path);
690 xbt_die("Impossible to unlink temporary file for memory mapping");
// Give the file the size of the data segment.
692 status = ftruncate(file_descriptor, smpi_size_data_exe);
694 xbt_die("Impossible to set the size of the temporary file for memory mapping");
696 /* Ask for a free region */
697 address = mmap (NULL, smpi_size_data_exe, PROT_READ | PROT_WRITE, MAP_SHARED, file_descriptor, 0);
698 if (address == MAP_FAILED)
699 xbt_die("Couldn't find a free region for memory mapping");
701 //initialize the values
702 memcpy(address, TOPAGE(smpi_start_data_exe), smpi_size_data_exe);
704 //store the address of the mapping for further switches
705 smpi_privatisation_regions[i].file_descriptor = file_descriptor;
706 smpi_privatisation_regions[i].address = address;
// Release every privatization region: unmap it, close its file descriptor, then free the array of regions.
// NOTE(review): the statement guarded by the size test (presumably an early return), the declaration of `i`, the
// matching #endif and the braces are not part of the visible lines.
711 void smpi_destroy_global_memory_segments(){
712 if (smpi_size_data_exe == 0)//no need to switch
714 #if HAVE_PRIVATIZATION
716 for (i=0; i< smpi_process_count(); i++){
// An munmap() failure is only reported as a warning; the descriptor is closed regardless.
717 if(munmap(smpi_privatisation_regions[i].address, smpi_size_data_exe) < 0) {
718 XBT_WARN("Unmapping of fd %d failed: %s", smpi_privatisation_regions[i].file_descriptor, strerror(errno));
720 close(smpi_privatisation_regions[i].file_descriptor);
722 xbt_free(smpi_privatisation_regions);