1 /* Copyright (c) 2007, 2009-2015. The SimGrid Team.
2 * All rights reserved. */
4 /* This program is free software; you can redistribute it and/or modify it
5 * under the terms of the license (GNU LGPL) which comes with this package. */
9 #include <unordered_map>
12 #include "src/internal_config.h"
15 #include "xbt/sysdep.h"
17 #include "surf/surf.h"
18 #include "simgrid/sg_config.h"
19 #include "simgrid/modelchecker.h"
20 #include "src/mc/mc_replay.h"
26 #include <sys/types.h>
29 #include <math.h> // sqrt
35 #define MAP_ANONYMOUS MAP_ANON
38 XBT_LOG_NEW_DEFAULT_SUBCATEGORY(smpi_bench, smpi, "Logging specific to SMPI (benchmarking)");
/* Shared allocations are handled through shared memory segments.
 * Associated data and metadata are used as follows:
 *
 *  - The `allocs' dict has one entry per <name>. Each entry leads to a shared_data_t that
 *    holds the fd of <name>, a count of the mmaps currently using it (2 in this example),
 *    and <name> itself.
 *
 *  - The `allocs_metadata' dict has one entry per mapping, keyed by its address
 *    (<addr of mmap #1>, <addr of mmap #2>, ...). Each entry leads to a shared_metadata_t
 *    that holds the size of that mmap and a `data' pointer back to the shared_data_t
 *    the mapping was created from.
 *
 * Several mmaps (two in this example) can therefore point to the same shared_data_t.
 */
71 #define PTR_STRLEN (2 + 2 * sizeof(void*) + 1)
73 xbt_dict_t samples = NULL; /* Allocated on first use */
74 xbt_dict_t calls = NULL; /* Allocated on first use */
76 double smpi_cpu_threshold;
77 double smpi_running_power;
79 int smpi_loaded_page = -1;
80 char* smpi_start_data_exe = NULL;
81 int smpi_size_data_exe = 0;
82 int smpi_privatize_global_variables;
83 double smpi_total_benched_time = 0;
84 smpi_privatisation_region_t smpi_privatisation_regions;
/** A location (file name and line number) in the source code
 *
 *  SMPI_SHARED_MALLOC uses this information to allocate some shared memory for all simulated processes.
 */
class smpi_source_location {
public:
  smpi_source_location(const char* filename, int line)
    : filename(filename), filename_length(strlen(filename)), line(line) {}

  /** Pointer to a static string containing the file name (the string is not copied) */
  const char* filename = nullptr;
  /** Length of filename, computed once at construction */
  int filename_length = 0;
  /** Line number within that file */
  int line = 0;

  /** Two locations are equal when the line numbers match and the file names are byte-wise identical */
  bool operator==(smpi_source_location const& that) const
  {
    if (filename_length != that.filename_length)
      return false;
    if (line != that.line)
      return false;
    return std::memcmp(filename, that.filename, filename_length) == 0;
  }
  bool operator!=(smpi_source_location const& that) const
  {
    return !operator==(that);
  }
};
119 class hash<smpi_source_location> {
121 typedef smpi_source_location argument_type;
122 typedef std::size_t result_type;
123 result_type operator()(smpi_source_location const& loc) const
125 return xbt_str_hash_ext(loc.filename, loc.filename_length)
126 ^ xbt_str_hash_ext((const char*) &loc.line, sizeof(loc.line));
139 std::unordered_map<smpi_source_location, shared_data_t> allocs;
140 typedef std::unordered_map<smpi_source_location, shared_data_t>::value_type shared_data_key_type;
144 shared_data_key_type* data;
147 std::unordered_map<void*, shared_metadata_t> allocs_metadata;
151 static size_t shm_size(int fd) {
154 if(fstat(fd, &st) < 0) {
155 xbt_die("Could not stat fd %d: %s", fd, strerror(errno));
157 return (size_t)st.st_size;
161 static void* shm_map(int fd, size_t size, shared_data_key_type* data) {
163 char loc[PTR_STRLEN];
164 shared_metadata_t meta;
166 if(size > shm_size(fd)) {
167 if(ftruncate(fd, (off_t)size) < 0) {
168 xbt_die("Could not truncate fd %d to %zu: %s", fd, size, strerror(errno));
172 mem = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
173 if(mem == MAP_FAILED) {
174 xbt_die("Could not map fd %d: %s", fd, strerror(errno));
176 snprintf(loc, PTR_STRLEN, "%p", mem);
179 allocs_metadata[mem] = meta;
180 XBT_DEBUG("MMAP %zu to %p", size, mem);
185 void smpi_bench_destroy(void)
188 allocs_metadata.clear();
189 xbt_dict_free(&samples);
190 xbt_dict_free(&calls);
193 extern "C" XBT_PUBLIC(void) smpi_execute_flops_(double *flops);
194 void smpi_execute_flops_(double *flops)
196 smpi_execute_flops(*flops);
199 extern "C" XBT_PUBLIC(void) smpi_execute_(double *duration);
200 void smpi_execute_(double *duration)
202 smpi_execute(*duration);
205 void smpi_execute_flops(double flops) {
206 smx_synchro_t action;
207 XBT_DEBUG("Handle real computation time: %f flops", flops);
208 action = simcall_execution_start("computation", flops, 1, 0, 0);
209 simcall_set_category (action, TRACE_internal_smpi_get_category());
210 simcall_execution_wait(action);
211 smpi_switch_data_segment(smpi_process_index());
214 void smpi_execute(double duration)
216 if (duration >= smpi_cpu_threshold) {
217 XBT_DEBUG("Sleep for %g to handle real computation time", duration);
218 double flops = duration * smpi_running_power;
219 int rank = smpi_process_index();
220 instr_extra_data extra = xbt_new0(s_instr_extra_data_t,1);
221 extra->type=TRACING_COMPUTING;
222 extra->comp_size=flops;
223 TRACE_smpi_computing_in(rank, extra);
224 smpi_execute_flops(flops);
226 TRACE_smpi_computing_out(rank);
229 XBT_DEBUG("Real computation took %g while option smpi/cpu_threshold is set to %g => ignore it",
230 duration, smpi_cpu_threshold);
234 void smpi_switch_data_segment(int dest);
236 void smpi_bench_begin(void)
238 if (smpi_privatize_global_variables) {
239 smpi_switch_data_segment(smpi_process_index());
242 if (MC_is_active() || MC_record_replay_is_active())
245 xbt_os_threadtimer_start(smpi_process_timer());
248 void smpi_bench_end(void)
251 if (MC_is_active() || MC_record_replay_is_active())
254 xbt_os_timer_t timer = smpi_process_timer();
255 xbt_os_threadtimer_stop(timer);
256 // smpi_switch_data_segment(smpi_process_count());
257 if (smpi_process_get_sampling()) {
258 XBT_CRITICAL("Cannot do recursive benchmarks.");
259 XBT_CRITICAL("Are you trying to make a call to MPI within a SMPI_SAMPLE_ block?");
260 xbt_backtrace_display_current();
261 xbt_die("Aborting.");
263 // Simulate the benchmarked computation unless disabled via command-line argument
264 if (sg_cfg_get_boolean("smpi/simulate_computation")) {
265 smpi_execute(xbt_os_timer_elapsed(timer));
268 smpi_total_benched_time += xbt_os_timer_elapsed(timer);
271 /* Private sleep function used by smpi_sleep() and smpi_usleep() */
272 static unsigned int private_sleep(double secs)
276 XBT_DEBUG("Sleep for: %lf secs", secs);
277 int rank = smpi_comm_rank(MPI_COMM_WORLD);
278 instr_extra_data extra = xbt_new0(s_instr_extra_data_t,1);
279 extra->type=TRACING_SLEEPING;
280 extra->sleep_duration=secs;
281 TRACE_smpi_sleeping_in(rank, extra);
283 simcall_process_sleep(secs);
285 TRACE_smpi_sleeping_out(rank);
291 unsigned int smpi_sleep(unsigned int secs)
293 return private_sleep((double)secs);
296 int smpi_usleep(useconds_t usecs)
298 return (int)private_sleep((double)usecs / 1000000.0);
301 int smpi_gettimeofday(struct timeval *tv, void* tz)
305 now = SIMIX_get_clock();
307 tv->tv_sec = (time_t)now;
309 tv->tv_usec = (useconds_t)((now - tv->tv_sec) * 1e6);
311 tv->tv_usec = (suseconds_t)((now - tv->tv_sec) * 1e6);
318 extern double sg_surf_precision;
319 unsigned long long smpi_rastro_resolution (void)
322 double resolution = (1/sg_surf_precision);
324 return (unsigned long long)resolution;
327 unsigned long long smpi_rastro_timestamp (void)
330 double now = SIMIX_get_clock();
332 unsigned long long sec = (unsigned long long)now;
333 unsigned long long pre = (now - sec) * smpi_rastro_resolution();
335 return (unsigned long long)sec * smpi_rastro_resolution() + pre;
338 /* ****************************** Functions related to the SMPI_SAMPLE_ macros ************************************/
/** Statistics kept for each benchmarked block (SMPI_SAMPLE_ macros) */
typedef struct {
  double threshold; /**< maximal stderr requested (if positive) */
  double relstderr; /**< stderr observed so far */
  double mean;      /**< mean of the benched times; used instead of running the block once it is disabled */
  double sum;       /**< sum of the benched times (needed for the mean and the stderr) */
  double sum_pow2;  /**< sum of the squares of the benched times (needed for the stderr) */
  int iters;        /**< amount of iterations requested */
  int count;        /**< amount of iterations done so far */
  int benching;     /**< 1: we are benchmarking; 0: we have enough data, no bench anymore */
} local_data_t;
350 static char *sample_location(int global, const char *file, int line) {
352 return bprintf("%s:%d", file, line);
354 return bprintf("%s:%d:%d", file, line, smpi_process_index());
358 static int sample_enough_benchs(local_data_t *data) {
359 int res = data->count >= data->iters;
360 if (data->threshold>0.0) {
362 res = 0; // not enough data
363 if (data->relstderr > data->threshold)
364 res = 0; // stderr too high yet
366 XBT_DEBUG("%s (count:%d iter:%d stderr:%f thres:%f mean:%fs)",
367 (res?"enough benchs":"need more data"), data->count, data->iters, data->relstderr, data->threshold, data->mean);
371 void smpi_sample_1(int global, const char *file, int line, int iters, double threshold)
373 char *loc = sample_location(global, file, line);
376 smpi_bench_end(); /* Take time from previous, unrelated computation into account */
377 smpi_process_set_sampling(1);
380 samples = xbt_dict_new_homogeneous(free);
382 data = static_cast<local_data_t *>(xbt_dict_get_or_null(samples, loc));
384 xbt_assert(threshold>0 || iters>0,
385 "You should provide either a positive amount of iterations to bench, or a positive maximal stderr (or both)");
386 data = (local_data_t *) xbt_new(local_data_t, 1);
389 data->sum_pow2 = 0.0;
391 data->threshold = threshold;
392 data->benching = 1; // If we have no data, we need at least one
394 xbt_dict_set(samples, loc, data, NULL);
395 XBT_DEBUG("XXXXX First time ever on benched nest %s.",loc);
397 if (data->iters != iters || data->threshold != threshold) {
398 XBT_ERROR("Asked to bench block %s with different settings %d, %f is not %d, %f. "
399 "How did you manage to give two numbers at the same line??",
400 loc, data->iters, data->threshold, iters,threshold);
404 // if we already have some data, check whether sample_2 should get one more bench or whether it should emulate
405 // the computation instead
406 data->benching = !sample_enough_benchs(data);
407 XBT_DEBUG("XXXX Re-entering the benched nest %s. %s",loc,
408 (data->benching?"more benching needed":"we have enough data, skip computes"));
413 int smpi_sample_2(int global, const char *file, int line)
415 char *loc = sample_location(global, file, line);
419 xbt_assert(samples, "Y U NO use SMPI_SAMPLE_* macros? Stop messing directly with smpi_sample_* functions!");
420 data = static_cast<local_data_t *>(xbt_dict_get(samples, loc));
421 XBT_DEBUG("sample2 %s",loc);
424 if (data->benching==1) {
425 // we need to run a new bench
426 XBT_DEBUG("benchmarking: count:%d iter:%d stderr:%f thres:%f; mean:%f",
427 data->count, data->iters, data->relstderr, data->threshold, data->mean);
430 // Enough data, no more bench (either we got enough data from previous visits to this benched nest, or we just
431 //ran one bench and need to bail out now that our job is done). Just sleep instead
432 XBT_DEBUG("No benchmark (either no need, or just ran one): count >= iter (%d >= %d) or stderr<thres (%f<=%f)."
433 " apply the %fs delay instead", data->count, data->iters, data->relstderr, data->threshold, data->mean);
434 smpi_execute(data->mean);
435 smpi_process_set_sampling(0);
436 res = 0; // prepare to capture future, unrelated computations
442 void smpi_sample_3(int global, const char *file, int line)
444 char *loc = sample_location(global, file, line);
447 xbt_assert(samples, "Y U NO use SMPI_SAMPLE_* macros? Stop messing directly with smpi_sample_* functions!");
448 data = static_cast<local_data_t *>(xbt_dict_get(samples, loc));
449 XBT_DEBUG("sample3 %s",loc);
452 if (data->benching==0) {
456 // ok, benchmarking this loop is over
457 xbt_os_threadtimer_stop(smpi_process_timer());
462 sample = xbt_os_timer_elapsed(smpi_process_timer());
464 data->sum_pow2 += sample * sample;
465 n = (double)data->count;
466 data->mean = data->sum / n;
467 data->relstderr = sqrt((data->sum_pow2 / n - data->mean * data->mean) / n) / data->mean;
468 if (!sample_enough_benchs(data)) {
469 data->mean = sample; // Still in benching process; We want sample_2 to simulate the exact time of this loop
470 // occurrence before leaving, not the mean over the history
472 XBT_DEBUG("Average mean after %d steps is %f, relative standard error is %f (sample was %f)", data->count,
473 data->mean, data->relstderr, sample);
475 // That's enough for now, prevent sample_2 to run the same code over and over
481 void *smpi_shared_malloc(size_t size, const char *file, int line)
484 if (sg_cfg_get_boolean("smpi/use_shared_malloc")){
486 smpi_source_location loc(file, line);
487 auto res = allocs.insert(std::make_pair(loc, shared_data_t()));
488 auto data = res.first;
490 // The insertion did not take place.
491 // Generate a shared memory name from the address of the shared_data:
492 char shmname[32]; // cannot be longer than PSHMNAMLEN = 31 on Mac OS X (shm_open raises ENAMETOOLONG otherwise)
493 snprintf(shmname, 31, "/shmalloc%p", &*data);
494 fd = shm_open(shmname, O_RDWR | O_CREAT | O_EXCL, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
498 xbt_die("Please cleanup /dev/shm/%s", shmname);
500 xbt_die("An unhandled error occured while opening %s. shm_open: %s", shmname, strerror(errno));
503 data->second.fd = fd;
504 data->second.count = 1;
505 mem = shm_map(fd, size, &*data);
506 if (shm_unlink(shmname) < 0) {
507 XBT_WARN("Could not early unlink %s. shm_unlink: %s", shmname, strerror(errno));
509 XBT_DEBUG("Mapping %s at %p through %d", shmname, mem, fd);
511 mem = shm_map(data->second.fd, size, &*data);
512 data->second.count++;
514 XBT_DEBUG("Shared malloc %zu in %p (metadata at %p)", size, mem, &*data);
516 mem = xbt_malloc(size);
517 XBT_DEBUG("Classic malloc %zu in %p", size, mem);
523 void smpi_shared_free(void *ptr)
525 char loc[PTR_STRLEN];
527 if (sg_cfg_get_boolean("smpi/use_shared_malloc")){
528 snprintf(loc, PTR_STRLEN, "%p", ptr);
529 auto meta = allocs_metadata.find(ptr);
530 if (meta == allocs_metadata.end()) {
531 XBT_WARN("Cannot free: %p was not shared-allocated by SMPI", ptr);
534 shared_data_t* data = &meta->second.data->second;
535 if (munmap(ptr, meta->second.size) < 0) {
536 XBT_WARN("Unmapping of fd %d failed: %s", data->fd, strerror(errno));
539 XBT_DEBUG("Shared free - no removal - of %p, count = %d", ptr, data->count);
540 if (data->count <= 0) {
542 allocs.erase(allocs.find(meta->second.data->first));
543 XBT_DEBUG("Shared free - with removal - of %p", ptr);
546 XBT_DEBUG("Classic free of %p", ptr);
552 int smpi_shared_known_call(const char* func, const char* input)
554 char* loc = bprintf("%s:%s", func, input);
559 calls = xbt_dict_new_homogeneous(NULL);
562 xbt_dict_get(calls, loc); /* Succeed or throw */
569 if (ex.category != not_found_error)
576 void* smpi_shared_get_call(const char* func, const char* input) {
577 char* loc = bprintf("%s:%s", func, input);
581 calls = xbt_dict_new_homogeneous(NULL);
583 data = xbt_dict_get(calls, loc);
588 void* smpi_shared_set_call(const char* func, const char* input, void* data) {
589 char* loc = bprintf("%s:%s", func, input);
592 calls = xbt_dict_new_homogeneous(NULL);
594 xbt_dict_set(calls, loc, data, NULL);
/** Round an address down to a multiple of xbt_pagesize (integer division, then multiplication) */
#define TOPAGE(addr) ((void *)(((unsigned long)(addr) / xbt_pagesize) * xbt_pagesize))
601 /** Map a given SMPI privatization segment (make a SMPI process active) */
602 void smpi_switch_data_segment(int dest){
603 if (smpi_loaded_page==dest)//no need to switch either
607 smpi_really_switch_data_segment(dest);
610 /** Map a given SMPI privatization segment (make a SMPI process active) even if SMPI thinks it is already active
612 * When doing a state restoration, the state of the restored variables might not be consistent with the state of the
613 * virtual memory. In this case, we to change the data segment.
615 void smpi_really_switch_data_segment(int dest) {
616 if(smpi_size_data_exe == 0)//no need to switch
619 #ifdef HAVE_PRIVATIZATION
620 if(smpi_loaded_page==-1){//initial switch, do the copy from the real page here
621 for (int i=0; i< smpi_process_count(); i++){
622 memcpy(smpi_privatisation_regions[i].address, TOPAGE(smpi_start_data_exe), smpi_size_data_exe);
626 // FIXME, cross-process support (mmap across process when necessary)
627 int current = smpi_privatisation_regions[dest].file_descriptor;
628 XBT_DEBUG("Switching data frame to the one of process %d", dest);
629 void* tmp = mmap (TOPAGE(smpi_start_data_exe), smpi_size_data_exe,
630 PROT_READ | PROT_WRITE, MAP_FIXED | MAP_SHARED, current, 0);
631 if (tmp != TOPAGE(smpi_start_data_exe))
632 xbt_die("Couldn't map the new region");
633 smpi_loaded_page = dest;
/** Tell whether `file` starts with the prefix of the temporary files backing the privatisation regions
 *
 *  @return 1 if the name starts with "/dev/shm/my-buffer-", 0 otherwise
 */
int smpi_is_privatisation_file(char* file)
{
  static const char prefix[] = "/dev/shm/my-buffer-";
  const size_t prefix_length = sizeof(prefix) - 1; // the final NUL is not part of the prefix
  return strncmp(prefix, file, prefix_length) == 0;
}
642 void smpi_initialize_global_memory_segments(){
644 #ifndef HAVE_PRIVATIZATION
645 smpi_privatize_global_variables=0;
646 xbt_die("You are trying to use privatization on a system that does not support it. Don't.");
650 smpi_get_executable_global_size();
652 XBT_DEBUG ("bss+data segment found : size %d starting at %p", smpi_size_data_exe, smpi_start_data_exe );
654 if (smpi_size_data_exe == 0){//no need to switch
655 smpi_privatize_global_variables=0;
659 smpi_privatisation_regions =
660 (smpi_privatisation_region_t) malloc(smpi_process_count() * sizeof(struct s_smpi_privatisation_region));
662 for (int i=0; i< smpi_process_count(); i++){
663 //create SIMIX_process_count() mappings of this size with the same data inside
664 void *address = NULL;
665 char path[] = "/dev/shm/my-buffer-XXXXXX";
668 int file_descriptor= mkstemp (path);
669 if (file_descriptor < 0) {
671 xbt_die("Impossible to create temporary file for memory mapping: %s\n\
672 The open() system call failed with the EMFILE error code (too many files). \n\n\
673 This means that you reached the system limits concerning the amount of files per process. \
674 This is not a surprise if you are trying to virtualize many processes on top of SMPI. \
675 Don't panic -- you should simply increase your system limits and try again. \n\n\
676 First, check what your limits are:\n\
677 cat /proc/sys/fs/file-max # Gives you the system-wide limit\n\
678 ulimit -Hn # Gives you the per process hard limit\n\
679 ulimit -Sn # Gives you the per process soft limit\n\
680 cat /proc/self/limits # Displays any per-process limitation (including the one given above)\n\n\
681 If one of these values is less than the amount of MPI processes that you try to run, then you got the explanation of this error. \
682 Ask the Internet about tutorials on how to increase the files limit such as: https://rtcamp.com/tutorials/linux/increase-open-files-limit/",
685 xbt_die("Impossible to create temporary file for memory mapping: %s",
689 status = unlink (path);
691 xbt_die("Impossible to unlink temporary file for memory mapping");
693 status = ftruncate(file_descriptor, smpi_size_data_exe);
695 xbt_die("Impossible to set the size of the temporary file for memory mapping");
697 /* Ask for a free region */
698 address = mmap (NULL, smpi_size_data_exe, PROT_READ | PROT_WRITE, MAP_SHARED, file_descriptor, 0);
699 if (address == MAP_FAILED)
700 xbt_die("Couldn't find a free region for memory mapping");
702 //initialize the values
703 memcpy(address, TOPAGE(smpi_start_data_exe), smpi_size_data_exe);
705 //store the address of the mapping for further switches
706 smpi_privatisation_regions[i].file_descriptor = file_descriptor;
707 smpi_privatisation_regions[i].address = address;
712 void smpi_destroy_global_memory_segments(){
713 if (smpi_size_data_exe == 0)//no need to switch
715 #ifdef HAVE_PRIVATIZATION
717 for (i=0; i< smpi_process_count(); i++){
718 if(munmap(smpi_privatisation_regions[i].address, smpi_size_data_exe) < 0) {
719 XBT_WARN("Unmapping of fd %d failed: %s", smpi_privatisation_regions[i].file_descriptor, strerror(errno));
721 close(smpi_privatisation_regions[i].file_descriptor);
723 xbt_free(smpi_privatisation_regions);