1 /* Copyright (c) 2007, 2009-2014. The SimGrid Team.
2 * All rights reserved. */
4 /* This program is free software; you can redistribute it and/or modify it
5 * under the terms of the license (GNU LGPL) which comes with this package. */
7 #include "internal_config.h"
10 #include "xbt/sysdep.h"
13 #include "surf/surf.h"
14 #include "simgrid/sg_config.h"
15 #include "simgrid/modelchecker.h"
21 #include <sys/types.h>
24 #include <math.h> // sqrt
30 #define MAP_ANONYMOUS MAP_ANON
33 XBT_LOG_NEW_DEFAULT_SUBCATEGORY(smpi_bench, smpi,
34 "Logging specific to SMPI (benchmarking)");
36 /* Shared allocations are handled through shared memory segments.
37 * Associated data and metadata are used as follows:
40 * `allocs' dict ---- -.
41 * ---------- shared_data_t shared_metadata_t / | | |
42 * .->| <name> | ---> -------------------- <--. ----------------- | | | |
43 * | ---------- | fd of <name> | | | size of mmap | --| | | |
44 * | | count (2) | |-- | data | \ | | |
45 * `----------------- | <name> | | ----------------- ---- |
46 * -------------------- | ^ |
48 * | | `allocs_metadata' dict |
49 * | | ---------------------- |
50 * | `-- | <addr of mmap #1> |<-'
51 * | .-- | <addr of mmap #2> |<-.
52 * | | ---------------------- |
58 * | shared_metadata_t / | |
59 * | ----------------- | | |
60 * | | size of mmap | --| | |
62 * ----------------- | | |
/* Size of a buffer large enough to hold a pointer formatted with "%p"
 * (see the snprintf() calls in shm_map() and smpi_shared_free()): presumably
 * 2 chars for a "0x" prefix, 2 hex digits per pointer byte, and the final '\0'. */
67 #define PTR_STRLEN (2 + 2 * sizeof(void*) + 1)
/* Dictionaries owned by this file. All of them are released by smpi_bench_destroy().
 *  - allocs:          shared allocations, keyed by the hashed allocation site
 *  - allocs_metadata: one entry per mapping, keyed by the mapping address printed with "%p"
 *  - samples:         SMPI_SAMPLE_ bookkeeping, keyed by the string built in sample_location()
 *  - calls:           user data registered by smpi_shared_set_call(), keyed by "func:input" */
69 xbt_dict_t allocs = NULL; /* Allocated on first use */
70 xbt_dict_t allocs_metadata = NULL; /* Allocated on first use */
71 xbt_dict_t samples = NULL; /* Allocated on first use */
72 xbt_dict_t calls = NULL; /* Allocated on first use */
/* Durations below smpi_cpu_threshold are ignored by smpi_execute(); smpi_running_power
 * is the factor it uses to convert a duration into an amount of flops. */
74 double smpi_cpu_threshold;
75 double smpi_running_power;
/* Index of the privatisation region currently mapped over the data segment
 * (-1 until smpi_really_switch_data_segment() performs the first switch). */
77 int smpi_loaded_page = -1;
/* Start address and size of the executable's global data, as computed by
 * smpi_get_executable_global_size(); a size of 0 disables segment switching. */
78 char* start_data_exe = NULL;
79 int size_data_exe = 0;
80 int smpi_privatize_global_variables;
/* Sum of all the timer values accumulated by smpi_bench_end(). */
81 double smpi_total_benched_time = 0;
/* Array of per-process regions (address + file descriptor), allocated in
 * smpi_initialize_global_memory_segments(). */
84 smpi_privatisation_region_t smpi_privatisation_regions;
/* Return the size in bytes (st_size as reported by fstat()) of the file behind
 * descriptor `fd`. Aborts through xbt_die() when fstat() fails. */
97 static size_t shm_size(int fd) {
100 if(fstat(fd, &st) < 0) {
101 xbt_die("Could not stat fd %d: %s", fd, strerror(errno));
103 return (size_t)st.st_size;
/* Map `size` bytes of the file behind `fd` as shared, readable and writable memory.
 *
 * The file is first grown with ftruncate() when it is smaller than `size`. A
 * shared_metadata_t entry is then allocated and registered in `allocs_metadata`
 * under the mapping address printed with "%p" (the dict is created here if needed).
 * Any failure of ftruncate() or mmap() aborts through xbt_die().
 *
 * NOTE(review): the statements filling `meta` and the return statement are not
 * visible here; callers assign the result to their `mem` pointer, and `data` is
 * presumably recorded in the metadata entry -- confirm. */
107 static void* shm_map(int fd, size_t size, shared_data_t* data) {
109 char loc[PTR_STRLEN];
110 shared_metadata_t* meta;
112 if(size > shm_size(fd)) {
113 if(ftruncate(fd, (off_t)size) < 0) {
114 xbt_die("Could not truncate fd %d to %zu: %s", fd, size, strerror(errno));
118 mem = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
119 if(mem == MAP_FAILED) {
120 xbt_die("Could not map fd %d: %s", fd, strerror(errno));
122 if(!allocs_metadata) {
123 allocs_metadata = xbt_dict_new_homogeneous(xbt_free_f);
125 snprintf(loc, PTR_STRLEN, "%p", mem);
126 meta = xbt_new(shared_metadata_t, 1);
129 xbt_dict_set(allocs_metadata, loc, meta, NULL);
130 XBT_DEBUG("MMAP %zu to %p", size, mem);
/* Release the four file-level dictionaries (allocs, allocs_metadata, samples, calls). */
135 void smpi_bench_destroy(void)
137 xbt_dict_free(&allocs);
138 xbt_dict_free(&allocs_metadata);
139 xbt_dict_free(&samples);
140 xbt_dict_free(&calls);
/* By-reference variant of smpi_execute_flops(): reads the amount of flops through
 * the pointer and forwards it. Presumably meant for Fortran callers (trailing
 * underscore, pointer argument) -- confirm. */
143 XBT_PUBLIC(void) smpi_execute_flops_(double *flops);
144 void smpi_execute_flops_(double *flops)
146 smpi_execute_flops(*flops);
/* By-reference variant of smpi_execute(): reads the duration through the pointer
 * and forwards it. Presumably meant for Fortran callers (trailing underscore,
 * pointer argument) -- confirm. */
149 XBT_PUBLIC(void) smpi_execute_(double *duration);
150 void smpi_execute_(double *duration)
152 smpi_execute(*duration);
/* Simulate the execution of `flops` flops on the host of the calling process.
 *
 * A "computation" execution is created on SIMIX_host_self(), tagged with the
 * internal SMPI tracing category, and waited for. Once it completes, the data
 * segment of the calling process is switched back in. */
155 void smpi_execute_flops(double flops) {
156 smx_synchro_t action;
158 host = SIMIX_host_self();
159 XBT_DEBUG("Handle real computation time: %f flops", flops);
160 action = simcall_host_execute("computation", host, flops, 1, 0, 0);
161 simcall_set_category (action, TRACE_internal_smpi_get_category());
162 simcall_host_execution_wait(action);
163 smpi_switch_data_segment(smpi_process_index());
/* Inject a computation of `duration` into the simulation.
 *
 * When `duration` reaches smpi_cpu_threshold, it is converted to flops by
 * multiplying it by smpi_running_power and simulated with smpi_execute_flops(),
 * bracketed by TRACE_smpi_computing_in()/_out() events carrying the flop amount.
 * The final debug message reports durations that are ignored because of the
 * threshold (the branch keyword selecting it is not visible here).
 *
 * NOTE(review): whether `extra` is released by the tracing layer cannot be told
 * from this function -- confirm ownership. */
166 void smpi_execute(double duration)
168 if (duration >= smpi_cpu_threshold) {
169 XBT_DEBUG("Sleep for %g to handle real computation time", duration);
170 double flops = duration * smpi_running_power;
171 int rank = smpi_process_index();
172 instr_extra_data extra = xbt_new0(s_instr_extra_data_t,1);
173 extra->type=TRACING_COMPUTING;
174 extra->comp_size=flops;
175 TRACE_smpi_computing_in(rank, extra);
176 smpi_execute_flops(flops);
178 TRACE_smpi_computing_out(rank);
181 XBT_DEBUG("Real computation took %g while option smpi/cpu_threshold is set to %g => ignore it",
182 duration, smpi_cpu_threshold);
/* Forward declaration: smpi_switch_data_segment() is defined further down in this file. */
186 void smpi_switch_data_segment(int dest);
/* Start timing a stretch of user code: switch in the data segment of the calling
 * process, then start the per-process thread timer.
 * NOTE(review): the statement guarded by the model-checker / record-replay test
 * is not visible here; presumably the timer is skipped in those modes -- confirm. */
188 void smpi_bench_begin(void)
190 smpi_switch_data_segment(smpi_process_index());
192 if (MC_is_active() || MC_record_replay_is_active())
195 xbt_os_threadtimer_start(smpi_process_timer());
/* Stop timing the current stretch of user code and account for it.
 *
 * The per-process timer is stopped. If the process is flagged as sampling, the
 * call is reported as a recursive benchmark (critical logs + backtrace) and the
 * program aborts through xbt_die(). Otherwise the elapsed time is injected with
 * smpi_execute() when "smpi/simulate_computation" is set, and is added to
 * smpi_total_benched_time.
 * NOTE(review): the statement guarded by the model-checker / record-replay test
 * is not visible here; presumably nothing is measured in those modes -- confirm. */
198 void smpi_bench_end(void)
201 if (MC_is_active() || MC_record_replay_is_active())
204 xbt_os_timer_t timer = smpi_process_timer();
205 xbt_os_threadtimer_stop(timer);
206 // smpi_switch_data_segment(smpi_process_count());
207 if (smpi_process_get_sampling()) {
208 XBT_CRITICAL("Cannot do recursive benchmarks.");
209 XBT_CRITICAL("Are you trying to make a call to MPI within a SMPI_SAMPLE_ block?");
210 xbt_backtrace_display_current();
211 xbt_die("Aborting.");
213 // Simulate the benchmarked computation unless disabled via command-line argument
214 if (sg_cfg_get_boolean("smpi/simulate_computation")) {
215 smpi_execute(xbt_os_timer_elapsed(timer));
218 smpi_total_benched_time += xbt_os_timer_elapsed(timer);
221 /* Private sleep function used by smpi_sleep() and smpi_usleep() */
/* Puts the calling simulated process to sleep for `secs` seconds with
 * simcall_process_sleep(), bracketed by TRACE_smpi_sleeping_in()/_out() events
 * for its rank in MPI_COMM_WORLD (the requested duration is attached to the event).
 * NOTE(review): the returned value is not visible here -- confirm what callers get. */
222 static unsigned int private_sleep(double secs)
226 XBT_DEBUG("Sleep for: %lf secs", secs);
227 int rank = smpi_comm_rank(MPI_COMM_WORLD);
228 instr_extra_data extra = xbt_new0(s_instr_extra_data_t,1);
229 extra->type=TRACING_SLEEPING;
230 extra->sleep_duration=secs;
231 TRACE_smpi_sleeping_in(rank, extra);
233 simcall_process_sleep(secs);
235 TRACE_smpi_sleeping_out(rank);
/* Simulated sleep of `secs` whole seconds; returns whatever private_sleep() returns. */
241 unsigned int smpi_sleep(unsigned int secs)
243 return private_sleep((double)secs);
/* Simulated sleep of `usecs` microseconds: the value is converted to seconds
 * and handed to private_sleep(), whose result is returned as an int. */
246 int smpi_usleep(useconds_t usecs)
248 return (int)private_sleep((double)usecs / 1000000.0);
/* Fill `tv` from the simulated clock (SIMIX_get_clock()): tv_sec receives the
 * whole seconds, tv_usec the remaining fraction scaled to microseconds.
 * `tz` is not used by the visible statements.
 * NOTE(review): the two tv_usec assignments differ only by their cast; presumably
 * one of them is selected by preprocessor conditionals not visible here -- confirm.
 * The return value is not visible either. */
252 int smpi_gettimeofday(struct timeval *tv, void* tz)
256 now = SIMIX_get_clock();
258 tv->tv_sec = (time_t)now;
260 tv->tv_usec = (useconds_t)((now - tv->tv_sec) * 1e6);
262 tv->tv_usec = (suseconds_t)((now - tv->tv_sec) * 1e6);
/* Number of clock ticks per simulated second used for timestamps: the inverse
 * of sg_surf_precision, truncated to an unsigned long long. */
269 extern double sg_surf_precision;
270 unsigned long long smpi_rastro_resolution (void)
273 double resolution = (1/sg_surf_precision);
275 return (unsigned long long)resolution;
/* Current simulated time expressed in ticks of smpi_rastro_resolution():
 * whole seconds times the resolution, plus the fractional part of the clock
 * scaled by the resolution (truncated when stored in `pre`). */
278 unsigned long long smpi_rastro_timestamp (void)
281 double now = SIMIX_get_clock();
283 unsigned long long sec = (unsigned long long)now;
284 unsigned long long pre = (now - sec) * smpi_rastro_resolution();
286 return (unsigned long long)sec * smpi_rastro_resolution() + pre;
289 /* ****************************** Functions related to the SMPI_SAMPLE_ macros ************************************/
/* Per-location benchmarking state; these are the fields accessed through the
 * local_data_t pointers used by the smpi_sample_* functions below.
 * The "stderr" values are relative: smpi_sample_3() divides the standard error by the mean. */
291 double threshold; /* maximal stderr requested (if positive) */
292 double relstderr; /* observed stderr so far */
293 double mean; /* mean of benched times, to be used if the block is disabled */
294 double sum; /* sum of benched times (to compute the mean and stderr) */
295 double sum_pow2; /* sum of the square of the benched times (to compute the stderr) */
296 int iters; /* amount of requested iterations */
297 int count; /* amount of iterations done so far */
298 int benching; /* 1: we are benchmarking; 0: we have enough data, no bench anymore */
/* Build the key identifying a sampled block in the `samples` dict: either
 * "<file>:<line>", or "<file>:<line>:<process index>".
 * NOTE(review): the condition choosing between the two forms is not visible
 * here; presumably `global` selects the process-independent key -- confirm.
 * The string comes from bprintf(); presumably the caller must free it -- confirm. */
301 static char *sample_location(int global, const char *file, int line) {
303 return bprintf("%s:%d", file, line);
305 return bprintf("%s:%d:%d", file, line, smpi_process_index());
/* Tell whether the block described by `data` has been benchmarked enough.
 *
 * The starting answer is "count has reached iters". When a positive threshold
 * is set, the answer is forced to 0 while the observed relative stderr is
 * still above that threshold (and in one more case marked "not enough data",
 * whose condition is not visible here). The decision is logged at debug level.
 * NOTE(review): the return statement is not visible; callers use the result as
 * a boolean, presumably `res` -- confirm. */
308 static int sample_enough_benchs(local_data_t *data) {
309 int res = data->count >= data->iters;
310 if (data->threshold>0.0) {
312 res = 0; // not enough data
313 if (data->relstderr > data->threshold)
314 res = 0; // stderr too high yet
316 XBT_DEBUG("%s (count:%d iter:%d stderr:%f thres:%f mean:%fs)",
317 (res?"enough benchs":"need more data"),
318 data->count, data->iters, data->relstderr, data->threshold, data->mean);
/* First step of a sampled block: called when the block is entered.
 *
 * Closes the benchmark of the preceding computation, flags the process as
 * sampling, and fetches the bookkeeping entry stored under sample_location().
 *  - For a location seen for the first time, a local_data_t is allocated after
 *    asserting that `iters` or `threshold` is positive; it is initialised with
 *    benching = 1 and registered in `samples`.
 *  - For a known location, a mismatch between the stored and the requested
 *    iters/threshold is reported with XBT_ERROR, and `benching` is recomputed
 *    from sample_enough_benchs().
 * NOTE(review): the conditions separating these cases, part of the field
 * initialisation and the release of `loc` are not visible here -- confirm. */
322 void smpi_sample_1(int global, const char *file, int line, int iters, double threshold)
324 char *loc = sample_location(global, file, line);
327 smpi_bench_end(); /* Take time from previous, unrelated computation into account */
328 smpi_process_set_sampling(1);
331 samples = xbt_dict_new_homogeneous(free);
333 data = xbt_dict_get_or_null(samples, loc);
335 xbt_assert(threshold>0 || iters>0,
336 "You should provide either a positive amount of iterations to bench, or a positive maximal stderr (or both)");
337 data = (local_data_t *) xbt_new(local_data_t, 1);
340 data->sum_pow2 = 0.0;
342 data->threshold = threshold;
343 data->benching = 1; // If we have no data, we need at least one
345 xbt_dict_set(samples, loc, data, NULL);
346 XBT_DEBUG("XXXXX First time ever on benched nest %s.",loc);
348 if (data->iters != iters || data->threshold != threshold) {
349 XBT_ERROR("Asked to bench block %s with different settings %d, %f is not %d, %f. How did you manage to give two numbers at the same line??",
350 loc, data->iters, data->threshold, iters,threshold);
354 // if we already have some data, check whether sample_2 should get one more bench or whether it should emulate the computation instead
355 data->benching = !sample_enough_benchs(data);
356 XBT_DEBUG("XXXX Re-entering the benched nest %s. %s",loc, (data->benching?"more benching needed":"we have enough data, skip computes"));
/* Second step of a sampled block: decides whether the block body runs once more.
 *
 * The entry registered by smpi_sample_1() is looked up (asserting that `samples`
 * exists). While `benching` is 1 a new benchmark is wanted. Otherwise the
 * recorded `mean` duration is injected with smpi_execute(), the sampling flag
 * is cleared and the result is set to 0.
 * NOTE(review): the statements of the benchmarking branch and the return
 * statement are not visible here; presumably a non-zero result asks the caller
 * to run the body -- confirm. */
361 int smpi_sample_2(int global, const char *file, int line)
363 char *loc = sample_location(global, file, line);
367 xbt_assert(samples, "Y U NO use SMPI_SAMPLE_* macros? Stop messing directly with smpi_sample_* functions!");
368 data = xbt_dict_get(samples, loc);
369 XBT_DEBUG("sample2 %s",loc);
372 if (data->benching==1) {
373 // we need to run a new bench
374 XBT_DEBUG("benchmarking: count:%d iter:%d stderr:%f thres:%f; mean:%f",
375 data->count, data->iters, data->relstderr, data->threshold, data->mean);
378 // Enough data, no more bench (either we got enough data from previous visits to this benched nest, or we just ran one bench and need to bail out now that our job is done).
379 // Just sleep instead
380 XBT_DEBUG("No benchmark (either no need, or just ran one): count >= iter (%d >= %d) or stderr<thres (%f<=%f). apply the %fs delay instead",
381 data->count, data->iters, data->relstderr, data->threshold, data->mean);
382 smpi_execute(data->mean);
383 smpi_process_set_sampling(0);
384 res = 0; // prepare to capture future, unrelated computations
/* Third step of a sampled block: called after one benchmarked run of the body.
 *
 * The per-process timer is stopped and its elapsed time becomes the new sample.
 * The statistics are then refreshed:
 *    mean      = sum / n
 *    relstderr = sqrt((sum_pow2 / n - mean^2) / n) / mean
 * While more benchmarks are still needed, `mean` is overwritten with the last
 * sample so that smpi_sample_2() replays exactly this occurrence.
 * NOTE(review): the updates of `sum`, `count` and `benching`, and the handling
 * of the `benching == 0` case, are not visible here -- confirm.
 * NOTE(review): relstderr divides by `mean`; a sample sum of 0 would give a
 * non-finite value -- confirm whether this can happen. */
391 void smpi_sample_3(int global, const char *file, int line)
393 char *loc = sample_location(global, file, line);
396 xbt_assert(samples, "Y U NO use SMPI_SAMPLE_* macros? Stop messing directly with smpi_sample_* functions!");
397 data = xbt_dict_get(samples, loc);
398 XBT_DEBUG("sample3 %s",loc);
401 if (data->benching==0) {
405 // ok, benchmarking this loop is over
406 xbt_os_threadtimer_stop(smpi_process_timer());
411 sample = xbt_os_timer_elapsed(smpi_process_timer());
413 data->sum_pow2 += sample * sample;
414 n = (double)data->count;
415 data->mean = data->sum / n;
416 data->relstderr = sqrt((data->sum_pow2 / n - data->mean * data->mean) / n) / data->mean;
417 if (!sample_enough_benchs(data)) {
418 data->mean = sample; // Still in benching process; We want sample_2 to simulate the exact time of this loop occurrence before leaving, not the mean over the history
420 XBT_DEBUG("Average mean after %d steps is %f, relative standard error is %f (sample was %f)", data->count,
421 data->mean, data->relstderr, sample);
423 // That's enough for now, prevent sample_2 from running the same code over and over
/* Value destructor installed on the `allocs` dict by smpi_shared_malloc();
 * `p` is the shared_data_t stored for one allocation site.
 * NOTE(review): the cleanup statements themselves are not visible here. */
428 static void smpi_shared_alloc_free(void *p)
430 shared_data_t *data = p;
/* Replace the allocation-site string `loc` by a short name derived from a hash.
 *
 * `loc` is reallocated to 30 bytes. The 40-character hexadecimal `hash` is then
 * consumed 6 hex digits (24 bits) at a time, each group producing 4 characters
 * taken from a 64-entry alphabet and written from loc[1] onwards.
 * NOTE(review): how `hash` is computed, what goes into loc[0], the terminating
 * '\0' and the return value are not visible here -- confirm.
 * NOTE(review): the alphabet literal contains "UVZXYZ" / "uvzxyz", so 'W' and 'w'
 * never appear while 'Z' and 'z' each stand for two values; this is not the
 * standard base64url alphabet.
 * NOTE(review): the shift is (18 - 3 * j), i.e. 18, 15, 12, 9, so consecutive
 * 6-bit windows overlap and the low 9 bits of `val` are never used; a regular
 * base64 split would shift by (18 - 6 * j). Fixing either point changes the
 * generated names, so confirm nothing depends on them first.
 * NOTE(review): the last group starts at i = 36 and memcpy() reads hash[36..41];
 * confirm `hash` is large enough and that `s` is NUL-terminated before strtoul(). */
435 static char *smpi_shared_alloc_hash(char *loc)
445 loc = xbt_realloc(loc, 30);
447 for (i = 0; i < 40; i += 6) { /* base64 encode */
448 memcpy(s, hash + i, 6);
449 val = strtoul(s, NULL, 16);
450 for (j = 0; j < 4; j++) {
451 unsigned char x = (val >> (18 - 3 * j)) & 0x3f;
452 loc[1 + 4 * i / 6 + j] =
453 "ABCDEFGHIJKLMNOPQRSTUVZXYZabcdefghijklmnopqrstuvzxyz0123456789-_"[x];
/* Allocate `size` bytes for the allocation site (`file`, `line`).
 *
 * With option "smpi/use_shared_malloc" set, the site is identified by
 * "<pid>_<file>_<line>", hashed by smpi_shared_alloc_hash(), and looked up in
 * `allocs`. The visible statements show two paths: one creates a POSIX shared
 * memory object with shm_open(O_CREAT | O_EXCL) (dying through xbt_die() on
 * failure), allocates its shared_data_t, maps it with shm_map(), unlinks the
 * name right away and registers it in `allocs`; the other maps the descriptor
 * already recorded in the existing entry. Without the option, the memory comes
 * from xbt_malloc().
 * NOTE(review): the conditions selecting each path, the initialisation of the
 * shared_data_t fields, the release of `loc` and the return statement are not
 * visible here; presumably `mem` is returned -- confirm. */
460 void *smpi_shared_malloc(size_t size, const char *file, int line)
463 if (sg_cfg_get_boolean("smpi/use_shared_malloc")){
464 char *loc = bprintf("%zu_%s_%d", (size_t)getpid(), file, line);
467 loc = smpi_shared_alloc_hash(loc); /* hash loc, in order to have something
470 allocs = xbt_dict_new_homogeneous(smpi_shared_alloc_free);
472 data = xbt_dict_get_or_null(allocs, loc);
474 fd = shm_open(loc, O_RDWR | O_CREAT | O_EXCL,
475 S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
479 xbt_die("Please cleanup /dev/shm/%s", loc);
481 xbt_die("An unhandled error occured while opening %s. shm_open: %s", loc, strerror(errno));
484 data = xbt_new(shared_data_t, 1);
488 mem = shm_map(fd, size, data);
489 if (shm_unlink(loc) < 0) {
490 XBT_WARN("Could not early unlink %s. shm_unlink: %s", loc, strerror(errno));
492 xbt_dict_set(allocs, loc, data, NULL);
493 XBT_DEBUG("Mapping %s at %p through %d", loc, mem, fd);
496 mem = shm_map(data->fd, size, data);
499 XBT_DEBUG("Shared malloc %zu in %p (metadata at %p)", size, mem, data);
501 mem = xbt_malloc(size);
502 XBT_DEBUG("Classic malloc %zu in %p", size, mem);
/* Release memory obtained from smpi_shared_malloc().
 *
 * With option "smpi/use_shared_malloc" set, the metadata of the mapping is
 * looked up in `allocs_metadata` under `ptr` printed with "%p". Each
 * inconsistency (nothing allocated, no metadata dict, unknown pointer, broken
 * metadata link) only produces a warning. The mapping is then removed with
 * munmap(meta->size), and once the entry's count has dropped to 0 or below the
 * entry is removed from `allocs`. Without the option, the "Classic free" path
 * is taken.
 * NOTE(review): the tests guarding the warnings, the count update, the removal
 * of the metadata entry and the actual free of the classic path are not
 * visible here -- confirm. */
507 void smpi_shared_free(void *ptr)
509 char loc[PTR_STRLEN];
510 shared_metadata_t* meta;
512 if (sg_cfg_get_boolean("smpi/use_shared_malloc")){
515 XBT_WARN("Cannot free: nothing was allocated");
518 if(!allocs_metadata) {
519 XBT_WARN("Cannot free: no metadata was allocated");
521 snprintf(loc, PTR_STRLEN, "%p", ptr);
522 meta = (shared_metadata_t*)xbt_dict_get_or_null(allocs_metadata, loc);
524 XBT_WARN("Cannot free: %p was not shared-allocated by SMPI", ptr);
529 XBT_WARN("Cannot free: something is broken in the metadata link");
532 if(munmap(ptr, meta->size) < 0) {
533 XBT_WARN("Unmapping of fd %d failed: %s", data->fd, strerror(errno));
536 XBT_DEBUG("Shared free - no removal - of %p, count = %d", ptr, data->count);
537 if (data->count <= 0) {
539 xbt_dict_remove(allocs, data->loc);
540 XBT_DEBUG("Shared free - with removal - of %p", ptr);
543 XBT_DEBUG("Classic free of %p", ptr);
/* Tell whether a result was registered for the pair (`func`, `input`).
 *
 * The pair is looked up in `calls` under the key "<func>:<input>" with
 * xbt_dict_get(), which throws when the key is absent; only exceptions whose
 * category is not_found_error are treated as "unknown call".
 * NOTE(review): the try/catch scaffolding, the handling of other exceptions,
 * the release of `loc` and the returned values are not visible here; presumably
 * non-zero means "known" -- confirm. */
549 int smpi_shared_known_call(const char* func, const char* input)
551 char* loc = bprintf("%s:%s", func, input);
556 calls = xbt_dict_new_homogeneous(NULL);
559 xbt_dict_get(calls, loc); /* Succeed or throw */
566 if (ex.category != not_found_error)
573 void* smpi_shared_get_call(const char* func, const char* input) {
573 void* smpi_shared_get_call(const char* func, const char* input) {
/* Body of smpi_shared_get_call(): fetch the data registered under the key
 * "<func>:<input>" in `calls`. xbt_dict_get() is used, so an unknown key is
 * not handled in the visible statements.
 * NOTE(review): the release of `loc` and the return statement are not visible
 * here; presumably `data` is returned -- confirm. */
574 char* loc = bprintf("%s:%s", func, input);
578 calls = xbt_dict_new_homogeneous(NULL);
580 data = xbt_dict_get(calls, loc);
/* Register `data` for the pair (`func`, `input`) in `calls`, under the key
 * "<func>:<input>". The dict is created with a NULL destructor, so it does not
 * release the stored values.
 * NOTE(review): the release of `loc` and the return statement are not visible
 * here; presumably `data` is handed back to the caller -- confirm. */
585 void* smpi_shared_set_call(const char* func, const char* input, void* data) {
586 char* loc = bprintf("%s:%s", func, input);
589 calls = xbt_dict_new_homogeneous(NULL);
591 xbt_dict_set(calls, loc, data, NULL);
/* Round `addr` down to a multiple of xbt_pagesize.
 * NOTE(review): the address goes through `unsigned long`, which is narrower than
 * a pointer on LLP64 platforms -- confirm this is only built where it fits. */
599 #define TOPAGE(addr) (void *)(((unsigned long)(addr) / xbt_pagesize) * xbt_pagesize)
602 /** Map a given SMPI privatization segment (make a SMPI process active)
/* Switch to the privatisation segment of process `dest`, unless
 * smpi_loaded_page says it is already the loaded one; the actual work is done
 * by smpi_really_switch_data_segment().
 * NOTE(review): the statement executed when the segment is already loaded is
 * not visible here; presumably an early return -- confirm. */
604 void smpi_switch_data_segment(int dest){
606 if (smpi_loaded_page==dest)//no need to switch either
610 smpi_really_switch_data_segment(dest);
613 /** Map a given SMPI privatization segment (make a SMPI process active)
614 * even if SMPI thinks it is already active
616 * When doing a state restoration, the state of the restored variables
617 * might not be consistent with the state of the virtual memory.
618 * In this case, we have to change the data segment.
/* Unconditionally map the privatisation region of process `dest` over the
 * executable's data segment.
 *
 * Nothing is to be switched when size_data_exe is 0. On the very first switch
 * (smpi_loaded_page == -1) the current content of the data segment is copied
 * into the region of every process. The file backing the region of `dest` is
 * then mapped with MAP_FIXED | MAP_SHARED at the page-aligned start of the data
 * segment; the program dies through xbt_die() if the mapping does not land at
 * that address. smpi_loaded_page is finally updated to `dest`.
 * NOTE(review): the copy loop iterates over SIMIX_process_count() while the
 * region array is sized with smpi_process_count() at initialisation -- confirm
 * that both counts always agree. */
620 void smpi_really_switch_data_segment(int dest) {
622 if(size_data_exe == 0)//no need to switch
627 if(smpi_loaded_page==-1){//initial switch, do the copy from the real page here
628 for (i=0; i< SIMIX_process_count(); i++){
629 memcpy(smpi_privatisation_regions[i].address,TOPAGE(start_data_exe),size_data_exe);
633 int current = smpi_privatisation_regions[dest].file_descriptor;
634 XBT_DEBUG("Switching data frame to the one of process %d", dest);
635 void* tmp = mmap (TOPAGE(start_data_exe), size_data_exe, PROT_READ | PROT_WRITE, MAP_FIXED | MAP_SHARED, current, 0);
636 if (tmp != TOPAGE(start_data_exe))
637 xbt_die("Couldn't map the new region");
638 smpi_loaded_page=dest;
/* Return non-zero when `file` starts with "/dev/shm/my-buffer-" (19 characters),
 * the prefix of the mkstemp() template used by
 * smpi_initialize_global_memory_segments(). */
642 int smpi_is_privatisation_file(char* file)
644 return strncmp("/dev/shm/my-buffer-", file, 19) == 0;
/* Compute start_data_exe and size_data_exe from the section headers of the
 * running binary.
 *
 * The output of "objdump --section-headers <xbt_binary_name>" is read line by
 * line through popen(). Each line is split on spaces into up to 7 fields;
 * header lines ("Sections:", "Idx", or lines starting with the binary name)
 * are recognised explicitly. For ".data", the size (field 2) and the address
 * (field 4) are recorded; for ".bss", the size is enlarged by the gap between
 * the end of .data and the start of .bss. size_data_exe finally covers the
 * offset of start_data_exe within its page plus both sizes.
 * NOTE(review): xbt_binary_name is pasted unquoted into a shell command; a
 * path containing spaces or shell metacharacters would break it -- confirm.
 * NOTE(review): strtoul() results are stored in `int` variables; sections of
 * 2 GiB or more would not fit -- confirm.
 * NOTE(review): the failure path after popen(), the `found` bookkeeping and
 * the final cleanup (pclose, freeing `line` and `command`) are not visible here. */
647 void smpi_get_executable_global_size(){
648 int size_bss_binary=0;
649 int size_data_binary=0;
651 char *line = NULL; /* Temporary storage for each line that is read */
652 ssize_t read; /* Number of bytes read */
653 size_t n = 0; /* Amount of bytes to read by xbt_getline */
658 char *command = bprintf("objdump --section-headers %s", xbt_binary_name);
660 fp = popen(command, "r");
663 perror("popen failed");
667 while ((read = xbt_getline(&line, &n, fp)) != -1 && found != 2) {
672 /* Wipeout the new line character */
673 line[read - 1] = '\0';
675 lfields[0] = strtok(line, " ");
677 if(lfields[0] == NULL)
680 if(strcmp(lfields[0], "Sections:") == 0
681 || strcmp(lfields[0], "Idx") == 0
682 || strncmp(lfields[0], xbt_binary_name, strlen(xbt_binary_name)) == 0)
685 for (i = 1; i < 7 && lfields[i - 1] != NULL; i++) {
686 lfields[i] = strtok(NULL, " ");
690 * we are looking for these fields
691 23 .data 02625a20 00000000006013e0 00000000006013e0 000013e0 2**5
692 CONTENTS, ALLOC, LOAD, DATA
693 24 .bss 02625a40 0000000002c26e00 0000000002c26e00 02626e00 2**5
698 if(strcmp(lfields[1], ".data") == 0){
699 size_data_binary = strtoul(lfields[2], NULL, 16);
700 start_data_exe = (char*) strtoul(lfields[4], NULL, 16);
702 }else if(strcmp(lfields[1], ".bss") == 0){
703 //the beginning of bss is not exactly the end of data if not aligned, grow bss reported size accordingly
704 //TODO : check if this is OK, as some segments may be inserted between them..
705 size_bss_binary = ((char*) strtoul(lfields[4], NULL, 16) - (start_data_exe + size_data_binary))
706 + strtoul(lfields[2], NULL, 16);
714 size_data_exe =(unsigned long)start_data_exe - (unsigned long)TOPAGE(start_data_exe)+ size_data_binary+size_bss_binary;
/* Create one private copy of the executable's global data per simulated process.
 *
 * The size and start of the data are obtained from
 * smpi_get_executable_global_size(); a size of 0 turns privatisation off.
 * Otherwise an array of regions is allocated and, for each process, a temporary
 * file is created under /dev/shm with mkstemp(), unlinked right away, resized
 * with ftruncate() to size_data_exe, mapped MAP_SHARED at an address chosen by
 * the kernel and filled with the current content of the data segment. The file
 * descriptor and address are stored in smpi_privatisation_regions[i]. Every
 * failure aborts through xbt_die(); the long message explains the EMFILE case.
 * NOTE(review): the array is sized with smpi_process_count() but the loop runs
 * up to SIMIX_process_count() -- confirm both counts always agree.
 * NOTE(review): the array comes from malloc() while
 * smpi_destroy_global_memory_segments() releases it with xbt_free(); no check
 * of the malloc() result is visible here -- confirm.
 * NOTE(review): the first assignment of smpi_privatize_global_variables is
 * presumably guarded by a preprocessor conditional not visible here -- confirm. */
721 void smpi_initialize_global_memory_segments(){
724 smpi_privatize_global_variables=0;
729 smpi_get_executable_global_size();
731 XBT_DEBUG ("bss+data segment found : size %d starting at %p",size_data_exe, start_data_exe );
733 if(size_data_exe == 0){//no need to switch
734 smpi_privatize_global_variables=0;
738 smpi_privatisation_regions = (smpi_privatisation_region_t) malloc(
739 smpi_process_count() * sizeof(struct s_smpi_privatisation_region));
741 for (i=0; i< SIMIX_process_count(); i++){
742 //create SIMIX_process_count() mappings of this size with the same data inside
743 void *address = NULL;
744 char path[] = "/dev/shm/my-buffer-XXXXXX";
747 int file_descriptor= mkstemp (path);
748 if (file_descriptor < 0) {
750 xbt_die("Impossible to create temporary file for memory mapping: %s\n\
751 The open() system call failed with the EMFILE error code (too many files). \n\n\
752 This means that you reached the system limits concerning the amount of files per process. \
753 This is not a surprise if you are trying to virtualize many processes on top of SMPI. \
754 Don't panic -- you should simply increase your system limits and try again. \n\n\
755 First, check what your limits are:\n\
756 cat /proc/sys/fs/file-max # Gives you the system-wide limit\n\
757 ulimit -Hn # Gives you the per process hard limit\n\
758 ulimit -Sn # Gives you the per process soft limit\n\
759 cat /proc/self/limits # Displays any per-process limitation (including the one given above)\n\n\
760 If one of these values is less than the amount of MPI processes that you try to run, then you got the explanation of this error. \
761 Ask the Internet about tutorials on how to increase the files limit such as: https://rtcamp.com/tutorials/linux/increase-open-files-limit/",
764 xbt_die("Impossible to create temporary file for memory mapping: %s",
768 status = unlink (path);
770 xbt_die("Impossible to unlink temporary file for memory mapping");
772 status = ftruncate(file_descriptor, size_data_exe);
774 xbt_die("Impossible to set the size of the temporary file for memory mapping");
776 /* Ask for a free region */
777 address = mmap (NULL, size_data_exe, PROT_READ | PROT_WRITE, MAP_SHARED, file_descriptor, 0);
778 if (address == MAP_FAILED)
779 xbt_die("Couldn't find a free region for memory mapping");
781 //initialize the values
782 memcpy(address,TOPAGE(start_data_exe),size_data_exe);
784 //store the address of the mapping for further switches
785 smpi_privatisation_regions[i].file_descriptor = file_descriptor;
786 smpi_privatisation_regions[i].address = address;
/* Undo smpi_initialize_global_memory_segments(): for every process, unmap its
 * private region (a failure only produces a warning) and close the backing
 * file descriptor, then release the region array. Nothing is to be released
 * when size_data_exe is 0.
 * NOTE(review): the statement executed in the size_data_exe == 0 case is not
 * visible here; presumably an early return -- confirm. */
793 void smpi_destroy_global_memory_segments(){
794 if(size_data_exe == 0)//no need to switch
798 for (i=0; i< smpi_process_count(); i++){
799 if(munmap(smpi_privatisation_regions[i].address,size_data_exe) < 0) {
800 XBT_WARN("Unmapping of fd %d failed: %s",
801 smpi_privatisation_regions[i].file_descriptor, strerror(errno));
803 close(smpi_privatisation_regions[i].file_descriptor);
805 xbt_free(smpi_privatisation_regions);