1 /* Copyright (c) 2007, 2009-2014. The SimGrid Team.
2 * All rights reserved. */
4 /* This program is free software; you can redistribute it and/or modify it
5 * under the terms of the license (GNU LGPL) which comes with this package. */
7 #include "internal_config.h"
10 #include "xbt/sysdep.h"
13 #include "surf/surf.h"
14 #include "simgrid/sg_config.h"
15 #include "simgrid/modelchecker.h"
21 #include <sys/types.h>
24 #include <math.h> // sqrt
30 #define MAP_ANONYMOUS MAP_ANON
33 XBT_LOG_NEW_DEFAULT_SUBCATEGORY(smpi_bench, smpi,
34 "Logging specific to SMPI (benchmarking)");
36 /* Shared allocations are handled through shared memory segments.
37 * Associated data and metadata are used as follows:
40 * `allocs' dict ---- -.
41 * ---------- shared_data_t shared_metadata_t / | | |
42 * .->| <name> | ---> -------------------- <--. ----------------- | | | |
43 * | ---------- | fd of <name> | | | size of mmap | --| | | |
44 * | | count (2) | |-- | data | \ | | |
45 * `----------------- | <name> | | ----------------- ---- |
46 * -------------------- | ^ |
48 * | | `allocs_metadata' dict |
49 * | | ---------------------- |
50 * | `-- | <addr of mmap #1> |<-'
51 * | .-- | <addr of mmap #2> |<-.
52 * | | ---------------------- |
58 * | shared_metadata_t / | |
59 * | ----------------- | | |
60 * | | size of mmap | --| | |
62 * ----------------- | | |
/* Size of a buffer able to hold a pointer printed with %p; presumably a two-character 0x prefix, two hex digits per byte and the terminator. */
67 #define PTR_STRLEN (2 + 2 * sizeof(void*) + 1)
/* Shared allocations registered by smpi_shared_malloc, keyed by the hashed allocation site. */
69 xbt_dict_t allocs = NULL; /* Allocated on first use */
/* One record per mapping created by shm_map, keyed by the mapping address printed with %p. */
70 xbt_dict_t allocs_metadata = NULL; /* Allocated on first use */
/* Benchmark statistics of the SMPI_SAMPLE_ blocks, keyed by the string built in sample_location. */
71 xbt_dict_t samples = NULL; /* Allocated on first use */
/* Values stored by smpi_shared_set_call, keyed by func:input. */
72 xbt_dict_t calls = NULL; /* Allocated on first use */
/* smpi_execute only simulates durations that are at least this large. */
74 double smpi_cpu_threshold;
/* Factor by which smpi_execute multiplies a duration to obtain an amount of flops. */
75 double smpi_running_power;
/* Index of the privatisation region last mapped by smpi_really_switch_data_segment; -1 until the first switch. */
77 int smpi_loaded_page = -1;
/* Start address of the .data section, as parsed from the objdump output. */
78 char* smpi_start_data_exe = NULL;
/* Number of bytes to privatise (page-alignment slack + .data + .bss); 0 means there is nothing to switch. */
79 int smpi_size_data_exe = 0;
/* Privatisation flag; smpi_initialize_global_memory_segments clears it, e.g. when no data segment was found. */
80 int smpi_privatize_global_variables;
/* Accumulated elapsed time of the per-process timer, added up by smpi_bench_end. */
81 double smpi_total_benched_time = 0;
/* Array of privatisation regions (mapping address and backing file descriptor), one entry per process. */
82 smpi_privatisation_region_t smpi_privatisation_regions;
/* Returns the size in bytes of the file behind descriptor fd, as reported by fstat.
 * Terminates through xbt_die, with the errno text, when fstat fails. */
95 static size_t shm_size(int fd) {
98 if(fstat(fd, &st) < 0) {
99 xbt_die("Could not stat fd %d: %s", fd, strerror(errno));
/* st_size is an off_t; it is converted to size_t for the caller. */
101 return (size_t)st.st_size;
/* Maps size bytes of the file behind fd as shared read/write memory and records the mapping in allocs_metadata.
 *
 * fd:   descriptor of the backing file; it is grown with ftruncate when it is smaller than size.
 * size: number of bytes to map, starting at offset 0.
 * data: shared allocation descriptor this mapping belongs to (TODO confirm how it is linked to the metadata record).
 *
 * Any failure of ftruncate or mmap terminates the program through xbt_die. */
105 static void* shm_map(int fd, size_t size, shared_data_t* data) {
/* Textual form of the mapping address, used as dictionary key. */
107 char loc[PTR_STRLEN];
108 shared_metadata_t* meta;
/* Only grow the backing file, never shrink it. */
110 if(size > shm_size(fd)) {
111 if(ftruncate(fd, (off_t)size) < 0) {
112 xbt_die("Could not truncate fd %d to %zu: %s", fd, size, strerror(errno));
/* The kernel chooses the address (NULL hint); MAP_SHARED makes writes visible through every mapping of fd. */
116 mem = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
117 if(mem == MAP_FAILED) {
118 xbt_die("Could not map fd %d: %s", fd, strerror(errno));
/* The metadata dictionary is created lazily; its entries are released with xbt_free_f. */
120 if(!allocs_metadata) {
121 allocs_metadata = xbt_dict_new_homogeneous(xbt_free_f);
/* smpi_shared_free rebuilds the same key from the pointer it is given. */
123 snprintf(loc, PTR_STRLEN, "%p", mem);
124 meta = xbt_new(shared_metadata_t, 1);
127 xbt_dict_set(allocs_metadata, loc, meta, NULL);
128 XBT_DEBUG("MMAP %zu to %p", size, mem);
133 void smpi_bench_destroy(void)
135 xbt_dict_free(&allocs);
136 xbt_dict_free(&allocs_metadata);
137 xbt_dict_free(&samples);
138 xbt_dict_free(&calls);
141 XBT_PUBLIC(void) smpi_execute_flops_(double *flops);
142 void smpi_execute_flops_(double *flops)
144 smpi_execute_flops(*flops);
147 XBT_PUBLIC(void) smpi_execute_(double *duration);
148 void smpi_execute_(double *duration)
150 smpi_execute(*duration);
/* Simulates the execution of the given amount of flops on the host of the calling process
 * and blocks until that simulated execution has completed. */
153 void smpi_execute_flops(double flops) {
154 smx_synchro_t action;
/* The execution is placed on the host the caller is running on. */
156 host = SIMIX_host_self();
157 XBT_DEBUG("Handle real computation time: %f flops", flops);
158 action = simcall_host_execute("computation", host, flops, 1, 0, 0);
/* Tag the action with the current SMPI tracing category. */
160 simcall_set_category (action, TRACE_internal_smpi_get_category());
162 simcall_host_execution_wait(action);
/* Map the data segment of the calling process again once the wait is over (presumably other processes ran meanwhile -- confirm). */
163 smpi_switch_data_segment(smpi_process_index());
/* Simulates a computation that took duration seconds on the real machine.
 * Durations of at least smpi_cpu_threshold are converted to flops and executed in the simulator;
 * the remaining debug message indicates that shorter durations are ignored. */
166 void smpi_execute(double duration)
168 if (duration >= smpi_cpu_threshold) {
169 XBT_DEBUG("Sleep for %g to handle real computation time", duration);
/* smpi_running_power is the conversion factor from seconds to flops. */
170 double flops = duration * smpi_running_power;
/* Tracing: open a computing state for this process, carrying the amount of flops. */
172 int rank = smpi_process_index();
173 instr_extra_data extra = xbt_new0(s_instr_extra_data_t,1);
174 extra->type=TRACING_COMPUTING;
175 extra->comp_size=flops;
176 TRACE_smpi_computing_in(rank, extra);
178 smpi_execute_flops(flops);
/* Tracing: close the computing state opened above. */
181 TRACE_smpi_computing_out(rank);
185 XBT_DEBUG("Real computation took %g while option smpi/cpu_threshold is set to %g => ignore it",
186 duration, smpi_cpu_threshold);
/* Forward declaration: the definition comes later in this file. */
190 void smpi_switch_data_segment(int dest);
/* Starts timing the user code of the calling process.
 * First maps the data segment of that process, then starts its per-process thread timer. */
192 void smpi_bench_begin(void)
194 smpi_switch_data_segment(smpi_process_index());
/* NOTE(review): presumably leaves without starting the timer when the model checker or a record replay is active -- confirm. */
196 if (MC_is_active() || MC_record_replay_is_active())
199 xbt_os_threadtimer_start(smpi_process_timer());
/* Stops the per-process timer, injects the measured duration in the simulation and accounts for it.
 * Terminates through xbt_die when called while the process is inside a sampling block. */
202 void smpi_bench_end(void)
/* NOTE(review): presumably leaves immediately when the model checker or a record replay is active -- confirm. */
205 if (MC_is_active() || MC_record_replay_is_active())
208 xbt_os_timer_t timer = smpi_process_timer();
209 xbt_os_threadtimer_stop(timer);
210 // smpi_switch_data_segment(smpi_process_count());
/* The sampling flag is raised by smpi_sample_1 and lowered by smpi_sample_2, so getting here with it set means nested benchmarking. */
211 if (smpi_process_get_sampling()) {
212 XBT_CRITICAL("Cannot do recursive benchmarks.");
213 XBT_CRITICAL("Are you trying to make a call to MPI within a SMPI_SAMPLE_ block?");
214 xbt_backtrace_display_current();
215 xbt_die("Aborting.");
217 // Simulate the benchmarked computation unless disabled via command-line argument
218 if (sg_cfg_get_boolean("smpi/simulate_computation")) {
219 smpi_execute(xbt_os_timer_elapsed(timer));
/* Keep a running total of everything that was benched. */
222 smpi_total_benched_time += xbt_os_timer_elapsed(timer);
225 /* Private sleep function used by smpi_sleep() and smpi_usleep().
 * Puts the calling process to sleep for secs seconds of simulated time and
 * reports the sleep to the tracing layer. */
226 static unsigned int private_sleep(double secs)
230 XBT_DEBUG("Sleep for: %lf secs", secs);
/* Tracing: open a sleeping state for this rank, carrying the requested duration. */
232 int rank = smpi_comm_rank(MPI_COMM_WORLD);
233 instr_extra_data extra = xbt_new0(s_instr_extra_data_t,1);
234 extra->type=TRACING_SLEEPING;
235 extra->sleep_duration=secs;
236 TRACE_smpi_sleeping_in(rank, extra);
/* The sleep itself happens in simulated time. */
238 simcall_process_sleep(secs);
/* Tracing: close the sleeping state opened above. */
240 TRACE_smpi_sleeping_out(rank);
/* Sleep for a whole number of seconds: converts secs to a double and
 * delegates to private_sleep(), whose result is returned unchanged. */
unsigned int smpi_sleep(unsigned int secs)
{
  const double duration = (double)secs;

  return private_sleep(duration);
}
252 int smpi_usleep(useconds_t usecs)
254 return (int)private_sleep((double)usecs / 1000000.0);
/* gettimeofday replacement that fills tv from the simulated clock.
 * tv: receives the clock split into whole seconds and microseconds.
 * tz: not referenced by the statements below. */
258 int smpi_gettimeofday(struct timeval *tv, void* tz)
262 now = SIMIX_get_clock();
/* Whole seconds: the fractional part is dropped by the conversion. */
264 tv->tv_sec = (time_t)now;
/* Two variants of the same assignment that differ only in the cast type; presumably selected per platform -- confirm. */
266 tv->tv_usec = (useconds_t)((now - tv->tv_sec) * 1e6);
268 tv->tv_usec = (suseconds_t)((now - tv->tv_sec) * 1e6);
/* Precision setting defined elsewhere; its inverse is used as the number of ticks per second. */
275 extern double sg_surf_precision;
/* Returns the timestamp resolution, i.e. 1/sg_surf_precision converted to an integer. */
276 unsigned long long smpi_rastro_resolution (void)
279 double resolution = (1/sg_surf_precision);
/* The conversion drops any fractional part. */
281 return (unsigned long long)resolution;
/* Returns the simulated clock expressed in ticks of smpi_rastro_resolution(). */
284 unsigned long long smpi_rastro_timestamp (void)
287 double now = SIMIX_get_clock();
/* Split the clock into whole seconds and the sub-second remainder scaled to ticks. */
289 unsigned long long sec = (unsigned long long)now;
290 unsigned long long pre = (now - sec) * smpi_rastro_resolution();
/* Whole seconds in ticks plus the sub-second ticks. */
292 return (unsigned long long)sec * smpi_rastro_resolution() + pre;
295 /* ****************************** Functions related to the SMPI_SAMPLE_ macros ************************************/
/* Fields of the per-location sampling record (local_data_t) handled by the smpi_sample_* functions. */
297 double threshold; /* maximal relative stderr requested; only enforced when positive */
298 double relstderr; /* relative standard error observed so far (updated by smpi_sample_3) */
299 double mean; /* mean of benched times, to be used if the block is disabled */
300 double sum; /* sum of benched times (to compute the mean and stderr) */
301 double sum_pow2; /* sum of the square of the benched times (to compute the stderr) */
302 int iters; /* amount of requested iterations */
303 int count; /* amount of iterations done so far */
304 int benching; /* 1: we are benchmarking; 0: we have enough data, no bench anymore */
/* Builds the dictionary key that identifies a sampling block.
 * Returns a string allocated by bprintf, either file:line or file:line:process-index;
 * presumably the shorter form is used when global is set, so that all processes share one record -- confirm. */
307 static char *sample_location(int global, const char *file, int line) {
309 return bprintf("%s:%d", file, line);
311 return bprintf("%s:%d:%d", file, line, smpi_process_index());
/* Decides whether the record holds enough measurements to stop benchmarking.
 * Starts from the iteration criterion (count >= iters) and, when a positive threshold
 * was requested, additionally refuses while the relative stderr is above it. */
314 static int sample_enough_benchs(local_data_t *data) {
315 int res = data->count >= data->iters;
316 if (data->threshold>0.0) {
/* NOTE(review): the condition guarding the next line is not shown; presumably a minimum number of samples -- confirm. */
318 res = 0; // not enough data
319 if (data->relstderr > data->threshold)
320 res = 0; // stderr too high yet
322 XBT_DEBUG("%s (count:%d iter:%d stderr:%f thres:%f mean:%fs)",
323 (res?"enough benchs":"need more data"),
324 data->count, data->iters, data->relstderr, data->threshold, data->mean);
/* First step of a sampling block: closes the current benchmark, flags the process as sampling,
 * and creates or refreshes the record attached to this location.
 *
 * global, file, line: identify the block (see sample_location).
 * iters:     requested amount of iterations to bench.
 * threshold: requested maximal relative stderr.
 * At least one of iters and threshold must be positive (asserted when the record is created). */
328 void smpi_sample_1(int global, const char *file, int line, int iters, double threshold)
330 char *loc = sample_location(global, file, line);
333 smpi_bench_end(); /* Take time from previous, unrelated computation into account */
334 smpi_process_set_sampling(1);
/* Records stored in this dictionary are released with free. */
337 samples = xbt_dict_new_homogeneous(free);
339 data = xbt_dict_get_or_null(samples, loc);
/* Record creation and initialisation (presumably only when the lookup above found nothing -- confirm). */
341 xbt_assert(threshold>0 || iters>0,
342 "You should provide either a positive amount of iterations to bench, or a positive maximal stderr (or both)");
343 data = (local_data_t *) xbt_new(local_data_t, 1);
346 data->sum_pow2 = 0.0;
348 data->threshold = threshold;
349 data->benching = 1; // If we have no data, we need at least one
351 xbt_dict_set(samples, loc, data, NULL);
352 XBT_DEBUG("XXXXX First time ever on benched nest %s.",loc);
/* Existing record: the settings must be the ones it was created with. */
354 if (data->iters != iters || data->threshold != threshold) {
355 XBT_ERROR("Asked to bench block %s with different settings %d, %f is not %d, %f. How did you manage to give two numbers at the same line??",
356 loc, data->iters, data->threshold, iters,threshold);
360 // if we already have some data, check whether sample_2 should get one more bench or whether it should emulate the computation instead
361 data->benching = !sample_enough_benchs(data);
362 XBT_DEBUG("XXXX Re-entering the benched nest %s. %s",loc, (data->benching?"more benching needed":"we have enough data, skip computes"));
/* Second step of a sampling block: tells the caller whether the block body must be run once more.
 * When the record is still in benching mode a new measurement is due; otherwise the recorded mean
 * is injected through smpi_execute, the sampling flag is lowered and the result is set to 0.
 * The samples dictionary and the record for this location must already exist (see smpi_sample_1). */
367 int smpi_sample_2(int global, const char *file, int line)
369 char *loc = sample_location(global, file, line);
373 xbt_assert(samples, "Y U NO use SMPI_SAMPLE_* macros? Stop messing directly with smpi_sample_* functions!");
/* Unlike the lookup in smpi_sample_1, this one is the plain xbt_dict_get, not the or_null variant. */
374 data = xbt_dict_get(samples, loc);
375 XBT_DEBUG("sample2 %s",loc);
378 if (data->benching==1) {
379 // we need to run a new bench
380 XBT_DEBUG("benchmarking: count:%d iter:%d stderr:%f thres:%f; mean:%f",
381 data->count, data->iters, data->relstderr, data->threshold, data->mean);
384 // Enough data, no more bench (either we got enough data from previous visits to this benched nest, or we just ran one bench and need to bail out now that our job is done).
385 // Just sleep instead
386 XBT_DEBUG("No benchmark (either no need, or just ran one): count >= iter (%d >= %d) or stderr<thres (%f<=%f). apply the %fs delay instead",
387 data->count, data->iters, data->relstderr, data->threshold, data->mean);
/* data->mean holds either the historical mean or the last sample, as decided in smpi_sample_3. */
388 smpi_execute(data->mean);
389 smpi_process_set_sampling(0);
390 res = 0; // prepare to capture future, unrelated computations
/* Third step of a sampling block: stops the timer after one run of the block body and folds
 * the measured duration into the statistics of the record attached to this location.
 * The samples dictionary and the record must already exist (see smpi_sample_1). */
397 void smpi_sample_3(int global, const char *file, int line)
399 char *loc = sample_location(global, file, line);
402 xbt_assert(samples, "Y U NO use SMPI_SAMPLE_* macros? Stop messing directly with smpi_sample_* functions!");
403 data = xbt_dict_get(samples, loc);
404 XBT_DEBUG("sample3 %s",loc);
/* NOTE(review): the handling of a record that is not in benching mode is not shown; presumably an error -- confirm. */
407 if (data->benching==0) {
411 // ok, benchmarking this loop is over
412 xbt_os_threadtimer_stop(smpi_process_timer());
417 sample = xbt_os_timer_elapsed(smpi_process_timer());
419 data->sum_pow2 += sample * sample;
420 n = (double)data->count;
421 data->mean = data->sum / n;
/* Standard error of the mean, sqrt(variance / n), expressed relative to the mean.
 * NOTE(review): divides by data->mean, which would be zero if every sample were 0 -- confirm this cannot happen. */
422 data->relstderr = sqrt((data->sum_pow2 / n - data->mean * data->mean) / n) / data->mean;
423 if (!sample_enough_benchs(data)) {
424 data->mean = sample; // Still in benching process; We want sample_2 to simulate the exact time of this loop occurrence before leaving, not the mean over the history
426 XBT_DEBUG("Average mean after %d steps is %f, relative standard error is %f (sample was %f)", data->count,
427 data->mean, data->relstderr, sample);
429 // That's enough for now, prevent sample_2 to run the same code over and over
/* Release callback registered for the entries of the allocs dictionary (see smpi_shared_malloc).
 * p is the stored value, a shared_data_t. */
434 static void smpi_shared_alloc_free(void *p)
436 shared_data_t *data = p;
/* Turns the allocation-site string loc into a short fixed-size name.
 * loc is resized in place to 30 bytes with xbt_realloc, so the caller must use the pointer
 * produced here rather than the one it passed in. The name is built from a hexadecimal
 * digest held in hash, consumed six hex digits (24 bits) at a time, each group producing
 * four characters of a 64-symbol alphabet. */
441 static char *smpi_shared_alloc_hash(char *loc)
451 loc = xbt_realloc(loc, 30);
/* i walks the hex digest; 4 * i / 6 is the index of the first output character of the group. */
453 for (i = 0; i < 40; i += 6) { /* base64 encode */
454 memcpy(s, hash + i, 6);
455 val = strtoul(s, NULL, 16);
456 for (j = 0; j < 4; j++) {
/* NOTE(review): each symbol takes 6 bits (mask 0x3f) but the shift only moves by 3 bits per step,
 * so consecutive symbols overlap and the low 9 bits of val are never used; 18 - 6 * j looks intended -- confirm. */
457 unsigned char x = (val >> (18 - 3 * j)) & 0x3f;
458 loc[1 + 4 * i / 6 + j] =
/* NOTE(review): the alphabet below contains Z and z twice and lacks W and w, so distinct values share a symbol -- confirm. */
459 "ABCDEFGHIJKLMNOPQRSTUVZXYZabcdefghijklmnopqrstuvzxyz0123456789-_"[x];
/* Allocates size bytes for the allocation site file:line.
 * When option smpi/use_shared_malloc is set, the memory is a mapping of a POSIX shared memory
 * object registered under that site, and a site that is already registered is mapped again
 * from its recorded descriptor. Otherwise this is a plain xbt_malloc.
 * Failure to create the shared memory object terminates the program through xbt_die. */
466 void *smpi_shared_malloc(size_t size, const char *file, int line)
469 if (sg_cfg_get_boolean("smpi/use_shared_malloc")){
/* The key combines the pid with the call site. */
470 char *loc = bprintf("%zu_%s_%d", (size_t)getpid(), file, line);
473 loc = smpi_shared_alloc_hash(loc); /* hash loc, in order to have something
// Entries of the allocs dictionary are released through smpi_shared_alloc_free.
476 allocs = xbt_dict_new_homogeneous(smpi_shared_alloc_free);
478 data = xbt_dict_get_or_null(allocs, loc);
// O_CREAT | O_EXCL: the open fails if an object of that name already exists.
480 fd = shm_open(loc, O_RDWR | O_CREAT | O_EXCL,
481 S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
485 xbt_die("Please cleanup /dev/shm/%s", loc);
487 xbt_die("An unhandled error occured while opening %s. shm_open: %s", loc, strerror(errno));
490 data = xbt_new(shared_data_t, 1);
494 mem = shm_map(fd, size, data);
// The name is unlinked right after mapping; a failure is only reported as a warning.
495 if (shm_unlink(loc) < 0) {
496 XBT_WARN("Could not early unlink %s. shm_unlink: %s", loc, strerror(errno));
498 xbt_dict_set(allocs, loc, data, NULL);
499 XBT_DEBUG("Mapping %s at %p through %d", loc, mem, fd);
// Known site: map the already recorded descriptor once more.
502 mem = shm_map(data->fd, size, data);
505 XBT_DEBUG("Shared malloc %zu in %p (metadata at %p)", size, mem, data);
// Shared malloc disabled: ordinary heap allocation.
507 mem = xbt_malloc(size);
508 XBT_DEBUG("Classic malloc %zu in %p", size, mem);
/* Releases memory obtained from smpi_shared_malloc.
 * With option smpi/use_shared_malloc set, ptr is looked up in allocs_metadata and unmapped,
 * and the entry of the allocs dictionary is removed once its count is no longer positive.
 * Every inconsistency (nothing allocated, unknown pointer, broken metadata) is reported with XBT_WARN. */
513 void smpi_shared_free(void *ptr)
/* Textual form of ptr, the key format written by shm_map. */
515 char loc[PTR_STRLEN];
516 shared_metadata_t* meta;
518 if (sg_cfg_get_boolean("smpi/use_shared_malloc")){
521 XBT_WARN("Cannot free: nothing was allocated");
524 if(!allocs_metadata) {
525 XBT_WARN("Cannot free: no metadata was allocated");
527 snprintf(loc, PTR_STRLEN, "%p", ptr);
528 meta = (shared_metadata_t*)xbt_dict_get_or_null(allocs_metadata, loc);
530 XBT_WARN("Cannot free: %p was not shared-allocated by SMPI", ptr);
535 XBT_WARN("Cannot free: something is broken in the metadata link");
/* The mapping length comes from the metadata record written when the mapping was created. */
538 if(munmap(ptr, meta->size) < 0) {
539 XBT_WARN("Unmapping of fd %d failed: %s", data->fd, strerror(errno));
542 XBT_DEBUG("Shared free - no removal - of %p, count = %d", ptr, data->count);
/* Last user gone: drop the entry from the allocs dictionary. */
543 if (data->count <= 0) {
545 xbt_dict_remove(allocs, data->loc);
546 XBT_DEBUG("Shared free - with removal - of %p", ptr);
549 XBT_DEBUG("Classic free of %p", ptr);
/* Tells whether a value was stored for the pair (func, input) in the calls dictionary.
 * The key is the string func:input built with bprintf. */
555 int smpi_shared_known_call(const char* func, const char* input)
557 char* loc = bprintf("%s:%s", func, input);
/* The dictionary does not own its values (NULL release function). */
562 calls = xbt_dict_new_homogeneous(NULL);
565 xbt_dict_get(calls, loc); /* Succeed or throw */
/* Only a not_found_error is an expected outcome of the lookup; presumably anything else is propagated -- confirm. */
572 if (ex.category != not_found_error)
/* Looks up the value stored for the pair (func, input) in the calls dictionary.
 * The key is the string func:input built with bprintf; the lookup is the plain xbt_dict_get. */
579 void* smpi_shared_get_call(const char* func, const char* input) {
580 char* loc = bprintf("%s:%s", func, input);
/* The dictionary does not own its values (NULL release function). */
584 calls = xbt_dict_new_homogeneous(NULL);
586 data = xbt_dict_get(calls, loc);
/* Stores data as the value attached to the pair (func, input) in the calls dictionary.
 * The key is the string func:input built with bprintf. */
591 void* smpi_shared_set_call(const char* func, const char* input, void* data) {
592 char* loc = bprintf("%s:%s", func, input);
/* The dictionary does not own its values (NULL release function), so data stays owned by the caller. */
595 calls = xbt_dict_new_homogeneous(NULL);
597 xbt_dict_set(calls, loc, data, NULL);
/* Rounds addr down to the start of the page that contains it; the page size is read from xbt_pagesize.
 * The expansion is fully parenthesized so that it composes safely inside larger expressions.
 * NOTE: the address travels through unsigned long, so this assumes unsigned long is as wide as a pointer. */
#define TOPAGE(addr) ((void *)(((unsigned long)(addr) / xbt_pagesize) * xbt_pagesize))
608 /** Map a given SMPI privatization segment (make a SMPI process active)
/* Maps the privatisation region of process dest, unless smpi_loaded_page says it is the one already mapped. */
610 void smpi_switch_data_segment(int dest){
612 if (smpi_loaded_page==dest)//no need to switch either
/* The actual remapping is done by the unconditional variant. */
616 smpi_really_switch_data_segment(dest);
619 /** Map a given SMPI privatization segment (make a SMPI process active)
620 * even if SMPI thinks it is already active
622 * When doing a state restoration, the state of the restored variables
623 * might not be consistent with the state of the virtual memory.
624 * In this case, we have to change the data segment.
/* Maps the privatisation region of process dest over the data segment of the executable,
 * without consulting smpi_loaded_page first. Terminates through xbt_die when the mapping
 * does not land on the expected address. */
626 void smpi_really_switch_data_segment(int dest) {
628 if(smpi_size_data_exe == 0)//no need to switch
/* Very first switch: every region receives a copy of the current content of the data segment. */
633 if(smpi_loaded_page==-1){//initial switch, do the copy from the real page here
/* NOTE(review): this loop is bounded by SIMIX_process_count() while the regions array is sized with smpi_process_count() -- confirm both agree. */
634 for (i=0; i< SIMIX_process_count(); i++){
635 memcpy(smpi_privatisation_regions[i].address,
636 TOPAGE(smpi_start_data_exe), smpi_size_data_exe);
640 // FIXME, cross-process support (mmap across process when necessary)
641 int current = smpi_privatisation_regions[dest].file_descriptor;
642 XBT_DEBUG("Switching data frame to the one of process %d", dest);
/* MAP_FIXED places the mapping of the backing file of dest exactly at the page-aligned start of the data segment. */
643 void* tmp = mmap (TOPAGE(smpi_start_data_exe), smpi_size_data_exe,
644 PROT_READ | PROT_WRITE, MAP_FIXED | MAP_SHARED, current, 0);
645 if (tmp != TOPAGE(smpi_start_data_exe))
646 xbt_die("Couldn't map the new region");
/* Remember which region is mapped so that smpi_switch_data_segment can skip redundant switches. */
647 smpi_loaded_page = dest;
/**
 * Tells whether a path names one of the temporary files backing the SMPI
 * privatisation regions, i.e. whether it starts with "/dev/shm/my-buffer-".
 *
 * @param file NUL-terminated path to test; NULL is accepted and never matches.
 * @return 1 when the path carries the prefix, 0 otherwise.
 */
int smpi_is_privatisation_file(char* file)
{
  /* Must stay in sync with the mkstemp() template "/dev/shm/my-buffer-XXXXXX". */
  static const char prefix[] = "/dev/shm/my-buffer-";

  if (file == NULL)
    return 0;

  /* sizeof counts the terminator, hence the -1; this replaces a hand-counted length. */
  return strncmp(prefix, file, sizeof(prefix) - 1) == 0;
}
/* Determines where the global variables of the running binary live.
 * Runs objdump --section-headers on xbt_binary_name, parses the .data and .bss rows, and sets
 * smpi_start_data_exe (start of .data) and smpi_size_data_exe (bytes from the page-aligned
 * start of .data to the end of .bss). */
656 void smpi_get_executable_global_size(){
657 int size_bss_binary=0;
658 int size_data_binary=0;
660 char *line = NULL; /* Temporary storage for each line that is read */
661 ssize_t read; /* Number of bytes read */
662 size_t n = 0; /* Capacity of the line buffer, maintained by xbt_getline */
/* NOTE(review): the binary name is pasted unquoted into a shell command line -- confirm it cannot contain spaces or shell metacharacters. */
667 char *command = bprintf("objdump --section-headers %s", xbt_binary_name);
669 fp = popen(command, "r");
672 perror("popen failed");
/* Read the listing line by line; found presumably counts the sections located so far, two being enough -- confirm. */
676 while ((read = xbt_getline(&line, &n, fp)) != -1 && found != 2) {
681 /* Overwrite the last character read (the newline) with the terminator */
682 line[read - 1] = '\0';
/* Split the line into space-separated fields. */
684 lfields[0] = strtok(line, " ");
686 if(lfields[0] == NULL)
/* Header lines of the objdump listing carry no section data. */
689 if(strcmp(lfields[0], "Sections:") == 0
690 || strcmp(lfields[0], "Idx") == 0
691 || strncmp(lfields[0], xbt_binary_name, strlen(xbt_binary_name)) == 0)
/* Collect at most seven fields, stopping at the first missing one. */
694 for (i = 1; i < 7 && lfields[i - 1] != NULL; i++) {
695 lfields[i] = strtok(NULL, " ");
699 * we are looking for these fields
700 23 .data 02625a20 00000000006013e0 00000000006013e0 000013e0 2**5
701 CONTENTS, ALLOC, LOAD, DATA
702 24 .bss 02625a40 0000000002c26e00 0000000002c26e00 02626e00 2**5
/* In the layout shown above, field 1 is the section name, field 2 its size and field 4 the second address column, all in hexadecimal. */
707 if(strcmp(lfields[1], ".data") == 0){
/* NOTE(review): the unsigned long result of strtoul is stored in an int -- confirm sections cannot exceed INT_MAX bytes. */
708 size_data_binary = strtoul(lfields[2], NULL, 16);
709 smpi_start_data_exe = (char*) strtoul(lfields[4], NULL, 16);
711 }else if(strcmp(lfields[1], ".bss") == 0){
712 //the beginning of bss is not exactly the end of data if not aligned, grow bss reported size accordingly
713 //TODO : check if this is OK, as some segments may be inserted between them..
714 size_bss_binary = ((char*) strtoul(lfields[4], NULL, 16) - (smpi_start_data_exe + size_data_binary))
715 + strtoul(lfields[2], NULL, 16);
/* Total size: the slack between the page boundary and the start of .data, plus both sections. */
723 smpi_size_data_exe = (unsigned long) smpi_start_data_exe
724 - (unsigned long) TOPAGE(smpi_start_data_exe)
725 + size_data_binary+size_bss_binary;
/* Prepares the privatisation of global variables: locates the data segment of the binary, then
 * creates one shared mapping per process, each backed by its own unlinked temporary file and
 * initialised with a copy of the current data segment. The mapping address and the file
 * descriptor of every region are stored in smpi_privatisation_regions.
 * Any failure while creating, unlinking, sizing or mapping a file terminates through xbt_die. */
732 void smpi_initialize_global_memory_segments(){
/* NOTE(review): presumably the fallback taken when privatisation is not available in this build -- confirm. */
735 smpi_privatize_global_variables=0;
740 smpi_get_executable_global_size();
742 XBT_DEBUG ("bss+data segment found : size %d starting at %p",
743 smpi_size_data_exe, smpi_start_data_exe );
/* Without a data segment there is nothing to privatise, so the feature is switched off. */
745 if (smpi_size_data_exe == 0){//no need to switch
746 smpi_privatize_global_variables=0;
/* NOTE(review): the array is sized with smpi_process_count() but the loop below runs up to SIMIX_process_count();
 * if the latter can be larger, the stores at the end of the loop write past the array. No check of the malloc result is shown -- confirm. */
750 smpi_privatisation_regions = (smpi_privatisation_region_t) malloc(
751 smpi_process_count() * sizeof(struct s_smpi_privatisation_region));
753 for (i=0; i< SIMIX_process_count(); i++){
754 //create SIMIX_process_count() mappings of this size with the same data inside
755 void *address = NULL;
/* mkstemp replaces the XXXXXX suffix in place; smpi_is_privatisation_file recognises the fixed prefix. */
756 char path[] = "/dev/shm/my-buffer-XXXXXX";
759 int file_descriptor= mkstemp (path);
760 if (file_descriptor < 0) {
/* Long explanatory message for the case where the descriptor limit of the process was hit. */
762 xbt_die("Impossible to create temporary file for memory mapping: %s\n\
763 The open() system call failed with the EMFILE error code (too many files). \n\n\
764 This means that you reached the system limits concerning the amount of files per process. \
765 This is not a surprise if you are trying to virtualize many processes on top of SMPI. \
766 Don't panic -- you should simply increase your system limits and try again. \n\n\
767 First, check what your limits are:\n\
768 cat /proc/sys/fs/file-max # Gives you the system-wide limit\n\
769 ulimit -Hn # Gives you the per process hard limit\n\
770 ulimit -Sn # Gives you the per process soft limit\n\
771 cat /proc/self/limits # Displays any per-process limitation (including the one given above)\n\n\
772 If one of these values is less than the amount of MPI processes that you try to run, then you got the explanation of this error. \
773 Ask the Internet about tutorials on how to increase the files limit such as: https://rtcamp.com/tutorials/linux/increase-open-files-limit/",
/* Generic message for any other failure. */
776 xbt_die("Impossible to create temporary file for memory mapping: %s",
/* The name is removed right away; the file stays reachable through its descriptor. */
780 status = unlink (path);
782 xbt_die("Impossible to unlink temporary file for memory mapping");
/* Give the backing file the size of the data segment. */
784 status = ftruncate(file_descriptor, smpi_size_data_exe);
786 xbt_die("Impossible to set the size of the temporary file for memory mapping");
788 /* Ask for a free region */
789 address = mmap (NULL, smpi_size_data_exe, PROT_READ | PROT_WRITE, MAP_SHARED, file_descriptor, 0);
790 if (address == MAP_FAILED)
791 xbt_die("Couldn't find a free region for memory mapping");
793 //initialize the values
794 memcpy(address, TOPAGE(smpi_start_data_exe), smpi_size_data_exe);
796 //store the address of the mapping for further switches
797 smpi_privatisation_regions[i].file_descriptor = file_descriptor;
798 smpi_privatisation_regions[i].address = address;
/* Releases what smpi_initialize_global_memory_segments created: unmaps every privatisation
 * region, closes its backing file descriptor, then frees the regions array.
 * A failing munmap is only reported as a warning. */
805 void smpi_destroy_global_memory_segments(){
806 if (smpi_size_data_exe == 0)//no need to switch
/* NOTE(review): bounded by smpi_process_count(), whereas the regions were created in a loop bounded by SIMIX_process_count() -- confirm both agree. */
810 for (i=0; i< smpi_process_count(); i++){
811 if(munmap(smpi_privatisation_regions[i].address, smpi_size_data_exe) < 0) {
812 XBT_WARN("Unmapping of fd %d failed: %s",
813 smpi_privatisation_regions[i].file_descriptor, strerror(errno));
/* The descriptor is closed whether or not the unmapping succeeded. */
815 close(smpi_privatisation_regions[i].file_descriptor);
817 xbt_free(smpi_privatisation_regions);