1 /* Copyright (c) 2007, 2009-2014. The SimGrid Team.
2 * All rights reserved. */
4 /* This program is free software; you can redistribute it and/or modify it
5 * under the terms of the license (GNU LGPL) which comes with this package. */
7 #include "internal_config.h"
10 #include "xbt/sysdep.h"
13 #include "surf/surf.h"
14 #include "simgrid/sg_config.h"
15 #include "simgrid/modelchecker.h"
21 #include <sys/types.h>
24 #include <math.h> // sqrt
/* Provide the MAP_ANONYMOUS spelling as an alias of MAP_ANON.
 * NOTE(review): presumably wrapped in a preprocessor guard that is not
 * visible in this excerpt -- confirm before touching. */
30 #define MAP_ANONYMOUS MAP_ANON
/* Declare `smpi_bench` as this file's default logging category, a
 * sub-category of `smpi`; the XBT_DEBUG/XBT_WARN/... calls below log to it. */
33 XBT_LOG_NEW_DEFAULT_SUBCATEGORY(smpi_bench, smpi,
34 "Logging specific to SMPI (benchmarking)");
36 /* Shared allocations are handled through shared memory segments.
37 * Associated data and metadata are used as follows:
40 * `allocs' dict ---- -.
41 * ---------- shared_data_t shared_metadata_t / | | |
42 * .->| <name> | ---> -------------------- <--. ----------------- | | | |
43 * | ---------- | fd of <name> | | | size of mmap | --| | | |
44 * | | count (2) | |-- | data | \ | | |
45 * `----------------- | <name> | | ----------------- ---- |
46 * -------------------- | ^ |
48 * | | `allocs_metadata' dict |
49 * | | ---------------------- |
50 * | `-- | <addr of mmap #1> |<-'
51 * | .-- | <addr of mmap #2> |<-.
52 * | | ---------------------- |
58 * | shared_metadata_t / | |
59 * | ----------------- | | |
60 * | | size of mmap | --| | |
62 * ----------------- | | |
/* Size of the buffers used to format a pointer with "%p" as a dictionary key.
 * Presumably: "0x" prefix + two hex digits per pointer byte + terminating NUL. */
67 #define PTR_STRLEN (2 + 2 * sizeof(void*) + 1)
/* Module-wide dictionaries, all released by smpi_bench_destroy():
 *  - allocs:          hashed allocation site  -> shared_data_t
 *  - allocs_metadata: "%p" of a mapped address -> shared_metadata_t
 *  - samples:         sample location string  -> local_data_t
 *  - calls:           "func:input" string     -> caller-provided data */
69 xbt_dict_t allocs = NULL; /* Allocated on first use */
70 xbt_dict_t allocs_metadata = NULL; /* Allocated on first use */
71 xbt_dict_t samples = NULL; /* Allocated on first use */
72 xbt_dict_t calls = NULL; /* Allocated on first use */
/* Durations below this threshold are ignored by smpi_execute(). */
74 double smpi_cpu_threshold;
/* Factor used by smpi_execute() to convert a duration into flops. */
75 double smpi_running_power;
/* Index of the process whose data segment is currently mapped; -1 until the
 * first call to smpi_switch_data_segment() that performs a switch. */
77 int smpi_loaded_page = -1;
/* Start address and size of the executable's data area, as computed by
 * smpi_get_executable_global_size(); a size of 0 disables switching. */
78 char* start_data_exe = NULL;
79 int size_data_exe = 0;
/* Flag for global-variable privatisation; reset to 0 by
 * smpi_initialize_global_memory_segments() in the visible code. */
80 int smpi_privatize_global_variables;
/* Per-process array of {file_descriptor, address} entries filled by
 * smpi_initialize_global_memory_segments(). */
82 smpi_privatisation_region_t smpi_privatisation_regions;
/* Return the current size, in bytes, of the file behind descriptor `fd`
 * as reported by fstat(). Calls xbt_die() with the errno text if fstat()
 * fails. */
95 static size_t shm_size(int fd) {
98 if(fstat(fd, &st) < 0) {
99 xbt_die("Could not stat fd %d: %s", fd, strerror(errno));
101 return (size_t)st.st_size;
/* Map `size` bytes of the shared-memory file `fd` read/write and register the
 * mapping in `allocs_metadata`.
 *
 *  fd   - descriptor of the backing file
 *  size - number of bytes to map; the file is grown first if it is smaller
 *  data - descriptor of the shared allocation this mapping belongs to
 *         (NOTE(review): its use is on lines not visible here; presumably
 *         stored in the metadata entry -- confirm)
 *
 * Any failure of ftruncate() or mmap() is fatal (xbt_die).
 * NOTE(review): the return statement is not visible; callers use the result
 * as the mapped address. */
105 static void* shm_map(int fd, size_t size, shared_data_t* data) {
107 char loc[PTR_STRLEN];
108 shared_metadata_t* meta;
/* Grow the backing file when it is smaller than the requested mapping. */
110 if(size > shm_size(fd)) {
111 if(ftruncate(fd, (off_t)size) < 0) {
112 xbt_die("Could not truncate fd %d to %zu: %s", fd, size, strerror(errno));
116 mem = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
117 if(mem == MAP_FAILED) {
118 xbt_die("Could not map fd %d: %s", fd, strerror(errno));
/* Lazily create the metadata dictionary; its entries are released with xbt_free_f. */
120 if(!allocs_metadata) {
121 allocs_metadata = xbt_dict_new_homogeneous(xbt_free_f);
/* The textual form of the mapped address is the dictionary key. */
123 snprintf(loc, PTR_STRLEN, "%p", mem);
124 meta = xbt_new(shared_metadata_t, 1);
127 xbt_dict_set(allocs_metadata, loc, meta, NULL);
128 XBT_DEBUG("MMAP %zu to %p", size, mem);
/* Release the four module-level dictionaries (allocs, allocs_metadata,
 * samples, calls). */
133 void smpi_bench_destroy(void)
135 xbt_dict_free(&allocs);
136 xbt_dict_free(&allocs_metadata);
137 xbt_dict_free(&samples);
138 xbt_dict_free(&calls);
/* By-reference wrapper around smpi_execute_flops(): dereferences `flops` and
 * forwards the value. NOTE(review): the trailing underscore and pointer
 * argument suggest a Fortran binding -- confirm. */
141 XBT_PUBLIC(void) smpi_execute_flops_(double *flops);
142 void smpi_execute_flops_(double *flops)
144 smpi_execute_flops(*flops);
/* By-reference wrapper around smpi_execute(): dereferences `duration` and
 * forwards the value. NOTE(review): the trailing underscore and pointer
 * argument suggest a Fortran binding -- confirm. */
147 XBT_PUBLIC(void) smpi_execute_(double *duration);
148 void smpi_execute_(double *duration)
150 smpi_execute(*duration);
/* Simulate a computation of `flops` floating-point operations on the calling
 * process's host: create a "computation" execution action, tag it with the
 * SMPI tracing category, wait for it to complete, then switch the data
 * segment back to the one of the current process. */
153 void smpi_execute_flops(double flops) {
156 host = SIMIX_host_self();
157 XBT_DEBUG("Handle real computation time: %f flops", flops);
158 action = simcall_host_execute("computation", host, flops, 1, 0, 0);
160 simcall_set_category (action, TRACE_internal_smpi_get_category());
162 simcall_host_execution_wait(action);
/* Re-select this process's data segment once the wait has returned. */
163 smpi_switch_data_segment(smpi_process_index());
/* Inject a computation of `duration` into the simulation.
 *
 * When `duration` is at least smpi_cpu_threshold it is converted to flops
 * with smpi_running_power and executed through smpi_execute_flops(), wrapped
 * in computing-in/out trace events. Otherwise only a debug message is
 * emitted (the `else` keyword itself is on a line not visible here). */
166 void smpi_execute(double duration)
168 if (duration >= smpi_cpu_threshold) {
169 XBT_DEBUG("Sleep for %g to handle real computation time", duration);
170 double flops = duration * smpi_running_power;
172 int rank = smpi_process_index();
/* Extra tracing payload: event type and the amount of flops computed. */
173 instr_extra_data extra = xbt_new0(s_instr_extra_data_t,1);
174 extra->type=TRACING_COMPUTING;
175 extra->comp_size=flops;
176 TRACE_smpi_computing_in(rank, extra);
178 smpi_execute_flops(flops);
181 TRACE_smpi_computing_out(rank);
185 XBT_DEBUG("Real computation took %g while option smpi/cpu_threshold is set to %g => ignore it",
186 duration, smpi_cpu_threshold);
/* Forward declaration; the definition appears later in this file. */
190 void smpi_switch_data_segment(int dest);
/* Start measuring the calling process's computation: select its data
 * segment, then start its per-process thread timer.
 * NOTE(review): several lines of this function are not visible; they may
 * guard these calls -- confirm. */
192 void smpi_bench_begin(void)
194 smpi_switch_data_segment(smpi_process_index());
199 xbt_os_threadtimer_start(smpi_process_timer());
/* Stop the calling process's timer and, unless disabled by the
 * "smpi/simulate_computation" option, replay the elapsed time in the
 * simulation through smpi_execute().
 *
 * Aborts (xbt_die) if the process is currently inside a sampling block,
 * since nested benchmarks are not supported. */
202 void smpi_bench_end(void)
208 xbt_os_timer_t timer = smpi_process_timer();
209 xbt_os_threadtimer_stop(timer);
210 // smpi_switch_data_segment(smpi_process_count());
211 if (smpi_process_get_sampling()) {
212 XBT_CRITICAL("Cannot do recursive benchmarks.");
213 XBT_CRITICAL("Are you trying to make a call to MPI within a SMPI_SAMPLE_ block?");
214 xbt_backtrace_display_current();
215 xbt_die("Aborting.");
217 // Simulate the benchmarked computation unless disabled via command-line argument
218 if (sg_cfg_get_boolean("smpi/simulate_computation")) {
219 smpi_execute(xbt_os_timer_elapsed(timer));
223 /* Private sleep function used by smpi_sleep() and smpi_usleep() */
/* Puts the calling process to sleep for `secs` simulated seconds, wrapped in
 * sleeping-in/out trace events for its rank in MPI_COMM_WORLD.
 * NOTE(review): the return statement is not visible in this excerpt. */
224 static unsigned int private_sleep(double secs)
228 XBT_DEBUG("Sleep for: %lf secs", secs);
230 int rank = smpi_comm_rank(MPI_COMM_WORLD);
/* Extra tracing payload: event type and requested sleep duration. */
231 instr_extra_data extra = xbt_new0(s_instr_extra_data_t,1);
232 extra->type=TRACING_SLEEPING;
233 extra->sleep_duration=secs;
234 TRACE_smpi_sleeping_in(rank, extra);
236 simcall_process_sleep(secs);
238 TRACE_smpi_sleeping_out(rank);
/* Simulated sleep for `secs` seconds; returns whatever private_sleep()
 * returns. */
245 unsigned int smpi_sleep(unsigned int secs)
247 return private_sleep((double)secs);
/* Simulated sleep for `usecs` microseconds: converts to seconds and
 * delegates to private_sleep(), returning its result cast to int. */
250 int smpi_usleep(useconds_t usecs)
252 return (int)private_sleep((double)usecs / 1000000.0);
/* Fill `tv` from the simulated clock (SIMIX_get_clock()): the whole seconds
 * go to tv_sec and the remaining fraction, scaled to microseconds, to
 * tv_usec. `tz` is not used by any visible line.
 * NOTE(review): the two tv_usec assignments differ only in the cast type and
 * are presumably alternatives selected by a preprocessor conditional that is
 * not visible here -- confirm. The return statement is not visible either. */
256 int smpi_gettimeofday(struct timeval *tv, void* tz)
260 now = SIMIX_get_clock();
262 tv->tv_sec = (time_t)now;
264 tv->tv_usec = (useconds_t)((now - tv->tv_sec) * 1e6);
266 tv->tv_usec = (suseconds_t)((now - tv->tv_sec) * 1e6);
273 extern double sg_surf_precision;
/* Return the number of timestamp ticks per simulated second, computed as
 * 1 / sg_surf_precision and truncated to an unsigned integer.
 * NOTE(review): some lines of this function are not visible here. */
274 unsigned long long smpi_rastro_resolution (void)
277 double resolution = (1/sg_surf_precision);
279 return (unsigned long long)resolution;
/* Return the simulated clock expressed in ticks of smpi_rastro_resolution():
 * whole seconds times the resolution, plus the fractional part scaled by the
 * resolution (truncated).
 * NOTE(review): some lines of this function are not visible here. */
282 unsigned long long smpi_rastro_timestamp (void)
285 double now = SIMIX_get_clock();
287 unsigned long long sec = (unsigned long long)now;
/* Sub-second part, converted to ticks. */
288 unsigned long long pre = (now - sec) * smpi_rastro_resolution();
290 return (unsigned long long)sec * smpi_rastro_resolution() + pre;
293 /* ****************************** Functions related to the SMPI_SAMPLE_ macros ************************************/
/* Per-location sampling state used by smpi_sample_1/2/3 (fields of the type
 * referred to as local_data_t below; the enclosing declaration lines are not
 * visible in this excerpt). */
295 double threshold; /* maximal relative stderr requested (if positive) */
296 double relstderr; /* observed relative stderr so far (stderr divided by the mean) */
297 double mean; /* mean of benched times, to be used if the block is disabled */
298 double sum; /* sum of benched times (to compute the mean and stderr) */
299 double sum_pow2; /* sum of the square of the benched times (to compute the stderr) */
300 int iters; /* amount of requested iterations */
301 int count; /* amount of iterations done so far */
302 int benching; /* 1: we are benchmarking; 0: we have enough data, no bench anymore */
/* Build the dictionary key identifying a sampled block: either "file:line"
 * or "file:line:<process index>". The string comes from bprintf(), so the
 * caller owns it.
 * NOTE(review): the condition choosing between the two forms is on a line not
 * visible here; presumably the shorter, process-independent key is used when
 * `global` is non-zero -- confirm. */
305 static char *sample_location(int global, const char *file, int line) {
307 return bprintf("%s:%d", file, line);
309 return bprintf("%s:%d:%d", file, line, smpi_process_index());
/* Decide whether enough benchmark runs have been collected for `data`.
 * The starting verdict is `count >= iters`; when a positive threshold is set
 * the verdict is forced to 0 while the relative stderr still exceeds it.
 * NOTE(review): the condition guarding the first `res = 0` ("not enough
 * data") and the return statement are on lines not visible here. */
312 static int sample_enough_benchs(local_data_t *data) {
313 int res = data->count >= data->iters;
314 if (data->threshold>0.0) {
316 res = 0; // not enough data
317 if (data->relstderr > data->threshold)
318 res = 0; // stderr too high yet
320 XBT_DEBUG("%s (count:%d iter:%d stderr:%f thres:%f mean:%fs)",
321 (res?"enough benchs":"need more data"),
322 data->count, data->iters, data->relstderr, data->threshold, data->mean);
/* Entry step of a sampled block (first of the smpi_sample_1/2/3 trio).
 *
 *  global    - forwarded to sample_location() to build the key
 *  file,line - source position identifying the block
 *  iters     - requested number of benchmark iterations
 *  threshold - requested maximal relative stderr
 *
 * Accounts for the computation preceding the block (smpi_bench_end), marks
 * the process as sampling, then either creates the block's state on first
 * visit or, on later visits, decides whether more benchmarking is needed.
 * NOTE(review): the branch structure (first visit vs. re-entry), some field
 * initialisations and the handling of `loc` after use are on lines not
 * visible in this excerpt. */
326 void smpi_sample_1(int global, const char *file, int line, int iters, double threshold)
328 char *loc = sample_location(global, file, line);
331 smpi_bench_end(); /* Take time from previous, unrelated computation into account */
332 smpi_process_set_sampling(1);
/* The samples dictionary releases its values with free(). */
335 samples = xbt_dict_new_homogeneous(free);
337 data = xbt_dict_get_or_null(samples, loc);
339 xbt_assert(threshold>0 || iters>0,
340 "You should provide either a positive amount of iterations to bench, or a positive maximal stderr (or both)");
341 data = (local_data_t *) xbt_new(local_data_t, 1);
344 data->sum_pow2 = 0.0;
346 data->threshold = threshold;
347 data->benching = 1; // If we have no data, we need at least one
349 xbt_dict_set(samples, loc, data, NULL);
350 XBT_DEBUG("XXXXX First time ever on benched nest %s.",loc);
/* Re-entry: the settings must match those recorded on the first visit. */
352 if (data->iters != iters || data->threshold != threshold) {
353 XBT_ERROR("Asked to bench block %s with different settings %d, %f is not %d, %f. How did you manage to give two numbers at the same line??",
354 loc, data->iters, data->threshold, iters,threshold);
358 // if we already have some data, check whether sample_2 should get one more bench or whether it should emulate the computation instead
359 data->benching = !sample_enough_benchs(data);
360 XBT_DEBUG("XXXX Re-entering the benched nest %s. %s",loc, (data->benching?"more benching needed":"we have enough data, skip computes"));
/* Loop-condition step of a sampled block. Looks up the state recorded by
 * smpi_sample_1() for this location; while `benching` is 1 a new benchmark
 * run is requested, otherwise the recorded mean is injected with
 * smpi_execute(), sampling mode is turned off and `res` is set to 0.
 * NOTE(review): the actions taken in the benchmarking branch, the `else`
 * and the return statement are on lines not visible here; the caller
 * presumably keeps iterating while the result is non-zero -- confirm. */
365 int smpi_sample_2(int global, const char *file, int line)
367 char *loc = sample_location(global, file, line);
371 xbt_assert(samples, "Y U NO use SMPI_SAMPLE_* macros? Stop messing directly with smpi_sample_* functions!");
372 data = xbt_dict_get(samples, loc);
373 XBT_DEBUG("sample2 %s",loc);
376 if (data->benching==1) {
377 // we need to run a new bench
378 XBT_DEBUG("benchmarking: count:%d iter:%d stderr:%f thres:%f; mean:%f",
379 data->count, data->iters, data->relstderr, data->threshold, data->mean);
382 // Enough data, no more bench (either we got enough data from previous visits to this benched nest, or we just ran one bench and need to bail out now that our job is done).
383 // Just sleep instead
384 XBT_DEBUG("No benchmark (either no need, or just ran one): count >= iter (%d >= %d) or stderr<thres (%f<=%f). apply the %fs delay instead",
385 data->count, data->iters, data->relstderr, data->threshold, data->mean);
386 smpi_execute(data->mean);
387 smpi_process_set_sampling(0);
388 res = 0; // prepare to capture future, unrelated computations
/* End-of-iteration step of a sampled block: stop the process timer, fold the
 * measured time into the block's statistics (sum of squares, mean, relative
 * standard error) and decide whether benchmarking continues.
 * NOTE(review): the updates of `count` and `sum`, the body of the
 * `benching==0` branch and the final update of `benching` are on lines not
 * visible here.
 * NOTE(review): relstderr divides by data->mean; a mean of 0 (all samples
 * measured as 0) would divide by zero -- confirm whether that can happen. */
395 void smpi_sample_3(int global, const char *file, int line)
397 char *loc = sample_location(global, file, line);
400 xbt_assert(samples, "Y U NO use SMPI_SAMPLE_* macros? Stop messing directly with smpi_sample_* functions!");
401 data = xbt_dict_get(samples, loc);
402 XBT_DEBUG("sample3 %s",loc);
405 if (data->benching==0) {
409 // ok, benchmarking this loop is over
410 xbt_os_threadtimer_stop(smpi_process_timer());
415 sample = xbt_os_timer_elapsed(smpi_process_timer());
417 data->sum_pow2 += sample * sample;
418 n = (double)data->count;
419 data->mean = data->sum / n;
/* sqrt(variance / n) is the standard error of the mean; dividing by the mean makes it relative. */
420 data->relstderr = sqrt((data->sum_pow2 / n - data->mean * data->mean) / n) / data->mean;
421 if (!sample_enough_benchs(data)) {
422 data->mean = sample; // Still in benching process; We want sample_2 to simulate the exact time of this loop occurrence before leaving, not the mean over the history
424 XBT_DEBUG("Average mean after %d steps is %f, relative standard error is %f (sample was %f)", data->count,
425 data->mean, data->relstderr, sample);
427 // That's enough for now, prevent sample_2 to run the same code over and over
/* Value destructor installed on the `allocs` dictionary (see
 * smpi_shared_malloc()); `p` is a shared_data_t.
 * NOTE(review): the statements that actually release the entry are on lines
 * not visible in this excerpt. */
432 static void smpi_shared_alloc_free(void *p)
434 shared_data_t *data = p;
/* Replace the heap string `loc` by a short name derived from a hexadecimal
 * hash of it. `loc` is reallocated to 30 bytes; the 40 hex digits in `hash`
 * are consumed six at a time and each group is rendered as four characters
 * from a 64-character table, written from index 1 onwards.
 * NOTE(review): the hash computation, the write to loc[0], the terminating
 * NUL and the return statement are on lines not visible here.
 * NOTE(review): two oddities in the encoding reduce the entropy of the name:
 *   - the table contains 'Z' twice and no 'W' (likewise 'z' twice, no 'w'),
 *     so two different 6-bit values map to the same character;
 *   - the shift `18 - 3 * j` yields 18, 15, 12, 9, so consecutive 6-bit
 *     groups overlap by 3 bits and the low 9 bits of `val` are never used;
 *     a plain base64 split would shift by `18 - 6 * j`.
 * Changing either alters the generated names -- confirm nothing depends on
 * them before fixing. */
439 static char *smpi_shared_alloc_hash(char *loc)
449 loc = xbt_realloc(loc, 30);
451 for (i = 0; i < 40; i += 6) { /* base64 encode */
/* Parse the next (up to) six hex digits into a 24-bit value. */
452 memcpy(s, hash + i, 6);
453 val = strtoul(s, NULL, 16);
454 for (j = 0; j < 4; j++) {
455 unsigned char x = (val >> (18 - 3 * j)) & 0x3f;
456 loc[1 + 4 * i / 6 + j] =
457 "ABCDEFGHIJKLMNOPQRSTUVZXYZabcdefghijklmnopqrstuvzxyz0123456789-_"[x];
/* Allocate `size` bytes for the allocation site `file`:`line`.
 *
 * With option "smpi/use_shared_malloc" enabled, the allocation site is keyed
 * by a hash of "<pid>_<file>_<line>"; the first request for a key creates a
 * POSIX shared-memory object, maps it, unlinks its name right away and
 * records it in `allocs`; later requests for the same key map the already
 * open descriptor again. Otherwise the memory comes from xbt_malloc().
 *
 * Failure to create the shared-memory object is fatal (xbt_die).
 * NOTE(review): the branch structure, the initialisation of the new
 * shared_data_t, reference counting and the return statement are on lines
 * not visible here. */
464 void *smpi_shared_malloc(size_t size, const char *file, int line)
467 if (sg_cfg_get_boolean("smpi/use_shared_malloc")){
468 char *loc = bprintf("%zu_%s_%d", (size_t)getpid(), file, line);
471 loc = smpi_shared_alloc_hash(loc); /* hash loc, in order to have something
/* Entries of `allocs` are released through smpi_shared_alloc_free(). */
474 allocs = xbt_dict_new_homogeneous(smpi_shared_alloc_free);
476 data = xbt_dict_get_or_null(allocs, loc);
/* O_EXCL: creation fails if an object with this name already exists. */
478 fd = shm_open(loc, O_RDWR | O_CREAT | O_EXCL,
479 S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
483 xbt_die("Please cleanup /dev/shm/%s", loc);
485 xbt_die("An unhandled error occured while opening %s. shm_open: %s", loc, strerror(errno));
488 data = xbt_new(shared_data_t, 1);
492 mem = shm_map(fd, size, data);
/* Remove the name immediately; the open descriptor keeps the object usable. */
493 if (shm_unlink(loc) < 0) {
494 XBT_WARN("Could not early unlink %s. shm_unlink: %s", loc, strerror(errno));
496 xbt_dict_set(allocs, loc, data, NULL);
497 XBT_DEBUG("Mapping %s at %p through %d", loc, mem, fd);
/* Known allocation site: map the existing descriptor once more. */
500 mem = shm_map(data->fd, size, data);
503 XBT_DEBUG("Shared malloc %zu in %p (metadata at %p)", size, mem, data);
505 mem = xbt_malloc(size);
506 XBT_DEBUG("Classic malloc %zu in %p", size, mem);
/* Release memory obtained from smpi_shared_malloc().
 *
 * With option "smpi/use_shared_malloc" enabled, the mapping's metadata is
 * looked up by the textual form of `ptr`, the region is unmapped, and the
 * owning entry is removed from `allocs` once its count drops to zero or
 * below. Every inconsistency (nothing allocated, unknown pointer, broken
 * metadata) only produces a warning.
 * NOTE(review): the conditions guarding each warning, the early returns, the
 * count decrement and the non-shared branch's actual free call are on lines
 * not visible here. */
511 void smpi_shared_free(void *ptr)
513 char loc[PTR_STRLEN];
514 shared_metadata_t* meta;
516 if (sg_cfg_get_boolean("smpi/use_shared_malloc")){
519 XBT_WARN("Cannot free: nothing was allocated");
522 if(!allocs_metadata) {
523 XBT_WARN("Cannot free: no metadata was allocated");
/* Same key format as the one used by shm_map() when registering the mapping. */
525 snprintf(loc, PTR_STRLEN, "%p", ptr);
526 meta = (shared_metadata_t*)xbt_dict_get_or_null(allocs_metadata, loc);
528 XBT_WARN("Cannot free: %p was not shared-allocated by SMPI", ptr);
533 XBT_WARN("Cannot free: something is broken in the metadata link");
536 if(munmap(ptr, meta->size) < 0) {
537 XBT_WARN("Unmapping of fd %d failed: %s", data->fd, strerror(errno));
540 XBT_DEBUG("Shared free - no removal - of %p, count = %d", ptr, data->count);
/* Last user gone: drop the entry from `allocs`. */
541 if (data->count <= 0) {
543 xbt_dict_remove(allocs, data->loc);
544 XBT_DEBUG("Shared free - with removal - of %p", ptr);
547 XBT_DEBUG("Classic free of %p", ptr);
/* Tell whether a result has been recorded for the pair (`func`, `input`) by
 * looking up the key "func:input" in `calls`. The lookup throws when the key
 * is absent; only the not_found_error category is treated specially by the
 * visible code.
 * NOTE(review): the try/catch scaffolding, the handling of other exception
 * categories, the release of `loc` and the return statement are on lines not
 * visible here. */
553 int smpi_shared_known_call(const char* func, const char* input)
555 char* loc = bprintf("%s:%s", func, input);
/* Values in `calls` are not owned by the dictionary (no free function). */
560 calls = xbt_dict_new_homogeneous(NULL);
563 xbt_dict_get(calls, loc); /* Succeed or throw */
570 if (ex.category != not_found_error)
/* Fetch the data recorded for the pair (`func`, `input`) under the key
 * "func:input" in `calls`, using xbt_dict_get() (the throwing lookup, see
 * smpi_shared_known_call()).
 * NOTE(review): the release of `loc` and the return statement are on lines
 * not visible here. */
577 void* smpi_shared_get_call(const char* func, const char* input) {
578 char* loc = bprintf("%s:%s", func, input);
582 calls = xbt_dict_new_homogeneous(NULL);
584 data = xbt_dict_get(calls, loc);
/* Record `data` for the pair (`func`, `input`) under the key "func:input" in
 * `calls`. The dictionary is created without a free function, so `data`
 * stays owned by the caller.
 * NOTE(review): the release of `loc` and the return statement are on lines
 * not visible here. */
589 void* smpi_shared_set_call(const char* func, const char* input, void* data) {
590 char* loc = bprintf("%s:%s", func, input);
593 calls = xbt_dict_new_homogeneous(NULL);
595 xbt_dict_set(calls, loc, data, NULL);
/* Round `addr` down to the nearest multiple of xbt_pagesize and return it as
 * a void pointer. The address goes through `unsigned long`, which assumes a
 * pointer fits in that type. */
603 #define TOPAGE(addr) (void *)(((unsigned long)(addr) / xbt_pagesize) * xbt_pagesize)
607 * - read the executable data+bss section addresses and sizes
608 * - for each process create a copy of these sections with mmap
609 * - store them in a dynar
/* Make the private copy of the executable's data area belonging to process
 * `dest` the one mapped at the executable's data address.
 *
 * Nothing is done when no data area was found (size_data_exe == 0) or when
 * `dest` is already loaded. On the very first switch the live data area is
 * copied into every process's private region. The switch itself is a
 * MAP_FIXED mmap of the destination's backing file over the page-aligned data
 * address; failure is fatal (xbt_die).
 * NOTE(review): the early returns after the two "no need to switch" tests are
 * on lines not visible here. */
615 void smpi_switch_data_segment(int dest){
617 if(size_data_exe == 0)//no need to switch
620 if (smpi_loaded_page==dest)//no need to switch either
625 if(smpi_loaded_page==-1){//initial switch, do the copy from the real page here
626 for (i=0; i< SIMIX_process_count(); i++){
627 memcpy(smpi_privatisation_regions[i].address,TOPAGE(start_data_exe),size_data_exe);
631 int current = smpi_privatisation_regions[dest].file_descriptor;
632 XBT_DEBUG("Switching data frame to the one of process %d", dest);
/* MAP_FIXED replaces whatever is currently mapped at the data address. */
633 void* tmp = mmap (TOPAGE(start_data_exe), size_data_exe, PROT_READ | PROT_WRITE, MAP_FIXED | MAP_SHARED, current, 0);
634 if (tmp != TOPAGE(start_data_exe))
635 xbt_die("Couldn't map the new region");
636 smpi_loaded_page=dest;
/* Determine where the executable's .data and .bss sections live by parsing
 * the output of `objdump --section-headers <binary>`, and set the globals
 * start_data_exe (address of .data) and size_data_exe (bytes from the page
 * containing .data up to the end of .bss).
 * NOTE(review): several lines are not visible here (declarations, the popen
 * failure test, `continue`/`found` bookkeeping, cleanup of `line`, `command`
 * and the pipe).
 * NOTE(review): the sizes are parsed with strtoul() but stored in `int`, so
 * very large sections would be truncated -- confirm acceptable.
 * NOTE(review): lfields[1] is passed to strcmp() below; unless a hidden line
 * checks it, a line with a single field would pass NULL -- confirm. */
640 void smpi_get_executable_global_size(){
641 int size_bss_binary=0;
642 int size_data_binary=0;
644 char *line = NULL; /* Temporary storage for each line that is read */
645 ssize_t read; /* Number of bytes read */
646 size_t n = 0; /* Amount of bytes to read by xbt_getline */
651 char *command = bprintf("objdump --section-headers %s", xbt_binary_name);
653 fp = popen(command, "r");
656 perror("popen failed");
/* Stop at end of output or once both sections have been found. */
660 while ((read = xbt_getline(&line, &n, fp)) != -1 && found != 2) {
665 /* Wipeout the new line character */
666 line[read - 1] = '\0';
668 lfields[0] = strtok(line, " ");
670 if(lfields[0] == NULL)
/* Header lines of the objdump output carry no section information. */
673 if(strcmp(lfields[0], "Sections:") == 0
674 || strcmp(lfields[0], "Idx") == 0
675 || strncmp(lfields[0], xbt_binary_name, strlen(xbt_binary_name)) == 0)
/* Split the rest of the line into up to seven space-separated fields. */
678 for (i = 1; i < 7 && lfields[i - 1] != NULL; i++) {
679 lfields[i] = strtok(NULL, " ");
683 * we are looking for these fields
684 23 .data 02625a20 00000000006013e0 00000000006013e0 000013e0 2**5
685 CONTENTS, ALLOC, LOAD, DATA
686 24 .bss 02625a40 0000000002c26e00 0000000002c26e00 02626e00 2**5
/* Field 1 is the section name, field 2 its size and field 4 its address (both hexadecimal). */
691 if(strcmp(lfields[1], ".data") == 0){
692 size_data_binary = strtoul(lfields[2], NULL, 16);
693 start_data_exe = (char*) strtoul(lfields[4], NULL, 16);
695 }else if(strcmp(lfields[1], ".bss") == 0){
696 //the beginning of bss is not exactly the end of data if not aligned, grow bss reported size accordingly
697 //TODO : check if this is OK, as some segments may be inserted between them..
698 size_bss_binary = ((char*) strtoul(lfields[4], NULL, 16) - (start_data_exe + size_data_binary))
699 + strtoul(lfields[2], NULL, 16);
/* Total = offset of .data inside its page + .data size + (gap-adjusted) .bss size. */
707 size_data_exe =(unsigned long)start_data_exe - (unsigned long)TOPAGE(start_data_exe)+ size_data_binary+size_bss_binary;
/* Create one private copy of the executable's data area per process.
 *
 * After locating the data area (smpi_get_executable_global_size), each
 * process gets an anonymous-in-practice backing file under /dev/shm (created
 * with mkstemp and unlinked at once), sized to size_data_exe, mapped shared
 * and initialised with the current content of the data area. The descriptor
 * and address are stored in smpi_privatisation_regions[i]. Any failure is
 * fatal (xbt_die). When no data area is found, privatisation is disabled.
 * NOTE(review): preprocessor guards, early returns and the tests on `status`
 * are on lines not visible here.
 * NOTE(review): the array is sized with smpi_process_count() but filled in a
 * loop bounded by SIMIX_process_count(); if the latter can be larger this
 * writes out of bounds -- confirm both counts always agree.
 * NOTE(review): the result of malloc() is not checked in the visible code. */
714 void smpi_initialize_global_memory_segments(){
717 smpi_privatize_global_variables=0;
722 smpi_get_executable_global_size();
724 XBT_DEBUG ("bss+data segment found : size %d starting at %p",size_data_exe, start_data_exe );
726 if(size_data_exe == 0){//no need to switch
727 smpi_privatize_global_variables=0;
731 smpi_privatisation_regions = (smpi_privatisation_region_t) malloc(
732 smpi_process_count() * sizeof(struct s_smpi_privatisation_region));
734 for (i=0; i< SIMIX_process_count(); i++){
735 //create SIMIX_process_count() mappings of this size with the same data inside
736 void *address = NULL;
/* Template for mkstemp(): the trailing XXXXXX is replaced to form a unique name. */
737 char path[] = "/dev/shm/my-buffer-XXXXXX";
740 int file_descriptor= mkstemp (path);
741 if (file_descriptor < 0)
742 xbt_die("Impossible to create temporary file for memory mapping");
/* Drop the name right away; the open descriptor keeps the file alive. */
744 status = unlink (path);
746 xbt_die("Impossible to unlink temporary file for memory mapping");
748 status = ftruncate(file_descriptor, size_data_exe);
750 xbt_die("Impossible to set the size of the temporary file for memory mapping");
752 /* Ask for a free region */
753 address = mmap (NULL, size_data_exe, PROT_READ | PROT_WRITE, MAP_SHARED, file_descriptor, 0);
754 if (address == MAP_FAILED)
755 xbt_die("Couldn't find a free region for memory mapping");
757 //initialize the values
758 memcpy(address,TOPAGE(start_data_exe),size_data_exe);
760 //store the address of the mapping for further switches
761 smpi_privatisation_regions[i].file_descriptor = file_descriptor;
762 smpi_privatisation_regions[i].address = address;
/* Undo smpi_initialize_global_memory_segments(): unmap every process's
 * private region (a failure only produces a warning), close its backing
 * descriptor, then release the region array. Does nothing when no data area
 * was found (size_data_exe == 0).
 * NOTE(review): the early return after the size test is on a line not
 * visible here. The loop is bounded by smpi_process_count() whereas the
 * regions were created in a loop bounded by SIMIX_process_count() -- confirm
 * both counts agree. */
769 void smpi_destroy_global_memory_segments(){
770 if(size_data_exe == 0)//no need to switch
774 for (i=0; i< smpi_process_count(); i++){
775 if(munmap(smpi_privatisation_regions[i].address,size_data_exe) < 0) {
776 XBT_WARN("Unmapping of fd %d failed: %s",
777 smpi_privatisation_regions[i].file_descriptor, strerror(errno));
779 close(smpi_privatisation_regions[i].file_descriptor);
781 xbt_free(smpi_privatisation_regions);