1 /* Copyright (c) 2007, 2009-2014. The SimGrid Team.
2 * All rights reserved. */
4 /* This program is free software; you can redistribute it and/or modify it
5 * under the terms of the license (GNU LGPL) which comes with this package. */
7 #include "internal_config.h"
10 #include "xbt/sysdep.h"
13 #include "surf/surf.h"
14 #include "simgrid/sg_config.h"
15 #include "simgrid/modelchecker.h"
21 #include <sys/types.h>
24 #include <math.h> // sqrt
/* Make MAP_ANONYMOUS an alias of MAP_ANON.
 * NOTE(review): presumably a fallback for platforms that only provide MAP_ANON;
 * no conditional guard is visible around this define -- confirm. */
30 #define MAP_ANONYMOUS MAP_ANON
/* Create the "smpi_bench" logging subcategory as a child of "smpi"; the
 * XBT_DEBUG/XBT_WARN/... calls below presumably log through it. */
33 XBT_LOG_NEW_DEFAULT_SUBCATEGORY(smpi_bench, smpi,
34 "Logging specific to SMPI (benchmarking)");
36 /* Shared allocations are handled through shared memory segments.
37 * Associated data and metadata are used as follows:
40 * `allocs' dict ---- -.
41 * ---------- shared_data_t shared_metadata_t / | | |
42 * .->| <name> | ---> -------------------- <--. ----------------- | | | |
43 * | ---------- | fd of <name> | | | size of mmap | --| | | |
44 * | | count (2) | |-- | data | \ | | |
45 * `----------------- | <name> | | ----------------- ---- |
46 * -------------------- | ^ |
48 * | | `allocs_metadata' dict |
49 * | | ---------------------- |
50 * | `-- | <addr of mmap #1> |<-'
51 * | .-- | <addr of mmap #2> |<-.
52 * | | ---------------------- |
58 * | shared_metadata_t / | |
59 * | ----------------- | | |
60 * | | size of mmap | --| | |
62 * ----------------- | | |
/* Size of the buffers that receive the "%p" rendering of a pointer when that
 * rendering is used as a dictionary key (see the snprintf(loc, PTR_STRLEN, ...)
 * calls below).
 * NOTE(review): presumably "0x" + two hex digits per pointer byte + NUL; the
 * output of "%p" is implementation-defined -- confirm it always fits. */
67 #define PTR_STRLEN (2 + 2 * sizeof(void*) + 1)
/* Shared allocations, keyed by the hashed allocation-site name. */
69 xbt_dict_t allocs = NULL; /* Allocated on first use */
/* Per-mapping metadata, keyed by the "%p" rendering of the mapped address. */
70 xbt_dict_t allocs_metadata = NULL; /* Allocated on first use */
/* Benchmark statistics of the SMPI_SAMPLE_ blocks, keyed by sample location. */
71 xbt_dict_t samples = NULL; /* Allocated on first use */
/* Values recorded by smpi_shared_set_call(), keyed by "<func>:<input>". */
72 xbt_dict_t calls = NULL; /* Allocated on first use */
/* Durations below this value are ignored by smpi_execute(). */
74 double smpi_cpu_threshold;
/* Factor by which smpi_execute() multiplies a duration to obtain flops. */
75 double smpi_running_power;
/* Index of the process whose data segment is currently mapped; -1 until the
 * first call to smpi_switch_data_segment() that performs a switch. */
77 int smpi_loaded_page = -1;
/* Address of the executable's .data section, as parsed from objdump output. */
78 char* start_data_exe = NULL;
/* Size of the privatized area; 0 means "nothing to switch". */
79 int size_data_exe = 0;
/* Reset to 0 by smpi_initialize_global_memory_segments() when privatization
 * cannot be set up. NOTE(review): where it gets enabled is not visible here. */
80 int smpi_privatize_global_variables;
/* Array of per-process regions (mapping address + backing file descriptor). */
82 smpi_privatisation_region_t smpi_privatisation_regions;
/* Return the current size, in bytes, of the file referred to by `fd`.
 * Aborts through xbt_die() when fstat() fails. */
static size_t shm_size(int fd) {
  struct stat info;
  const int rc = fstat(fd, &info);

  if (rc < 0) {
    xbt_die("Could not stat fd %d: %s", fd, strerror(errno));
  }
  return (size_t) info.st_size;
}
/* Map `size` bytes of the shared-memory file `fd` and register the mapping.
 *
 * The file is first grown with ftruncate() when it is smaller than `size`.
 * The region is then mapped read/write with MAP_SHARED, and a freshly
 * allocated shared_metadata_t is stored in `allocs_metadata` under the "%p"
 * rendering of the mapped address. A failing ftruncate() or mmap() aborts
 * through xbt_die().
 * NOTE(review): the statements filling the metadata record from `size` and
 * `data`, and the return statement, are not visible in this view -- confirm. */
105 static void* shm_map(int fd, size_t size, shared_data_t* data) {
107 char loc[PTR_STRLEN];
108 shared_metadata_t* meta;
/* Grow the backing file if it cannot hold the requested size yet. */
110 if(size > shm_size(fd)) {
111 if(ftruncate(fd, (off_t)size) < 0) {
112 xbt_die("Could not truncate fd %d to %zu: %s", fd, size, strerror(errno));
116 mem = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
117 if(mem == MAP_FAILED) {
118 xbt_die("Could not map fd %d: %s", fd, strerror(errno));
/* The metadata dictionary is created lazily; its values are released with xbt_free_f. */
120 if(!allocs_metadata) {
121 allocs_metadata = xbt_dict_new_homogeneous(xbt_free_f);
/* The textual address of the mapping is the lookup key used by smpi_shared_free(). */
123 snprintf(loc, PTR_STRLEN, "%p", mem);
124 meta = xbt_new(shared_metadata_t, 1);
127 xbt_dict_set(allocs_metadata, loc, meta, NULL);
128 XBT_DEBUG("MMAP %zu to %p", size, mem);
133 void smpi_bench_destroy(void)
135 xbt_dict_free(&allocs);
136 xbt_dict_free(&allocs_metadata);
137 xbt_dict_free(&samples);
138 xbt_dict_free(&calls);
141 XBT_PUBLIC(void) smpi_execute_flops_(double *flops);
142 void smpi_execute_flops_(double *flops)
144 smpi_execute_flops(*flops);
147 XBT_PUBLIC(void) smpi_execute_(double *duration);
148 void smpi_execute_(double *duration)
150 smpi_execute(*duration);
/* Run an execution of `flops` on the host of the calling process and wait for it.
 *
 * The execution is created with simcall_host_execute() under the name
 * "computation", tagged with the current SMPI tracing category, and then
 * waited for with simcall_host_execution_wait().
 * NOTE(review): the declarations of `host` and `action`, and any conditional
 * compilation around the category call, are not visible in this view. */
153 void smpi_execute_flops(double flops) {
156 host = SIMIX_host_self();
157 XBT_DEBUG("Handle real computation time: %f flops", flops);
158 action = simcall_host_execute("computation", host, flops, 1, 0, 0);
160 simcall_set_category (action, TRACE_internal_smpi_get_category());
162 simcall_host_execution_wait(action);
/* Account for `duration` seconds of real computation in the simulation.
 *
 * When `duration` reaches smpi_cpu_threshold, it is converted to flops by
 * multiplying with smpi_running_power and handed to smpi_execute_flops(),
 * bracketed by TRACE_smpi_computing_in()/TRACE_smpi_computing_out().
 * NOTE(review): the second XBT_DEBUG presumably belongs to the else branch
 * (shorter durations are only logged); the branch structure is not visible
 * in this view -- confirm. */
165 void smpi_execute(double duration)
167 if (duration >= smpi_cpu_threshold) {
168 XBT_DEBUG("Sleep for %g to handle real computation time", duration);
169 double flops = duration * smpi_running_power;
171 int rank = smpi_process_index();
/* Tracing payload describing this computation (zero-initialized by xbt_new0). */
172 instr_extra_data extra = xbt_new0(s_instr_extra_data_t,1);
173 extra->type=TRACING_COMPUTING;
174 extra->comp_size=flops;
175 TRACE_smpi_computing_in(rank, extra);
177 smpi_execute_flops(flops);
180 TRACE_smpi_computing_out(rank);
184 XBT_DEBUG("Real computation took %g while option smpi/cpu_threshold is set to %g => ignore it",
185 duration, smpi_cpu_threshold);
/* Forward declaration: the function is defined further down in this file. */
189 void smpi_switch_data_segment(int dest);
/* Start measuring a piece of user computation: switch to the data segment of
 * the calling process, then start the per-process thread timer.
 * NOTE(review): statements between these two calls, if any, are not visible. */
191 void smpi_bench_begin(void)
193 smpi_switch_data_segment(smpi_process_index());
198 xbt_os_threadtimer_start(smpi_process_timer());
/* Stop measuring user computation and inject the measured time in the simulation.
 *
 * The per-process timer is stopped first. If the process is flagged as
 * sampling, the critical messages and a backtrace are emitted and the run is
 * aborted with xbt_die(). Otherwise, when the "smpi/simulate_computation"
 * option is true, the elapsed time is passed to smpi_execute(). */
201 void smpi_bench_end(void)
207 xbt_os_timer_t timer = smpi_process_timer();
208 xbt_os_threadtimer_stop(timer);
209 // smpi_switch_data_segment(smpi_process_count());
/* Being called while a SMPI_SAMPLE_ block is active is treated as a fatal usage error. */
210 if (smpi_process_get_sampling()) {
211 XBT_CRITICAL("Cannot do recursive benchmarks.");
212 XBT_CRITICAL("Are you trying to make a call to MPI within a SMPI_SAMPLE_ block?");
213 xbt_backtrace_display_current();
214 xbt_die("Aborting.");
216 // Simulate the benchmarked computation unless disabled via command-line argument
217 if (sg_cfg_get_boolean("smpi/simulate_computation")) {
218 smpi_execute(xbt_os_timer_elapsed(timer));
/* Private sleep function used by smpi_sleep() and smpi_usleep().
 *
 * Puts the calling simulated process to sleep for `secs` seconds with
 * simcall_process_sleep(), bracketed by TRACE_smpi_sleeping_in()/_out() for
 * the rank of the caller in MPI_COMM_WORLD.
 * NOTE(review): the return statement and any smpi_bench_end()/begin() calls
 * are not visible in this view -- confirm the returned value. */
223 static unsigned int private_sleep(double secs)
227 XBT_DEBUG("Sleep for: %lf secs", secs);
229 int rank = smpi_comm_rank(MPI_COMM_WORLD);
/* Tracing payload describing this sleep (zero-initialized by xbt_new0). */
230 instr_extra_data extra = xbt_new0(s_instr_extra_data_t,1);
231 extra->type=TRACING_SLEEPING;
232 extra->sleep_duration=secs;
233 TRACE_smpi_sleeping_in(rank, extra);
235 simcall_process_sleep(secs);
237 TRACE_smpi_sleeping_out(rank);
/* Simulated replacement for sleep(): sleep for `secs` whole seconds and return
 * whatever private_sleep() reports. */
unsigned int smpi_sleep(unsigned int secs)
{
  const double duration = (double) secs;

  return private_sleep(duration);
}
249 int smpi_usleep(useconds_t usecs)
251 return (int)private_sleep((double)usecs / 1000000.0);
/* Simulated replacement for gettimeofday(): fill `tv` from the simulated clock.
 *
 * tv_sec receives the integral part of SIMIX_get_clock() and tv_usec the
 * fractional part scaled to microseconds. `tz` is not used by the visible code.
 * NOTE(review): the two tv_usec assignments differ only by their cast and are
 * presumably alternatives selected by a preprocessor conditional that is not
 * visible here; the NULL handling of `tv` and the return value are not visible
 * either -- confirm. */
255 int smpi_gettimeofday(struct timeval *tv, void* tz)
259 now = SIMIX_get_clock();
261 tv->tv_sec = (time_t)now;
263 tv->tv_usec = (useconds_t)((now - tv->tv_sec) * 1e6);
265 tv->tv_usec = (suseconds_t)((now - tv->tv_sec) * 1e6);
/* Numerical precision of the simulation clock, defined elsewhere. */
272 extern double sg_surf_precision;
/* Return the number of clock ticks per simulated second, computed as
 * 1 / sg_surf_precision and truncated to an unsigned integer. */
273 unsigned long long smpi_rastro_resolution (void)
276 double resolution = (1/sg_surf_precision);
278 return (unsigned long long)resolution;
/* Return the simulated clock expressed in ticks of smpi_rastro_resolution():
 * whole seconds times the resolution, plus the fractional part scaled by the
 * resolution (truncated on conversion to an integer). */
281 unsigned long long smpi_rastro_timestamp (void)
284 double now = SIMIX_get_clock();
286 unsigned long long sec = (unsigned long long)now;
/* Sub-second remainder, converted to ticks. */
287 unsigned long long pre = (now - sec) * smpi_rastro_resolution();
289 return (unsigned long long)sec * smpi_rastro_resolution() + pre;
292 /* ****************************** Functions related to the SMPI_SAMPLE_ macros ************************************/
/* Fields of the per-location benchmarking record stored in the `samples`
 * dictionary and handled as local_data_t by the functions below.
 * NOTE(review): the enclosing type definition lines are not visible here. */
294 double threshold; /* maximal stderr requested (if positive) */
295 double relstderr; /* observed stderr so far */
296 double mean; /* mean of benched times, to be used if the block is disabled */
297 double sum; /* sum of benched times (to compute the mean and stderr) */
298 double sum_pow2; /* sum of the square of the benched times (to compute the stderr) */
299 int iters; /* amount of requested iterations */
300 int count; /* amount of iterations done so far */
301 int benching; /* 1: we are benchmarking; 0: we have enough data, no bench anymore */
/* Build the dictionary key identifying a SMPI_SAMPLE_ block.
 *
 * The key is either "<file>:<line>" or "<file>:<line>:<process index>"; the
 * string is allocated by bprintf() and callers are expected to release it.
 * NOTE(review): the test on `global` selecting between the two forms is not
 * visible here; presumably the shorter, process-independent key is used when
 * `global` is non-zero -- confirm. */
304 static char *sample_location(int global, const char *file, int line) {
306 return bprintf("%s:%d", file, line);
308 return bprintf("%s:%d:%d", file, line, smpi_process_index());
/* Tell whether enough benchmark runs were collected for `data`.
 *
 * The starting answer is "count >= iters". When a positive threshold was
 * requested, the answer is forced to 0 while the observed relative standard
 * error is above that threshold.
 * NOTE(review): the condition guarding the first "res = 0" (not enough data)
 * and the return statement are not visible in this view -- confirm. */
311 static int sample_enough_benchs(local_data_t *data) {
312 int res = data->count >= data->iters;
313 if (data->threshold>0.0) {
315 res = 0; // not enough data
316 if (data->relstderr > data->threshold)
317 res = 0; // stderr too high yet
319 XBT_DEBUG("%s (count:%d iter:%d stderr:%f thres:%f mean:%fs)",
320 (res?"enough benchs":"need more data"),
321 data->count, data->iters, data->relstderr, data->threshold, data->mean);
/* Entry step of a SMPI_SAMPLE_ block located at file:line.
 *
 * Ends the current benchmark (so that preceding computation is accounted for),
 * flags the process as sampling, and looks up the statistics record of this
 * location in `samples`. A record is created on the first visit; on later
 * visits the requested settings are compared with the stored ones and
 * `benching` is recomputed from sample_enough_benchs().
 *
 * global    : forwarded to sample_location() to build the key
 * iters     : requested number of benchmark iterations
 * threshold : requested maximal relative standard error
 * NOTE(review): several statements (remaining field initialisation, branch
 * structure, release of `loc`) are not visible in this view. */
325 void smpi_sample_1(int global, const char *file, int line, int iters, double threshold)
327 char *loc = sample_location(global, file, line);
330 smpi_bench_end(); /* Take time from previous, unrelated computation into account */
331 smpi_process_set_sampling(1);
/* Dictionary values are released with free(). */
334 samples = xbt_dict_new_homogeneous(free);
336 data = xbt_dict_get_or_null(samples, loc);
338 xbt_assert(threshold>0 || iters>0,
339 "You should provide either a positive amount of iterations to bench, or a positive maximal stderr (or both)");
340 data = (local_data_t *) xbt_new(local_data_t, 1);
343 data->sum_pow2 = 0.0;
345 data->threshold = threshold;
346 data->benching = 1; // If we have no data, we need at least one
348 xbt_dict_set(samples, loc, data, NULL);
349 XBT_DEBUG("XXXXX First time ever on benched nest %s.",loc);
351 if (data->iters != iters || data->threshold != threshold) {
352 XBT_ERROR("Asked to bench block %s with different settings %d, %f is not %d, %f. How did you manage to give two numbers at the same line??",
353 loc, data->iters, data->threshold, iters,threshold);
357 // if we already have some data, check whether sample_2 should get one more bench or whether it should emulate the computation instead
358 data->benching = !sample_enough_benchs(data);
359 XBT_DEBUG("XXXX Re-entering the benched nest %s. %s",loc, (data->benching?"more benching needed":"we have enough data, skip computes"));
/* Loop-condition step of a SMPI_SAMPLE_ block located at file:line.
 *
 * Looks up the record created by smpi_sample_1(). When `benching` is 1 a new
 * benchmark run is needed; otherwise the stored mean is injected with
 * smpi_execute(), the sampling flag is cleared and `res` is set to 0.
 * NOTE(review): the statements of the benchmarking branch (presumably starting
 * the timer and making the result non-zero) and the return statement are not
 * visible in this view -- confirm. */
364 int smpi_sample_2(int global, const char *file, int line)
366 char *loc = sample_location(global, file, line);
370 xbt_assert(samples, "Y U NO use SMPI_SAMPLE_* macros? Stop messing directly with smpi_sample_* functions!");
/* xbt_dict_get() is used without a fallback: the record is expected to exist. */
371 data = xbt_dict_get(samples, loc);
372 XBT_DEBUG("sample2 %s",loc);
375 if (data->benching==1) {
376 // we need to run a new bench
377 XBT_DEBUG("benchmarking: count:%d iter:%d stderr:%f thres:%f; mean:%f",
378 data->count, data->iters, data->relstderr, data->threshold, data->mean);
381 // Enough data, no more bench (either we got enough data from previous visits to this benched nest, or we just ran one bench and need to bail out now that our job is done).
382 // Just sleep instead
383 XBT_DEBUG("No benchmark (either no need, or just ran one): count >= iter (%d >= %d) or stderr<thres (%f<=%f). apply the %fs delay instead",
384 data->count, data->iters, data->relstderr, data->threshold, data->mean);
385 smpi_execute(data->mean);
386 smpi_process_set_sampling(0);
387 res = 0; // prepare to capture future, unrelated computations
/* Closing step of one benchmarked iteration of a SMPI_SAMPLE_ block.
 *
 * Stops the per-process timer, folds the elapsed time into the statistics of
 * this location (sum of squares, mean, relative standard error of the mean)
 * and, while more benchmarks are still needed, stores the last sample in
 * `mean` so that smpi_sample_2() replays exactly this iteration's time.
 * NOTE(review): the updates of `count` and `sum`, the handling of
 * `benching == 0`, and the final reset of `benching` are not visible in this
 * view. Also note that relstderr divides by the mean, which is not guarded
 * against zero in the visible code. */
394 void smpi_sample_3(int global, const char *file, int line)
396 char *loc = sample_location(global, file, line);
399 xbt_assert(samples, "Y U NO use SMPI_SAMPLE_* macros? Stop messing directly with smpi_sample_* functions!");
400 data = xbt_dict_get(samples, loc);
401 XBT_DEBUG("sample3 %s",loc);
404 if (data->benching==0) {
408 // ok, benchmarking this loop is over
409 xbt_os_threadtimer_stop(smpi_process_timer());
414 sample = xbt_os_timer_elapsed(smpi_process_timer());
416 data->sum_pow2 += sample * sample;
417 n = (double)data->count;
418 data->mean = data->sum / n;
/* sqrt(variance / n) / mean: standard error of the mean, relative to the mean. */
419 data->relstderr = sqrt((data->sum_pow2 / n - data->mean * data->mean) / n) / data->mean;
420 if (!sample_enough_benchs(data)) {
421 data->mean = sample; // Still in benching process; We want sample_2 to simulate the exact time of this loop occurrence before leaving, not the mean over the history
423 XBT_DEBUG("Average mean after %d steps is %f, relative standard error is %f (sample was %f)", data->count,
424 data->mean, data->relstderr, sample);
426 // That's enough for now, prevent sample_2 to run the same code over and over
/* Value destructor installed on the `allocs` dictionary; `p` is the
 * shared_data_t stored for one allocation site.
 * NOTE(review): the statements releasing the record are not visible here. */
431 static void smpi_shared_alloc_free(void *p)
433 shared_data_t *data = p;
/**
 * Replace the allocation-site string `loc` by a short fixed-length name:
 * a leading '/' followed by 28 characters encoding the hexadecimal digest of
 * the original string, 6 hex digits (24 bits) per group of 4 output characters.
 *
 * @param loc heap-allocated string; it is reallocated to 30 bytes and
 *            overwritten, so the caller must only use the returned pointer.
 * @return the reallocated, NUL-terminated name.
 *
 * Fixes over the previous version:
 *  - the alphabet contained 'Z'/'z' twice and lacked 'W'/'w', so different
 *    6-bit values were rendered identically;
 *  - the shift used steps of 3 bits instead of 6, so the four output
 *    characters overlapped and the 9 low bits of every group were ignored.
 * Both defects needlessly increased the risk of name collisions.
 *
 * NOTE(review): the digest computation (xbt_sha) and the terminator writes
 * were not visible in the reviewed extract and are reconstructed here --
 * confirm against the complete file.
 */
static char *smpi_shared_alloc_hash(char *loc)
{
  /* URL- and filename-safe base64 alphabet: 64 distinct symbols, no '/'. */
  static const char alphabet[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_";
  char hash[42];
  char s[7];
  unsigned long val;
  int i, j;

  xbt_sha(loc, hash);
  hash[41] = '\0';
  s[6] = '\0';
  loc = xbt_realloc(loc, 30);
  loc[0] = '/';
  for (i = 0; i < 40; i += 6) { /* base64 encode */
    memcpy(s, hash + i, 6);
    val = strtoul(s, NULL, 16);
    for (j = 0; j < 4; j++) {
      /* Extract the j-th 6-bit group, most significant first. */
      unsigned char x = (unsigned char)((val >> (18 - 6 * j)) & 0x3f);
      loc[1 + 4 * i / 6 + j] = alphabet[x];
    }
  }
  loc[29] = '\0';
  return loc;
}
/* Allocate `size` bytes for the allocation site file:line.
 *
 * When the "smpi/use_shared_malloc" option is true, the site is identified by
 * a name built from the pid, file and line (then hashed). On the first request
 * for a site a shared memory object is created with shm_open(), mapped through
 * shm_map(), unlinked right away and recorded in `allocs`; later requests map
 * the recorded file descriptor again. Otherwise the memory comes from
 * xbt_malloc().
 * NOTE(review): reference counting, the release of `loc` and the return
 * statement are not visible in this view -- confirm. */
463 void *smpi_shared_malloc(size_t size, const char *file, int line)
466 if (sg_cfg_get_boolean("smpi/use_shared_malloc")){
467 char *loc = bprintf("%zu_%s_%d", (size_t)getpid(), file, line);
470 loc = smpi_shared_alloc_hash(loc); /* hash loc, in order to have something
473 allocs = xbt_dict_new_homogeneous(smpi_shared_alloc_free);
475 data = xbt_dict_get_or_null(allocs, loc);
/* O_EXCL: creation fails if an object of that name already exists. */
477 fd = shm_open(loc, O_RDWR | O_CREAT | O_EXCL,
478 S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
482 xbt_die("Please cleanup /dev/shm/%s", loc);
484 xbt_die("An unhandled error occured while opening %s. shm_open: %s", loc, strerror(errno));
487 data = xbt_new(shared_data_t, 1);
491 mem = shm_map(fd, size, data);
/* The name is removed immediately; the mapping and the descriptor stay usable. */
492 if (shm_unlink(loc) < 0) {
493 XBT_WARN("Could not early unlink %s. shm_unlink: %s", loc, strerror(errno));
495 xbt_dict_set(allocs, loc, data, NULL);
496 XBT_DEBUG("Mapping %s at %p through %d", loc, mem, fd);
/* Known site: map the already opened descriptor once more. */
499 mem = shm_map(data->fd, size, data);
502 XBT_DEBUG("Shared malloc %zu in %p (metadata at %p)", size, mem, data);
504 mem = xbt_malloc(size);
505 XBT_DEBUG("Classic malloc %zu in %p", size, mem);
/* Release memory obtained from smpi_shared_malloc().
 *
 * With "smpi/use_shared_malloc" enabled, the metadata of `ptr` is looked up in
 * `allocs_metadata` by the "%p" rendering of the address; problems are
 * reported with XBT_WARN. The region is unmapped with the recorded size and
 * the site is removed from `allocs` once its count is no longer positive.
 * NOTE(review): the count decrement, the early returns after the warnings, the
 * removal of the metadata entry and the non-shared branch's free call are not
 * visible in this view -- confirm. */
510 void smpi_shared_free(void *ptr)
512 char loc[PTR_STRLEN];
513 shared_metadata_t* meta;
515 if (sg_cfg_get_boolean("smpi/use_shared_malloc")){
518 XBT_WARN("Cannot free: nothing was allocated");
521 if(!allocs_metadata) {
522 XBT_WARN("Cannot free: no metadata was allocated");
/* Same key format as the one used by shm_map() when registering the mapping. */
524 snprintf(loc, PTR_STRLEN, "%p", ptr);
525 meta = (shared_metadata_t*)xbt_dict_get_or_null(allocs_metadata, loc);
527 XBT_WARN("Cannot free: %p was not shared-allocated by SMPI", ptr);
532 XBT_WARN("Cannot free: something is broken in the metadata link");
535 if(munmap(ptr, meta->size) < 0) {
536 XBT_WARN("Unmapping of fd %d failed: %s", data->fd, strerror(errno));
539 XBT_DEBUG("Shared free - no removal - of %p, count = %d", ptr, data->count);
540 if (data->count <= 0) {
542 xbt_dict_remove(allocs, data->loc);
543 XBT_DEBUG("Shared free - with removal - of %p", ptr);
546 XBT_DEBUG("Classic free of %p", ptr);
/* Tell whether a value was recorded for the pair (func, input).
 *
 * The lookup key is "<func>:<input>"; `calls` is created on first use with no
 * value destructor. The lookup relies on xbt_dict_get() throwing when the key
 * is absent; exceptions of another category than not_found_error are treated
 * differently.
 * NOTE(review): the try/catch structure, the release of `loc` and the return
 * statement are not visible in this view -- confirm. */
552 int smpi_shared_known_call(const char* func, const char* input)
554 char* loc = bprintf("%s:%s", func, input);
559 calls = xbt_dict_new_homogeneous(NULL);
562 xbt_dict_get(calls, loc); /* Succeed or throw */
569 if (ex.category != not_found_error)
/* Fetch the value recorded for the pair (func, input) under the key
 * "<func>:<input>". xbt_dict_get() is used, so a missing key is presumably
 * reported by an exception rather than by a NULL result -- confirm.
 * NOTE(review): the release of `loc` and the return statement are not visible. */
576 void* smpi_shared_get_call(const char* func, const char* input) {
577 char* loc = bprintf("%s:%s", func, input);
581 calls = xbt_dict_new_homogeneous(NULL);
583 data = xbt_dict_get(calls, loc);
/* Record `data` for the pair (func, input) under the key "<func>:<input>".
 * The dictionary has no value destructor, so `data` stays owned by the caller.
 * NOTE(review): the release of `loc` and the returned value are not visible. */
588 void* smpi_shared_set_call(const char* func, const char* input, void* data) {
589 char* loc = bprintf("%s:%s", func, input);
592 calls = xbt_dict_new_homogeneous(NULL);
594 xbt_dict_set(calls, loc, data, NULL);
/* Round `addr` down to a multiple of xbt_pagesize, i.e. to the start of the
 * page containing it, and yield the result as a `void *`.
 * The whole replacement list is parenthesized so that the macro expands to a
 * single primary expression whatever operator surrounds its use. */
#define TOPAGE(addr) ((void *)(((unsigned long)(addr) / xbt_pagesize) * xbt_pagesize))
606 * - read the executable data+bss section addresses and sizes
607 * - for each process create a copy of these sections with mmap
608 * - store them in a dynar
/* Make the private copy of the data segment of process `dest` the one that is
 * mapped at the executable's data address.
 *
 * Nothing needs to be done when there is no privatized area (size_data_exe is
 * 0) or when `dest` is already loaded. On the very first switch, the current
 * content of the real segment is copied into every private region. The file
 * backing the region of `dest` is then mapped with MAP_FIXED | MAP_SHARED over
 * the page-aligned start of the data segment; failure aborts via xbt_die().
 * NOTE(review): the early returns and any conditional compilation are not
 * visible in this view. */
614 void smpi_switch_data_segment(int dest){
616 if(size_data_exe == 0)//no need to switch
619 if (smpi_loaded_page==dest)//no need to switch either
624 if(smpi_loaded_page==-1){//initial switch, do the copy from the real page here
625 for (i=0; i< SIMIX_process_count(); i++){
626 memcpy(smpi_privatisation_regions[i].address,TOPAGE(start_data_exe),size_data_exe);
629 int current = smpi_privatisation_regions[dest].file_descriptor;
630 XBT_VERB("Switching data frame to the one of process %d", dest);
/* MAP_FIXED replaces whatever is currently mapped at that address. */
631 void* tmp = mmap (TOPAGE(start_data_exe), size_data_exe, PROT_READ | PROT_WRITE, MAP_FIXED | MAP_SHARED, current, 0);
632 if (tmp != TOPAGE(start_data_exe))
633 xbt_die("Couldn't map the new region");
634 smpi_loaded_page=dest;
/* Locate the .data and .bss sections of the running executable.
 *
 * Runs "objdump --section-headers <binary>" through popen() and parses its
 * output line by line, splitting each line on spaces. The size and address of
 * .data, and the size of .bss (grown by the gap between the end of .data and
 * the start of .bss), are combined into the globals start_data_exe and
 * size_data_exe; the latter also includes the offset of .data within its page.
 * NOTE(review): the check on the number of parsed fields, the `found`
 * increments and the final cleanup (command, line, pipe) are not visible in
 * this view. xbt_binary_name is inserted unquoted in the shell command. */
638 void smpi_get_executable_global_size(){
639 int size_bss_binary=0;
640 int size_data_binary=0;
642 char *line = NULL; /* Temporary storage for each line that is read */
643 ssize_t read; /* Number of bytes read */
644 size_t n = 0; /* Amount of bytes to read by xbt_getline */
649 char *command = bprintf("objdump --section-headers %s", xbt_binary_name);
651 fp = popen(command, "r");
654 perror("popen failed");
/* Stop at end of output or once both sections were found. */
658 while ((read = xbt_getline(&line, &n, fp)) != -1 && found != 2) {
663 /* Wipeout the new line character */
664 line[read - 1] = '\0';
666 lfields[0] = strtok(line, " ");
668 if(lfields[0] == NULL)
/* Header lines of the objdump output carry no section information. */
671 if(strcmp(lfields[0], "Sections:") == 0
672 || strcmp(lfields[0], "Idx") == 0
673 || strncmp(lfields[0], xbt_binary_name, strlen(xbt_binary_name)) == 0)
/* Collect up to 7 space-separated fields, stopping at the first missing one. */
676 for (i = 1; i < 7 && lfields[i - 1] != NULL; i++) {
677 lfields[i] = strtok(NULL, " ");
681 * we are looking for these fields
682 23 .data 02625a20 00000000006013e0 00000000006013e0 000013e0 2**5
683 CONTENTS, ALLOC, LOAD, DATA
684 24 .bss 02625a40 0000000002c26e00 0000000002c26e00 02626e00 2**5
/* Field 1 is the section name, field 2 its size and field 4 an address, all in hexadecimal. */
689 if(strcmp(lfields[1], ".data") == 0){
690 size_data_binary = strtoul(lfields[2], NULL, 16);
691 start_data_exe = (char*) strtoul(lfields[4], NULL, 16);
693 }else if(strcmp(lfields[1], ".bss") == 0){
694 //the beginning of bss is not exactly the end of data if not aligned, grow bss reported size accordingly
695 //TODO : check if this is OK, as some segments may be inserted between them..
696 size_bss_binary = ((char*) strtoul(lfields[4], NULL, 16) - (start_data_exe + size_data_binary))
697 + strtoul(lfields[2], NULL, 16);
/* Total privatized size: from the page start below .data up to the end of .bss. */
705 size_data_exe =(unsigned long)start_data_exe - (unsigned long)TOPAGE(start_data_exe)+ size_data_binary+size_bss_binary;
712 void smpi_initialize_global_memory_segments(){
715 smpi_privatize_global_variables=0;
720 smpi_get_executable_global_size();
722 XBT_DEBUG ("bss+data segment found : size %d starting at %p",size_data_exe, start_data_exe );
724 if(size_data_exe == 0){//no need to switch
725 smpi_privatize_global_variables=0;
729 smpi_privatisation_regions = (smpi_privatisation_region_t) malloc(
730 sizeof(struct s_smpi_privatisation_region) * sizeof(int));
732 for (i=0; i< SIMIX_process_count(); i++){
733 //create SIMIX_process_count() mappings of this size with the same data inside
734 void *address = NULL;
735 char path[] = "/dev/shm/my-buffer-XXXXXX";
738 int file_descriptor= mkstemp (path);
739 if (file_descriptor < 0)
740 xbt_die("Impossible to create temporary file for memory mapping");
742 status = unlink (path);
744 xbt_die("Impossible to unlink temporary file for memory mapping");
746 status = ftruncate(file_descriptor, size_data_exe);
748 xbt_die("Impossible to set the size of the temporary file for memory mapping");
750 /* Ask for a free region */
751 address = mmap (NULL, size_data_exe, PROT_READ | PROT_WRITE, MAP_SHARED, file_descriptor, 0);
752 if (address == MAP_FAILED)
753 xbt_die("Couldn't find a free region for memory mapping");
755 //initialize the values
756 memcpy(address,TOPAGE(start_data_exe),size_data_exe);
758 //store the address of the mapping for further switches
759 smpi_privatisation_regions[i].file_descriptor = file_descriptor;
760 smpi_privatisation_regions[i].address = address;
/* Undo smpi_initialize_global_memory_segments(): unmap every private region
 * (a failure is only reported with XBT_WARN), close its file descriptor and
 * release the region array. Nothing is done when size_data_exe is 0.
 * NOTE(review): the loop bound is smpi_process_count() whereas the regions are
 * created for SIMIX_process_count() processes -- confirm both always agree.
 * The early return and any conditional compilation are not visible here. */
767 void smpi_destroy_global_memory_segments(){
768 if(size_data_exe == 0)//no need to switch
772 for (i=0; i< smpi_process_count(); i++){
773 if(munmap(smpi_privatisation_regions[i].address,size_data_exe) < 0) {
774 XBT_WARN("Unmapping of fd %d failed: %s",
775 smpi_privatisation_regions[i].file_descriptor, strerror(errno));
777 close(smpi_privatisation_regions[i].file_descriptor);
779 xbt_free(smpi_privatisation_regions);