1 /* Copyright (c) 2007, 2009-2014. The SimGrid Team.
2 * All rights reserved. */
4 /* This program is free software; you can redistribute it and/or modify it
5 * under the terms of the license (GNU LGPL) which comes with this package. */
7 #include "internal_config.h"
10 #include "xbt/sysdep.h"
13 #include "surf/surf.h"
14 #include "simgrid/sg_config.h"
20 #include <sys/types.h>
23 #include <math.h> // sqrt
/* NOTE(review): presumably a fallback for systems that spell the flag MAP_ANON; the guard that
 * should surround this definition is not visible in this listing - confirm. */
29 #define MAP_ANONYMOUS MAP_ANON
/* Logging sub-category `smpi_bench` (child of `smpi`) used by the XBT_* log macros of this file. */
32 XBT_LOG_NEW_DEFAULT_SUBCATEGORY(smpi_bench, smpi,
33 "Logging specific to SMPI (benchmarking)");
/* Shared allocations are handled through shared memory segments.
 * Associated data and metadata are used as follows:
 *
 *                                                                    mmap #1
 *    `allocs' dict                                                     ---- -.
 *    ----------      shared_data_t               shared_metadata_t   / |  |  |
 * .->| <name> | ---> -------------------- <--.   -----------------   | |  |  |
 * |  ----------      | fd of <name>     |    |   | size of mmap  | --| |  |  |
 * |                  | count (2)        |    |-- | data          |   \ |  |  |
 * `----------------- | <name>           |    |   -----------------     ----  |
 *                    --------------------    |   ^                           |
 *                                            |   |                           |
 *                                            |   |   `allocs_metadata' dict  |
 *                                            |   |   ----------------------  |
 *                                            |   `-- | <addr of mmap #1>  |<-'
 *                                            |   .-- | <addr of mmap #2>  |<-.
 *                                            |   |   ----------------------  |
 *                                            |   |                           |
 *                                            |   |                           |
 *                                            |   |                           |
 *                                            |   |                   mmap #2 |
 *                                            |   v                     ---- -'
 *                                            |   shared_metadata_t   / |  |
 *                                            |   -----------------   | |  |
 *                                            |   | size of mmap  | --| |  |
 *                                            `-- | data          |   | |  |
 *                                                -----------------   | |  |
 *                                                                    \ |  |
 *                                                                      ----
 *
 * In words: `allocs' maps a segment <name> to one shared_data_t (descriptor, reference count,
 * name); `allocs_metadata' maps the address of each mmap of that segment to a
 * shared_metadata_t (size of the mapping, pointer back to the shared_data_t).
 */
/* Size of a buffer able to hold a pointer printed with "%p": "0x", two hex digits per byte and
 * the terminating NUL. Used for the keys of `allocs_metadata`. */
66 #define PTR_STRLEN (2 + 2 * sizeof(void*) + 1)
/* Keyed by the hashed allocation-site name; values are shared_data_t (see smpi_shared_malloc()). */
68 xbt_dict_t allocs = NULL; /* Allocated on first use */
/* Keyed by the "%p" rendering of a mapped address; values are shared_metadata_t (see shm_map()). */
69 xbt_dict_t allocs_metadata = NULL; /* Allocated on first use */
/* Keyed by the string built by sample_location(); values are local_data_t (see smpi_sample_1()). */
70 xbt_dict_t samples = NULL; /* Allocated on first use */
/* Keyed by "<func>:<input>"; values are caller-provided pointers (see smpi_shared_set_call()). */
71 xbt_dict_t calls = NULL; /* Allocated on first use */
/* Thread-local copy of smpi_process_index(), refreshed by smpi_bench_begin(). */
72 __thread int smpi_current_rank = 0; /* Updated after each MPI call */
/* smpi_execute() ignores computations whose duration is below this value. */
74 double smpi_cpu_threshold;
/* Factor by which smpi_execute() multiplies a measured duration to obtain an amount of flops. */
75 double smpi_running_power;
/* Start address and byte size of the data region handled by the privatization code; both are
 * written by smpi_get_executable_global_size(). */
80 char* start_data_exe = NULL;
81 int size_data_exe = 0;
/* Reset to 0 by smpi_initialize_global_memory_segments() when there is nothing to privatize. */
82 int smpi_privatize_global_variables;
/* Return the current size, in bytes, of the object behind descriptor `fd`.
 * Aborts through xbt_die() when fstat() fails. */
static size_t shm_size(int fd) {
  struct stat info;
  const int rc = fstat(fd, &info);

  if (rc < 0) {
    xbt_die("Could not stat fd %d: %s", fd, strerror(errno));
  }
  return (size_t)info.st_size;
}
/* Map `size` bytes of the shared-memory object `fd` into this process and register the mapping.
 *
 * The object is first grown with ftruncate() when it is smaller than `size`; it is then mapped
 * read/write with MAP_SHARED. A shared_metadata_t record is stored in `allocs_metadata` (created
 * on first use) under the "%p" rendering of the mapped address. A failing ftruncate() or mmap()
 * aborts through xbt_die().
 *
 * NOTE(review): the statements filling `meta` (presumably the mapping size and the back-pointer
 * to `data`) and the final return of the mapped address are not visible in this listing - confirm.
 */
105 static void* shm_map(int fd, size_t size, shared_data_t* data) {
107 char loc[PTR_STRLEN];
108 shared_metadata_t* meta;
// Only ever grow the backing object, never shrink it.
110 if(size > shm_size(fd)) {
111 if(ftruncate(fd, (off_t)size) < 0) {
112 xbt_die("Could not truncate fd %d to %zu: %s", fd, size, strerror(errno));
116 mem = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
117 if(mem == MAP_FAILED) {
118 xbt_die("Could not map fd %d: %s", fd, strerror(errno));
120 if(!allocs_metadata) {
121 allocs_metadata = xbt_dict_new_homogeneous(xbt_free);
// The mapped address, printed as text, is the dictionary key later rebuilt by smpi_shared_free().
123 snprintf(loc, PTR_STRLEN, "%p", mem);
124 meta = xbt_new(shared_metadata_t, 1);
127 xbt_dict_set(allocs_metadata, loc, meta, NULL);
128 XBT_DEBUG("MMAP %zu to %p", size, mem);
133 void smpi_bench_destroy(void)
135 xbt_dict_free(&allocs);
136 xbt_dict_free(&allocs_metadata);
137 xbt_dict_free(&samples);
138 xbt_dict_free(&calls);
141 XBT_PUBLIC(void) smpi_execute_flops_(double *flops);
142 void smpi_execute_flops_(double *flops)
144 smpi_execute_flops(*flops);
147 XBT_PUBLIC(void) smpi_execute_(double *duration);
148 void smpi_execute_(double *duration)
150 smpi_execute(*duration);
/* Simulate the execution of `flops` floating-point operations on the host returned by
 * SIMIX_host_self(): an execution named "computation" is created with simcall_host_execute()
 * and the caller blocks in simcall_host_execution_wait() until it completes.
 * NOTE(review): the declarations of `host` and `action` are not visible in this listing. */
153 void smpi_execute_flops(double flops) {
156 host = SIMIX_host_self();
157 XBT_DEBUG("Handle real computation time: %f flops", flops);
158 action = simcall_host_execute("computation", host, flops, 1, 0, 0);
// Attach the current SMPI tracing category to the execution.
// NOTE(review): presumably compiled only when tracing is enabled - guard not visible.
160 simcall_set_category (action, TRACE_internal_smpi_get_category());
162 simcall_host_execution_wait(action);
/* Inject a computation that took `duration` (as measured on the real machine) into the simulation.
 *
 * When `duration` reaches smpi_cpu_threshold it is converted to flops by multiplying it with
 * smpi_running_power, reported through TRACE_smpi_computing_in()/TRACE_smpi_computing_out(), and
 * executed with smpi_execute_flops(). The second debug message shows that shorter durations are
 * ignored.
 * NOTE(review): the `else` separating the two cases and any tracing guards are not visible here. */
165 void smpi_execute(double duration)
167 if (duration >= smpi_cpu_threshold) {
168 XBT_DEBUG("Sleep for %g to handle real computation time", duration);
169 double flops = duration * smpi_running_power;
// Describe the computation for the trace before running it.
171 int rank = smpi_process_index();
172 instr_extra_data extra = xbt_new0(s_instr_extra_data_t,1);
173 extra->type=TRACING_COMPUTING;
174 extra->comp_size=flops;
175 TRACE_smpi_computing_in(rank, extra);
177 smpi_execute_flops(flops);
180 TRACE_smpi_computing_out(rank);
184 XBT_DEBUG("Real computation took %g while option smpi/cpu_threshold is set to %g => ignore it",
185 duration, smpi_cpu_threshold);
/* Forward declaration: switch_data_segment() is defined further down in this file. */
189 void switch_data_segment(int dest);
/* Start timing a stretch of user code: switch to the data segment of the calling process, start
 * the per-process thread timer, then refresh smpi_current_rank. */
191 void smpi_bench_begin(void)
// The segment switch happens before the timer starts.
193 switch_data_segment(smpi_process_index());
194 xbt_os_threadtimer_start(smpi_process_timer());
195 smpi_current_rank = smpi_process_index();
/* Stop timing user code and charge the elapsed time to the simulation through smpi_execute().
 *
 * If the process is flagged as sampling (smpi_process_get_sampling()), the call is treated as a
 * nested benchmark: two critical messages and a backtrace are emitted and xbt_die() aborts. */
198 void smpi_bench_end(void)
200 xbt_os_timer_t timer = smpi_process_timer();
201 xbt_os_threadtimer_stop(timer);
202 // switch_data_segment(smpi_process_count());
203 if (smpi_process_get_sampling()) {
204 XBT_CRITICAL("Cannot do recursive benchmarks.");
205 XBT_CRITICAL("Are you trying to make a call to MPI within a SMPI_SAMPLE_ block?");
206 xbt_backtrace_display_current();
207 xbt_die("Aborting.");
// Convert the measured time into simulated computation.
209 smpi_execute(xbt_os_timer_elapsed(timer));
212 /* Private sleep function used by smpi_sleep() and smpi_usleep() */
/* Puts the calling process to sleep for `secs` seconds through simcall_process_sleep(), bracketed
 * by TRACE_smpi_sleeping_in()/TRACE_smpi_sleeping_out() events for the caller's rank in
 * MPI_COMM_WORLD.
 * NOTE(review): the statement producing the `unsigned int` result is not visible in this listing -
 * confirm what is returned. */
213 static unsigned int private_sleep(double secs)
217 XBT_DEBUG("Sleep for: %lf secs", secs);
// Describe the sleep for the trace before performing it.
219 int rank = smpi_comm_rank(MPI_COMM_WORLD);
220 instr_extra_data extra = xbt_new0(s_instr_extra_data_t,1);
221 extra->type=TRACING_SLEEPING;
222 extra->sleep_duration=secs;
223 TRACE_smpi_sleeping_in(rank, extra);
225 simcall_process_sleep(secs);
227 TRACE_smpi_sleeping_out(rank);
/* sleep()-shaped entry point: converts `secs` to a double and delegates to private_sleep(),
 * whose result is returned unchanged. */
unsigned int smpi_sleep(unsigned int secs)
{
  const double seconds = (double)secs;

  return private_sleep(seconds);
}
239 int smpi_usleep(useconds_t usecs)
241 return (int)private_sleep((double)usecs / 1000000.0);
/* gettimeofday()-shaped function that reports the clock returned by SIMIX_get_clock(): the whole
 * seconds go to tv->tv_sec and the remaining fraction, scaled by 1e6, to tv->tv_usec.
 * `tz` is not used in the lines visible here.
 * NOTE(review): the two tv_usec assignments differ only by their cast; presumably they are
 * alternatives selected by a platform #ifdef that is not visible in this listing - confirm.
 * NOTE(review): the declaration of `now`, any NULL check on `tv` and the return statement are
 * not visible either. */
245 int smpi_gettimeofday(struct timeval *tv, void* tz)
249 now = SIMIX_get_clock();
251 tv->tv_sec = (time_t)now;
253 tv->tv_usec = (useconds_t)((now - tv->tv_sec) * 1e6);
255 tv->tv_usec = (suseconds_t)((now - tv->tv_sec) * 1e6);
/* Defined elsewhere; its inverse is used as the timestamp resolution below. */
262 extern double sg_surf_precision;
/* Return the number of timestamp ticks per time unit: 1/sg_surf_precision truncated to an
 * unsigned integer.
 * NOTE(review): assumes sg_surf_precision is non-zero and that its inverse fits in
 * unsigned long long; the embedded line numbering suggests statements around the computation
 * are missing from this listing. */
263 unsigned long long smpi_rastro_resolution (void)
266 double resolution = (1/sg_surf_precision);
268 return (unsigned long long)resolution;
/* Return the clock given by SIMIX_get_clock() expressed in ticks of smpi_rastro_resolution():
 * the whole seconds multiplied by the resolution, plus the fractional part multiplied by the
 * resolution (truncated when stored in `pre`).
 * NOTE(review): the embedded line numbering suggests statements before and after the computation
 * are missing from this listing. */
271 unsigned long long smpi_rastro_timestamp (void)
274 double now = SIMIX_get_clock();
276 unsigned long long sec = (unsigned long long)now;
277 unsigned long long pre = (now - sec) * smpi_rastro_resolution();
279 return (unsigned long long)sec * smpi_rastro_resolution() + pre;
282 /* ****************************** Functions related to the SMPI_SAMPLE_ macros ************************************/
/* Fields of the per-location benchmarking record (local_data_t) kept in the `samples` dict by the
 * smpi_sample_*() functions.
 * NOTE(review): the enclosing typedef/struct lines are not visible in this listing. */
284 double threshold; /* maximal stderr requested (if positive) */
/* Stored relative to the mean: smpi_sample_3() divides the standard error by `mean`. */
285 double relstderr; /* observed stderr so far */
286 double mean; /* mean of benched times, to be used if the block is disabled */
287 double sum; /* sum of benched times (to compute the mean and stderr) */
288 double sum_pow2; /* sum of the square of the benched times (to compute the stderr) */
289 int iters; /* amount of requested iterations */
290 int count; /* amount of iterations done so far */
291 int benching; /* 1: we are benchmarking; 0: we have enough data, no bench anymore */
/* Build the key identifying a benched block in the `samples` dict: either "<file>:<line>" or
 * "<file>:<line>:<process index>".
 * The string is produced by bprintf(); presumably the caller has to free it.
 * NOTE(review): the test on `global` selecting between the two returns is not visible in this
 * listing; presumably the short, process-independent form is used when `global` is non-zero -
 * confirm. */
294 static char *sample_location(int global, const char *file, int line) {
296 return bprintf("%s:%d", file, line);
298 return bprintf("%s:%d:%d", file, line, smpi_process_index());
/* Decide whether the block described by `data` has been benchmarked enough.
 *
 * The answer starts as "count >= iters". When a positive threshold was requested, it is forced
 * back to 0 while the relative standard error is still above that threshold. The debug message
 * shows that a non-zero result means "enough benchs".
 * NOTE(review): the condition guarding the "not enough data" reset and the return statement are
 * not visible in this listing. */
301 static int sample_enough_benchs(local_data_t *data) {
302 int res = data->count >= data->iters;
303 if (data->threshold>0.0) {
305 res = 0; // not enough data
306 if (data->relstderr > data->threshold)
307 res = 0; // stderr too high yet
309 XBT_DEBUG("%s (count:%d iter:%d stderr:%f thres:%f mean:%fs)",
310 (res?"enough benchs":"need more data"),
311 data->count, data->iters, data->relstderr, data->threshold, data->mean);
/* First step of a sampled block: close the current measurement, flag the process as sampling and
 * create or re-validate the benchmarking record of this location.
 *
 * global, file, line: identify the block (see sample_location()).
 * iters:              requested number of benchmarked iterations.
 * threshold:          requested maximal relative standard error.
 * At least one of `iters` and `threshold` must be positive when the record is created (asserted).
 *
 * On later visits the stored settings are compared with the arguments (a mismatch is logged as an
 * error) and `benching` is recomputed from sample_enough_benchs().
 * NOTE(review): the branch separating "first visit" from "later visit", the remaining field
 * initialisations and the release of `loc` are not visible in this listing. */
315 void smpi_sample_1(int global, const char *file, int line, int iters, double threshold)
317 char *loc = sample_location(global, file, line);
320 smpi_bench_end(); /* Take time from previous, unrelated computation into account */
321 smpi_process_set_sampling(1);
// Records stored in `samples` are released with free().
324 samples = xbt_dict_new_homogeneous(free);
326 data = xbt_dict_get_or_null(samples, loc);
328 xbt_assert(threshold>0 || iters>0,
329 "You should provide either a positive amount of iterations to bench, or a positive maximal stderr (or both)");
330 data = (local_data_t *) xbt_new(local_data_t, 1);
333 data->sum_pow2 = 0.0;
335 data->threshold = threshold;
336 data->benching = 1; // If we have no data, we need at least one
338 xbt_dict_set(samples, loc, data, NULL);
339 XBT_DEBUG("XXXXX First time ever on benched nest %s.",loc);
341 if (data->iters != iters || data->threshold != threshold) {
342 XBT_ERROR("Asked to bench block %s with different settings %d, %f is not %d, %f. How did you manage to give two numbers at the same line??",
343 loc, data->iters, data->threshold, iters,threshold);
347 // if we already have some data, check whether sample_2 should get one more bench or whether it should emulate the computation instead
348 data->benching = !sample_enough_benchs(data);
349 XBT_DEBUG("XXXX Re-entering the benched nest %s. %s",loc, (data->benching?"more benching needed":"we have enough data, skip computes"));
/* Second step of a sampled block: tell the caller whether the block body must really be run.
 *
 * The record of the location must already exist (`samples` is asserted non-NULL and looked up
 * with xbt_dict_get()). When the record is in benching mode only a debug message is visible
 * here; otherwise the stored mean is injected with smpi_execute(), the sampling flag is cleared
 * and the result is 0.
 * NOTE(review): the value assigned for the benching case (presumably 1), the `else`, the return
 * statement and the release of `loc` are not visible in this listing. */
354 int smpi_sample_2(int global, const char *file, int line)
356 char *loc = sample_location(global, file, line);
360 xbt_assert(samples, "Y U NO use SMPI_SAMPLE_* macros? Stop messing directly with smpi_sample_* functions!");
361 data = xbt_dict_get(samples, loc);
362 XBT_DEBUG("sample2 %s",loc);
365 if (data->benching==1) {
366 // we need to run a new bench
367 XBT_DEBUG("benchmarking: count:%d iter:%d stderr:%f thres:%f; mean:%f",
368 data->count, data->iters, data->relstderr, data->threshold, data->mean);
371 // Enough data, no more bench (either we got enough data from previous visits to this benched nest, or we just ran one bench and need to bail out now that our job is done).
372 // Just sleep instead
373 XBT_DEBUG("No benchmark (either no need, or just ran one): count >= iter (%d >= %d) or stderr<thres (%f<=%f). apply the %fs delay instead",
374 data->count, data->iters, data->relstderr, data->threshold, data->mean);
375 smpi_execute(data->mean);
376 smpi_process_set_sampling(0);
377 res = 0; // prepare to capture future, unrelated computations
/* Third step of a sampled block: record the duration of the iteration that just ran.
 *
 * The per-process timer is stopped and its elapsed time becomes the new sample; the running sum
 * of squares, the mean and the relative standard error of the record are then refreshed. While
 * sample_enough_benchs() still answers "no", `mean` is overwritten with the latest sample so that
 * smpi_sample_2() replays exactly this iteration.
 * NOTE(review): what happens when `benching` is already 0, the updates of `count` and `sum`, the
 * final reset of `benching` and the release of `loc` are not visible in this listing.
 * NOTE(review): with a zero mean, or when rounding makes "sum_pow2/n - mean*mean" slightly
 * negative, relstderr becomes NaN/inf; a NaN compares false against the threshold in
 * sample_enough_benchs() - confirm this is acceptable. */
384 void smpi_sample_3(int global, const char *file, int line)
386 char *loc = sample_location(global, file, line);
389 xbt_assert(samples, "Y U NO use SMPI_SAMPLE_* macros? Stop messing directly with smpi_sample_* functions!");
390 data = xbt_dict_get(samples, loc);
391 XBT_DEBUG("sample3 %s",loc);
394 if (data->benching==0) {
398 // ok, benchmarking this loop is over
399 xbt_os_threadtimer_stop(smpi_process_timer());
404 sample = xbt_os_timer_elapsed(smpi_process_timer());
406 data->sum_pow2 += sample * sample;
407 n = (double)data->count;
408 data->mean = data->sum / n;
// Standard error of the mean, divided by the mean to make it relative.
409 data->relstderr = sqrt((data->sum_pow2 / n - data->mean * data->mean) / n) / data->mean;
410 if (!sample_enough_benchs(data)) {
411 data->mean = sample; // Still in benching process; We want sample_2 to simulate the exact time of this loop occurrence before leaving, not the mean over the history
413 XBT_DEBUG("Average mean after %d steps is %f, relative standard error is %f (sample was %f)", data->count,
414 data->mean, data->relstderr, sample);
416 // That's enough for now, prevent sample_2 to run the same code over and over
/* Value destructor registered on the `allocs` dict by smpi_shared_malloc(): `p` is the
 * shared_data_t record being dropped.
 * NOTE(review): the statements releasing the record are not visible in this listing. */
421 static void smpi_shared_alloc_free(void *p)
423 shared_data_t *data = p;
/* Replace the heap-allocated string `loc` by a short name derived from its digest.
 *
 * The 40 hexadecimal digits of the digest are consumed six at a time (24 bits) and each group is
 * written as four symbols of a 64-symbol alphabet, after a leading '/'. The result always fits
 * in 30 bytes, terminator included.
 *
 * loc:     string obtained from the xbt allocator; it is reallocated, so the caller must use the
 *          returned pointer and not the argument afterwards.
 * returns: the reallocated buffer holding "/" followed by 28 encoded symbols.
 *
 * Fixes over the previous version: the alphabet listed 'Z' and 'z' twice and lacked 'W' and 'w',
 * and the shift stepped by 3 bits instead of 6, so the four symbols of a group overlapped and
 * the low bits of each group were never encoded. Both made distinct digests more likely to
 * yield the same name.
 *
 * NOTE(review): the local declarations, the digest call, the leading '/' and the final
 * terminator/return were not visible in the reviewed listing and are reconstructed from how the
 * visible loop uses them - confirm against the full file.
 */
static char *smpi_shared_alloc_hash(char *loc)
{
  /* URL-safe base64 alphabet: 64 distinct symbols, none of which is '/'. */
  static const char alphabet[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_";
  char hash[42] = { 0 };
  char s[7];
  unsigned val;
  int i, j;

  xbt_sha(loc, hash);
  hash[41] = '\0';
  s[6] = '\0';
  loc = xbt_realloc(loc, 30);
  loc[0] = '/';
  for (i = 0; i < 40; i += 6) { /* base64 encode */
    /* The last group holds only four digits; the NUL bytes that follow stop strtoul() early. */
    memcpy(s, hash + i, 6);
    val = strtoul(s, NULL, 16);
    for (j = 0; j < 4; j++) {
      /* Consecutive, non-overlapping 6-bit groups: bits 23-18, 17-12, 11-6 and 5-0. */
      unsigned char x = (val >> (18 - 6 * j)) & 0x3f;
      loc[1 + 4 * i / 6 + j] = alphabet[x];
    }
  }
  loc[29] = '\0';
  return loc;
}
/* Allocate `size` bytes for the allocation site (file, line).
 *
 * With the "smpi/use_shared_malloc" option set, the site is turned into a name (pid, file and
 * line, then hashed by smpi_shared_alloc_hash()). The first request for a name creates a
 * shared-memory object with shm_open(), maps it with shm_map(), unlinks the name right away and
 * records a shared_data_t in `allocs`; later requests map the same descriptor again. Without the
 * option a plain xbt_malloc() is done.
 * A failing shm_open() aborts through xbt_die(); a failing shm_unlink() only logs a warning.
 *
 * NOTE(review): the initialisation of the new shared_data_t, the reference-count update for
 * later requests, the release of `loc` in that case and the return statement are not visible in
 * this listing.
 * NOTE(review): the comment opened on the hashing line is not closed in this listing (its
 * continuation line is missing). */
453 void *smpi_shared_malloc(size_t size, const char *file, int line)
456 if (sg_cfg_get_boolean("smpi/use_shared_malloc")){
// The name is unique per process (pid) and per allocation site (file, line).
457 char *loc = bprintf("%zu_%s_%d", (size_t)getpid(), file, line);
460 loc = smpi_shared_alloc_hash(loc); /* hash loc, in order to have something
463 allocs = xbt_dict_new_homogeneous(smpi_shared_alloc_free);
465 data = xbt_dict_get_or_null(allocs, loc);
// O_EXCL: the object must not already exist, otherwise the user is asked to clean /dev/shm.
467 fd = shm_open(loc, O_RDWR | O_CREAT | O_EXCL,
468 S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
472 xbt_die("Please cleanup /dev/shm/%s", loc);
474 xbt_die("An unhandled error occured while opening %s. shm_open: %s", loc, strerror(errno));
477 data = xbt_new(shared_data_t, 1);
481 mem = shm_map(fd, size, data);
// The name is removed immediately; the object stays reachable through the open descriptor.
482 if (shm_unlink(loc) < 0) {
483 XBT_WARN("Could not early unlink %s. shm_unlink: %s", loc, strerror(errno));
485 xbt_dict_set(allocs, loc, data, NULL);
486 XBT_DEBUG("Mapping %s at %p through %d", loc, mem, fd);
// Known site: map the existing descriptor once more.
489 mem = shm_map(data->fd, size, data);
492 XBT_DEBUG("Shared malloc %zu in %p (metadata at %p)", size, mem, data);
494 mem = xbt_malloc(size);
495 XBT_DEBUG("Classic malloc %zu in %p", size, mem);
/* Release a block obtained from smpi_shared_malloc().
 *
 * With the "smpi/use_shared_malloc" option set, the metadata of `ptr` is looked up in
 * `allocs_metadata` under the "%p" rendering of the address, the mapping is removed with
 * munmap() using the recorded size, and the shared_data_t is removed from `allocs` once its
 * count is no longer positive. Every inconsistency (no dict, unknown pointer, broken link, failed
 * munmap) is reported with a warning.
 *
 * NOTE(review): the tests preceding each warning, the early returns, the decrement of the count
 * and the non-shared branch (presumably a plain free) are not visible in this listing.
 * NOTE(review): no removal of the `allocs_metadata` entry is visible here - confirm whether
 * stale entries are intended. */
500 void smpi_shared_free(void *ptr)
502 char loc[PTR_STRLEN];
503 shared_metadata_t* meta;
505 if (sg_cfg_get_boolean("smpi/use_shared_malloc")){
508 XBT_WARN("Cannot free: nothing was allocated");
511 if(!allocs_metadata) {
512 XBT_WARN("Cannot free: no metadata was allocated");
// Same key format as the one used by shm_map() when registering the mapping.
514 snprintf(loc, PTR_STRLEN, "%p", ptr);
515 meta = (shared_metadata_t*)xbt_dict_get_or_null(allocs_metadata, loc);
517 XBT_WARN("Cannot free: %p was not shared-allocated by SMPI", ptr);
522 XBT_WARN("Cannot free: something is broken in the metadata link");
525 if(munmap(ptr, meta->size) < 0) {
526 XBT_WARN("Unmapping of fd %d failed: %s", data->fd, strerror(errno));
529 XBT_DEBUG("Shared free - no removal - of %p, count = %d", ptr, data->count);
// Last mapping gone: drop the record (the dict's destructor is smpi_shared_alloc_free()).
530 if (data->count <= 0) {
532 xbt_dict_remove(allocs, data->loc);
533 XBT_DEBUG("Shared free - with removal - of %p", ptr);
536 XBT_DEBUG("Classic free of %p", ptr);
/* Tell whether a value is recorded in `calls` for the pair (func, input), i.e. under the key
 * "<func>:<input>".
 *
 * The lookup uses xbt_dict_get(), which throws when the key is missing; the visible test on
 * ex.category shows that exceptions other than not_found_error are handled differently from a
 * simple miss.
 * NOTE(review): the exception-handling scaffolding, the computation of the result, the release
 * of `loc` and the return statement are not visible in this listing. */
542 int smpi_shared_known_call(const char* func, const char* input)
544 char* loc = bprintf("%s:%s", func, input);
// No value destructor: the dict does not own the stored pointers.
549 calls = xbt_dict_new_homogeneous(NULL);
552 xbt_dict_get(calls, loc); /* Succeed or throw */
559 if (ex.category != not_found_error)
/* Fetch the value recorded in `calls` for the pair (func, input), i.e. under the key
 * "<func>:<input>". The lookup uses xbt_dict_get(), the call annotated "Succeed or throw" in
 * smpi_shared_known_call().
 * NOTE(review): the declaration of `data`, the return statement and the release of `loc` are not
 * visible in this listing - confirm that the bprintf() string is freed on every path. */
566 void* smpi_shared_get_call(const char* func, const char* input) {
567 char* loc = bprintf("%s:%s", func, input);
// No value destructor: the dict does not own the stored pointers.
571 calls = xbt_dict_new_homogeneous(NULL);
573 data = xbt_dict_get(calls, loc);
/* Record `data` in `calls` for the pair (func, input), i.e. under the key "<func>:<input>".
 * The dict is created without a value destructor, so `data` stays owned by the caller.
 * NOTE(review): the return statement (presumably `data`) and the release of `loc` are not visible
 * in this listing - confirm that the bprintf() string is freed after xbt_dict_set(). */
578 void* smpi_shared_set_call(const char* func, const char* input, void* data) {
579 char* loc = bprintf("%s:%s", func, input);
582 calls = xbt_dict_new_homogeneous(NULL);
584 xbt_dict_set(calls, loc, data, NULL);
/* Round `addr` down to the start of the page that contains it (pages are xbt_pagesize bytes).
 * NOTE(review): the address goes through unsigned long, which assumes that type is at least as
 * wide as a pointer. */
592 #define TOPAGE(addr) (void *)(((unsigned long)(addr) / xbt_pagesize) * xbt_pagesize)
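/* Privatization of global variables, in outline:
 * - read the executable data+bss section addresses and sizes
 * - for each process create a copy of these sections with mmap
 * - store the descriptor and the address of each copy in the `fds` and `mappings` arrays
 */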
/* Make the private copy of the data region belonging to process `dest` the one that is mapped at
 * the executable's data address.
 *
 * Nothing is done when no region was found (size_data_exe == 0) or when `dest` is already the
 * loaded copy. On the very first switch (loaded_page == -1) the live region is first copied into
 * every private mapping. The switch itself is an mmap() with MAP_FIXED | MAP_SHARED of the
 * descriptor fds[dest] over the page-aligned start of the region; a result at any other address
 * aborts through xbt_die().
 *
 * NOTE(review): the early returns, the declaration of `i`, the update of `loaded_page` after the
 * switch and any platform guards are not visible in this listing.
 * NOTE(review): the copy loop is bounded by SIMIX_process_count() while `mappings` is allocated
 * with smpi_process_count() entries in smpi_initialize_global_memory_segments(); if the two
 * counts can differ this reads past the array - confirm. */
604 void switch_data_segment(int dest){
606 if(size_data_exe == 0)//no need to switch
609 if (loaded_page==dest)//no need to switch either
615 if(loaded_page==-1){//initial switch, do the copy from the real page here
616 for (i=0; i< SIMIX_process_count(); i++){
617 memcpy(mappings[i],TOPAGE(start_data_exe),size_data_exe);
620 int current= fds[dest];
621 XBT_VERB("Switching data frame to the one of process %d", dest);
// MAP_FIXED replaces whatever is currently mapped at the data region's address.
622 void* tmp = mmap (TOPAGE(start_data_exe), size_data_exe, PROT_READ | PROT_WRITE, MAP_FIXED | MAP_SHARED, current, 0);
623 if (tmp != TOPAGE(start_data_exe))
624 xbt_die("Couldn't map the new region");
/* Locate the .data and .bss sections of the running executable and derive the region to
 * privatize.
 *
 * The output of "objdump --section-headers <binary>" is read line by line through popen(). Each
 * line is split on spaces into at most seven fields; header lines ("Sections:", "Idx", or lines
 * starting with the binary name) are recognised by their first field. For the .data line the
 * size (field 2) and address (field 4) are recorded; for the .bss line the size is extended by
 * the gap between the end of .data and the start of .bss. Finally size_data_exe is set to the
 * distance from the page-aligned start of .data to the end of .bss.
 *
 * NOTE(review): the declarations of `fp`, `found`, `lfields` and `i`, the handling after
 * "popen failed", the `continue` statements, the updates of `found`, any guard ensuring that
 * fields 1, 2 and 4 exist before they are used, and the cleanup of `line`, `command` and `fp`
 * are not visible in this listing.
 * NOTE(review): xbt_binary_name is inserted unquoted into a shell command; a path containing
 * spaces or shell metacharacters would break or alter the command.
 * NOTE(review): the sizes are stored in `int` although they come from strtoul(); very large
 * sections would be truncated.
 * NOTE(review): the four lines showing sample objdump output below are comment text whose
 * opening and closing lines are missing from this listing. */
629 void smpi_get_executable_global_size(){
630 int size_bss_binary=0;
631 int size_data_binary=0;
633 char *line = NULL; /* Temporal storage for each line that is readed */
634 ssize_t read; /* Number of bytes readed */
635 size_t n = 0; /* Amount of bytes to read by xbt_getline */
640 char *command = bprintf("objdump --section-headers %s", xbt_binary_name);
642 fp = popen(command, "r");
645 perror("popen failed");
// Stop as soon as both sections have been seen (found == 2) or the output ends.
649 while ((read = xbt_getline(&line, &n, fp)) != -1 && found != 2) {
654 /* Wipeout the new line character */
655 line[read - 1] = '\0';
657 lfields[0] = strtok(line, " ");
659 if(lfields[0] == NULL)
662 if(strcmp(lfields[0], "Sections:") == 0
663 || strcmp(lfields[0], "Idx") == 0
664 || strncmp(lfields[0], xbt_binary_name, strlen(xbt_binary_name)) == 0)
// Collect the remaining fields; the loop stops at the first missing one.
667 for (i = 1; i < 7 && lfields[i - 1] != NULL; i++) {
668 lfields[i] = strtok(NULL, " ");
672 * we are looking for these fields
673 23 .data 02625a20 00000000006013e0 00000000006013e0 000013e0 2**5
674 CONTENTS, ALLOC, LOAD, DATA
675 24 .bss 02625a40 0000000002c26e00 0000000002c26e00 02626e00 2**5
680 if(strcmp(lfields[1], ".data") == 0){
681 size_data_binary = strtoul(lfields[2], NULL, 16);
682 start_data_exe = (char*) strtoul(lfields[4], NULL, 16);
684 }else if(strcmp(lfields[1], ".bss") == 0){
685 //the beginning of bss is not exactly the end of data if not aligned, grow bss reported size accordingly
686 //TODO : check if this is OK, as some segments may be inserted between them..
687 size_bss_binary = ((char*) strtoul(lfields[4], NULL, 16) - (start_data_exe + size_data_binary))
688 + strtoul(lfields[2], NULL, 16);
// Region size = offset of .data inside its first page + .data size + (extended) .bss size.
696 size_data_exe =(unsigned long)start_data_exe - (unsigned long)TOPAGE(start_data_exe)+ size_data_binary+size_bss_binary;
/* Create one private, file-backed copy of the executable's data region per process.
 *
 * The region is located with smpi_get_executable_global_size(); when it is empty, privatization
 * is switched off. Otherwise, for each process: a temporary file is created under /dev/shm with
 * mkstemp() and unlinked at once, it is sized to the region with ftruncate(), a free address
 * range is reserved with an anonymous PROT_NONE mapping, the file is mapped over that range with
 * MAP_FIXED | MAP_SHARED, and the current content of the region is copied into it. The descriptor
 * and address are kept in fds[i] and mappings[i] for switch_data_segment(). Every failure aborts
 * through xbt_die().
 *
 * NOTE(review): the first assignment of smpi_privatize_global_variables is presumably inside a
 * platform-specific branch whose guard is not visible; the declarations of `i` and `status`, the
 * tests on `status` and on `tmp`, and the early returns are not visible either.
 * NOTE(review): `fds` and `mappings` are allocated with smpi_process_count() entries but the
 * loop is bounded by SIMIX_process_count(); if the two counts can differ this writes past the
 * arrays (or leaves entries unset for smpi_destroy_global_memory_segments()) - confirm. */
703 void smpi_initialize_global_memory_segments(){
706 smpi_privatize_global_variables=0;
711 smpi_get_executable_global_size();
713 XBT_DEBUG ("bss+data segment found : size %d starting at %p",size_data_exe, start_data_exe );
715 if(size_data_exe == 0){//no need to switch
716 smpi_privatize_global_variables=0;
720 fds= (int*)xbt_malloc((smpi_process_count())*sizeof(int));
721 mappings= (void**)xbt_malloc((smpi_process_count())*sizeof(void*));
724 for (i=0; i< SIMIX_process_count(); i++){
725 //create SIMIX_process_count() mappings of this size with the same data inside
726 void *address = NULL, *tmp = NULL;
727 char path[] = "/dev/shm/my-buffer-XXXXXX";
729 int file_descriptor= mkstemp (path);
730 if (file_descriptor < 0)
731 xbt_die("Impossible to create temporary file for memory mapping");
// The file only needs to live as long as its descriptor, so its name is removed immediately.
732 status = unlink (path);
734 xbt_die("Impossible to unlink temporary file for memory mapping");
736 status = ftruncate(file_descriptor, size_data_exe);
738 xbt_die("Impossible to set the size of the temporary file for memory mapping");
740 /* Ask for a free region */
741 address = mmap (NULL, size_data_exe, PROT_NONE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
743 if (address == MAP_FAILED)
744 xbt_die("Couldn't find a free region for memory mapping");
// Replace the reservation by the file-backed mapping at the very same address.
746 tmp = mmap (address, size_data_exe, PROT_READ | PROT_WRITE, MAP_FIXED | MAP_SHARED, file_descriptor, 0);
749 xbt_die("Couldn't obtain the right address");
750 //initialize the values
751 memcpy(address,TOPAGE(start_data_exe),size_data_exe);
753 //store the address of the mapping for further switches
754 fds[i]=file_descriptor;
755 mappings[i]= address;
762 void smpi_destroy_global_memory_segments(){
763 if(size_data_exe == 0)//no need to switch
767 for (i=0; i< smpi_process_count(); i++){
768 if(munmap(mappings[i],size_data_exe) < 0) {
769 XBT_WARN("Unmapping of fd %d failed: %s", fds[i], strerror(errno));