1 /* Copyright (c) 2007, 2009-2014. The SimGrid Team.
2 * All rights reserved. */
4 /* This program is free software; you can redistribute it and/or modify it
5 * under the terms of the license (GNU LGPL) which comes with this package. */
7 #include "internal_config.h"
10 #include "xbt/sysdep.h"
13 #include "surf/surf.h"
14 #include "simgrid/sg_config.h"
20 #include <sys/types.h>
23 #include <math.h> // sqrt
29 #define MAP_ANONYMOUS MAP_ANON
32 XBT_LOG_NEW_DEFAULT_SUBCATEGORY(smpi_bench, smpi,
33 "Logging specific to SMPI (benchmarking)");
35 /* Shared allocations are handled through shared memory segments.
36 * Associated data and metadata are used as follows:
39 * `allocs' dict ---- -.
40 * ---------- shared_data_t shared_metadata_t / | | |
41 * .->| <name> | ---> -------------------- <--. ----------------- | | | |
42 * | ---------- | fd of <name> | | | size of mmap | --| | | |
43 * | | count (2) | |-- | data | \ | | |
44 * `----------------- | <name> | | ----------------- ---- |
45 * -------------------- | ^ |
47 * | | `allocs_metadata' dict |
48 * | | ---------------------- |
49 * | `-- | <addr of mmap #1> |<-'
50 * | .-- | <addr of mmap #2> |<-.
51 * | | ---------------------- |
57 * | shared_metadata_t / | |
58 * | ----------------- | | |
59 * | | size of mmap | --| | |
61 * ----------------- | | |
66 #define PTR_STRLEN (2 + 2 * sizeof(void*) + 1)
68 xbt_dict_t allocs = NULL; /* Allocated on first use */
69 xbt_dict_t allocs_metadata = NULL; /* Allocated on first use */
70 xbt_dict_t samples = NULL; /* Allocated on first use */
71 xbt_dict_t calls = NULL; /* Allocated on first use */
72 __thread int smpi_current_rank = 0; /* Updated after each MPI call */
74 double smpi_cpu_threshold;
75 double smpi_running_power;
80 char* start_data_exe = NULL;
81 int size_data_exe = 0;
82 int smpi_privatize_global_variables;
/* Return the size, in bytes, reported by fstat() for descriptor `fd`.
 * A failing fstat() is reported through xbt_die() together with the
 * errno description. */
static size_t shm_size(int fd) {
  struct stat info;
  const int rc = fstat(fd, &info);

  if (rc < 0) {
    xbt_die("Could not stat fd %d: %s", fd, strerror(errno));
  }
  return (size_t)info.st_size;
}
105 static void* shm_map(int fd, size_t size, shared_data_t* data) {
107 char loc[PTR_STRLEN];
108 shared_metadata_t* meta;
110 if(size > shm_size(fd)) {
111 if(ftruncate(fd, (off_t)size) < 0) {
112 xbt_die("Could not truncate fd %d to %zu: %s", fd, size, strerror(errno));
116 mem = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
117 if(mem == MAP_FAILED) {
118 xbt_die("Could not map fd %d: %s", fd, strerror(errno));
120 if(!allocs_metadata) {
121 allocs_metadata = xbt_dict_new_homogeneous(xbt_free);
123 snprintf(loc, PTR_STRLEN, "%p", mem);
124 meta = xbt_new(shared_metadata_t, 1);
127 xbt_dict_set(allocs_metadata, loc, meta, NULL);
128 XBT_DEBUG("MMAP %zu to %p", size, mem);
133 void smpi_bench_destroy(void)
135 xbt_dict_free(&allocs);
136 xbt_dict_free(&allocs_metadata);
137 xbt_dict_free(&samples);
138 xbt_dict_free(&calls);
141 XBT_PUBLIC(void) smpi_execute_flops_(double *flops);
142 void smpi_execute_flops_(double *flops)
144 smpi_execute_flops(*flops);
147 XBT_PUBLIC(void) smpi_execute_(double *duration);
148 void smpi_execute_(double *duration)
150 smpi_execute(*duration);
153 void smpi_execute_flops(double flops) {
156 host = SIMIX_host_self();
157 XBT_DEBUG("Handle real computation time: %f flops", flops);
158 action = simcall_host_execute("computation", host, flops, 1, 0, 0);
160 simcall_set_category (action, TRACE_internal_smpi_get_category());
162 simcall_host_execution_wait(action);
165 void smpi_execute(double duration)
167 if (duration >= smpi_cpu_threshold) {
168 XBT_DEBUG("Sleep for %g to handle real computation time", duration);
169 double flops = duration * smpi_running_power;
171 int rank = smpi_process_index();
172 instr_extra_data extra = xbt_new0(s_instr_extra_data_t,1);
173 extra->type=TRACING_COMPUTING;
174 extra->comp_size=flops;
175 TRACE_smpi_computing_in(rank, extra);
177 smpi_execute_flops(flops);
180 TRACE_smpi_computing_out(rank);
184 XBT_DEBUG("Real computation took %g while option smpi/cpu_threshold is set to %g => ignore it",
185 duration, smpi_cpu_threshold);
189 void switch_data_segment(int dest);
191 void smpi_bench_begin(void)
193 switch_data_segment(smpi_process_index());
194 xbt_os_threadtimer_start(smpi_process_timer());
195 smpi_current_rank = smpi_process_index();
198 void smpi_bench_end(void)
200 xbt_os_timer_t timer = smpi_process_timer();
201 xbt_os_threadtimer_stop(timer);
202 // switch_data_segment(smpi_process_count());
203 if (smpi_process_get_sampling()) {
204 XBT_CRITICAL("Cannot do recursive benchmarks.");
205 XBT_CRITICAL("Are you trying to make a call to MPI within a SMPI_SAMPLE_ block?");
206 xbt_backtrace_display_current();
207 xbt_die("Aborting.");
209 // Simulate the benchmarked computation unless disabled via command-line argument
210 if (sg_cfg_get_boolean("smpi/simulate_computation")) {
211 smpi_execute(xbt_os_timer_elapsed(timer));
215 /* Private sleep function used by smpi_sleep() and smpi_usleep() */
216 static unsigned int private_sleep(double secs)
220 XBT_DEBUG("Sleep for: %lf secs", secs);
222 int rank = smpi_comm_rank(MPI_COMM_WORLD);
223 instr_extra_data extra = xbt_new0(s_instr_extra_data_t,1);
224 extra->type=TRACING_SLEEPING;
225 extra->sleep_duration=secs;
226 TRACE_smpi_sleeping_in(rank, extra);
228 simcall_process_sleep(secs);
230 TRACE_smpi_sleeping_out(rank);
/* Sleep for `secs` seconds by delegating to private_sleep(), whose result is
 * returned unchanged. */
unsigned int smpi_sleep(unsigned int secs)
{
  const double duration = (double)secs;

  return private_sleep(duration);
}
242 int smpi_usleep(useconds_t usecs)
244 return (int)private_sleep((double)usecs / 1000000.0);
248 int smpi_gettimeofday(struct timeval *tv, void* tz)
252 now = SIMIX_get_clock();
254 tv->tv_sec = (time_t)now;
256 tv->tv_usec = (useconds_t)((now - tv->tv_sec) * 1e6);
258 tv->tv_usec = (suseconds_t)((now - tv->tv_sec) * 1e6);
265 extern double sg_surf_precision;
266 unsigned long long smpi_rastro_resolution (void)
269 double resolution = (1/sg_surf_precision);
271 return (unsigned long long)resolution;
274 unsigned long long smpi_rastro_timestamp (void)
277 double now = SIMIX_get_clock();
279 unsigned long long sec = (unsigned long long)now;
280 unsigned long long pre = (now - sec) * smpi_rastro_resolution();
282 return (unsigned long long)sec * smpi_rastro_resolution() + pre;
285 /* ****************************** Functions related to the SMPI_SAMPLE_ macros ************************************/
287 double threshold; /* maximal stderr requested (if positive) */
288 double relstderr; /* observed stderr so far */
289 double mean; /* mean of benched times, to be used if the block is disabled */
290 double sum; /* sum of benched times (to compute the mean and stderr) */
291 double sum_pow2; /* sum of the square of the benched times (to compute the stderr) */
292 int iters; /* amount of requested iterations */
293 int count; /* amount of iterations done so far */
294 int benching; /* 1: we are benchmarking; 0: we have enough data, no bench anymore */
297 static char *sample_location(int global, const char *file, int line) {
299 return bprintf("%s:%d", file, line);
301 return bprintf("%s:%d:%d", file, line, smpi_process_index());
304 static int sample_enough_benchs(local_data_t *data) {
305 int res = data->count >= data->iters;
306 if (data->threshold>0.0) {
308 res = 0; // not enough data
309 if (data->relstderr > data->threshold)
310 res = 0; // stderr too high yet
312 XBT_DEBUG("%s (count:%d iter:%d stderr:%f thres:%f mean:%fs)",
313 (res?"enough benchs":"need more data"),
314 data->count, data->iters, data->relstderr, data->threshold, data->mean);
318 void smpi_sample_1(int global, const char *file, int line, int iters, double threshold)
320 char *loc = sample_location(global, file, line);
323 smpi_bench_end(); /* Take time from previous, unrelated computation into account */
324 smpi_process_set_sampling(1);
327 samples = xbt_dict_new_homogeneous(free);
329 data = xbt_dict_get_or_null(samples, loc);
331 xbt_assert(threshold>0 || iters>0,
332 "You should provide either a positive amount of iterations to bench, or a positive maximal stderr (or both)");
333 data = (local_data_t *) xbt_new(local_data_t, 1);
336 data->sum_pow2 = 0.0;
338 data->threshold = threshold;
339 data->benching = 1; // If we have no data, we need at least one
341 xbt_dict_set(samples, loc, data, NULL);
342 XBT_DEBUG("XXXXX First time ever on benched nest %s.",loc);
344 if (data->iters != iters || data->threshold != threshold) {
345 XBT_ERROR("Asked to bench block %s with different settings %d, %f is not %d, %f. How did you manage to give two numbers at the same line??",
346 loc, data->iters, data->threshold, iters,threshold);
350 // if we already have some data, check whether sample_2 should get one more bench or whether it should emulate the computation instead
351 data->benching = !sample_enough_benchs(data);
352 XBT_DEBUG("XXXX Re-entering the benched nest %s. %s",loc, (data->benching?"more benching needed":"we have enough data, skip computes"));
357 int smpi_sample_2(int global, const char *file, int line)
359 char *loc = sample_location(global, file, line);
363 xbt_assert(samples, "Y U NO use SMPI_SAMPLE_* macros? Stop messing directly with smpi_sample_* functions!");
364 data = xbt_dict_get(samples, loc);
365 XBT_DEBUG("sample2 %s",loc);
368 if (data->benching==1) {
369 // we need to run a new bench
370 XBT_DEBUG("benchmarking: count:%d iter:%d stderr:%f thres:%f; mean:%f",
371 data->count, data->iters, data->relstderr, data->threshold, data->mean);
374 // Enough data, no more bench (either we got enough data from previous visits to this benched nest, or we just ran one bench and need to bail out now that our job is done).
375 // Just sleep instead
376 XBT_DEBUG("No benchmark (either no need, or just ran one): count >= iter (%d >= %d) or stderr<thres (%f<=%f). apply the %fs delay instead",
377 data->count, data->iters, data->relstderr, data->threshold, data->mean);
378 smpi_execute(data->mean);
379 smpi_process_set_sampling(0);
380 res = 0; // prepare to capture future, unrelated computations
387 void smpi_sample_3(int global, const char *file, int line)
389 char *loc = sample_location(global, file, line);
392 xbt_assert(samples, "Y U NO use SMPI_SAMPLE_* macros? Stop messing directly with smpi_sample_* functions!");
393 data = xbt_dict_get(samples, loc);
394 XBT_DEBUG("sample3 %s",loc);
397 if (data->benching==0) {
401 // ok, benchmarking this loop is over
402 xbt_os_threadtimer_stop(smpi_process_timer());
407 sample = xbt_os_timer_elapsed(smpi_process_timer());
409 data->sum_pow2 += sample * sample;
410 n = (double)data->count;
411 data->mean = data->sum / n;
412 data->relstderr = sqrt((data->sum_pow2 / n - data->mean * data->mean) / n) / data->mean;
413 if (!sample_enough_benchs(data)) {
414 data->mean = sample; // Still in benching process; We want sample_2 to simulate the exact time of this loop occurrence before leaving, not the mean over the history
416 XBT_DEBUG("Average mean after %d steps is %f, relative standard error is %f (sample was %f)", data->count,
417 data->mean, data->relstderr, sample);
419 // That's enough for now; prevent sample_2 from running the same code over and over
424 static void smpi_shared_alloc_free(void *p)
426 shared_data_t *data = p;
/* Replace `loc` by a short, fixed-length name derived from a digest of it.
 *
 * The hexadecimal digest is consumed six hex digits (24 bits) at a time; each
 * group is emitted as four characters of the URL-safe base64 alphabet. The
 * result is '/' followed by 28 encoded characters and a terminator, i.e. the
 * 30 bytes `loc` is resized to.
 *
 * Two encoding defects are fixed here:
 *  - the alphabet contained 'Z'/'z' where 'W'/'w' belong, so two different
 *    6-bit values mapped to the same character;
 *  - the shift was (18 - 3 * j), which made the four characters of a group
 *    overlap and ignored the low 9 bits; (18 - 6 * j) covers all 24 bits.
 * Both defects only reduced the number of distinct names that could be
 * produced; the fix makes collisions between allocation sites less likely.
 *
 * `loc` is resized with xbt_realloc(); the (possibly moved) pointer is
 * returned and must be used instead of the argument.
 *
 * NOTE(review): the local declarations, the xbt_sha() digest call and the
 * terminator assignments were not visible in the reviewed extract and were
 * reconstructed -- confirm them against the complete file.
 */
static char *smpi_shared_alloc_hash(char *loc)
{
  static const char alphabet[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_";
  char hash[42];
  char s[7];
  unsigned val;
  int i, j;

  xbt_sha(loc, hash);
  hash[41] = '\0';
  s[6] = '\0';
  loc = xbt_realloc(loc, 30);
  loc[0] = '/';
  for (i = 0; i < 40; i += 6) { /* base64 encode */
    memcpy(s, hash + i, 6);
    val = strtoul(s, NULL, 16);
    for (j = 0; j < 4; j++) {
      /* j-th 6-bit slice of the 24-bit group, most significant first */
      unsigned char x = (val >> (18 - 6 * j)) & 0x3f;
      loc[1 + 4 * i / 6 + j] = alphabet[x];
    }
  }
  loc[29] = '\0';
  return loc;
}
456 void *smpi_shared_malloc(size_t size, const char *file, int line)
459 if (sg_cfg_get_boolean("smpi/use_shared_malloc")){
460 char *loc = bprintf("%zu_%s_%d", (size_t)getpid(), file, line);
463 loc = smpi_shared_alloc_hash(loc); /* hash loc, in order to have something
466 allocs = xbt_dict_new_homogeneous(smpi_shared_alloc_free);
468 data = xbt_dict_get_or_null(allocs, loc);
470 fd = shm_open(loc, O_RDWR | O_CREAT | O_EXCL,
471 S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
475 xbt_die("Please cleanup /dev/shm/%s", loc);
477 xbt_die("An unhandled error occured while opening %s. shm_open: %s", loc, strerror(errno));
480 data = xbt_new(shared_data_t, 1);
484 mem = shm_map(fd, size, data);
485 if (shm_unlink(loc) < 0) {
486 XBT_WARN("Could not early unlink %s. shm_unlink: %s", loc, strerror(errno));
488 xbt_dict_set(allocs, loc, data, NULL);
489 XBT_DEBUG("Mapping %s at %p through %d", loc, mem, fd);
492 mem = shm_map(data->fd, size, data);
495 XBT_DEBUG("Shared malloc %zu in %p (metadata at %p)", size, mem, data);
497 mem = xbt_malloc(size);
498 XBT_DEBUG("Classic malloc %zu in %p", size, mem);
503 void smpi_shared_free(void *ptr)
505 char loc[PTR_STRLEN];
506 shared_metadata_t* meta;
508 if (sg_cfg_get_boolean("smpi/use_shared_malloc")){
511 XBT_WARN("Cannot free: nothing was allocated");
514 if(!allocs_metadata) {
515 XBT_WARN("Cannot free: no metadata was allocated");
517 snprintf(loc, PTR_STRLEN, "%p", ptr);
518 meta = (shared_metadata_t*)xbt_dict_get_or_null(allocs_metadata, loc);
520 XBT_WARN("Cannot free: %p was not shared-allocated by SMPI", ptr);
525 XBT_WARN("Cannot free: something is broken in the metadata link");
528 if(munmap(ptr, meta->size) < 0) {
529 XBT_WARN("Unmapping of fd %d failed: %s", data->fd, strerror(errno));
532 XBT_DEBUG("Shared free - no removal - of %p, count = %d", ptr, data->count);
533 if (data->count <= 0) {
535 xbt_dict_remove(allocs, data->loc);
536 XBT_DEBUG("Shared free - with removal - of %p", ptr);
539 XBT_DEBUG("Classic free of %p", ptr);
545 int smpi_shared_known_call(const char* func, const char* input)
547 char* loc = bprintf("%s:%s", func, input);
552 calls = xbt_dict_new_homogeneous(NULL);
555 xbt_dict_get(calls, loc); /* Succeed or throw */
562 if (ex.category != not_found_error)
569 void* smpi_shared_get_call(const char* func, const char* input) {
570 char* loc = bprintf("%s:%s", func, input);
574 calls = xbt_dict_new_homogeneous(NULL);
576 data = xbt_dict_get(calls, loc);
581 void* smpi_shared_set_call(const char* func, const char* input, void* data) {
582 char* loc = bprintf("%s:%s", func, input);
585 calls = xbt_dict_new_homogeneous(NULL);
587 xbt_dict_set(calls, loc, data, NULL);
595 #define TOPAGE(addr) (void *)(((unsigned long)(addr) / xbt_pagesize) * xbt_pagesize)
599 * - read the executable data+bss section addresses and sizes
600 * - for each process create a copy of these sections with mmap
601 * - store them in a dynar
607 void switch_data_segment(int dest){
609 if(size_data_exe == 0)//no need to switch
612 if (loaded_page==dest)//no need to switch either
618 if(loaded_page==-1){//initial switch, do the copy from the real page here
619 for (i=0; i< SIMIX_process_count(); i++){
620 memcpy(mappings[i],TOPAGE(start_data_exe),size_data_exe);
623 int current= fds[dest];
624 XBT_VERB("Switching data frame to the one of process %d", dest);
625 void* tmp = mmap (TOPAGE(start_data_exe), size_data_exe, PROT_READ | PROT_WRITE, MAP_FIXED | MAP_SHARED, current, 0);
626 if (tmp != TOPAGE(start_data_exe))
627 xbt_die("Couldn't map the new region");
632 void smpi_get_executable_global_size(){
633 int size_bss_binary=0;
634 int size_data_binary=0;
636 char *line = NULL; /* Temporal storage for each line that is readed */
637 ssize_t read; /* Number of bytes readed */
638 size_t n = 0; /* Amount of bytes to read by xbt_getline */
643 char *command = bprintf("objdump --section-headers %s", xbt_binary_name);
645 fp = popen(command, "r");
648 perror("popen failed");
652 while ((read = xbt_getline(&line, &n, fp)) != -1 && found != 2) {
657 /* Wipe out the newline character */
658 line[read - 1] = '\0';
660 lfields[0] = strtok(line, " ");
662 if(lfields[0] == NULL)
665 if(strcmp(lfields[0], "Sections:") == 0
666 || strcmp(lfields[0], "Idx") == 0
667 || strncmp(lfields[0], xbt_binary_name, strlen(xbt_binary_name)) == 0)
670 for (i = 1; i < 7 && lfields[i - 1] != NULL; i++) {
671 lfields[i] = strtok(NULL, " ");
675 * we are looking for these fields
676 23 .data 02625a20 00000000006013e0 00000000006013e0 000013e0 2**5
677 CONTENTS, ALLOC, LOAD, DATA
678 24 .bss 02625a40 0000000002c26e00 0000000002c26e00 02626e00 2**5
683 if(strcmp(lfields[1], ".data") == 0){
684 size_data_binary = strtoul(lfields[2], NULL, 16);
685 start_data_exe = (char*) strtoul(lfields[4], NULL, 16);
687 }else if(strcmp(lfields[1], ".bss") == 0){
688 //the beginning of bss is not exactly the end of data if not aligned, grow bss reported size accordingly
689 //TODO : check if this is OK, as some segments may be inserted between them..
690 size_bss_binary = ((char*) strtoul(lfields[4], NULL, 16) - (start_data_exe + size_data_binary))
691 + strtoul(lfields[2], NULL, 16);
699 size_data_exe =(unsigned long)start_data_exe - (unsigned long)TOPAGE(start_data_exe)+ size_data_binary+size_bss_binary;
706 void smpi_initialize_global_memory_segments(){
709 smpi_privatize_global_variables=0;
714 smpi_get_executable_global_size();
716 XBT_DEBUG ("bss+data segment found : size %d starting at %p",size_data_exe, start_data_exe );
718 if(size_data_exe == 0){//no need to switch
719 smpi_privatize_global_variables=0;
723 fds= (int*)xbt_malloc((smpi_process_count())*sizeof(int));
724 mappings= (void**)xbt_malloc((smpi_process_count())*sizeof(void*));
727 for (i=0; i< SIMIX_process_count(); i++){
728 //create SIMIX_process_count() mappings of this size with the same data inside
729 void *address = NULL, *tmp = NULL;
730 char path[] = "/dev/shm/my-buffer-XXXXXX";
732 int file_descriptor= mkstemp (path);
733 if (file_descriptor < 0)
734 xbt_die("Impossible to create temporary file for memory mapping");
735 status = unlink (path);
737 xbt_die("Impossible to unlink temporary file for memory mapping");
739 status = ftruncate(file_descriptor, size_data_exe);
741 xbt_die("Impossible to set the size of the temporary file for memory mapping");
743 /* Ask for a free region */
744 address = mmap (NULL, size_data_exe, PROT_NONE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
746 if (address == MAP_FAILED)
747 xbt_die("Couldn't find a free region for memory mapping");
749 tmp = mmap (address, size_data_exe, PROT_READ | PROT_WRITE, MAP_FIXED | MAP_SHARED, file_descriptor, 0);
752 xbt_die("Couldn't obtain the right address");
753 //initialize the values
754 memcpy(address,TOPAGE(start_data_exe),size_data_exe);
756 //store the address of the mapping for further switches
757 fds[i]=file_descriptor;
758 mappings[i]= address;
765 void smpi_destroy_global_memory_segments(){
766 if(size_data_exe == 0)//no need to switch
770 for (i=0; i< smpi_process_count(); i++){
771 if(munmap(mappings[i],size_data_exe) < 0) {
772 XBT_WARN("Unmapping of fd %d failed: %s", fds[i], strerror(errno));