1 /* Copyright (c) 2007, 2009-2014. The SimGrid Team.
2 * All rights reserved. */
4 /* This program is free software; you can redistribute it and/or modify it
5 * under the terms of the license (GNU LGPL) which comes with this package. */
7 #include "internal_config.h"
10 #include "xbt/sysdep.h"
13 #include "surf/surf.h"
14 #include "simgrid/sg_config.h"
20 #include <sys/types.h>
23 #include <math.h> // sqrt
29 #define MAP_ANONYMOUS MAP_ANON
32 XBT_LOG_NEW_DEFAULT_SUBCATEGORY(smpi_bench, smpi,
33 "Logging specific to SMPI (benchmarking)");
35 /* Shared allocations are handled through shared memory segments.
36 * Associated data and metadata are used as follows:
39 * `allocs' dict ---- -.
40 * ---------- shared_data_t shared_metadata_t / | | |
41 * .->| <name> | ---> -------------------- <--. ----------------- | | | |
42 * | ---------- | fd of <name> | | | size of mmap | --| | | |
43 * | | count (2) | |-- | data | \ | | |
44 * `----------------- | <name> | | ----------------- ---- |
45 * -------------------- | ^ |
47 * | | `allocs_metadata' dict |
48 * | | ---------------------- |
49 * | `-- | <addr of mmap #1> |<-'
50 * | .-- | <addr of mmap #2> |<-.
51 * | | ---------------------- |
57 * | shared_metadata_t / | |
58 * | ----------------- | | |
59 * | | size of mmap | --| | |
61 * ----------------- | | |
// Buffer size used when a pointer is formatted with "%p" (see shm_map and smpi_shared_free):
// 2 leading characters (presumably the "0x" prefix), two hex digits per pointer byte, plus the terminating NUL.
66 #define PTR_STRLEN (2 + 2 * sizeof(void*) + 1)
// Module-level dictionaries, each created lazily by the function that first needs it:
//  - allocs:          hashed allocation name -> shared_data_t (see smpi_shared_malloc)
//  - allocs_metadata: "%p" text of a mapping address -> shared_metadata_t (see shm_map)
//  - samples:         sample location string -> local_data_t (see smpi_sample_1)
//  - calls:           "func:input" string -> caller-provided pointer (see smpi_shared_set_call)
68 xbt_dict_t allocs = NULL; /* Allocated on first use */
69 xbt_dict_t allocs_metadata = NULL; /* Allocated on first use */
70 xbt_dict_t samples = NULL; /* Allocated on first use */
71 xbt_dict_t calls = NULL; /* Allocated on first use */
// Durations below this threshold are not simulated by smpi_execute().
73 double smpi_cpu_threshold;
// Factor by which smpi_execute() multiplies a duration to obtain an amount of flops.
74 double smpi_running_power;
77 size_t mappings_count = 0;
// Start address and size of the executable's data+bss area, computed by smpi_get_executable_global_size().
80 char* start_data_exe = NULL;
81 int size_data_exe = 0;
// Global-variable privatization flag; smpi_initialize_global_memory_segments() resets it to 0 when no data segment was found.
82 int smpi_privatize_global_variables;
/* Return the size in bytes (st_size as reported by fstat()) of the object
 * behind file descriptor `fd'. Aborts through xbt_die() if fstat() fails. */
95 static size_t shm_size(int fd) {
98 if(fstat(fd, &st) < 0) {
99 xbt_die("Could not stat fd %d: %s", fd, strerror(errno));
101 return (size_t)st.st_size;
/* Map `size' bytes of the object behind `fd' with read/write access and MAP_SHARED.
 * The object is first grown with ftruncate() when it is smaller than `size'.
 * A freshly allocated shared_metadata_t is registered in `allocs_metadata'
 * under the "%p" text of the mapping address.
 * Aborts through xbt_die() if ftruncate() or mmap() fails.
 * NOTE(review): the lines filling `meta' and the return statement are not
 * visible in this excerpt; callers use the result as the mapping address. */
105 static void* shm_map(int fd, size_t size, shared_data_t* data) {
107 char loc[PTR_STRLEN];
108 shared_metadata_t* meta;
/* Only ever grow the backing object, never shrink it. */
110 if(size > shm_size(fd)) {
111 if(ftruncate(fd, (off_t)size) < 0) {
112 xbt_die("Could not truncate fd %d to %zu: %s", fd, size, strerror(errno));
116 mem = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
117 if(mem == MAP_FAILED) {
118 xbt_die("Could not map fd %d: %s", fd, strerror(errno));
/* Lazily create the metadata dict; its values are released with xbt_free_f. */
120 if(!allocs_metadata) {
121 allocs_metadata = xbt_dict_new_homogeneous(xbt_free_f);
/* The dict key is the textual form of the mapping address. */
123 snprintf(loc, PTR_STRLEN, "%p", mem);
124 meta = xbt_new(shared_metadata_t, 1);
127 xbt_dict_set(allocs_metadata, loc, meta, NULL);
128 XBT_DEBUG("MMAP %zu to %p", size, mem);
/* Release the four module-level dictionaries: allocs, allocs_metadata,
 * samples and calls. */
133 void smpi_bench_destroy(void)
135 xbt_dict_free(&allocs);
136 xbt_dict_free(&allocs_metadata);
137 xbt_dict_free(&samples);
138 xbt_dict_free(&calls);
/* By-reference wrapper around smpi_execute_flops(): dereferences `flops'
 * and forwards the value.
 * NOTE(review): the trailing underscore and pointer argument suggest a
 * Fortran binding - confirm. */
141 XBT_PUBLIC(void) smpi_execute_flops_(double *flops);
142 void smpi_execute_flops_(double *flops)
144 smpi_execute_flops(*flops);
/* By-reference wrapper around smpi_execute(): dereferences `duration'
 * and forwards the value.
 * NOTE(review): the trailing underscore and pointer argument suggest a
 * Fortran binding - confirm. */
147 XBT_PUBLIC(void) smpi_execute_(double *duration);
148 void smpi_execute_(double *duration)
150 smpi_execute(*duration);
/* Simulate a computation of `flops' floating point operations on the host of
 * the calling process: an execution action named "computation" is created
 * with simcall_host_execute(), tagged with the SMPI tracing category, and
 * waited for with simcall_host_execution_wait(). */
153 void smpi_execute_flops(double flops) {
156 host = SIMIX_host_self();
157 XBT_DEBUG("Handle real computation time: %f flops", flops);
158 action = simcall_host_execute("computation", host, flops, 1, 0, 0);
160 simcall_set_category (action, TRACE_internal_smpi_get_category());
162 simcall_host_execution_wait(action);
/* Inject `duration' (a measured computation time) into the simulation.
 * When `duration' is at least smpi_cpu_threshold, it is converted to flops by
 * multiplying it by smpi_running_power and simulated with
 * smpi_execute_flops(), bracketed by TRACE_smpi_computing_in()/_out() events.
 * Shorter durations only produce a debug message saying they are ignored. */
165 void smpi_execute(double duration)
167 if (duration >= smpi_cpu_threshold) {
168 XBT_DEBUG("Sleep for %g to handle real computation time", duration);
169 double flops = duration * smpi_running_power;
/* Describe the computation (type and size in flops) for the tracing layer. */
171 int rank = smpi_process_index();
172 instr_extra_data extra = xbt_new0(s_instr_extra_data_t,1);
173 extra->type=TRACING_COMPUTING;
174 extra->comp_size=flops;
175 TRACE_smpi_computing_in(rank, extra);
177 smpi_execute_flops(flops);
180 TRACE_smpi_computing_out(rank);
184 XBT_DEBUG("Real computation took %g while option smpi/cpu_threshold is set to %g => ignore it",
185 duration, smpi_cpu_threshold);
/* Forward declaration: switch_data_segment() is defined further down in this file. */
189 void switch_data_segment(int dest);
/* Start benchmarking the user code of the calling process: make the data
 * segment of this process the active one, then start its thread timer. */
191 void smpi_bench_begin(void)
193 switch_data_segment(smpi_process_index());
194 xbt_os_threadtimer_start(smpi_process_timer());
/* Stop benchmarking the user code of the calling process.
 * The process timer is stopped first. If the process is flagged as sampling
 * (smpi_process_get_sampling()), two critical messages and a backtrace are
 * emitted and the simulation aborts through xbt_die(). Otherwise, when the
 * "smpi/simulate_computation" option is true, the elapsed time of the timer
 * is injected into the simulation with smpi_execute(). */
197 void smpi_bench_end(void)
199 xbt_os_timer_t timer = smpi_process_timer();
200 xbt_os_threadtimer_stop(timer);
201 // switch_data_segment(smpi_process_count());
202 if (smpi_process_get_sampling()) {
203 XBT_CRITICAL("Cannot do recursive benchmarks.");
204 XBT_CRITICAL("Are you trying to make a call to MPI within a SMPI_SAMPLE_ block?");
205 xbt_backtrace_display_current();
206 xbt_die("Aborting.");
208 // Simulate the benchmarked computation unless disabled via command-line argument
209 if (sg_cfg_get_boolean("smpi/simulate_computation")) {
210 smpi_execute(xbt_os_timer_elapsed(timer));
214 /* Private sleep function used by smpi_sleep() and smpi_usleep() */
/* Makes the calling process sleep for `secs' simulated seconds with
 * simcall_process_sleep(), bracketed by TRACE_smpi_sleeping_in()/_out()
 * events carrying the requested duration.
 * NOTE(review): the return statement is not visible in this excerpt. */
215 static unsigned int private_sleep(double secs)
219 XBT_DEBUG("Sleep for: %lf secs", secs);
221 int rank = smpi_comm_rank(MPI_COMM_WORLD);
222 instr_extra_data extra = xbt_new0(s_instr_extra_data_t,1);
223 extra->type=TRACING_SLEEPING;
224 extra->sleep_duration=secs;
225 TRACE_smpi_sleeping_in(rank, extra);
227 simcall_process_sleep(secs);
229 TRACE_smpi_sleeping_out(rank);
/* Simulated sleep of `secs' seconds; returns whatever private_sleep() returns. */
236 unsigned int smpi_sleep(unsigned int secs)
238 return private_sleep((double)secs);
/* Simulated sleep of `usecs' microseconds: the duration is converted to
 * seconds and handed to private_sleep(), whose result is returned as an int. */
241 int smpi_usleep(useconds_t usecs)
243 return (int)private_sleep((double)usecs / 1000000.0);
/* Fill `tv' from the simulated clock (SIMIX_get_clock()): whole seconds go to
 * tv_sec, the fractional part converted to microseconds goes to tv_usec.
 * `tz' is not used by the visible lines.
 * NOTE(review): the two tv_usec assignments differ only by their cast
 * (useconds_t vs suseconds_t); presumably a preprocessor conditional that is
 * not visible in this excerpt selects one of them - confirm. */
247 int smpi_gettimeofday(struct timeval *tv, void* tz)
251 now = SIMIX_get_clock();
253 tv->tv_sec = (time_t)now;
255 tv->tv_usec = (useconds_t)((now - tv->tv_sec) * 1e6);
257 tv->tv_usec = (suseconds_t)((now - tv->tv_sec) * 1e6);
264 extern double sg_surf_precision;
/* Return the number of timestamp ticks per simulated second, computed as
 * 1/sg_surf_precision and truncated to an unsigned long long. */
265 unsigned long long smpi_rastro_resolution (void)
268 double resolution = (1/sg_surf_precision);
270 return (unsigned long long)resolution;
/* Return the simulated clock as an integer number of ticks, where one second
 * is smpi_rastro_resolution() ticks: the whole seconds and the fractional part
 * are scaled separately and then added. */
273 unsigned long long smpi_rastro_timestamp (void)
276 double now = SIMIX_get_clock();
278 unsigned long long sec = (unsigned long long)now;
279 unsigned long long pre = (now - sec) * smpi_rastro_resolution();
281 return (unsigned long long)sec * smpi_rastro_resolution() + pre;
284 /* ****************************** Functions related to the SMPI_SAMPLE_ macros ************************************/
/* Per-location benchmarking state; instances are stored in the `samples' dict by smpi_sample_1(). */
286 double threshold; /* maximal relative standard error requested (only used if positive) */
287 double relstderr; /* relative standard error observed so far */
288 double mean; /* mean of benched times, to be used if the block is disabled */
289 double sum; /* sum of benched times (to compute the mean and standard error) */
290 double sum_pow2; /* sum of the squares of the benched times (to compute the standard error) */
291 int iters; /* number of requested iterations */
292 int count; /* number of iterations done so far */
293 int benching; /* 1: we are benchmarking; 0: we have enough data, no bench anymore */
/* Build the `samples' dict key of a benchmarked block: either "file:line", or
 * "file:line:index" where index is smpi_process_index().
 * NOTE(review): the condition choosing between the two forms is not visible in
 * this excerpt; presumably `global' selects the first one. The string comes
 * from bprintf(); presumably the caller has to free it - confirm. */
296 static char *sample_location(int global, const char *file, int line) {
298 return bprintf("%s:%d", file, line);
300 return bprintf("%s:%d:%d", file, line, smpi_process_index());
/* Tell whether the block described by `data' has been benchmarked enough.
 * The starting answer is count >= iters. When a positive threshold is set, the
 * answer is forced to 0 while the observed relative standard error is above
 * the threshold.
 * NOTE(review): the condition guarding the first "res = 0" (not enough data)
 * and the return statement are not visible in this excerpt. */
303 static int sample_enough_benchs(local_data_t *data) {
304 int res = data->count >= data->iters;
305 if (data->threshold>0.0) {
307 res = 0; // not enough data
308 if (data->relstderr > data->threshold)
309 res = 0; // stderr too high yet
311 XBT_DEBUG("%s (count:%d iter:%d stderr:%f thres:%f mean:%fs)",
312 (res?"enough benchs":"need more data"),
313 data->count, data->iters, data->relstderr, data->threshold, data->mean);
/* First step of a sampled block: account for the computation that preceded it,
 * flag the process as sampling, and fetch or create the benchmarking state of
 * the block identified by (`global', `file', `line').
 * On the first visit a local_data_t is created (at least one of `iters' and
 * `threshold' must be positive) with benching set to 1, and stored in the
 * `samples' dict. On later visits the requested settings are compared with the
 * stored ones (an error is logged when they differ) and benching is set
 * according to sample_enough_benchs().
 * NOTE(review): several lines (the branch conditions, the remaining field
 * initialisations, the release of `loc') are not visible in this excerpt. */
317 void smpi_sample_1(int global, const char *file, int line, int iters, double threshold)
319 char *loc = sample_location(global, file, line);
322 smpi_bench_end(); /* Take time from previous, unrelated computation into account */
323 smpi_process_set_sampling(1);
/* The dict owns its values and releases them with free(). */
326 samples = xbt_dict_new_homogeneous(free);
328 data = xbt_dict_get_or_null(samples, loc);
330 xbt_assert(threshold>0 || iters>0,
331 "You should provide either a positive amount of iterations to bench, or a positive maximal stderr (or both)");
332 data = (local_data_t *) xbt_new(local_data_t, 1);
335 data->sum_pow2 = 0.0;
337 data->threshold = threshold;
338 data->benching = 1; // If we have no data, we need at least one
340 xbt_dict_set(samples, loc, data, NULL);
341 XBT_DEBUG("XXXXX First time ever on benched nest %s.",loc);
343 if (data->iters != iters || data->threshold != threshold) {
344 XBT_ERROR("Asked to bench block %s with different settings %d, %f is not %d, %f. How did you manage to give two numbers at the same line??",
345 loc, data->iters, data->threshold, iters,threshold);
349 // if we already have some data, check whether sample_2 should get one more bench or whether it should emulate the computation instead
350 data->benching = !sample_enough_benchs(data);
351 XBT_DEBUG("XXXX Re-entering the benched nest %s. %s",loc, (data->benching?"more benching needed":"we have enough data, skip computes"));
/* Second step of a sampled block: decide whether the block body must be run
 * (and benchmarked) once more.
 * The state of the block must already exist in `samples'. While benching is 1
 * a new benchmark is requested. Otherwise the stored mean is injected with
 * smpi_execute(), the sampling flag of the process is cleared and `res' is
 * set to 0.
 * NOTE(review): the lines of the benching branch after the debug message and
 * the return statement are not visible in this excerpt; presumably `res' is
 * the returned value - confirm. */
356 int smpi_sample_2(int global, const char *file, int line)
358 char *loc = sample_location(global, file, line);
362 xbt_assert(samples, "Y U NO use SMPI_SAMPLE_* macros? Stop messing directly with smpi_sample_* functions!");
363 data = xbt_dict_get(samples, loc);
364 XBT_DEBUG("sample2 %s",loc);
367 if (data->benching==1) {
368 // we need to run a new bench
369 XBT_DEBUG("benchmarking: count:%d iter:%d stderr:%f thres:%f; mean:%f",
370 data->count, data->iters, data->relstderr, data->threshold, data->mean);
373 // Enough data, no more bench (either we got enough data from previous visits to this benched nest, or we just ran one bench and need to bail out now that our job is done).
374 // Just sleep instead
375 XBT_DEBUG("No benchmark (either no need, or just ran one): count >= iter (%d >= %d) or stderr<thres (%f<=%f). apply the %fs delay instead",
376 data->count, data->iters, data->relstderr, data->threshold, data->mean);
377 smpi_execute(data->mean);
378 smpi_process_set_sampling(0);
379 res = 0; // prepare to capture future, unrelated computations
/* Third step of a sampled block: record the benchmark that just ran.
 * The process timer is stopped and its elapsed time is the new sample. The
 * running statistics are then updated: mean = sum / count, and
 * relstderr = sqrt((sum_pow2 / count - mean * mean) / count) / mean.
 * While more benchmarks are needed, `mean' is overwritten with the last sample
 * so that smpi_sample_2() simulates exactly this occurrence.
 * NOTE(review): the body of the "benching==0" branch, the updates of `sum' and
 * `count', and the final statements are not visible in this excerpt. A mean of
 * zero would make the relstderr division degenerate - confirm it cannot occur. */
386 void smpi_sample_3(int global, const char *file, int line)
388 char *loc = sample_location(global, file, line);
391 xbt_assert(samples, "Y U NO use SMPI_SAMPLE_* macros? Stop messing directly with smpi_sample_* functions!");
392 data = xbt_dict_get(samples, loc);
393 XBT_DEBUG("sample3 %s",loc);
396 if (data->benching==0) {
400 // ok, benchmarking this loop is over
401 xbt_os_threadtimer_stop(smpi_process_timer());
406 sample = xbt_os_timer_elapsed(smpi_process_timer());
408 data->sum_pow2 += sample * sample;
409 n = (double)data->count;
410 data->mean = data->sum / n;
411 data->relstderr = sqrt((data->sum_pow2 / n - data->mean * data->mean) / n) / data->mean;
412 if (!sample_enough_benchs(data)) {
413 data->mean = sample; // Still in benching process; We want sample_2 to simulate the exact time of this loop occurrence before leaving, not the mean over the history
415 XBT_DEBUG("Average mean after %d steps is %f, relative standard error is %f (sample was %f)", data->count,
416 data->mean, data->relstderr, sample);
418 // That's enough for now, prevent sample_2 to run the same code over and over
/* Value destructor registered for the `allocs' dict by smpi_shared_malloc();
 * `p' is the shared_data_t being released.
 * NOTE(review): the statements releasing `data' are not visible in this excerpt. */
423 static void smpi_shared_alloc_free(void *p)
425 shared_data_t *data = p;
/* Turn `loc' into a short name derived from a hash of its content.
 * `loc' is resized to 30 bytes with xbt_realloc(). The 40 hexadecimal digits
 * of `hash' are consumed six at a time (24 bits) and each group is written as
 * four characters of a 64-symbol, URL-safe base64 alphabet, starting at
 * loc[1].
 * Fixes over the previous version: the 6-bit groups are now extracted with
 * shifts of 18, 12, 6 and 0 (they used to overlap and drop the low 9 bits),
 * and the alphabet holds 64 distinct symbols (it used to have 'Z' and 'z'
 * twice in place of 'W' and 'w'). Both defects only weakened the uniqueness
 * of the generated names.
 * NOTE(review): the computation of `hash', the writes to loc[0] and to the
 * terminator, and the return statement are not visible in this excerpt. */
430 static char *smpi_shared_alloc_hash(char *loc)
440 loc = xbt_realloc(loc, 30);
442 for (i = 0; i < 40; i += 6) { /* base64 encode */
443 memcpy(s, hash + i, 6);
444 val = strtoul(s, NULL, 16);
445 for (j = 0; j < 4; j++) {
/* Character j encodes the j-th 6-bit group of val, most significant first. */
446 unsigned char x = (val >> (18 - 6 * j)) & 0x3f;
447 loc[1 + 4 * i / 6 + j] =
448 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_"[x];
// Allocate `size' bytes on behalf of the call site (`file', `line').
// When the "smpi/use_shared_malloc" option is true, the allocation is backed by
// a POSIX shared memory object whose name is built from the pid, the file and
// the line, then hashed with smpi_shared_alloc_hash():
//  - if no entry exists in `allocs' for that name, the object is created with
//    shm_open(O_CREAT | O_EXCL) (the simulation aborts on failure), mapped with
//    shm_map(), unlinked right away (a failure only produces a warning), and
//    its shared_data_t is stored in `allocs';
//  - otherwise the file descriptor of the existing entry is mapped again with
//    shm_map().
// When the option is false, the memory simply comes from xbt_malloc().
// NOTE(review): the branch conditions, the initialisation and reference
// counting of `data', the release of `loc' and the return statement are not
// visible in this excerpt.
455 void *smpi_shared_malloc(size_t size, const char *file, int line)
458 if (sg_cfg_get_boolean("smpi/use_shared_malloc")){
459 char *loc = bprintf("%zu_%s_%d", (size_t)getpid(), file, line);
462 loc = smpi_shared_alloc_hash(loc); /* hash loc, in order to have something
465 allocs = xbt_dict_new_homogeneous(smpi_shared_alloc_free);
467 data = xbt_dict_get_or_null(allocs, loc);
469 fd = shm_open(loc, O_RDWR | O_CREAT | O_EXCL,
470 S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
474 xbt_die("Please cleanup /dev/shm/%s", loc);
476 xbt_die("An unhandled error occured while opening %s. shm_open: %s", loc, strerror(errno));
479 data = xbt_new(shared_data_t, 1);
483 mem = shm_map(fd, size, data);
484 if (shm_unlink(loc) < 0) {
485 XBT_WARN("Could not early unlink %s. shm_unlink: %s", loc, strerror(errno));
487 xbt_dict_set(allocs, loc, data, NULL);
488 XBT_DEBUG("Mapping %s at %p through %d", loc, mem, fd);
491 mem = shm_map(data->fd, size, data);
494 XBT_DEBUG("Shared malloc %zu in %p (metadata at %p)", size, mem, data);
496 mem = xbt_malloc(size);
497 XBT_DEBUG("Classic malloc %zu in %p", size, mem);
/* Release memory obtained from smpi_shared_malloc().
 * When the "smpi/use_shared_malloc" option is true, the metadata of `ptr' is
 * looked up in `allocs_metadata' under the "%p" text of the address. A warning
 * is logged when nothing was ever allocated, when no metadata exists, when
 * `ptr' is unknown or when the metadata link is broken. Otherwise the region
 * is unmapped with munmap(ptr, meta->size), and the owning entry is removed
 * from `allocs' once its count is not positive any more.
 * When the option is false, this is a classic free.
 * NOTE(review): the branch conditions, the returns following the warnings,
 * the update of data->count and the classic free call are not visible in
 * this excerpt. */
502 void smpi_shared_free(void *ptr)
504 char loc[PTR_STRLEN];
505 shared_metadata_t* meta;
507 if (sg_cfg_get_boolean("smpi/use_shared_malloc")){
510 XBT_WARN("Cannot free: nothing was allocated");
513 if(!allocs_metadata) {
514 XBT_WARN("Cannot free: no metadata was allocated");
/* Same key format as the one used by shm_map() when registering the mapping. */
516 snprintf(loc, PTR_STRLEN, "%p", ptr);
517 meta = (shared_metadata_t*)xbt_dict_get_or_null(allocs_metadata, loc);
519 XBT_WARN("Cannot free: %p was not shared-allocated by SMPI", ptr);
524 XBT_WARN("Cannot free: something is broken in the metadata link");
527 if(munmap(ptr, meta->size) < 0) {
528 XBT_WARN("Unmapping of fd %d failed: %s", data->fd, strerror(errno));
531 XBT_DEBUG("Shared free - no removal - of %p, count = %d", ptr, data->count);
532 if (data->count <= 0) {
534 xbt_dict_remove(allocs, data->loc);
535 XBT_DEBUG("Shared free - with removal - of %p", ptr);
538 XBT_DEBUG("Classic free of %p", ptr);
/* Tell whether a result was recorded for the call identified by (`func',
 * `input'). The lookup key is "func:input"; the `calls' dict is created on
 * first use, without a value destructor.
 * NOTE(review): the exception handling around xbt_dict_get() and the return
 * statement are not visible in this excerpt; presumably a not_found_error
 * means "unknown" and any other exception is propagated - confirm. */
544 int smpi_shared_known_call(const char* func, const char* input)
546 char* loc = bprintf("%s:%s", func, input);
551 calls = xbt_dict_new_homogeneous(NULL);
554 xbt_dict_get(calls, loc); /* Succeed or throw */
561 if (ex.category != not_found_error)
/* Fetch the pointer recorded in the `calls' dict under the key "func:input".
 * The dict is created on first use.
 * NOTE(review): the release of `loc' and the return statement are not visible
 * in this excerpt; presumably `data' is returned - confirm. */
568 void* smpi_shared_get_call(const char* func, const char* input) {
569 char* loc = bprintf("%s:%s", func, input);
573 calls = xbt_dict_new_homogeneous(NULL);
575 data = xbt_dict_get(calls, loc);
/* Record `data' in the `calls' dict under the key "func:input". The dict is
 * created on first use, without a value destructor, so it does not take
 * ownership of `data'.
 * NOTE(review): the release of `loc' and the return statement are not visible
 * in this excerpt. */
580 void* smpi_shared_set_call(const char* func, const char* input, void* data) {
581 char* loc = bprintf("%s:%s", func, input);
584 calls = xbt_dict_new_homogeneous(NULL);
586 xbt_dict_set(calls, loc, data, NULL);
// Round `addr' down to a multiple of xbt_pagesize, i.e. to the start of its page.
// NOTE(review): the address goes through unsigned long, which assumes that type is wide enough for a pointer.
594 #define TOPAGE(addr) (void *)(((unsigned long)(addr) / xbt_pagesize) * xbt_pagesize)
598 * - read the executable data+bss section addresses and sizes
599 * - for each process create a copy of these sections with mmap
600 * - store them in a dynar
/* Make the private copy of the data segment belonging to process `dest' the
 * active one, by mapping its backing file (fds[dest]) at the page-aligned
 * start of the executable's data area with MAP_FIXED | MAP_SHARED.
 * Nothing is done when no data segment was found (size_data_exe == 0) or when
 * `dest' is already the loaded page. On the very first switch
 * (loaded_page == -1) the current content of the real data area is copied into
 * every private mapping first.
 * Aborts through xbt_die() if the mapping does not land at the expected
 * address.
 * Fix over the previous version: the initial copy loop is bounded by
 * smpi_process_count(), which is the size `mappings' is allocated with in
 * smpi_initialize_global_memory_segments(), rather than by
 * SIMIX_process_count().
 * NOTE(review): the early returns and the update of loaded_page are not
 * visible in this excerpt. */
606 void switch_data_segment(int dest){
608 if(size_data_exe == 0)//no need to switch
611 if (loaded_page==dest)//no need to switch either
616 if(loaded_page==-1){//initial switch, do the copy from the real page here
617 for (i=0; i< smpi_process_count(); i++){
618 memcpy(mappings[i],TOPAGE(start_data_exe),size_data_exe);
621 int current= fds[dest];
622 XBT_VERB("Switching data frame to the one of process %d", dest);
623 void* tmp = mmap (TOPAGE(start_data_exe), size_data_exe, PROT_READ | PROT_WRITE, MAP_FIXED | MAP_SHARED, current, 0);
624 if (tmp != TOPAGE(start_data_exe))
625 xbt_die("Couldn't map the new region");
/* Compute start_data_exe and size_data_exe from the section headers of the
 * running binary.
 * The output of "objdump --section-headers <xbt_binary_name>" is read line by
 * line through popen(). Each line is split on spaces into at most 7 fields;
 * header lines ("Sections:", "Idx", or lines starting with the binary name)
 * are skipped. For the ".data" and ".bss" sections, the size is parsed from
 * field 2 and the address from field 4, both as hexadecimal numbers.
 * size_data_exe finally covers the offset of start_data_exe within its page,
 * the .data size, and the .bss size grown by the gap between the two sections.
 * NOTE(review): sizes parsed with strtoul() are stored in int variables, which
 * presumably truncates very large sections. Several lines (the guard on the
 * number of fields, the update of `found', the cleanup of `command', `line'
 * and `fp') are not visible in this excerpt. */
630 void smpi_get_executable_global_size(){
631 int size_bss_binary=0;
632 int size_data_binary=0;
634 char *line = NULL; /* Temporary storage for each line that is read */
635 ssize_t read; /* Number of bytes read */
636 size_t n = 0; /* Amount of bytes to read by xbt_getline */
641 char *command = bprintf("objdump --section-headers %s", xbt_binary_name);
643 fp = popen(command, "r");
646 perror("popen failed");
650 while ((read = xbt_getline(&line, &n, fp)) != -1 && found != 2) {
655 /* Wipeout the new line character */
656 line[read - 1] = '\0';
658 lfields[0] = strtok(line, " ");
660 if(lfields[0] == NULL)
663 if(strcmp(lfields[0], "Sections:") == 0
664 || strcmp(lfields[0], "Idx") == 0
665 || strncmp(lfields[0], xbt_binary_name, strlen(xbt_binary_name)) == 0)
668 for (i = 1; i < 7 && lfields[i - 1] != NULL; i++) {
669 lfields[i] = strtok(NULL, " ");
673 * we are looking for these fields
674 23 .data 02625a20 00000000006013e0 00000000006013e0 000013e0 2**5
675 CONTENTS, ALLOC, LOAD, DATA
676 24 .bss 02625a40 0000000002c26e00 0000000002c26e00 02626e00 2**5
681 if(strcmp(lfields[1], ".data") == 0){
682 size_data_binary = strtoul(lfields[2], NULL, 16);
683 start_data_exe = (char*) strtoul(lfields[4], NULL, 16);
685 }else if(strcmp(lfields[1], ".bss") == 0){
686 //the beginning of bss is not exactly the end of data if not aligned, grow bss reported size accordingly
687 //TODO : check if this is OK, as some segments may be inserted between them..
688 size_bss_binary = ((char*) strtoul(lfields[4], NULL, 16) - (start_data_exe + size_data_binary))
689 + strtoul(lfields[2], NULL, 16);
697 size_data_exe =(unsigned long)start_data_exe - (unsigned long)TOPAGE(start_data_exe)+ size_data_binary+size_bss_binary;
/* Prepare one private copy of the executable's data+bss area per SMPI process.
 * The area is located with smpi_get_executable_global_size(); when its size
 * is 0, privatization is disabled (smpi_privatize_global_variables = 0).
 * Otherwise `fds' and `mappings' are allocated with smpi_process_count()
 * entries and, for each process:
 *  - a temporary file is created under /dev/shm with mkstemp(), unlinked
 *    immediately, and sized to size_data_exe with ftruncate();
 *  - a free address range is reserved with an anonymous PROT_NONE mapping,
 *    then replaced (MAP_FIXED | MAP_SHARED) by a read/write mapping of the
 *    file;
 *  - the current content of the data area is copied into the mapping;
 *  - the file descriptor and the address are kept for switch_data_segment().
 * Any failure aborts the simulation through xbt_die().
 * Fix over the previous version: the loop is bounded by smpi_process_count(),
 * the size the two arrays are allocated with, rather than by
 * SIMIX_process_count().
 * NOTE(review): preprocessor conditionals, the status checks and the early
 * return are not visible in this excerpt. */
704 void smpi_initialize_global_memory_segments(){
707 smpi_privatize_global_variables=0;
712 smpi_get_executable_global_size();
714 XBT_DEBUG ("bss+data segment found : size %d starting at %p",size_data_exe, start_data_exe );
716 if(size_data_exe == 0){//no need to switch
717 smpi_privatize_global_variables=0;
721 fds= (int*)xbt_malloc((smpi_process_count())*sizeof(int));
722 mappings= (void**)xbt_malloc((smpi_process_count())*sizeof(void*));
725 for (i=0; i< smpi_process_count(); i++){
726 //create one mapping of this size per SMPI process, each with the same data inside
727 void *address = NULL, *tmp = NULL;
728 char path[] = "/dev/shm/my-buffer-XXXXXX";
730 int file_descriptor= mkstemp (path);
731 if (file_descriptor < 0)
732 xbt_die("Impossible to create temporary file for memory mapping");
/* The name is removed right away; the open descriptor keeps the file alive. */
733 status = unlink (path);
735 xbt_die("Impossible to unlink temporary file for memory mapping");
737 status = ftruncate(file_descriptor, size_data_exe);
739 xbt_die("Impossible to set the size of the temporary file for memory mapping");
741 /* Ask for a free region */
742 address = mmap (NULL, size_data_exe, PROT_NONE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
744 if (address == MAP_FAILED)
745 xbt_die("Couldn't find a free region for memory mapping");
747 tmp = mmap (address, size_data_exe, PROT_READ | PROT_WRITE, MAP_FIXED | MAP_SHARED, file_descriptor, 0);
750 xbt_die("Couldn't obtain the right address");
751 //initialize the values
752 memcpy(address,TOPAGE(start_data_exe),size_data_exe);
754 //store the address of the mapping for further switches
755 fds[i]=file_descriptor;
756 mappings[i]= address;
763 void smpi_destroy_global_memory_segments(){
764 if(size_data_exe == 0)//no need to switch
768 for (i=0; i< smpi_process_count(); i++){
769 if(munmap(mappings[i],size_data_exe) < 0) {
770 XBT_WARN("Unmapping of fd %d failed: %s", fds[i], strerror(errno));