1 /* Copyright (c) 2007, 2009-2014. The SimGrid Team.
2 * All rights reserved. */
4 /* This program is free software; you can redistribute it and/or modify it
5 * under the terms of the license (GNU LGPL) which comes with this package. */
7 #include "internal_config.h"
10 #include "xbt/sysdep.h"
13 #include "surf/surf.h"
14 #include "simgrid/sg_config.h"
20 #include <sys/types.h>
23 #include <math.h> // sqrt
/* Alias MAP_ANONYMOUS to MAP_ANON.
 * NOTE(review): presumably intended only for platforms that lack MAP_ANONYMOUS; no guard is
 * visible around this line -- confirm. */
29 #define MAP_ANONYMOUS MAP_ANON
/* Declare the "smpi_bench" logging subcategory under "smpi"; the XBT_DEBUG/XBT_WARN/... calls
 * in this file presumably log to it. */
32 XBT_LOG_NEW_DEFAULT_SUBCATEGORY(smpi_bench, smpi,
33 "Logging specific to SMPI (benchmarking)");
35 /* Shared allocations are handled through shared memory segments.
36 * Associated data and metadata are used as follows:
39 * `allocs' dict ---- -.
40 * ---------- shared_data_t shared_metadata_t / | | |
41 * .->| <name> | ---> -------------------- <--. ----------------- | | | |
42 * | ---------- | fd of <name> | | | size of mmap | --| | | |
43 * | | count (2) | |-- | data | \ | | |
44 * `----------------- | <name> | | ----------------- ---- |
45 * -------------------- | ^ |
47 * | | `allocs_metadata' dict |
48 * | | ---------------------- |
49 * | `-- | <addr of mmap #1> |<-'
50 * | .-- | <addr of mmap #2> |<-.
51 * | | ---------------------- |
57 * | shared_metadata_t / | |
58 * | ----------------- | | |
59 * | | size of mmap | --| | |
61 * ----------------- | | |
/* Size of the buffers that receive a pointer printed with "%p" (see the snprintf() calls).
 * Presumably: 2 chars for a "0x" prefix, 2 hex digits per pointer byte and 1 terminating NUL.
 * The exact "%p" rendering is implementation-defined -- TODO confirm it always fits. */
66 #define PTR_STRLEN (2 + 2 * sizeof(void*) + 1)
68 xbt_dict_t allocs = NULL; /* shared_data_t entries keyed by hashed location; allocated on first use */
69 xbt_dict_t allocs_metadata = NULL; /* shared_metadata_t entries keyed by the "%p" form of the mapped address; allocated on first use */
70 xbt_dict_t samples = NULL; /* local_data_t sampling records keyed by sample location; allocated on first use */
71 xbt_dict_t calls = NULL; /* data registered by smpi_shared_set_call(), keyed by "func:input"; allocated on first use */
73 double smpi_cpu_threshold; /* smpi_execute() ignores durations below this value */
74 double smpi_running_power; /* factor turning a duration into flops in smpi_execute() */
79 char* start_data_exe = NULL; /* address parsed from the .data row of the objdump output */
80 int size_data_exe = 0; /* size of the region to privatize (page slack + .data + .bss); 0 means nothing to switch */
81 int smpi_privatize_global_variables; /* flag; set to 0 in smpi_initialize_global_memory_segments() */
/* Return the current size, in bytes, of the file referred to by descriptor `fd`.
 * Aborts through xbt_die() (reporting errno) when fstat() fails. */
static size_t shm_size(int fd) {
  struct stat info;
  const int rc = fstat(fd, &info);

  if (rc < 0) {
    xbt_die("Could not stat fd %d: %s", fd, strerror(errno));
  }
  return (size_t)info.st_size;
}
/* Map `size` bytes of the file behind `fd` as shared, read/write memory and record the mapping.
 *
 *  - The file is grown with ftruncate() when it is currently smaller than `size`.
 *  - A shared_metadata_t is stored in `allocs_metadata` (created on first use), keyed by the
 *    "%p" rendering of the mapped address.
 *  - Failure to truncate or to map aborts through xbt_die().
 *
 * NOTE(review): `data` is not used on any line visible here; presumably it is stored in `meta`
 * together with `size`, and the mapped address is returned -- confirm.
 */
104 static void* shm_map(int fd, size_t size, shared_data_t* data) {
106 char loc[PTR_STRLEN];
107 shared_metadata_t* meta;
/* Make sure the backing file is large enough for the requested mapping. */
109 if(size > shm_size(fd)) {
110 if(ftruncate(fd, (off_t)size) < 0) {
111 xbt_die("Could not truncate fd %d to %zu: %s", fd, size, strerror(errno));
/* MAP_SHARED at offset 0: every mapping of this fd views the same file content. */
115 mem = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
116 if(mem == MAP_FAILED) {
117 xbt_die("Could not map fd %d: %s", fd, strerror(errno));
/* Lazily create the metadata dictionary; its values are released with xbt_free_f. */
119 if(!allocs_metadata) {
120 allocs_metadata = xbt_dict_new_homogeneous(xbt_free_f);
/* The textual form of the address is the dictionary key (smpi_shared_free() builds the same key). */
122 snprintf(loc, PTR_STRLEN, "%p", mem);
123 meta = xbt_new(shared_metadata_t, 1);
126 xbt_dict_set(allocs_metadata, loc, meta, NULL);
127 XBT_DEBUG("MMAP %zu to %p", size, mem);
132 void smpi_bench_destroy(void)
134 xbt_dict_free(&allocs);
135 xbt_dict_free(&allocs_metadata);
136 xbt_dict_free(&samples);
137 xbt_dict_free(&calls);
140 XBT_PUBLIC(void) smpi_execute_flops_(double *flops);
141 void smpi_execute_flops_(double *flops)
143 smpi_execute_flops(*flops);
146 XBT_PUBLIC(void) smpi_execute_(double *duration);
147 void smpi_execute_(double *duration)
149 smpi_execute(*duration);
/* Simulate a computation of `flops` floating-point operations for the calling process:
 * start a "computation" execution on the host returned by SIMIX_host_self(), attach the SMPI
 * tracing category to it, then wait for the execution.
 * NOTE(review): the declarations of `host` and `action` are not visible here. */
152 void smpi_execute_flops(double flops) {
155 host = SIMIX_host_self();
156 XBT_DEBUG("Handle real computation time: %f flops", flops);
157 action = simcall_host_execute("computation", host, flops, 1, 0, 0);
/* NOTE(review): presumably only compiled when tracing is enabled -- confirm. */
159 simcall_set_category (action, TRACE_internal_smpi_get_category());
161 simcall_host_execution_wait(action);
/* Simulate a computation that took `duration` of measured time (presumably seconds).
 *
 * When `duration` reaches smpi_cpu_threshold, it is converted to flops by multiplying with
 * smpi_running_power and passed to smpi_execute_flops(), bracketed by
 * TRACE_smpi_computing_in() / TRACE_smpi_computing_out() for the calling rank.
 * The last debug message reports durations that are ignored for being under the threshold.
 */
164 void smpi_execute(double duration)
166 if (duration >= smpi_cpu_threshold) {
167 XBT_DEBUG("Sleep for %g to handle real computation time", duration);
168 double flops = duration * smpi_running_power;
/* Tracing record describing this computation; it is handed to TRACE_smpi_computing_in(). */
170 int rank = smpi_process_index();
171 instr_extra_data extra = xbt_new0(s_instr_extra_data_t,1);
172 extra->type=TRACING_COMPUTING;
173 extra->comp_size=flops;
174 TRACE_smpi_computing_in(rank, extra);
176 smpi_execute_flops(flops);
179 TRACE_smpi_computing_out(rank);
/* NOTE(review): presumably the else-branch of the threshold test above -- the branch keyword is not visible. */
183 XBT_DEBUG("Real computation took %g while option smpi/cpu_threshold is set to %g => ignore it",
184 duration, smpi_cpu_threshold);
188 void switch_data_segment(int dest);
190 void smpi_bench_begin(void)
192 switch_data_segment(smpi_process_index());
193 xbt_os_threadtimer_start(smpi_process_timer());
196 void smpi_bench_end(void)
198 xbt_os_timer_t timer = smpi_process_timer();
199 xbt_os_threadtimer_stop(timer);
200 // switch_data_segment(smpi_process_count());
201 if (smpi_process_get_sampling()) {
202 XBT_CRITICAL("Cannot do recursive benchmarks.");
203 XBT_CRITICAL("Are you trying to make a call to MPI within a SMPI_SAMPLE_ block?");
204 xbt_backtrace_display_current();
205 xbt_die("Aborting.");
207 // Simulate the benchmarked computation unless disabled via command-line argument
208 if (sg_cfg_get_boolean("smpi/simulate_computation")) {
209 smpi_execute(xbt_os_timer_elapsed(timer));
/* Private sleep function used by smpi_sleep() and smpi_usleep().
 * Puts the calling process to sleep for `secs` through simcall_process_sleep(), bracketed by
 * TRACE_smpi_sleeping_in() / TRACE_smpi_sleeping_out() for the caller rank in MPI_COMM_WORLD.
 * NOTE(review): the returned value is not visible in these lines -- confirm what callers get. */
214 static unsigned int private_sleep(double secs)
218 XBT_DEBUG("Sleep for: %lf secs", secs);
/* Tracing record describing this sleep; it is handed to TRACE_smpi_sleeping_in(). */
220 int rank = smpi_comm_rank(MPI_COMM_WORLD);
221 instr_extra_data extra = xbt_new0(s_instr_extra_data_t,1);
222 extra->type=TRACING_SLEEPING;
223 extra->sleep_duration=secs;
224 TRACE_smpi_sleeping_in(rank, extra);
226 simcall_process_sleep(secs);
228 TRACE_smpi_sleeping_out(rank);
/* sleep()-like entry point: converts `secs` to a double and delegates to private_sleep(),
 * returning its result unchanged. */
unsigned int smpi_sleep(unsigned int secs)
{
  const double duration = (double)secs;

  return private_sleep(duration);
}
240 int smpi_usleep(useconds_t usecs)
242 return (int)private_sleep((double)usecs / 1000000.0);
/* gettimeofday()-like function fed by the simulated clock: `tv` receives SIMIX_get_clock(),
 * split into whole seconds (tv_sec) and the fractional part expressed in microseconds (tv_usec).
 * `tz` is not used on any line visible here.
 * NOTE(review): the two tv_usec assignments differ only by the cast type; presumably they are
 * alternatives selected by a preprocessor condition that is not visible -- confirm. */
246 int smpi_gettimeofday(struct timeval *tv, void* tz)
250 now = SIMIX_get_clock();
252 tv->tv_sec = (time_t)now;
254 tv->tv_usec = (useconds_t)((now - tv->tv_sec) * 1e6);
256 tv->tv_usec = (suseconds_t)((now - tv->tv_sec) * 1e6);
263 extern double sg_surf_precision;
/* Return the number of timestamp ticks per time unit: 1 / sg_surf_precision, truncated to an
 * unsigned long long.
 * NOTE(review): no guard against sg_surf_precision being 0 is visible here -- confirm it cannot
 * happen (the conversion of an infinite or out-of-range double is undefined). */
264 unsigned long long smpi_rastro_resolution (void)
267 double resolution = (1/sg_surf_precision);
269 return (unsigned long long)resolution;
272 unsigned long long smpi_rastro_timestamp (void)
275 double now = SIMIX_get_clock();
277 unsigned long long sec = (unsigned long long)now;
278 unsigned long long pre = (now - sec) * smpi_rastro_resolution();
280 return (unsigned long long)sec * smpi_rastro_resolution() + pre;
283 /* ****************************** Functions related to the SMPI_SAMPLE_ macros ************************************/
/* Fields of the per-sample-location record handled as local_data_t by sample_enough_benchs()
 * and the smpi_sample_*() functions (the enclosing struct declaration is not visible here). */
285 double threshold; /* maximal relative stderr requested (only taken into account if positive) */
286 double relstderr; /* relative stderr observed so far (recomputed by smpi_sample_3) */
287 double mean; /* mean of benched times, to be used if the block is disabled */
288 double sum; /* sum of benched times (to compute the mean and stderr) */
289 double sum_pow2; /* sum of the square of the benched times (to compute the stderr) */
290 int iters; /* amount of requested iterations */
291 int count; /* amount of iterations done so far */
292 int benching; /* 1: we are benchmarking; 0: we have enough data, no bench anymore */
/* Build the key identifying a sampled block in the `samples` dictionary.
 * Two forms exist: "file:line", and "file:line:process_index" which adds smpi_process_index().
 * NOTE(review): the condition choosing between them is not visible; presumably `global`
 * selects the process-independent form -- confirm.
 * The string comes from bprintf(); presumably the caller must free it -- confirm. */
295 static char *sample_location(int global, const char *file, int line) {
297 return bprintf("%s:%d", file, line);
299 return bprintf("%s:%d:%d", file, line, smpi_process_index());
/* Tell whether enough benchmark runs were collected for `data`.
 * The starting verdict is "count >= iters". When a positive threshold is set, the verdict is
 * additionally forced to 0 while the observed relative stderr exceeds that threshold.
 * NOTE(review): the condition guarding the first "res = 0" is not visible here -- confirm.
 * The outcome is logged at debug level; presumably `res` is returned. */
302 static int sample_enough_benchs(local_data_t *data) {
303 int res = data->count >= data->iters;
304 if (data->threshold>0.0) {
306 res = 0; // not enough data
307 if (data->relstderr > data->threshold)
308 res = 0; // stderr too high yet
310 XBT_DEBUG("%s (count:%d iter:%d stderr:%f thres:%f mean:%fs)",
311 (res?"enough benchs":"need more data"),
312 data->count, data->iters, data->relstderr, data->threshold, data->mean);
/* Entry step of a sampled block located at file:line.
 *
 * Ends the current benchmark, flags the process as sampling, and looks the block up in
 * `samples` (the dictionary is created here, with free() as value destructor).
 *  - Unknown block: a new local_data_t is created, checked to have a positive `iters` or a
 *    positive `threshold`, marked as benching and stored under the block location.
 *  - Known block: differing settings are reported with XBT_ERROR, then `benching` is
 *    recomputed from sample_enough_benchs().
 * NOTE(review): several field initializations and the branch structure are not visible here.
 */
316 void smpi_sample_1(int global, const char *file, int line, int iters, double threshold)
318 char *loc = sample_location(global, file, line);
321 smpi_bench_end(); /* Take time from previous, unrelated computation into account */
322 smpi_process_set_sampling(1);
325 samples = xbt_dict_new_homogeneous(free);
327 data = xbt_dict_get_or_null(samples, loc);
329 xbt_assert(threshold>0 || iters>0,
330 "You should provide either a positive amount of iterations to bench, or a positive maximal stderr (or both)");
331 data = (local_data_t *) xbt_new(local_data_t, 1);
334 data->sum_pow2 = 0.0;
336 data->threshold = threshold;
337 data->benching = 1; // If we have no data, we need at least one
339 xbt_dict_set(samples, loc, data, NULL);
340 XBT_DEBUG("XXXXX First time ever on benched nest %s.",loc);
/* Same location seen again: the settings given now must match the recorded ones. */
342 if (data->iters != iters || data->threshold != threshold) {
343 XBT_ERROR("Asked to bench block %s with different settings %d, %f is not %d, %f. How did you manage to give two numbers at the same line??",
344 loc, data->iters, data->threshold, iters,threshold);
348 // if we already have some data, check whether sample_2 should get one more bench or whether it should emulate the computation instead
349 data->benching = !sample_enough_benchs(data);
350 XBT_DEBUG("XXXX Re-entering the benched nest %s. %s",loc, (data->benching?"more benching needed":"we have enough data, skip computes"));
/* Loop-condition step of a sampled block located at file:line.
 *
 * Requires `samples` to exist (asserted) and fetches the record of the block.
 *  - benching == 1: a new benchmark run is needed (only the debug trace is visible here).
 *  - otherwise: the recorded mean is injected with smpi_execute(), the sampling flag of the
 *    process is cleared and `res` is set to 0.
 * NOTE(review): the declaration of `res`, its value in the benching case and the return
 * statement are not visible here -- confirm.
 */
355 int smpi_sample_2(int global, const char *file, int line)
357 char *loc = sample_location(global, file, line);
361 xbt_assert(samples, "Y U NO use SMPI_SAMPLE_* macros? Stop messing directly with smpi_sample_* functions!");
362 data = xbt_dict_get(samples, loc);
363 XBT_DEBUG("sample2 %s",loc);
366 if (data->benching==1) {
367 // we need to run a new bench
368 XBT_DEBUG("benchmarking: count:%d iter:%d stderr:%f thres:%f; mean:%f",
369 data->count, data->iters, data->relstderr, data->threshold, data->mean);
372 // Enough data, no more bench (either we got enough data from previous visits to this benched nest, or we just ran one bench and need to bail out now that our job is done).
373 // Just sleep instead
374 XBT_DEBUG("No benchmark (either no need, or just ran one): count >= iter (%d >= %d) or stderr<thres (%f<=%f). apply the %fs delay instead",
375 data->count, data->iters, data->relstderr, data->threshold, data->mean);
376 smpi_execute(data->mean);
377 smpi_process_set_sampling(0);
378 res = 0; // prepare to capture future, unrelated computations
/* End-of-iteration step of a sampled block located at file:line.
 *
 * Requires `samples` to exist (asserted) and fetches the record of the block. The per-process
 * timer is stopped and its elapsed time becomes the new `sample`; sum_pow2, mean and relstderr
 * are then refreshed. While more runs are needed, `mean` is overwritten with the last sample so
 * that smpi_sample_2 simulates exactly this iteration.
 * NOTE(review): the updates of `count` and `sum`, and what happens when benching == 0, are not
 * visible here.
 * NOTE(review): relstderr divides by data->mean; a mean of 0 yields NaN, and NaN compares false
 * against the threshold in sample_enough_benchs() -- confirm this is acceptable.
 */
385 void smpi_sample_3(int global, const char *file, int line)
387 char *loc = sample_location(global, file, line);
390 xbt_assert(samples, "Y U NO use SMPI_SAMPLE_* macros? Stop messing directly with smpi_sample_* functions!");
391 data = xbt_dict_get(samples, loc);
392 XBT_DEBUG("sample3 %s",loc);
395 if (data->benching==0) {
399 // ok, benchmarking this loop is over
400 xbt_os_threadtimer_stop(smpi_process_timer());
405 sample = xbt_os_timer_elapsed(smpi_process_timer());
407 data->sum_pow2 += sample * sample;
408 n = (double)data->count;
409 data->mean = data->sum / n;
/* sqrt((E[x^2] - mean^2) / n) / mean, i.e. the standard error relative to the mean */
410 data->relstderr = sqrt((data->sum_pow2 / n - data->mean * data->mean) / n) / data->mean;
411 if (!sample_enough_benchs(data)) {
412 data->mean = sample; // Still in benching process; We want sample_2 to simulate the exact time of this loop occurrence before leaving, not the mean over the history
414 XBT_DEBUG("Average mean after %d steps is %f, relative standard error is %f (sample was %f)", data->count,
415 data->mean, data->relstderr, sample);
417 // That's enough for now, prevent sample_2 to run the same code over and over
/* Value destructor registered for the `allocs` dictionary (see smpi_shared_malloc()).
 * `p` is interpreted as a shared_data_t; what gets released is not visible in these lines. */
422 static void smpi_shared_alloc_free(void *p)
424 shared_data_t *data = p;
/* Replace the location string `loc` by a short name derived from a hexadecimal digest.
 *
 * `loc` is reallocated to 30 bytes. The 40 hex characters of `hash` are consumed in groups of
 * 6 (7 groups), each group being parsed as a number and emitted as 4 characters taken from a
 * 64-entry alphabet, written from loc[1] onwards.
 * NOTE(review): how `hash` is computed, what goes into loc[0], the terminating NUL and the
 * return statement are not visible here.
 * NOTE(review): the alphabet literal contains 'Z' and 'z' twice and lacks 'W' and 'w'
 * ("UVZXYZ" / "uvzxyz"), so distinct 6-bit values map to the same character.
 * NOTE(review): the shift uses a step of 3 (18, 15, 12, 9) although 6-bit digits need a step of
 * 6 (18, 12, 6, 0); as written the digits overlap and the low 9 bits of `val` are never used.
 * Both points reduce the entropy of the generated names -- confirm before fixing, since the
 * generated names change.
 */
429 static char *smpi_shared_alloc_hash(char *loc)
439 loc = xbt_realloc(loc, 30);
441 for (i = 0; i < 40; i += 6) { /* base64 encode */
/* The last group (i == 36) reads hash[36..41]; this relies on `hash` holding at least 42 bytes
 * and on `s` being NUL-terminated -- neither is visible here, TODO confirm. */
442 memcpy(s, hash + i, 6);
443 val = strtoul(s, NULL, 16);
444 for (j = 0; j < 4; j++) {
445 unsigned char x = (val >> (18 - 3 * j)) & 0x3f;
446 loc[1 + 4 * i / 6 + j] =
447 "ABCDEFGHIJKLMNOPQRSTUVZXYZabcdefghijklmnopqrstuvzxyz0123456789-_"[x];
/* Allocate `size` bytes on behalf of the call site file:line.
 *
 * With the "smpi/use_shared_malloc" option set:
 *  - a location string "<pid>_<file>_<line>" is built and hashed with smpi_shared_alloc_hash();
 *  - the location is looked up in `allocs` (created here with smpi_shared_alloc_free as value
 *    destructor);
 *  - a shared memory object is created with shm_open(O_CREAT | O_EXCL), mapped with shm_map(),
 *    unlinked right away (a failure only triggers a warning) and recorded in `allocs`;
 *  - a second shm_map() call maps the descriptor of an existing entry (data->fd) again.
 * Without the option, the memory comes from xbt_malloc().
 *
 * NOTE(review): the tests selecting between these paths, the initialization of the new
 * shared_data_t and the return statement are not visible here -- confirm.
 * NOTE(review): the block comment opened on the smpi_shared_alloc_hash() line is not closed
 * within the lines visible here.
 */
454 void *smpi_shared_malloc(size_t size, const char *file, int line)
457 if (sg_cfg_get_boolean("smpi/use_shared_malloc")){
458 char *loc = bprintf("%zu_%s_%d", (size_t)getpid(), file, line);
461 loc = smpi_shared_alloc_hash(loc); /* hash loc, in order to have something
464 allocs = xbt_dict_new_homogeneous(smpi_shared_alloc_free);
466 data = xbt_dict_get_or_null(allocs, loc);
468 fd = shm_open(loc, O_RDWR | O_CREAT | O_EXCL,
469 S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
473 xbt_die("Please cleanup /dev/shm/%s", loc);
475 xbt_die("An unhandled error occured while opening %s. shm_open: %s", loc, strerror(errno));
478 data = xbt_new(shared_data_t, 1);
482 mem = shm_map(fd, size, data);
483 if (shm_unlink(loc) < 0) {
484 XBT_WARN("Could not early unlink %s. shm_unlink: %s", loc, strerror(errno));
486 xbt_dict_set(allocs, loc, data, NULL);
487 XBT_DEBUG("Mapping %s at %p through %d", loc, mem, fd);
490 mem = shm_map(data->fd, size, data);
493 XBT_DEBUG("Shared malloc %zu in %p (metadata at %p)", size, mem, data);
495 mem = xbt_malloc(size);
496 XBT_DEBUG("Classic malloc %zu in %p", size, mem);
/* Release memory obtained from smpi_shared_malloc().
 *
 * With the "smpi/use_shared_malloc" option set, the metadata of `ptr` is looked up in
 * `allocs_metadata` under the "%p" rendering of the address; problems (nothing allocated, no
 * metadata, unknown pointer, broken metadata link) only trigger a warning. The mapping is then
 * removed with munmap(), and the entry is removed from `allocs` once data->count is <= 0.
 * Without the option only the "Classic free" trace is visible here.
 *
 * NOTE(review): the tests guarding the warnings, the assignment of `data`, the update of
 * data->count and the actual free of the classic path are not visible here -- confirm.
 */
501 void smpi_shared_free(void *ptr)
503 char loc[PTR_STRLEN];
504 shared_metadata_t* meta;
506 if (sg_cfg_get_boolean("smpi/use_shared_malloc")){
509 XBT_WARN("Cannot free: nothing was allocated");
512 if(!allocs_metadata) {
513 XBT_WARN("Cannot free: no metadata was allocated");
/* Same key construction as in shm_map(). */
515 snprintf(loc, PTR_STRLEN, "%p", ptr);
516 meta = (shared_metadata_t*)xbt_dict_get_or_null(allocs_metadata, loc);
518 XBT_WARN("Cannot free: %p was not shared-allocated by SMPI", ptr);
523 XBT_WARN("Cannot free: something is broken in the metadata link");
/* The size recorded at mapping time tells how much to unmap. */
526 if(munmap(ptr, meta->size) < 0) {
527 XBT_WARN("Unmapping of fd %d failed: %s", data->fd, strerror(errno));
530 XBT_DEBUG("Shared free - no removal - of %p, count = %d", ptr, data->count);
531 if (data->count <= 0) {
533 xbt_dict_remove(allocs, data->loc);
534 XBT_DEBUG("Shared free - with removal - of %p", ptr);
537 XBT_DEBUG("Classic free of %p", ptr);
/* Check whether data is registered in `calls` for the pair (func, input).
 * The key is "func:input". The lookup uses xbt_dict_get(), which succeeds or throws; the
 * category of the caught exception is compared with not_found_error.
 * NOTE(review): the exception handling blocks, the returned value and the release of `loc` are
 * not visible here -- presumably non-zero means "known"; confirm. */
543 int smpi_shared_known_call(const char* func, const char* input)
545 char* loc = bprintf("%s:%s", func, input);
/* The dictionary is created without a value destructor: stored data is not owned by it. */
550 calls = xbt_dict_new_homogeneous(NULL);
553 xbt_dict_get(calls, loc); /* Succeed or throw */
560 if (ex.category != not_found_error)
/* Fetch the data registered in `calls` under the key "func:input".
 * NOTE(review): the declaration of `data`, the release of `loc` and the return statement are
 * not visible here; presumably the fetched data is returned -- confirm. */
567 void* smpi_shared_get_call(const char* func, const char* input) {
568 char* loc = bprintf("%s:%s", func, input);
/* The dictionary is created without a value destructor. */
572 calls = xbt_dict_new_homogeneous(NULL);
574 data = xbt_dict_get(calls, loc);
/* Register `data` in `calls` under the key "func:input".
 * NOTE(review): the release of `loc` and the return statement are not visible here; presumably
 * `data` is returned -- confirm. */
579 void* smpi_shared_set_call(const char* func, const char* input, void* data) {
580 char* loc = bprintf("%s:%s", func, input);
/* The dictionary is created without a value destructor: `data` stays owned by the caller. */
583 calls = xbt_dict_new_homogeneous(NULL);
585 xbt_dict_set(calls, loc, data, NULL);
/* Round `addr` down to the nearest lower multiple of xbt_pagesize and yield it as a void *.
 * `addr` is evaluated once. The whole expansion is parenthesized so that the macro behaves as a
 * single primary expression wherever it is used (e.g. as the operand of sizeof or of a postfix
 * operator), instead of exposing a bare cast expression. */
#define TOPAGE(addr) ((void *)(((unsigned long)(addr) / xbt_pagesize) * xbt_pagesize))
597 * - read the executable data+bss section addresses and sizes
598 * - for each process create a copy of these sections with mmap
599 * - store them in a dynar
/* Make the data segment of process `dest` the active one.
 *
 * Nothing is done when size_data_exe is 0 or when `dest` is already the loaded page. On the
 * very first switch (loaded_page == -1) the current content of the real data region is copied
 * into every per-process mapping. The file behind fds[dest] is then mapped with
 * MAP_FIXED | MAP_SHARED over the page-aligned start of the data region; any other resulting
 * address aborts through xbt_die().
 * NOTE(review): the early returns and the update of loaded_page are not visible here.
 */
605 void switch_data_segment(int dest){
607 if(size_data_exe == 0)//no need to switch
610 if (loaded_page==dest)//no need to switch either
616 if(loaded_page==-1){//initial switch, do the copy from the real page here
617 for (i=0; i< SIMIX_process_count(); i++){
618 memcpy(mappings[i],TOPAGE(start_data_exe),size_data_exe);
621 int current= fds[dest];
622 XBT_VERB("Switching data frame to the one of process %d", dest);
/* MAP_FIXED replaces whatever is currently mapped at the page-aligned data address. */
623 void* tmp = mmap (TOPAGE(start_data_exe), size_data_exe, PROT_READ | PROT_WRITE, MAP_FIXED | MAP_SHARED, current, 0);
624 if (tmp != TOPAGE(start_data_exe))
625 xbt_die("Couldn't map the new region");
/* Locate the global data of the running executable and compute the size of the region to
 * privatize.
 *
 * Runs "objdump --section-headers <xbt_binary_name>" through popen() and parses its output
 * line by line:
 *  - the .data row provides size_data_binary (field 2) and start_data_exe (field 4);
 *  - the .bss row provides size_bss_binary (field 2), enlarged by the gap between the end of
 *    .data and the address of .bss (field 4).
 * size_data_exe is finally set to the distance between the page boundary below start_data_exe
 * and start_data_exe, plus both section sizes.
 *
 * NOTE(review): xbt_binary_name is inserted unquoted into a shell command; a path with spaces
 * or shell metacharacters would break or alter the command -- confirm it is trusted.
 * NOTE(review): section sizes are parsed with strtoul() but kept in int variables; sizes above
 * INT_MAX would not fit.
 * NOTE(review): field 4 is the second address column of the sample rows (LMA in objdump
 * output, presumably) -- confirm this is the intended column.
 */
630 void smpi_get_executable_global_size(){
631 int size_bss_binary=0;
632 int size_data_binary=0;
634 char *line = NULL; /* Buffer holding the line being parsed (filled by xbt_getline) */
635 ssize_t read; /* Number of bytes read by the last xbt_getline call */
636 size_t n = 0; /* Capacity of `line`, maintained by xbt_getline */
641 char *command = bprintf("objdump --section-headers %s", xbt_binary_name);
643 fp = popen(command, "r");
646 perror("popen failed");
/* Stop at end of output; `found` reaching 2 presumably means both sections were seen. */
650 while ((read = xbt_getline(&line, &n, fp)) != -1 && found != 2) {
/* NOTE(review): the last byte is overwritten even if it is not a newline -- confirm. */
655 /* Wipeout the new line character */
656 line[read - 1] = '\0';
658 lfields[0] = strtok(line, " ");
660 if(lfields[0] == NULL)
/* Header lines of the objdump output carry no section data. */
663 if(strcmp(lfields[0], "Sections:") == 0
664 || strcmp(lfields[0], "Idx") == 0
665 || strncmp(lfields[0], xbt_binary_name, strlen(xbt_binary_name)) == 0)
/* Split the row into at most 7 space-separated fields, stopping at the first missing one. */
668 for (i = 1; i < 7 && lfields[i - 1] != NULL; i++) {
669 lfields[i] = strtok(NULL, " ");
673 * we are looking for these fields
674 23 .data 02625a20 00000000006013e0 00000000006013e0 000013e0 2**5
675 CONTENTS, ALLOC, LOAD, DATA
676 24 .bss 02625a40 0000000002c26e00 0000000002c26e00 02626e00 2**5
/* NOTE(review): no NULL check on lfields[1] is visible before these comparisons -- confirm. */
681 if(strcmp(lfields[1], ".data") == 0){
682 size_data_binary = strtoul(lfields[2], NULL, 16);
683 start_data_exe = (char*) strtoul(lfields[4], NULL, 16);
685 }else if(strcmp(lfields[1], ".bss") == 0){
686 //the beginning of bss is not exactly the end of data if not aligned, grow bss reported size accordingly
687 //TODO : check if this is OK, as some segments may be inserted between them..
688 size_bss_binary = ((char*) strtoul(lfields[4], NULL, 16) - (start_data_exe + size_data_binary))
689 + strtoul(lfields[2], NULL, 16);
/* The region starts at the page boundary below start_data_exe, hence the added slack. */
697 size_data_exe =(unsigned long)start_data_exe - (unsigned long)TOPAGE(start_data_exe)+ size_data_binary+size_bss_binary;
/* Create one private copy of the executable data region per process.
 *
 * After smpi_get_executable_global_size() has located the region, each loop iteration:
 *  - creates a temporary file under /dev/shm with mkstemp() and unlinks it immediately;
 *  - sizes it to size_data_exe with ftruncate();
 *  - reserves an address range with an anonymous PROT_NONE mapping, then maps the file
 *    MAP_FIXED | MAP_SHARED over that range;
 *  - fills the mapping with the current content of the page-aligned data region;
 *  - records the descriptor in fds[i] and the address in mappings[i].
 * Any failure aborts through xbt_die(). When size_data_exe is 0 privatization is turned off.
 *
 * NOTE(review): fds and mappings are sized with smpi_process_count() while the loop bound is
 * SIMIX_process_count(); if the two values can differ the arrays may be overrun -- confirm.
 * NOTE(review): the first reset of smpi_privatize_global_variables is presumably guarded by a
 * preprocessor condition that is not visible here; the tests on `status` and `tmp` and the
 * early return are not visible either.
 */
704 void smpi_initialize_global_memory_segments(){
707 smpi_privatize_global_variables=0;
712 smpi_get_executable_global_size();
714 XBT_DEBUG ("bss+data segment found : size %d starting at %p",size_data_exe, start_data_exe );
716 if(size_data_exe == 0){//no need to switch
717 smpi_privatize_global_variables=0;
721 fds= (int*)xbt_malloc((smpi_process_count())*sizeof(int));
722 mappings= (void**)xbt_malloc((smpi_process_count())*sizeof(void*));
725 for (i=0; i< SIMIX_process_count(); i++){
726 //create SIMIX_process_count() mappings of this size with the same data inside
727 void *address = NULL, *tmp = NULL;
728 char path[] = "/dev/shm/my-buffer-XXXXXX";
730 int file_descriptor= mkstemp (path);
731 if (file_descriptor < 0)
732 xbt_die("Impossible to create temporary file for memory mapping");
/* Unlinking right away keeps the file alive only through its descriptor. */
733 status = unlink (path);
735 xbt_die("Impossible to unlink temporary file for memory mapping");
737 status = ftruncate(file_descriptor, size_data_exe);
739 xbt_die("Impossible to set the size of the temporary file for memory mapping");
741 /* Ask for a free region */
742 address = mmap (NULL, size_data_exe, PROT_NONE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
744 if (address == MAP_FAILED)
745 xbt_die("Couldn't find a free region for memory mapping");
/* Replace the reservation with the file-backed shared mapping at the same address. */
747 tmp = mmap (address, size_data_exe, PROT_READ | PROT_WRITE, MAP_FIXED | MAP_SHARED, file_descriptor, 0);
750 xbt_die("Couldn't obtain the right address");
751 //initialize the values
752 memcpy(address,TOPAGE(start_data_exe),size_data_exe);
754 //store the address of the mapping for further switches
755 fds[i]=file_descriptor;
756 mappings[i]= address;
763 void smpi_destroy_global_memory_segments(){
764 if(size_data_exe == 0)//no need to switch
768 for (i=0; i< smpi_process_count(); i++){
769 if(munmap(mappings[i],size_data_exe) < 0) {
770 XBT_WARN("Unmapping of fd %d failed: %s", fds[i], strerror(errno));