1 /* Copyright (c) 2007, 2009-2014. The SimGrid Team.
2 * All rights reserved. */
4 /* This program is free software; you can redistribute it and/or modify it
5 * under the terms of the license (GNU LGPL) which comes with this package. */
7 #include "internal_config.h"
10 #include "xbt/sysdep.h"
13 #include "surf/surf.h"
14 #include "simgrid/sg_config.h"
20 #include <sys/types.h>
23 #include <math.h> // sqrt
/* Alias MAP_ANONYMOUS to MAP_ANON.
 * NOTE(review): presumably intended only for platforms that lack MAP_ANONYMOUS; no guarding
 * #ifndef is visible around this define -- confirm. */
29 #define MAP_ANONYMOUS MAP_ANON
/* Creates the "smpi_bench" logging sub-category under "smpi"; presumably it becomes the default
 * category of the XBT_DEBUG/XBT_WARN/... calls of this file -- confirm against the xbt log API. */
32 XBT_LOG_NEW_DEFAULT_SUBCATEGORY(smpi_bench, smpi,
33 "Logging specific to SMPI (benchmarking)");
35 /* Shared allocations are handled through shared memory segments.
36 * Associated data and metadata are used as follows:
39 * `allocs' dict ---- -.
40 * ---------- shared_data_t shared_metadata_t / | | |
41 * .->| <name> | ---> -------------------- <--. ----------------- | | | |
42 * | ---------- | fd of <name> | | | size of mmap | --| | | |
43 * | | count (2) | |-- | data | \ | | |
44 * `----------------- | <name> | | ----------------- ---- |
45 * -------------------- | ^ |
47 * | | `allocs_metadata' dict |
48 * | | ---------------------- |
49 * | `-- | <addr of mmap #1> |<-'
50 * | .-- | <addr of mmap #2> |<-.
51 * | | ---------------------- |
57 * | shared_metadata_t / | |
58 * | ----------------- | | |
59 * | | size of mmap | --| | |
61 * ----------------- | | |
/* Size of the buffers that receive a pointer formatted with "%p" (see shm_map() and
 * smpi_shared_free()); presumably "0x" + two hex digits per pointer byte + terminating NUL. */
66 #define PTR_STRLEN (2 + 2 * sizeof(void*) + 1)
/* Shared allocations, keyed by the hashed allocation site built in smpi_shared_malloc(). */
68 xbt_dict_t allocs = NULL; /* Allocated on first use */
/* Per-mapping metadata, keyed by the "%p" rendering of the mmap'ed address (see shm_map()). */
69 xbt_dict_t allocs_metadata = NULL; /* Allocated on first use */
/* Benchmark statistics of the sampled blocks, keyed by the string built in sample_location(). */
70 xbt_dict_t samples = NULL; /* Allocated on first use */
/* Values registered through smpi_shared_set_call(), keyed by "<func>:<input>". */
71 xbt_dict_t calls = NULL; /* Allocated on first use */
72 __thread int smpi_current_rank = 0; /* Updated after each MPI call */
/* smpi_execute() only simulates durations that are >= this threshold. */
74 double smpi_cpu_threshold;
/* Factor by which smpi_execute() multiplies a duration to obtain an amount of flops. */
75 double smpi_running_power;
/* Address and size computed by smpi_get_executable_global_size(): start of the .data section,
 * and the size of the area going from the page containing it through .data and .bss. */
80 char* start_data_exe = NULL;
81 int size_data_exe = 0;
/* Reset to 0 by smpi_initialize_global_memory_segments().
 * NOTE(review): where it receives a non-zero value is not visible in this file excerpt. */
82 int smpi_privatize_global_variables;
/* Returns the size in bytes that fstat() reports for `fd`.
 * Aborts through xbt_die(), with the errno message, if fstat() fails. */
95 static size_t shm_size(int fd) {
98 if(fstat(fd, &st) < 0) {
99 xbt_die("Could not stat fd %d: %s", fd, strerror(errno));
// st_size is converted to size_t so that the caller can compare it with a requested size
101 return (size_t)st.st_size;
/* Maps `size` bytes of the file `fd` (shared, read/write, offset 0) and registers a metadata
 * entry for the new mapping in `allocs_metadata`.
 *  - fd:   descriptor of the backing file; grown with ftruncate() when smaller than `size`
 *  - size: number of bytes to map
 *  - data: shared_data_t of the allocation. NOTE(review): its use is on lines not visible
 *          here; presumably it is stored in the metadata entry -- confirm.
 * A failing ftruncate() or mmap() aborts through xbt_die().
 * NOTE(review): the return statement is not visible; presumably the mapped address. */
105 static void* shm_map(int fd, size_t size, shared_data_t* data) {
107 char loc[PTR_STRLEN];
108 shared_metadata_t* meta;
// The backing file is only ever grown, never shrunk
110 if(size > shm_size(fd)) {
111 if(ftruncate(fd, (off_t)size) < 0) {
112 xbt_die("Could not truncate fd %d to %zu: %s", fd, size, strerror(errno));
116 mem = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
117 if(mem == MAP_FAILED) {
118 xbt_die("Could not map fd %d: %s", fd, strerror(errno));
// The metadata dictionary is created lazily, with xbt_free passed as its value destructor
120 if(!allocs_metadata) {
121 allocs_metadata = xbt_dict_new_homogeneous(xbt_free);
// The textual form of the mapped address is the key; smpi_shared_free() rebuilds it from the pointer
123 snprintf(loc, PTR_STRLEN, "%p", mem);
124 meta = xbt_new(shared_metadata_t, 1);
127 xbt_dict_set(allocs_metadata, loc, meta, NULL);
128 XBT_DEBUG("MMAP %zu to %p", size, mem);
/* Releases the four module-level dictionaries: allocs, allocs_metadata, samples and calls. */
133 void smpi_bench_destroy(void)
135 xbt_dict_free(&allocs);
136 xbt_dict_free(&allocs_metadata);
137 xbt_dict_free(&samples);
138 xbt_dict_free(&calls);
/* By-reference variant of smpi_execute_flops(): reads the amount of flops through the pointer
 * and forwards it.
 * NOTE(review): the trailing underscore and pointer argument suggest a Fortran binding -- confirm. */
141 XBT_PUBLIC(void) smpi_execute_flops_(double *flops);
142 void smpi_execute_flops_(double *flops)
144 smpi_execute_flops(*flops);
/* By-reference variant of smpi_execute(): reads the duration through the pointer and forwards it.
 * NOTE(review): the trailing underscore and pointer argument suggest a Fortran binding -- confirm. */
147 XBT_PUBLIC(void) smpi_execute_(double *duration);
148 void smpi_execute_(double *duration)
150 smpi_execute(*duration);
/* Simulates the execution of `flops` floating point operations on the host returned by
 * SIMIX_host_self(): starts an execution named "computation" and waits for its completion. */
153 void smpi_execute_flops(double flops) {
156 host = SIMIX_host_self();
157 XBT_DEBUG("Handle real computation time: %f flops", flops);
158 action = simcall_host_execute("computation", host, flops, 1, 0, 0);
// Tag the execution with the SMPI tracing category
// NOTE(review): any preprocessor guard around this call is not visible here
160 simcall_set_category (action, TRACE_internal_smpi_get_category());
// Block the caller until the simulated execution is over
162 simcall_host_execution_wait(action);
/* Injects a measured computation of length `duration` (presumably seconds -- confirm) into the
 * simulation.
 * When `duration` reaches smpi_cpu_threshold, it is converted to flops with smpi_running_power,
 * reported to the tracing layer and executed through smpi_execute_flops(). The last XBT_DEBUG
 * states that shorter durations are ignored; the branch keyword leading to it is not visible here. */
165 void smpi_execute(double duration)
167 if (duration >= smpi_cpu_threshold) {
168 XBT_DEBUG("Sleep for %g to handle real computation time", duration);
169 double flops = duration * smpi_running_power;
// Describe the computation (type and size) for the tracing layer before running it
171 int rank = smpi_process_index();
172 instr_extra_data extra = xbt_new0(s_instr_extra_data_t,1);
173 extra->type=TRACING_COMPUTING;
174 extra->comp_size=flops;
175 TRACE_smpi_computing_in(rank, extra);
177 smpi_execute_flops(flops);
180 TRACE_smpi_computing_out(rank);
184 XBT_DEBUG("Real computation took %g while option smpi/cpu_threshold is set to %g => ignore it",
185 duration, smpi_cpu_threshold);
/* Forward declaration; switch_data_segment() is defined further down in this file. */
189 void switch_data_segment(int dest);
/* Starts measuring the real computation time of the calling process: switches to the data
 * segment of its process index, starts its thread timer, and records the index in
 * smpi_current_rank. */
191 void smpi_bench_begin(void)
193 switch_data_segment(smpi_process_index());
194 xbt_os_threadtimer_start(smpi_process_timer());
195 smpi_current_rank = smpi_process_index();
/* Stops the timer of the calling process and injects the elapsed time into the simulation
 * through smpi_execute().
 * If the process is currently flagged as sampling (smpi_process_get_sampling()), critical
 * messages and a backtrace are emitted and the program aborts through xbt_die(). */
198 void smpi_bench_end(void)
200 xbt_os_timer_t timer = smpi_process_timer();
201 xbt_os_threadtimer_stop(timer);
202 // switch_data_segment(smpi_process_count());
203 if (smpi_process_get_sampling()) {
204 XBT_CRITICAL("Cannot do recursive benchmarks.");
205 XBT_CRITICAL("Are you trying to make a call to MPI within a SMPI_SAMPLE_ block?");
206 xbt_backtrace_display_current();
207 xbt_die("Aborting.");
// Convert the measured elapsed time into simulated computation
209 smpi_execute(xbt_os_timer_elapsed(timer));
/* Simulates a pause of `secs` seconds as a computation on the current host: the amount of flops
 * is `secs` multiplied by the host speed, and the call waits for that execution to complete.
 * NOTE(review): the return statement is not visible here. */
212 unsigned int smpi_sleep(unsigned int secs)
218 double flops = (double) secs*simcall_host_get_speed(SIMIX_host_self());
219 XBT_DEBUG("Sleep for: %f flops", flops);
220 action = simcall_host_execute("computation", SIMIX_host_self(), flops, 1, 0, 0);
// NOTE(review): any preprocessor guard around the category call is not visible here
222 simcall_set_category (action, TRACE_internal_smpi_get_category());
224 simcall_host_execution_wait(action);
/* Simulates a pause of `usecs` microseconds as a computation on the current host: the amount of
 * flops is (usecs / 1e6) multiplied by the host speed, and the call waits for that execution.
 * NOTE(review): the return statement is not visible here. */
230 int smpi_usleep(useconds_t usecs)
236 double flops = (double) (usecs/1000000.0)*simcall_host_get_speed(SIMIX_host_self());
237 XBT_DEBUG("Sleep for: %f flops", flops);
238 action = simcall_host_execute("computation", SIMIX_host_self(), flops, 1, 0, 0);
// NOTE(review): any preprocessor guard around the category call is not visible here
240 simcall_set_category (action, TRACE_internal_smpi_get_category());
242 simcall_host_execution_wait(action);
/* Fills `tv` from the simulated clock (SIMIX_get_clock()): the integral part goes to tv_sec and
 * the fractional part, scaled by 1e6, to tv_usec. `tz` is not used by the visible lines.
 * NOTE(review): the two tv_usec assignments differ only by their cast type; presumably a
 * platform #ifdef (not visible here) selects one of them -- confirm.
 * NOTE(review): the return statement is not visible here. */
249 int smpi_gettimeofday(struct timeval *tv, void* tz)
253 now = SIMIX_get_clock();
255 tv->tv_sec = (time_t)now;
257 tv->tv_usec = (useconds_t)((now - tv->tv_sec) * 1e6);
259 tv->tv_usec = (suseconds_t)((now - tv->tv_sec) * 1e6);
/* Defined outside of this file. */
266 extern double sg_surf_precision;
/* Returns 1/sg_surf_precision truncated to an unsigned integer; smpi_rastro_timestamp() uses it
 * as the number of ticks per unit of simulated time. */
267 unsigned long long smpi_rastro_resolution (void)
270 double resolution = (1/sg_surf_precision);
272 return (unsigned long long)resolution;
/* Returns the simulated clock expressed in ticks of smpi_rastro_resolution(): the integral part
 * of the clock times the resolution, plus the fractional part scaled by the resolution. */
275 unsigned long long smpi_rastro_timestamp (void)
278 double now = SIMIX_get_clock();
// Split the clock into its integral part and its (scaled, truncated) fractional part
280 unsigned long long sec = (unsigned long long)now;
281 unsigned long long pre = (now - sec) * smpi_rastro_resolution();
283 return (unsigned long long)sec * smpi_rastro_resolution() + pre;
286 /* ****************************** Functions related to the SMPI_SAMPLE_ macros ************************************/
/* Per-location benchmark statistics; these are the fields accessed through the local_data_t
 * pointers in the smpi_sample_* functions below.
 * NOTE(review): the enclosing struct/typedef lines are not visible here. */
288 double threshold; /* maximal stderr requested (if positive) */
289 double relstderr; /* observed stderr so far */
290 double mean; /* mean of benched times, to be used if the block is disabled */
291 double sum; /* sum of benched times (to compute the mean and stderr) */
292 double sum_pow2; /* sum of the square of the benched times (to compute the stderr) */
293 int iters; /* amount of requested iterations */
294 int count; /* amount of iterations done so far */
295 int benching; /* 1: we are benchmarking; 0: we have enough data, no bench anymore */
/* Builds the `samples` dictionary key of a sampled block: either "<file>:<line>" or
 * "<file>:<line>:<process index>".
 * NOTE(review): the condition choosing between the two forms is not visible; presumably a
 * non-zero `global` selects the process-independent form -- confirm.
 * NOTE(review): the string is produced by bprintf(); presumably the caller must release it. */
298 static char *sample_location(int global, const char *file, int line) {
300 return bprintf("%s:%d", file, line);
302 return bprintf("%s:%d:%d", file, line, smpi_process_index());
/* Decides whether the block described by `data` has been benchmarked enough.
 * The verdict starts as (count >= iters). When a positive threshold was requested, it is forced
 * to 0 while relstderr is above the threshold, and also under a further condition that is not
 * visible here (its comment reads "not enough data").
 * NOTE(review): the return statement is not visible; callers use the result as a boolean. */
305 static int sample_enough_benchs(local_data_t *data) {
306 int res = data->count >= data->iters;
307 if (data->threshold>0.0) {
309 res = 0; // not enough data
310 if (data->relstderr > data->threshold)
311 res = 0; // stderr too high yet
313 XBT_DEBUG("%s (count:%d iter:%d stderr:%f thres:%f mean:%fs)",
314 (res?"enough benchs":"need more data"),
315 data->count, data->iters, data->relstderr, data->threshold, data->mean);
/* Entry point of a sampled block: accounts for the computation that preceded the block, flags
 * the process as sampling, then creates or refreshes the statistics stored under the block's key.
 *  - global, file, line: identify the block (see sample_location())
 *  - iters:     requested number of benchmark iterations
 *  - threshold: requested maximal stderr; at least one of iters/threshold must be positive
 * NOTE(review): `loc` comes from bprintf(); its release is not visible here. */
319 void smpi_sample_1(int global, const char *file, int line, int iters, double threshold)
321 char *loc = sample_location(global, file, line);
324 smpi_bench_end(); /* Take time from previous, unrelated computation into account */
325 smpi_process_set_sampling(1);
// `samples` is created lazily, with free passed as its value destructor
328 samples = xbt_dict_new_homogeneous(free);
330 data = xbt_dict_get_or_null(samples, loc);
// First visit of this location: validate the request and register fresh statistics
332 xbt_assert(threshold>0 || iters>0,
333 "You should provide either a positive amount of iterations to bench, or a positive maximal stderr (or both)");
334 data = (local_data_t *) xbt_new(local_data_t, 1);
337 data->sum_pow2 = 0.0;
339 data->threshold = threshold;
340 data->benching = 1; // If we have no data, we need at least one
342 xbt_dict_set(samples, loc, data, NULL);
343 XBT_DEBUG("XXXXX First time ever on benched nest %s.",loc);
// Known location: the settings must be the ones recorded on the first visit
345 if (data->iters != iters || data->threshold != threshold) {
346 XBT_ERROR("Asked to bench block %s with different settings %d, %f is not %d, %f. How did you manage to give two numbers at the same line??",
347 loc, data->iters, data->threshold, iters,threshold);
351 // if we already have some data, check whether sample_2 should get one more bench or whether it should emulate the computation instead
352 data->benching = !sample_enough_benchs(data);
353 XBT_DEBUG("XXXX Re-entering the benched nest %s. %s",loc, (data->benching?"more benching needed":"we have enough data, skip computes"));
/* Tells a sampled block whether its body must be run (benchmarked) once more.
 * While the statistics are in benching mode, the current figures are logged because a new bench
 * is needed. Otherwise the recorded mean is simulated through smpi_execute(), the sampling flag
 * of the process is cleared and the result is set to 0.
 * Requires `samples` to exist (asserted) and the location to be registered in it.
 * NOTE(review): the statements of the benching branch other than the log, and the return
 * statement, are not visible here. */
358 int smpi_sample_2(int global, const char *file, int line)
360 char *loc = sample_location(global, file, line);
364 xbt_assert(samples, "Y U NO use SMPI_SAMPLE_* macros? Stop messing directly with smpi_sample_* functions!");
365 data = xbt_dict_get(samples, loc);
366 XBT_DEBUG("sample2 %s",loc);
369 if (data->benching==1) {
370 // we need to run a new bench
371 XBT_DEBUG("benchmarking: count:%d iter:%d stderr:%f thres:%f; mean:%f",
372 data->count, data->iters, data->relstderr, data->threshold, data->mean);
375 // Enough data, no more bench (either we got enough data from previous visits to this benched nest, or we just ran one bench and need to bail out now that our job is done).
376 // Just sleep instead
377 XBT_DEBUG("No benchmark (either no need, or just ran one): count >= iter (%d >= %d) or stderr<thres (%f<=%f). apply the %fs delay instead",
378 data->count, data->iters, data->relstderr, data->threshold, data->mean);
379 smpi_execute(data->mean);
380 smpi_process_set_sampling(0);
381 res = 0; // prepare to capture future, unrelated computations
/* Closes one benchmarked iteration of a sampled block: stops the process timer, folds the
 * elapsed time into the statistics (sum of squares, mean, relative standard error) and logs them.
 * When more benchmarks are still needed, `mean` temporarily holds the last sample instead.
 * Requires `samples` to exist (asserted) and the location to be registered in it.
 * NOTE(review): the updates of `sum`, `count` and `benching` and the handling of the
 * `benching==0` case are on lines not visible here. */
388 void smpi_sample_3(int global, const char *file, int line)
390 char *loc = sample_location(global, file, line);
393 xbt_assert(samples, "Y U NO use SMPI_SAMPLE_* macros? Stop messing directly with smpi_sample_* functions!");
394 data = xbt_dict_get(samples, loc);
395 XBT_DEBUG("sample3 %s",loc);
398 if (data->benching==0) {
402 // ok, benchmarking this loop is over
403 xbt_os_threadtimer_stop(smpi_process_timer());
408 sample = xbt_os_timer_elapsed(smpi_process_timer());
410 data->sum_pow2 += sample * sample;
411 n = (double)data->count;
412 data->mean = data->sum / n;
// NOTE(review): this divides by data->mean (and the line above by n); a zero mean or a zero
// count is not guarded by any visible line -- confirm that neither can happen here
413 data->relstderr = sqrt((data->sum_pow2 / n - data->mean * data->mean) / n) / data->mean;
414 if (!sample_enough_benchs(data)) {
415 data->mean = sample; // Still in benching process; We want sample_2 to simulate the exact time of this loop occurrence before leaving, not the mean over the history
417 XBT_DEBUG("Average mean after %d steps is %f, relative standard error is %f (sample was %f)", data->count,
418 data->mean, data->relstderr, sample);
420 // That's enough for now, prevent sample_2 to run the same code over and over
/* Value destructor of the `allocs` dictionary (registered in smpi_shared_malloc()); `p` is the
 * shared_data_t stored in the dictionary.
 * NOTE(review): the statements releasing the entry are not visible here. */
425 static void smpi_shared_alloc_free(void *p)
427 shared_data_t *data = p;
/* Replaces the content of `loc` (after an xbt_realloc() to 30 bytes) by a short name derived
 * from `hash`: 40 hexadecimal characters are consumed six at a time, and four output characters
 * are written per group, starting at index 1 of `loc`.
 * NOTE(review): the computation of `hash`, the writes to loc[0] and to the terminator, and the
 * return statement are on lines not visible here. */
432 static char *smpi_shared_alloc_hash(char *loc)
442 loc = xbt_realloc(loc, 30);
444 for (i = 0; i < 40; i += 6) { /* base64 encode */
445 memcpy(s, hash + i, 6);
// Interpret the copied group as a hexadecimal number
446 val = strtoul(s, NULL, 16);
447 for (j = 0; j < 4; j++) {
// NOTE(review): six hex digits hold 24 bits, i.e. four 6-bit groups at shifts 18, 12, 6 and 0.
// The step of 3 used here (shifts 18, 15, 12, 9) makes the groups overlap and never reads the
// low 9 bits of val -- confirm whether "18 - 6 * j" was intended
448 unsigned char x = (val >> (18 - 3 * j)) & 0x3f;
449 loc[1 + 4 * i / 6 + j] =
// NOTE(review): this 64-character alphabet has 'Z' where 'W' is expected and 'z' where 'w' is
// expected, so two pairs of 6-bit values map to the same output character
450 "ABCDEFGHIJKLMNOPQRSTUVZXYZabcdefghijklmnopqrstuvzxyz0123456789-_"[x];
/* Allocates `size` bytes for the call site `file`:`line`.
 * When option smpi/use_shared_malloc is enabled, the memory is backed by a POSIX shared memory
 * object named after the pid, file and line (shortened by smpi_shared_alloc_hash()): the first
 * request for a site creates the object with shm_open(O_CREAT | O_EXCL), maps it through
 * shm_map(), unlinks its name right away and registers it in `allocs`; the other visible path
 * maps the already registered descriptor (data->fd) again. When the option is disabled,
 * xbt_malloc() is used instead. shm_open() failures abort through xbt_die().
 * NOTE(review): the return statement is not visible here; presumably `mem` is returned. */
457 void *smpi_shared_malloc(size_t size, const char *file, int line)
460 if (sg_cfg_get_boolean("smpi/use_shared_malloc")){
461 char *loc = bprintf("%zu_%s_%d", (size_t)getpid(), file, line);
464 loc = smpi_shared_alloc_hash(loc); /* hash loc, in order to have something
// `allocs` is created lazily; smpi_shared_alloc_free is passed as its value destructor
467 allocs = xbt_dict_new_homogeneous(smpi_shared_alloc_free);
469 data = xbt_dict_get_or_null(allocs, loc);
// O_EXCL: creation fails if an object with this name already exists
471 fd = shm_open(loc, O_RDWR | O_CREAT | O_EXCL,
472 S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
476 xbt_die("Please cleanup /dev/shm/%s", loc);
478 xbt_die("An unhandled error occured while opening %s. shm_open: %s", loc, strerror(errno));
481 data = xbt_new(shared_data_t, 1);
485 mem = shm_map(fd, size, data);
// The name is removed immediately; a failure to do so is only reported as a warning
486 if (shm_unlink(loc) < 0) {
487 XBT_WARN("Could not early unlink %s. shm_unlink: %s", loc, strerror(errno));
489 xbt_dict_set(allocs, loc, data, NULL);
490 XBT_DEBUG("Mapping %s at %p through %d", loc, mem, fd);
// Site already known: map the registered descriptor once more
493 mem = shm_map(data->fd, size, data);
496 XBT_DEBUG("Shared malloc %zu in %p (metadata at %p)", size, mem, data);
// Option disabled: plain heap allocation
498 mem = xbt_malloc(size);
499 XBT_DEBUG("Classic malloc %zu in %p", size, mem);
/* Releases memory obtained from smpi_shared_malloc().
 * When option smpi/use_shared_malloc is enabled, the metadata of `ptr` is looked up in
 * `allocs_metadata` under its "%p" rendering, the mapping is removed with munmap(), and the
 * entry is removed from `allocs` once its count is <= 0. Every inconsistency (no dictionary,
 * unknown pointer, broken metadata link, munmap() failure) is reported with XBT_WARN.
 * NOTE(review): the conditions guarding most warnings, the update of data->count and the
 * non-shared branch other than its log are on lines not visible here. */
504 void smpi_shared_free(void *ptr)
506 char loc[PTR_STRLEN];
507 shared_metadata_t* meta;
509 if (sg_cfg_get_boolean("smpi/use_shared_malloc")){
512 XBT_WARN("Cannot free: nothing was allocated");
515 if(!allocs_metadata) {
516 XBT_WARN("Cannot free: no metadata was allocated");
// Rebuild the key under which shm_map() registered this mapping
518 snprintf(loc, PTR_STRLEN, "%p", ptr);
519 meta = (shared_metadata_t*)xbt_dict_get_or_null(allocs_metadata, loc);
521 XBT_WARN("Cannot free: %p was not shared-allocated by SMPI", ptr);
526 XBT_WARN("Cannot free: something is broken in the metadata link");
// The mapping length comes from the metadata recorded for this address
529 if(munmap(ptr, meta->size) < 0) {
530 XBT_WARN("Unmapping of fd %d failed: %s", data->fd, strerror(errno));
533 XBT_DEBUG("Shared free - no removal - of %p, count = %d", ptr, data->count);
// No remaining user: drop the entry from `allocs`
534 if (data->count <= 0) {
536 xbt_dict_remove(allocs, data->loc);
537 XBT_DEBUG("Shared free - with removal - of %p", ptr);
540 XBT_DEBUG("Classic free of %p", ptr);
/* Checks whether a value is registered in `calls` under the key "<func>:<input>".
 * The lookup uses xbt_dict_get(), which either succeeds or throws; the exception category is
 * then compared with not_found_error.
 * NOTE(review): the try/catch structure, the handling of other exception categories, the
 * release of `loc` and the return statement are on lines not visible here. */
546 int smpi_shared_known_call(const char* func, const char* input)
548 char* loc = bprintf("%s:%s", func, input);
// `calls` is created lazily; NULL is passed as its value destructor
553 calls = xbt_dict_new_homogeneous(NULL);
556 xbt_dict_get(calls, loc); /* Succeed or throw */
563 if (ex.category != not_found_error)
/* Looks up the value registered in `calls` under the key "<func>:<input>" with xbt_dict_get().
 * NOTE(review): the release of `loc` and the return statement are not visible here; presumably
 * the looked-up value is returned. */
570 void* smpi_shared_get_call(const char* func, const char* input) {
571 char* loc = bprintf("%s:%s", func, input);
// `calls` is created lazily; NULL is passed as its value destructor
575 calls = xbt_dict_new_homogeneous(NULL);
577 data = xbt_dict_get(calls, loc);
/* Registers `data` in `calls` under the key "<func>:<input>".
 * NOTE(review): the release of `loc` and the return statement are not visible here. */
582 void* smpi_shared_set_call(const char* func, const char* input, void* data) {
583 char* loc = bprintf("%s:%s", func, input);
// `calls` is created lazily; NULL is passed as its value destructor
586 calls = xbt_dict_new_homogeneous(NULL);
588 xbt_dict_set(calls, loc, data, NULL);
/* Rounds `addr` down to a multiple of xbt_pagesize and yields it as a void pointer.
 * NOTE(review): the address goes through `unsigned long`, which assumes that type is wide
 * enough to hold a pointer on the target platform. */
596 #define TOPAGE(addr) (void *)(((unsigned long)(addr) / xbt_pagesize) * xbt_pagesize)
600 * - read the executable data+bss section addresses and sizes
601 * - for each process create a copy of these sections with mmap
602 * - store them in a dynar
/* Makes the private data copy of process `dest` the content of the executable's data area, by
 * mapping that process' backing file (fds[dest]) with MAP_FIXED | MAP_SHARED at the page-aligned
 * start of the area. Aborts through xbt_die() if mmap() does not return that address.
 * On the initial switch (loaded_page == -1), the current content of the area is first copied
 * into the mapping of every process.
 * NOTE(review): the early exits announced by the "no need to switch" comments and the update
 * of loaded_page are on lines not visible here. */
608 void switch_data_segment(int dest){
610 if(size_data_exe == 0)//no need to switch
613 if (loaded_page==dest)//no need to switch either
619 if(loaded_page==-1){//initial switch, do the copy from the real page here
620 for (i=0; i< SIMIX_process_count(); i++){
621 memcpy(mappings[i],TOPAGE(start_data_exe),size_data_exe);
624 int current= fds[dest];
625 XBT_VERB("Switching data frame to the one of process %d", dest);
626 void* tmp = mmap (TOPAGE(start_data_exe), size_data_exe, PROT_READ | PROT_WRITE, MAP_FIXED | MAP_SHARED, current, 0);
627 if (tmp != TOPAGE(start_data_exe))
628 xbt_die("Couldn't map the new region");
/* Computes start_data_exe and size_data_exe from the output of
 * `objdump --section-headers <xbt_binary_name>`, read through popen().
 * Each line is split on spaces into up to seven fields; header lines are recognised by their
 * first field. For the .data line, the size (field 2) and the address (field 4) are parsed as
 * hexadecimal. For the .bss line, the size is the reported size plus the gap between the end of
 * .data and the start of .bss. size_data_exe finally covers the offset of start_data_exe within
 * its page, plus both section sizes.
 * NOTE(review): the command line is built by pasting xbt_binary_name unquoted into a string
 * handed to popen(); a path with spaces or shell metacharacters would break it.
 * NOTE(review): the sizes are stored in `int` although they come from strtoul().
 * NOTE(review): the updates of `found`, the guard before reading lfields[1..4], and the
 * cleanup (pclose/free) are on lines not visible here. */
633 void smpi_get_executable_global_size(){
634 int size_bss_binary=0;
635 int size_data_binary=0;
637 char *line = NULL; /* Temporary storage for each line that is read */
638 ssize_t read; /* Number of bytes read */
639 size_t n = 0; /* Amount of bytes to read by xbt_getline */
644 char *command = bprintf("objdump --section-headers %s", xbt_binary_name);
646 fp = popen(command, "r");
649 perror("popen failed");
// Stop at end of output, or as soon as both sections have been found
653 while ((read = xbt_getline(&line, &n, fp)) != -1 && found != 2) {
658 /* Wipeout the new line character */
659 line[read - 1] = '\0';
661 lfields[0] = strtok(line, " ");
663 if(lfields[0] == NULL)
// Lines recognised as headers by their first field: "Sections:", "Idx", or the binary name
666 if(strcmp(lfields[0], "Sections:") == 0
667 || strcmp(lfields[0], "Idx") == 0
668 || strncmp(lfields[0], xbt_binary_name, strlen(xbt_binary_name)) == 0)
// Split the rest of the line; stops early when strtok() runs out of tokens
671 for (i = 1; i < 7 && lfields[i - 1] != NULL; i++) {
672 lfields[i] = strtok(NULL, " ");
676 * we are looking for these fields
677 23 .data 02625a20 00000000006013e0 00000000006013e0 000013e0 2**5
678 CONTENTS, ALLOC, LOAD, DATA
679 24 .bss 02625a40 0000000002c26e00 0000000002c26e00 02626e00 2**5
684 if(strcmp(lfields[1], ".data") == 0){
685 size_data_binary = strtoul(lfields[2], NULL, 16);
686 start_data_exe = (char*) strtoul(lfields[4], NULL, 16);
688 }else if(strcmp(lfields[1], ".bss") == 0){
689 //the beginning of bss is not exactly the end of data if not aligned, grow bss reported size accordingly
690 //TODO : check if this is OK, as some segments may be inserted between them..
691 size_bss_binary = ((char*) strtoul(lfields[4], NULL, 16) - (start_data_exe + size_data_binary))
692 + strtoul(lfields[2], NULL, 16);
// Total: offset of .data inside its page + .data size + (gap-adjusted) .bss size
700 size_data_exe =(unsigned long)start_data_exe - (unsigned long)TOPAGE(start_data_exe)+ size_data_binary+size_bss_binary;
/* Prepares one private copy of the executable's data area per process.
 * After smpi_get_executable_global_size() has located the area, the `fds` and `mappings` arrays
 * are allocated and, for each process: a temporary file is created under /dev/shm and unlinked
 * at once, it is resized to size_data_exe, an address range is reserved with an anonymous
 * PROT_NONE mapping, the file is mapped over that range (MAP_FIXED | MAP_SHARED), the current
 * content of the data area is copied into it, and the descriptor and address are recorded.
 * Every failure aborts through xbt_die().
 * NOTE(review): the arrays are sized with smpi_process_count() while the loop is bounded by
 * SIMIX_process_count(); confirm that both always agree.
 * NOTE(review): the conditions around the two resets of smpi_privatize_global_variables and
 * the checks of `status`/`tmp` are on lines not visible here. */
707 void smpi_initialize_global_memory_segments(){
710 smpi_privatize_global_variables=0;
715 smpi_get_executable_global_size();
717 XBT_DEBUG ("bss+data segment found : size %d starting at %p",size_data_exe, start_data_exe );
719 if(size_data_exe == 0){//no need to switch
720 smpi_privatize_global_variables=0;
724 fds= (int*)xbt_malloc((smpi_process_count())*sizeof(int));
725 mappings= (void**)xbt_malloc((smpi_process_count())*sizeof(void*));
728 for (i=0; i< SIMIX_process_count(); i++){
729 //create SIMIX_process_count() mappings of this size with the same data inside
730 void *address = NULL, *tmp = NULL;
// mkstemp() replaces the trailing XXXXXX in place, hence a writable array rather than a literal
731 char path[] = "/dev/shm/my-buffer-XXXXXX";
733 int file_descriptor= mkstemp (path);
734 if (file_descriptor < 0)
735 xbt_die("Impossible to create temporary file for memory mapping");
// Only the descriptor is used from here on; the name is removed immediately
736 status = unlink (path);
738 xbt_die("Impossible to unlink temporary file for memory mapping");
740 status = ftruncate(file_descriptor, size_data_exe);
742 xbt_die("Impossible to set the size of the temporary file for memory mapping");
744 /* Ask for a free region */
745 address = mmap (NULL, size_data_exe, PROT_NONE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
747 if (address == MAP_FAILED)
748 xbt_die("Couldn't find a free region for memory mapping");
// Replace the reservation by a shared mapping of the temporary file at the same address
750 tmp = mmap (address, size_data_exe, PROT_READ | PROT_WRITE, MAP_FIXED | MAP_SHARED, file_descriptor, 0);
753 xbt_die("Couldn't obtain the right address");
754 //initialize the values
755 memcpy(address,TOPAGE(start_data_exe),size_data_exe);
757 //store the address of the mapping for further switches
758 fds[i]=file_descriptor;
759 mappings[i]= address;
766 void smpi_destroy_global_memory_segments(){
767 if(size_data_exe == 0)//no need to switch
771 for (i=0; i< smpi_process_count(); i++){
772 if(munmap(mappings[i],size_data_exe) < 0) {
773 XBT_WARN("Unmapping of fd %d failed: %s", fds[i], strerror(errno));