1 /* Copyright (c) 2007, 2009-2014. The SimGrid Team.
2 * All rights reserved. */
4 /* This program is free software; you can redistribute it and/or modify it
5 * under the terms of the license (GNU LGPL) which comes with this package. */
7 #include "internal_config.h"
10 #include "xbt/sysdep.h"
13 #include "surf/surf.h"
14 #include "simgrid/sg_config.h"
20 #include <sys/types.h>
23 #include <math.h> // sqrt
28 XBT_LOG_NEW_DEFAULT_SUBCATEGORY(smpi_bench, smpi,
29 "Logging specific to SMPI (benchmarking)");
31 /* Shared allocations are handled through shared memory segments.
32 * Associated data and metadata are used as follows:
35 * `allocs' dict ---- -.
36 * ---------- shared_data_t shared_metadata_t / | | |
37 * .->| <name> | ---> -------------------- <--. ----------------- | | | |
38 * | ---------- | fd of <name> | | | size of mmap | --| | | |
39 * | | count (2) | |-- | data | \ | | |
40 * `----------------- | <name> | | ----------------- ---- |
41 * -------------------- | ^ |
43 * | | `allocs_metadata' dict |
44 * | | ---------------------- |
45 * | `-- | <addr of mmap #1> |<-'
46 * | .-- | <addr of mmap #2> |<-.
47 * | | ---------------------- |
53 * | shared_metadata_t / | |
54 * | ----------------- | | |
55 * | | size of mmap | --| | |
57 * ----------------- | | |
/* Size of the buffers that receive a pointer formatted with "%p" (see shm_map() and
 * smpi_shared_free()): 2 + two characters per pointer byte + 1 -- presumably the "0x"
 * prefix, the hex digits and the terminating NUL. */
62 #define PTR_STRLEN (2 + 2 * sizeof(void*) + 1)
/* Shared allocations: hashed allocation-site name -> shared_data_t (see smpi_shared_malloc()). */
64 xbt_dict_t allocs = NULL; /* Allocated on first use */
/* Mapping metadata: "%p" string of the mmap'ed address -> shared_metadata_t (see shm_map()). */
65 xbt_dict_t allocs_metadata = NULL; /* Allocated on first use */
/* SMPI_SAMPLE_ bookkeeping: sample location string -> local_data_t (see smpi_sample_1()). */
66 xbt_dict_t samples = NULL; /* Allocated on first use */
/* Memoized calls: "func:input" -> user data (see smpi_shared_set_call()). */
67 xbt_dict_t calls = NULL; /* Allocated on first use */
68 __thread int smpi_current_rank = 0; /* Updated after each MPI call */
/* Durations below this value are ignored by smpi_execute(). */
70 double smpi_cpu_threshold;
/* Factor by which smpi_execute() multiplies a duration to obtain an amount of flops. */
71 double smpi_running_power;
/* Start address and size of the executable's data+bss area, filled by
 * smpi_get_executable_global_size(); a size of 0 disables data-segment switching. */
76 char* start_data_exe = NULL;
77 int size_data_exe = 0;
/* Flag reset to 0 by smpi_initialize_global_memory_segments() when privatization cannot be used. */
78 int smpi_privatize_global_variables;
/* Return the size (st_size reported by fstat()) of the file behind descriptor `fd`.
 * Calls xbt_die() with the strerror() text when fstat() fails.
 * NOTE(review): the declaration of `st` and the closing braces are not visible in this listing. */
91 static size_t shm_size(int fd) {
94 if(fstat(fd, &st) < 0) {
95 xbt_die("Could not stat fd %d: %s", fd, strerror(errno));
97 return (size_t)st.st_size;
/* Map `size` bytes of the file behind `fd` as a shared, read/write mapping and register
 * a shared_metadata_t for it in `allocs_metadata`, keyed by the "%p" string of the address.
 * Any failing system call ends in xbt_die().
 * NOTE(review): the declaration of `mem`, the initialisation of `meta`'s fields (where
 * `data` is presumably linked in) and the return statement are not visible in this listing. */
101 static void* shm_map(int fd, size_t size, shared_data_t* data) {
103 char loc[PTR_STRLEN];
104 shared_metadata_t* meta;
// Grow the backing file first if it is currently smaller than the requested mapping.
106 if(size > shm_size(fd)) {
107 if(ftruncate(fd, (off_t)size) < 0) {
108 xbt_die("Could not truncate fd %d to %zu: %s", fd, size, strerror(errno));
112 mem = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
113 if(mem == MAP_FAILED) {
114 xbt_die("Could not map fd %d: %s", fd, strerror(errno));
// Lazily create the metadata dictionary; xbt_free is handed to the dict constructor
// (presumably as the destructor for stored values -- confirm against the xbt_dict API).
116 if(!allocs_metadata) {
117 allocs_metadata = xbt_dict_new_homogeneous(xbt_free);
// The textual form of the mapped address is the dictionary key.
119 snprintf(loc, PTR_STRLEN, "%p", mem);
120 meta = xbt_new(shared_metadata_t, 1);
123 xbt_dict_set(allocs_metadata, loc, meta, NULL);
124 XBT_DEBUG("MMAP %zu to %p", size, mem);
/* Release the four module-level dictionaries (allocs, allocs_metadata, samples, calls). */
129 void smpi_bench_destroy(void)
131 xbt_dict_free(&allocs);
132 xbt_dict_free(&allocs_metadata);
133 xbt_dict_free(&samples);
134 xbt_dict_free(&calls);
/* By-reference wrapper around smpi_execute_flops(): dereferences `flops` and forwards it.
 * NOTE(review): the trailing underscore and pointer argument look like a Fortran binding -- confirm. */
137 XBT_PUBLIC(void) smpi_execute_flops_(double *flops);
138 void smpi_execute_flops_(double *flops)
140 smpi_execute_flops(*flops);
/* By-reference wrapper around smpi_execute(): dereferences `duration` and forwards it.
 * NOTE(review): the trailing underscore and pointer argument look like a Fortran binding -- confirm. */
143 XBT_PUBLIC(void) smpi_execute_(double *duration);
144 void smpi_execute_(double *duration)
146 smpi_execute(*duration);
/* Start an execution named "computation" of `flops` on the calling process's host,
 * tag it with the SMPI tracing category, and wait for it to finish.
 * NOTE(review): the declarations of `host` and `action`, and whatever guards the
 * category call, are not visible in this listing. */
149 void smpi_execute_flops(double flops) {
152 host = SIMIX_host_self();
153 XBT_DEBUG("Handle real computation time: %f flops", flops);
154 action = simcall_host_execute("computation", host, flops, 1, 0, 0);
156 simcall_set_category (action, TRACE_internal_smpi_get_category());
158 simcall_host_execution_wait(action);
/* Account for `duration` of real computation in the simulation.
 * When `duration` reaches smpi_cpu_threshold it is converted to flops with
 * smpi_running_power and executed through smpi_execute_flops(), bracketed by
 * TRACE_smpi_computing_in/out; shorter durations are only logged and ignored. */
161 void smpi_execute(double duration)
163 if (duration >= smpi_cpu_threshold) {
164 XBT_DEBUG("Sleep for %g to handle real computation time", duration);
165 double flops = duration * smpi_running_power;
// Tracing record describing this computation; its size is the flop amount.
167 int rank = smpi_process_index();
168 instr_extra_data extra = xbt_new0(s_instr_extra_data_t,1);
169 extra->type=TRACING_COMPUTING;
170 extra->comp_size=flops;
171 TRACE_smpi_computing_in(rank, extra);
173 smpi_execute_flops(flops);
176 TRACE_smpi_computing_out(rank);
// Below the threshold: nothing is executed (the `else` line is not visible in this listing).
180 XBT_DEBUG("Real computation took %g while option smpi/cpu_threshold is set to %g => ignore it",
181 duration, smpi_cpu_threshold);
/* Forward declaration: switch_data_segment() is defined further down in this file. */
185 void switch_data_segment(int dest);
/* Begin timing a stretch of user code: switch to the calling process's data segment,
 * start the per-process thread timer, and record the process index in smpi_current_rank. */
187 void smpi_bench_begin(void)
189 switch_data_segment(smpi_process_index());
190 xbt_os_threadtimer_start(smpi_process_timer());
191 smpi_current_rank = smpi_process_index();
/* Stop the per-process timer and inject the elapsed time into the simulation via
 * smpi_execute(). If the process is flagged as sampling (smpi_process_get_sampling()),
 * this is treated as a nested benchmark: critical messages and a backtrace are
 * emitted and xbt_die() is called. */
194 void smpi_bench_end(void)
196 xbt_os_timer_t timer = smpi_process_timer();
197 xbt_os_threadtimer_stop(timer);
198 // switch_data_segment(smpi_process_count());
199 if (smpi_process_get_sampling()) {
200 XBT_CRITICAL("Cannot do recursive benchmarks.");
201 XBT_CRITICAL("Are you trying to make a call to MPI within a SMPI_SAMPLE_ block?");
202 xbt_backtrace_display_current();
203 xbt_die("Aborting.");
205 smpi_execute(xbt_os_timer_elapsed(timer));
/* Simulated sleep of `secs` seconds, modelled as a computation: the flop amount is
 * secs * (speed of the current host), run as a "computation" execution and waited for.
 * NOTE(review): the declaration of `action`, the surrounding bench begin/end calls (if
 * any) and the return statement are not visible in this listing. */
208 unsigned int smpi_sleep(unsigned int secs)
214 double flops = (double) secs*simcall_host_get_speed(SIMIX_host_self());
215 XBT_DEBUG("Sleep for: %f flops", flops);
216 action = simcall_host_execute("computation", SIMIX_host_self(), flops, 1, 0, 0);
218 simcall_set_category (action, TRACE_internal_smpi_get_category());
220 simcall_host_execution_wait(action);
/* Fill `tv` from the simulated clock (SIMIX_get_clock()): tv_sec gets the integer part,
 * tv_usec the fractional part scaled by 1e6.
 * NOTE(review): the two tv_usec assignments differ only in the cast type and are
 * presumably alternatives selected by preprocessor conditionals that are not visible
 * here; the declaration of `now` and the return statement are not visible either. */
226 int smpi_gettimeofday(struct timeval *tv)
230 now = SIMIX_get_clock();
232 tv->tv_sec = (time_t)now;
234 tv->tv_usec = (useconds_t)((now - tv->tv_sec) * 1e6);
236 tv->tv_usec = (suseconds_t)((now - tv->tv_sec) * 1e6);
243 extern double sg_maxmin_precision;
/* Return the timestamp resolution: 1 / sg_maxmin_precision, truncated to an
 * unsigned long long. */
244 unsigned long long smpi_rastro_resolution (void)
247 double resolution = (1/sg_maxmin_precision);
249 return (unsigned long long)resolution;
/* Return the simulated clock as an integer timestamp expressed in units of
 * 1 / smpi_rastro_resolution(): whole seconds times the resolution, plus the
 * fractional part scaled by the resolution (truncated). */
252 unsigned long long smpi_rastro_timestamp (void)
255 double now = SIMIX_get_clock();
// Integer and fractional parts are converted separately.
257 unsigned long long sec = (unsigned long long)now;
258 unsigned long long pre = (now - sec) * smpi_rastro_resolution();
260 return (unsigned long long)sec * smpi_rastro_resolution() + pre;
263 /* ****************************** Functions related to the SMPI_SAMPLE_ macros ************************************/
/* Fields of the per-sample-location record that the smpi_sample_* functions access
 * through local_data_t pointers.
 * NOTE(review): the enclosing struct/typedef lines are not visible in this listing. */
265 double threshold; /* maximal stderr requested (if positive) */
266 double relstderr; /* observed stderr so far */
267 double mean; /* mean of benched times, to be used if the block is disabled */
268 double sum; /* sum of benched times (to compute the mean and stderr) */
269 double sum_pow2; /* sum of the square of the benched times (to compute the stderr) */
270 int iters; /* amount of requested iterations */
271 int count; /* amount of iterations done so far */
272 int benching; /* 1: we are benchmarking; 0: we have enough data, no bench anymore */
/* Build the dictionary key of a sample site with bprintf(): either "file:line" or
 * "file:line:<process index>".
 * NOTE(review): the condition choosing between the two forms is not visible in this
 * listing; presumably `global` selects the key without the process index.
 * The string comes from bprintf(), so the caller presumably has to free it -- confirm. */
275 static char *sample_location(int global, const char *file, int line) {
277 return bprintf("%s:%d", file, line);
279 return bprintf("%s:%d:%d", file, line, smpi_process_index());
/* Decide whether `data` holds enough benchmark runs. The starting verdict is
 * count >= iters; when a positive threshold is set, the verdict is cleared again
 * while relstderr is still above that threshold. The verdict is also logged.
 * NOTE(review): the condition guarding the first `res = 0` and the return statement
 * are not visible in this listing. */
282 static int sample_enough_benchs(local_data_t *data) {
283 int res = data->count >= data->iters;
284 if (data->threshold>0.0) {
286 res = 0; // not enough data
287 if (data->relstderr > data->threshold)
288 res = 0; // stderr too high yet
290 XBT_DEBUG("%s (count:%d iter:%d stderr:%f thres:%f mean:%fs)",
291 (res?"enough benchs":"need more data"),
292 data->count, data->iters, data->relstderr, data->threshold, data->mean);
/* First step of an SMPI_SAMPLE_ block. Closes the running benchmark
 * (smpi_bench_end()), flags the process as sampling, and looks up -- or creates on the
 * first visit -- the local_data_t stored in `samples` under this site's location key.
 *   global, file, line: identify the sample site (see sample_location()).
 *   iters:     requested number of benchmark iterations.
 *   threshold: requested maximal stderr; at least one of iters/threshold must be positive.
 * On later visits, `benching` is recomputed from sample_enough_benchs().
 * NOTE(review): several lines (declaration of `data`, the first-use/else structure,
 * remaining field initialisations, disposal of `loc`) are not visible in this listing. */
296 void smpi_sample_1(int global, const char *file, int line, int iters, double threshold)
298 char *loc = sample_location(global, file, line);
301 smpi_bench_end(); /* Take time from previous, unrelated computation into account */
302 smpi_process_set_sampling(1);
// The samples dictionary is created with free() handed to the dict constructor.
305 samples = xbt_dict_new_homogeneous(free);
307 data = xbt_dict_get_or_null(samples, loc);
309 xbt_assert(threshold>0 || iters>0,
310 "You should provide either a positive amount of iterations to bench, or a positive maximal stderr (or both)");
311 data = (local_data_t *) xbt_new(local_data_t, 1);
314 data->sum_pow2 = 0.0;
316 data->threshold = threshold;
317 data->benching = 1; // If we have no data, we need at least one
319 xbt_dict_set(samples, loc, data, NULL);
320 XBT_DEBUG("XXXXX First time ever on benched nest %s.",loc);
// Known site: its stored settings must match the ones passed in this time.
322 if (data->iters != iters || data->threshold != threshold) {
323 XBT_ERROR("Asked to bench block %s with different settings %d, %f is not %d, %f. How did you manage to give two numbers at the same line??",
324 loc, data->iters, data->threshold, iters,threshold);
328 // if we already have some data, check whether sample_2 should get one more bench or whether it should emulate the computation instead
329 data->benching = !sample_enough_benchs(data);
330 XBT_DEBUG("XXXX Re-entering the benched nest %s. %s",loc, (data->benching?"more benching needed":"we have enough data, skip computes"));
/* Second step of an SMPI_SAMPLE_ block: tells the caller whether to run the sampled
 * code once more. While data->benching is 1 another benchmark run is needed; otherwise
 * the recorded mean is injected with smpi_execute(), the sampling flag is cleared and
 * `res` is set to 0.
 * NOTE(review): the declarations of `data` and `res`, the benching branch's remaining
 * statements and the return are not visible in this listing. */
335 int smpi_sample_2(int global, const char *file, int line)
337 char *loc = sample_location(global, file, line);
341 xbt_assert(samples, "Y U NO use SMPI_SAMPLE_* macros? Stop messing directly with smpi_sample_* functions!");
342 data = xbt_dict_get(samples, loc);
343 XBT_DEBUG("sample2 %s",loc);
346 if (data->benching==1) {
347 // we need to run a new bench
348 XBT_DEBUG("benchmarking: count:%d iter:%d stderr:%f thres:%f; mean:%f",
349 data->count, data->iters, data->relstderr, data->threshold, data->mean);
352 // Enough data, no more bench (either we got enough data from previous visits to this benched nest, or we just ran one bench and need to bail out now that our job is done).
353 // Just sleep instead
354 XBT_DEBUG("No benchmark (either no need, or just ran one): count >= iter (%d >= %d) or stderr<thres (%f<=%f). apply the %fs delay instead",
355 data->count, data->iters, data->relstderr, data->threshold, data->mean);
356 smpi_execute(data->mean);
357 smpi_process_set_sampling(0);
358 res = 0; // prepare to capture future, unrelated computations
/* Third step of an SMPI_SAMPLE_ block: stops the process timer and folds the elapsed
 * time into the site's statistics (sum of squares, mean, relative standard error).
 * While more runs are still needed, `mean` is overwritten with the latest sample so
 * that smpi_sample_2() simulates exactly this occurrence.
 * NOTE(review): the handling of the benching==0 case, the updates of `sum` and `count`,
 * the declarations of `data`, `sample` and `n`, and the final update of `benching` are
 * not visible in this listing. */
365 void smpi_sample_3(int global, const char *file, int line)
367 char *loc = sample_location(global, file, line);
370 xbt_assert(samples, "Y U NO use SMPI_SAMPLE_* macros? Stop messing directly with smpi_sample_* functions!");
371 data = xbt_dict_get(samples, loc);
372 XBT_DEBUG("sample3 %s",loc);
375 if (data->benching==0) {
379 // ok, benchmarking this loop is over
380 xbt_os_threadtimer_stop(smpi_process_timer());
385 sample = xbt_os_timer_elapsed(smpi_process_timer());
387 data->sum_pow2 += sample * sample;
388 n = (double)data->count;
389 data->mean = data->sum / n;
// relstderr = sqrt((E[x^2] - mean^2) / n) / mean.
// NOTE(review): this divides by data->mean; a zero mean would give a non-finite value.
390 data->relstderr = sqrt((data->sum_pow2 / n - data->mean * data->mean) / n) / data->mean;
391 if (!sample_enough_benchs(data)) {
392 data->mean = sample; // Still in benching process; We want sample_2 to simulate the exact time of this loop occurrence before leaving, not the mean over the history
394 XBT_DEBUG("Average mean after %d steps is %f, relative standard error is %f (sample was %f)", data->count,
395 data->mean, data->relstderr, sample);
397 // That's enough for now, prevent sample_2 to run the same code over and over
/* Callback passed to xbt_dict_new_homogeneous() for the `allocs` dictionary (see
 * smpi_shared_malloc()); `p` is treated as a shared_data_t.
 * NOTE(review): the statements that release the entry are not visible in this listing. */
402 static void smpi_shared_alloc_free(void *p)
404 shared_data_t *data = p;
/* Replace `loc` by a short name derived from a hex digest held in `hash`: `loc` is
 * reallocated to 30 bytes, and each group of 6 hex digits (24 bits) of the digest is
 * turned into 4 characters of a 64-symbol alphabet, written from loc[1] onwards.
 * NOTE(review): the computation of `hash`, the declarations of the locals, the writes
 * to loc[0] and to the terminator, and the return statement are not visible here. */
409 static char *smpi_shared_alloc_hash(char *loc)
419 loc = xbt_realloc(loc, 30);
421 for (i = 0; i < 40; i += 6) { /* base64 encode */
422 memcpy(s, hash + i, 6);
423 val = strtoul(s, NULL, 16);
424 for (j = 0; j < 4; j++) {
// NOTE(review): the shift steps by 3 bits (18, 15, 12, 9), so consecutive 6-bit groups
// overlap and the low 9 bits of `val` are never used; a base64-style split of 24 bits
// would shift by 18 - 6 * j.
425 unsigned char x = (val >> (18 - 3 * j)) & 0x3f;
426 loc[1 + 4 * i / 6 + j] =
// NOTE(review): the alphabet below has 'Z' and 'z' twice and lacks 'W' and 'w', so
// distinct 6-bit values can map to the same character.
427 "ABCDEFGHIJKLMNOPQRSTUVZXYZabcdefghijklmnopqrstuvzxyz0123456789-_"[x];
/* Allocate `size` bytes for the allocation site (file, line).
 * With option "smpi/use_shared_malloc" enabled, every call from the same site in this
 * process is backed by one POSIX shared-memory object: the site name (pid_file_line) is
 * hashed, looked up in `allocs`, created with shm_open() on first use, and mapped with
 * shm_map(). With the option disabled, xbt_malloc() is used instead.
 * NOTE(review): declarations, the first-use/else structure, reference counting and the
 * return statement are not visible in this listing. */
434 void *smpi_shared_malloc(size_t size, const char *file, int line)
437 if (sg_cfg_get_boolean("smpi/use_shared_malloc")){
438 char *loc = bprintf("%zu_%s_%d", (size_t)getpid(), file, line);
441 loc = smpi_shared_alloc_hash(loc); /* hash loc, in order to have something
444 allocs = xbt_dict_new_homogeneous(smpi_shared_alloc_free);
446 data = xbt_dict_get_or_null(allocs, loc);
// O_EXCL makes shm_open() fail if an object with this name already exists.
448 fd = shm_open(loc, O_RDWR | O_CREAT | O_EXCL,
449 S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
453 xbt_die("Please cleanup /dev/shm/%s", loc);
455 xbt_die("An unhandled error occured while opening %s. shm_open: %s", loc, strerror(errno));
458 data = xbt_new(shared_data_t, 1);
462 mem = shm_map(fd, size, data);
// The name is unlinked right after mapping; a failure here only produces a warning.
463 if (shm_unlink(loc) < 0) {
464 XBT_WARN("Could not early unlink %s. shm_unlink: %s", loc, strerror(errno));
466 xbt_dict_set(allocs, loc, data, NULL);
467 XBT_DEBUG("Mapping %s at %p through %d", loc, mem, fd);
// Known site: map the already-open descriptor again.
470 mem = shm_map(data->fd, size, data);
473 XBT_DEBUG("Shared malloc %zu in %p (metadata at %p)", size, mem, data);
475 mem = xbt_malloc(size);
476 XBT_DEBUG("Classic malloc %zu in %p", size, mem);
/* Release a pointer obtained from smpi_shared_malloc().
 * With option "smpi/use_shared_malloc" enabled, the mapping's metadata is looked up in
 * `allocs_metadata` by the "%p" string of `ptr`, the region is unmapped with the
 * recorded size, and the `allocs` entry is removed once its count is <= 0. Every
 * inconsistency found on the way only produces a warning.
 * NOTE(review): the conditions guarding the warnings, the declaration of `data`, the
 * count decrement, the early returns and the non-shared free call are not visible here. */
481 void smpi_shared_free(void *ptr)
483 char loc[PTR_STRLEN];
484 shared_metadata_t* meta;
486 if (sg_cfg_get_boolean("smpi/use_shared_malloc")){
489 XBT_WARN("Cannot free: nothing was allocated");
492 if(!allocs_metadata) {
493 XBT_WARN("Cannot free: no metadata was allocated");
// Same key format as the one used by shm_map() when the mapping was registered.
495 snprintf(loc, PTR_STRLEN, "%p", ptr);
496 meta = (shared_metadata_t*)xbt_dict_get_or_null(allocs_metadata, loc);
498 XBT_WARN("Cannot free: %p was not shared-allocated by SMPI", ptr);
503 XBT_WARN("Cannot free: something is broken in the metadata link");
506 if(munmap(ptr, meta->size) < 0) {
507 XBT_WARN("Unmapping of fd %d failed: %s", data->fd, strerror(errno));
510 XBT_DEBUG("Shared free - no removal - of %p, count = %d", ptr, data->count);
511 if (data->count <= 0) {
513 xbt_dict_remove(allocs, data->loc);
514 XBT_DEBUG("Shared free - with removal - of %p", ptr);
517 XBT_DEBUG("Classic free of %p", ptr);
/* Check whether a result is recorded in `calls` under the key "func:input".
 * The lookup uses xbt_dict_get(), which throws when the key is missing; exceptions of
 * a category other than not_found_error are singled out (presumably re-thrown).
 * NOTE(review): the try/catch structure, the dictionary's first-use guard, the
 * disposal of `loc` and the return value are not visible in this listing. */
523 int smpi_shared_known_call(const char* func, const char* input)
525 char* loc = bprintf("%s:%s", func, input);
530 calls = xbt_dict_new_homogeneous(NULL);
533 xbt_dict_get(calls, loc); /* Succeed or throw */
540 if (ex.category != not_found_error)
/* Fetch the data recorded in `calls` under the key "func:input" with xbt_dict_get().
 * NOTE(review): the dictionary's first-use guard, the declaration of `data`, the
 * disposal of `loc` and the return statement are not visible in this listing. */
547 void* smpi_shared_get_call(const char* func, const char* input) {
548 char* loc = bprintf("%s:%s", func, input);
552 calls = xbt_dict_new_homogeneous(NULL);
554 data = xbt_dict_get(calls, loc);
/* Record `data` in `calls` under the key "func:input".
 * The dictionary is created with a NULL argument to xbt_dict_new_homogeneous(), so the
 * stored pointers are presumably not freed by the dictionary -- confirm.
 * NOTE(review): the first-use guard, the disposal of `loc` and the return statement
 * are not visible in this listing. */
559 void* smpi_shared_set_call(const char* func, const char* input, void* data) {
560 char* loc = bprintf("%s:%s", func, input);
563 calls = xbt_dict_new_homogeneous(NULL);
565 xbt_dict_set(calls, loc, data, NULL);
/* Round `addr` down to a multiple of xbt_pagesize and yield it as a void pointer.
 * The address goes through unsigned long, which assumes that type is wide enough to
 * hold a pointer on the target platform. */
574 #define TOPAGE(addr) (void *)(((unsigned long)(addr) / xbt_pagesize) * xbt_pagesize)
578 * - read the executable data+bss section addresses and sizes
579 * - for each process create a copy of these sections with mmap
580 * - store the file descriptors and addresses of these copies in the fds and mappings arrays
/* Make the data segment of process `dest` the one visible at the executable's
 * data+bss address: the file behind fds[dest] is mapped with MAP_FIXED | MAP_SHARED
 * over the page-aligned start of that area, for size_data_exe bytes.
 * Nothing is remapped when size_data_exe is 0 or when `dest` is already the loaded page.
 * NOTE(review): the early returns and the update of `loaded_page` are not visible in
 * this listing. */
586 void switch_data_segment(int dest){
588 if(size_data_exe == 0)//no need to switch
591 if (loaded_page==dest)//no need to switch either
594 int current= fds[dest];
595 XBT_VERB("Switching data frame to the one of process %d", dest);
596 void* tmp = mmap (TOPAGE(start_data_exe), size_data_exe, PROT_READ | PROT_WRITE, MAP_FIXED | MAP_SHARED, current, 0);
// NOTE(review): msync() runs before the mmap() result is checked, and its own return
// value is ignored.
597 msync(TOPAGE(start_data_exe), size_data_exe, MS_SYNC | MS_INVALIDATE );
// A failed mmap() (MAP_FAILED) also ends up here, since it differs from the requested address.
598 if (tmp != TOPAGE(start_data_exe))
599 xbt_die("Couldn't map the new region");
/* Determine where the executable's .data and .bss sections live by parsing the output
 * of `objdump --section-headers <binary>`, and publish the result in start_data_exe and
 * size_data_exe. Each output line is split on spaces; in the rows of interest field 1
 * is the section name, field 2 its size and field 4 its address (both hexadecimal).
 * NOTE(review): several lines (declarations of fp/lfields/i/found, the `continue`
 * statements, updates of `found`, cleanup of line/command/fp) are not visible here. */
604 void smpi_get_executable_global_size(){
// NOTE(review): both sizes are ints but receive strtoul() results below, so large
// sections would be truncated.
605 int size_bss_binary=0;
606 int size_data_binary=0;
608 char *line = NULL; /* Temporary storage for each line that is read */
609 ssize_t read; /* Number of bytes read */
610 size_t n = 0; /* Amount of bytes to read by xbt_getline */
615 char *command = bprintf("objdump --section-headers %s", xbt_binary_name);
617 fp = popen(command, "r");
620 perror("popen failed");
624 while ((read = xbt_getline(&line, &n, fp)) != -1 && found != 2) {
629 /* Wipe out the newline character */
// NOTE(review): this overwrites the last character unconditionally, assuming every
// line read ends with a newline.
630 line[read - 1] = '\0';
632 lfields[0] = strtok(line, " ");
634 if(lfields[0] == NULL)
// Header lines of the objdump output carry no section data.
637 if(strcmp(lfields[0], "Sections:") == 0
638 || strcmp(lfields[0], "Idx") == 0
639 || strncmp(lfields[0], xbt_binary_name, strlen(xbt_binary_name)) == 0)
// Collect up to seven space-separated fields, stopping at the first missing one.
642 for (i = 1; i < 7 && lfields[i - 1] != NULL; i++) {
643 lfields[i] = strtok(NULL, " ");
647 * we are looking for these fields
648 23 .data 02625a20 00000000006013e0 00000000006013e0 000013e0 2**5
649 CONTENTS, ALLOC, LOAD, DATA
650 24 .bss 02625a40 0000000002c26e00 0000000002c26e00 02626e00 2**5
655 if(strcmp(lfields[1], ".data") == 0){
656 size_data_binary = strtoul(lfields[2], NULL, 16);
657 start_data_exe = (char*) strtoul(lfields[4], NULL, 16);
659 }else if(strcmp(lfields[1], ".bss") == 0){
660 //the beginning of bss is not exactly the end of data if not aligned, grow bss reported size accordingly
661 //TODO : check if this is OK, as some segments may be inserted between them..
662 size_bss_binary = ((char*) strtoul(lfields[4], NULL, 16) - (start_data_exe + size_data_binary))
663 + strtoul(lfields[2], NULL, 16);
// Total size = offset of .data inside its first page + .data size + (grown) .bss size.
671 size_data_exe =(unsigned long)start_data_exe - (unsigned long)TOPAGE(start_data_exe)+ size_data_binary+size_bss_binary;
/* Create one private copy of the executable's data+bss area per process.
 * After smpi_get_executable_global_size() has located the area, each loop iteration
 * creates an unlinked temporary file under /dev/shm, sizes it to size_data_exe, maps it
 * shared at a free address, copies the current contents of the area into it, and
 * records the descriptor and address in fds[] / mappings[].
 * Privatization is switched off (smpi_privatize_global_variables = 0) when the area
 * has size 0.
 * NOTE(review): preprocessor conditionals, declarations of i/status, the status and
 * address checks and the early returns are not visible in this listing. */
678 void smpi_initialize_global_memory_segments(){
681 smpi_privatize_global_variables=0;
686 smpi_get_executable_global_size();
688 XBT_DEBUG ("bss+data segment found : size %d starting at %p",size_data_exe, start_data_exe );
690 if(size_data_exe == 0){//no need to switch
691 smpi_privatize_global_variables=0;
695 fds= (int*)xbt_malloc((smpi_process_count())*sizeof(int));
696 mappings= (void**)xbt_malloc((smpi_process_count())*sizeof(void*));
// NOTE(review): the arrays above are sized with smpi_process_count() but this loop is
// bounded by SIMIX_process_count(); if the latter can be larger, fds[]/mappings[] are
// written out of bounds. smpi_destroy_global_memory_segments() iterates with
// smpi_process_count().
699 for (i=0; i< SIMIX_process_count(); i++){
700 //create SIMIX_process_count() mappings of this size with the same data inside
701 void *address = NULL, *tmp = NULL;
702 char path[] = "/dev/shm/my-buffer-XXXXXX";
704 int file_descriptor= mkstemp (path);
705 if (file_descriptor < 0)
706 xbt_die("Impossible to create temporary file for memory mapping");
// The name is removed immediately; the file stays reachable through its descriptor.
707 status = unlink (path);
709 xbt_die("Impossible to unlink temporary file for memory mapping");
711 status = ftruncate(file_descriptor, size_data_exe);
713 xbt_die("Impossible to set the size of the temporary file for memory mapping");
715 /* Ask for a free region */
716 address = mmap (NULL, size_data_exe, PROT_NONE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
718 if (address == MAP_FAILED)
719 xbt_die("Couldn't find a free region for memory mapping");
// Replace the anonymous placeholder mapping by the file mapping at the same address.
721 tmp = mmap (address, size_data_exe, PROT_READ | PROT_WRITE, MAP_FIXED | MAP_SHARED, file_descriptor, 0);
724 xbt_die("Couldn't obtain the right address");
725 //initialize the values
726 memcpy(address,TOPAGE(start_data_exe),size_data_exe);
728 //store the address of the mapping for further switches
729 fds[i]=file_descriptor;
730 mappings[i]= address;
/* Unmap the per-process copies recorded in mappings[]; a failing munmap() only
 * produces a warning. Nothing is unmapped when size_data_exe is 0.
 * NOTE(review): the early return, the declaration of `i` and everything after the
 * warning (closing of descriptors, freeing of the arrays, if any) are not visible in
 * this listing. */
737 void smpi_destroy_global_memory_segments(){
738 if(size_data_exe == 0)//no need to switch
742 for (i=0; i< smpi_process_count(); i++){
743 if(munmap(mappings[i],size_data_exe) < 0) {
744 XBT_WARN("Unmapping of fd %d failed: %s", fds[i], strerror(errno));