1 /* Copyright (c) 2007, 2009-2014. The SimGrid Team.
2 * All rights reserved. */
4 /* This program is free software; you can redistribute it and/or modify it
5 * under the terms of the license (GNU LGPL) which comes with this package. */
7 #include "internal_config.h"
10 #include "xbt/sysdep.h"
13 #include "surf/surf.h"
14 #include "simgrid/sg_config.h"
20 #include <sys/types.h>
23 #include <math.h> // sqrt
/* Declare this file's default logging subcategory: `smpi_bench', registered under `smpi'. */
28 XBT_LOG_NEW_DEFAULT_SUBCATEGORY(smpi_bench, smpi,
29 "Logging specific to SMPI (benchmarking)");
31 /* Shared allocations are handled through shared memory segments.
32 * Associated data and metadata are used as follows:
35 * `allocs' dict ---- -.
36 * ---------- shared_data_t shared_metadata_t / | | |
37 * .->| <name> | ---> -------------------- <--. ----------------- | | | |
38 * | ---------- | fd of <name> | | | size of mmap | --| | | |
39 * | | count (2) | |-- | data | \ | | |
40 * `----------------- | <name> | | ----------------- ---- |
41 * -------------------- | ^ |
43 * | | `allocs_metadata' dict |
44 * | | ---------------------- |
45 * | `-- | <addr of mmap #1> |<-'
46 * | .-- | <addr of mmap #2> |<-.
47 * | | ---------------------- |
53 * | shared_metadata_t / | |
54 * | ----------------- | | |
55 * | | size of mmap | --| | |
57 * ----------------- | | |
/* Size of the buffers that receive a pointer printed with "%p" (see shm_map and smpi_shared_free).
 * Presumably "0x" + two hex digits per pointer byte + terminating NUL -- confirm for the target libc. */
62 #define PTR_STRLEN (2 + 2 * sizeof(void*) + 1)
/* Shared allocations, keyed by the hashed location string built in smpi_shared_malloc. */
64 xbt_dict_t allocs = NULL; /* Allocated on first use */
/* Per-mapping metadata, keyed by the "%p" rendering of the mapped address (see shm_map). */
65 xbt_dict_t allocs_metadata = NULL; /* Allocated on first use */
/* Benchmarking state of the sampled blocks, keyed by the string returned by sample_location. */
66 xbt_dict_t samples = NULL; /* Allocated on first use */
/* Values stored by smpi_shared_set_call, keyed by "<func>:<input>". */
67 xbt_dict_t calls = NULL; /* Allocated on first use */
68 __thread int smpi_current_rank = 0; /* Updated after each MPI call */
/* Durations below this threshold are not injected by smpi_execute. */
70 double smpi_cpu_threshold;
/* Factor used by smpi_execute to convert a duration into flops. */
71 double smpi_running_power;
/* Start address and size of the data segment region, both set by smpi_get_executable_global_size. */
76 char* start_data_exe = NULL;
77 int size_data_exe = 0;
/* Reset to 0 by smpi_initialize_global_memory_segments when nothing has to be switched. */
78 int smpi_privatize_global_variables;
/* Return the current size, in bytes, of the object referred to by `fd'.
 * Aborts through xbt_die() when fstat() fails. */
static size_t shm_size(int fd) {
  struct stat info;
  const int rc = fstat(fd, &info);

  if (rc < 0) {
    xbt_die("Could not stat fd %d: %s", fd, strerror(errno));
  }
  return (size_t)info.st_size;
}
/* Map `size' bytes of the object behind `fd' as shared, readable and writable memory.
 * The object is first grown with ftruncate() when shm_size(fd) reports less than `size'.
 * A freshly allocated shared_metadata_t is then registered in `allocs_metadata' under the
 * "%p" rendering of the mapped address.
 * Aborts through xbt_die() when ftruncate() or mmap() fails. */
101 static void* shm_map(int fd, size_t size, shared_data_t* data) {
103 char loc[PTR_STRLEN];
104 shared_metadata_t* meta;
106 if(size > shm_size(fd)) {
107 if(ftruncate(fd, (off_t)size) < 0) {
108 xbt_die("Could not truncate fd %d to %zu: %s", fd, size, strerror(errno));
112 mem = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
113 if(mem == MAP_FAILED) {
114 xbt_die("Could not map fd %d: %s", fd, strerror(errno));
/* The metadata dict is created on first use; xbt_free is its per-entry free function. */
116 if(!allocs_metadata) {
117 allocs_metadata = xbt_dict_new_homogeneous(xbt_free);
/* The dict key is the textual form of the mapping address, as rebuilt by smpi_shared_free. */
119 snprintf(loc, PTR_STRLEN, "%p", mem);
120 meta = xbt_new(shared_metadata_t, 1);
123 xbt_dict_set(allocs_metadata, loc, meta, NULL);
124 XBT_DEBUG("MMAP %zu to %p", size, mem);
129 void smpi_bench_destroy(void)
131 xbt_dict_free(&allocs);
132 xbt_dict_free(&allocs_metadata);
133 xbt_dict_free(&samples);
134 xbt_dict_free(&calls);
137 XBT_PUBLIC(void) smpi_execute_flops_(double *flops);
138 void smpi_execute_flops_(double *flops)
140 smpi_execute_flops(*flops);
143 XBT_PUBLIC(void) smpi_execute_(double *duration);
144 void smpi_execute_(double *duration)
146 smpi_execute(*duration);
/* Run a "computation" execution of `flops' on the host of the calling process
 * and block until that execution has completed. */
149 void smpi_execute_flops(double flops) {
152 host = SIMIX_host_self();
153 XBT_DEBUG("Handle real computation time: %f flops", flops);
154 action = simcall_host_execute("computation", host, flops, 1, 0, 0);
/* Attach the category returned by TRACE_internal_smpi_get_category() to the action. */
156 simcall_set_category (action, TRACE_internal_smpi_get_category());
158 simcall_host_execution_wait(action);
/* Inject `duration' of benchmarked computation into the simulation.
 * When `duration' reaches smpi_cpu_threshold it is converted to flops
 * (duration * smpi_running_power) and passed to smpi_execute_flops(), between a
 * TRACE_smpi_computing_in() and a TRACE_smpi_computing_out() call for the current rank.
 * The last debug message reports a duration that the threshold causes to be ignored. */
161 void smpi_execute(double duration)
163 if (duration >= smpi_cpu_threshold) {
164 XBT_DEBUG("Sleep for %g to handle real computation time", duration);
165 double flops = duration * smpi_running_power;
/* Describe the computation (type and size in flops) for the tracing layer. */
167 int rank = smpi_process_index();
168 instr_extra_data extra = xbt_new0(s_instr_extra_data_t,1);
169 extra->type=TRACING_COMPUTING;
170 extra->comp_size=flops;
171 TRACE_smpi_computing_in(rank, extra);
173 smpi_execute_flops(flops);
176 TRACE_smpi_computing_out(rank);
180 XBT_DEBUG("Real computation took %g while option smpi/cpu_threshold is set to %g => ignore it",
181 duration, smpi_cpu_threshold);
/* Forward declaration: switch_data_segment is defined further down in this file. */
185 void switch_data_segment(int dest);
/* Start benchmarking the calling process: switch to the data segment of its index,
 * start its thread timer, then record its index in smpi_current_rank. */
187 void smpi_bench_begin(void)
189 switch_data_segment(smpi_process_index());
190 xbt_os_threadtimer_start(smpi_process_timer());
191 smpi_current_rank = smpi_process_index();
194 void smpi_bench_end(void)
196 xbt_os_timer_t timer = smpi_process_timer();
197 xbt_os_threadtimer_stop(timer);
198 // switch_data_segment(smpi_process_count());
199 if (smpi_process_get_sampling()) {
200 XBT_CRITICAL("Cannot do recursive benchmarks.");
201 XBT_CRITICAL("Are you trying to make a call to MPI within a SMPI_SAMPLE_ block?");
202 xbt_backtrace_display_current();
203 xbt_die("Aborting.");
205 smpi_execute(xbt_os_timer_elapsed(timer));
/* Simulate a sleep of `secs' seconds: the duration is turned into flops by multiplying it
 * with the speed of the current host, then run as a "computation" execution that is waited for. */
208 unsigned int smpi_sleep(unsigned int secs)
214 double flops = (double) secs*simcall_host_get_speed(SIMIX_host_self());
215 XBT_DEBUG("Sleep for: %f flops", flops);
216 action = simcall_host_execute("computation", SIMIX_host_self(), flops, 1, 0, 0);
/* Attach the category returned by TRACE_internal_smpi_get_category() to the action. */
218 simcall_set_category (action, TRACE_internal_smpi_get_category());
220 simcall_host_execution_wait(action);
/* Fill `tv' from the simulated clock (SIMIX_get_clock): the whole seconds go to tv_sec,
 * the remaining fraction, scaled by 1e6, goes to tv_usec. */
226 int smpi_gettimeofday(struct timeval *tv)
230 now = SIMIX_get_clock();
232 tv->tv_sec = (time_t)now;
/* NOTE(review): the two assignments below differ only by the cast type (useconds_t vs
 * suseconds_t); presumably they are alternatives picked by a preprocessor conditional -- confirm. */
234 tv->tv_usec = (useconds_t)((now - tv->tv_sec) * 1e6);
236 tv->tv_usec = (suseconds_t)((now - tv->tv_sec) * 1e6);
243 extern double sg_maxmin_precision;
/* Return the timestamp resolution: 1/sg_maxmin_precision truncated to an unsigned long long. */
244 unsigned long long smpi_rastro_resolution (void)
247 double resolution = (1/sg_maxmin_precision);
249 return (unsigned long long)resolution;
/* Return the simulated clock expressed in units of 1/smpi_rastro_resolution() seconds:
 * the whole seconds multiplied by the resolution, plus the fractional part scaled by it. */
252 unsigned long long smpi_rastro_timestamp (void)
255 double now = SIMIX_get_clock();
257 unsigned long long sec = (unsigned long long)now;
/* Fractional part of `now', converted to resolution units (truncated by the assignment). */
258 unsigned long long pre = (now - sec) * smpi_rastro_resolution();
260 return (unsigned long long)sec * smpi_rastro_resolution() + pre;
263 /* ****************************** Functions related to the SMPI_SAMPLE_ macros ************************************/
/* Benchmarking state kept for one sampled block; these fields are accessed through
 * local_data_t pointers by the smpi_sample_* functions. */
265 double threshold; /* maximal relative standard error requested (only enforced when positive) */
266 double relstderr; /* relative standard error observed so far (computed in smpi_sample_3) */
267 double mean; /* mean of benched times, to be used if the block is disabled */
268 double sum; /* sum of benched times (to compute the mean and stderr) */
269 double sum_pow2; /* sum of the square of the benched times (to compute the stderr) */
270 int iters; /* amount of requested iterations */
271 int count; /* amount of iterations done so far */
272 int benching; /* 1: we are benchmarking; 0: we have enough data, no bench anymore */
/* Build the dictionary key identifying a sampled block.
 * A global block is keyed by "<file>:<line>"; otherwise the index of the calling
 * process is appended, giving "<file>:<line>:<index>".
 * The string comes from bprintf() and belongs to the caller. */
static char *sample_location(int global, const char *file, int line) {
  return global
      ? bprintf("%s:%d", file, line)
      : bprintf("%s:%d:%d", file, line, smpi_process_index());
}
/* Decide whether a sampled block has been benchmarked enough, and log the decision.
 * The starting point is `count >= iters'. When a positive threshold is set, the answer is
 * also forced to 0 while relstderr is above that threshold, and in a "not enough data"
 * case whose guarding condition is not visible in this excerpt. */
282 static int sample_enough_benchs(local_data_t *data) {
283 int res = data->count >= data->iters;
284 if (data->threshold>0.0) {
286 res = 0; // not enough data
287 if (data->relstderr > data->threshold)
288 res = 0; // stderr too high yet
290 XBT_DEBUG("%s (count:%d iter:%d stderr:%f thres:%f mean:%fs)",
291 (res?"enough benchs":"need more data"),
292 data->count, data->iters, data->relstderr, data->threshold, data->mean);
/* First step of a sampled block: account for the computation that preceded it, flag the
 * process as sampling, then fetch or create the benchmarking state stored in `samples'
 * under the key given by sample_location().
 * A new state requires a positive `iters' or a positive `threshold' (asserted).
 * An existing state is compared with the requested settings (a mismatch is logged as an
 * error) and its `benching' flag is recomputed from sample_enough_benchs(). */
296 void smpi_sample_1(int global, const char *file, int line, int iters, double threshold)
298 char *loc = sample_location(global, file, line);
301 smpi_bench_end(); /* Take time from previous, unrelated computation into account */
302 smpi_process_set_sampling(1);
/* The samples dict uses free() as its per-entry free function. */
305 samples = xbt_dict_new_homogeneous(free);
307 data = xbt_dict_get_or_null(samples, loc);
309 xbt_assert(threshold>0 || iters>0,
310 "You should provide either a positive amount of iterations to bench, or a positive maximal stderr (or both)");
311 data = (local_data_t *) xbt_new(local_data_t, 1);
314 data->sum_pow2 = 0.0;
316 data->threshold = threshold;
317 data->benching = 1; // If we have no data, we need at least one
319 xbt_dict_set(samples, loc, data, NULL);
320 XBT_DEBUG("XXXXX First time ever on benched nest %s.",loc);
322 if (data->iters != iters || data->threshold != threshold) {
323 XBT_ERROR("Asked to bench block %s with different settings %d, %f is not %d, %f. How did you manage to give two numbers at the same line??",
324 loc, data->iters, data->threshold, iters,threshold);
328 // if we already have some data, check whether sample_2 should get one more bench or whether it should emulate the computation instead
329 data->benching = !sample_enough_benchs(data);
330 XBT_DEBUG("XXXX Re-entering the benched nest %s. %s",loc, (data->benching?"more benching needed":"we have enough data, skip computes"));
/* Second step of a sampled block: look up the state registered by smpi_sample_1 (the lookup
 * uses xbt_dict_get, so the entry is expected to exist) and act on its `benching' flag.
 * While benching is 1 a new benchmark run is announced. Otherwise the recorded mean is
 * injected with smpi_execute(), the sampling flag of the process is cleared and `res' is set to 0. */
335 int smpi_sample_2(int global, const char *file, int line)
337 char *loc = sample_location(global, file, line);
341 xbt_assert(samples, "Y U NO use SMPI_SAMPLE_* macros? Stop messing directly with smpi_sample_* functions!");
342 data = xbt_dict_get(samples, loc);
343 XBT_DEBUG("sample2 %s",loc);
346 if (data->benching==1) {
347 // we need to run a new bench
348 XBT_DEBUG("benchmarking: count:%d iter:%d stderr:%f thres:%f; mean:%f",
349 data->count, data->iters, data->relstderr, data->threshold, data->mean);
352 // Enough data, no more bench (either we got enough data from previous visits to this benched nest, or we just ran one bench and need to bail out now that our job is done).
353 // Just sleep instead
354 XBT_DEBUG("No benchmark (either no need, or just ran one): count >= iter (%d >= %d) or stderr<thres (%f<=%f). apply the %fs delay instead",
355 data->count, data->iters, data->relstderr, data->threshold, data->mean);
356 smpi_execute(data->mean);
357 smpi_process_set_sampling(0);
358 res = 0; // prepare to capture future, unrelated computations
/* Third step of a sampled block: stop the process timer, fold the elapsed time into the
 * state registered by smpi_sample_1, and refresh the mean and the relative standard error.
 * When sample_enough_benchs() still answers no, `mean' is overwritten with the last sample. */
365 void smpi_sample_3(int global, const char *file, int line)
367 char *loc = sample_location(global, file, line);
370 xbt_assert(samples, "Y U NO use SMPI_SAMPLE_* macros? Stop messing directly with smpi_sample_* functions!");
371 data = xbt_dict_get(samples, loc);
372 XBT_DEBUG("sample3 %s",loc);
375 if (data->benching==0) {
379 // ok, benchmarking this loop is over
380 xbt_os_threadtimer_stop(smpi_process_timer());
385 sample = xbt_os_timer_elapsed(smpi_process_timer());
387 data->sum_pow2 += sample * sample;
388 n = (double)data->count;
389 data->mean = data->sum / n;
/* relstderr = sqrt(variance / n) / mean, with variance = sum_pow2 / n - mean * mean.
 * NOTE(review): a zero mean makes this division yield a non-finite value -- confirm it cannot happen. */
390 data->relstderr = sqrt((data->sum_pow2 / n - data->mean * data->mean) / n) / data->mean;
391 if (!sample_enough_benchs(data)) {
392 data->mean = sample; // Still in benching process; We want sample_2 to simulate the exact time of this loop occurrence before leaving, not the mean over the history
394 XBT_DEBUG("Average mean after %d steps is %f, relative standard error is %f (sample was %f)", data->count,
395 data->mean, data->relstderr, sample);
397 // That's enough for now, prevent sample_2 to run the same code over and over
/* Per-entry free function of the `allocs' dict (installed in smpi_shared_malloc).
 * `p' is the shared_data_t stored for one shared allocation. */
402 static void smpi_shared_alloc_free(void *p)
404 shared_data_t *data = p;
/* Turn `loc' into a short, fixed-length name for a shared memory object.
 *
 * The 40 hexadecimal digits of the digest of `loc' are consumed six at a time (24 bits);
 * each group is rendered as four characters of the URL-safe base64 alphabet. The result
 * is '/' followed by 28 such characters.
 *
 * `loc' must be heap-allocated: it is resized to 30 bytes and overwritten in place.
 * Returns the (possibly moved) buffer, which the caller owns.
 *
 * Fixes over the previous version:
 *  - the alphabet listed 'Z' and 'z' twice and lacked 'W' and 'w', so distinct 6-bit
 *    values could map to the same character;
 *  - the shift was `18 - 3 * j', which extracted overlapping 6-bit windows and dropped
 *    the 9 low bits of every group; the four digits of a 24-bit group sit at shifts
 *    18, 12, 6 and 0.
 */
static char *smpi_shared_alloc_hash(char *loc)
{
  static const char alphabet[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_";
  char hash[42];
  char s[7];
  unsigned val;
  int i, j;

  xbt_sha(loc, hash);
  hash[41] = '\0';              /* the last group reads up to hash[41] */
  s[6] = '\0';                  /* s always holds at most 6 hex digits */
  loc = xbt_realloc(loc, 30);   /* '/' + 7 groups * 4 characters + NUL */
  loc[0] = '/';
  for (i = 0; i < 40; i += 6) { /* base64 encode */
    memcpy(s, hash + i, 6);
    val = strtoul(s, NULL, 16);
    for (j = 0; j < 4; j++) {
      unsigned char x = (val >> (18 - 6 * j)) & 0x3f;
      loc[1 + 4 * i / 6 + j] = alphabet[x];
    }
  }
  loc[29] = '\0';
  return loc;
}
/* Allocate `size' bytes for the call site (`file', `line').
 * With the option smpi/use_shared_malloc, the memory is a mapping of a shared memory object
 * named after the hashed "<pid>_<file>_<line>" string. The first request for a site creates
 * the object with shm_open() (exclusive creation), maps it, and unlinks the name right away;
 * a site already present in `allocs' gets a new mapping of the recorded descriptor.
 * Without the option, this is a plain xbt_malloc(). */
434 void *smpi_shared_malloc(size_t size, const char *file, int line)
437 if (sg_cfg_get_boolean("smpi/use_shared_malloc")){
438 char *loc = bprintf("%zu_%s_%d", (size_t)getpid(), file, line);
441 loc = smpi_shared_alloc_hash(loc); /* hash loc, in order to have something
/* The allocs dict releases its entries with smpi_shared_alloc_free. */
444 allocs = xbt_dict_new_homogeneous(smpi_shared_alloc_free);
446 data = xbt_dict_get_or_null(allocs, loc);
448 fd = shm_open(loc, O_RDWR | O_CREAT | O_EXCL,
449 S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
453 xbt_die("Please cleanup /dev/shm/%s", loc);
455 xbt_die("An unhandled error occured while opening %s. shm_open: %s", loc, strerror(errno));
458 data = xbt_new(shared_data_t, 1);
462 mem = shm_map(fd, size, data);
/* The name is removed immediately; a failure to do so is only reported as a warning. */
463 if (shm_unlink(loc) < 0) {
464 XBT_WARN("Could not early unlink %s. shm_unlink: %s", loc, strerror(errno));
466 xbt_dict_set(allocs, loc, data, NULL);
467 XBT_DEBUG("Mapping %s at %p through %d", loc, mem, fd);
470 mem = shm_map(data->fd, size, data);
473 XBT_DEBUG("Shared malloc %zu in %p (metadata at %p)", size, mem, data);
475 mem = xbt_malloc(size);
476 XBT_DEBUG("Classic malloc %zu in %p", size, mem);
/* Release memory obtained from smpi_shared_malloc().
 * With the option smpi/use_shared_malloc, the metadata of the mapping is looked up in
 * `allocs_metadata' under the "%p" rendering of `ptr', the mapping is removed with munmap(),
 * and the entry is dropped from `allocs' once its count is not positive any more.
 * Every inconsistency (no dict, unknown pointer, broken link, munmap failure) is only
 * reported with a warning. */
481 void smpi_shared_free(void *ptr)
483 char loc[PTR_STRLEN];
484 shared_metadata_t* meta;
486 if (sg_cfg_get_boolean("smpi/use_shared_malloc")){
489 XBT_WARN("Cannot free: nothing was allocated");
492 if(!allocs_metadata) {
493 XBT_WARN("Cannot free: no metadata was allocated");
/* Same key format as the one used by shm_map when registering the mapping. */
495 snprintf(loc, PTR_STRLEN, "%p", ptr);
496 meta = (shared_metadata_t*)xbt_dict_get_or_null(allocs_metadata, loc);
498 XBT_WARN("Cannot free: %p was not shared-allocated by SMPI", ptr);
503 XBT_WARN("Cannot free: something is broken in the metadata link");
506 if(munmap(ptr, meta->size) < 0) {
507 XBT_WARN("Unmapping of fd %d failed: %s", data->fd, strerror(errno));
510 XBT_DEBUG("Shared free - no removal - of %p, count = %d", ptr, data->count);
511 if (data->count <= 0) {
513 xbt_dict_remove(allocs, data->loc);
514 XBT_DEBUG("Shared free - with removal - of %p", ptr);
517 XBT_DEBUG("Classic free of %p", ptr);
/* Probe the `calls' dict for the key "<func>:<input>".
 * The dict is created on demand without a per-entry free function. The probe relies on
 * xbt_dict_get() throwing when the key is absent; the exception category is then compared
 * with not_found_error.
 * NOTE(review): `loc' comes from bprintf(); confirm it is released on every path. */
523 int smpi_shared_known_call(const char* func, const char* input)
525 char* loc = bprintf("%s:%s", func, input);
530 calls = xbt_dict_new_homogeneous(NULL);
533 xbt_dict_get(calls, loc); /* Succeed or throw */
540 if (ex.category != not_found_error)
/* Fetch from the `calls' dict the value stored under "<func>:<input>".
 * The dict is created on demand without a per-entry free function; the lookup uses
 * xbt_dict_get(), so the key is expected to exist.
 * NOTE(review): `loc' comes from bprintf(); confirm it is released before returning. */
547 void* smpi_shared_get_call(const char* func, const char* input) {
548 char* loc = bprintf("%s:%s", func, input);
552 calls = xbt_dict_new_homogeneous(NULL);
554 data = xbt_dict_get(calls, loc);
/* Store `data' in the `calls' dict under "<func>:<input>".
 * The dict is created on demand without a per-entry free function, so `data' is not
 * released by the dict.
 * NOTE(review): `loc' comes from bprintf(); confirm it is released before returning. */
559 void* smpi_shared_set_call(const char* func, const char* input, void* data) {
560 char* loc = bprintf("%s:%s", func, input);
563 calls = xbt_dict_new_homogeneous(NULL);
565 xbt_dict_set(calls, loc, data, NULL);
/* Round `addr' down to the first byte of the xbt_pagesize-sized page that contains it.
 * The whole expansion is parenthesized so that the macro behaves as a single operand
 * (e.g. under sizeof or next to a postfix operator). */
#define TOPAGE(addr) ((void *)(((unsigned long)(addr) / xbt_pagesize) * xbt_pagesize))
578 * - read the executable data+bss section addresses and sizes
579 * - for each process create a copy of these sections with mmap
580 * - store them in a dynar
/* Make the data segment copy of process `dest' the one mapped at the page-aligned start of
 * the executable's data region.
 * Nothing is done when size_data_exe is 0 or when `dest' is already the loaded page.
 * On the very first switch (loaded_page == -1) the live region is first copied into every
 * per-process mapping. Aborts through xbt_die() when the fixed mapping cannot be obtained.
 * NOTE(review): the copy loop is bounded by SIMIX_process_count() whereas `fds' and
 * `mappings' are allocated with smpi_process_count() entries; confirm both counts agree. */
586 void switch_data_segment(int dest){
588 if(size_data_exe == 0)//no need to switch
591 if (loaded_page==dest)//no need to switch either
597 if(loaded_page==-1){//initial switch, do the copy from the real page here
598 for (i=0; i< SIMIX_process_count(); i++){
599 memcpy(mappings[i],TOPAGE(start_data_exe),size_data_exe);
602 int current= fds[dest];
603 XBT_VERB("Switching data frame to the one of process %d", dest);
/* MAP_FIXED replaces whatever is currently mapped at that address with the file of `dest'. */
604 void* tmp = mmap (TOPAGE(start_data_exe), size_data_exe, PROT_READ | PROT_WRITE, MAP_FIXED | MAP_SHARED, current, 0);
605 if (tmp != TOPAGE(start_data_exe))
606 xbt_die("Couldn't map the new region");
/* Find where the .data and .bss sections of the running executable live and how large they are,
 * by parsing the output of `objdump --section-headers <binary>'.
 * Results are stored in the globals start_data_exe (address of .data) and size_data_exe
 * (bytes from the page-aligned start of .data to the end of .bss).
 * NOTE(review): the command line embeds xbt_binary_name unquoted; confirm that paths
 * containing spaces or shell metacharacters cannot reach this point.
 * NOTE(review): the strtoul() results are stored into int variables and a pointer without
 * any error or range check. */
611 void smpi_get_executable_global_size(){
612 int size_bss_binary=0;
613 int size_data_binary=0;
615 char *line = NULL; /* Temporary storage for each line that is read */
616 ssize_t read; /* Number of bytes read */
617 size_t n = 0; /* Amount of bytes to read by xbt_getline */
622 char *command = bprintf("objdump --section-headers %s", xbt_binary_name);
624 fp = popen(command, "r");
627 perror("popen failed");
/* Read the output line by line until both sections were seen (found == 2) or input ends. */
631 while ((read = xbt_getline(&line, &n, fp)) != -1 && found != 2) {
636 /* Wipeout the new line character */
637 line[read - 1] = '\0';
/* Split the line on spaces; lfields[0] is the first token. */
639 lfields[0] = strtok(line, " ");
641 if(lfields[0] == NULL)
/* Header lines of the objdump output are recognized by their first token. */
644 if(strcmp(lfields[0], "Sections:") == 0
645 || strcmp(lfields[0], "Idx") == 0
646 || strncmp(lfields[0], xbt_binary_name, strlen(xbt_binary_name)) == 0)
/* Collect up to 7 tokens, stopping at the first missing one. */
649 for (i = 1; i < 7 && lfields[i - 1] != NULL; i++) {
650 lfields[i] = strtok(NULL, " ");
654 * we are looking for these fields
655 23 .data 02625a20 00000000006013e0 00000000006013e0 000013e0 2**5
656 CONTENTS, ALLOC, LOAD, DATA
657 24 .bss 02625a40 0000000002c26e00 0000000002c26e00 02626e00 2**5
/* Field 1 is the section name, field 2 its size and field 4 an address, both in hexadecimal. */
662 if(strcmp(lfields[1], ".data") == 0){
663 size_data_binary = strtoul(lfields[2], NULL, 16);
664 start_data_exe = (char*) strtoul(lfields[4], NULL, 16);
666 }else if(strcmp(lfields[1], ".bss") == 0){
667 //the beginning of bss is not exactly the end of data if not aligned, grow bss reported size accordingly
668 //TODO : check if this is OK, as some segments may be inserted between them..
669 size_bss_binary = ((char*) strtoul(lfields[4], NULL, 16) - (start_data_exe + size_data_binary))
670 + strtoul(lfields[2], NULL, 16);
/* Total size: offset of .data inside its page + .data size + (gap-adjusted) .bss size. */
678 size_data_exe =(unsigned long)start_data_exe - (unsigned long)TOPAGE(start_data_exe)+ size_data_binary+size_bss_binary;
/* Prepare one private copy of the executable's data region per process.
 * After smpi_get_executable_global_size() has located the region, each iteration creates an
 * unlinked temporary file of size_data_exe bytes under /dev/shm, maps it as shared memory at
 * a free address, fills it with the current content of the region, and records the
 * descriptor and address in `fds' and `mappings'.
 * Privatization is turned off (smpi_privatize_global_variables = 0) when the region is empty.
 * Every failing system call aborts through xbt_die().
 * NOTE(review): `fds' and `mappings' are allocated with smpi_process_count() entries while
 * the loop is bounded by SIMIX_process_count(); confirm both counts agree. */
685 void smpi_initialize_global_memory_segments(){
688 smpi_privatize_global_variables=0;
693 smpi_get_executable_global_size();
695 XBT_DEBUG ("bss+data segment found : size %d starting at %p",size_data_exe, start_data_exe );
697 if(size_data_exe == 0){//no need to switch
698 smpi_privatize_global_variables=0;
702 fds= (int*)xbt_malloc((smpi_process_count())*sizeof(int));
703 mappings= (void**)xbt_malloc((smpi_process_count())*sizeof(void*));
706 for (i=0; i< SIMIX_process_count(); i++){
707 //create SIMIX_process_count() mappings of this size with the same data inside
708 void *address = NULL, *tmp = NULL;
709 char path[] = "/dev/shm/my-buffer-XXXXXX";
711 int file_descriptor= mkstemp (path);
712 if (file_descriptor < 0)
713 xbt_die("Impossible to create temporary file for memory mapping");
/* The name is removed right away; the file stays reachable through its descriptor. */
714 status = unlink (path);
716 xbt_die("Impossible to unlink temporary file for memory mapping");
718 status = ftruncate(file_descriptor, size_data_exe);
720 xbt_die("Impossible to set the size of the temporary file for memory mapping");
722 /* Ask for a free region */
723 address = mmap (NULL, size_data_exe, PROT_NONE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
725 if (address == MAP_FAILED)
726 xbt_die("Couldn't find a free region for memory mapping");
/* Map the temporary file over the region that was just reserved. */
728 tmp = mmap (address, size_data_exe, PROT_READ | PROT_WRITE, MAP_FIXED | MAP_SHARED, file_descriptor, 0);
731 xbt_die("Couldn't obtain the right address");
732 //initialize the values
733 memcpy(address,TOPAGE(start_data_exe),size_data_exe);
735 //store the address of the mapping for further switches
736 fds[i]=file_descriptor;
737 mappings[i]= address;
744 void smpi_destroy_global_memory_segments(){
745 if(size_data_exe == 0)//no need to switch
749 for (i=0; i< smpi_process_count(); i++){
750 if(munmap(mappings[i],size_data_exe) < 0) {
751 XBT_WARN("Unmapping of fd %d failed: %s", fds[i], strerror(errno));