1 /* Copyright (c) 2007, 2009-2014. The SimGrid Team.
2 * All rights reserved. */
4 /* This program is free software; you can redistribute it and/or modify it
5 * under the terms of the license (GNU LGPL) which comes with this package. */
9 #include "xbt/sysdep.h"
12 #include "surf/surf.h"
13 #include "simgrid/sg_config.h"
19 #include <sys/types.h>
22 #include <math.h> // sqrt
27 XBT_LOG_NEW_DEFAULT_SUBCATEGORY(smpi_bench, smpi,
28 "Logging specific to SMPI (benchmarking)");
/* Shared allocations are handled through shared memory segments.
 * Associated data and metadata are used as follows:
 *
 *                                                                    mmap #1
 *    `allocs' dict                                                     ---- -.
 *    ----------      shared_data_t               shared_metadata_t   / |  |  |
 * .->| <name> | ---> -------------------- <--.   -----------------   | |  |  |
 * |  ----------      | fd of <name>     |    |   | size of mmap  | --| |  |  |
 * |                  | count (2)        |    |-- | data          |   \ |  |  |
 * `----------------- | <name>           |    |   -----------------     ----  |
 *                    --------------------    |   ^                           |
 *                                            |   |                           |
 *                                            |   |   `allocs_metadata' dict  |
 *                                            |   |   ----------------------  |
 *                                            |   `-- | <addr of mmap #1>  |<-'
 *                                            |   .-- | <addr of mmap #2>  |<-.
 *                                            |   |   ----------------------  |
 *                                            |   |                           |
 *                                            |   |                   mmap #2 |
 *                                            |   v                     ---- -'
 *                                            |   shared_metadata_t   / |  |
 *                                            |   -----------------   | |  |
 *                                            |   | size of mmap  | --| |  |
 *                                            `-- | data          |   \ |  |
 *                                                -----------------     ----
 */
// Buffer size for a pointer rendered with "%p" (see the snprintf calls that use it):
// presumably 2 characters of prefix, 2 hex digits per pointer byte, and 1 terminating NUL.
61 #define PTR_STRLEN (2 + 2 * sizeof(void*) + 1)
// Shared allocations, keyed by the location string computed in smpi_shared_malloc().
63 xbt_dict_t allocs = NULL; /* Allocated on first use */
// Per-mapping metadata, keyed by the "%p" rendering of the mapped address (see shm_map()).
64 xbt_dict_t allocs_metadata = NULL; /* Allocated on first use */
// Benchmark statistics of the sampled blocks, keyed by the string built in sample_location().
65 xbt_dict_t samples = NULL; /* Allocated on first use */
// Values registered through smpi_shared_set_call(), keyed by "<func>:<input>".
66 xbt_dict_t calls = NULL; /* Allocated on first use */
67 __thread int smpi_current_rank = 0; /* Updated after each MPI call */
// Measured durations below this value are not injected into the simulation (see smpi_execute()).
69 double smpi_cpu_threshold;
// Multiplier turning a measured duration into an amount of flops (see smpi_execute()).
70 double smpi_running_power;
// Start address and size of the executable's data+bss area; both are filled in by
// smpi_get_executable_global_size().
75 char* start_data_exe = NULL;
76 int size_data_exe = 0;
// Reset to 0 in smpi_initialize_global_memory_segments(), e.g. when no data segment was found.
77 int smpi_privatize_global_variables;
// Return the size in bytes (st_size reported by fstat) of the file behind descriptor `fd`.
// Aborts through xbt_die() when the descriptor cannot be stat'ed.
// NOTE(review): the declaration of `st` is not visible in this excerpt.
90 static size_t shm_size(int fd) {
93 if(fstat(fd, &st) < 0) {
94 xbt_die("Could not stat fd %d: %s", fd, strerror(errno));
96 return (size_t)st.st_size;
// Map `size` bytes of the file behind `fd` and register the mapping in `allocs_metadata`.
//  - fd:   descriptor of the shared-memory file to map
//  - size: number of bytes to map; the file is grown with ftruncate() when it is smaller
//  - data: shared allocation descriptor this mapping belongs to
// Aborts through xbt_die() when growing the file or mapping it fails.
// NOTE(review): the statements filling `meta` (between the allocation and xbt_dict_set) and the
// return statement are not visible in this excerpt; presumably `size` and `data` are stored in
// `meta` and the mapped address is returned. Confirm against the full function.
100 static void* shm_map(int fd, size_t size, shared_data_t* data) {
102 char loc[PTR_STRLEN];
103 shared_metadata_t* meta;
// Only ever grow the backing file, never shrink it.
105 if(size > shm_size(fd)) {
106 if(ftruncate(fd, (off_t)size) < 0) {
107 xbt_die("Could not truncate fd %d to %zu: %s", fd, size, strerror(errno));
// Shared, read-write mapping of the first `size` bytes of the file.
111 mem = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
112 if(mem == MAP_FAILED) {
113 xbt_die("Could not map fd %d: %s", fd, strerror(errno));
// The metadata dictionary is created lazily; its values are released with xbt_free.
115 if(!allocs_metadata) {
116 allocs_metadata = xbt_dict_new_homogeneous(xbt_free);
// The dictionary key is the textual form of the mapped address.
118 snprintf(loc, PTR_STRLEN, "%p", mem);
119 meta = xbt_new(shared_metadata_t, 1);
122 xbt_dict_set(allocs_metadata, loc, meta, NULL);
123 XBT_DEBUG("MMAP %zu to %p", size, mem);
// Release the four module-level dictionaries (allocs, allocs_metadata, samples, calls).
128 void smpi_bench_destroy(void)
130 xbt_dict_free(&allocs);
131 xbt_dict_free(&allocs_metadata);
132 xbt_dict_free(&samples);
133 xbt_dict_free(&calls);
// By-reference wrapper around smpi_execute_flops(): dereferences `flops` and forwards the value.
// NOTE(review): the trailing underscore and pointer argument suggest a Fortran binding -- confirm.
136 XBT_PUBLIC(void) smpi_execute_flops_(double *flops);
137 void smpi_execute_flops_(double *flops)
139 smpi_execute_flops(*flops);
// By-reference wrapper around smpi_execute(): dereferences `duration` and forwards the value.
// NOTE(review): the trailing underscore and pointer argument suggest a Fortran binding -- confirm.
142 XBT_PUBLIC(void) smpi_execute_(double *duration);
143 void smpi_execute_(double *duration)
145 smpi_execute(*duration);
// Run a simulated computation of `flops` flops on the current host and wait for its completion.
// NOTE(review): the declarations of `host` and `action` are not visible in this excerpt.
148 void smpi_execute_flops(double flops) {
151 host = SIMIX_host_self();
152 XBT_DEBUG("Handle real computation time: %f flops", flops);
153 action = simcall_host_execute("computation", host, flops, 1, 0, 0);
// Attach the SMPI tracing category to the execution before blocking on it.
155 simcall_set_category (action, TRACE_internal_smpi_get_category());
157 simcall_host_execution_wait(action);
// Inject a measured computation time into the simulation.
//  - duration: time to account for; it is converted to flops with smpi_running_power.
// Durations below smpi_cpu_threshold are only logged and otherwise ignored.
// NOTE(review): the tracing statements are presumably wrapped in preprocessor guards that are
// not visible in this excerpt.
160 void smpi_execute(double duration)
162 if (duration >= smpi_cpu_threshold) {
163 XBT_DEBUG("Sleep for %g to handle real computation time", duration);
164 double flops = duration * smpi_running_power;
// Record the beginning of the computing state in the trace, with its size in flops.
166 int rank = smpi_process_index();
167 instr_extra_data extra = xbt_new0(s_instr_extra_data_t,1);
168 extra->type=TRACING_COMPUTING;
169 extra->comp_size=flops;
170 TRACE_smpi_computing_in(rank, extra);
172 smpi_execute_flops(flops);
175 TRACE_smpi_computing_out(rank);
179 XBT_DEBUG("Real computation took %g while option smpi/cpu_threshold is set to %g => ignore it",
180 duration, smpi_cpu_threshold);
// Forward declaration: switch_data_segment() is defined further down in this file.
184 void switch_data_segment(int dest);
// Start timing a piece of user code: switch to the data segment of the calling process,
// start its thread timer, and remember its index in smpi_current_rank.
186 void smpi_bench_begin(void)
188 switch_data_segment(smpi_process_index());
189 xbt_os_threadtimer_start(smpi_process_timer());
190 smpi_current_rank = smpi_process_index();
// Stop timing user code and inject the elapsed time into the simulation through smpi_execute().
// When the process is flagged as sampling, this is treated as a fatal error: the function logs
// the problem, prints a backtrace and aborts through xbt_die().
193 void smpi_bench_end(void)
195 xbt_os_timer_t timer = smpi_process_timer();
196 xbt_os_threadtimer_stop(timer);
197 // switch_data_segment(smpi_process_count());
198 if (smpi_process_get_sampling()) {
199 XBT_CRITICAL("Cannot do recursive benchmarks.");
200 XBT_CRITICAL("Are you trying to make a call to MPI within a SMPI_SAMPLE_ block?");
201 xbt_backtrace_display_current();
202 xbt_die("Aborting.");
204 smpi_execute(xbt_os_timer_elapsed(timer));
// Simulated sleep of `secs` seconds, implemented as a computation: the duration is multiplied by
// the speed of the current host to obtain flops, which are then executed and waited for.
// NOTE(review): the declaration of `action`, any bench begin/end calls and the return statement
// are not visible in this excerpt.
207 unsigned int smpi_sleep(unsigned int secs)
213 double flops = (double) secs*simcall_host_get_speed(SIMIX_host_self());
214 XBT_DEBUG("Sleep for: %f flops", flops);
215 action = simcall_host_execute("computation", SIMIX_host_self(), flops, 1, 0, 0);
217 simcall_set_category (action, TRACE_internal_smpi_get_category());
219 simcall_host_execution_wait(action);
// Fill `tv` from the simulated clock: tv_sec receives the integral part of SIMIX_get_clock()
// and tv_usec the fractional part scaled to microseconds.
// NOTE(review): the two tv_usec assignments differ only by the cast type; presumably they sit in
// alternative preprocessor branches that are not visible here. The declaration of `now` and the
// return statement are not visible either.
225 int smpi_gettimeofday(struct timeval *tv)
229 now = SIMIX_get_clock();
231 tv->tv_sec = (time_t)now;
233 tv->tv_usec = (useconds_t)((now - tv->tv_sec) * 1e6);
235 tv->tv_usec = (suseconds_t)((now - tv->tv_sec) * 1e6);
// Timestamp resolution: the inverse of sg_maxmin_precision, truncated to an unsigned integer.
242 extern double sg_maxmin_precision;
243 unsigned long long smpi_rastro_resolution (void)
246 double resolution = (1/sg_maxmin_precision);
248 return (unsigned long long)resolution;
// Current simulated time expressed in units of 1/smpi_rastro_resolution() seconds:
// whole seconds times the resolution, plus the fractional part scaled by the resolution.
251 unsigned long long smpi_rastro_timestamp (void)
254 double now = SIMIX_get_clock();
// `sec` is the integral part of the clock, `pre` the scaled (and truncated) fractional part.
256 unsigned long long sec = (unsigned long long)now;
257 unsigned long long pre = (now - sec) * smpi_rastro_resolution();
259 return (unsigned long long)sec * smpi_rastro_resolution() + pre;
262 /* ****************************** Functions related to the SMPI_SAMPLE_ macros ************************************/
// Fields of the per-location benchmarking record stored in the `samples` dictionary
// (allocated as local_data_t in smpi_sample_1; the enclosing type declaration is not visible here).
264 double threshold; /* maximal stderr requested (if positive) */
265 double relstderr; /* observed stderr so far */
266 double mean; /* mean of benched times, to be used if the block is disabled */
267 double sum; /* sum of benched times (to compute the mean and stderr) */
268 double sum_pow2; /* sum of the square of the benched times (to compute the stderr) */
269 int iters; /* amount of requested iterations */
270 int count; /* amount of iterations done so far */
271 int benching; /* 1: we are benchmarking; 0: we have enough data, no bench anymore */
// Build the dictionary key identifying a sampled block. Two formats are produced:
// "<file>:<line>" and "<file>:<line>:<process index>". The returned string comes from bprintf()
// and is owned by the caller.
// NOTE(review): the condition choosing between the two returns is not visible in this excerpt;
// presumably `global` selects the process-independent key -- confirm.
274 static char *sample_location(int global, const char *file, int line) {
276 return bprintf("%s:%d", file, line);
278 return bprintf("%s:%d:%d", file, line, smpi_process_index());
// Decide whether the block described by `data` has been benchmarked enough.
// The starting answer is "count reached the requested iterations"; when a positive threshold is
// set, the answer is forced to 0 while the relative standard error is still above it.
// NOTE(review): the condition guarding the first "res = 0" and the return statement are not
// visible in this excerpt.
281 static int sample_enough_benchs(local_data_t *data) {
282 int res = data->count >= data->iters;
283 if (data->threshold>0.0) {
285 res = 0; // not enough data
286 if (data->relstderr > data->threshold)
287 res = 0; // stderr too high yet
289 XBT_DEBUG("%s (count:%d iter:%d stderr:%f thres:%f mean:%fs)",
290 (res?"enough benchs":"need more data"),
291 data->count, data->iters, data->relstderr, data->threshold, data->mean);
// Entry of a sampled block: account for the computation that preceded it, flag the process as
// sampling, and create or refresh the statistics record of this location.
//  - global, file, line: identify the block (see sample_location())
//  - iters:     requested number of benchmark iterations
//  - threshold: requested maximal standard error
// At least one of iters and threshold must be positive when the record is first created.
// NOTE(review): several statements are not visible in this excerpt (declaration of `data`, the
// tests on `samples` and `data`, the initialization of the remaining fields, release of `loc`).
295 void smpi_sample_1(int global, const char *file, int line, int iters, double threshold)
297 char *loc = sample_location(global, file, line);
300 smpi_bench_end(); /* Take time from previous, unrelated computation into account */
301 smpi_process_set_sampling(1);
// The dictionary releases its values with free().
304 samples = xbt_dict_new_homogeneous(free);
306 data = xbt_dict_get_or_null(samples, loc);
// Creation of the record for a location that is seen for the first time.
308 xbt_assert(threshold>0 || iters>0,
309 "You should provide either a positive amount of iterations to bench, or a positive maximal stderr (or both)");
310 data = (local_data_t *) xbt_new(local_data_t, 1);
313 data->sum_pow2 = 0.0;
315 data->threshold = threshold;
316 data->benching = 1; // If we have no data, we need at least one
318 xbt_dict_set(samples, loc, data, NULL);
319 XBT_DEBUG("XXXXX First time ever on benched nest %s.",loc);
// Known location: the settings must match those recorded at creation time.
321 if (data->iters != iters || data->threshold != threshold) {
322 XBT_ERROR("Asked to bench block %s with different settings %d, %f is not %d, %f. How did you manage to give two numbers at the same line??",
323 loc, data->iters, data->threshold, iters,threshold);
327 // if we already have some data, check whether sample_2 should get one more bench or whether it should emulate the computation instead
328 data->benching = !sample_enough_benchs(data);
329 XBT_DEBUG("XXXX Re-entering the benched nest %s. %s",loc, (data->benching?"more benching needed":"we have enough data, skip computes"));
// Loop condition of a sampled block. When the record of this location is in benching mode, a new
// benchmark run is needed; otherwise the recorded mean is injected through smpi_execute(), the
// sampling flag is cleared, and `res` is set to 0.
// Requires `samples` to exist and to contain this location (xbt_assert / xbt_dict_get).
// NOTE(review): the declarations of `data` and `res`, the benching branch's remaining statements
// and the return statement are not visible in this excerpt.
334 int smpi_sample_2(int global, const char *file, int line)
336 char *loc = sample_location(global, file, line);
340 xbt_assert(samples, "Y U NO use SMPI_SAMPLE_* macros? Stop messing directly with smpi_sample_* functions!");
341 data = xbt_dict_get(samples, loc);
342 XBT_DEBUG("sample2 %s",loc);
345 if (data->benching==1) {
346 // we need to run a new bench
347 XBT_DEBUG("benchmarking: count:%d iter:%d stderr:%f thres:%f; mean:%f",
348 data->count, data->iters, data->relstderr, data->threshold, data->mean);
351 // Enough data, no more bench (either we got enough data from previous visits to this benched nest, or we just ran one bench and need to bail out now that our job is done).
352 // Just sleep instead
353 XBT_DEBUG("No benchmark (either no need, or just ran one): count >= iter (%d >= %d) or stderr<thres (%f<=%f). apply the %fs delay instead",
354 data->count, data->iters, data->relstderr, data->threshold, data->mean);
355 smpi_execute(data->mean);
356 smpi_process_set_sampling(0);
357 res = 0; // prepare to capture future, unrelated computations
// End of one iteration of a sampled block: stop the process timer, fold the elapsed time into the
// statistics of this location, and recompute the mean and the relative standard error.
// While more benchmarks are needed, `mean` temporarily holds the last sample instead.
// Requires `samples` to exist and to contain this location (xbt_assert / xbt_dict_get).
// NOTE(review): the declarations, the body of the "benching==0" branch, the updates of `sum` and
// `count`, and the final statements are not visible in this excerpt.
// NOTE(review): relstderr divides by `mean`; a zero mean would yield a non-finite value -- check
// whether that can happen.
364 void smpi_sample_3(int global, const char *file, int line)
366 char *loc = sample_location(global, file, line);
369 xbt_assert(samples, "Y U NO use SMPI_SAMPLE_* macros? Stop messing directly with smpi_sample_* functions!");
370 data = xbt_dict_get(samples, loc);
371 XBT_DEBUG("sample3 %s",loc);
374 if (data->benching==0) {
378 // ok, benchmarking this loop is over
379 xbt_os_threadtimer_stop(smpi_process_timer());
384 sample = xbt_os_timer_elapsed(smpi_process_timer());
386 data->sum_pow2 += sample * sample;
387 n = (double)data->count;
388 data->mean = data->sum / n;
// Standard error of the mean, sqrt(variance / n), divided by the mean.
389 data->relstderr = sqrt((data->sum_pow2 / n - data->mean * data->mean) / n) / data->mean;
390 if (!sample_enough_benchs(data)) {
391 data->mean = sample; // Still in benching process; We want sample_2 to simulate the exact time of this loop occurrence before leaving, not the mean over the history
393 XBT_DEBUG("Average mean after %d steps is %f, relative standard error is %f (sample was %f)", data->count,
394 data->mean, data->relstderr, sample);
396 // That's enough for now, prevent sample_2 to run the same code over and over
// Value destructor installed on the `allocs` dictionary (see smpi_shared_malloc()); `p` is the
// shared_data_t being released.
// NOTE(review): the statements that actually release the record are not visible in this excerpt.
401 static void smpi_shared_alloc_free(void *p)
403 shared_data_t *data = p;
// Replace the location string `loc` by a short encoded form of its hash. `loc` is reallocated to
// 30 bytes, so callers must use the returned pointer from then on.
// The hex digest in `hash` is consumed 6 hex digits (24 bits) at a time; each group yields 4
// output characters written from index 1 onwards.
// NOTE(review): the declarations, the computation of `hash`, the writes to loc[0] and to the
// terminator, and the return statement are not visible in this excerpt.
// NOTE(review): the alphabet literal is not a regular base64 alphabet: it contains "VZXYZ" and
// "vzxyz", so 'Z' and 'z' each appear twice while 'W' and 'w' are absent.
// NOTE(review): the shift "18 - 3 * j" advances by 3 bits per character, so the 6-bit windows
// overlap and the lowest 9 bits of `val` are never encoded; a 6-bit step would cover all 24 bits.
// Either change alters the generated names, so confirm nobody depends on them before fixing.
408 static char *smpi_shared_alloc_hash(char *loc)
418 loc = xbt_realloc(loc, 30);
420 for (i = 0; i < 40; i += 6) { /* base64 encode */
421 memcpy(s, hash + i, 6);
422 val = strtoul(s, NULL, 16);
423 for (j = 0; j < 4; j++) {
424 unsigned char x = (val >> (18 - 3 * j)) & 0x3f;
425 loc[1 + 4 * i / 6 + j] =
426 "ABCDEFGHIJKLMNOPQRSTUVZXYZabcdefghijklmnopqrstuvzxyz0123456789-_"[x];
// Allocate `size` bytes for the call site identified by `file` and `line`.
// With the "smpi/use_shared_malloc" option enabled, the memory is a mapping of a shared-memory
// object whose name derives from the pid, file and line (hashed by smpi_shared_alloc_hash()):
//  - first request for a location: the object is created with shm_open (O_CREAT | O_EXCL),
//    mapped through shm_map(), unlinked right away, and recorded in `allocs`;
//  - later requests: the recorded descriptor is mapped again through shm_map().
// With the option disabled, the memory comes from xbt_malloc().
// Aborts through xbt_die() when shm_open fails.
// NOTE(review): the declarations, the branch conditions, the updates of the shared_data_t fields
// and the return statement are not visible in this excerpt.
433 void *smpi_shared_malloc(size_t size, const char *file, int line)
436 if (sg_cfg_get_boolean("smpi/use_shared_malloc")){
437 char *loc = bprintf("%zu_%s_%d", (size_t)getpid(), file, line);
440 loc = smpi_shared_alloc_hash(loc); /* hash loc, in order to have something
443 allocs = xbt_dict_new_homogeneous(smpi_shared_alloc_free);
445 data = xbt_dict_get_or_null(allocs, loc);
447 fd = shm_open(loc, O_RDWR | O_CREAT | O_EXCL,
448 S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
452 xbt_die("Please cleanup /dev/shm/%s", loc);
454 xbt_die("An unhandled error occured while opening %s. shm_open: %s", loc, strerror(errno));
457 data = xbt_new(shared_data_t, 1);
461 mem = shm_map(fd, size, data);
// A failed early unlink is only reported as a warning.
462 if (shm_unlink(loc) < 0) {
463 XBT_WARN("Could not early unlink %s. shm_unlink: %s", loc, strerror(errno));
465 xbt_dict_set(allocs, loc, data, NULL);
466 XBT_DEBUG("Mapping %s at %p through %d", loc, mem, fd);
469 mem = shm_map(data->fd, size, data);
472 XBT_DEBUG("Shared malloc %zu in %p (metadata at %p)", size, mem, data);
474 mem = xbt_malloc(size);
475 XBT_DEBUG("Classic malloc %zu in %p", size, mem);
// Release memory obtained from smpi_shared_malloc().
// With the "smpi/use_shared_malloc" option enabled, the metadata of `ptr` is looked up in
// `allocs_metadata` under its "%p" rendering, the region is unmapped with the recorded size, and
// the entry is removed from `allocs` once its count is no longer positive.
// Problems (missing dictionaries, unknown pointer, broken metadata, munmap failure) are reported
// with XBT_WARN.
// NOTE(review): the declaration of `data`, the conditions guarding the warnings, the count
// update, and the non-shared branch's release call are not visible in this excerpt.
480 void smpi_shared_free(void *ptr)
482 char loc[PTR_STRLEN];
483 shared_metadata_t* meta;
485 if (sg_cfg_get_boolean("smpi/use_shared_malloc")){
488 XBT_WARN("Cannot free: nothing was allocated");
491 if(!allocs_metadata) {
492 XBT_WARN("Cannot free: no metadata was allocated");
// Same key format as the one used when the mapping was registered in shm_map().
494 snprintf(loc, PTR_STRLEN, "%p", ptr);
495 meta = (shared_metadata_t*)xbt_dict_get_or_null(allocs_metadata, loc);
497 XBT_WARN("Cannot free: %p was not shared-allocated by SMPI", ptr);
502 XBT_WARN("Cannot free: something is broken in the metadata link");
505 if(munmap(ptr, meta->size) < 0) {
506 XBT_WARN("Unmapping of fd %d failed: %s", data->fd, strerror(errno));
509 XBT_DEBUG("Shared free - no removal - of %p, count = %d", ptr, data->count);
510 if (data->count <= 0) {
512 xbt_dict_remove(allocs, data->loc);
513 XBT_DEBUG("Shared free - with removal - of %p", ptr);
516 XBT_DEBUG("Classic free of %p", ptr);
// Tell whether a value was registered for the pair (func, input) in the `calls` dictionary.
// The lookup key is "<func>:<input>"; xbt_dict_get() is used and a not_found_error exception is
// the expected outcome for an unknown key.
// NOTE(review): the exception-handling scaffolding, the result variable, the release of `loc`
// and the return statement are not visible in this excerpt.
522 int smpi_shared_known_call(const char* func, const char* input)
524 char* loc = bprintf("%s:%s", func, input);
// Values of this dictionary are not owned by it (no destructor is installed).
529 calls = xbt_dict_new_homogeneous(NULL);
532 xbt_dict_get(calls, loc); /* Succeed or throw */
539 if (ex.category != not_found_error)
// Fetch the value registered for the pair (func, input) from the `calls` dictionary, using the
// key "<func>:<input>". The lookup goes through xbt_dict_get().
// NOTE(review): the declaration of `data`, the test guarding the dictionary creation, the release
// of `loc` and the return statement are not visible in this excerpt.
546 void* smpi_shared_get_call(const char* func, const char* input) {
547 char* loc = bprintf("%s:%s", func, input);
551 calls = xbt_dict_new_homogeneous(NULL);
553 data = xbt_dict_get(calls, loc);
// Register `data` for the pair (func, input) in the `calls` dictionary under the key
// "<func>:<input>". The dictionary has no value destructor, so `data` stays owned by the caller.
// NOTE(review): the test guarding the dictionary creation, the release of `loc` and the return
// statement are not visible in this excerpt.
558 void* smpi_shared_set_call(const char* func, const char* input, void* data) {
559 char* loc = bprintf("%s:%s", func, input);
562 calls = xbt_dict_new_homogeneous(NULL);
564 xbt_dict_set(calls, loc, data, NULL);
// Round `addr` down to a multiple of xbt_pagesize and yield it as a void pointer.
// The address goes through unsigned long for the integer arithmetic.
573 #define TOPAGE(addr) (void *)(((unsigned long)(addr) / xbt_pagesize) * xbt_pagesize)
577 * - read the executable data+bss section addresses and sizes
578 * - for each process create a copy of these sections with mmap
579 * - store them in a dynar
// Make the data segment of process `dest` the active one: the file behind fds[dest] is mapped
// (MAP_FIXED | MAP_SHARED) over the page-aligned data area of the executable.
// Nothing is done when no data area was found (size_data_exe == 0) or when `dest` is already the
// loaded one. Aborts through xbt_die() when the mapping does not land at the expected address.
// NOTE(review): the early returns and the update of `loaded_page` are not visible in this excerpt.
585 void switch_data_segment(int dest){
587 if(size_data_exe == 0)//no need to switch
590 if (loaded_page==dest)//no need to switch either
593 int current= fds[dest];
594 XBT_VERB("Switching data frame to the one of process %d", dest);
595 void* tmp = mmap (TOPAGE(start_data_exe), size_data_exe, PROT_READ | PROT_WRITE, MAP_FIXED | MAP_SHARED, current, 0);
596 msync(TOPAGE(start_data_exe), size_data_exe, MS_SYNC | MS_INVALIDATE );
597 if (tmp != TOPAGE(start_data_exe))
598 xbt_die("Couldn't map the new region");
// Locate the .data and .bss sections of the running executable by parsing the output of
// "objdump --section-headers <binary>", then set:
//  - start_data_exe: address of .data;
//  - size_data_exe:  offset of .data within its page + size of .data + size of .bss, where the
//                    .bss size is enlarged by the gap between the end of .data and its start.
// NOTE(review): several statements are not visible in this excerpt (declarations of fp, found,
// lfields and i; the popen failure test; the "continue" statements; the updates of `found`;
// the cleanup of `command`, `line` and the pipe).
// NOTE(review): xbt_binary_name is inserted unquoted in the shell command -- check paths with
// spaces or shell metacharacters.
// NOTE(review): lfields[1], [2] and [4] are used below; presumably a field-count check in the
// missing lines guarantees they are non-NULL -- confirm.
603 void smpi_get_executable_global_size(){
604 int size_bss_binary=0;
605 int size_data_binary=0;
607 char *line = NULL; /* Temporary storage for each line that is read */
608 ssize_t read; /* Number of bytes read */
609 size_t n = 0; /* Amount of bytes to read by xbt_getline */
614 char *command = bprintf("objdump --section-headers %s", xbt_binary_name);
616 fp = popen(command, "r");
619 perror("popen failed");
// Read objdump's output line by line until both sections were seen (found == 2).
623 while ((read = xbt_getline(&line, &n, fp)) != -1 && found != 2) {
628 /* Wipeout the new line character */
629 line[read - 1] = '\0';
631 lfields[0] = strtok(line, " ");
633 if(lfields[0] == NULL)
// Header lines of the objdump output are recognized by their first field.
636 if(strcmp(lfields[0], "Sections:") == 0
637 || strcmp(lfields[0], "Idx") == 0
638 || strncmp(lfields[0], xbt_binary_name, strlen(xbt_binary_name)) == 0)
// Split the rest of the line into at most 7 space-separated fields.
641 for (i = 1; i < 7 && lfields[i - 1] != NULL; i++) {
642 lfields[i] = strtok(NULL, " ");
// The next four lines are sample objdump output; presumably they belong to a block comment
// whose delimiters are not visible in this excerpt.
646 * we are looking for these fields
647 23 .data 02625a20 00000000006013e0 00000000006013e0 000013e0 2**5
648 CONTENTS, ALLOC, LOAD, DATA
649 24 .bss 02625a40 0000000002c26e00 0000000002c26e00 02626e00 2**5
// Field 1 is the section name, field 2 its size and field 4 an address, both in hexadecimal.
654 if(strcmp(lfields[1], ".data") == 0){
655 size_data_binary = strtoul(lfields[2], NULL, 16);
656 start_data_exe = (char*) strtoul(lfields[4], NULL, 16);
658 }else if(strcmp(lfields[1], ".bss") == 0){
659 //the beginning of bss is not exactly the end of data if not aligned, grow bss reported size accordingly
660 //TODO : check if this is OK, as some segments may be inserted between them..
661 size_bss_binary = ((char*) strtoul(lfields[4], NULL, 16) - (start_data_exe + size_data_binary))
662 + strtoul(lfields[2], NULL, 16);
670 size_data_exe =(unsigned long)start_data_exe - (unsigned long)TOPAGE(start_data_exe)+ size_data_binary+size_bss_binary;
// Prepare one private copy of the executable's data+bss area per process.
// For every process: create a temporary file under /dev/shm with mkstemp(), unlink it right away,
// size it to size_data_exe, reserve an address range with an anonymous PROT_NONE mapping, map the
// file over that range (MAP_FIXED | MAP_SHARED), copy the current content of the data area into
// it, and record the descriptor in fds[] and the address in mappings[].
// Privatization is disabled (smpi_privatize_global_variables = 0) when no data area was found.
// Every failure aborts through xbt_die().
// NOTE(review): the declarations of `i` and `status`, the tests on `status` and `tmp`, and the
// condition around the first "smpi_privatize_global_variables=0" are not visible in this excerpt.
// NOTE(review): fds and mappings are sized with smpi_process_count() while the loop is bounded by
// SIMIX_process_count(); if the two can differ the arrays may be overrun -- confirm.
677 void smpi_initialize_global_memory_segments(){
680 smpi_privatize_global_variables=0;
685 smpi_get_executable_global_size();
687 XBT_DEBUG ("bss+data segment found : size %d starting at %p",size_data_exe, start_data_exe );
689 if(size_data_exe == 0){//no need to switch
690 smpi_privatize_global_variables=0;
694 fds= (int*)xbt_malloc((smpi_process_count())*sizeof(int));
695 mappings= (void**)xbt_malloc((smpi_process_count())*sizeof(void*));
698 for (i=0; i< SIMIX_process_count(); i++){
699 //create SIMIX_process_count() mappings of this size with the same data inside
700 void *address = NULL, *tmp = NULL;
701 char path[] = "/dev/shm/my-buffer-XXXXXX";
703 int file_descriptor= mkstemp (path);
704 if (file_descriptor < 0)
705 xbt_die("Impossible to create temporary file for memory mapping");
// The file is unlinked immediately; it stays usable through the open descriptor.
706 status = unlink (path);
708 xbt_die("Impossible to unlink temporary file for memory mapping");
710 status = ftruncate(file_descriptor, size_data_exe);
712 xbt_die("Impossible to set the size of the temporary file for memory mapping");
714 /* Ask for a free region */
715 address = mmap (NULL, size_data_exe, PROT_NONE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
717 if (address == MAP_FAILED)
718 xbt_die("Couldn't find a free region for memory mapping");
// Replace the reservation by a shared mapping of the temporary file at the same address.
720 tmp = mmap (address, size_data_exe, PROT_READ | PROT_WRITE, MAP_FIXED | MAP_SHARED, file_descriptor, 0);
723 xbt_die("Couldn't obtain the right address");
724 //initialize the values
725 memcpy(address,TOPAGE(start_data_exe),size_data_exe);
727 //store the address of the mapping for further switches
728 fds[i]=file_descriptor;
729 mappings[i]= address;
// Unmap the per-process copies of the data area recorded in mappings[]; a failing munmap() is
// only reported with XBT_WARN. Nothing is unmapped when size_data_exe is 0.
// NOTE(review): the early return, the declaration of `i`, and whatever follows the warning
// (closing descriptors, freeing fds/mappings) are not visible in this excerpt.
736 void smpi_destroy_global_memory_segments(){
738 if(size_data_exe == 0)//no need to switch
741 for (i=0; i< smpi_process_count(); i++){
742 if(munmap(mappings[i],size_data_exe) < 0) {
743 XBT_WARN("Unmapping of fd %d failed: %s", fds[i], strerror(errno));