#include "xbt/ex.h"
#include "surf/surf.h"
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <string.h>
+#include <stdio.h>
+
XBT_LOG_NEW_DEFAULT_SUBCATEGORY(smpi_bench, smpi,
"Logging specific to SMPI (benchmarking)");
-xbt_dict_t allocs = NULL; /* Allocated on first use */
-xbt_dict_t samples = NULL; /* Allocated on first use */
-xbt_dict_t calls = NULL; /* Allocated on first use */
+/* Shared allocations are handled through shared memory segments.
+ * Associated data and metadata are used as follows:
+ *
+ *                                                                    mmap #1
+ *    `allocs' dict                                                     ---- -.
+ *    ----------      shared_data_t               shared_metadata_t   / |  |  |
+ * .->| <name> | ---> -------------------- <--.   -----------------   | |  |  |
+ * |  ----------      | fd of <name>     |    |   | size of mmap  | --| |  |  |
+ * |                  | count (2)        |    |-- | data          |   \ |  |  |
+ * `----------------- | <name>           |    |   -----------------     ----  |
+ *                    --------------------    |   ^                           |
+ *                                            |   |                           |
+ *                                            |   |   `allocs_metadata' dict  |
+ *                                            |   |   ----------------------  |
+ *                                            |   `-- | <addr of mmap #1>  |<-'
+ *                                            |   .-- | <addr of mmap #2>  |<-.
+ *                                            |   |   ----------------------  |
+ *                                            |   |                           |
+ *                                            |   |                           |
+ *                                            |   |                           |
+ *                                            |   |                   mmap #2 |
+ *                                            |   v                     ---- -'
+ *                                            |   shared_metadata_t   / |  |
+ *                                            |   -----------------   | |  |
+ *                                            |   | size of mmap  | --| |  |
+ *                                            `-- | data          |   | |  |
+ *                                                -----------------   | |  |
+ *                                                                    \ |  |
+ *                                                                      ----
+ */
+
+/* Buffer size for an address printed with "%p": "0x" prefix, two hex digits
+ * per byte of a pointer, and the terminating NUL.
+ * NOTE(review): the "%p" output format is implementation-defined -- confirm
+ * this bound holds on every supported libc. */
+#define PTR_STRLEN (2 + 2 * sizeof(void*) + 1)
+
+/* Maps a segment name to its shared_data_t record. */
+xbt_dict_t allocs = NULL; /* Allocated on first use */
+/* Maps a mapping address (printed with "%p") to its shared_metadata_t. */
+xbt_dict_t allocs_metadata = NULL; /* Allocated on first use */
+xbt_dict_t samples = NULL; /* Allocated on first use */
+xbt_dict_t calls = NULL; /* Allocated on first use */
+/* Thread-local copy of smpi_process_index(), refreshed in smpi_bench_begin(). */
+__thread int smpi_current_rank = 0; /* Updated after each MPI call */
+/* Bookkeeping for one named shared-memory segment; stored in `allocs'. */
typedef struct {
+ /* Descriptor obtained from shm_open() for the segment. */
+ int fd;
+ /* Number of mappings handed out for this segment and not yet freed. */
int count;
- char data[];
+ /* Segment name; also the key under which this record sits in `allocs'. */
+ char* loc;
} shared_data_t;
+/* Per-mapping record; stored in `allocs_metadata' under the mapping address
+ * printed with "%p". */
+typedef struct {
+ /* Length given to mmap(); reused as the munmap() length on free. */
+ size_t size;
+ /* Segment record this mapping was created from. */
+ shared_data_t* data;
+} shared_metadata_t;
+
+/* Report the current size, in bytes, of the object behind descriptor `fd'.
+ * Goes through xbt_die() when the descriptor cannot be stat'ed. */
+static size_t shm_size(int fd) {
+  struct stat info;
+  int rc = fstat(fd, &info);
+
+  if(rc < 0) {
+    xbt_die("Could not stat fd %d: %s", fd, strerror(errno));
+  }
+  return (size_t)info.st_size;
+}
+
+/* Map `size' bytes of the segment behind `fd' (shared, read/write, offset 0)
+ * and record the mapping in `allocs_metadata' under its printed address.
+ *
+ * The segment is first grown with ftruncate() when it is smaller than `size'.
+ * `data' is stored in the new record as the back-pointer to the segment.
+ * Any failure of fstat/ftruncate/mmap goes through xbt_die().
+ * Returns the address of the new mapping. */
+static void* shm_map(int fd, size_t size, shared_data_t* data) {
+  char key[PTR_STRLEN];
+  shared_metadata_t* entry;
+  void* addr;
+
+  /* Only ever grow the backing object, never shrink it. */
+  if(size > shm_size(fd) && ftruncate(fd, (off_t)size) < 0) {
+    xbt_die("Could not truncate fd %d to %zu: %s", fd, size, strerror(errno));
+  }
+  addr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+  if(addr == MAP_FAILED) {
+    xbt_die("Could not map fd %d: %s", fd, strerror(errno));
+  }
+  if(allocs_metadata == NULL) {
+    allocs_metadata = xbt_dict_new();
+  }
+  entry = xbt_new(shared_metadata_t, 1);
+  entry->size = size;
+  entry->data = data;
+  /* The printed address is the lookup key used again by smpi_shared_free(). */
+  snprintf(key, PTR_STRLEN, "%p", addr);
+  xbt_dict_set(allocs_metadata, key, entry, &free);
+  XBT_DEBUG("MMAP %zu to %p", size, addr);
+  return addr;
+}
+
typedef struct {
int count;
double sum;
+/* Start the timer returned by smpi_process_timer() and refresh the
+ * thread-local smpi_current_rank from smpi_process_index(). */
void smpi_bench_begin(void)
{
xbt_os_timer_start(smpi_process_timer());
+ smpi_current_rank = smpi_process_index();
}
void smpi_bench_end(void)
+/**
+ * Return a mapping of the shared-memory segment tied to a call site.
+ *
+ * The segment is named "<pid>_<file>_<line>" with every '/' turned into '_'.
+ * First call for a name: the segment is created with shm_open(O_CREAT|O_EXCL),
+ * mapped, unlinked right away (it stays reachable through the kept fd) and
+ * recorded in `allocs' with a count of 1. Later calls for the same name map
+ * the same fd again and bump the count.
+ *
+ * @param size number of bytes to map
+ * @param file source file of the call site
+ * @param line source line of the call site
+ * @return address of the new mapping; release it with smpi_shared_free()
+ *
+ * A failing shm_open() goes through xbt_die().
+ **/
void *smpi_shared_malloc(size_t size, const char *file, int line)
{
- char *loc = bprintf("%s:%d:%zu", file, line, size);
+ char *loc = bprintf("%zu_%s_%d", (size_t)getpid(), file, line);
+ size_t len = strlen(loc);
+ size_t i;
+ int fd;
+ void* mem;
shared_data_t *data;
+ for(i = 0; i < len; i++) {
+ /* Make the 'loc' ID be a flat filename */
+ if(loc[i] == '/') {
+ loc[i] = '_';
+ }
+ }
if (!allocs) {
allocs = xbt_dict_new();
}
data = xbt_dict_get_or_null(allocs, loc);
- if (!data) {
- data = (shared_data_t *) xbt_malloc0(sizeof(int) + size);
+ if(!data) {
+ fd = shm_open(loc, O_RDWR | O_CREAT | O_EXCL, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
+ if(fd < 0) {
+ switch(errno) {
+ case EEXIST:
+ xbt_die("Please cleanup /dev/shm/%s", loc);
+ default:
+ xbt_die("An unhandled error occured while opening %s: %s", loc, strerror(errno));
+ }
+ }
+ data = xbt_new(shared_data_t, 1);
+ data->fd = fd;
data->count = 1;
+ /* Ownership of 'loc' moves to the record; smpi_shared_free() releases it. */
+ data->loc = loc;
+ mem = shm_map(fd, size, data);
+ if(shm_unlink(loc) < 0) {
+ XBT_WARN("Could not early unlink %s: %s", loc, strerror(errno));
+ }
xbt_dict_set(allocs, loc, data, &free);
+ XBT_DEBUG("Mapping %s at %p through %d", loc, mem, fd);
} else {
+ mem = shm_map(data->fd, size, data);
data->count++;
+ /* The record already holds its own name (data->loc); this freshly
+ * formatted copy was only needed for the lookup, so release it here. */
+ free(loc);
}
- free(loc);
- return data->data;
+ XBT_DEBUG("Malloc %zu in %p (metadata at %p)", size, mem, data);
+ return mem;
}
+/**
+ * Release one mapping obtained from smpi_shared_malloc().
+ *
+ * The mapping is looked up in `allocs_metadata' by its printed address,
+ * unmapped, and its record dropped. The owning segment's count is then
+ * decremented; when it reaches zero the fd is closed and the segment record
+ * and its name are released.
+ *
+ * @param ptr address previously returned by smpi_shared_malloc()
+ *
+ * Unknown pointers and missing bookkeeping only produce a warning.
+ **/
void smpi_shared_free(void *ptr)
{
- shared_data_t *data = (shared_data_t *) ((int *) ptr - 1);
- char *loc;
+ char loc[PTR_STRLEN];
+ char* name;
+ shared_metadata_t* meta;
+ shared_data_t* data;
if (!allocs) {
XBT_WARN("Cannot free: nothing was allocated");
return;
}
- loc = xbt_dict_get_key(allocs, data);
- if (!loc) {
+ if(!allocs_metadata) {
+ XBT_WARN("Cannot free: no metadata was allocated");
+ /* Nothing to look up; do not hand a NULL dict to the lookup below. */
+ return;
+ }
+ snprintf(loc, PTR_STRLEN, "%p", ptr);
+ meta = (shared_metadata_t*)xbt_dict_get_or_null(allocs_metadata, loc);
+ if (!meta) {
XBT_WARN("Cannot free: %p was not shared-allocated by SMPI", ptr);
return;
}
+ data = meta->data;
+ if(!data) {
+ XBT_WARN("Cannot free: something is broken in the metadata link");
+ return;
+ }
+ if(munmap(ptr, meta->size) < 0) {
+ XBT_WARN("Unmapping of fd %d failed: %s", data->fd, strerror(errno));
+ }
+ /* Drop the per-mapping record (this also frees 'meta' through the &free
+ * registered with it), so a stale or reused address is not freed twice. */
+ xbt_dict_remove(allocs_metadata, loc);
data->count--;
if (data->count <= 0) {
- xbt_dict_remove(allocs, loc);
+ /* Removing the entry frees 'data' through the &free registered with it,
+ * so take the name out first instead of reading data->loc afterwards. */
+ name = data->loc;
+ close(data->fd);
+ xbt_dict_remove(allocs, name);
+ free(name);
}
}
/**
* Build the tree depending on a process rank (index) and the group size (extent)
- * @param index the rank of the calling process
- * @param extent the total number of processes
+ * @param root the rank of the tree root
+ * @param rank the rank of the calling process
+ * @param size the total number of processes
+ * @param tree the node to fill in: its PROCTREE_A field is read as the arity;
+ * me, root, child[], numChildren and isRoot are written, and parent too
+ * when the caller is not the root
**/
-static void build_tree(int index, int extent, proc_tree_t * tree)
+static void build_tree(int root, int rank, int size, proc_tree_t * tree)
{
- int places = (*tree)->PROCTREE_A * index;
- int i, ch, pr;
+ /* Position of the caller once ranks are rotated so that 'root' sits at 0. */
+ int index = (rank - root + size) % size;
+ /* In rotated numbering, node k has children k*A+1 .. k*A+A (A = PROCTREE_A). */
+ int firstChildIdx = index * (*tree)->PROCTREE_A + 1;
+ int i;
- (*tree)->me = index;
- (*tree)->root = 0;
- for (i = 1; i <= (*tree)->PROCTREE_A; i++) {
- ++places;
- ch = (*tree)->PROCTREE_A * index + i + (*tree)->root;
- ch %= extent;
- if (places < extent) {
- (*tree)->child[i - 1] = ch;
- (*tree)->numChildren++;
- }
+ (*tree)->me = rank;
+ (*tree)->root = root;
+
+ /* Keep only child positions that fall inside the group and rotate each one
+ * back to a real rank. numChildren is only incremented here -- presumably
+ * it is zeroed when the tree is allocated; TODO confirm. */
+ for (i = 0; i < (*tree)->PROCTREE_A && firstChildIdx + i < size; i++) {
+ (*tree)->child[i] = (firstChildIdx + i + root) % size;
+ (*tree)->numChildren++;
}
- if (index == (*tree)->root) {
+ if (rank == root) {
(*tree)->isRoot = 1;
} else {
(*tree)->isRoot = 0;
- pr = (index - 1) / (*tree)->PROCTREE_A;
- (*tree)->parent = pr;
+ /* Parent sits at (index - 1) / A in rotated numbering; rotate it back. */
+ (*tree)->parent = (((index - 1) / (*tree)->PROCTREE_A) + root) % size;
}
}
* bcast
**/
static void tree_bcast(void *buf, int count, MPI_Datatype datatype,
- int root, MPI_Comm comm, proc_tree_t tree)
+ MPI_Comm comm, proc_tree_t tree)
{
int system_tag = 999; // used negative int but smpi_create_request() declares this illegal (to be checked)
int rank, i;
* anti-bcast
**/
static void tree_antibcast(void *buf, int count, MPI_Datatype datatype,
- int root, MPI_Comm comm, proc_tree_t tree)
+ MPI_Comm comm, proc_tree_t tree)
{
int system_tag = 999; // used negative int but smpi_create_request() declares this illegal (to be checked)
int rank, i;
rank = smpi_comm_rank(comm);
size = smpi_comm_size(comm);
- build_tree(rank, size, &tree);
- tree_bcast(buf, count, datatype, root, comm, tree);
+ build_tree(root, rank, size, &tree);
+ tree_bcast(buf, count, datatype, comm, tree);
free_tree(tree);
}
rank = smpi_comm_rank(comm);
size = smpi_comm_size(comm);
- build_tree(rank, size, &tree);
- tree_antibcast(&dummy, 1, MPI_CHAR, 0, comm, tree);
- tree_bcast(&dummy, 1, MPI_CHAR, 0, comm, tree);
+ build_tree(0, rank, size, &tree);
+ tree_antibcast(&dummy, 1, MPI_CHAR, comm, tree);
+ tree_bcast(&dummy, 1, MPI_CHAR, comm, tree);
free_tree(tree);
}