From: Pierre Nicolas CLAUSS Date: Thu, 28 Jul 2011 08:50:49 +0000 (+0000) Subject: Merge branch 'master' into smpi X-Git-Tag: exp_20120216~195^2~33 X-Git-Url: http://info.iut-bm.univ-fcomte.fr/pub/gitweb/simgrid.git/commitdiff_plain/2f93912677c9c5df63406a714b031993a7455d6c?hp=edacdb522ab2aaf5f4246347f9440db4fc2bdd96 Merge branch 'master' into smpi --- diff --git a/include/smpi/smpif.h.in b/include/smpi/smpif.h.in index 3e8ddf7b73..bbcb661ac1 100644 --- a/include/smpi/smpif.h.in +++ b/include/smpi/smpif.h.in @@ -17,6 +17,8 @@ #include #include +XBT_PUBLIC_DATA(__thread int) smpi_current_rank; + XBT_PUBLIC(int) smpi_process_argc(void); XBT_PUBLIC(int) smpi_process_getarg(integer* index, char* dst, ftnlen len); XBT_PUBLIC(int) smpi_global_size(void); diff --git a/src/simix/smx_network.c b/src/simix/smx_network.c index 1325aac2ae..d16566faf9 100644 --- a/src/simix/smx_network.c +++ b/src/simix/smx_network.c @@ -845,10 +845,8 @@ void SIMIX_comm_copy_data(smx_action_t comm) if (comm->comm.dst_buff_size) *comm->comm.dst_buff_size = buff_size; - if (buff_size == 0) - return; - - (*SIMIX_comm_copy_data_callback) (comm, buff_size); + if (buff_size > 0) + (*SIMIX_comm_copy_data_callback) (comm, buff_size); /* Set the copied flag so we copy data only once */ /* (this function might be called from both communication ends) */ diff --git a/src/smpi/smpi_base.c b/src/smpi/smpi_base.c index d1945536a1..9ba0db2f17 100644 --- a/src/smpi/smpi_base.c +++ b/src/smpi/smpi_base.c @@ -26,8 +26,7 @@ static int match_recv(void* a, void* b) { xbt_assert(ref, "Cannot match recv against null reference"); xbt_assert(req, "Cannot match recv against null request"); - return req->comm == ref->comm - && (ref->src == MPI_ANY_SOURCE || req->src == ref->src) + return (ref->src == MPI_ANY_SOURCE || req->src == ref->src) && (ref->tag == MPI_ANY_TAG || req->tag == ref->tag); } @@ -37,8 +36,7 @@ static int match_send(void* a, void* b) { xbt_assert(ref, "Cannot match send against null reference"); xbt_assert(req, "Cannot match send against null request"); - return req->comm == ref->comm - && (req->src == MPI_ANY_SOURCE || req->src == ref->src) + return (req->src == MPI_ANY_SOURCE || req->src == ref->src) && (req->tag == MPI_ANY_TAG || req->tag == ref->tag); } @@ -50,6 +48,7 @@ static MPI_Request build_request(void *buf, int count, request = xbt_new(s_smpi_mpi_request_t, 1); request->buf = buf; + // FIXME: this will have to be changed to support non-contiguous datatypes request->size = smpi_datatype_size(datatype) * count; request->src = src; request->dst = dst; @@ -94,10 +93,13 @@ void smpi_mpi_start(MPI_Request request) if(request->flags & RECV) { print_request("New recv", request); mailbox = smpi_process_mailbox(); + // FIXME: SIMIX does not yet support don-contiguous datatypes request->action = SIMIX_req_comm_irecv(mailbox, request->buf, &request->size, &match_recv, request); } else { print_request("New send", request); - mailbox = smpi_process_remote_mailbox(request->dst); + mailbox = smpi_process_remote_mailbox( + smpi_group_index(smpi_comm_group(request->comm), request->dst)); + // FIXME: SIMIX does not yet support don-contiguous datatypes request->action = SIMIX_req_comm_isend(mailbox, request->size, -1.0, request->buf, request->size, &match_send, request, 0); #ifdef HAVE_TRACING @@ -212,6 +214,8 @@ static void finish_wait(MPI_Request * request, MPI_Status * status) status->MPI_SOURCE = req->src; status->MPI_TAG = req->tag; status->MPI_ERROR = MPI_SUCCESS; + // FIXME: really this should just contain the count of receive-type blocks, + // right? status->count = req->size; } print_request("Finishing", req); @@ -253,9 +257,10 @@ int smpi_mpi_testany(int count, MPI_Request requests[], int *index, } } if(size > 0) { - *index = SIMIX_req_comm_testany(comms); - *index = map[*index]; - if(*index != MPI_UNDEFINED) { + i = SIMIX_req_comm_testany(comms); + // FIXME: MPI_UNDEFINED or does SIMIX have a return code? + if(i != MPI_UNDEFINED) { + *index = map[i]; smpi_mpi_wait(&requests[*index], status); flag = 1; } @@ -296,9 +301,12 @@ int smpi_mpi_waitany(int count, MPI_Request requests[], } } if(size > 0) { - index = SIMIX_req_comm_waitany(comms); - index = map[index]; - finish_wait(&requests[index], status); + i = SIMIX_req_comm_waitany(comms); + // FIXME: MPI_UNDEFINED or does SIMIX have a return code? + if (i != MPI_UNDEFINED) { + index = map[i]; + finish_wait(&requests[index], status); + } } xbt_free(map); xbt_dynar_free(&comms); @@ -362,7 +370,8 @@ void smpi_mpi_gather(void *sendbuf, int sendcount, MPI_Datatype sendtype, int root, MPI_Comm comm) { int system_tag = 666; - int rank, size, src, index, sendsize, recvsize; + int rank, size, src, index; + MPI_Aint lb = 0, recvext = 0; MPI_Request *requests; rank = smpi_comm_rank(comm); @@ -371,20 +380,19 @@ void smpi_mpi_gather(void *sendbuf, int sendcount, MPI_Datatype sendtype, // Send buffer to root smpi_mpi_send(sendbuf, sendcount, sendtype, root, system_tag, comm); } else { - sendsize = smpi_datatype_size(sendtype); - recvsize = smpi_datatype_size(recvtype); + // FIXME: check for errors + smpi_datatype_extent(recvtype, &lb, &recvext); // Local copy from root - memcpy(&((char *) recvbuf)[root * recvcount * recvsize], sendbuf, - sendcount * sendsize * sizeof(char)); + smpi_datatype_copy(sendbuf, sendcount, sendtype, + (char *)recvbuf + root * recvcount * recvext, recvcount, recvtype); // Receive buffers from senders requests = xbt_new(MPI_Request, size - 1); index = 0; for(src = 0; src < size; src++) { if(src != root) { - requests[index] = smpi_irecv_init(&((char *) recvbuf) - [src * recvcount * recvsize], - recvcount, recvtype, src, - system_tag, comm); + requests[index] = smpi_irecv_init((char *)recvbuf + src * recvcount * recvext, + recvcount, recvtype, + src, system_tag, comm); index++; } } @@ -400,7 +408,8 @@ void smpi_mpi_gatherv(void *sendbuf, int sendcount, MPI_Datatype sendtype, MPI_Datatype recvtype, int root, MPI_Comm comm) { int system_tag = 666; - int rank, size, src, index, sendsize; + int rank, size, src, index; + MPI_Aint lb = 0, recvext = 0; MPI_Request *requests; rank = smpi_comm_rank(comm); @@ -409,19 +418,20 @@ void smpi_mpi_gatherv(void *sendbuf, int sendcount, MPI_Datatype sendtype, // Send buffer to root smpi_mpi_send(sendbuf, sendcount, sendtype, root, system_tag, comm); } else { - sendsize = smpi_datatype_size(sendtype); + // FIXME: check for errors + smpi_datatype_extent(recvtype, &lb, &recvext); // Local copy from root - memcpy(&((char *) recvbuf)[displs[root]], sendbuf, - sendcount * sendsize * sizeof(char)); + smpi_datatype_copy(sendbuf, sendcount, sendtype, + (char *)recvbuf + displs[root] * recvext, + recvcounts[root], recvtype); // Receive buffers from senders requests = xbt_new(MPI_Request, size - 1); index = 0; for(src = 0; src < size; src++) { if(src != root) { requests[index] = - smpi_irecv_init(&((char *) recvbuf)[displs[src]], - recvcounts[src], recvtype, src, system_tag, - comm); + smpi_irecv_init((char *)recvbuf + displs[src] * recvext, + recvcounts[src], recvtype, src, system_tag, comm); index++; } } @@ -438,16 +448,18 @@ void smpi_mpi_allgather(void *sendbuf, int sendcount, MPI_Comm comm) { int system_tag = 666; - int rank, size, other, index, sendsize, recvsize; + int rank, size, other, index; + MPI_Aint lb = 0, recvext = 0; MPI_Request *requests; rank = smpi_comm_rank(comm); size = smpi_comm_size(comm); - sendsize = smpi_datatype_size(sendtype); - recvsize = smpi_datatype_size(recvtype); + // FIXME: check for errors + smpi_datatype_extent(recvtype, &lb, &recvext); // Local copy from self - memcpy(&((char *) recvbuf)[rank * recvcount * recvsize], sendbuf, - sendcount * sendsize * sizeof(char)); + smpi_datatype_copy(sendbuf, sendcount, sendtype, + (char *)recvbuf + rank * recvcount * recvext, recvcount, + recvtype); // Send/Recv buffers to/from others; requests = xbt_new(MPI_Request, 2 * (size - 1)); index = 0; @@ -457,9 +469,8 @@ void smpi_mpi_allgather(void *sendbuf, int sendcount, smpi_isend_init(sendbuf, sendcount, sendtype, other, system_tag, comm); index++; - requests[index] = smpi_irecv_init(&((char *) recvbuf) - [other * recvcount * recvsize], - recvcount, recvtype, other, + requests[index] = smpi_irecv_init((char *)recvbuf + other * recvcount * recvext, + recvcount, recvtype, other, system_tag, comm); index++; } @@ -476,15 +487,18 @@ void smpi_mpi_allgatherv(void *sendbuf, int sendcount, MPI_Datatype recvtype, MPI_Comm comm) { int system_tag = 666; - int rank, size, other, index, sendsize; + int rank, size, other, index; + MPI_Aint lb = 0, recvext = 0; MPI_Request *requests; rank = smpi_comm_rank(comm); size = smpi_comm_size(comm); - sendsize = smpi_datatype_size(sendtype); + // FIXME: check for errors + smpi_datatype_extent(recvtype, &lb, &recvext); // Local copy from self - memcpy(&((char *) recvbuf)[displs[rank]], sendbuf, - sendcount * sendsize * sizeof(char)); + smpi_datatype_copy(sendbuf, sendcount, sendtype, + (char *)recvbuf + displs[rank] * recvext, + recvcounts[rank], recvtype); // Send buffers to others; requests = xbt_new(MPI_Request, 2 * (size - 1)); index = 0; @@ -495,9 +509,8 @@ void smpi_mpi_allgatherv(void *sendbuf, int sendcount, comm); index++; requests[index] = - smpi_irecv_init(&((char *) recvbuf)[displs[other]], - recvcounts[other], recvtype, other, system_tag, - comm); + smpi_irecv_init((char *)recvbuf + displs[other] * recvext, recvcounts[other], + recvtype, other, system_tag, comm); index++; } } @@ -512,7 +525,8 @@ void smpi_mpi_scatter(void *sendbuf, int sendcount, MPI_Datatype sendtype, int root, MPI_Comm comm) { int system_tag = 666; - int rank, size, dst, index, sendsize, recvsize; + int rank, size, dst, index; + MPI_Aint lb = 0, sendext = 0; MPI_Request *requests; rank = smpi_comm_rank(comm); @@ -522,18 +536,17 @@ void smpi_mpi_scatter(void *sendbuf, int sendcount, MPI_Datatype sendtype, smpi_mpi_recv(recvbuf, recvcount, recvtype, root, system_tag, comm, MPI_STATUS_IGNORE); } else { - sendsize = smpi_datatype_size(sendtype); - recvsize = smpi_datatype_size(recvtype); + // FIXME: check for errors + smpi_datatype_extent(sendtype, &lb, &sendext); // Local copy from root - memcpy(recvbuf, &((char *) sendbuf)[root * sendcount * sendsize], - recvcount * recvsize * sizeof(char)); + smpi_datatype_copy((char *)sendbuf + root * sendcount * sendext, + sendcount, sendtype, recvbuf, recvcount, recvtype); // Send buffers to receivers requests = xbt_new(MPI_Request, size - 1); index = 0; for(dst = 0; dst < size; dst++) { if(dst != root) { - requests[index] = smpi_isend_init(&((char *) sendbuf) - [dst * sendcount * sendsize], + requests[index] = smpi_isend_init((char *)sendbuf + dst * sendcount * sendext, sendcount, sendtype, dst, system_tag, comm); index++; @@ -551,7 +564,8 @@ void smpi_mpi_scatterv(void *sendbuf, int *sendcounts, int *displs, MPI_Datatype recvtype, int root, MPI_Comm comm) { int system_tag = 666; - int rank, size, dst, index, recvsize; + int rank, size, dst, index; + MPI_Aint lb = 0, sendext = 0; MPI_Request *requests; rank = smpi_comm_rank(comm); @@ -561,19 +575,19 @@ void smpi_mpi_scatterv(void *sendbuf, int *sendcounts, int *displs, smpi_mpi_recv(recvbuf, recvcount, recvtype, root, system_tag, comm, MPI_STATUS_IGNORE); } else { - recvsize = smpi_datatype_size(recvtype); + // FIXME: check for errors + smpi_datatype_extent(sendtype, &lb, &sendext); // Local copy from root - memcpy(recvbuf, &((char *) sendbuf)[displs[root]], - recvcount * recvsize * sizeof(char)); + smpi_datatype_copy((char *)sendbuf + displs[root] * sendext, sendcounts[root], + sendtype, recvbuf, recvcount, recvtype); // Send buffers to receivers requests = xbt_new(MPI_Request, size - 1); index = 0; for(dst = 0; dst < size; dst++) { if(dst != root) { requests[index] = - smpi_isend_init(&((char *) sendbuf)[displs[dst]], - sendcounts[dst], sendtype, dst, system_tag, - comm); + smpi_isend_init((char *)sendbuf + displs[dst] * sendext, sendcounts[dst], + sendtype, dst, system_tag, comm); index++; } } @@ -589,7 +603,8 @@ void smpi_mpi_reduce(void *sendbuf, void *recvbuf, int count, MPI_Comm comm) { int system_tag = 666; - int rank, size, src, index, datasize; + int rank, size, src, index; + MPI_Aint lb = 0, dataext = 0; MPI_Request *requests; void **tmpbufs; @@ -599,9 +614,10 @@ void smpi_mpi_reduce(void *sendbuf, void *recvbuf, int count, // Send buffer to root smpi_mpi_send(sendbuf, count, datatype, root, system_tag, comm); } else { - datasize = smpi_datatype_size(datatype); + // FIXME: check for errors + smpi_datatype_extent(datatype, &lb, &dataext); // Local copy from root - memcpy(recvbuf, sendbuf, count * datasize * sizeof(char)); + smpi_datatype_copy(sendbuf, count, datatype, recvbuf, count, datatype); // Receive buffers from senders //TODO: make a MPI_barrier here ? requests = xbt_new(MPI_Request, size - 1); @@ -609,7 +625,9 @@ void smpi_mpi_reduce(void *sendbuf, void *recvbuf, int count, index = 0; for(src = 0; src < size; src++) { if(src != root) { - tmpbufs[index] = xbt_malloc(count * datasize); + // FIXME: possibly overkill we we have contiguous/noncontiguous data + // mapping... + tmpbufs[index] = xbt_malloc(count * dataext); requests[index] = smpi_irecv_init(tmpbufs[index], count, datatype, src, system_tag, comm); @@ -638,73 +656,34 @@ void smpi_mpi_allreduce(void *sendbuf, void *recvbuf, int count, { smpi_mpi_reduce(sendbuf, recvbuf, count, datatype, op, 0, comm); smpi_mpi_bcast(recvbuf, count, datatype, 0, comm); - -/* -FIXME: buggy implementation - - int system_tag = 666; - int rank, size, other, index, datasize; - MPI_Request* requests; - void** tmpbufs; - - rank = smpi_comm_rank(comm); - size = smpi_comm_size(comm); - datasize = smpi_datatype_size(datatype); - // Local copy from self - memcpy(recvbuf, sendbuf, count * datasize * sizeof(char)); - // Send/Recv buffers to/from others; - //TODO: make a MPI_barrier here ? - requests = xbt_new(MPI_Request, 2 * (size - 1)); - tmpbufs = xbt_new(void*, size - 1); - index = 0; - for(other = 0; other < size; other++) { - if(other != rank) { - tmpbufs[index / 2] = xbt_malloc(count * datasize); - requests[index] = smpi_mpi_isend(sendbuf, count, datatype, other, system_tag, comm); - requests[index + 1] = smpi_mpi_irecv(tmpbufs[index / 2], count, datatype, other, system_tag, comm); - index += 2; - } - } - // Wait for completion of all comms. - for(other = 0; other < 2 * (size - 1); other++) { - index = smpi_mpi_waitany(size - 1, requests, MPI_STATUS_IGNORE); - if(index == MPI_UNDEFINED) { - break; - } - if((index & 1) == 1) { - // Request is odd: it's a irecv - smpi_op_apply(op, tmpbufs[index / 2], recvbuf, &count, &datatype); - } - } - for(index = 0; index < size - 1; index++) { - xbt_free(tmpbufs[index]); - } - xbt_free(tmpbufs); - xbt_free(requests); -*/ } void smpi_mpi_scan(void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, MPI_Comm comm) { int system_tag = 666; - int rank, size, other, index, datasize; - int total; + int rank, size, other, index; + MPI_Aint lb = 0, dataext = 0; MPI_Request *requests; void **tmpbufs; rank = smpi_comm_rank(comm); size = smpi_comm_size(comm); - datasize = smpi_datatype_size(datatype); + + // FIXME: check for errors + smpi_datatype_extent(datatype, &lb, &dataext); + // Local copy from self - memcpy(recvbuf, sendbuf, count * datasize * sizeof(char)); + smpi_datatype_copy(sendbuf, count, datatype, recvbuf, count, datatype); + // Send/Recv buffers to/from others; - total = rank + (size - (rank + 1)); - requests = xbt_new(MPI_Request, total); + requests = xbt_new(MPI_Request, size - 1); tmpbufs = xbt_new(void *, rank); index = 0; for(other = 0; other < rank; other++) { - tmpbufs[index] = xbt_malloc(count * datasize); + // FIXME: possibly overkill we we have contiguous/noncontiguous data + // mapping... + tmpbufs[index] = xbt_malloc(count * dataext); requests[index] = smpi_irecv_init(tmpbufs[index], count, datatype, other, system_tag, comm); @@ -717,7 +696,7 @@ void smpi_mpi_scan(void *sendbuf, void *recvbuf, int count, } // Wait for completion of all comms. smpi_mpi_startall(size - 1, requests); - for(other = 0; other < total; other++) { + for(other = 0; other < size - 1; other++) { index = smpi_mpi_waitany(size - 1, requests, MPI_STATUS_IGNORE); if(index == MPI_UNDEFINED) { break; @@ -727,7 +706,7 @@ void smpi_mpi_scan(void *sendbuf, void *recvbuf, int count, smpi_op_apply(op, tmpbufs[index], recvbuf, &count, &datatype); } } - for(index = 0; index < size - 1; index++) { + for(index = 0; index < rank; index++) { xbt_free(tmpbufs[index]); } xbt_free(tmpbufs); diff --git a/src/smpi/smpi_bench.c b/src/smpi/smpi_bench.c index c9d2c8b4ab..195aa7289d 100644 --- a/src/smpi/smpi_bench.c +++ b/src/smpi/smpi_bench.c @@ -10,18 +10,103 @@ #include "xbt/ex.h" #include "surf/surf.h" +#include +#include +#include +#include +#include +#include +#include +#include + XBT_LOG_NEW_DEFAULT_SUBCATEGORY(smpi_bench, smpi, "Logging specific to SMPI (benchmarking)"); -xbt_dict_t allocs = NULL; /* Allocated on first use */ -xbt_dict_t samples = NULL; /* Allocated on first use */ -xbt_dict_t calls = NULL; /* Allocated on first use */ +/* Shared allocations are handled through shared memory segments. + * Associated data and metadata are used as follows: + * + * mmap #1 + * `allocs' dict ---- -. + * ---------- shared_data_t shared_metadata_t / | | | + * .->| | ---> -------------------- <--. ----------------- | | | | + * | ---------- | fd of | | | size of mmap | --| | | | + * | | count (2) | |-- | data | \ | | | + * `----------------- | | | ----------------- ---- | + * -------------------- | ^ | + * | | | + * | | `allocs_metadata' dict | + * | | ---------------------- | + * | `-- | |<-' + * | .-- | |<-. + * | | ---------------------- | + * | | | + * | | | + * | | | + * | | mmap #2 | + * | v ---- -' + * | shared_metadata_t / | | + * | ----------------- | | | + * | | size of mmap | --| | | + * `-- | data | | | | + * ----------------- | | | + * \ | | + * ---- + */ + +#define PTR_STRLEN (2 + 2 * sizeof(void*) + 1) + +xbt_dict_t allocs = NULL; /* Allocated on first use */ +xbt_dict_t allocs_metadata = NULL; /* Allocated on first use */ +xbt_dict_t samples = NULL; /* Allocated on first use */ +xbt_dict_t calls = NULL; /* Allocated on first use */ +__thread int smpi_current_rank = 0; /* Updated after each MPI call */ typedef struct { + int fd; int count; - char data[]; + char* loc; } shared_data_t; +typedef struct { + size_t size; + shared_data_t* data; +} shared_metadata_t; + +static size_t shm_size(int fd) { + struct stat st; + + if(fstat(fd, &st) < 0) { + xbt_die("Could not stat fd %d: %s", fd, strerror(errno)); + } + return (size_t)st.st_size; +} + +static void* shm_map(int fd, size_t size, shared_data_t* data) { + void* mem; + char loc[PTR_STRLEN]; + shared_metadata_t* meta; + + if(size > shm_size(fd)) { + if(ftruncate(fd, (off_t)size) < 0) { + xbt_die("Could not truncate fd %d to %zu: %s", fd, size, strerror(errno)); + } + } + mem = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + if(mem == MAP_FAILED) { + xbt_die("Could not map fd %d: %s", fd, strerror(errno)); + } + if(!allocs_metadata) { + allocs_metadata = xbt_dict_new(); + } + snprintf(loc, PTR_STRLEN, "%p", mem); + meta = xbt_new(shared_metadata_t, 1); + meta->size = size; + meta->data = data; + xbt_dict_set(allocs_metadata, loc, meta, &free); + XBT_DEBUG("MMAP %zu to %p", size, mem); + return mem; +} + typedef struct { int count; double sum; @@ -73,6 +158,7 @@ static void smpi_execute(double duration) void smpi_bench_begin(void) { xbt_os_timer_start(smpi_process_timer()); + smpi_current_rank = smpi_process_index(); } void smpi_bench_end(void) @@ -191,41 +277,83 @@ void smpi_sample_flops(double flops) void *smpi_shared_malloc(size_t size, const char *file, int line) { - char *loc = bprintf("%s:%d:%zu", file, line, size); + char *loc = bprintf("%zu_%s_%d", (size_t)getpid(), file, line); + size_t len = strlen(loc); + size_t i; + int fd; + void* mem; shared_data_t *data; + for(i = 0; i < len; i++) { + /* Make the 'loc' ID be a flat filename */ + if(loc[i] == '/') { + loc[i] = '_'; + } + } if (!allocs) { allocs = xbt_dict_new(); } data = xbt_dict_get_or_null(allocs, loc); - if (!data) { - data = (shared_data_t *) xbt_malloc0(sizeof(int) + size); + if(!data) { + fd = shm_open(loc, O_RDWR | O_CREAT | O_EXCL, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH); + if(fd < 0) { + switch(errno) { + case EEXIST: + xbt_die("Please cleanup /dev/shm/%s", loc); + default: + xbt_die("An unhandled error occured while opening %s: %s", loc, strerror(errno)); + } + } + data = xbt_new(shared_data_t, 1); + data->fd = fd; data->count = 1; + data->loc = loc; + mem = shm_map(fd, size, data); + if(shm_unlink(loc) < 0) { + XBT_WARN("Could not early unlink %s: %s", loc, strerror(errno)); + } xbt_dict_set(allocs, loc, data, &free); + XBT_DEBUG("Mapping %s at %p through %d", loc, mem, fd); } else { + mem = shm_map(data->fd, size, data); data->count++; } - free(loc); - return data->data; + XBT_DEBUG("Malloc %zu in %p (metadata at %p)", size, mem, data); + return mem; } void smpi_shared_free(void *ptr) { - shared_data_t *data = (shared_data_t *) ((int *) ptr - 1); - char *loc; + char loc[PTR_STRLEN]; + shared_metadata_t* meta; + shared_data_t* data; if (!allocs) { XBT_WARN("Cannot free: nothing was allocated"); return; } - loc = xbt_dict_get_key(allocs, data); - if (!loc) { + if(!allocs_metadata) { + XBT_WARN("Cannot free: no metadata was allocated"); + } + snprintf(loc, PTR_STRLEN, "%p", ptr); + meta = (shared_metadata_t*)xbt_dict_get_or_null(allocs_metadata, loc); + if (!meta) { XBT_WARN("Cannot free: %p was not shared-allocated by SMPI", ptr); return; } + data = meta->data; + if(!data) { + XBT_WARN("Cannot free: something is broken in the metadata link"); + return; + } + if(munmap(ptr, meta->size) < 0) { + XBT_WARN("Unmapping of fd %d failed: %s", data->fd, strerror(errno)); + } data->count--; if (data->count <= 0) { - xbt_dict_remove(allocs, loc); + close(data->fd); + xbt_dict_remove(allocs, data->loc); + free(data->loc); } } diff --git a/src/smpi/smpi_coll.c b/src/smpi/smpi_coll.c index a12de9c56e..7798de86ea 100644 --- a/src/smpi/smpi_coll.c +++ b/src/smpi/smpi_coll.c @@ -58,31 +58,28 @@ static void free_tree(proc_tree_t tree) /** * Build the tree depending on a process rank (index) and the group size (extent) - * @param index the rank of the calling process - * @param extent the total number of processes + * @param root the rank of the tree root + * @param rank the rank of the calling process + * @param size the total number of processes **/ -static void build_tree(int index, int extent, proc_tree_t * tree) +static void build_tree(int root, int rank, int size, proc_tree_t * tree) { - int places = (*tree)->PROCTREE_A * index; - int i, ch, pr; + int index = (rank - root + size) % size; + int firstChildIdx = index * (*tree)->PROCTREE_A + 1; + int i; - (*tree)->me = index; - (*tree)->root = 0; - for (i = 1; i <= (*tree)->PROCTREE_A; i++) { - ++places; - ch = (*tree)->PROCTREE_A * index + i + (*tree)->root; - ch %= extent; - if (places < extent) { - (*tree)->child[i - 1] = ch; - (*tree)->numChildren++; - } + (*tree)->me = rank; + (*tree)->root = root; + + for (i = 0; i < (*tree)->PROCTREE_A && firstChildIdx + i < size; i++) { + (*tree)->child[i] = (firstChildIdx + i + root) % size; + (*tree)->numChildren++; } - if (index == (*tree)->root) { + if (rank == root) { (*tree)->isRoot = 1; } else { (*tree)->isRoot = 0; - pr = (index - 1) / (*tree)->PROCTREE_A; - (*tree)->parent = pr; + (*tree)->parent = (((index - 1) / (*tree)->PROCTREE_A) + root) % size; } } @@ -90,7 +87,7 @@ static void build_tree(int index, int extent, proc_tree_t * tree) * bcast **/ static void tree_bcast(void *buf, int count, MPI_Datatype datatype, - int root, MPI_Comm comm, proc_tree_t tree) + MPI_Comm comm, proc_tree_t tree) { int system_tag = 999; // used negative int but smpi_create_request() declares this illegal (to be checked) int rank, i; @@ -128,7 +125,7 @@ static void tree_bcast(void *buf, int count, MPI_Datatype datatype, * anti-bcast **/ static void tree_antibcast(void *buf, int count, MPI_Datatype datatype, - int root, MPI_Comm comm, proc_tree_t tree) + MPI_Comm comm, proc_tree_t tree) { int system_tag = 999; // used negative int but smpi_create_request() declares this illegal (to be checked) int rank, i; @@ -173,8 +170,8 @@ void nary_tree_bcast(void *buf, int count, MPI_Datatype datatype, int root, rank = smpi_comm_rank(comm); size = smpi_comm_size(comm); - build_tree(rank, size, &tree); - tree_bcast(buf, count, datatype, root, comm, tree); + build_tree(root, rank, size, &tree); + tree_bcast(buf, count, datatype, comm, tree); free_tree(tree); } @@ -189,9 +186,9 @@ void nary_tree_barrier(MPI_Comm comm, int arity) rank = smpi_comm_rank(comm); size = smpi_comm_size(comm); - build_tree(rank, size, &tree); - tree_antibcast(&dummy, 1, MPI_CHAR, 0, comm, tree); - tree_bcast(&dummy, 1, MPI_CHAR, 0, comm, tree); + build_tree(0, rank, size, &tree); + tree_antibcast(&dummy, 1, MPI_CHAR, comm, tree); + tree_bcast(&dummy, 1, MPI_CHAR, comm, tree); free_tree(tree); } @@ -199,6 +196,8 @@ void nary_tree_barrier(MPI_Comm comm, int arity) * Alltoall Bruck * * Openmpi calls this routine when the message size sent to each rank < 2000 bytes and size < 12 + * FIXME: uh, check smpi_pmpi again, but this routine is called for > 12, not + * less... **/ int smpi_coll_tuned_alltoall_bruck(void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, @@ -208,20 +207,21 @@ int smpi_coll_tuned_alltoall_bruck(void *sendbuf, int sendcount, int system_tag = 777; int i, rank, size, err, count; MPI_Aint lb; - MPI_Aint sendextent = 0; - MPI_Aint recvextent = 0; + MPI_Aint sendext = 0; + MPI_Aint recvext = 0; MPI_Request *requests; // FIXME: check implementation rank = smpi_comm_rank(comm); size = smpi_comm_size(comm); XBT_DEBUG("<%d> algorithm alltoall_bruck() called.", rank); - err = smpi_datatype_extent(sendtype, &lb, &sendextent); - err = smpi_datatype_extent(recvtype, &lb, &recvextent); + err = smpi_datatype_extent(sendtype, &lb, &sendext); + err = smpi_datatype_extent(recvtype, &lb, &recvext); /* Local copy from self */ err = - smpi_datatype_copy(&((char *) sendbuf)[rank * sendextent], sendcount, - sendtype, &((char *) recvbuf)[rank * recvextent], + smpi_datatype_copy((char *)sendbuf + rank * sendcount * sendext, + sendcount, sendtype, + (char *)recvbuf + rank * recvcount * recvext, recvcount, recvtype); if (err == MPI_SUCCESS && size > 1) { /* Initiate all send/recv to/from others. */ @@ -235,7 +235,7 @@ int smpi_coll_tuned_alltoall_bruck(void *sendbuf, int sendcount, continue; } requests[count] = - smpi_irecv_init(&((char *) recvbuf)[i * recvextent], recvcount, + smpi_irecv_init((char *)recvbuf + i * recvcount * recvext, recvcount, recvtype, i, system_tag, comm); count++; } @@ -247,7 +247,7 @@ int smpi_coll_tuned_alltoall_bruck(void *sendbuf, int sendcount, continue; } requests[count] = - smpi_isend_init(&((char *) sendbuf)[i * sendextent], sendcount, + smpi_isend_init((char *)sendbuf + i * sendcount * sendext, sendcount, sendtype, i, system_tag, comm); count++; } @@ -271,24 +271,20 @@ int smpi_coll_tuned_alltoall_basic_linear(void *sendbuf, int sendcount, { int system_tag = 888; int i, rank, size, err, count; - MPI_Aint lb; - MPI_Aint sendinc = 0; - MPI_Aint recvinc = 0; + MPI_Aint lb = 0, sendext = 0, recvext = 0; MPI_Request *requests; /* Initialize. */ rank = smpi_comm_rank(comm); size = smpi_comm_size(comm); XBT_DEBUG("<%d> algorithm alltoall_basic_linear() called.", rank); - err = smpi_datatype_extent(sendtype, &lb, &sendinc); - err = smpi_datatype_extent(recvtype, &lb, &recvinc); - sendinc *= sendcount; - recvinc *= recvcount; + err = smpi_datatype_extent(sendtype, &lb, &sendext); + err = smpi_datatype_extent(recvtype, &lb, &recvext); /* simple optimization */ - err = - smpi_datatype_copy(&((char *) sendbuf)[rank * sendinc], sendcount, - sendtype, &((char *) recvbuf)[rank * recvinc], - recvcount, recvtype); + err = smpi_datatype_copy((char *)sendbuf + rank * sendcount * sendext, + sendcount, sendtype, + (char *)recvbuf + rank * recvcount * recvext, + recvcount, recvtype); if (err == MPI_SUCCESS && size > 1) { /* Initiate all send/recv to/from others. */ requests = xbt_new(MPI_Request, 2 * (size - 1)); @@ -296,7 +292,7 @@ int smpi_coll_tuned_alltoall_basic_linear(void *sendbuf, int sendcount, count = 0; for (i = (rank + 1) % size; i != rank; i = (i + 1) % size) { requests[count] = - smpi_irecv_init(&((char *) recvbuf)[i * recvinc], recvcount, + smpi_irecv_init((char *)recvbuf + i * recvcount * recvext, recvcount, recvtype, i, system_tag, comm); count++; } @@ -305,10 +301,9 @@ int smpi_coll_tuned_alltoall_basic_linear(void *sendbuf, int sendcount, * when messages actually arrive in the order in which they were posted. * TODO: check the previous assertion */ - for (i = (rank + size - 1) % size; i != rank; - i = (i + size - 1) % size) { + for (i = (rank + size - 1) % size; i != rank; i = (i + size - 1) % size) { requests[count] = - smpi_isend_init(&((char *) sendbuf)[i * sendinc], sendcount, + smpi_isend_init((char *)sendbuf + i * sendcount * sendext, sendcount, sendtype, i, system_tag, comm); count++; } @@ -367,22 +362,20 @@ int smpi_coll_basic_alltoallv(void *sendbuf, int *sendcounts, { int system_tag = 889; int i, rank, size, err, count; - MPI_Aint lb; - MPI_Aint sendextent = 0; - MPI_Aint recvextent = 0; + MPI_Aint lb = 0, sendext = 0, recvext = 0; MPI_Request *requests; /* Initialize. */ rank = smpi_comm_rank(comm); size = smpi_comm_size(comm); XBT_DEBUG("<%d> algorithm basic_alltoallv() called.", rank); - err = smpi_datatype_extent(sendtype, &lb, &sendextent); - err = smpi_datatype_extent(recvtype, &lb, &recvextent); + err = smpi_datatype_extent(sendtype, &lb, &sendext); + err = smpi_datatype_extent(recvtype, &lb, &recvext); /* Local copy from self */ err = - smpi_datatype_copy(&((char *) sendbuf)[senddisps[rank] * sendextent], + smpi_datatype_copy((char *)sendbuf + senddisps[rank] * sendext, sendcounts[rank], sendtype, - &((char *) recvbuf)[recvdisps[rank] * recvextent], + (char *)recvbuf + recvdisps[rank] * recvext, recvcounts[rank], recvtype); if (err == MPI_SUCCESS && size > 1) { /* Initiate all send/recv to/from others. */ @@ -397,7 +390,7 @@ int smpi_coll_basic_alltoallv(void *sendbuf, int *sendcounts, continue; } requests[count] = - smpi_irecv_init(&((char *) recvbuf)[recvdisps[i] * recvextent], + smpi_irecv_init((char *)recvbuf + recvdisps[i] * recvext, recvcounts[i], recvtype, i, system_tag, comm); count++; } @@ -410,7 +403,7 @@ int smpi_coll_basic_alltoallv(void *sendbuf, int *sendcounts, continue; } requests[count] = - smpi_isend_init(&((char *) sendbuf)[senddisps[i] * sendextent], + smpi_isend_init((char *)sendbuf + senddisps[i] * sendext, sendcounts[i], sendtype, i, system_tag, comm); count++; } diff --git a/src/smpi/smpi_global.c b/src/smpi/smpi_global.c index 1b98966471..ad839cbbf5 100644 --- a/src/smpi/smpi_global.c +++ b/src/smpi/smpi_global.c @@ -5,6 +5,7 @@ * under the terms of the license (GNU LGPL) which comes with this package. */ #include +#include #include #include "private.h" @@ -98,6 +99,7 @@ int smpi_global_size(void) { char* value = getenv("SMPI_GLOBAL_SIZE"); if(!value) { + fprintf(stderr, "Please set env var SMPI_GLOBAL_SIZE to expected number of processes.\n"); abort(); } return atoi(value); diff --git a/src/smpi/smpif2c.in b/src/smpi/smpif2c.in index 67ebfc3814..bdcbe867e2 100755 --- a/src/smpi/smpif2c.in +++ b/src/smpi/smpif2c.in @@ -31,7 +31,7 @@ foreach my $fortran (@ARGV) { if(/^} (.*?);/) { $_ = "}* __attribute__((weak)) $1 = NULL;\n"; } elsif(/^#define\s*(\S*)\s*\(?([^.]*)(\..*?)?\)?$/) { - $_ = "#define $1 $2\[smpi_process_index()\]"; + $_ = "#define $1 $2\[smpi_current_rank\]"; if(defined $3) { $_ .= $3; }