From: Martin Quinson Date: Fri, 14 Apr 2017 13:26:13 +0000 (+0200) Subject: Merge branch 'master' of github.com:simgrid/simgrid X-Git-Tag: v3.16~353 X-Git-Url: http://info.iut-bm.univ-fcomte.fr/pub/gitweb/simgrid.git/commitdiff_plain/617a52e4b7e6b644578d242ce1ced9867fecc082?hp=2155dbd23cfe5c344d2ac177e7bfee31e9259ff9 Merge branch 'master' of github.com:simgrid/simgrid --- diff --git a/.gitignore b/.gitignore index 466f2bfb94..74adcccb87 100644 --- a/.gitignore +++ b/.gitignore @@ -986,6 +986,8 @@ teshsuite/smpi/coll-reduce/coll-reduce teshsuite/smpi/coll-reduce-scatter/coll-reduce-scatter teshsuite/smpi/coll-scatter/coll-scatter teshsuite/smpi/macro-shared/macro-shared +teshsuite/smpi/macro-partial-shared/macro-partial-shared +teshsuite/smpi/macro-partial-shared-communication/macro-partial-shared-communication teshsuite/smpi/type-struct/type-struct teshsuite/smpi/type-vector/type-vector teshsuite/s4u/actor/actor diff --git a/include/smpi/smpi.h b/include/smpi/smpi.h index 2d8e740c1e..311d9aa7c9 100644 --- a/include/smpi/smpi.h +++ b/include/smpi/smpi.h @@ -896,9 +896,11 @@ XBT_PUBLIC(void) smpi_trace_set_call_location__(const char *file, int* line); #define SMPI_SAMPLE_DELAY(duration) for(smpi_execute(duration); 0; ) #define SMPI_SAMPLE_FLOPS(flops) for(smpi_execute_flops(flops); 0; ) -XBT_PUBLIC(int) smpi_is_shared(void *buf); XBT_PUBLIC(void *) smpi_shared_malloc(size_t size, const char *file, int line); #define SMPI_SHARED_MALLOC(size) smpi_shared_malloc(size, __FILE__, __LINE__) +XBT_PUBLIC(void *) smpi_shared_malloc_global__(size_t size, const char *file, int line, size_t *shared_block_offsets, int nb_shared_blocks); +#define SMPI_PARTIAL_SHARED_MALLOC(size, shared_block_offsets, nb_shared_blocks)\ + smpi_shared_malloc_global__(size, __FILE__, __LINE__, shared_block_offsets, nb_shared_blocks) XBT_PUBLIC(void) smpi_shared_free(void *data); #define SMPI_SHARED_FREE(data) smpi_shared_free(data) diff --git a/include/smpi/smpi_shared_malloc.hpp 
b/include/smpi/smpi_shared_malloc.hpp new file mode 100644 index 0000000000..a047bf88f5 --- /dev/null +++ b/include/smpi/smpi_shared_malloc.hpp @@ -0,0 +1,18 @@ +#ifndef SMPI_SHARED_HPP +#define SMPI_SHARED_HPP +#include +#include +#include + + +/* + * We cannot put this declaration in smpi.h, since we use C++ features. + */ + + +XBT_PUBLIC(int) smpi_is_shared(void* ptr, std::vector> &private_blocks, size_t *offset); + +std::vector> shift_and_frame_private_blocks(const std::vector> vec, size_t offset, size_t buff_size); +std::vector> merge_private_blocks(std::vector> src, std::vector> dst); + +#endif diff --git a/src/smpi/smpi_datatype.cpp b/src/smpi/smpi_datatype.cpp index 4ce49854c2..2a774f30ac 100644 --- a/src/smpi/smpi_datatype.cpp +++ b/src/smpi/smpi_datatype.cpp @@ -271,11 +271,14 @@ int Datatype::copy(void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype){ int count; +// FIXME Handle the case of a partial shared malloc. +#if 0 if(smpi_is_shared(sendbuf)){ XBT_DEBUG("Copy input buf %p is shared. Let's ignore it.", sendbuf); }else if(smpi_is_shared(recvbuf)){ XBT_DEBUG("Copy output buf %p is shared. 
Let's ignore it.", recvbuf); } +#endif if(smpi_privatize_global_variables == SMPI_PRIVATIZE_MMAP){ smpi_switch_data_segment(smpi_process()->index()); diff --git a/src/smpi/smpi_global.cpp b/src/smpi/smpi_global.cpp index 72dff8954d..e6034648f6 100644 --- a/src/smpi/smpi_global.cpp +++ b/src/smpi/smpi_global.cpp @@ -14,6 +14,7 @@ #include "private.h" #include "private.hpp" #include "simgrid/s4u/Mailbox.hpp" +#include "smpi/smpi_shared_malloc.hpp" #include "simgrid/sg_config.h" #include "src/kernel/activity/SynchroComm.hpp" #include "src/mc/mc_record.h" @@ -129,49 +130,83 @@ void smpi_comm_set_copy_data_callback(void (*callback) (smx_activity_t, void*, s smpi_comm_copy_data_callback = callback; } +static void print(std::vector> vec) { + fprintf(stderr, "{"); + for(auto elt: vec) { + fprintf(stderr, "(0x%lx, 0x%lx),", elt.first, elt.second); + } + fprintf(stderr, "}\n"); +} +static void memcpy_private(void *dest, const void *src, size_t n, std::vector> &private_blocks) { + for(auto block : private_blocks) { + memcpy((uint8_t*)dest+block.first, (uint8_t*)src+block.first, block.second-block.first); + } +} + +static void check_blocks(std::vector> &private_blocks, size_t buff_size) { + for(auto block : private_blocks) { + xbt_assert(block.first <= block.second && block.second <= buff_size, "Oops, bug in shared malloc."); + } +} + void smpi_comm_copy_buffer_callback(smx_activity_t synchro, void *buff, size_t buff_size) { - simgrid::kernel::activity::Comm *comm = dynamic_cast(synchro); - + int src_shared=0, dst_shared=0; + size_t src_offset=0, dst_offset=0; + std::vector> src_private_blocks; + std::vector> dst_private_blocks; XBT_DEBUG("Copy the data over"); - if(smpi_is_shared(buff)){ + if((src_shared=smpi_is_shared(buff, src_private_blocks, &src_offset))) { XBT_DEBUG("Sender %p is shared. 
Let's ignore it.", buff); - }else if(smpi_is_shared((char*)comm->dst_buff)){ + src_private_blocks = shift_and_frame_private_blocks(src_private_blocks, src_offset, buff_size); + } + else { + src_private_blocks.clear(); + src_private_blocks.push_back(std::make_pair(0, buff_size)); + } + if((dst_shared=smpi_is_shared((char*)comm->dst_buff, dst_private_blocks, &dst_offset))) { XBT_DEBUG("Receiver %p is shared. Let's ignore it.", (char*)comm->dst_buff); - }else{ - void* tmpbuff=buff; - if((smpi_privatize_global_variables == SMPI_PRIVATIZE_MMAP) && (static_cast(buff) >= smpi_start_data_exe) - && (static_cast(buff) < smpi_start_data_exe + smpi_size_data_exe ) - ){ - XBT_DEBUG("Privatization : We are copying from a zone inside global memory... Saving data to temp buffer !"); - - smpi_switch_data_segment( - (static_cast((static_cast(comm->src_proc->data)->data))->index())); - tmpbuff = static_cast(xbt_malloc(buff_size)); - memcpy(tmpbuff, buff, buff_size); - } - - if((smpi_privatize_global_variables == SMPI_PRIVATIZE_MMAP) && ((char*)comm->dst_buff >= smpi_start_data_exe) - && ((char*)comm->dst_buff < smpi_start_data_exe + smpi_size_data_exe )){ - XBT_DEBUG("Privatization : We are copying to a zone inside global memory - Switch data segment"); - smpi_switch_data_segment( - (static_cast((static_cast(comm->dst_proc->data)->data))->index())); - } - - XBT_DEBUG("Copying %zu bytes from %p to %p", buff_size, tmpbuff,comm->dst_buff); - memcpy(comm->dst_buff, tmpbuff, buff_size); + dst_private_blocks = shift_and_frame_private_blocks(dst_private_blocks, dst_offset, buff_size); + } + else { + dst_private_blocks.clear(); + dst_private_blocks.push_back(std::make_pair(0, buff_size)); + } + check_blocks(src_private_blocks, buff_size); + check_blocks(dst_private_blocks, buff_size); + auto private_blocks = merge_private_blocks(src_private_blocks, dst_private_blocks); + check_blocks(private_blocks, buff_size); + void* tmpbuff=buff; + if((smpi_privatize_global_variables == 
SMPI_PRIVATIZE_MMAP) && (static_cast(buff) >= smpi_start_data_exe) + && (static_cast(buff) < smpi_start_data_exe + smpi_size_data_exe ) + ){ + XBT_DEBUG("Privatization : We are copying from a zone inside global memory... Saving data to temp buffer !"); + + smpi_switch_data_segment( + (static_cast((static_cast(comm->src_proc->data)->data))->index())); + tmpbuff = static_cast(xbt_malloc(buff_size)); + memcpy_private(tmpbuff, buff, buff_size, private_blocks); + } - if (comm->detached) { - // if this is a detached send, the source buffer was duplicated by SMPI - // sender to make the original buffer available to the application ASAP - xbt_free(buff); - //It seems that the request is used after the call there this should be free somewhere else but where??? - //xbt_free(comm->comm.src_data);// inside SMPI the request is kept inside the user data and should be free - comm->src_buff = nullptr; - } - if(tmpbuff!=buff)xbt_free(tmpbuff); + if((smpi_privatize_global_variables == SMPI_PRIVATIZE_MMAP) && ((char*)comm->dst_buff >= smpi_start_data_exe) + && ((char*)comm->dst_buff < smpi_start_data_exe + smpi_size_data_exe )){ + XBT_DEBUG("Privatization : We are copying to a zone inside global memory - Switch data segment"); + smpi_switch_data_segment( + (static_cast((static_cast(comm->dst_proc->data)->data))->index())); + } + XBT_DEBUG("Copying %zu bytes from %p to %p", buff_size, tmpbuff,comm->dst_buff); + memcpy_private(comm->dst_buff, tmpbuff, buff_size, private_blocks); + + if (comm->detached) { + // if this is a detached send, the source buffer was duplicated by SMPI + // sender to make the original buffer available to the application ASAP + xbt_free(buff); + //It seems that the request is used after the call there this should be free somewhere else but where??? 
+ //xbt_free(comm->comm.src_data);// inside SMPI the request is kept inside the user data and should be free + comm->src_buff = nullptr; } + if(tmpbuff!=buff)xbt_free(tmpbuff); } diff --git a/src/smpi/smpi_request.cpp b/src/smpi/smpi_request.cpp index 3cb606554a..29661e8902 100644 --- a/src/smpi/smpi_request.cpp +++ b/src/smpi/smpi_request.cpp @@ -32,7 +32,8 @@ namespace smpi{ Request::Request(void *buf, int count, MPI_Datatype datatype, int src, int dst, int tag, MPI_Comm comm, unsigned flags) : buf_(buf), old_type_(datatype), src_(src), dst_(dst), tag_(tag), comm_(comm), flags_(flags) { void *old_buf = nullptr; - if(((((flags & RECV) != 0) && ((flags & ACCUMULATE) !=0)) || (datatype->flags() & DT_FLAG_DERIVED)) && (!smpi_is_shared(buf_))){ +// FIXME Handle the case of a partial shared malloc. + if(((((flags & RECV) != 0) && ((flags & ACCUMULATE) !=0)) || (datatype->flags() & DT_FLAG_DERIVED))) { // && (!smpi_is_shared(buf_))){ // This part handles the problem of non-contiguous memory old_buf = buf; if (count==0){ @@ -689,7 +690,8 @@ void Request::finish_wait(MPI_Request* request, MPI_Status * status) req->print_request("Finishing"); MPI_Datatype datatype = req->old_type_; - if((((req->flags_ & ACCUMULATE) != 0) || (datatype->flags() & DT_FLAG_DERIVED)) && (!smpi_is_shared(req->old_buf_))){ +// FIXME Handle the case of a partial shared malloc. 
+ if((((req->flags_ & ACCUMULATE) != 0) || (datatype->flags() & DT_FLAG_DERIVED))){// && (!smpi_is_shared(req->old_buf_))){ if (!smpi_process()->replaying()){ if( smpi_privatize_global_variables != 0 && (static_cast(req->old_buf_) >= smpi_start_data_exe) diff --git a/src/smpi/smpi_shared.cpp b/src/smpi/smpi_shared.cpp index f8033fdfe3..67159714db 100644 --- a/src/smpi/smpi_shared.cpp +++ b/src/smpi/smpi_shared.cpp @@ -37,6 +37,7 @@ #include "private.h" #include "private.hpp" +#include "smpi/smpi_shared_malloc.hpp" #include "xbt/dict.h" #include @@ -114,6 +115,7 @@ typedef std::unordered_map::value_type shar typedef struct { size_t size; + std::vector> private_blocks; shared_data_key_type* data; } shared_metadata_t; @@ -168,66 +170,84 @@ static void* shm_map(int fd, size_t size, shared_data_key_type* data) { return mem; } -void *smpi_shared_malloc(size_t size, const char *file, int line) +static void *smpi_shared_malloc_local(size_t size, const char *file, int line) { void* mem; - if (size > 0 && smpi_cfg_shared_malloc == shmalloc_local) { - smpi_source_location loc(file, line); - auto res = allocs.insert(std::make_pair(loc, shared_data_t())); - auto data = res.first; - if (res.second) { - // The insertion did not take place. - // Generate a shared memory name from the address of the shared_data: - char shmname[32]; // cannot be longer than PSHMNAMLEN = 31 on Mac OS X (shm_open raises ENAMETOOLONG otherwise) - snprintf(shmname, 31, "/shmalloc%p", &*data); - int fd = shm_open(shmname, O_RDWR | O_CREAT | O_EXCL, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH); - if (fd < 0) { - if (errno == EEXIST) - xbt_die("Please cleanup /dev/shm/%s", shmname); - else - xbt_die("An unhandled error occurred while opening %s. shm_open: %s", shmname, strerror(errno)); - } - data->second.fd = fd; - data->second.count = 1; - mem = shm_map(fd, size, &*data); - if (shm_unlink(shmname) < 0) { - XBT_WARN("Could not early unlink %s. 
shm_unlink: %s", shmname, strerror(errno)); - } - XBT_DEBUG("Mapping %s at %p through %d", shmname, mem, fd); - } else { - mem = shm_map(data->second.fd, size, &*data); - data->second.count++; + smpi_source_location loc(file, line); + auto res = allocs.insert(std::make_pair(loc, shared_data_t())); + auto data = res.first; + if (res.second) { + // The insertion did not take place. + // Generate a shared memory name from the address of the shared_data: + char shmname[32]; // cannot be longer than PSHMNAMLEN = 31 on Mac OS X (shm_open raises ENAMETOOLONG otherwise) + snprintf(shmname, 31, "/shmalloc%p", &*data); + int fd = shm_open(shmname, O_RDWR | O_CREAT | O_EXCL, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH); + if (fd < 0) { + if (errno == EEXIST) + xbt_die("Please cleanup /dev/shm/%s", shmname); + else + xbt_die("An unhandled error occurred while opening %s. shm_open: %s", shmname, strerror(errno)); } - XBT_DEBUG("Shared malloc %zu in %p (metadata at %p)", size, mem, &*data); - - } else if (smpi_cfg_shared_malloc == shmalloc_global) { - /* First reserve memory area */ - mem = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); - - xbt_assert(mem != MAP_FAILED, "Failed to allocate %luMiB of memory. Run \"sysctl vm.overcommit_memory=1\" as root " - "to allow big allocations.\n", - (unsigned long)(size >> 20)); - - /* Create bogus file if not done already */ - if (smpi_shared_malloc_bogusfile == -1) { - /* Create a fd to a new file on disk, make it smpi_shared_malloc_blocksize big, and unlink it. - * It still exists in memory but not in the file system (thus it cannot be leaked). */ - smpi_shared_malloc_blocksize = static_cast(xbt_cfg_get_double("smpi/shared-malloc-blocksize")); - XBT_DEBUG("global shared allocation. 
Blocksize %lu", smpi_shared_malloc_blocksize); - char* name = xbt_strdup("/tmp/simgrid-shmalloc-XXXXXX"); - smpi_shared_malloc_bogusfile = mkstemp(name); - unlink(name); - xbt_free(name); - char* dumb = (char*)calloc(1, smpi_shared_malloc_blocksize); - ssize_t err = write(smpi_shared_malloc_bogusfile, dumb, smpi_shared_malloc_blocksize); - if(err<0) - xbt_die("Could not write bogus file for shared malloc"); - xbt_free(dumb); + data->second.fd = fd; + data->second.count = 1; + mem = shm_map(fd, size, &*data); + if (shm_unlink(shmname) < 0) { + XBT_WARN("Could not early unlink %s. shm_unlink: %s", shmname, strerror(errno)); } + XBT_DEBUG("Mapping %s at %p through %d", shmname, mem, fd); + } else { + mem = shm_map(data->second.fd, size, &*data); + data->second.count++; + } + XBT_DEBUG("Shared malloc %zu in %p (metadata at %p)", size, mem, &*data); + return mem; +} - /* Map the bogus file in place of the anonymous memory */ +// Align functions, from http://stackoverflow.com/questions/4840410/how-to-align-a-pointer-in-c +#define PAGE_SIZE 0x1000 +#define ALIGN_UP(n, align) (((n) + (align)-1) & -(align)) +#define ALIGN_DOWN(n, align) ((n) & -(align)) + +void *smpi_shared_malloc_global__(size_t size, const char *file, int line, size_t *shared_block_offsets, int nb_shared_blocks) { + void *mem; + xbt_assert(smpi_shared_malloc_blocksize % PAGE_SIZE == 0, "The block size of shared malloc should be a multiple of the page size."); + /* First reserve memory area */ + mem = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + + xbt_assert(mem != MAP_FAILED, "Failed to allocate %luMiB of memory. Run \"sysctl vm.overcommit_memory=1\" as root " + "to allow big allocations.\n", + (unsigned long)(size >> 20)); + + /* Create bogus file if not done already */ + if (smpi_shared_malloc_bogusfile == -1) { + /* Create a fd to a new file on disk, make it smpi_shared_malloc_blocksize big, and unlink it. 
+ * It still exists in memory but not in the file system (thus it cannot be leaked). */ + smpi_shared_malloc_blocksize = static_cast(xbt_cfg_get_double("smpi/shared-malloc-blocksize")); + XBT_DEBUG("global shared allocation. Blocksize %lu", smpi_shared_malloc_blocksize); + char* name = xbt_strdup("/tmp/simgrid-shmalloc-XXXXXX"); + smpi_shared_malloc_bogusfile = mkstemp(name); + unlink(name); + xbt_free(name); + char* dumb = (char*)calloc(1, smpi_shared_malloc_blocksize); + ssize_t err = write(smpi_shared_malloc_bogusfile, dumb, smpi_shared_malloc_blocksize); + if(err<0) + xbt_die("Could not write bogus file for shared malloc"); + xbt_free(dumb); + } + + /* Map the bogus file in place of the anonymous memory */ + for(int i_block = 0; i_block < nb_shared_blocks; i_block ++) { + size_t start_offset = shared_block_offsets[2*i_block]; + size_t stop_offset = shared_block_offsets[2*i_block+1]; + xbt_assert(start_offset < stop_offset, "start_offset (%lu) should be lower than stop offset (%lu)", start_offset, stop_offset); + xbt_assert(stop_offset <= size, "stop_offset (%lu) should be lower than size (%lu)", stop_offset, size); + if(i_block < nb_shared_blocks-1) + xbt_assert(stop_offset < shared_block_offsets[2*i_block+2], + "stop_offset (%lu) should be lower than its successor start offset (%lu)", stop_offset, shared_block_offsets[2*i_block+2]); + size_t start_block_offset = ALIGN_UP(start_offset, smpi_shared_malloc_blocksize); + size_t stop_block_offset = ALIGN_DOWN(stop_offset, smpi_shared_malloc_blocksize); unsigned int i; - for (i = 0; i < size / smpi_shared_malloc_blocksize; i++) { + for (i = start_block_offset / smpi_shared_malloc_blocksize; i < stop_block_offset / smpi_shared_malloc_blocksize; i++) { void* pos = (void*)((unsigned long)mem + i * smpi_shared_malloc_blocksize); void* res = mmap(pos, smpi_shared_malloc_blocksize, PROT_READ | PROT_WRITE, MAP_FIXED | MAP_SHARED | MAP_POPULATE, smpi_shared_malloc_bogusfile, 0); @@ -236,50 +256,145 @@ void 
*smpi_shared_malloc(size_t size, const char *file, int line) "You can also try using the sysctl vm.max_map_count", strerror(errno)); } - if (size % smpi_shared_malloc_blocksize) { - void* pos = (void*)((unsigned long)mem + i * smpi_shared_malloc_blocksize); - void* res = mmap(pos, size % smpi_shared_malloc_blocksize, PROT_READ | PROT_WRITE, - MAP_FIXED | MAP_SHARED | MAP_POPULATE, smpi_shared_malloc_bogusfile, 0); + size_t low_page_start_offset = ALIGN_UP(start_offset, PAGE_SIZE); + size_t low_page_stop_offset = start_block_offset < ALIGN_DOWN(stop_offset, PAGE_SIZE) ? start_block_offset : ALIGN_DOWN(stop_offset, PAGE_SIZE); + if(low_page_start_offset < low_page_stop_offset) { + void* pos = (void*)((unsigned long)mem + low_page_start_offset); + void* res = mmap(pos, low_page_stop_offset-low_page_start_offset, PROT_READ | PROT_WRITE, MAP_FIXED | MAP_SHARED | MAP_POPULATE, + smpi_shared_malloc_bogusfile, 0); xbt_assert(res == pos, "Could not map folded virtual memory (%s). Do you perhaps need to increase the " "size of the mapped file using --cfg=smpi/shared-malloc-blocksize=newvalue (default 1048576) ?" "You can also try using the sysctl vm.max_map_count", strerror(errno)); } + if(low_page_stop_offset <= stop_block_offset) { + size_t high_page_stop_offset = stop_offset == size ? size : ALIGN_DOWN(stop_offset, PAGE_SIZE); + if(high_page_stop_offset > stop_block_offset) { + void* pos = (void*)((unsigned long)mem + stop_block_offset); + void* res = mmap(pos, high_page_stop_offset-stop_block_offset, PROT_READ | PROT_WRITE, MAP_FIXED | MAP_SHARED | MAP_POPULATE, + smpi_shared_malloc_bogusfile, 0); + xbt_assert(res == pos, "Could not map folded virtual memory (%s). Do you perhaps need to increase the " + "size of the mapped file using --cfg=smpi/shared-malloc-blocksize=newvalue (default 1048576) ?" 
+ "You can also try using the sysctl vm.max_map_count", + strerror(errno)); + } + } + } - shared_metadata_t newmeta; - //register metadata for memcpy avoidance - shared_data_key_type* data = (shared_data_key_type*)xbt_malloc(sizeof(shared_data_key_type)); - data->second.fd = -1; - data->second.count = 1; - newmeta.size = size; - newmeta.data = data; - allocs_metadata[mem] = newmeta; + shared_metadata_t newmeta; + //register metadata for memcpy avoidance + shared_data_key_type* data = (shared_data_key_type*)xbt_malloc(sizeof(shared_data_key_type)); + data->second.fd = -1; + data->second.count = 1; + newmeta.size = size; + newmeta.data = data; + if(shared_block_offsets[0] > 0) { + newmeta.private_blocks.push_back(std::make_pair(0, shared_block_offsets[0])); + } + int i_block; + for(i_block = 0; i_block < nb_shared_blocks-1; i_block ++) { + newmeta.private_blocks.push_back(std::make_pair(shared_block_offsets[2*i_block+1], shared_block_offsets[2*i_block+2])); + } + if(shared_block_offsets[2*i_block+1] < size) { + newmeta.private_blocks.push_back(std::make_pair(shared_block_offsets[2*i_block+1], size)); + } + allocs_metadata[mem] = newmeta; + + return mem; +} + +/* + * When nb_shared_blocks == -1, default behavior of smpi_shared_malloc: everything is shared. + * Otherwise, only the blocks described by shared_block_offsets are shared. + * This array contains the offsets (in bytes) of the block to share. + * Even indices are the start offsets (included), odd indices are the stop offsets (excluded). + * For instance, if shared_block_offsets == {27, 42}, then the elements mem[27], mem[28], ..., mem[41] are shared. The others are not. 
+ */ +static void *smpi_shared_malloc_global(size_t size, const char *file, int line, size_t *shared_block_offsets=NULL, int nb_shared_blocks=-1) { + size_t tmp_shared_block_offsets[2]; + if(nb_shared_blocks == -1) { + nb_shared_blocks = 1; + shared_block_offsets = tmp_shared_block_offsets; + shared_block_offsets[0] = 0; + shared_block_offsets[1] = size; + } + return smpi_shared_malloc_global__(size, file, line, shared_block_offsets, nb_shared_blocks); +} + +void *smpi_shared_malloc(size_t size, const char *file, int line) { + void *mem; + if (size > 0 && smpi_cfg_shared_malloc == shmalloc_local) { + mem = smpi_shared_malloc_local(size, file, line); + } else if (smpi_cfg_shared_malloc == shmalloc_global) { + mem = smpi_shared_malloc_global(size, file, line); } else { mem = xbt_malloc(size); XBT_DEBUG("Classic malloc %zu in %p", size, mem); } - return mem; } -int smpi_is_shared(void* ptr){ +int smpi_is_shared(void* ptr, std::vector> &private_blocks, size_t *offset){ + private_blocks.clear(); // being paranoid if (allocs_metadata.empty()) return 0; if ( smpi_cfg_shared_malloc == shmalloc_local || smpi_cfg_shared_malloc == shmalloc_global) { auto low = allocs_metadata.lower_bound(ptr); - if (low->first==ptr) + if (low->first==ptr) { + private_blocks = low->second.private_blocks; + *offset = 0; return 1; + } if (low == allocs_metadata.begin()) return 0; low --; - if (ptr < (char*)low->first + low->second.size) + if (ptr < (char*)low->first + low->second.size) { + xbt_assert(ptr > (char*)low->first, "Oops, there seems to be a bug in the shared memory metadata."); + *offset = ((uint8_t*)ptr) - ((uint8_t*) low->first); + private_blocks = low->second.private_blocks; return 1; + } return 0; } else { return 0; } } +std::vector> shift_and_frame_private_blocks(const std::vector> vec, size_t offset, size_t buff_size) { + std::vector> result; + for(auto block: vec) { + auto new_block = std::make_pair(std::min(std::max((size_t)0, block.first-offset), buff_size), + 
std::min(std::max((size_t)0, block.second-offset), buff_size)); + if(new_block.second > 0 && new_block.first < buff_size) + result.push_back(new_block); + } + return result; +} + +std::vector> merge_private_blocks(std::vector> src, std::vector> dst) { + std::vector> result; + unsigned i_src=0, i_dst=0; + while(i_src < src.size() && i_dst < dst.size()) { + std::pair block; + if(src[i_src].second <= dst[i_dst].first) { + i_src++; + } + else if(dst[i_dst].second <= src[i_src].first) { + i_dst++; + } + else { // src.second > dst.first && dst.second > src.first → the blocks are overlapping + block = std::make_pair(std::max(src[i_src].first, dst[i_dst].first), + std::min(src[i_src].second, dst[i_dst].second)); + result.push_back(block); + if(src[i_src].second < dst[i_dst].second) + i_src ++; + else + i_dst ++; + } + } + return result; +} + void smpi_shared_free(void *ptr) { if (smpi_cfg_shared_malloc == shmalloc_local) { @@ -312,7 +427,7 @@ void smpi_shared_free(void *ptr) xbt_free(meta->second.data); } - munmap(ptr, 0); // the POSIX says that I should not give 0 as a length, but it seems to work OK + munmap(ptr, meta->second.size); } else { XBT_DEBUG("Classic free of %p", ptr); xbt_free(ptr); diff --git a/teshsuite/smpi/CMakeLists.txt b/teshsuite/smpi/CMakeLists.txt index 30ff3c606a..17ca6b6568 100644 --- a/teshsuite/smpi/CMakeLists.txt +++ b/teshsuite/smpi/CMakeLists.txt @@ -21,14 +21,26 @@ if(enable_smpi) add_executable (macro-shared macro-shared/macro-shared.c) target_link_libraries(macro-shared simgrid) set_target_properties(macro-shared PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/macro-shared) + + add_executable (macro-partial-shared macro-partial-shared/macro-partial-shared.c) + target_link_libraries(macro-partial-shared simgrid) + set_target_properties(macro-partial-shared PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/macro-partial-shared) + + add_executable (macro-partial-shared-communication 
macro-partial-shared-communication/macro-partial-shared-communication.c) + target_link_libraries(macro-partial-shared-communication simgrid) + set_target_properties(macro-partial-shared-communication PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/macro-partial-shared-communication) endif() endif() set(teshsuite_src ${teshsuite_src} ${CMAKE_CURRENT_SOURCE_DIR}/macro-shared/macro-shared.c PARENT_SCOPE) +set(teshsuite_src ${teshsuite_src} ${CMAKE_CURRENT_SOURCE_DIR}/macro-partial-shared/macro-partial-shared.c PARENT_SCOPE) +set(teshsuite_src ${teshsuite_src} ${CMAKE_CURRENT_SOURCE_DIR}/macro-partial-shared-communication/macro-partial-shared-communication.c PARENT_SCOPE) set(tesh_files ${tesh_files} ${CMAKE_CURRENT_SOURCE_DIR}/coll-allreduce/coll-allreduce-large.tesh ${CMAKE_CURRENT_SOURCE_DIR}/coll-allreduce/coll-allreduce-automatic.tesh ${CMAKE_CURRENT_SOURCE_DIR}/coll-alltoall/clusters.tesh ${CMAKE_CURRENT_SOURCE_DIR}/macro-shared/macro-shared.tesh + ${CMAKE_CURRENT_SOURCE_DIR}/macro-partial-shared/macro-partial-shared.tesh + ${CMAKE_CURRENT_SOURCE_DIR}/macro-partial-shared-communication/macro-partial-shared-communication.tesh ${CMAKE_CURRENT_SOURCE_DIR}/pt2pt-pingpong/broken_hostfiles.tesh ${CMAKE_CURRENT_SOURCE_DIR}/pt2pt-pingpong/TI_output.tesh PARENT_SCOPE) set(bin_files ${bin_files} ${CMAKE_CURRENT_SOURCE_DIR}/hostfile @@ -39,6 +51,8 @@ set(bin_files ${bin_files} ${CMAKE_CURRENT_SOURCE_DIR}/hostfile if(enable_smpi) if(NOT WIN32) ADD_TESH_FACTORIES(tesh-smpi-macro-shared "thread;ucontext;raw;boost" --setenv bindir=${CMAKE_BINARY_DIR}/teshsuite/smpi/macro-shared --cd ${CMAKE_HOME_DIRECTORY}/teshsuite/smpi/macro-shared macro-shared.tesh) + ADD_TESH_FACTORIES(tesh-smpi-macro-partial-shared "thread;ucontext;raw;boost" --setenv bindir=${CMAKE_BINARY_DIR}/teshsuite/smpi/macro-partial-shared --cd ${CMAKE_HOME_DIRECTORY}/teshsuite/smpi/macro-partial-shared macro-partial-shared.tesh) + ADD_TESH_FACTORIES(tesh-smpi-macro-partial-shared-communication 
"thread;ucontext;raw;boost" --setenv bindir=${CMAKE_BINARY_DIR}/teshsuite/smpi/macro-partial-shared-communication --cd ${CMAKE_HOME_DIRECTORY}/teshsuite/smpi/macro-partial-shared-communication macro-partial-shared-communication.tesh) endif() foreach(x coll-allgather coll-allgatherv coll-allreduce coll-alltoall coll-alltoallv coll-barrier coll-bcast diff --git a/teshsuite/smpi/macro-partial-shared-communication/macro-partial-shared-communication.c b/teshsuite/smpi/macro-partial-shared-communication/macro-partial-shared-communication.c new file mode 100644 index 0000000000..afed268325 --- /dev/null +++ b/teshsuite/smpi/macro-partial-shared-communication/macro-partial-shared-communication.c @@ -0,0 +1,125 @@ +/* Copyright (c) 2009-2015. The SimGrid Team. + * All rights reserved. */ + +/* This program is free software; you can redistribute it and/or modify it + * under the terms of the license (GNU LGPL) which comes with this package. */ + +#include +#include +#include +#include +#include + +// Set the elements between buf[start] and buf[stop-1] to (i+value)%256 +static void set(uint8_t *buf, size_t start, size_t stop, uint8_t value) { + for(size_t i = start; i < stop; i++) { + buf[i] = (i+value)%256; + } +} + +// Return the number of times that an element is equal to (i+value)%256 between buf[start] and buf[stop-1]. +static int count_all(uint8_t *buf, size_t start, size_t stop, uint8_t value) { + size_t occ = 0; + for(size_t i = start ; i < stop ; i++) { + if(buf[i] == (i+value)%256) { + occ ++; + } + } + return occ; +} + +// Return true iff the values from buf[start] to buf[stop-1] are all equal to (i+value)%256. +static int check_all(uint8_t *buf, size_t start, size_t stop, uint8_t value) { + size_t occ = count_all(buf, start, stop, value); + return occ == stop-start; +} + +// Return true iff "enough" elements are equal to (i+value)%256 between buf[start] and buf[stop-1]. 
+static int check_enough(uint8_t *buf, size_t start, size_t stop, uint8_t value) { + int page_size = 0x1000; + size_t size = stop-start; + if(size <= 2*page_size) // we are not sure to have a whole page that is shared + return 1; + size_t occ = count_all(buf, start, stop, value); + return occ >= size - 2*page_size; +} + +int main(int argc, char *argv[]) +{ + MPI_Init(&argc, &argv); + int rank; + int size; + size_t mem_size = 0x10000000; + size_t shared_blocks[] = { + 0, 0x1234567, + 0x1300000, 0x1300010, + 0x3456789, 0x3457890, + 0x4444444, 0x5555555, + 0x5555565, 0x5600000, + 0x8000000, 0x10000000 + }; + int nb_blocks = (sizeof(shared_blocks)/sizeof(size_t))/2; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &size); + //Let's Allocate a shared memory buffer + assert(size%2 == 0); + uint8_t *buf; + buf = SMPI_PARTIAL_SHARED_MALLOC(mem_size, shared_blocks, nb_blocks); + memset(buf, rank, mem_size); + MPI_Barrier(MPI_COMM_WORLD); + + // Even processes write their rank in private blocks + if(rank%2 == 0) { + for(int i = 0; i < nb_blocks-1; i++) { + size_t start = shared_blocks[2*i+1]; + size_t stop = shared_blocks[2*i+2]; + set(buf, start, stop, rank); + } + } + // Then, even processes send their buffer to their successor + if(rank%2 == 0) { + MPI_Send(buf, mem_size, MPI_UINT8_T, rank+1, 0, MPI_COMM_WORLD); + } + else { + MPI_Recv(buf, mem_size, MPI_UINT8_T, rank-1, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + } + + + // Odd processes verify that they successfully received the message + if(rank%2 == 1) { + for(int i = 0; i < nb_blocks-1; i++) { + size_t start = shared_blocks[2*i+1]; + size_t stop = shared_blocks[2*i+2]; + int comm = check_all(buf, start, stop, rank-1); + printf("[%d] The result of the (normal) communication check for block (0x%lx, 0x%lx) is: %d\n", rank, start, stop, comm); + } + memset(buf, rank, mem_size); + } + + MPI_Barrier(MPI_COMM_WORLD); + + // Then, even processes send a sub-part of their buffer their successor + // Note 
that the last block should not be copied entirely + if(rank%2 == 0) { + MPI_Send(buf+0x10000, mem_size-0xa000000, MPI_UINT8_T, rank+1, 0, MPI_COMM_WORLD); + } + else { + MPI_Recv(buf+0x10000, mem_size-0xa000000, MPI_UINT8_T, rank-1, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + } + + + // Odd processes verify that they successfully received the message + if(rank%2 == 1) { + for(int i = 0; i < nb_blocks-1; i++) { + size_t start = shared_blocks[2*i+1]; + size_t stop = shared_blocks[2*i+2]; + int comm = check_all(buf, start, stop, rank-1); + printf("[%d] The result of the (shifted) communication check for block (0x%lx, 0x%lx) is: %d\n", rank, start, stop, comm); + } + } + + SMPI_SHARED_FREE(buf); + + MPI_Finalize(); + return 0; +} diff --git a/teshsuite/smpi/macro-partial-shared-communication/macro-partial-shared-communication.tesh b/teshsuite/smpi/macro-partial-shared-communication/macro-partial-shared-communication.tesh new file mode 100644 index 0000000000..43ace2fa0b --- /dev/null +++ b/teshsuite/smpi/macro-partial-shared-communication/macro-partial-shared-communication.tesh @@ -0,0 +1,25 @@ +p Test compute +! setenv LD_LIBRARY_PATH=../../lib +! output sort +! 
timeout 5 +$ ${bindir:=.}/../../../smpi_script/bin/smpirun -hostfile ../hostfile -platform ../../../examples/platforms/small_platform.xml -np 4 ${bindir:=.}/macro-partial-shared-communication --log=smpi_kernel.thres:warning --log=xbt_cfg.thres:warning +> [3] The result of the (normal) communication check for block (0x1234567, 0x1300000) is: 1 +> [3] The result of the (normal) communication check for block (0x1300010, 0x3456789) is: 1 +> [3] The result of the (normal) communication check for block (0x3457890, 0x4444444) is: 1 +> [3] The result of the (normal) communication check for block (0x5555555, 0x5555565) is: 1 +> [3] The result of the (normal) communication check for block (0x5600000, 0x8000000) is: 1 +> [1] The result of the (normal) communication check for block (0x1234567, 0x1300000) is: 1 +> [1] The result of the (normal) communication check for block (0x1300010, 0x3456789) is: 1 +> [1] The result of the (normal) communication check for block (0x3457890, 0x4444444) is: 1 +> [1] The result of the (normal) communication check for block (0x5555555, 0x5555565) is: 1 +> [1] The result of the (normal) communication check for block (0x5600000, 0x8000000) is: 1 +> [3] The result of the (shifted) communication check for block (0x1234567, 0x1300000) is: 1 +> [3] The result of the (shifted) communication check for block (0x1300010, 0x3456789) is: 1 +> [3] The result of the (shifted) communication check for block (0x3457890, 0x4444444) is: 1 +> [3] The result of the (shifted) communication check for block (0x5555555, 0x5555565) is: 1 +> [3] The result of the (shifted) communication check for block (0x5600000, 0x8000000) is: 0 +> [1] The result of the (shifted) communication check for block (0x1234567, 0x1300000) is: 1 +> [1] The result of the (shifted) communication check for block (0x1300010, 0x3456789) is: 1 +> [1] The result of the (shifted) communication check for block (0x3457890, 0x4444444) is: 1 +> [1] The result of the (shifted) communication check for block 
// Set the elements between buf[start] and buf[stop-1] to (i+value)%256
static void set(uint8_t *buf, size_t start, size_t stop, uint8_t value) {
  for(size_t i = start; i < stop; i++) {
    // The modulo keeps the result in [0, 255], so the narrowing cast is lossless
    buf[i] = (uint8_t)((i+value)%256);
  }
}

// Return the number of times that an element is equal to (i+value)%256 between buf[start] and buf[stop-1].
// The count is returned as a size_t (the type it is accumulated in and the type
// the callers store it in), so it is never narrowed through an int.
static size_t count_all(const uint8_t *buf, size_t start, size_t stop, uint8_t value) {
  size_t occ = 0;
  for(size_t i = start ; i < stop ; i++) {
    if(buf[i] == (i+value)%256) {
      occ ++;
    }
  }
  return occ;
}

// Return true iff the values from buf[start] to buf[stop-1] are all equal to (i+value)%256.
// An empty range (start == stop) is trivially accepted.
static int check_all(const uint8_t *buf, size_t start, size_t stop, uint8_t value) {
  size_t occ = count_all(buf, start, stop, value);
  return occ == stop-start;
}

// Return true iff "enough" elements are equal to (i+value)%256 between buf[start] and buf[stop-1].
+static int check_enough(uint8_t *buf, size_t start, size_t stop, uint8_t value) { + int page_size = 0x1000; + size_t size = stop-start; + if(size <= 2*page_size) // we are not sure to have a whole page that is shared + return 1; + size_t occ = count_all(buf, start, stop, value); + return occ >= size - 2*page_size; +} + +int main(int argc, char *argv[]) +{ + MPI_Init(&argc, &argv); + int rank; + int size; + size_t mem_size = 0x10000000; + size_t shared_blocks[] = { + 0, 0x1234567, + 0x1300000, 0x1300010, + 0x3456789, 0x3457890, + 0x4444444, 0x5555555, + 0x5555565, 0x5600000, + 0x8000000, 0x10000000 + }; + int nb_blocks = (sizeof(shared_blocks)/sizeof(size_t))/2; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &size); + //Let's Allocate a shared memory buffer + uint8_t *buf; + buf = SMPI_PARTIAL_SHARED_MALLOC(mem_size, shared_blocks, nb_blocks); + set(buf, 0, mem_size, 0); + MPI_Barrier(MPI_COMM_WORLD); + + // Process 0 write in shared blocks + if(rank == 0) { + for(int i = 0; i < nb_blocks; i++) { + size_t start = shared_blocks[2*i]; + size_t stop = shared_blocks[2*i+1]; + set(buf, start, stop, 42); + } + } + MPI_Barrier(MPI_COMM_WORLD); + // All processes check that their shared blocks have been written (at least partially) + for(int i = 0; i < nb_blocks; i++) { + size_t start = shared_blocks[2*i]; + size_t stop = shared_blocks[2*i+1]; + int is_shared = check_enough(buf, start, stop, 42); + printf("[%d] The result of the shared check for block (0x%lx, 0x%lx) is: %d\n", rank, start, stop, is_shared); + } + + + // Check the private blocks + MPI_Barrier(MPI_COMM_WORLD); + for(int i = 0; i < nb_blocks-1; i++) { + size_t start = shared_blocks[2*i+1]; + size_t stop = shared_blocks[2*i+2]; + int is_private = check_all(buf, start, stop, 0); + printf("[%d] The result of the private check for block (0x%lx, 0x%lx) is: %d\n", rank, start, stop, is_private); + } + + SMPI_SHARED_FREE(buf); + + MPI_Finalize(); + return 0; +} diff --git 
a/teshsuite/smpi/macro-partial-shared/macro-partial-shared.tesh b/teshsuite/smpi/macro-partial-shared/macro-partial-shared.tesh new file mode 100644 index 0000000000..075ce123cf --- /dev/null +++ b/teshsuite/smpi/macro-partial-shared/macro-partial-shared.tesh @@ -0,0 +1,49 @@ +p Test compute +! setenv LD_LIBRARY_PATH=../../lib +! output sort +! timeout 5 +$ ${bindir:=.}/../../../smpi_script/bin/smpirun -hostfile ../hostfile -platform ../../../examples/platforms/small_platform.xml -np 4 ${bindir:=.}/macro-partial-shared --log=smpi_kernel.thres:warning --log=xbt_cfg.thres:warning +> [0] The result of the shared check for block (0x0, 0x1234567) is: 1 +> [0] The result of the shared check for block (0x1300000, 0x1300010) is: 1 +> [0] The result of the shared check for block (0x3456789, 0x3457890) is: 1 +> [0] The result of the shared check for block (0x4444444, 0x5555555) is: 1 +> [0] The result of the shared check for block (0x5555565, 0x5600000) is: 1 +> [0] The result of the shared check for block (0x8000000, 0x10000000) is: 1 +> [3] The result of the shared check for block (0x0, 0x1234567) is: 1 +> [3] The result of the shared check for block (0x1300000, 0x1300010) is: 1 +> [3] The result of the shared check for block (0x3456789, 0x3457890) is: 1 +> [3] The result of the shared check for block (0x4444444, 0x5555555) is: 1 +> [3] The result of the shared check for block (0x5555565, 0x5600000) is: 1 +> [3] The result of the shared check for block (0x8000000, 0x10000000) is: 1 +> [1] The result of the shared check for block (0x0, 0x1234567) is: 1 +> [1] The result of the shared check for block (0x1300000, 0x1300010) is: 1 +> [1] The result of the shared check for block (0x3456789, 0x3457890) is: 1 +> [1] The result of the shared check for block (0x4444444, 0x5555555) is: 1 +> [1] The result of the shared check for block (0x5555565, 0x5600000) is: 1 +> [1] The result of the shared check for block (0x8000000, 0x10000000) is: 1 +> [2] The result of the shared check for 
block (0x0, 0x1234567) is: 1 +> [2] The result of the shared check for block (0x1300000, 0x1300010) is: 1 +> [2] The result of the shared check for block (0x3456789, 0x3457890) is: 1 +> [2] The result of the shared check for block (0x4444444, 0x5555555) is: 1 +> [2] The result of the shared check for block (0x5555565, 0x5600000) is: 1 +> [2] The result of the shared check for block (0x8000000, 0x10000000) is: 1 +> [0] The result of the private check for block (0x1234567, 0x1300000) is: 1 +> [0] The result of the private check for block (0x1300010, 0x3456789) is: 1 +> [0] The result of the private check for block (0x3457890, 0x4444444) is: 1 +> [0] The result of the private check for block (0x5555555, 0x5555565) is: 1 +> [0] The result of the private check for block (0x5600000, 0x8000000) is: 1 +> [3] The result of the private check for block (0x1234567, 0x1300000) is: 1 +> [3] The result of the private check for block (0x1300010, 0x3456789) is: 1 +> [3] The result of the private check for block (0x3457890, 0x4444444) is: 1 +> [3] The result of the private check for block (0x5555555, 0x5555565) is: 1 +> [3] The result of the private check for block (0x5600000, 0x8000000) is: 1 +> [1] The result of the private check for block (0x1234567, 0x1300000) is: 1 +> [1] The result of the private check for block (0x1300010, 0x3456789) is: 1 +> [1] The result of the private check for block (0x3457890, 0x4444444) is: 1 +> [1] The result of the private check for block (0x5555555, 0x5555565) is: 1 +> [1] The result of the private check for block (0x5600000, 0x8000000) is: 1 +> [2] The result of the private check for block (0x1234567, 0x1300000) is: 1 +> [2] The result of the private check for block (0x1300010, 0x3456789) is: 1 +> [2] The result of the private check for block (0x3457890, 0x4444444) is: 1 +> [2] The result of the private check for block (0x5555555, 0x5555565) is: 1 +> [2] The result of the private check for block (0x5600000, 0x8000000) is: 1