1 /* Copyright (c) 2015-2022. The SimGrid Team. All rights reserved. */
3 /* This program is free software; you can redistribute it and/or modify it
4 * under the terms of the license (GNU LGPL) which comes with this package. */
16 #include <sys/types.h>
23 #include "private.hpp"
24 #include "src/internal_config.h"
25 #include "src/smpi/include/smpi_actor.hpp"
26 #include "src/xbt/memory_map.hpp"
27 #include "xbt/virtu.h"
// Logging category used by the XBT_DEBUG / XBT_WARN calls of this file.
29 XBT_LOG_NEW_DEFAULT_SUBCATEGORY(smpi_memory, smpi, "Memory layout support for SMPI");
// Start address of the executable's data segment; filled in by smpi_get_executable_global_size().
31 char* smpi_data_exe_start = nullptr;
// Size in bytes of that segment. 0 means that there is nothing to privatize: the switch and destroy functions
// return early in that case.
32 size_t smpi_data_exe_size = 0;
// Privatization strategy; smpi_backup_global_memory_segment() forces it to NONE when the data segment is empty.
33 SmpiPrivStrategies smpi_privatize_global_variables;
// Pristine copy of the data segment, allocated by smpi_backup_global_memory_segment(), used to initialize each new
// privatization region and released by smpi_destroy_global_memory_segments().
34 static void* smpi_data_exe_copy;
// Memory map snapshot used by smpi_get_executable_global_size() to tell apart the mappings that already existed;
// it is emptied by smpi_backup_global_memory_segment() once the data segment has been located.
36 // Initialized by smpi_prepare_global_memory_segment().
37 static std::vector<simgrid::xbt::VmMap> initial_vm_map;
39 // We keep a copy of all the privatization regions: We can then delete everything easily by iterating over this
40 // collection and nothing can be leaked. We could also iterate over all actors but we would have to be diligent when two
41 // actors use the same privatization region (so, smart pointers would have to be used etc.)
42 // Use a std::deque so that pointers remain valid after push_back().
43 static std::deque<s_smpi_privatization_region_t> smpi_privatization_regions;
// Protection masks: a mapping is considered as data when (prot & PROT_RWX) == PROT_RW, i.e. readable and writable
// but not executable.
45 static constexpr int PROT_RWX = PROT_READ | PROT_WRITE | PROT_EXEC;
46 static constexpr int PROT_RW = PROT_READ | PROT_WRITE;
48 /** Take a snapshot of the process' memory map.
50 void smpi_prepare_global_memory_segment()
52 initial_vm_map = simgrid::xbt::get_memory_map(getpid());
55 static void smpi_get_executable_global_size()
57 char* buffer = realpath(simgrid::xbt::binary_name.c_str(), nullptr);
58 xbt_assert(buffer != nullptr, "Could not resolve real path of binary file '%s'", simgrid::xbt::binary_name.c_str());
59 std::string full_name = buffer;
62 std::vector<simgrid::xbt::VmMap> map = simgrid::xbt::get_memory_map(getpid());
63 for (auto i = map.begin(); i != map.end() ; ++i) {
64 // TODO, In practice, this implementation would not detect a completely
65 // anonymous data segment. This does not happen in practice, however.
67 // File backed RW entry:
68 if (i->pathname == full_name && (i->prot & PROT_RWX) == PROT_RW) {
69 smpi_data_exe_start = (char*)i->start_addr;
70 smpi_data_exe_size = i->end_addr - i->start_addr;
71 /* Here we are making the assumption that a suitable empty region
72 following the rw- area is the end of the data segment. It would
73 be better to check with the size of the data segment. */
74 if (auto j = i + 1; j != map.end() && j->pathname.empty() && (j->prot & PROT_RWX) == PROT_RW &&
75 (char*)j->start_addr == smpi_data_exe_start + smpi_data_exe_size) {
76 // Only count the portion of this region not present in the initial map.
77 auto found = std::find_if(initial_vm_map.begin(), initial_vm_map.end(), [&j](const simgrid::xbt::VmMap& m) {
78 return j->start_addr <= m.start_addr && m.start_addr < j->end_addr;
80 auto end_addr = (found == initial_vm_map.end() ? j->end_addr : found->start_addr);
81 smpi_data_exe_size = (char*)end_addr - smpi_data_exe_start;
86 xbt_die("Did not find my data segment.");
#if HAVE_SANITIZER_ADDRESS
#include <sanitizer/asan_interface.h>
/**
 * @brief memcpy() replacement that never reads the bytes that AddressSanitizer reports as poisoned.
 *
 * The source is walked run by run: poisoned bytes are skipped (the matching destination bytes are left untouched),
 * and each run of accessible bytes is copied with memcpy().
 *
 * @param dest destination buffer, at least @p n bytes long
 * @param src  source buffer, @p n bytes long, that may contain poisoned bytes
 * @param n    amount of bytes to process
 * @return @p dest, as memcpy() does
 */
static void* asan_safe_memcpy(void* dest, void* src, size_t n)
{
  char* psrc  = static_cast<char*>(src);
  char* pdest = static_cast<char*>(dest);
  size_t i    = 0;
  while (i < n) {
    // Skip the poisoned bytes, one at a time
    while (i < n && __asan_address_is_poisoned(psrc + i))
      ++i;
    if (i >= n)
      break;
    // p is used as the first poisoned address of the remaining range, or null when the whole remainder is readable
    char* p  = static_cast<char*>(__asan_region_is_poisoned(psrc + i, n - i));
    size_t j = p ? static_cast<size_t>(p - psrc) : n;
    memcpy(pdest + i, psrc + i, j - i);
    i = j; // resume right after the copied run, otherwise the loop would never terminate
  }
  return dest;
}
#else
// Without AddressSanitizer, a plain memcpy() does the job
#define asan_safe_memcpy(dest, src, n) memcpy((dest), (src), (n))
#endif
/**
 * @brief Uses shm_open to get a temporary shm, and returns its file descriptor.
 */
// Creates a new POSIX shared memory object under a generated name of the form "/smpi-buffer-<index>", and dies with
// an explanatory message when this is not possible.
115 int smpi_temp_shm_get()
117 constexpr unsigned INDEX_MASK = 0xffffffffUL;
// Kept across calls so that each call starts from the name following the last one tried. It starts at INDEX_MASK so
// that the very first increment below yields 0.
118 static unsigned index = INDEX_MASK;
119 char shmname[32]; // cannot be longer than PSHMNAMLEN = 31 on macOS (shm_open raises ENAMETOOLONG otherwise)
// Remember the starting point: the retry loop gives up once every index has been tried.
122 unsigned limit = index;
124 index = (index + 1) & INDEX_MASK;
125 snprintf(shmname, sizeof(shmname), "/smpi-buffer-%016x", index);
// O_CREAT | O_EXCL: an already existing object is reported as EEXIST instead of being reused.
126 fd = shm_open(shmname, O_RDWR | O_CREAT | O_EXCL, S_IRUSR | S_IWUSR);
// Only a name clash (EEXIST) is worth trying the next index; any other error ends the loop.
127 } while (fd == -1 && errno == EEXIST && index != limit);
// Running out of file descriptors gets a dedicated message explaining how to inspect the system limits.
130 if (errno == EMFILE) {
131 xbt_die("Impossible to create temporary file for memory mapping: %s\n\
132 The shm_open() system call failed with the EMFILE error code (too many files). \n\n\
133 This means that you reached the system limits concerning the amount of files per process. \
134 This is not a surprise if you are trying to virtualize many processes on top of SMPI. \
135 Don't panic -- you should simply increase your system limits and try again. \n\n\
136 First, check what your limits are:\n\
137 cat /proc/sys/fs/file-max # Gives you the system-wide limit\n\
138 ulimit -Hn # Gives you the per process hard limit\n\
139 ulimit -Sn # Gives you the per process soft limit\n\
140 cat /proc/self/limits # Displays any per-process limitation (including the one given above)\n\n\
141 If one of these values is less than the amount of MPI processes that you try to run, then you got the explanation of this error. \
142 Ask the Internet about tutorials on how to increase the files limit such as: https://rtcamp.com/tutorials/linux/increase-open-files-limit/",
// Any other failure is reported with the plain system error message.
145 xbt_die("Impossible to create temporary file for memory mapping. shm_open: %s", strerror(errno));
147 XBT_DEBUG("Got temporary shm %s (fd = %d)", shmname, fd);
// The name is removed right away; failing to do so is not fatal and only triggers a warning.
// NOTE(review): presumably only the file descriptor is used by the callers from now on -- confirm.
148 if (shm_unlink(shmname) < 0)
149 XBT_WARN("Could not early unlink %s. shm_unlink: %s", shmname, strerror(errno));
/**
 * @brief Mmap a region of size bytes from temporary shm with file descriptor fd.
 */
// Maps the first 'size' bytes of the object designated by 'fd' as shared, readable and writable memory.
// Every failing system call (fstat, ftruncate, mmap) is reported through an assertion message.
156 void* smpi_temp_shm_mmap(int fd, size_t size)
159 xbt_assert(fstat(fd, &st) == 0, "Could not stat fd %d: %s", fd, strerror(errno));
// The object is extended with ftruncate() only when it is currently smaller than the requested size: thanks to the
// short-circuit evaluation of ||, an object that is already large enough is left untouched.
160 xbt_assert(static_cast<off_t>(size) <= st.st_size || ftruncate(fd, static_cast<off_t>(size)) == 0,
161 "Could not truncate fd %d to %zu: %s", fd, size, strerror(errno));
// No address hint is given (nullptr), and the mapping starts at offset 0 of the object.
162 void* mem = mmap(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
// On failure, the message hints at the per-process limit on the amount of mappings.
165 "Failed to map fd %d with size %zu: %s\n"
166 "If you are running a lot of ranks, you may be exceeding the amount of mappings allowed per process.\n"
167 "On Linux systems, change this value with sudo sysctl -w vm.max_map_count=newvalue (default value: 65536)\n"
168 "Please see https://simgrid.org/doc/latest/Configuring_SimGrid.html#configuring-the-user-code-virtualization for "
170 fd, size, strerror(errno));
174 /** Map a given SMPI privatization segment (make an SMPI process active)
176 * When doing a state restoration, the state of the restored variables might not be consistent with the state of the
177 * virtual memory. In this case, we have to change the data segment.
179 * If 'addr' is not null, only switch if it's an address from the data segment.
181 * Returns 'true' if the segment has to be switched (mmap privatization and 'addr' in data segment).
183 bool smpi_switch_data_segment(simgrid::s4u::ActorPtr actor, const void* addr)
185 if (smpi_cfg_privatization() != SmpiPrivStrategies::MMAP || smpi_data_exe_size == 0)
186 return false; // no need to switch
188 if (addr != nullptr &&
189 not(static_cast<const char*>(addr) >= smpi_data_exe_start &&
190 static_cast<const char*>(addr) < smpi_data_exe_start + smpi_data_exe_size))
191 return false; // no need to switch, addr is not concerned
193 static aid_t smpi_loaded_page = -1;
194 if (smpi_loaded_page == actor->get_pid()) // no need to switch, we've already loaded the one we want
195 return true; // return 'true' anyway
197 #if HAVE_PRIVATIZATION
198 // FIXME, cross-process support (mmap across process when necessary)
199 XBT_DEBUG("Switching data frame to the one of process %ld", actor->get_pid());
200 const simgrid::smpi::ActorExt* process = smpi_process_remote(actor);
201 int current = process->privatized_region()->file_descriptor;
202 xbt_assert(mmap(TOPAGE(smpi_data_exe_start), smpi_data_exe_size, PROT_RW, MAP_FIXED | MAP_SHARED, current, 0) ==
203 TOPAGE(smpi_data_exe_start),
204 "Couldn't map the new region (errno %d): %s", errno, strerror(errno));
205 smpi_loaded_page = actor->get_pid();
212 * @brief Makes a backup of the segment in memory that stores the global variables of a process.
213 * This backup is then used to initialize the global variables for every single
214 * process that is added, regardless of the progress of the simulation.
216 void smpi_backup_global_memory_segment()
218 #if HAVE_PRIVATIZATION
219 smpi_get_executable_global_size();
220 initial_vm_map.clear();
221 initial_vm_map.shrink_to_fit();
223 XBT_DEBUG("bss+data segment found : size %zu starting at %p", smpi_data_exe_size, smpi_data_exe_start);
225 if (smpi_data_exe_size == 0) { // no need to do anything as global variables don't exist
226 smpi_privatize_global_variables = SmpiPrivStrategies::NONE;
230 smpi_data_exe_copy = ::operator new(smpi_data_exe_size);
231 // Make a copy of the data segment. This clean copy is retained over the whole runtime
232 // of the simulation and can be used to initialize a dynamically added, new process.
233 asan_safe_memcpy(smpi_data_exe_copy, TOPAGE(smpi_data_exe_start), smpi_data_exe_size);
234 #else /* ! HAVE_PRIVATIZATION */
235 xbt_die("You are trying to use privatization on a system that does not support it. Don't.");
239 // Initializes the memory mapping for a single process and returns the privatization region
240 smpi_privatization_region_t smpi_init_global_memory_segment_process()
242 int file_descriptor = smpi_temp_shm_get();
244 // ask for a free region
245 void* address = smpi_temp_shm_mmap(file_descriptor, smpi_data_exe_size);
247 // initialize the values
248 asan_safe_memcpy(address, smpi_data_exe_copy, smpi_data_exe_size);
250 // store the address of the mapping for further switches
251 smpi_privatization_regions.emplace_back(s_smpi_privatization_region_t{address, file_descriptor});
253 return &smpi_privatization_regions.back();
256 void smpi_destroy_global_memory_segments(){
257 if (smpi_data_exe_size == 0) // no need to switch
259 #if HAVE_PRIVATIZATION
260 for (auto const& region : smpi_privatization_regions) {
261 if (munmap(region.address, smpi_data_exe_size) < 0)
262 XBT_WARN("Unmapping of fd %d failed: %s", region.file_descriptor, strerror(errno));
263 close(region.file_descriptor);
265 smpi_privatization_regions.clear();
266 ::operator delete(smpi_data_exe_copy);
// Buffers handed out by smpi_get_tmp_sendbuffer() and smpi_get_tmp_recvbuffer() when the calling process is
// replaying; they only ever grow, until smpi_free_replay_tmp_buffers() releases them.
270 static std::vector<unsigned char> sendbuffer;
271 static std::vector<unsigned char> recvbuffer;
273 //allocate a single buffer for all sends, growing it if needed
274 unsigned char* smpi_get_tmp_sendbuffer(size_t size)
276 if (not smpi_process()->replaying())
277 return new unsigned char[size];
278 // FIXME: a resize() may invalidate a previous pointer. Maybe we need to handle a queue of buffers with a reference
279 // counter. The same holds for smpi_get_tmp_recvbuffer.
280 if (sendbuffer.size() < size)
281 sendbuffer.resize(size);
282 return sendbuffer.data();
285 //allocate a single buffer for all recv
286 unsigned char* smpi_get_tmp_recvbuffer(size_t size)
288 if (not smpi_process()->replaying())
289 return new unsigned char[size];
290 if (recvbuffer.size() < size)
291 recvbuffer.resize(size);
292 return recvbuffer.data();
295 void smpi_free_tmp_buffer(const unsigned char* buf)
297 if (not smpi_process()->replaying())
301 void smpi_free_replay_tmp_buffers()
303 std::vector<unsigned char>().swap(sendbuffer);
304 std::vector<unsigned char>().swap(recvbuffer);