From 9d195a1f6e6006762e394ab69955d0824bbfe2d8 Mon Sep 17 00:00:00 2001 From: Martin Quinson Date: Fri, 24 Mar 2023 22:06:04 +0100 Subject: [PATCH] Fix the refork feature by not ptracing App so that it dies properly --- src/mc/explo/DFSExplorer.cpp | 3 +-- src/mc/remote/AppSide.cpp | 15 +++++++++------ src/mc/remote/CheckerSide.cpp | 35 +++++++++++++++++++++-------------- 3 files changed, 31 insertions(+), 22 deletions(-) diff --git a/src/mc/explo/DFSExplorer.cpp b/src/mc/explo/DFSExplorer.cpp index 456869ecc6..5632d1fff3 100644 --- a/src/mc/explo/DFSExplorer.cpp +++ b/src/mc/explo/DFSExplorer.cpp @@ -297,8 +297,7 @@ void DFSExplorer::backtrack() } // If no backtracing point, then the stack is empty and the exploration is over } -//DFSExplorer::DFSExplorer(const std::vector& args, bool with_dpor) : Exploration(args, _sg_mc_termination) // UNCOMMENT TO ACTIVATE REFORKS -DFSExplorer::DFSExplorer(const std::vector& args, bool with_dpor) : Exploration(args, true) // This version does not use reforks as it breaks +DFSExplorer::DFSExplorer(const std::vector& args, bool with_dpor) : Exploration(args, _sg_mc_termination) { if (with_dpor) reduction_mode_ = ReductionMode::dpor; diff --git a/src/mc/remote/AppSide.cpp b/src/mc/remote/AppSide.cpp index 216227f761..4baa05966f 100644 --- a/src/mc/remote/AppSide.cpp +++ b/src/mc/remote/AppSide.cpp @@ -64,16 +64,19 @@ AppSide* AppSide::initialize() instance_ = std::make_unique(fd); // Wait for the model-checker: - errno = 0; + if (getenv("MC_NEED_PTRACE") != nullptr) { + errno = 0; #if defined __linux__ - ptrace(PTRACE_TRACEME, 0, nullptr, nullptr); + ptrace(PTRACE_TRACEME, 0, nullptr, nullptr); #elif defined BSD - ptrace(PT_TRACE_ME, 0, nullptr, 0); + ptrace(PT_TRACE_ME, 0, nullptr, 0); #else -#error "no ptrace equivalent coded for this platform" + xbt_die("no ptrace equivalent coded for this platform, please don't use the liveness checker here."); #endif - xbt_assert(errno == 0 && raise(SIGSTOP) == 0, "Could not wait for the model-checker (errno = %d: %s)", errno, - strerror(errno)); + + xbt_assert(errno == 0 && raise(SIGSTOP) == 0, "Could not wait for the model-checker (errno = %d: %s)", errno, + strerror(errno)); + } instance_->handle_messages(); return instance_.get(); diff --git a/src/mc/remote/CheckerSide.cpp b/src/mc/remote/CheckerSide.cpp index 9cc1f7d833..dcf347a25a 100644 --- a/src/mc/remote/CheckerSide.cpp +++ b/src/mc/remote/CheckerSide.cpp @@ -37,7 +37,7 @@ static simgrid::config::Flag _sg_mc_setenv{ namespace simgrid::mc { -XBT_ATTRIB_NORETURN static void run_child_process(int socket, const std::vector& args) +XBT_ATTRIB_NORETURN static void run_child_process(int socket, const std::vector& args, bool need_ptrace) { /* On startup, simix_global_init() calls simgrid::mc::Client::initialize(), which checks whether the MC_ENV_SOCKET_FD * env variable is set. If so, MC mode is assumed, and the client is setup from its side @@ -63,6 +63,8 @@ XBT_ATTRIB_NORETURN static void run_child_process(int socket, const std::vector< "Could not remove CLOEXEC for socket"); setenv(MC_ENV_SOCKET_FD, std::to_string(socket).c_str(), 1); + if (need_ptrace) + setenv("MC_NEED_PTRACE", "1", 1); /* Setup the tokenizer that parses the cfg:model-check/setenv parameter */ using Tokenizer = boost::tokenizer>; @@ -144,7 +146,9 @@ void CheckerSide::setup_events() throw simgrid::xbt::errno_error(); } - if (not checker->handle_message(buffer.data(), size)) + if (size == 0) // The app closed the socket. It must be dead by now. + checker->handle_waitpid(); + else if (not checker->handle_message(buffer.data(), size)) checker->break_loop(); } else { xbt_die("Unexpected event"); @@ -170,8 +174,10 @@ void CheckerSide::setup_events() event_add(signal_event_, nullptr); } -CheckerSide::CheckerSide(const std::vector& args, bool need_memory_introspection) : running_(true) +CheckerSide::CheckerSide(const std::vector& args, bool need_memory_info) : running_(true) { + bool need_ptrace = not need_memory_info; + // Create an AF_LOCAL socketpair used for exchanging messages between the model-checker process (ancestor) // and the application process (child) int sockets[2]; @@ -182,7 +188,7 @@ CheckerSide::CheckerSide(const std::vector& args, bool need_memory_intros if (pid_ == 0) { // Child ::close(sockets[1]); - run_child_process(sockets[0], args); + run_child_process(sockets[0], args, need_ptrace); DIE_IMPOSSIBLE; } @@ -191,10 +197,11 @@ CheckerSide::CheckerSide(const std::vector& args, bool need_memory_intros channel_.reset_socket(sockets[1]); setup_events(); - wait_application_process(pid_); + if (need_ptrace) + wait_application_process(pid_); // Request the initial memory on need - if (need_memory_introspection) { + if (need_memory_info) { channel_.send(MessageType::INITIAL_ADDRESSES); s_mc_message_initial_addresses_reply_t answer; ssize_t answer_size = channel_.receive(answer); @@ -219,14 +226,14 @@ CheckerSide::~CheckerSide() event_free(signal_event_); if (running()) { - XBT_DEBUG("Killing process"); - kill(get_pid(), SIGKILL); - while (waitpid(-1, nullptr, WNOHANG) > 0) { - /* we don't really care about errors here, as we are shutting things down anyway */ - /* The child will get ripped by the next waitpid anyway */ - } + errno = 0; + xbt_assert(kill(get_pid(), SIGKILL) == 0); + do { + errno = 0; + waitpid(get_pid(), nullptr, 0); + xbt_assert(errno == 0); + } while (EAGAIN == errno); } - // usleep(500); // Try to reduce the load on my system. Commented because it's not even enough :( } void CheckerSide::finalize(bool terminate_asap) @@ -258,7 +265,7 @@ void CheckerSide::break_loop() const bool CheckerSide::handle_message(const char* buffer, ssize_t size) { s_mc_message_t base_message; - xbt_assert(size >= (ssize_t)sizeof(base_message), "Broken message"); + xbt_assert(size >= (ssize_t)sizeof(base_message), "Broken message. Got only %ld bytes.", size); memcpy(&base_message, buffer, sizeof(base_message)); switch (base_message.type) { -- 2.20.1