- INFO2("Sending a 'Find Successor' request to %d for key %d", ask_to, id);
- m_task_t task = MSG_task_create("Find Successor", 1000, 5000, &req_data);
- MSG_task_send(task, get_mailbox(ask_to));
-
- // receive the answer
- task = NULL;
- MSG_task_receive(&task, req_data.answer_to);
- task_data_t ans_data;
- ans_data = MSG_task_get_data(task);
- int successor = ans_data->answer_id;
- xbt_free(mailbox);
- INFO2("Received the answer to my 'Find Successor' request: the successor of key %d is %d", id, successor);
+ msg_task_t task_sent = MSG_task_create(NULL, COMP_SIZE, COMM_SIZE, req_data);
+ XBT_DEBUG("Sending a 'Find Successor' request (task %p) to %d for id %d", task_sent, ask_to, id);
+ msg_error_t res = MSG_task_send_with_timeout(task_sent, mailbox, timeout);
+
+ if (res != MSG_OK) {
+ XBT_DEBUG("Failed to send the 'Find Successor' request (task %p) to %d for id %d",
+ task_sent, ask_to, id);
+ task_free(task_sent);
+ }
+ else {
+
+ // receive the answer
+ XBT_DEBUG("Sent a 'Find Successor' request (task %p) to %d for key %d, waiting for the answer",
+ task_sent, ask_to, id);
+
+ do {
+ if (node->comm_receive == NULL) {
+ msg_task_t task_received = NULL;
+ node->comm_receive = MSG_task_irecv(&task_received, node->mailbox);
+ }
+
+ res = MSG_comm_wait(node->comm_receive, timeout);
+
+ if (res != MSG_OK) {
+ XBT_DEBUG("Failed to receive the answer to my 'Find Successor' request (task %p): %d",
+ task_sent, (int)res);
+ stop = 1;
+ MSG_comm_destroy(node->comm_receive);
+ node->comm_receive = NULL;
+ }
+ else {
+ msg_task_t task_received = MSG_comm_get_task(node->comm_receive);
+ XBT_DEBUG("Received a task (%p)", task_received);
+ task_data_t ans_data = MSG_task_get_data(task_received);
+
+ // Once upon a time, our code assumed that here, task_received == task_sent all the time
+ //
+ // This assumption is wrong (as messages from different rounds can interleave), leading to a bug in our code.
+ // We failed to find this bug directly, as it only occurred on large platforms, leading to hardly usable traces.
+ // Instead, we used the model-checker to track down the issue by adding the following test here in the code:
+ // if (MC_is_active()) {
+ // MC_assert(task_received == task_sent);
+ // }
+ // That explained the bug in a snap, with a very cool example and everything.
+ //
+ // This MC_assert is now deactivated as the case is now properly handled in our code and we don't want the
+ // MC to fail any further under that condition, but this comment is here as a memorial for this first
+ // brilliant victory of the model-checking in the SimGrid community :)
+
+ if (task_received != task_sent ||
+ ans_data->type != TASK_FIND_SUCCESSOR_ANSWER) {
+ // this is not the expected answer
+ MSG_comm_destroy(node->comm_receive);
+ node->comm_receive = NULL;
+ handle_task(node, task_received);
+ }
+ else {
+ // this is our answer
+ XBT_DEBUG("Received the answer to my 'Find Successor' request for id %d (task %p): the successor of key %d is %d",
+ ans_data->request_id, task_received, id, ans_data->answer_id);
+ successor = ans_data->answer_id;
+ stop = 1;
+ MSG_comm_destroy(node->comm_receive);
+ node->comm_receive = NULL;
+ task_free(task_received);
+ }
+ }
+ } while (!stop);
+ }