Logo AND Algorithmique Numérique Distribuée

Public GIT Repository
828235a065ab09f099164ec5d0179e5f1192fc1e
[simgrid.git] / src / simix / smx_network.cpp
1 /* Copyright (c) 2009-2019. The SimGrid Team. All rights reserved.          */
2
3 /* This program is free software; you can redistribute it and/or modify it
4  * under the terms of the license (GNU LGPL) which comes with this package. */
5
6 #include "mc/mc.h"
7 #include "simgrid/Exception.hpp"
8 #include "src/kernel/activity/MailboxImpl.hpp"
9 #include "src/mc/mc_replay.hpp"
10 #include "src/simix/smx_private.hpp"
11 #include "src/surf/cpu_interface.hpp"
12 #include "src/surf/network_interface.hpp"
13
14 #include <boost/range/algorithm.hpp>
15
16 XBT_LOG_NEW_DEFAULT_SUBCATEGORY(simix_network, simix, "SIMIX network-related synchronization");
17
18 static void SIMIX_waitany_remove_simcall_from_actions(smx_simcall_t simcall);
19
20 /******************************************************************************/
21 /*                          Communication synchros                            */
22 /******************************************************************************/
23 XBT_PRIVATE void simcall_HANDLER_comm_send(smx_simcall_t simcall, smx_actor_t src, smx_mailbox_t mbox, double task_size,
24                                            double rate, void* src_buff, size_t src_buff_size,
25                                            int (*match_fun)(void*, void*, simgrid::kernel::activity::CommImpl*),
26                                            void (*copy_data_fun)(smx_activity_t, void*, size_t), void* data,
27                                            double timeout)
28 {
29   smx_activity_t comm = simcall_HANDLER_comm_isend(simcall, src, mbox, task_size, rate,
30                            src_buff, src_buff_size, match_fun, nullptr, copy_data_fun,
31                data, 0);
32   SIMCALL_SET_MC_VALUE(simcall, 0);
33   simcall_HANDLER_comm_wait(simcall, comm, timeout);
34 }
35
36 XBT_PRIVATE smx_activity_t simcall_HANDLER_comm_isend(
37     smx_simcall_t /*simcall*/, smx_actor_t src_proc, smx_mailbox_t mbox, double task_size, double rate, void* src_buff,
38     size_t src_buff_size, int (*match_fun)(void*, void*, simgrid::kernel::activity::CommImpl*),
39     void (*clean_fun)(void*), // used to free the synchro in case of problem after a detached send
40     void (*copy_data_fun)(smx_activity_t, void*, size_t), // used to copy data if not default one
41     void* data, int detached)
42 {
43   XBT_DEBUG("send from mailbox %p", mbox);
44
45   /* Prepare a synchro describing us, so that it gets passed to the user-provided filter of other side */
46   simgrid::kernel::activity::CommImplPtr this_comm = simgrid::kernel::activity::CommImplPtr(
47       new simgrid::kernel::activity::CommImpl(simgrid::kernel::activity::CommImpl::Type::SEND));
48
49   /* Look for communication synchro matching our needs. We also provide a description of
50    * ourself so that the other side also gets a chance of choosing if it wants to match with us.
51    *
52    * If it is not found then push our communication into the rendez-vous point */
53   simgrid::kernel::activity::CommImplPtr other_comm =
54       mbox->find_matching_comm(simgrid::kernel::activity::CommImpl::Type::RECEIVE, match_fun, data, this_comm,
55                                /*done*/ false, /*remove_matching*/ true);
56
57   if (not other_comm) {
58     other_comm = std::move(this_comm);
59
60     if (mbox->permanent_receiver_ != nullptr) {
61       //this mailbox is for small messages, which have to be sent right now
62       other_comm->state_  = SIMIX_READY;
63       other_comm->dst_actor_ = mbox->permanent_receiver_.get();
64       mbox->done_comm_queue_.push_back(other_comm);
65       XBT_DEBUG("pushing a message into the permanent receive list %p, comm %p", mbox, other_comm.get());
66
67     }else{
68       mbox->push(other_comm);
69     }
70   } else {
71     XBT_DEBUG("Receive already pushed");
72
73     other_comm->state_ = SIMIX_READY;
74     other_comm->type   = simgrid::kernel::activity::CommImpl::Type::READY;
75   }
76   src_proc->comms.push_back(other_comm);
77
78   if (detached) {
79     other_comm->detached = true;
80     other_comm->clean_fun = clean_fun;
81   } else {
82     other_comm->clean_fun = nullptr;
83   }
84
85   /* Setup the communication synchro */
86   other_comm->src_actor_     = src_proc;
87   other_comm->task_size_     = task_size;
88   other_comm->rate_          = rate;
89   other_comm->src_buff_      = src_buff;
90   other_comm->src_buff_size_ = src_buff_size;
91   other_comm->src_data_      = data;
92
93   other_comm->match_fun = match_fun;
94   other_comm->copy_data_fun = copy_data_fun;
95
96
97   if (MC_is_active() || MC_record_replay_is_active()) {
98     other_comm->state_ = SIMIX_RUNNING;
99     return (detached ? nullptr : other_comm);
100   }
101
102   other_comm->start();
103
104   return (detached ? nullptr : other_comm);
105 }
106
107 XBT_PRIVATE void simcall_HANDLER_comm_recv(smx_simcall_t simcall, smx_actor_t receiver, smx_mailbox_t mbox,
108                                            void* dst_buff, size_t* dst_buff_size,
109                                            int (*match_fun)(void*, void*, simgrid::kernel::activity::CommImpl*),
110                                            void (*copy_data_fun)(smx_activity_t, void*, size_t), void* data,
111                                            double timeout, double rate)
112 {
113   smx_activity_t comm = simcall_HANDLER_comm_irecv(simcall, receiver, mbox, dst_buff, dst_buff_size, match_fun,
114                                                    copy_data_fun, data, rate);
115   SIMCALL_SET_MC_VALUE(simcall, 0);
116   simcall_HANDLER_comm_wait(simcall, comm, timeout);
117 }
118
119 XBT_PRIVATE smx_activity_t simcall_HANDLER_comm_irecv(smx_simcall_t /*simcall*/, smx_actor_t receiver,
120                                                       smx_mailbox_t mbox, void* dst_buff, size_t* dst_buff_size,
121                                                       simix_match_func_t match_fun,
122                                                       void (*copy_data_fun)(smx_activity_t, void*, size_t), void* data,
123                                                       double rate)
124 {
125   simgrid::kernel::activity::CommImplPtr this_synchro = simgrid::kernel::activity::CommImplPtr(
126       new simgrid::kernel::activity::CommImpl(simgrid::kernel::activity::CommImpl::Type::RECEIVE));
127   XBT_DEBUG("recv from mbox %p. this_synchro=%p", mbox, this_synchro.get());
128
129   simgrid::kernel::activity::CommImplPtr other_comm;
130   //communication already done, get it inside the list of completed comms
131   if (mbox->permanent_receiver_ != nullptr && not mbox->done_comm_queue_.empty()) {
132
133     XBT_DEBUG("We have a comm that has probably already been received, trying to match it, to skip the communication");
134     //find a match in the list of already received comms
135     other_comm = mbox->find_matching_comm(simgrid::kernel::activity::CommImpl::Type::SEND, match_fun, data,
136                                           this_synchro, /*done*/ true,
137                                           /*remove_matching*/ true);
138     //if not found, assume the receiver came first, register it to the mailbox in the classical way
139     if (not other_comm) {
140       XBT_DEBUG("We have messages in the permanent receive list, but not the one we are looking for, pushing request into list");
141       other_comm = std::move(this_synchro);
142       mbox->push(other_comm);
143     } else {
144       if (other_comm->surf_action_ && other_comm->remains() < 1e-12) {
145         XBT_DEBUG("comm %p has been already sent, and is finished, destroy it", other_comm.get());
146         other_comm->state_ = SIMIX_DONE;
147         other_comm->type   = simgrid::kernel::activity::CommImpl::Type::DONE;
148         other_comm->mbox = nullptr;
149       }
150     }
151   } else {
152     /* Prepare a comm describing us, so that it gets passed to the user-provided filter of other side */
153
154     /* Look for communication activity matching our needs. We also provide a description of
155      * ourself so that the other side also gets a chance of choosing if it wants to match with us.
156      *
157      * If it is not found then push our communication into the rendez-vous point */
158     other_comm = mbox->find_matching_comm(simgrid::kernel::activity::CommImpl::Type::SEND, match_fun, data,
159                                           this_synchro, /*done*/ false,
160                                           /*remove_matching*/ true);
161
162     if (other_comm == nullptr) {
163       XBT_DEBUG("Receive pushed first (%zu comm enqueued so far)", mbox->comm_queue_.size());
164       other_comm = std::move(this_synchro);
165       mbox->push(other_comm);
166     } else {
167       XBT_DEBUG("Match my %p with the existing %p", this_synchro.get(), other_comm.get());
168
169       other_comm->state_ = SIMIX_READY;
170       other_comm->type   = simgrid::kernel::activity::CommImpl::Type::READY;
171     }
172     receiver->comms.push_back(other_comm);
173   }
174
175   /* Setup communication synchro */
176   other_comm->dst_actor_     = receiver;
177   other_comm->dst_buff_      = dst_buff;
178   other_comm->dst_buff_size_ = dst_buff_size;
179   other_comm->dst_data_      = data;
180
181   if (rate > -1.0 && (other_comm->rate_ < 0.0 || rate < other_comm->rate_))
182     other_comm->rate_ = rate;
183
184   other_comm->match_fun = match_fun;
185   other_comm->copy_data_fun = copy_data_fun;
186
187   if (MC_is_active() || MC_record_replay_is_active()) {
188     other_comm->state_ = SIMIX_RUNNING;
189     return other_comm;
190   }
191   other_comm->start();
192   return other_comm;
193 }
194
195 void simcall_HANDLER_comm_wait(smx_simcall_t simcall, smx_activity_t synchro, double timeout)
196 {
197   /* Associate this simcall to the wait synchro */
198   XBT_DEBUG("simcall_HANDLER_comm_wait, %p", synchro.get());
199
200   synchro->simcalls_.push_back(simcall);
201   simcall->issuer->waiting_synchro = synchro;
202
203   if (MC_is_active() || MC_record_replay_is_active()) {
204     int idx = SIMCALL_GET_MC_VALUE(simcall);
205     if (idx == 0) {
206       synchro->state_ = SIMIX_DONE;
207     } else {
208       /* If we reached this point, the wait simcall must have a timeout */
209       /* Otherwise it shouldn't be enabled and executed by the MC */
210       if (timeout < 0.0)
211         THROW_IMPOSSIBLE;
212
213       simgrid::kernel::activity::CommImplPtr comm =
214           boost::static_pointer_cast<simgrid::kernel::activity::CommImpl>(synchro);
215       if (comm->src_actor_ == simcall->issuer)
216         comm->state_ = SIMIX_SRC_TIMEOUT;
217       else
218         comm->state_ = SIMIX_DST_TIMEOUT;
219     }
220
221     SIMIX_comm_finish(synchro);
222     return;
223   }
224
225   /* If the synchro has already finish perform the error handling, */
226   /* otherwise set up a waiting timeout on the right side          */
227   if (synchro->state_ != SIMIX_WAITING && synchro->state_ != SIMIX_RUNNING) {
228     SIMIX_comm_finish(synchro);
229   } else { /* we need a sleep action (even when there is no timeout) to be notified of host failures */
230     simgrid::kernel::resource::Action* sleep = simcall->issuer->get_host()->pimpl_cpu->sleep(timeout);
231     sleep->set_data(synchro.get());
232
233     simgrid::kernel::activity::CommImplPtr comm =
234         boost::static_pointer_cast<simgrid::kernel::activity::CommImpl>(synchro);
235     if (simcall->issuer == comm->src_actor_)
236       comm->src_timeout_ = sleep;
237     else
238       comm->dst_timeout_ = sleep;
239   }
240 }
241
242 void simcall_HANDLER_comm_test(smx_simcall_t simcall, smx_activity_t synchro)
243 {
244   simgrid::kernel::activity::CommImplPtr comm =
245       boost::static_pointer_cast<simgrid::kernel::activity::CommImpl>(synchro);
246
247   int res;
248
249   if (MC_is_active() || MC_record_replay_is_active()){
250     res = comm->src_actor_ && comm->dst_actor_;
251     if (res)
252       synchro->state_ = SIMIX_DONE;
253   } else {
254     res = synchro->state_ != SIMIX_WAITING && synchro->state_ != SIMIX_RUNNING;
255   }
256
257   simcall_comm_test__set__result(simcall, res);
258   if (simcall_comm_test__get__result(simcall)) {
259     synchro->simcalls_.push_back(simcall);
260     SIMIX_comm_finish(synchro);
261   } else {
262     SIMIX_simcall_answer(simcall);
263   }
264 }
265
266 void simcall_HANDLER_comm_testany(smx_simcall_t simcall, simgrid::kernel::activity::ActivityImplPtr comms[],
267                                   size_t count)
268 {
269   // The default result is -1 -- this means, "nothing is ready".
270   // It can be changed below, but only if something matches.
271   simcall_comm_testany__set__result(simcall, -1);
272
273   if (MC_is_active() || MC_record_replay_is_active()){
274     int idx = SIMCALL_GET_MC_VALUE(simcall);
275     if(idx == -1){
276       SIMIX_simcall_answer(simcall);
277     }else{
278       simgrid::kernel::activity::ActivityImplPtr synchro = comms[idx];
279       simcall_comm_testany__set__result(simcall, idx);
280       synchro->simcalls_.push_back(simcall);
281       synchro->state_ = SIMIX_DONE;
282       SIMIX_comm_finish(synchro);
283     }
284     return;
285   }
286
287   for (std::size_t i = 0; i != count; ++i) {
288     simgrid::kernel::activity::ActivityImplPtr synchro = comms[i];
289     if (synchro->state_ != SIMIX_WAITING && synchro->state_ != SIMIX_RUNNING) {
290       simcall_comm_testany__set__result(simcall, i);
291       synchro->simcalls_.push_back(simcall);
292       SIMIX_comm_finish(synchro);
293       return;
294     }
295   }
296   SIMIX_simcall_answer(simcall);
297 }
298
299 void simcall_HANDLER_comm_waitany(smx_simcall_t simcall, xbt_dynar_t synchros, double timeout)
300 {
301   if (MC_is_active() || MC_record_replay_is_active()){
302     if (timeout > 0.0)
303       xbt_die("Timeout not implemented for waitany in the model-checker");
304     int idx = SIMCALL_GET_MC_VALUE(simcall);
305     smx_activity_t synchro = xbt_dynar_get_as(synchros, idx, smx_activity_t);
306     synchro->simcalls_.push_back(simcall);
307     simcall_comm_waitany__set__result(simcall, idx);
308     synchro->state_ = SIMIX_DONE;
309     SIMIX_comm_finish(synchro);
310     return;
311   }
312
313   if (timeout < 0.0){
314     simcall->timer = NULL;
315   } else {
316     simcall->timer = SIMIX_timer_set(SIMIX_get_clock() + timeout, [simcall]() {
317       SIMIX_waitany_remove_simcall_from_actions(simcall);
318       simcall_comm_waitany__set__result(simcall, -1);
319       SIMIX_simcall_answer(simcall);
320     });
321   }
322
323   unsigned int cursor;
324   simgrid::kernel::activity::ActivityImpl* ptr;
325   xbt_dynar_foreach(synchros, cursor, ptr){
326     smx_activity_t synchro = simgrid::kernel::activity::ActivityImplPtr(ptr);
327     /* associate this simcall to the the synchro */
328     synchro->simcalls_.push_back(simcall);
329
330     /* see if the synchro is already finished */
331     if (synchro->state_ != SIMIX_WAITING && synchro->state_ != SIMIX_RUNNING) {
332       SIMIX_comm_finish(synchro);
333       break;
334     }
335   }
336 }
337
338 void SIMIX_waitany_remove_simcall_from_actions(smx_simcall_t simcall)
339 {
340   unsigned int cursor = 0;
341   xbt_dynar_t synchros = simcall_comm_waitany__get__comms(simcall);
342
343   simgrid::kernel::activity::ActivityImpl* ptr;
344   xbt_dynar_foreach(synchros, cursor, ptr){
345     smx_activity_t synchro = simgrid::kernel::activity::ActivityImplPtr(ptr);
346
347     // Remove the first occurence of simcall:
348     auto i = boost::range::find(synchro->simcalls_, simcall);
349     if (i != synchro->simcalls_.end())
350       synchro->simcalls_.erase(i);
351   }
352 }
353
354 /**
355  * @brief Answers the SIMIX simcalls associated to a communication synchro.
356  * @param synchro a finished communication synchro
357  */
358 void SIMIX_comm_finish(smx_activity_t synchro)
359 {
360   simgrid::kernel::activity::CommImplPtr comm =
361       boost::static_pointer_cast<simgrid::kernel::activity::CommImpl>(synchro);
362
363   while (not synchro->simcalls_.empty()) {
364     smx_simcall_t simcall = synchro->simcalls_.front();
365     synchro->simcalls_.pop_front();
366
367     /* If a waitany simcall is waiting for this synchro to finish, then remove it from the other synchros in the waitany
368      * list. Afterwards, get the position of the actual synchro in the waitany dynar and return it as the result of the
369      * simcall */
370
371     if (simcall->call == SIMCALL_NONE) //FIXME: maybe a better way to handle this case
372       continue; // if process handling comm is killed
373     if (simcall->call == SIMCALL_COMM_WAITANY) {
374       SIMIX_waitany_remove_simcall_from_actions(simcall);
375       if (simcall->timer) {
376         SIMIX_timer_remove(simcall->timer);
377         simcall->timer = nullptr;
378       }
379       if (not MC_is_active() && not MC_record_replay_is_active())
380         simcall_comm_waitany__set__result(simcall,
381                                           xbt_dynar_search(simcall_comm_waitany__get__comms(simcall), &synchro));
382     }
383
384     /* If the synchro is still in a rendez-vous point then remove from it */
385     if (comm->mbox)
386       comm->mbox->remove(comm);
387
388     XBT_DEBUG("SIMIX_comm_finish: synchro state = %d", (int)synchro->state_);
389
390     /* Check out for errors */
391
392     if (not simcall->issuer->get_host()->is_on()) {
393       simcall->issuer->context_->iwannadie = true;
394       simcall->issuer->exception_ =
395           std::make_exception_ptr(simgrid::HostFailureException(XBT_THROW_POINT, "Host failed"));
396     } else {
397       switch (comm->state_) {
398
399         case SIMIX_DONE:
400           XBT_DEBUG("Communication %p complete!", synchro.get());
401           comm->copy_data();
402           break;
403
404         case SIMIX_SRC_TIMEOUT:
405           simcall->issuer->exception_ = std::make_exception_ptr(
406               simgrid::TimeoutError(XBT_THROW_POINT, "Communication timeouted because of the sender"));
407           break;
408
409         case SIMIX_DST_TIMEOUT:
410           simcall->issuer->exception_ = std::make_exception_ptr(
411               simgrid::TimeoutError(XBT_THROW_POINT, "Communication timeouted because of the receiver"));
412           break;
413
414         case SIMIX_SRC_HOST_FAILURE:
415           if (simcall->issuer == comm->src_actor_)
416             simcall->issuer->context_->iwannadie = true;
417           else
418             simcall->issuer->exception_ =
419                 std::make_exception_ptr(simgrid::NetworkFailureException(XBT_THROW_POINT, "Remote peer failed"));
420           break;
421
422         case SIMIX_DST_HOST_FAILURE:
423           if (simcall->issuer == comm->dst_actor_)
424             simcall->issuer->context_->iwannadie = true;
425           else
426             simcall->issuer->exception_ =
427                 std::make_exception_ptr(simgrid::NetworkFailureException(XBT_THROW_POINT, "Remote peer failed"));
428           break;
429
430         case SIMIX_LINK_FAILURE:
431           XBT_DEBUG("Link failure in synchro %p between '%s' and '%s': posting an exception to the issuer: %s (%p) "
432                     "detached:%d",
433                     synchro.get(), comm->src_actor_ ? comm->src_actor_->get_host()->get_cname() : nullptr,
434                     comm->dst_actor_ ? comm->dst_actor_->get_host()->get_cname() : nullptr,
435                     simcall->issuer->get_cname(), simcall->issuer, comm->detached);
436           if (comm->src_actor_ == simcall->issuer) {
437             XBT_DEBUG("I'm source");
438           } else if (comm->dst_actor_ == simcall->issuer) {
439             XBT_DEBUG("I'm dest");
440           } else {
441             XBT_DEBUG("I'm neither source nor dest");
442           }
443           simcall->issuer->throw_exception(
444               std::make_exception_ptr(simgrid::NetworkFailureException(XBT_THROW_POINT, "Link failure")));
445           break;
446
447         case SIMIX_CANCELED:
448           if (simcall->issuer == comm->dst_actor_)
449             simcall->issuer->exception_ = std::make_exception_ptr(
450                 simgrid::CancelException(XBT_THROW_POINT, "Communication canceled by the sender"));
451           else
452             simcall->issuer->exception_ = std::make_exception_ptr(
453                 simgrid::CancelException(XBT_THROW_POINT, "Communication canceled by the receiver"));
454           break;
455
456         default:
457           xbt_die("Unexpected synchro state in SIMIX_comm_finish: %d", (int)synchro->state_);
458       }
459     }
460
461     /* if there is an exception during a waitany or a testany, indicate the position of the failed communication */
462     if (simcall->issuer->exception_ &&
463         (simcall->call == SIMCALL_COMM_WAITANY || simcall->call == SIMCALL_COMM_TESTANY)) {
464       // First retrieve the rank of our failing synchro
465       int rank = -1;
466       if (simcall->call == SIMCALL_COMM_WAITANY) {
467         rank = xbt_dynar_search(simcall_comm_waitany__get__comms(simcall), &synchro);
468       } else if (simcall->call == SIMCALL_COMM_TESTANY) {
469         rank         = -1;
470         auto* comms  = simcall_comm_testany__get__comms(simcall);
471         auto count   = simcall_comm_testany__get__count(simcall);
472         auto element = std::find(comms, comms + count, synchro);
473         if (element == comms + count)
474           rank = -1;
475         else
476           rank = element - comms;
477       }
478
479       // In order to modify the exception we have to rethrow it:
480       try {
481         std::rethrow_exception(simcall->issuer->exception_);
482       } catch (simgrid::TimeoutError& e) {
483         e.value                    = rank;
484         simcall->issuer->exception_ = std::make_exception_ptr(e);
485       } catch (simgrid::NetworkFailureException& e) {
486         e.value                    = rank;
487         simcall->issuer->exception_ = std::make_exception_ptr(e);
488       } catch (simgrid::CancelException& e) {
489         e.value                    = rank;
490         simcall->issuer->exception_ = std::make_exception_ptr(e);
491       }
492     }
493
494     simcall->issuer->waiting_synchro = nullptr;
495     simcall->issuer->comms.remove(synchro);
496     if(comm->detached){
497       if (simcall->issuer == comm->src_actor_) {
498         if (comm->dst_actor_)
499           comm->dst_actor_->comms.remove(synchro);
500       } else if (simcall->issuer == comm->dst_actor_) {
501         if (comm->src_actor_)
502           comm->src_actor_->comms.remove(synchro);
503       }
504       else{
505         comm->dst_actor_->comms.remove(synchro);
506         comm->src_actor_->comms.remove(synchro);
507       }
508     }
509
510     if (simcall->issuer->get_host()->is_on())
511       SIMIX_simcall_answer(simcall);
512     else
513       simcall->issuer->context_->iwannadie = true;
514   }
515 }
516
517 void SIMIX_comm_copy_buffer_callback(smx_activity_t synchro, void* buff, size_t buff_size)
518 {
519   simgrid::kernel::activity::CommImplPtr comm =
520       boost::static_pointer_cast<simgrid::kernel::activity::CommImpl>(synchro);
521
522   XBT_DEBUG("Copy the data over");
523   memcpy(comm->dst_buff_, buff, buff_size);
524   if (comm->detached) { // if this is a detached send, the source buffer was duplicated by SMPI sender to make the original buffer available to the application ASAP
525     xbt_free(buff);
526     comm->src_buff_ = nullptr;
527   }
528 }