Logo AND Algorithmique Numérique Distribuée

Public GIT Repository
Fixed a race condition in msg for communication between two processes. The
[simgrid.git] / src / msg / msg_mailbox.c
1 #include "mailbox.h"
2 #include "msg/private.h"
3
4 XBT_LOG_NEW_DEFAULT_SUBCATEGORY(msg_mailbox, msg,
5                                 "Logging specific to MSG (mailbox)");
6
7 static xbt_dict_t msg_mailboxes = NULL;
8
9 void MSG_mailbox_mod_init(void)
10 {
11   msg_mailboxes = xbt_dict_new();
12 }
13
14 void MSG_mailbox_mod_exit(void)
15 {
16   xbt_dict_free(&msg_mailboxes);
17 }
18
19 msg_mailbox_t MSG_mailbox_create(const char *alias)
20 {
21   msg_mailbox_t mailbox = xbt_new0(s_msg_mailbox_t, 1);
22
23   mailbox->tasks = xbt_fifo_new();
24   mailbox->cond = NULL;
25   mailbox->alias = alias ? xbt_strdup(alias) : NULL;
26   mailbox->hostname = NULL;
27
28   return mailbox;
29 }
30
31 msg_mailbox_t MSG_mailbox_new(const char *alias)
32 {
33   msg_mailbox_t mailbox = MSG_mailbox_create(alias);
34
35   /* add the mbox in the dictionary */
36   xbt_dict_set(msg_mailboxes, alias, mailbox, MSG_mailbox_free);
37
38   return mailbox;
39 }
40
41 void MSG_mailbox_free(void *mailbox)
42 {
43   msg_mailbox_t _mailbox = (msg_mailbox_t) mailbox;
44
45   if (_mailbox->hostname)
46     free(_mailbox->hostname);
47
48   xbt_fifo_free(_mailbox->tasks);
49   free(_mailbox->alias);
50
51   free(_mailbox);
52 }
53
54 smx_cond_t MSG_mailbox_get_cond(msg_mailbox_t mailbox)
55 {
56   return mailbox->cond;
57 }
58
59 void MSG_mailbox_remove(msg_mailbox_t mailbox, m_task_t task)
60 {
61   xbt_fifo_remove(mailbox->tasks, task);
62 }
63
64 int MSG_mailbox_is_empty(msg_mailbox_t mailbox)
65 {
66   return (NULL == xbt_fifo_get_first_item(mailbox->tasks));
67 }
68
69 m_task_t MSG_mailbox_pop_head(msg_mailbox_t mailbox)
70 {
71   return (m_task_t) xbt_fifo_shift(mailbox->tasks);
72 }
73
74 m_task_t MSG_mailbox_get_head(msg_mailbox_t mailbox)
75 {
76   xbt_fifo_item_t item;
77
78   if (!(item = xbt_fifo_get_first_item(mailbox->tasks)))
79     return NULL;
80
81   return (m_task_t) xbt_fifo_get_item_content(item);
82 }
83
84
85 m_task_t
86 MSG_mailbox_get_first_host_task(msg_mailbox_t mailbox, m_host_t host)
87 {
88   m_task_t task = NULL;
89   xbt_fifo_item_t item = NULL;
90
91   xbt_fifo_foreach(mailbox->tasks, item, task, m_task_t)
92       if (task->simdata->source == host) {
93     xbt_fifo_remove_item(mailbox->tasks, item);
94     return task;
95   }
96
97   return NULL;
98 }
99
100 int
101 MSG_mailbox_get_count_host_waiting_tasks(msg_mailbox_t mailbox,
102                                          m_host_t host)
103 {
104   m_task_t task = NULL;
105   xbt_fifo_item_t item = NULL;
106   int count = 0;
107
108   xbt_fifo_foreach(mailbox->tasks, item, task, m_task_t) {
109     if (task->simdata->source == host)
110       count++;
111   }
112
113   return count;
114 }
115
116 void MSG_mailbox_set_cond(msg_mailbox_t mailbox, smx_cond_t cond)
117 {
118   mailbox->cond = cond;
119 }
120
121 const char *MSG_mailbox_get_alias(msg_mailbox_t mailbox)
122 {
123   return mailbox->alias;
124 }
125
126 const char *MSG_mailbox_get_hostname(msg_mailbox_t mailbox)
127 {
128   return mailbox->hostname;
129 }
130
131 void MSG_mailbox_set_hostname(msg_mailbox_t mailbox, const char *hostname)
132 {
133   mailbox->hostname = xbt_strdup(hostname);
134 }
135
136 msg_mailbox_t MSG_mailbox_get_by_alias(const char *alias)
137 {
138
139   msg_mailbox_t mailbox = xbt_dict_get_or_null(msg_mailboxes, alias);
140
141   if (!mailbox) {
142     mailbox = MSG_mailbox_new(alias);
143     MSG_mailbox_set_hostname(mailbox, MSG_host_self()->name);
144   }
145
146   return mailbox;
147 }
148
149 msg_mailbox_t
150 MSG_mailbox_get_by_channel(m_host_t host, m_channel_t channel)
151 {
152   xbt_assert0((host != NULL), "Invalid host");
153   xbt_assert1((channel >= 0)
154               && (channel < msg_global->max_channel), "Invalid channel %d",
155               channel);
156
157   return host->simdata->mailboxes[(size_t) channel];
158 }
159
160 MSG_error_t
161 MSG_mailbox_get_task_ext(msg_mailbox_t mailbox, m_task_t * task,
162                          m_host_t host, double timeout)
163 {
164   m_process_t process = MSG_process_self();
165   m_task_t t = NULL;
166   m_host_t h = NULL;
167   simdata_task_t t_simdata = NULL;
168   simdata_host_t h_simdata = NULL;
169   double start_time = SIMIX_get_clock();
170
171   smx_cond_t cond = NULL;       //conditional wait if the task isn't on the channel yet
172
173   CHECK_HOST();
174
175   /* Sanity check */
176   xbt_assert0(task, "Null pointer for the task storage");
177
178   if (*task)
179     CRITICAL0
180         ("MSG_task_get() was asked to write in a non empty task struct.");
181
182   /* Get the task */
183   h = MSG_host_self();
184   h_simdata = h->simdata;
185
186   SIMIX_mutex_lock(h->simdata->mutex);
187
188   if (MSG_mailbox_get_cond(mailbox)) {
189     CRITICAL1("A process is already blocked on the channel %s",
190               MSG_mailbox_get_alias(mailbox));
191     SIMIX_cond_display_info(MSG_mailbox_get_cond(mailbox));
192     xbt_die("Go fix your code!");
193   }
194
195   while (1) {
196     /* if the mailbox is empty (has no task */
197     if (!MSG_mailbox_is_empty(mailbox)) {
198       if (!host) {
199         /* pop the head of the mailbox */
200         t = MSG_mailbox_pop_head(mailbox);
201         break;
202       } else {
203         /* get the first task of the host */
204         if ((t = MSG_mailbox_get_first_host_task(mailbox, host)))
205           break;
206       }
207     }
208
209     if ((timeout > 0) && (SIMIX_get_clock()-start_time>=timeout)) {
210       SIMIX_mutex_unlock(h->simdata->mutex);
211       MSG_mailbox_set_cond(mailbox, NULL);
212       SIMIX_cond_destroy(cond);
213       MSG_RETURN(MSG_TRANSFER_FAILURE);
214     }
215
216     if(!cond) {
217       cond = SIMIX_cond_init();
218       MSG_mailbox_set_cond(mailbox, cond);
219     }
220
221     if (timeout > 0)
222       SIMIX_cond_wait_timeout(cond, h->simdata->mutex, timeout-start_time);
223     else
224       SIMIX_cond_wait(MSG_mailbox_get_cond(mailbox), h->simdata->mutex);
225
226     if (SIMIX_host_get_state(h_simdata->smx_host) == 0) {
227       SIMIX_mutex_unlock(h->simdata->mutex);
228       MSG_mailbox_set_cond(mailbox, NULL);
229       SIMIX_cond_destroy(cond);
230       MSG_RETURN(MSG_HOST_FAILURE);
231     }
232   }
233
234
235   DEBUG1("OK, got a task (%s)", t->name);
236   /* clean conditional */
237   if (cond) {
238     MSG_mailbox_set_cond(mailbox, NULL);
239     SIMIX_cond_destroy(cond);
240   }
241
242   SIMIX_mutex_unlock(h->simdata->mutex);
243
244   t_simdata = t->simdata;
245   t_simdata->receiver = process;
246   *task = t;
247
248   SIMIX_mutex_lock(t_simdata->mutex);
249
250   /* Transfer */
251   /* create SIMIX action to the communication */
252   t_simdata->comm =
253       SIMIX_action_communicate(t_simdata->sender->simdata->m_host->
254                                simdata->smx_host,
255                                process->simdata->m_host->simdata->smx_host,
256                                t->name, t_simdata->message_size,
257                                t_simdata->rate);
258
259   /* This is a hack. We know that both the receiver and the sender will
260      need to look at the content of t_simdata->comm. And it needs to be
261      destroyed. However, we don't known whether the receiver or the sender
262      will get to it first. So by setting whit refcount to 2 we can enforce
263      that things happen correctly. An alternative would be to only do ++ and
264      -- on this refcount and to sprinkle them judiciously throughout the code,
265      which appears perhaps worse? Or perhaps the refcount field of
266      task->simdata can be used for this? At any rate, this will do for now */
267   t_simdata->comm->refcount = 2;
268
269   /* if the process is suspend, create the action but stop its execution, it will be restart when the sender process resume */
270   if (MSG_process_is_suspended(t_simdata->sender)) {
271     DEBUG1("Process sender (%s) suspended", t_simdata->sender->name);
272     SIMIX_action_set_priority(t_simdata->comm, 0);
273   }
274
275   process->simdata->waiting_task = t;
276   SIMIX_register_action_to_condition(t_simdata->comm, t_simdata->cond);
277
278   while (1) {
279     SIMIX_cond_wait(t_simdata->cond, t_simdata->mutex);
280
281     if (SIMIX_action_get_state(t_simdata->comm) != SURF_ACTION_RUNNING)
282       break;
283   }
284
285   SIMIX_unregister_action_to_condition(t_simdata->comm, t_simdata->cond);
286   process->simdata->waiting_task = NULL;
287
288   /* the task has already finished and the pointer must be null */
289   if (t->simdata->sender) {
290     t->simdata->sender->simdata->waiting_task = NULL;
291   }
292
293   /* for this process, don't need to change in get function */
294   t->simdata->receiver = NULL;
295   SIMIX_mutex_unlock(t_simdata->mutex);
296
297
298   if (SIMIX_action_get_state(t_simdata->comm) == SURF_ACTION_DONE) {
299     if (t_simdata->comm->refcount == 1) {
300       SIMIX_action_destroy(t_simdata->comm);
301       t_simdata->comm = NULL;
302     } else {
303       t_simdata->comm->refcount --;
304     }
305     t_simdata->refcount --;
306     MSG_RETURN(MSG_OK);
307   } else if (SIMIX_host_get_state(h_simdata->smx_host) == 0) {
308     if (t_simdata->comm->refcount == 1) {
309       SIMIX_action_destroy(t_simdata->comm);
310       t_simdata->comm = NULL;
311     } else {
312       t_simdata->comm->refcount --;
313     }
314     t_simdata->refcount --;
315     MSG_RETURN(MSG_HOST_FAILURE);
316   } else {
317     if (t_simdata->comm->refcount ==1 ) {
318       SIMIX_action_destroy(t_simdata->comm);
319       t_simdata->comm = NULL;
320     } else {
321       t_simdata->comm->refcount --;
322     }
323     t_simdata->refcount --;
324     MSG_RETURN(MSG_TRANSFER_FAILURE);
325   }
326 }
327
328 MSG_error_t
329 MSG_mailbox_put_with_timeout(msg_mailbox_t mailbox, m_task_t task,
330                              double timeout)
331 {
332   m_process_t process = MSG_process_self();
333   const char *hostname;
334   simdata_task_t t_simdata = NULL;
335   m_host_t local_host = NULL;
336   m_host_t remote_host = NULL;
337   smx_cond_t cond = NULL;
338
339   CHECK_HOST();
340
341   t_simdata = task->simdata;
342   t_simdata->sender = process;
343   t_simdata->source = MSG_process_get_host(process);
344
345   xbt_assert0(t_simdata->refcount  == 1,
346               "This task is still being used somewhere else. You cannot send it now. Go fix your code!");
347
348   t_simdata->comm = NULL;
349
350   t_simdata->refcount ++;
351   local_host = ((simdata_process_t) process->simdata)->m_host;
352   msg_global->sent_msg++;
353
354   /* get the host name containing the mailbox */
355   hostname = MSG_mailbox_get_hostname(mailbox);
356
357   remote_host = MSG_get_host_by_name(hostname);
358
359   if (!remote_host)
360     THROW1(not_found_error, 0, "Host %s not fount", hostname);
361
362
363   DEBUG4
364       ("Trying to send a task (%g kB) from %s to %s on the channel aliased by the alias %s",
365        t_simdata->message_size / 1000, local_host->name,
366        remote_host->name, MSG_mailbox_get_alias(mailbox));
367
368   SIMIX_mutex_lock(remote_host->simdata->mutex);
369
370   /* put the task in the mailbox */
371   xbt_fifo_push(mailbox->tasks, task);
372
373   if ((cond = MSG_mailbox_get_cond(mailbox))) {
374     DEBUG0("Somebody is listening. Let's wake him up!");
375     SIMIX_cond_signal(cond);
376   }
377
378   SIMIX_mutex_unlock(remote_host->simdata->mutex);
379
380   SIMIX_mutex_lock(t_simdata->mutex);
381
382   process->simdata->waiting_task = task;
383
384   if (timeout > 0) {
385     xbt_ex_t e;
386     double time;
387     double time_elapsed;
388     time = SIMIX_get_clock();
389
390     TRY {
391       /*verify if the action that ends is the correct. Call the wait_timeout with the new time. If the timeout occurs, an exception is raised */
392       while (1) {
393         time_elapsed = SIMIX_get_clock() - time;
394         SIMIX_cond_wait_timeout(t_simdata->cond, t_simdata->mutex,
395                                 timeout - time_elapsed);
396
397         if ((t_simdata->comm != NULL)
398             && (SIMIX_action_get_state(t_simdata->comm) !=
399                 SURF_ACTION_RUNNING))
400           break;
401       }
402     }
403     CATCH(e) {
404       if (e.category == timeout_error) {
405         xbt_ex_free(e);
406         /* verify if the timeout happened and the communication didn't started yet */
407         if (t_simdata->comm == NULL) {
408           process->simdata->waiting_task = NULL;
409
410           /* remove the task from the mailbox */
411           MSG_mailbox_remove(mailbox, task);
412
413           if (t_simdata->receiver) {
414             t_simdata->receiver->simdata->waiting_task = NULL;
415           }
416
417           t_simdata->sender = NULL;
418
419           SIMIX_mutex_unlock(t_simdata->mutex);
420           MSG_RETURN(MSG_TRANSFER_FAILURE);
421         }
422       } else {
423         RETHROW;
424       }
425     }
426   } else {
427     while (1) {
428       SIMIX_cond_wait(t_simdata->cond, t_simdata->mutex);
429
430       if (SIMIX_action_get_state(t_simdata->comm) !=
431           SURF_ACTION_RUNNING)
432         break;
433     }
434   }
435
436   DEBUG1("Action terminated %s", task->name);
437   process->simdata->waiting_task = NULL;
438
439   /* the task has already finished and the pointer must be null */
440   if (t_simdata->receiver) {
441     t_simdata->receiver->simdata->waiting_task = NULL;
442   }
443
444   t_simdata->sender = NULL;
445   SIMIX_mutex_unlock(task->simdata->mutex);
446
447   if (SIMIX_action_get_state(t_simdata->comm) == SURF_ACTION_DONE) {
448     if (t_simdata->comm->refcount == 1) {
449       SIMIX_action_destroy(t_simdata->comm);
450       t_simdata->comm = NULL;
451     } else {
452       t_simdata->comm->refcount --;
453     }
454     MSG_RETURN(MSG_OK);
455   } else if (SIMIX_host_get_state(local_host->simdata->smx_host) == 0) {
456     if (t_simdata->comm->refcount == 1) {
457       SIMIX_action_destroy(t_simdata->comm);
458       t_simdata->comm = NULL;
459     } else {
460       t_simdata->comm->refcount --;
461     }
462     MSG_RETURN(MSG_HOST_FAILURE);
463   } else {
464     if (t_simdata->comm->refcount == 1) {
465       SIMIX_action_destroy(t_simdata->comm);
466       t_simdata->comm = NULL;
467     } else {
468       t_simdata->comm->refcount --;
469     }
470     MSG_RETURN(MSG_TRANSFER_FAILURE);
471   }
472 }