From 193e26b3f5577626bce1131ea5977663610e8a0b Mon Sep 17 00:00:00 2001 From: Martin Quinson Date: Thu, 8 Mar 2018 17:01:32 +0100 Subject: [PATCH] completely revamp the MSG tutorial I should have converted this to S4U while I was at it, I know. --- .gitignore | 10 +- doc/Doxyfile.in | 5 +- doc/doxygen/tutorial_msg.doc | 438 ------ doc/{msg-tuto-src => tuto-msg}/Makefile | 15 +- .../deployment0.xml | 0 .../deployment1.xml | 0 .../deployment2.xml | 2 +- .../deployment3.xml | 12 +- .../deployment_general.xml | 0 .../masterworker-sol1.c} | 36 +- .../masterworker-sol2.c} | 38 +- .../masterworker-sol3.c} | 33 +- .../masterworker-sol4.c} | 84 +- .../masterworker.c} | 26 +- doc/tuto-msg/overview.svg | 1240 +++++++++++++++++ doc/tuto-msg/tuto-msg.doc | 422 ++++++ tools/cmake/DefinePackages.cmake | 24 +- tools/cmake/Tests.cmake | 6 +- 18 files changed, 1810 insertions(+), 581 deletions(-) delete mode 100644 doc/doxygen/tutorial_msg.doc rename doc/{msg-tuto-src => tuto-msg}/Makefile (77%) rename doc/{msg-tuto-src => tuto-msg}/deployment0.xml (100%) rename doc/{msg-tuto-src => tuto-msg}/deployment1.xml (100%) rename doc/{msg-tuto-src => tuto-msg}/deployment2.xml (81%) rename doc/{msg-tuto-src => tuto-msg}/deployment3.xml (55%) rename doc/{msg-tuto-src => tuto-msg}/deployment_general.xml (100%) rename doc/{msg-tuto-src/masterworker1.c => tuto-msg/masterworker-sol1.c} (75%) rename doc/{msg-tuto-src/masterworker2.c => tuto-msg/masterworker-sol2.c} (73%) rename doc/{msg-tuto-src/masterworker3.c => tuto-msg/masterworker-sol3.c} (76%) rename doc/{msg-tuto-src/masterworker4.c => tuto-msg/masterworker-sol4.c} (61%) rename doc/{msg-tuto-src/masterworker0.c => tuto-msg/masterworker.c} (81%) create mode 100644 doc/tuto-msg/overview.svg create mode 100644 doc/tuto-msg/tuto-msg.doc diff --git a/.gitignore b/.gitignore index 58d9ebaaba..7004580027 100644 --- a/.gitignore +++ b/.gitignore @@ -117,11 +117,11 @@ tags callgrind.out.* ### Examples and traces *.exe -doc/msg-tuto-src/masterworker0 
-doc/msg-tuto-src/masterworker1 -doc/msg-tuto-src/masterworker2 -doc/msg-tuto-src/masterworker3 -doc/msg-tuto-src/masterworker4 +doc/tuto-msg/masterworker +doc/tuto-msg/masterworker-sol1 +doc/tuto-msg/masterworker-sol2 +doc/tuto-msg/masterworker-sol3 +doc/tuto-msg/masterworker-sol4 examples/msg/cloud-masterworker/cloud-masterworker examples/msg/dht-kademlia/dht-kademlia examples/msg/dht-pastry/dht-pastry diff --git a/doc/Doxyfile.in b/doc/Doxyfile.in index bb6958f951..c40b913ced 100644 --- a/doc/Doxyfile.in +++ b/doc/Doxyfile.in @@ -659,7 +659,7 @@ INPUT = doxygen/index.doc \ doxygen/outcomes_logs.doc \ doxygen/outcomes_vizu.doc \ doxygen/outcomes_MC.doc \ - doxygen/tutorial_msg.doc \ + tuto-msg/tuto-msg.doc \ doxygen/tutorial_smpi.doc \ doxygen/examples.doc \ doxygen/howtos.doc \ @@ -772,7 +772,8 @@ EXAMPLE_PATH = ./ \ @CMAKE_HOME_DIRECTORY@/src/xbt/ \ @CMAKE_HOME_DIRECTORY@/include \ @CMAKE_HOME_DIRECTORY@/examples \ - @CMAKE_HOME_DIRECTORY@/doc/example_lists + @CMAKE_HOME_DIRECTORY@/doc/example_lists \ + @CMAKE_HOME_DIRECTORY@/doc/tuto-msg # If the value of the EXAMPLE_PATH tag contains directories, you can use the # EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp diff --git a/doc/doxygen/tutorial_msg.doc b/doc/doxygen/tutorial_msg.doc deleted file mode 100644 index 3149ffc262..0000000000 --- a/doc/doxygen/tutorial_msg.doc +++ /dev/null @@ -1,438 +0,0 @@ -/*! @page tutorial_msg SimGrid Tutorial with MSG - -SimGrid is a toolkit providing the core functionalities for the -simulation of distributed applications in heterogeneous distributed -environments. - -The project goal is both to facilitate research and to help improving -real applications in the area of distributed and parallel systems, -ranging from simple network of workstations to Computational Grids to -Clouds and to supercomputers. - -\tableofcontents - -\section Scenario -The goal of this practical session is to illustrate various usage of -the MSG interface. 
To this end we will use the following simple setting: - -> Assume we have a (possibly large) bunch of (possibly large) data to -> process and which originally reside on a server (a.k.a. master). For -> sake of simplicity, we assume all input file require the same amount -> of computation. We assume the server can be helped by a (possibly -> large) set of worker machines. What is the best way to organize the -> computations ? - -Although this looks like a very simple setting it raises several -interesting questions: - -- Which algorithm should the master use to send workload? - - The most obvious algorithm would be to send tasks to workers in a - round-robin fashion. This is the initial code we provide you. - - A less obvious but probably more efficient approach would be to set up - a request mechanism where a client first ask for tasks, which allows - the server to decide which request to answer and possibly to send - the tasks to the fastest machines. Maybe you can think of a - smarter mechanism... - -- How many tasks should the client ask for? - - Indeed, if we set up a request mechanism so that workers only - send request whenever they have no more task to process, they are - likely to be poorly exploited since they will have to wait for the - master to consider their request and for the input data to be - transferred. A client should thus probably request a pool of tasks - but if it requests too many tasks, it is likely to lead to a poor - load-balancing... - -- How is the quality of such algorithm dependent on the platform - characteristics and on the task characteristics? - - Whenever the input communication time is very small compared to - processing time and workers are homogeneous, it is likely that the - round-robin algorithm performs very well. Would it still hold true - when transfer time is not negligible and the platform is, say, - a volunteer computing system ? 
- -- The network topology interconnecting the master and the workers - may be quite complicated. How does such a topology impact the - previous result? - - When data transfers are the bottleneck, it is likely that a good - modeling of the platform becomes essential. In this case, you may - want to be able to account for complex platform topologies. - -- Do the algorithms depend on a perfect knowledge of this - topology? - - Should we still use a flat master worker deployment or should we - use a - -- How is such an algorithm sensitive to external workload variation? - - What if bandwidth, latency and power can vary with no warning? - Shouldn't you study whether your algorithm is sensitive to such - load variations? - -- Although an algorithm may be more efficient than another, how - does it interfere with other applications? - - As you can see, this very simple setting may need to evolve way - beyond what you initially imagined. - -
Premature optimization is the root of all evil. -- D.E.Knuth
- - Furthermore, writing your own simulator is much harder than you - may imagine. This is why you should rely on an established and flexible - one. - -The following figure is a screenshot of [triva][fn:1] visualizing a [SimGrid -simulation][fn:2] of two master worker applications (one in light gray and -the other in dark gray) running in concurrence and showing resource -usage over a long period of time. - -![Test](./sc3-description.png) - -\section Prerequisites - -Of course, you need to install SimGrid before taking this tutorial. -Please refer to the relevant Section: \ref install. - -## Tutorials - -A lot of information on how to install and use Simgrid are -provided by the [online documentation][fn:4] and by several tutorials: - -- http://simgrid.gforge.inria.fr/tutorials/simgrid-use-101.pdf -- http://simgrid.gforge.inria.fr/tutorials/simgrid-tracing-101.pdf -- http://simgrid.gforge.inria.fr/tutorials/simgrid-platf-101.pdf - -## Installing the visualization softwares - -Several tools can be used to visualize the result of SimGrid -simulations and get a better understanding of simulations. - -- [pajeng][fn:5] provides a Gantt-chart visualization. -- [Vite][fn:6] also provides a Gantt-chart visualization. - -Under Debian or Ubuntu, this is really easy with apt-get, while you -may have to install from the source on other systems. Check the -documentation of each software for more details. - -~~~~{.sh} -sudo apt-get install pajeng vite -~~~~ - -\section intro_start Let's get started - -\anchor intro_setup -## Setting up and Compiling - -The corresponding source files can be obtained -[online on GitLab](https://gitlab.inria.fr/simgrid/simgrid/tree/master/doc/msg-tuto-src). -If you find the right button on the top right of the interface, you can download the whole -directory in one archive file. If you wish, you can find other platform file in -[this GitLab directory](https://gitlab.inria.fr/simgrid/simgrid/tree/master/examples/platforms). 
- -As you can see, there is already a little Makefile that compiles -everything for you. If you struggle with the compilation, then you should double check -your @ref install "SimGrid installation". -On need, please refer to the @ref install_yours_trouble section. - -Once the tiny example has been compiled and it can be easily run as follows: - -~~~~{.sh} -./masterworker0 platforms/platform.xml deployment0.xml -~~~~ - -For a more "fancy" output, you can use simgrid-colorizer. - -~~~~{.sh} -./masterworker0 platforms/platform.xml deployment0.xml 2>&1 | simgrid-colorizer -~~~~ - -If you installed SimGrid to a non-standard path, you may have to -specify the full path to simgrid-colorizer on the above line, such as -\c /opt/simgrid/bin/simgrid-colorizer. If you did not install it at all, -you can find it in /bin/colorize. - -For a classical Gantt-Chart visualization, you can produce a [Paje][fn:5] trace: - -~~~~{.sh} -./masterworker0 platforms/platform.xml deployment0.xml --cfg=tracing:yes \ - --cfg=tracing/msg/process:yes -pajeng simgrid.trace -~~~~ - -Alternatively, you can use [vite][fn:6]. - -~~~~{.sh} -./masterworker0 platforms/platform.xml deployment0.xml --cfg=tracing:yes \ - --cfg=tracing/msg/process:yes --cfg=tracing/basic:yes -vite simgrid.trace -~~~~ - -## Getting Rid of Workers in the Deployment File - -In the previous example, the deployment file `deployment0.xml` -is tightly connected to the platform file `platform.xml` and a -worker process is launched on each host: - -~~~~{.xml} - - - - - - - - - - - - - - - - - - - - - -~~~~ - -This is ok as the platform is rather small but will be painful when -using larger platforms. Instead, modify the simulator -`masterworker0.c` into `masterworker1.c` so that the master -launches a worker process on all the other machines at startup. 
The -new deployment file `deployment1.xml` should thus now simply be: - -~~~~{.xml} - - - - - - - - - - -~~~~ - -To this end you may need the following MSG functions (click on the links -to see their descriptions): - -~~~~{.c} -int MSG_get_host_number(void); -xbt_dynar_t MSG_hosts_as_dynar(void); -void * xbt_dynar_to_array (xbt_dynar_t dynar); -msg_process_t MSG_process_create(const char *name, xbt_main_func_t code, - void *data, msg_host_t host); -~~~~ - -\note - It may avoid bugs later to avoid launching a worker on - the master host so you probably want to remove it from the host - list. - -The `data` field of the @ref MSG_process_create can be used to pass -a channel name that will be private between master -and workers (e.g., `master_name:worker_name`). Adding the -`master_name` in the channel name will allow to easily have several -masters and a worker per master on each machine. To this end, you -may need to use the following functions: - -~~~~{.c} -msg_host_t MSG_host_self(void); -const char * MSG_host_get_name(msg_host_t host); -msg_process_t MSG_process_self(void); -void * MSG_process_get_data(msg_process_t process); -~~~~ - -If you are not too familiar with string -manipulation in C, you may want to use the following functions -(see the C reference for details): - -~~~~{.c} -char *strcpy(char *dest, const char *src); -char *strcat(char *dest, const char *src); -~~~~ - -## Setting up a Time Limit Mechanism - -In the current version, the number of tasks is defined through the -worker arguments. Hence, tasks are created at the very beginning of -the simulation. Instead, create tasks as needed and provide a time -limit indicating when it stops sending tasks. 
To this end, you will -obviously need to know what time it is: - -~~~~{.c} -double MSG_get_clock(void); -~~~~ - -Otherwise, a quite effective way of terminating the simulation -would be to use some of the following functions: - -~~~~{.c} -void MSG_process_kill(msg_process_t process); -int MSG_process_killall(int reset_PIDs); -~~~~ - -Anyway, the new deployment `deployment2.xml` file should thus look -like this: - -~~~~{.xml} - - - - - - - - - -~~~~ - -It may also be a good idea to transform most of the `XBT_INFO` into -`XBT_DEBUG` (e.g., keep the information on the total number of -tasks processed). These debug messages can be activated as follows: - -~~~~{.sh} -./masterworker2 platforms/platform.xml deployment2.xml --log=msg_test.thres:debug -~~~~ - -## Using the Tracing Mechanism - -SimGrid can trace all resource consumption and the outcome can be -displayed as illustrated in the section \ref intro_setup. However, when several -masters are deployed, it is hard to understand what happens. - -~~~~{.xml} - - - - - - - - - - - - - - - - - - - -~~~~ - -So let's use categories to track more precisely who does what and when: - -~~~~{.c} -void TRACE_category(const char *category); -void MSG_task_set_category (msg_task_t task, const char *category); -~~~~ - -The outcome can then be visualized as a Gantt-chart as follows: - -~~~~{.sh} -./masterworker3 platforms/platform.xml deployment3.xml --cfg=tracing:yes \ - --cfg=tracing/msg/process:yes -pajeng simgrid.trace -~~~~ - -Right now, you should realize that nothing is behaving like you expect. Most -workers are idle even though input data are ridiculous and there are several -masters deployed on the platform. So it should now be obvious that round robin -is actually very bad. - -## Improving the Scheduling - -Instead of a round-robin scheduling, let's implement a first-come -first-served mechanism. To this end, workers need to send a tiny -request first. 
A possible way to implement such a request with MSG -is to send on a specific channel (e.g., the name of the master -name) a task with payload 0 and whose attached data is the worker -name. This way, the master can keep track of which workers are idle -and willing to work. - -To know whether it has pending requests, the master can use the -following [function][fn:7]: - -~~~~{.c} -int MSG_task_listen(const char *alias); -~~~~ - -If so, it should get the request and push the corresponding host -into a dynar so that they can later be retrieved when sending a -real [task][fn:7]. - -~~~~{.c} -xbt_dynar_t xbt_dynar_new(const unsigned long elm_size, - void_f_pvoid_t const free_f); -void xbt_dynar_push(xbt_dynar_t const dynar, const void *src); -void xbt_dynar_shift(xbt_dynar_t const dynar, void *const dst); -unsigned long xbt_dynar_length(const xbt_dynar_t dynar); -~~~~ - -As you will soon realize, with such simple mechanisms, simple -deadlocks will soon appear. They can easily be removed with a -simple polling mechanism, hence the need for the following -[function][fn:7]: - -~~~~{.c} -msg_error_t MSG_process_sleep(double nb_sec); -~~~~ - -As you should quickly realize, on the simple previous example, it -will double the throughput of the platform but will be quite -ineffective when input size of the tasks is not negligible anymore. - -From this, many things can easily be added. For example, you could: -- add a performance measurement mechanism; -- enable the master to make smart scheduling choices using - measurement information; -- allow workers to have several pending requests so as to overlap - communication and computations as much as possible; -- ... - -## Using More Elaborate Platforms - -SimGrid offers a rather powerful platform modeling mechanism. The -`src/examples/platforms/` repository comprises a variety of platforms ranging -from simple to elaborate. 
Associated to a good -visualization tool to ensure your simulation is meaningful, they -can allow you to study to which extent your algorithm scales... - -What is the largest number of tasks requiring 50e6 flops and 1e5 -bytes that you manage to distribute and process in one hour on -`g5k.xml` (you should use `deployment_general.xml`)? - -\section intro_todo TODO: Points to improve for the next time - -- Propose equivalent exercises and skeleton in java. -- Propose a virtualbox image with everything (simgrid, pajeng, ...) already set - up. -- Ease the installation on mac OS X (binary installer) and - windows. -- Explain that programming in C or java and having a working - development environment is a prerequisite. - -[fn:1]: http://triva.gforge.inria.fr/index.html -[fn:2]: http://hal.inria.fr/inria-00529569 -[fn:3]: http://hal.inria.fr/hal-00738321 -[fn:4]: http://simgrid.gforge.inria.fr/simgrid/latest/doc/ -[fn:5]: https://github.com/schnorr/pajeng/ -[fn:6]: http://vite.gforge.inria.fr/ - - - - - -*/ diff --git a/doc/msg-tuto-src/Makefile b/doc/tuto-msg/Makefile similarity index 77% rename from doc/msg-tuto-src/Makefile rename to doc/tuto-msg/Makefile index 316c306b22..01981ad10a 100644 --- a/doc/msg-tuto-src/Makefile +++ b/doc/tuto-msg/Makefile @@ -4,23 +4,20 @@ # http://simgrid.gforge.inria.fr/simgrid/latest/doc/install_yours.html # Some configuration -SIMGRID_INSTALL_PATH = /opt/simgrid # Where you installed simgrid +SIMGRID_INSTALL_PATH = ../.. # Where you installed simgrid CC = gcc # Your compiler (on Mac, use clang instead) # No change needed bellow for this tutorial. 
############################################################################ -all: masterworker0 masterworker1 masterworker2 masterworker3 masterworker4 -masterworker0: masterworker0.o -masterworker1: masterworker1.o -masterworker2: masterworker2.o -masterworker3: masterworker3.o -masterworker4: masterworker4.o +all: masterworker +masterworker: masterworker.o WARNING = -Wshadow -Wcast-align -Waggregate-return -Wmissing-prototypes \ -Wmissing-declarations -Wstrict-prototypes -Wmissing-prototypes \ -Wmissing-declarations -Wmissing-noreturn -Wredundant-decls \ -Wnested-externs -Wpointer-arith -Wwrite-strings +WARNING += -Werror # Comment out this line to disable paranoid mode # CFLAGS = -g -O0 $(WARNINGS) # Use this line to make debugging easier CFLAGS = -g -O2 $(WARNINGS) # Use this line to get better performance @@ -35,8 +32,10 @@ CFLAGS = -g -O2 $(WARNINGS) # Use this line to get better performance $(CC) -L$(strip $(SIMGRID_INSTALL_PATH))/lib/ $(CFLAGS) $^ -lsimgrid -o $@ %.o: %.c $(CC) -I$(strip $(SIMGRID_INSTALL_PATH))/include $(CFLAGS) -c -o $@ $< +%: %.c + $(CC) -I$(strip $(SIMGRID_INSTALL_PATH))/include -L$(strip $(SIMGRID_INSTALL_PATH))/lib/ $(CFLAGS) $^ -lsimgrid -o $@ clean: - rm -f *.o *~ masterworker0 masterworker1 masterworker2 masterworker3 masterworker4 + rm -f *.o *~ masterworker .PHONY: clean diff --git a/doc/msg-tuto-src/deployment0.xml b/doc/tuto-msg/deployment0.xml similarity index 100% rename from doc/msg-tuto-src/deployment0.xml rename to doc/tuto-msg/deployment0.xml diff --git a/doc/msg-tuto-src/deployment1.xml b/doc/tuto-msg/deployment1.xml similarity index 100% rename from doc/msg-tuto-src/deployment1.xml rename to doc/tuto-msg/deployment1.xml diff --git a/doc/msg-tuto-src/deployment2.xml b/doc/tuto-msg/deployment2.xml similarity index 81% rename from doc/msg-tuto-src/deployment2.xml rename to doc/tuto-msg/deployment2.xml index 99a9fcad83..6cddb86c47 100644 --- a/doc/msg-tuto-src/deployment2.xml +++ b/doc/tuto-msg/deployment2.xml @@ -2,7 +2,7 
@@ - + diff --git a/doc/msg-tuto-src/deployment3.xml b/doc/tuto-msg/deployment3.xml similarity index 55% rename from doc/msg-tuto-src/deployment3.xml rename to doc/tuto-msg/deployment3.xml index c602a27cf0..e5dcfe11d5 100644 --- a/doc/msg-tuto-src/deployment3.xml +++ b/doc/tuto-msg/deployment3.xml @@ -2,18 +2,18 @@ - + - + - + - + - + - + diff --git a/doc/msg-tuto-src/deployment_general.xml b/doc/tuto-msg/deployment_general.xml similarity index 100% rename from doc/msg-tuto-src/deployment_general.xml rename to doc/tuto-msg/deployment_general.xml diff --git a/doc/msg-tuto-src/masterworker1.c b/doc/tuto-msg/masterworker-sol1.c similarity index 75% rename from doc/msg-tuto-src/masterworker1.c rename to doc/tuto-msg/masterworker-sol1.c index 35deb99254..c0ceffc153 100644 --- a/doc/msg-tuto-src/masterworker1.c +++ b/doc/tuto-msg/masterworker-sol1.c @@ -7,9 +7,9 @@ XBT_LOG_NEW_DEFAULT_CATEGORY(msg_test, "Messages specific for this msg example"); -#define FINALIZE ((void*)221297) /* a magic number to tell people to stop working */ +#define FINALIZE ((void*)221297) /* a magic number to tell people to stop working */ -static char * build_channel_name(char *buffer, const char *sender, const char* receiver) +static char* build_channel_name(char* buffer, const char* sender, const char* receiver) { strcpy(buffer, sender); strcat(buffer, ":"); @@ -21,15 +21,15 @@ static char * build_channel_name(char *buffer, const char *sender, const char* r static int master(int argc, char* argv[]); static int worker(int argc, char* argv[]); -static int master(int argc, char *argv[]) +static int master(int argc, char* argv[]) { - msg_host_t host_self = MSG_host_self(); + msg_host_t host_self = MSG_host_self(); const char* master_name = MSG_host_get_name(host_self); char channel[1024]; - long number_of_tasks = xbt_str_parse_int(argv[1], "Invalid amount of tasks: %s"); /** - Number of tasks */ - double comp_size = xbt_str_parse_double(argv[2], "Invalid computational size: %s"); /** - Task 
compute cost */ - double comm_size = xbt_str_parse_double(argv[3], "Invalid communication size: %s"); /** - Task communication size */ + long number_of_tasks = xbt_str_parse_int(argv[1], "Invalid amount of tasks: %s"); /** - Number of tasks */ + double comp_size = xbt_str_parse_double(argv[2], "Invalid computational size: %s"); /** - Task compute cost */ + double comm_size = xbt_str_parse_double(argv[3], "Invalid communication size: %s"); /** - Task communication size */ /* Create the tasks in advance */ msg_task_t* todo = xbt_new0(msg_task_t, number_of_tasks); @@ -41,7 +41,7 @@ static int master(int argc, char *argv[]) } /* Get the info about the worker processes (directly from SimGrid) */ - int workers_count = argc - 4; + int workers_count = MSG_get_host_number(); msg_host_t* workers = xbt_dynar_to_array(MSG_hosts_as_dynar()); for (int i = 0; i < workers_count; i++) @@ -65,20 +65,20 @@ static int master(int argc, char *argv[]) XBT_INFO("Sent"); } - XBT_INFO ("All tasks have been dispatched. Let's tell everybody the computation is over."); + XBT_INFO("All tasks have been dispatched. 
Let's tell everybody the computation is over."); for (int i = 0; i < workers_count; i++) { msg_task_t finalize = MSG_task_create("finalize", 0, 0, FINALIZE); - MSG_task_send(finalize, build_channel_name(channel,master_name, MSG_host_get_name(workers[i % workers_count]))); + MSG_task_send(finalize, build_channel_name(channel, master_name, MSG_host_get_name(workers[i % workers_count]))); } XBT_INFO("Goodbye now!"); free(workers); free(todo); return 0; -} /* end_of_master */ +} /* end_of_master */ /** Receiver function */ -static int worker(int argc, char *argv[]) +static int worker(int argc, char* argv[]) { char channel[1024]; @@ -88,7 +88,7 @@ static int worker(int argc, char *argv[]) while (1) { msg_task_t task = NULL; - int res = MSG_task_receive(&(task), channel); + int res = MSG_task_receive(&(task), channel); xbt_assert(res == MSG_OK, "MSG_task_receive failed"); XBT_INFO("Received '%s'", MSG_task_get_name(task)); @@ -104,14 +104,16 @@ static int worker(int argc, char *argv[]) } XBT_INFO("I'm done. 
See you!"); return 0; -} /* end_of_worker */ +} /* end_of_worker */ /** Main function */ -int main(int argc, char *argv[]) +int main(int argc, char* argv[]) { MSG_init(&argc, argv); - xbt_assert(argc > 2, "Usage: %s platform_file deployment_file\n" - "\tExample: %s msg_platform.xml msg_deployment.xml\n", argv[0], argv[0]); + xbt_assert(argc > 2, + "Usage: %s platform_file deployment_file\n" + "\tExample: %s msg_platform.xml msg_deployment.xml\n", + argv[0], argv[0]); /* Create a simulated platform */ MSG_create_environment(argv[1]); diff --git a/doc/msg-tuto-src/masterworker2.c b/doc/tuto-msg/masterworker-sol2.c similarity index 73% rename from doc/msg-tuto-src/masterworker2.c rename to doc/tuto-msg/masterworker-sol2.c index 1974889d19..317335f068 100644 --- a/doc/msg-tuto-src/masterworker2.c +++ b/doc/tuto-msg/masterworker-sol2.c @@ -7,9 +7,9 @@ XBT_LOG_NEW_DEFAULT_CATEGORY(msg_test, "Messages specific for this msg example"); -#define FINALIZE ((void*)221297) /* a magic number to tell people to stop working */ +#define FINALIZE ((void*)221297) /* a magic number to tell people to stop working */ -static char * build_channel_name(char *buffer, const char *sender, const char* receiver) +static char* build_channel_name(char* buffer, const char* sender, const char* receiver) { strcpy(buffer, sender); strcat(buffer, ":"); @@ -21,18 +21,18 @@ static char * build_channel_name(char *buffer, const char *sender, const char* r static int master(int argc, char* argv[]); static int worker(int argc, char* argv[]); -static int master(int argc, char *argv[]) +static int master(int argc, char* argv[]) { msg_host_t host_self = MSG_host_self(); - char *master_name = (char *) MSG_host_get_name(host_self); + char* master_name = (char*)MSG_host_get_name(host_self); char channel[1024]; - double timeout = xbt_str_parse_double(argv[1], "Invalid timeout: %s"); /** - timeout */ - double comp_size = xbt_str_parse_double(argv[2], "Invalid computational size: %s"); /** - Task compute cost */ - 
double comm_size = xbt_str_parse_double(argv[3], "Invalid communication size: %s"); /** - Task communication size */ + double timeout = xbt_str_parse_double(argv[1], "Invalid timeout: %s"); /** - timeout */ + double comp_size = xbt_str_parse_double(argv[2], "Invalid computational size: %s"); /** - Task compute cost */ + double comm_size = xbt_str_parse_double(argv[3], "Invalid communication size: %s"); /** - Task communication size */ /* Get the info about the worker processes (directly from SimGrid) */ - int workers_count = argc - 4; + int workers_count = MSG_get_host_number(); msg_host_t* workers = xbt_dynar_to_array(MSG_hosts_as_dynar()); for (int i = 0; i < workers_count; i++) // Remove my host from the list @@ -64,31 +64,29 @@ static int master(int argc, char *argv[]) task_num++; } - - XBT_DEBUG ("All tasks have been dispatched. Let's tell everybody the computation is over."); + XBT_DEBUG("All tasks have been dispatched. Let's tell everybody the computation is over."); for (int i = 0; i < workers_count; i++) { msg_task_t finalize = MSG_task_create("finalize", 0, 0, FINALIZE); - MSG_task_send(finalize, build_channel_name(channel,master_name, - MSG_host_get_name(workers[i % workers_count]))); + MSG_task_send(finalize, build_channel_name(channel, master_name, MSG_host_get_name(workers[i % workers_count]))); } - XBT_INFO("Sent %d tasks in total!", task_num); + XBT_DEBUG("Sent %d tasks in total!", task_num); free(workers); return 0; } /** Worker function */ -static int worker(int argc, char *argv[]) +static int worker(int argc, char* argv[]) { char channel[1024]; - build_channel_name(channel,MSG_process_get_data(MSG_process_self()), MSG_host_get_name(MSG_host_self())); + build_channel_name(channel, MSG_process_get_data(MSG_process_self()), MSG_host_get_name(MSG_host_self())); XBT_DEBUG("Receiving on channel '%s'", channel); while (1) { msg_task_t task = NULL; - int res = MSG_task_receive(&(task), channel); + int res = MSG_task_receive(&(task), channel); 
xbt_assert(res == MSG_OK, "MSG_task_receive failed"); XBT_DEBUG("Received '%s'", MSG_task_get_name(task)); @@ -107,11 +105,13 @@ static int worker(int argc, char *argv[]) } /** Main function */ -int main(int argc, char *argv[]) +int main(int argc, char* argv[]) { MSG_init(&argc, argv); - xbt_assert(argc > 2, "Usage: %s platform_file deployment_file\n" - "\tExample: %s msg_platform.xml msg_deployment.xml\n", argv[0], argv[0]); + xbt_assert(argc > 2, + "Usage: %s platform_file deployment_file\n" + "\tExample: %s msg_platform.xml msg_deployment.xml\n", + argv[0], argv[0]); /* Create a simulated platform */ MSG_create_environment(argv[1]); diff --git a/doc/msg-tuto-src/masterworker3.c b/doc/tuto-msg/masterworker-sol3.c similarity index 76% rename from doc/msg-tuto-src/masterworker3.c rename to doc/tuto-msg/masterworker-sol3.c index 7919c3736f..29d10fa57c 100644 --- a/doc/msg-tuto-src/masterworker3.c +++ b/doc/tuto-msg/masterworker-sol3.c @@ -7,9 +7,9 @@ XBT_LOG_NEW_DEFAULT_CATEGORY(msg_test, "Messages specific for this msg example"); -#define FINALIZE ((void*)221297) /* a magic number to tell people to stop working */ +#define FINALIZE ((void*)221297) /* a magic number to tell people to stop working */ -static char * build_channel_name(char *buffer, const char *sender, const char* receiver) +static char* build_channel_name(char* buffer, const char* sender, const char* receiver) { strcpy(buffer, sender); strcat(buffer, ":"); @@ -21,20 +21,20 @@ static char * build_channel_name(char *buffer, const char *sender, const char* r static int master(int argc, char* argv[]); static int worker(int argc, char* argv[]); -static int master(int argc, char *argv[]) +static int master(int argc, char* argv[]) { msg_host_t host_self = MSG_host_self(); - char *master_name = (char *) MSG_host_get_name(host_self); + char* master_name = (char*)MSG_host_get_name(host_self); char channel[1024]; TRACE_category(master_name); - double timeout = xbt_str_parse_double(argv[1], "Invalid timeout: 
%s"); /** - timeout */ - double comp_size = xbt_str_parse_double(argv[2], "Invalid computational size: %s"); /** - Task compute cost */ - double comm_size = xbt_str_parse_double(argv[3], "Invalid communication size: %s"); /** - Task communication size */ + double timeout = xbt_str_parse_double(argv[1], "Invalid timeout: %s"); /** - timeout */ + double comp_size = xbt_str_parse_double(argv[2], "Invalid computational size: %s"); /** - Task compute cost */ + double comm_size = xbt_str_parse_double(argv[3], "Invalid communication size: %s"); /** - Task communication size */ /* Get the info about the worker processes */ - int workers_count = argc - 4; + int workers_count = MSG_get_host_number(); msg_host_t* workers = xbt_dynar_to_array(MSG_hosts_as_dynar()); for (int i = 0; i < workers_count; i++) // Remove my host from the list @@ -65,12 +65,13 @@ static int master(int argc, char *argv[]) XBT_DEBUG("Sending '%s' to channel '%s'", task->name, channel); MSG_task_send(task, channel); XBT_DEBUG("Sent"); + task_num++; } - XBT_DEBUG ("All tasks have been dispatched. Let's tell everybody the computation is over."); + XBT_DEBUG("All tasks have been dispatched. 
Let's tell everybody the computation is over."); for (int i = 0; i < workers_count; i++) { msg_task_t finalize = MSG_task_create("finalize", 0, 0, FINALIZE); - MSG_task_send(finalize, build_channel_name(channel,master_name, MSG_host_get_name(workers[i % workers_count]))); + MSG_task_send(finalize, build_channel_name(channel, master_name, MSG_host_get_name(workers[i % workers_count]))); } XBT_INFO("Sent %d tasks in total!", task_num); @@ -79,10 +80,10 @@ static int master(int argc, char *argv[]) } /** Worker function */ -static int worker(int argc, char *argv[]) +static int worker(int argc, char* argv[]) { char channel[1024]; - build_channel_name(channel,MSG_process_get_data(MSG_process_self()), MSG_host_get_name(MSG_host_self())); + build_channel_name(channel, MSG_process_get_data(MSG_process_self()), MSG_host_get_name(MSG_host_self())); XBT_DEBUG("Receiving on channel '%s'", channel); @@ -107,11 +108,13 @@ static int worker(int argc, char *argv[]) } /** Main function */ -int main(int argc, char *argv[]) +int main(int argc, char* argv[]) { MSG_init(&argc, argv); - xbt_assert(argc > 2, "Usage: %s platform_file deployment_file\n" - "\tExample: %s msg_platform.xml msg_deployment.xml\n", argv[0], argv[0]); + xbt_assert(argc > 2, + "Usage: %s platform_file deployment_file\n" + "\tExample: %s msg_platform.xml msg_deployment.xml\n", + argv[0], argv[0]); /* Create a simulated platform */ MSG_create_environment(argv[1]); diff --git a/doc/msg-tuto-src/masterworker4.c b/doc/tuto-msg/masterworker-sol4.c similarity index 61% rename from doc/msg-tuto-src/masterworker4.c rename to doc/tuto-msg/masterworker-sol4.c index acec3257bd..a322cbbceb 100644 --- a/doc/msg-tuto-src/masterworker4.c +++ b/doc/tuto-msg/masterworker-sol4.c @@ -7,9 +7,9 @@ XBT_LOG_NEW_DEFAULT_CATEGORY(msg_test, "Messages specific for this msg example"); -#define FINALIZE ((void*)221297) /* a magic number to tell people to stop working */ +#define FINALIZE ((void*)221297) /* a magic number to tell people to stop 
working */ -static char * build_channel_name(char *buffer, const char *sender, const char* receiver) +static char* build_channel_name(char* buffer, const char* sender, const char* receiver) { strcpy(buffer, sender); strcat(buffer, ":"); @@ -21,20 +21,20 @@ static char * build_channel_name(char *buffer, const char *sender, const char* r static int master(int argc, char* argv[]); static int worker(int argc, char* argv[]); -static int master(int argc, char *argv[]) +static int master(int argc, char* argv[]) { msg_host_t host_self = MSG_host_self(); - char *master_name = (char *) MSG_host_get_name(host_self); + char* master_name = (char*)MSG_host_get_name(host_self); char channel[1024]; TRACE_category(master_name); - double timeout = xbt_str_parse_double(argv[1], "Invalid timeout: %s"); /** - timeout */ - double comp_size = xbt_str_parse_double(argv[2], "Invalid computational size: %s"); /** - Task compute cost */ - double comm_size = xbt_str_parse_double(argv[3], "Invalid communication size: %s"); /** - Task communication size */ + double timeout = xbt_str_parse_double(argv[1], "Invalid timeout: %s"); /** - timeout */ + double comp_size = xbt_str_parse_double(argv[2], "Invalid computational size: %s"); /** - Task compute cost */ + double comm_size = xbt_str_parse_double(argv[3], "Invalid communication size: %s"); /** - Task communication size */ /* Get the info about the worker processes */ - int workers_count = argc - 4; + int workers_count = MSG_get_host_number(); msg_host_t* workers = xbt_dynar_to_array(MSG_hosts_as_dynar()); for (int i = 0; i < workers_count; i++) // Remove my host from the list @@ -50,41 +50,25 @@ static int master(int argc, char *argv[]) /* Dispatch the tasks */ xbt_dynar_t idle_hosts = xbt_dynar_new(sizeof(msg_host_t), NULL); - msg_host_t request_host = NULL; - int task_num = 0; - while (1) { - - while (MSG_task_listen(master_name)) { - msg_task_t request = NULL; - int res = MSG_task_receive(&(request), master_name); - xbt_assert(res == MSG_OK, 
"MSG_task_receive failed"); - request_host = MSG_task_get_data(request); - xbt_dynar_push(idle_hosts, &request_host); - MSG_task_destroy(request); - } + int task_num = 0; + while (MSG_get_clock() < timeout) { - if(MSG_get_clock()>timeout) { - if(xbt_dynar_length(idle_hosts) == workers_count) break; - else { - MSG_process_sleep(.1); - continue; - } - } - - if(xbt_dynar_length(idle_hosts)<=0) { - /* No request. Let's wait... */ - MSG_process_sleep(.1); - continue; - } + /* Retrieve the next incomming request */ + XBT_DEBUG("Retrieve the next incomming request on %s", master_name); + msg_task_t request = NULL; + int res = MSG_task_receive(&(request), master_name); + xbt_assert(res == MSG_OK, "MSG_task_receive failed"); + msg_host_t requester = MSG_task_get_data(request); + MSG_task_destroy(request); + /* Prepare the task to be sent */ char sprintf_buffer[64]; sprintf(sprintf_buffer, "Task_%d", task_num); msg_task_t task = MSG_task_create(sprintf_buffer, comp_size, comm_size, NULL); MSG_task_set_category(task, master_name); - xbt_dynar_shift(idle_hosts, &request_host); - - build_channel_name(channel,master_name, MSG_host_get_name(request_host)); + /* Send this out */ + build_channel_name(channel, master_name, MSG_host_get_name(requester)); XBT_DEBUG("Sending '%s' to channel '%s'", task->name, channel); MSG_task_send(task, channel); @@ -92,10 +76,19 @@ static int master(int argc, char *argv[]) task_num++; } - XBT_DEBUG ("All tasks have been dispatched. Let's tell everybody the computation is over."); - for (int i = 0; i < workers_count; i++) { + XBT_DEBUG("Time is up. 
Let's tell everybody the computation is over."); + for (int i = 0; i < workers_count; i++) { /* We don't write in order, but the total amount is right + + /* Don't write to a worker that did not request for work, or it will deadlock: both would be sending something */ + msg_task_t request = NULL; + int res = MSG_task_receive(&(request), master_name); + xbt_assert(res == MSG_OK, "MSG_task_receive failed"); + msg_host_t requester = MSG_task_get_data(request); + MSG_task_destroy(request); + + XBT_DEBUG("Stop worker %s", MSG_host_get_name(requester)); msg_task_t finalize = MSG_task_create("finalize", 0, 0, FINALIZE); - MSG_task_send(finalize, build_channel_name(channel,master_name, MSG_host_get_name(workers[i % workers_count]))); + MSG_task_send(finalize, build_channel_name(channel, master_name, MSG_host_get_name(requester))); } XBT_INFO("Sent %d tasks in total!", task_num); @@ -104,17 +97,18 @@ static int master(int argc, char *argv[]) } /** Worker function */ -static int worker(int argc, char *argv[]) +static int worker(int argc, char* argv[]) { char channel[1024]; - const char *my_master = MSG_process_get_data(MSG_process_self()); + const char* my_master = MSG_process_get_data(MSG_process_self()); build_channel_name(channel, my_master, MSG_host_get_name(MSG_host_self())); XBT_DEBUG("Receiving on channel \"%s\"", channel); while (1) { /* Send a request */ + XBT_DEBUG("Sent a request to my master on %s", my_master); msg_task_t request = MSG_task_create("request", 0, 0, MSG_host_self()); MSG_task_send(request, my_master); @@ -139,11 +133,13 @@ static int worker(int argc, char *argv[]) } /** Main function */ -int main(int argc, char *argv[]) +int main(int argc, char* argv[]) { MSG_init(&argc, argv); - xbt_assert(argc > 2, "Usage: %s platform_file deployment_file\n" - "\tExample: %s msg_platform.xml msg_deployment.xml\n", argv[0], argv[0]); + xbt_assert(argc > 2, + "Usage: %s platform_file deployment_file\n" + "\tExample: %s msg_platform.xml msg_deployment.xml\n", + 
argv[0], argv[0]); /* Create a simulated platform */ MSG_create_environment(argv[1]); diff --git a/doc/msg-tuto-src/masterworker0.c b/doc/tuto-msg/masterworker.c similarity index 81% rename from doc/msg-tuto-src/masterworker0.c rename to doc/tuto-msg/masterworker.c index e20b2eba06..64cccb64b4 100644 --- a/doc/msg-tuto-src/masterworker0.c +++ b/doc/tuto-msg/masterworker.c @@ -7,14 +7,14 @@ XBT_LOG_NEW_DEFAULT_CATEGORY(msg_test, "Messages specific for this msg example"); -#define FINALIZE ((void*)221297) /* a magic number to tell people to stop working */ +#define FINALIZE ((void*)221297) /* a magic number to tell people to stop working */ /** Master expects 3+ arguments given in the XML deployment file: */ -static int master(int argc, char *argv[]) +static int master(int argc, char* argv[]) { - long number_of_tasks = xbt_str_parse_int(argv[1], "Invalid amount of tasks: %s"); /** - Number of tasks */ - double comp_size = xbt_str_parse_double(argv[2], "Invalid computational size: %s"); /** - Task compute cost */ - double comm_size = xbt_str_parse_double(argv[3], "Invalid communication size: %s"); /** - Task communication size */ + long number_of_tasks = xbt_str_parse_int(argv[1], "Invalid amount of tasks: %s"); /** - Number of tasks */ + double comp_size = xbt_str_parse_double(argv[2], "Invalid computational size: %s"); /** - Task compute cost */ + double comm_size = xbt_str_parse_double(argv[3], "Invalid communication size: %s"); /** - Task communication size */ /* Create the tasks in advance */ msg_task_t* todo = xbt_new0(msg_task_t, number_of_tasks); @@ -58,8 +58,8 @@ static int master(int argc, char *argv[]) return 0; } -/** Worker expects a single argument given in the XML deployment file: */ -static int worker(int argc, char *argv[]) +/** Worker does not expect any argument from XML deployment file. 
*/ +static int worker(int argc, char* argv[]) { while (1) { msg_task_t task = NULL; @@ -79,15 +79,17 @@ static int worker(int argc, char *argv[]) } XBT_INFO("I'm done. See you!"); return 0; -} /* end_of_worker */ +} /* end_of_worker */ /** Main function */ -int main(int argc, char *argv[]) +int main(int argc, char* argv[]) { MSG_init(&argc, argv); - xbt_assert(argc > 2, "Usage: %s platform_file deployment_file\n" - "\tExample: %s msg_platform.xml msg_deployment.xml\n", argv[0], argv[0]); + xbt_assert(argc > 2, + "Usage: %s platform_file deployment_file\n" + "\tExample: %s msg_platform.xml msg_deployment.xml\n", + argv[0], argv[0]); /* Create a simulated platform */ MSG_create_environment(argv[1]); @@ -102,4 +104,4 @@ int main(int argc, char *argv[]) XBT_INFO("Simulation time %g", MSG_get_clock()); return (res != MSG_OK); -} /* end_of_main */ +} /* end_of_main */ diff --git a/doc/tuto-msg/overview.svg b/doc/tuto-msg/overview.svg new file mode 100644 index 0000000000..c3188764f9 --- /dev/null +++ b/doc/tuto-msg/overview.svg @@ -0,0 +1,1240 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + image/svg+xml + + + + + + + Master + + + + T + + + + T + + + + Worker + + + + Worker + + + + Worker + + + + Worker + + + + Worker + + + + + + + + + How should the masterdistribute the tasks? + + + T + + + + T + + + + T + + + + T + + + + T + + + + T + + + + T + + + + T + + + + T + + + + T + + + + T + + + + T + + + + T + + + + T + + + + T + + + + T + + + + T + + + + T + + + + T + + + + T + + + + T + + + + T + + + + T + + + + T + + + + T + + + ? + + diff --git a/doc/tuto-msg/tuto-msg.doc b/doc/tuto-msg/tuto-msg.doc new file mode 100644 index 0000000000..86813ba5a9 --- /dev/null +++ b/doc/tuto-msg/tuto-msg.doc @@ -0,0 +1,422 @@ +/*! @page tutorial_msg SimGrid Tutorial with MSG + +\tableofcontents + +\section tuto-msg-intro Introduction + +\subsection tuto-msg-intro-settings Settings + +This tutorial will guide your create and run your first SimGrid +simulator. 
Let's consider the following scenario: + +> Assume we have a (possibly large) bunch of (possibly large) data to +> process, which originally resides on a server (a.k.a. master). For the +> sake of simplicity, we assume all input files require the same amount +> of computation. We assume the server can be helped by a (possibly +> large) set of worker machines. What is the best way to organize the +> computations? + +\htmlonly +
+\endhtmlonly +\htmlinclude tuto-msg/overview.svg +\htmlonly +
+\endhtmlonly + +\subsection tuto-msg-intro-questions Raised Questions + +Although this looks like a very simple setting, it raises several +interesting questions: + +- Which algorithm should the master use to send workload? + + The provided code sends the tasks to the workers with a trivial + round-robin algorithm. It would probably be more efficient if the + workers were asking for tasks, to let the master distribute the + tasks in a more clever way. + +- Should the workers specify how many tasks they want? Or should the + master decide everything? + + The workers will starve if they don't get the tasks fast + enough. One possibility to reduce latency would be to send tasks + in pools instead of one by one. But if the pools are too big, the + load balancing will likely get uneven, in particular when + distributing the last tasks. + +- How does the quality of such an algorithm depend on the platform + characteristics and on the task characteristics? + + Whenever the input communication time is very small compared to + processing time and workers are homogeneous, it is likely that the + round-robin algorithm performs very well. Would it still hold true + when transfer time is not negligible and the platform is, say, a + volunteer computing system? What if some tasks are performed + faster on some specific nodes? + +- The network topology interconnecting the master and the workers + may be quite complicated. How does such a topology impact the + previous result? + + When data transfers are the bottleneck, it is likely that a good + modeling of the platform becomes essential. The SimGrid platform + models are particularly handy to account for complex platform + topologies. + +- What topology to use for the application? + + Is a flat master worker deployment sufficient? Should we go for a + hierarchical algorithm, with some forwarders taking large pools of + tasks from the master, each of them distributing their tasks to a + sub-pool of workers? 
Or should we introduce super-peers, +duplicating the master's role in a peer-to-peer manner? Do the + algorithms require a perfect knowledge of the network? + +- How is such an algorithm sensitive to external workload variation? + + What if bandwidth, latency and computing speed can vary with no + warning? Shouldn't you study whether your algorithm is sensitive + to such load variations? + +- Although an algorithm may be more efficient than another, how + does it interfere with other applications? + + +- Etc, etc. + +As you can see, this very simple setting may need to evolve way beyond +what you initially imagined. And this is good news. + +But don't believe the fools saying that all you need to study such +settings is a simple discrete event simulator. Do you really want to +reinvent the wheel, write your own tool, debug it, optimize it and +validate its models against real settings for ages, or do you prefer +to sit on the shoulders of a giant?
+With SimGrid, you can forget about most technical details (but not +all), and focus on your algorithm. The whole simulation mechanism is +already working. + +\subsection tuto-msg-intro-goal Envisioned Study + + +The following figure is a screenshot of [triva][fn:1] visualizing a [SimGrid +simulation][fn:2] of two master worker applications (one in light gray and +the other in dark gray) running concurrently and showing resource +usage over a long period of time. + +![Test](./sc3-description.png) + +\section tuto-msg-starting Getting Started + +\subsection tuto-msg-prerequesite Prerequisite + +In this example, we use Pajeng and Vite to visualize the result of +SimGrid simulations. These external tools are usually very easy to +install. On Debian and Ubuntu for example, you can get them as follows: + +~~~~{.sh} +sudo apt-get install pajeng vite +~~~~ + +\subsection tuto-msg-setup Setting up and Compiling + +The corresponding source files can be obtained +[online on GitLab](https://gitlab.inria.fr/simgrid/simgrid/tree/master/doc/tuto-msg/src). +There is a button on the top right to download the whole +directory in one archive file. If you wish, other platform files are available from +[this GitLab directory](https://gitlab.inria.fr/simgrid/simgrid/tree/master/examples/platforms). + +As you can see, there is already a little Makefile that compiles +everything for you. If you struggle with the compilation, then you should double check +your @ref install "SimGrid installation". +If needed, please refer to the @ref install_yours_trouble section. + +\section tuto-msg-ex0 Discovering the provided simulator + +Please compile and execute the provided simulator as follows: + +~~~~{.sh} +make masterworker +./masterworker examples/platforms/small_platform.xml deployment0.xml +~~~~ + +For a more "fancy" output, you can use simgrid-colorizer. 
+ +~~~~{.sh} +./masterworker examples/platforms/small_platform.xml deployment0.xml 2>&1 | simgrid-colorizer +~~~~ + +If you installed SimGrid to a non-standard path, you may have to +specify the full path to simgrid-colorizer on the above line, such as +\c /opt/simgrid/bin/simgrid-colorizer. If you did not install it at all, +you can find it in /bin/colorize. + +For a classical Gantt-Chart visualization, you can produce a [Paje][fn:5] trace: + +~~~~{.sh} +./masterworker platforms/platform.xml deployment0.xml --cfg=tracing:yes \ + --cfg=tracing/msg/process:yes +pajeng simgrid.trace +~~~~ + +Alternatively, you can use [vite][fn:6]. + +~~~~{.sh} +./masterworker platforms/platform.xml deployment0.xml --cfg=tracing:yes \ + --cfg=tracing/msg/process:yes --cfg=tracing/basic:yes +vite simgrid.trace +~~~~ + +\subsection tuto-msg-exo0-source Understanding this source code + +Explore the \ref doc/tuto-msg/masterworker.c source file. It contains 3 functions: + - \c master: that's the code executed by the master process.
+ It creates a large array containing all tasks, + dispatches all tasks to the workers and then dispatches + specific tasks whose name is "finalize". + - \c worker: each worker will execute this function.
+ That's an infinite loop waiting for incoming tasks. + We exit the loop if the name of the received task is "finalize", or process the task otherwise. + - \c main: this sets up the simulation. + +How does SimGrid know that we need one master and several workers? +Because it's written in the deployment file (called \c +deployment0.xml), that we pass to MSG_launch_application() during the setup. + +\include doc/tuto-msg/deployment0.xml + +\section tuto-msg-exo1 Exercise 1: Simplifying the deployment file + +In the provided example, the deployment file `deployment0.xml` is +tightly connected to the platform file `small_platform.xml` and adding +more workers quickly becomes a pain: You need to start them (at the +bottom of the file), and to inform the master that they are available +(in the master parameters list). + +Instead, modify the simulator `masterworker.c` into `masterworker-exo1.c` +so that the master launches a worker process on all the other machines +at startup. The new deployment file `deployment1.xml` should be as +simple as: + +\include doc/tuto-msg/deployment1.xml + +For that, the master needs to retrieve the list of hosts declared in +the platform, with the following functions (follow the links for their +documentation): + +~~~~{.c} +int MSG_get_host_number(void); +xbt_dynar_t MSG_hosts_as_dynar(void); +void * xbt_dynar_to_array (xbt_dynar_t dynar); +~~~~ + +Then, the master should start the worker processes with the following function: + +~~~~{.c} +msg_process_t MSG_process_create(const char *name, xbt_main_func_t code, void *data, msg_host_t host); +~~~~ + +\subsection tuto-msg-exo1-config Increasing configurability + +The worker processes wait for incoming messages on a channel whose +name they need to know beforehand. In the provided code, each worker +uses the name of its host as a channel name. 
You can see this in the +receiver source code: + +~~~~{.c} + int res = MSG_task_receive(&(task), MSG_host_get_name(MSG_host_self())); + xbt_assert(res == MSG_OK, "MSG_task_receive failed"); +~~~~ + +This way, you can have at most one worker per host. To later study the +behavior of concurrent applications on the platform, we need to +alleviate this. Several solutions exist: + +Now that the master creates the workers, it knows their PID +(process ID -- given by @ref MSG_process_get_pid()), so you could use +it in the channel name. + +Another possibility for the master is to determine a channel name +before the process creation, and give that name as a parameter to the +starting process. This is what the `data` parameter of @ref +MSG_process_create is meant for. You can pass any arbitrary pointer, +and the created process can retrieve this value later with the @ref +MSG_process_get_data and @ref MSG_process_self functions. Since we +want later to study concurrent applications, it is advised to use a +channel name such as `master_name:worker_name`. + +A third possibility would be to invert the communication architecture +and have the workers pulling work from the master. This requires +passing the master's channel to the workers. + +\subsection tuto-msg-exo1-wrapup Wrap up + +In this exercise, we reduced the amount of configuration that our +simulator requests. This is both a good idea, and a dangerous +trend. This simplification is an application of the good old DRY/SPOT +programming principle (Don't Repeat Yourself / Single Point Of Truth +-- [more on Wikipedia](https://en.wikipedia.org/wiki/Don%27t_repeat_yourself)), +and you really want your programming artefacts to follow these software engineering principles. + +But at the same time, you should be careful in separating your +scientific contribution (the master/workers algorithm) and the +artefacts used to test it (platform, deployment and workload). 
This is +why SimGrid forces you to express your platform and deployment files in +XML instead of using a programming interface: it forces a clear +separation of concerns between things that are of very different +nature. + +If you struggle with this exercise, have a look at +our solution in \ref doc/tuto-msg/masterworker-sol1.c +This is not perfect at all, and many other solutions would have been possible, of course. + +\section tuto-msg-exo2 Exercise 2: Infinite amount of work, fixed experiment duration + +In the current version, the number of tasks is defined through the +master arguments. Hence, tasks are created at the very beginning of +the simulation. Instead, have the master dispatch tasks for a +predetermined amount of time. The tasks must now be created on demand +instead of beforehand. + +Of course, usual time functions like `gettimeofday` will give you the +time on your real machine, which is pretty useless in the +simulation. Instead, retrieve the time in the simulated world with +@ref MSG_get_clock. + +You can still stop your workers with a specific task as previously, +but other methods exist. You can forcefully stop processes with the +following functions, but be warned that SimGrid traditionally had +issues with forcefully stopping processes involved in computations or +communications. We hope that it's better now, but YMMV. + +~~~~{.c} +void MSG_process_kill(msg_process_t process); +int MSG_process_killall(int reset_PIDs); +~~~~ + +Anyway, the new deployment `deployment2.xml` file should thus look +like this: + +\include doc/tuto-msg/deployment2.xml + +\subsection tuto-msg-exo2-verbosity Controlling the message verbosity + +Not all messages are equally informative, so you probably want to +change most of the `XBT_INFO` into `XBT_DEBUG` so that they are hidden +by default. You could for example show only the total number of tasks +processed by default. 
You can still see the debug messages as follows: + +~~~~{.sh} +./masterworker examples/platforms/small_platform.xml deployment2.xml --log=msg_test.thres:debug +~~~~ + +\subsection tuto-msg-exo2-wrapup Wrap up + +Our imperfect solution to this exercise is available as @ref doc/tuto-msg/masterworker-sol2.c +But there is still much to improve in that code. + +\section tuto-msg-exo3 Exercise 3: Understanding how competing applications behave + +It is now time to start several applications at once, with the following `deployment3.xml` file. + +\include doc/tuto-msg/deployment3.xml + +Things happen when you do so, but it remains utterly difficult to +understand what's happening exactly. Even visualizations with pajeng +and Vite contain too much information to be useful: it is impossible +to understand which task belongs to which application. To fix this, we +will categorize the tasks. + +For that, first let each master create its own category of tasks with +@ref TRACE_category(), and then assign this category to each task using +@ref MSG_task_set_category(). + +The outcome can then be visualized as a Gantt-chart as follows: + +~~~~{.sh} +./masterworker examples/platforms/small_platform.xml deployment3.xml --cfg=tracing:yes --cfg=tracing/msg/process:yes +vite simgrid.trace +~~~~ + +\subsection tuto-msg-exo3-further Going further + +vite is not enough to understand the situation, because it does not +deal with categorization. That is why you should switch to R to +visualize your outcomes, as explained on this +page. + +As usual, you can explore our imperfect solution, in @ref doc/tuto-msg/masterworker-sol3.c. + +\section tuto-msg-exo4 Exercise 4: Better scheduling: FCFS + +You don't need a very advanced visualization solution to notice that +round-robin is completely suboptimal: most of the workers keep waiting +for more work. We will move to a First-Come First-Served mechanism +instead. 
+ +For that, your workers should explicitly request work with a +message sent to a channel that is specific to their master. The name +of their private channel should be attached (using the last +parameter of @ref MSG_task_create()) to the message sent, so that +their master can answer. + +The master should serve requests in a first-come first-served manner, until the +time is up. Things get a bit more complex to stop the workers +afterward: the master cannot simply send a terminating task, as the +workers are blocked until their request for work is accepted. So +instead, the master should wait for incoming requests even once the +time is up, and answer with a terminating task. + +Once it works, you will see that such a simple FCFS scheme +doubles the amount of tasks handled over time in this case. + +\subsection tuto-msg-exo4-further Going further + +From this, many things can easily be added. For example, you could: +- Allow workers to have several pending requests so as to overlap + communication and computations as much as possible. Non-blocking communication will probably become handy here. +- Add a performance measurement mechanism, enabling the master to make smart scheduling choices. +- Test your code on other platforms, from the `examples/platforms` directory in your archive.
+ What is the largest number of tasks requiring 50e6 flops and 1e5 + bytes that you manage to distribute and process in one hour on + `g5k.xml` (you should use `deployment_general.xml`)? +- Optimize not only for the amount of tasks handled, but also for the total energy dissipated. +- And so on. If you come up with a really nice extension, please share it with us so that we can extend this tutorial. + +\section tuto-msg-further Where to go from here? + +This tutorial is now terminated. You could keep reading the [online documentation][fn:4] or +[tutorials][fn:7], or you could head up to the example section to read some code. + +\subsection tuto-msg-further-todo TODO: Points to improve for the next time + +- Propose equivalent exercises and skeleton in java. +- Propose a virtualbox image with everything (simgrid, pajeng, ...) already set + up. +- Ease the installation on mac OS X (binary installer) and + windows. +- Explain that programming in C or java and having a working + development environment is a prerequisite. 
+ + +[fn:1]: http://triva.gforge.inria.fr/index.html +[fn:2]: http://hal.inria.fr/inria-00529569 +[fn:3]: http://hal.inria.fr/hal-00738321 +[fn:4]: http://simgrid.gforge.inria.fr/simgrid/latest/doc/ +[fn:5]: https://github.com/schnorr/pajeng/ +[fn:6]: http://vite.gforge.inria.fr/ +[fn:7]: http://simgrid.org/tutorials/ + + +*/ + + +/** + * @example doc/tuto-msg/masterworker.c + * @example doc/tuto-msg/masterworker-sol1.c + * @example doc/tuto-msg/masterworker-sol2.c + * @example doc/tuto-msg/masterworker-sol3.c + * @example doc/tuto-msg/masterworker-sol4.c + */ \ No newline at end of file diff --git a/tools/cmake/DefinePackages.cmake b/tools/cmake/DefinePackages.cmake index 1b87e2de30..ba15c8c017 100644 --- a/tools/cmake/DefinePackages.cmake +++ b/tools/cmake/DefinePackages.cmake @@ -863,7 +863,7 @@ set(DOC_SOURCES doc/doxygen/install.doc doc/doxygen/install_yours.doc doc/doxygen/java.doc - doc/doxygen/tutorial_msg.doc + doc/tuto-msg/tuto-msg.doc doc/doxygen/tutorial_smpi.doc doc/doxygen/models.doc doc/doxygen/module-msg.doc @@ -896,17 +896,17 @@ set(DOC_SOURCES doc/manpage/smpirun.1 doc/manpage/tesh.pod - doc/msg-tuto-src/deployment0.xml - doc/msg-tuto-src/deployment1.xml - doc/msg-tuto-src/deployment2.xml - doc/msg-tuto-src/deployment3.xml - doc/msg-tuto-src/deployment_general.xml - doc/msg-tuto-src/Makefile - doc/msg-tuto-src/masterworker0.c - doc/msg-tuto-src/masterworker1.c - doc/msg-tuto-src/masterworker2.c - doc/msg-tuto-src/masterworker3.c - doc/msg-tuto-src/masterworker4.c + doc/tuto-msg/deployment0.xml + doc/tuto-msg/deployment1.xml + doc/tuto-msg/deployment2.xml + doc/tuto-msg/deployment3.xml + doc/tuto-msg/deployment_general.xml + doc/tuto-msg/Makefile + doc/tuto-msg/masterworker.c + doc/tuto-msg/masterworker-sol1.c + doc/tuto-msg/masterworker-sol2.c + doc/tuto-msg/masterworker-sol3.c + doc/tuto-msg/masterworker-sol4.c CITATION.bib ) diff --git a/tools/cmake/Tests.cmake b/tools/cmake/Tests.cmake index 85f7d06e39..b4c86131b8 100644 --- 
a/tools/cmake/Tests.cmake +++ b/tools/cmake/Tests.cmake @@ -89,8 +89,6 @@ ENDIF() ADD_TEST(testall ${CMAKE_BINARY_DIR}/testall) # New tests should use the Boost Unit Test Framework - - if(Boost_UNIT_TEST_FRAMEWORK_FOUND) add_executable (unit_tmgr src/surf/trace_mgr_test.cpp) target_link_libraries(unit_tmgr simgrid ${Boost_UNIT_TEST_FRAMEWORK_LIBRARY}) @@ -99,3 +97,7 @@ if(Boost_UNIT_TEST_FRAMEWORK_FOUND) else() set(EXTRA_DIST ${EXTRA_DIST} src/surf/trace_mgr_test.cpp) endif() + + +# Also test the tutorial +ADD_TEST(tuto-msg-1 sh -c "make -C doc/tuto-msg/src masterworker0 && doc/tuto-msg/src/masterworker0 examples/platforms/small_platform.xml doc/tuto-msg/src/deployment0.xml") \ No newline at end of file -- 2.20.1