Logo AND Algorithmique Numérique Distribuée

Public GIT Repository
Add new entry in Release_Notes.
[simgrid.git] / src / smpi / smpirun.in
1 #!/usr/bin/env sh
2
3 # Copyright (c) 2007-2023. The SimGrid Team. All rights reserved.
4
5 # This program is free software; you can redistribute it and/or modify it
6 # under the terms of the license (GNU LGPL) which comes with this package.
7
8 @CMAKE_SMPI_COMMAND@
9
# Version information, substituted by CMake when this template is configured.
SIMGRID_VERSION="@SIMGRID_VERSION_STRING@"
SIMGRID_GITHASH="@GIT_VERSION@"

# Characteristics of the hosts and links of the platform that is generated
# when no -platform file is given on the command line.
DEFAULT_LOOPBACK_BANDWIDTH="498000000Bps"
DEFAULT_LOOPBACK_LATENCY="0.000004s"
DEFAULT_NETWORK_BANDWIDTH="$((26 * 1024 * 1024))Bps"
DEFAULT_NETWORK_LATENCY="0.000005s"
DEFAULT_SPEED="100flops"

# Effective values; -bandwidth and -latency override the network ones.
LOOPBACK_BANDWIDTH="${DEFAULT_LOOPBACK_BANDWIDTH}"
LOOPBACK_LATENCY="${DEFAULT_LOOPBACK_LATENCY}"
NETWORK_BANDWIDTH="${DEFAULT_NETWORK_BANDWIDTH}"
NETWORK_LATENCY="${DEFAULT_NETWORK_LATENCY}"
SPEED="${DEFAULT_SPEED}"

# Privatization setting: the SMPI_PRIVATIZATION environment variable wins over
# the value chosen at build time.
PRIVATIZE="--cfg=smpi/privatization:${SMPI_PRIVATIZATION:-@HAVE_PRIVATIZATION@}"

# Number of processes (-np) and options describing the deployment.
NUMPROCS=0
DEPLOYOPTS=""

# Options always passed to the simulator; --cfg/--log arguments are appended.
SIMOPTS="--cfg=precision/timing:1e-9 --cfg=network/model:SMPI"

# Default directory for temporary files: the directory in which mktemp would
# create a file. The inner substitution is quoted so that a path containing
# blanks (e.g. coming from TMPDIR) is handed to dirname as a single word.
SMPITMPDIR="$(dirname "$(mktemp -u)")"
33
# Print on stdout how this script should be called, with the list of the
# supported options. $0 is expanded in the text, so the help shows the name
# under which the script was invoked.
usage () {
    cat <<EOF
Usage: $0 [OPTIONS] -platform <xmldesc|so> -hostfile <hostfile> program [program-options]
       $0 [OPTIONS] -platform <xmldesc|so> -hostfile <hostfile> -replay <tracefile> [program] [program-options]
Options:
  -analyze                   # show information about allocations and timings at the end of execution
  -keep-temps                # don't remove the generated files after execution
  -wrapper <command>         # use command to run the program (e.g. "valgrind" or "gdb --args")
  -gdb                       # run within GDB (-wrapper "gdb --args" -keep-temps)
  -lldb                      # run within LLDB (-wrapper "lldb --" -keep-temps)
  -vgdb                      # run within Valgrind+GDB (-wrapper "valgrind --vgdb=yes --vgdb-error=0" -keep-temps)
  -map                       # display the machine on which each process rank is mapped
  -np <numprocs>             # use that amount of processes from the hostfile.
                             # By default, all processes of the hostfile are used.
  -no-privatize              # Disable the globals privatization, that is activated by default
  -tmpdir <dir>              # Directory used to store temporary files. Defaults to system's.
  -trace-ti                  # activate time independent tracing (for replay, default in smpi_simgrid.txt)
  -trace                     # activate tracing (Paje, default in smpi_simgrid.trace)
  -trace-comment <comment>   # put a comment on the top of the trace file
  -trace-comment-file <file> # put file contents on the top of the trace file as comment
  -trace-grouped             # group MPI processes by location
  -trace-resource            # trace resource utilization
  -trace-file <tracefile>    # name of the tracefile (default: smpi_simgrid.trace, or smpi_simgrid.txt with -trace-ti)
  -replay <tracefile>        # replays a trace instead of actually executing an application

  -version                   # Displays the SimGrid version (human readable)
  -git-version               # Displays the git hash of SimGrid
  -help                      # Displays this information
  -help-coll                 # Displays all available collective algorithms

or (deprecated usage):
  $0 [-keep-temps] [-np <numprocs>] [-bandwidth <bytes/sec>] [-latency <secs>] program [program-options]

EOF
}
70
# Called without any argument: there is nothing to run, show the help and stop
[ $# -ne 0 ] || { usage; exit; }

# Values filled in by the command-line options
QUIET=""
HOSTFILE=""
WRAPPER=""

# No background process has been started so far
unset pid

# Signals for which a cleanup handler is installed
trapped_signals="HUP INT QUIT ILL ABRT SEGV FPE ALRM TERM USR1 USR2 BUS"
85
# Report a fatal error on stderr and terminate the script with status 1.
# The message is made of all the arguments, prefixed by the script name.
die () {
    local prog
    prog="$(basename "$0")"
    printf '[%s] ** error: %s. Aborting.\n' "${prog}" "$*" >&2
    exit 1
}
90
# Remove the temporary files generated by this script (generated platform and
# unrolled hostfile), unless KEEP is non-empty. The variable naming a removed
# file is emptied, so calling the function again is harmless.
smpirun_cleanup()
{
  # The temporary files must be kept: leave everything in place
  if [ -n "${KEEP}" ] ; then
      return 0
  fi

  # The platform file was generated here only when none was given by the user
  if [ -z "${PLATFORM}" ] ; then
      if [ -n "$PLATFORMTMP" ] ; then
          rm -f "${PLATFORMTMP}"
          PLATFORMTMP=""
      fi
  fi

  # The hostfile is a temporary copy only when it had to be unrolled
  case "${UNROLLEDHOSTFILETMP}" in
      1)
          if [ -n "$UNROLLEDHOSTFILE" ] ; then
              rm -f "${UNROLLEDHOSTFILE}"
              UNROLLEDHOSTFILE=""
          fi
          ;;
  esac
}
104
# Signal handler: remove the temporary files, terminate the background
# process if there is one, then deliver the same signal to this shell again.
#   $1 - name of the received signal, as listed in trapped_signals
smpirun_trap() {
  local signame
  signame="$1"

  # Cleanup and kill the child process:
  smpirun_cleanup
  [ -z "$pid" ] || kill -TERM "$pid"
  unset pid

  # Reset the handlers first, so that raising the signal again is not caught:
  trap - $trapped_signals
  kill -"$signame" $$

  # This should never happen:
  kill -ABRT $$
  kill -TERM $$
}

# Install the handler for every signal of the list, giving it the signal name
for signal in $trapped_signals; do
  trap "smpirun_trap $signal" "$signal"
done
128
# Parse the options of smpirun itself. The loop consumes the positional
# parameters one option at a time and stops at the first word that is not a
# known option: that word and everything after it are left in "$@".
while true; do
    # Options taking a value must be followed by one. Without this check, the
    # "shift 2" of such an option given as last word cannot be performed.
    case "$1" in
        "-np" | "-n" | "-bandwidth" | "-latency" | "-platform" | "-hostfile" | "-machinefile" | "-replay")
            [ $# -ge 2 ] || die "option '$1' requires an argument"
            ;;
        "-tmpdir" | "-trace-comment" | "-trace-comment-file" | "-trace-file" | "-wrapper")
            [ $# -ge 2 ] || die "option '$1' requires an argument"
            ;;
    esac

    case "$1" in
        "-np" | "-n")
            NUMPROCS="$2"
            shift 2
            ;;
        "-bandwidth")
            NETWORK_BANDWIDTH="$2"
            shift 2
            ;;
        "-latency")
            NETWORK_LATENCY="$2"
            shift 2
            ;;
        "-platform")
            PLATFORM="$2"
            if [ ! -f "${PLATFORM}" ]; then
                die "the file '${PLATFORM}' does not exist"
            fi
            shift 2
            ;;
        "-hostfile" | "-machinefile")
            HOSTFILE="$2"
            if [ ! -f "${HOSTFILE}" ]; then
                die "the file '${HOSTFILE}' does not exist"
            fi
            shift 2
            ;;
        "-replay")
            APP_TRACES="$2"
            if [ ! -f "${APP_TRACES}" ]; then
                die "the file '${APP_TRACES}' does not exist"
            fi
            DEPLOYOPTS="${DEPLOYOPTS} --cfg=smpi/replay:${APP_TRACES}"
            shift 2
            ;;
        "-no-privatize")
            PRIVATIZE="--cfg=smpi/privatization:no"
            shift 1
            ;;
        "-map")
            DEPLOYOPTS="${DEPLOYOPTS} --cfg=smpi/map:1"
            shift 1
            ;;
        "-tmpdir")
            SMPITMPDIR="$2"
            # Consume the option AND its value: shifting by one only would
            # leave the directory in $1, where it would end the option
            # parsing and be taken as the program to execute.
            shift 2
            ;;
        "-trace")
            TRACE_ACTIVE="true"
            shift 1
            ;;
        "-trace-ti")
            TRACE_ACTIVE="true"
            TRACE_TI_ACTIVE="true"
            shift 1
            ;;
        "-trace-comment")
            TRACE_COMMENT="$2"
            shift 2
            ;;
        "-trace-comment-file")
            TRACE_COMMENT_FILE="$2"
            shift 2
            ;;
        "-trace-file")
            TRACE_FILENAME="$2"
            shift 2
            ;;
        "-trace-grouped")
            TRACE_GROUPED="true"
            shift 1
            ;;
        "-trace-resource")
            TRACE_RESOURCE="true"
            shift 1
            ;;
        "-keep-temps")
            KEEP="true"
            SIMOPTS="$SIMOPTS --cfg=smpi/keep-temps:yes"
            shift 1
            ;;
        "-quiet")
            QUIET="true"
            shift 1
            ;;
        "-wrapper")
            WRAPPER="$2"
            shift 2
            ;;
        "-gdb")
            WRAPPER="gdb --args"
            KEEP="true"
            SIMOPTS="$SIMOPTS --cfg=smpi/keep-temps:yes"
            shift 1
            ;;
        "-vgdb")
            WRAPPER="valgrind --vgdb=yes --vgdb-error=0"
            KEEP="true"
            SIMOPTS="$SIMOPTS --cfg=smpi/keep-temps:yes"
            shift 1
            ;;
        "-lldb")
            WRAPPER="lldb --"
            KEEP="true"
            SIMOPTS="$SIMOPTS --cfg=smpi/keep-temps:yes"
            shift 1
            ;;
        "-analyze")
            SIMOPTS="$SIMOPTS --cfg=smpi/display-timing:yes --cfg=smpi/display-allocs:yes --cfg=smpi/list-leaks:50 --cfg=smpi/pedantic:true --cfg=smpi/barrier-collectives:true"
            shift 1
            ;;
        "-help" | "--help" | "-h")
            usage
            exit 0
            ;;
        "-help-coll" | "--help-coll")
            # WRAPPER is left unquoted on purpose: it may hold a command
            # followed by its own arguments (e.g. "gdb --args").
            ${WRAPPER} "@SMPIMAIN@" --help-coll
            exit 0
            ;;
        "-version" | "--version" | "-v")
            printf '%b\n' "$SIMGRID_VERSION"
            exit 0
            ;;
        "-git-version" | "--git-version")
            printf '%b\n' "$SIMGRID_GITHASH"
            exit 0
            ;;
        "--cfg="*|"--log="*)
            # The value is split on blanks on purpose: every resulting word
            # is passed to the simulator as its own --cfg= or --log= option.
            for OPT in ${1#*=}
            do
                SIMOPTS="$SIMOPTS ${1%%=*}=$OPT"
            done
            shift 1
            ;;
        "-foreground")
            # Nothing to do, compatibility.
            shift 1
            ;;
        *)
            # First word that is not an option of smpirun: stop parsing
            break
            ;;
    esac
done
273
#setup tmp dir
# The directory is given to the simulator and added in front of the library
# search path.
SIMOPTS="$SIMOPTS --cfg=smpi/tmpdir:$SMPITMPDIR"
export LD_LIBRARY_PATH="$SMPITMPDIR:$LD_LIBRARY_PATH"

# Determine the program to run: it is the first remaining argument. It is
# optional when replaying a trace, in which case the replay binary is used.
if [ -n "${APP_TRACES}" ] ; then
    if [ $# -eq 0 ] ; then
        EXEC="@SMPIREPLAYMAIN@"
    else
        EXEC="$1"
        shift
    fi
else
    # check if we still have at least one parameter beyond options
    if [ $# -eq 0 ]
    then
        echo "Error: no program to execute!"
        usage
        # This is an error: report it through a non-zero exit status (a bare
        # "exit" would return the status of usage, that is 0).
        exit 1
    fi

    EXEC="$1"
    shift
fi
297
# steal --cfg and --logs options
# Simulator settings found among the arguments of the program are moved to
# SIMOPTS; every other argument is appended, blank-separated, to PROC_ARGS.
for ARG do
    case "${ARG}" in
        "--cfg="*|"--log="*)
            # Unquoted on purpose: one option is generated per word of the value
            for OPT in ${ARG#*=}
            do
                SIMOPTS="$SIMOPTS ${ARG%%=*}=$OPT"
            done
            ;;
        *)
            PROC_ARGS="${PROC_ARGS:+$PROC_ARGS }${ARG}"
            ;;
    esac
done
# Every positional parameter has been handled: drop them all
set --
314
# At least one of the hostfile and the platform is needed
if [ -z "${HOSTFILE}" ] && [ -z "${PLATFORM}" ] ; then
    echo "No hostfile nor platform specified."
    usage
    exit 1
fi

# Set to 1 when HOSTFILE is replaced by a temporary, unrolled copy
UNROLLEDHOSTFILETMP=0

# parse if our lines are terminated by :num_process
# Such a hostfile is rewritten into a temporary file in which each host name
# is repeated num_process times, one name per line.
if [ -n "${HOSTFILE}" ] && grep -q ':' "${HOSTFILE}" ; then
    # Stop right away when the temporary file cannot be created, instead of
    # redirecting to an empty file name and going on without any hostfile.
    UNROLLEDHOSTFILE="$(mktemp smpitmp-hostfXXXXXX)" || die "cannot create a temporary hostfile in the current directory"
    UNROLLEDHOSTFILETMP=1
    # A failure of the conversion (e.g. a process count that is not a number)
    # is fatal as well: the unrolled file would be incomplete.
    @PYTHON_EXECUTABLE@ -c '
import sys
import re

for line in sys.stdin:
    m = re.match("(.*):(.*)", line)
    if m:
        for i in range(0, int(m.group(2))):
            print(m.group(1))
    else:
        print(line.strip())
' < "${HOSTFILE}"  > "${UNROLLEDHOSTFILE}" || { smpirun_cleanup; die "cannot unroll the hostfile '${HOSTFILE}'"; }
    HOSTFILE=$UNROLLEDHOSTFILE
fi

# Tell the simulator how many processes to start and where to find the hosts
DEPLOYOPTS="${DEPLOYOPTS} --cfg=smpi/np:${NUMPROCS}"
DEPLOYOPTS="${DEPLOYOPTS} --cfg=smpi/hostfile:${HOSTFILE}"
344
##-------------------------------- DEFAULT or SPECIFIED PLATFORM --------------------------------------
# Without -platform, a platform description is generated in a temporary file
# of the current directory: NUMPROCS hosts named host1..hostN, each with a
# loopback link (loopI) and a network link (linkI), and one route for every
# ordered pair of hosts. Otherwise the file given by the user is used as is.
if [ -z "${PLATFORM}" ]; then
    # Abort when the temporary file cannot be created, rather than going on
    # with an empty file name.
    PLATFORMTMP="$(mktemp smpitmp-platfXXXXXX)" || die "cannot create a temporary platform file in the current directory"

    # The whole description goes through one redirection, so the file is
    # opened once instead of once per generated line.
    {
        cat <<PLATFORMHEAD
<?xml version='1.0'?>
<!DOCTYPE platform SYSTEM "https://simgrid.org/simgrid.dtd">
<platform version="4.1">
<zone id="AS0" routing="Full">
PLATFORMHEAD

        # Hosts and their links
        i=${NUMPROCS}
        while [ "$i" -gt 0 ]; do
            echo "  <host id=\"host$i\" speed=\"${SPEED}\"/>"
            echo "  <link id=\"loop$i\" bandwidth=\"${LOOPBACK_BANDWIDTH}\" latency=\"${LOOPBACK_LATENCY}\"/>"
            echo "  <link id=\"link$i\" bandwidth=\"${NETWORK_BANDWIDTH}\" latency=\"${NETWORK_LATENCY}\"/>"
            i=$((i - 1))
        done

        # Routes: the loopback link from a host to itself, the network links
        # of both ends between two different hosts
        i=${NUMPROCS}
        while [ "$i" -gt 0 ]; do
            j=${NUMPROCS}
            while [ "$j" -gt 0 ]; do
                if [ "$i" -eq "$j" ]; then
                    echo "  <route src=\"host$i\" dst=\"host$j\"><link_ctn id=\"loop$i\"/></route>"
                else
                    echo "  <route src=\"host$i\" dst=\"host$j\"><link_ctn id=\"link$i\"/><link_ctn id=\"link$j\"/></route>"
                fi
                j=$((j - 1))
            done
            i=$((i - 1))
        done

        cat <<PLATFORMFOOT
</zone>
</platform>
PLATFORMFOOT
    } > "${PLATFORMTMP}"

else
    PLATFORMTMP=${PLATFORM}
fi
##-------------------------------- end DEFAULT or SPECIFIED PLATFORM --------------------------------------
##---------------------- SMPI TRACING OPTIONS ---------------------------------
# Translate the -trace* switches into the matching --cfg options, gathered in
# TRACEOPTIONS. Nothing is set unless tracing was requested.
if [ -n "${TRACE_ACTIVE}" ]; then
    # The default file name and the extra options depend on the trace format
    if [ -n "${TRACE_TI_ACTIVE}" ]; then
        : "${TRACE_FILENAME:=smpi_simgrid.txt}"
        TRACEOPTIONS="--cfg=tracing/smpi/format:TI --cfg=tracing/smpi/computing:yes"
    else
        : "${TRACE_FILENAME:=smpi_simgrid.trace}"
        TRACEOPTIONS=""
    fi
    # Options common to both formats come first
    TRACEOPTIONS="--cfg=tracing:yes --cfg=tracing/filename:${TRACE_FILENAME} --cfg=tracing/smpi:yes${TRACEOPTIONS:+ ${TRACEOPTIONS}}"

    # Optional settings, each one added only when the switch was given
    [ -z "${TRACE_COMMENT}" ] || TRACEOPTIONS="${TRACEOPTIONS} --cfg=tracing/comment:${TRACE_COMMENT}"

    [ -z "${TRACE_COMMENT_FILE}" ] || TRACEOPTIONS="${TRACEOPTIONS} --cfg=tracing/comment-file:${TRACE_COMMENT_FILE}"

    [ -z "${TRACE_GROUPED}" ] || TRACEOPTIONS="${TRACEOPTIONS} --cfg=tracing/smpi/group:yes"

    [ -z "${TRACE_RESOURCE}" ] || TRACEOPTIONS="${TRACEOPTIONS} --cfg=tracing/categorized:yes --cfg=tracing/uncategorized:yes"
fi
##---------------------- end SMPI TRACING OPTIONS ---------------------------------
421
# Do not remove, this variable may be used by user code (e.g. StarPU)
SMPI_GLOBAL_SIZE=${NUMPROCS}
export SMPI_GLOBAL_SIZE

# When the temporary files are kept, show the command line and the name of
# the generated hostfile, unless -quiet was given
if [ -n "${KEEP}" ] ; then
    if [ -z "${QUIET}" ] ; then
        echo "${EXEC}" ${PRIVATIZE} "${TRACEOPTIONS}" "${SIMOPTS}" "${PLATFORMTMP}"
        if [ ${UNROLLEDHOSTFILETMP} = 1 ] ; then
            echo "Generated unrolled hostfile ${UNROLLEDHOSTFILE} kept."
        fi
    fi
fi
430
# Execute the process
#
# The shell still need to be alive for the duration in order to do some cleanup after the process.
#
# We are going through great lengths in order to both keep stdin and be able to handle signals:
#
# * The job is launched in the background in order to be able to handle signals.
#
# * The FD 3 is used to temporarily store FD 0 (stdin). This is because the shell connects FD 0 to /dev/null when the
#   command is launched in the background: this can be overridden in bash but not in standard bourne shell.
exec 3<&0
# The option variables are left unquoted on purpose: each of them holds several blank-separated words.
${WRAPPER} "@SMPIMAIN@" "${EXEC}" ${PRIVATIZE} ${DEPLOYOPTS} ${TRACEOPTIONS} ${SIMOPTS} "${PLATFORMTMP}" ${PROC_ARGS} <&3 3>&- &
pid=$!
# The copy of stdin is only needed by the child: close it in this shell
exec 3>&-
wait "$pid"
status=$?
# With dash on Windows WSL/Ubuntu, "wait" sometimes returns early with an exit
# status of 128. Try again.
while test "$status" -eq 128 && kill -0 "$pid" 2>/dev/null; do
    wait "$pid"
    status=$?
done
# The process is over: nothing is left to kill for the signal handler
pid=""
454
# Keep temporary files on failures to help debugging
#
# On failure, and unless this was already done because of -keep-temps or
# silenced by -quiet, show the command line and mark the files as kept.
if [ ${status} -ne 0 ] ; then
    if [ -z "${KEEP}" ] ; then
        if [ -z "${QUIET}" ] ; then
            echo "${EXEC}" ${PRIVATIZE} "${TRACEOPTIONS}" "${SIMOPTS}" "${PLATFORMTMP}"
            if [ ${UNROLLEDHOSTFILETMP} = 1 ] ; then
                echo "Generated unrolled hostfile ${UNROLLEDHOSTFILE} kept."
            fi
            KEEP=true
        fi
    fi
    echo "Execution failed with code ${status}."
fi

# Remove what must be removed, then hand the status of the process over
smpirun_cleanup

exit $status