- Activity::set_remaining() is not public anymore. Use for example
Comm::set_payload_size() to change the size of the simulated data.
+MPI:
+ - New option smpi/barrier-collectives to add a barrier in all
+ collectives to detect dangerous code that /may/ work on some implems.
+
Models:
- WiFi: the total capacity of a link depends on the amount of flows on that link.
- Use the nonlinear callback feature of LMM to reflect this.
- **For collective operations of SMPI,** please refer to Section :ref:`cfg=smpi/coll-selector`
- **smpi/auto-shared-malloc-thresh:** :ref:`cfg=smpi/auto-shared-malloc-thresh`
- **smpi/async-small-thresh:** :ref:`cfg=smpi/async-small-thresh`
+- **smpi/barrier-finalization:** :ref:`cfg=smpi/barrier-finalization`
+- **smpi/barrier-collectives:** :ref:`cfg=smpi/barrier-collectives`
- **smpi/buffering:** :ref:`cfg=smpi/buffering`
- **smpi/bw-factor:** :ref:`cfg=smpi/bw-factor`
- **smpi/coll-selector:** :ref:`cfg=smpi/coll-selector`
- **smpi/display-allocs:** :ref:`cfg=smpi/display-allocs`
- **smpi/display-timing:** :ref:`cfg=smpi/display-timing`
- **smpi/errors-are-fatal:** :ref:`cfg=smpi/errors-are-fatal`
-- **smpi/finalization-barrier:** :ref:`cfg=smpi/finalization-barrier`
- **smpi/grow-injected-times:** :ref:`cfg=smpi/grow-injected-times`
- **smpi/host-speed:** :ref:`cfg=smpi/host-speed`
- **smpi/IB-penalty-factors:** :ref:`cfg=smpi/IB-penalty-factors`
.. TODO:: All available collective algorithms will be made available
via the ``smpirun --help-coll`` command.
-.. _cfg=smpi/finalization-barrier:
+.. _cfg=smpi/barrier-collectives:
+
+Add a barrier in all collectives
+................................
+
+**Option** ``smpi/barrier-collectives`` **default:** off
+
+This option adds a simple barrier in all collective operations to catch dangerous
+code that may or may not work depending on the MPI implementation. It is disabled
+by default, and activated by the ``-analyze`` flag of smpirun.
+
+For example, the following code works with OpenMPI while it deadlocks in MPICH and
+Intel MPI. This suggests that OpenMPI has a "fire and forget" implementation for
+Broadcast.
+
+.. code-block:: C
+
+ if (rank == 0) {
+ MPI_Bcast(buf1, buff_size, MPI_CHAR, 0, newcom);
+ MPI_Send(&buf2, buff_size, MPI_CHAR, 1, tag, newcom);
+ } else if (rank==1) {
+ MPI_Recv(&buf2, buff_size, MPI_CHAR, 0, tag, newcom, MPI_STATUS_IGNORE);
+ MPI_Bcast(buf1, buff_size, MPI_CHAR, 0, newcom);
+ }
+
+.. _cfg=smpi/barrier-finalization:
Add a barrier in MPI_Finalize
.............................
smpi_process()->mark_as_finalizing();
TRACE_smpi_comm_in(rank_traced, __func__, new simgrid::instr::NoOpTIData("finalize"));
- if(simgrid::config::get_value<bool>("smpi/finalization-barrier"))
+ if (simgrid::config::get_value<bool>("smpi/barrier-finalization"))
simgrid::smpi::colls::barrier(MPI_COMM_WORLD);
smpi_process()->finalize();
new simgrid::instr::CollTIData(request == MPI_REQUEST_IGNORED ? "bcast" : "ibcast", root, -1.0,
count, 0,
simgrid::smpi::Datatype::encode(datatype), ""));
- if(simgrid::config::get_value<bool>("smpi/colls-inject-barrier"))
- simgrid::smpi::colls::barrier(comm);
+ if (simgrid::config::get_value<bool>("smpi/barrier-collectives"))
+ smpi_deployment_startup_barrier(smpi_process()->get_instance_id());
if (comm->size() > 1) {
if (request == MPI_REQUEST_IGNORED)
const SmpiBenchGuard suspend_bench;
- if(simgrid::config::get_value<bool>("smpi/colls-inject-barrier"))
- simgrid::smpi::colls::barrier(comm);
+ if (simgrid::config::get_value<bool>("smpi/barrier-collectives"))
+ smpi_deployment_startup_barrier(smpi_process()->get_instance_id());
aid_t pid = simgrid::s4u::this_actor::get_pid();
const SmpiBenchGuard suspend_bench;
- if(simgrid::config::get_value<bool>("smpi/colls-inject-barrier"))
- simgrid::smpi::colls::barrier(comm);
+ if (simgrid::config::get_value<bool>("smpi/barrier-collectives"))
+ smpi_deployment_startup_barrier(smpi_process()->get_instance_id());
const void* real_sendbuf = sendbuf;
int real_sendcount = sendcount;
const SmpiBenchGuard suspend_bench;
- if(simgrid::config::get_value<bool>("smpi/colls-inject-barrier"))
- simgrid::smpi::colls::barrier(comm);
+ if (simgrid::config::get_value<bool>("smpi/barrier-collectives"))
+ smpi_deployment_startup_barrier(smpi_process()->get_instance_id());
aid_t pid = simgrid::s4u::this_actor::get_pid();
const SmpiBenchGuard suspend_bench;
- if(simgrid::config::get_value<bool>("smpi/colls-inject-barrier"))
- simgrid::smpi::colls::barrier(comm);
+ if (simgrid::config::get_value<bool>("smpi/barrier-collectives"))
+ smpi_deployment_startup_barrier(smpi_process()->get_instance_id());
if (sendbuf == MPI_IN_PLACE) {
sendbuf = static_cast<char*>(recvbuf) + recvtype->get_extent() * displs[comm->rank()];
const SmpiBenchGuard suspend_bench;
- if(simgrid::config::get_value<bool>("smpi/colls-inject-barrier"))
- simgrid::smpi::colls::barrier(comm);
+ if (simgrid::config::get_value<bool>("smpi/barrier-collectives"))
+ smpi_deployment_startup_barrier(smpi_process()->get_instance_id());
aid_t pid = simgrid::s4u::this_actor::get_pid();
const SmpiBenchGuard suspend_bench;
- if(simgrid::config::get_value<bool>("smpi/colls-inject-barrier"))
- simgrid::smpi::colls::barrier(comm);
+ if (simgrid::config::get_value<bool>("smpi/barrier-collectives"))
+ smpi_deployment_startup_barrier(smpi_process()->get_instance_id());
aid_t pid = simgrid::s4u::this_actor::get_pid();
const SmpiBenchGuard suspend_bench;
- if(simgrid::config::get_value<bool>("smpi/colls-inject-barrier"))
- simgrid::smpi::colls::barrier(comm);
+ if (simgrid::config::get_value<bool>("smpi/barrier-collectives"))
+ smpi_deployment_startup_barrier(smpi_process()->get_instance_id());
aid_t pid = simgrid::s4u::this_actor::get_pid();
const SmpiBenchGuard suspend_bench;
- if(simgrid::config::get_value<bool>("smpi/colls-inject-barrier"))
- simgrid::smpi::colls::barrier(comm);
+ if (simgrid::config::get_value<bool>("smpi/barrier-collectives"))
+ smpi_deployment_startup_barrier(smpi_process()->get_instance_id());
std::vector<unsigned char> tmp_sendbuf;
const void* real_sendbuf = smpi_get_in_place_buf(sendbuf, recvbuf, tmp_sendbuf, count, datatype);
const SmpiBenchGuard suspend_bench;
- if(simgrid::config::get_value<bool>("smpi/colls-inject-barrier"))
- simgrid::smpi::colls::barrier(comm);
+ if (simgrid::config::get_value<bool>("smpi/barrier-collectives"))
+ smpi_deployment_startup_barrier(smpi_process()->get_instance_id());
aid_t pid = simgrid::s4u::this_actor::get_pid();
std::vector<unsigned char> tmp_sendbuf;
const SmpiBenchGuard suspend_bench;
- if(simgrid::config::get_value<bool>("smpi/colls-inject-barrier"))
- simgrid::smpi::colls::barrier(comm);
+ if (simgrid::config::get_value<bool>("smpi/barrier-collectives"))
+ smpi_deployment_startup_barrier(smpi_process()->get_instance_id());
aid_t pid = simgrid::s4u::this_actor::get_pid();
std::vector<unsigned char> tmp_sendbuf;
const SmpiBenchGuard suspend_bench;
- if(simgrid::config::get_value<bool>("smpi/colls-inject-barrier"))
- simgrid::smpi::colls::barrier(comm);
+ if (simgrid::config::get_value<bool>("smpi/barrier-collectives"))
+ smpi_deployment_startup_barrier(smpi_process()->get_instance_id());
aid_t pid = simgrid::s4u::this_actor::get_pid();
auto trace_recvcounts = std::make_shared<std::vector<int>>();
const SmpiBenchGuard suspend_bench;
- if(simgrid::config::get_value<bool>("smpi/colls-inject-barrier"))
- simgrid::smpi::colls::barrier(comm);
+ if (simgrid::config::get_value<bool>("smpi/barrier-collectives"))
+ smpi_deployment_startup_barrier(smpi_process()->get_instance_id());
int count = comm->size();
const SmpiBenchGuard suspend_bench;
- if(simgrid::config::get_value<bool>("smpi/colls-inject-barrier"))
- simgrid::smpi::colls::barrier(comm);
+ if (simgrid::config::get_value<bool>("smpi/barrier-collectives"))
+ smpi_deployment_startup_barrier(smpi_process()->get_instance_id());
TRACE_smpi_comm_in(pid, request == MPI_REQUEST_IGNORED ? "PMPI_Alltoall" : "PMPI_Ialltoall",
new simgrid::instr::CollTIData(
const SmpiBenchGuard suspend_bench;
- if(simgrid::config::get_value<bool>("smpi/colls-inject-barrier"))
- simgrid::smpi::colls::barrier(comm);
+ if (simgrid::config::get_value<bool>("smpi/barrier-collectives"))
+ smpi_deployment_startup_barrier(smpi_process()->get_instance_id());
int send_size = 0;
int recv_size = 0;
const SmpiBenchGuard suspend_bench;
- if(simgrid::config::get_value<bool>("smpi/colls-inject-barrier"))
- simgrid::smpi::colls::barrier(comm);
+ if (simgrid::config::get_value<bool>("smpi/barrier-collectives"))
+ smpi_deployment_startup_barrier(smpi_process()->get_instance_id());
int send_size = 0;
int recv_size = 0;
"Whether we should display the n first MPI handle leaks (addresses and type only) after simulation",
-1);
-simgrid::config::Flag<bool> _smpi_cfg_colls_inject_barrier{
- "smpi/colls-inject-barrier", "Inject a barrier in each colllective operation, to detect some deadlocks in incorrect MPI codes, which may not be triggered in all cases", false };
-
double smpi_cfg_host_speed(){
return _smpi_cfg_host_speed;
}
simgrid::config::declare_flag<std::string>(
"smpi/or", "Small messages timings (MPI_Recv minimum time for small messages)", "0:0:0:0:0");
- simgrid::config::declare_flag<bool>("smpi/finalization-barrier", "Do we add a barrier in MPI_Finalize or not", false);
+ simgrid::config::declare_flag<bool>("smpi/barrier-finalization", {"smpi/finalization-barrier"},
+ "Do we add a barrier in MPI_Finalize or not", false);
+ simgrid::config::declare_flag<bool>("smpi/barrier-collectives",
+ "Inject a barrier in each colllective operation, to detect some deadlocks in "
+ "incorrect MPI codes, which may not be triggered in all cases",
+ false);
smpi_options_initialized = true;
}
simgrid::smpi::Request::waitall(count_requests, requests.data(), MPI_STATUSES_IGNORE);
}
- if(simgrid::config::get_value<bool>("smpi/finalization-barrier"))
+ if (simgrid::config::get_value<bool>("smpi/barrier-finalization"))
simgrid::smpi::colls::barrier(MPI_COMM_WORLD);
active_processes--;
shift 1
;;
"-analyze")
- SIMOPTS="$SIMOPTS --cfg=smpi/display-timing:yes --cfg=smpi/display-allocs:yes --cfg=smpi/list-leaks:50 --cfg=smpi/pedantic:true --cfg=smpi/colls-inject-barrier:true"
+ SIMOPTS="$SIMOPTS --cfg=smpi/display-timing:yes --cfg=smpi/display-allocs:yes --cfg=smpi/list-leaks:50 --cfg=smpi/pedantic:true --cfg=smpi/barrier-collectives:true"
shift 1
;;
"-help" | "--help" | "-h")
(name, path, binary, filename) = sys.argv
for test in mbi.parse_one_code(filename):
- execcmd = test['cmd'].replace("mpirun", f"{path}/smpi_script/bin/smpirun -wrapper '{path}/bin/simgrid-mc --log=mc_safety.t:info' -platform ./cluster.xml -analyze --cfg=smpi/finalization-barrier:on --cfg=smpi/list-leaks:10 --cfg=model-check/max-depth:10000")
+ execcmd = test['cmd'].replace("mpirun", f"{path}/smpi_script/bin/smpirun -wrapper '{path}/bin/simgrid-mc --log=mc_safety.t:info' -platform ./cluster.xml -analyze --cfg=smpi/barrier-finalization:on --cfg=smpi/list-leaks:10 --cfg=model-check/max-depth:10000")
execcmd = execcmd.replace('${EXE}', binary)
execcmd = execcmd.replace('$zero_buffer', "--cfg=smpi/buffering:zero")
execcmd = execcmd.replace('$infty_buffer', "--cfg=smpi/buffering:infty")
outfile.write(' <cluster id="acme" prefix="node-" radical="0-99" suffix="" speed="1Gf" bw="125MBps" lat="50us"/>\n')
outfile.write('</platform>\n')
- execcmd = execcmd.replace("mpirun", "smpirun -wrapper simgrid-mc -platform ./cluster.xml -analyze --cfg=smpi/finalization-barrier:on --cfg=smpi/list-leaks:10 --cfg=model-check/max-depth:10000")
+ execcmd = execcmd.replace("mpirun", "smpirun -wrapper simgrid-mc -platform ./cluster.xml -analyze --cfg=smpi/barrier-finalization:on --cfg=smpi/list-leaks:10 --cfg=model-check/max-depth:10000")
execcmd = execcmd.replace('${EXE}', binary)
execcmd = execcmd.replace('$zero_buffer', "--cfg=smpi/buffering:zero")
execcmd = execcmd.replace('$infty_buffer', "--cfg=smpi/buffering:infty")
p Test dsend
! output sort
-$ ${bindir:=.}/../../../smpi_script/bin/smpirun -map -hostfile ${bindir:=.}/../hostfile -platform ${platfdir}/small_platform.xml -np 2 --log=no_loc ${bindir:=.}/pt2pt-dsend -s --long --log=smpi_config.thres:warning --log=xbt_cfg.thres:warning --cfg=smpi/simulate-computation:no --cfg=smpi/finalization-barrier:on
+$ ${bindir:=.}/../../../smpi_script/bin/smpirun -map -hostfile ${bindir:=.}/../hostfile -platform ${platfdir}/small_platform.xml -np 2 --log=no_loc ${bindir:=.}/pt2pt-dsend -s --long --log=smpi_config.thres:warning --log=xbt_cfg.thres:warning --cfg=smpi/simulate-computation:no --cfg=smpi/barrier-finalization:on
> [Jupiter:1:(2) 0.000000] [dsend/INFO] rank 1: data exchanged
> [Tremblay:0:(1) 0.005896] [dsend/INFO] rank 0: data exchanged
> [0.000000] [smpi/INFO] [rank 0] -> Tremblay
p process 1 will finish at 0.5+2*4 (send) + 1+0.1*4 (isend) = 9.9s
p process 2 will finish at 0.5+2*4 (time before first send) + 2*(1+0.5*4) (recv+irecv) + 0.005890 (network time, same as before) = 14.505890s
! output sort
-$ ${bindir:=.}/../../../smpi_script/bin/smpirun -map -hostfile ${bindir:=.}/../hostfile -platform ${platfdir}/small_platform.xml -np 2 --log=no_loc ${bindir:=.}/pt2pt-dsend -s --long --log=smpi_config.thres:warning --cfg=smpi/or:0:1:0.5 --cfg=smpi/os:0:0.5:2 --cfg=smpi/ois:0:1:0.1 --cfg=smpi/simulate-computation:no --cfg=smpi/finalization-barrier:on --log=xbt_cfg.thres:warning
+$ ${bindir:=.}/../../../smpi_script/bin/smpirun -map -hostfile ${bindir:=.}/../hostfile -platform ${platfdir}/small_platform.xml -np 2 --log=no_loc ${bindir:=.}/pt2pt-dsend -s --long --log=smpi_config.thres:warning --cfg=smpi/or:0:1:0.5 --cfg=smpi/os:0:0.5:2 --cfg=smpi/ois:0:1:0.1 --cfg=smpi/simulate-computation:no --cfg=smpi/barrier-finalization:on --log=xbt_cfg.thres:warning
> [Jupiter:1:(2) 9.900000] [dsend/INFO] rank 1: data exchanged
> [Tremblay:0:(1) 14.505896] [dsend/INFO] rank 0: data exchanged
> [0.000000] [smpi/INFO] [rank 0] -> Tremblay