Before, in sampling mode, each sampled loop was executed once, and the process would inject the time it took immediately.
This led to a simcall being executed, which would then yield execution and possibly hand control over to another process.
This trashed the caches and resulted in overestimated times.
This modification holds back the injected time until the end of the sampling phase, trying to run as much of the sampling as possible on one process, without switching contexts in between.
TODO:
- check what happens if num sampling iters < num iters/process (mandatory swapping)
Issues might be triggered if a process leaves sampling before reaching the given bounds.
void smpi_sample_1(int global, const char *file, int line, int iters, double threshold)
{
SampleLocation loc(global, file, line);
void smpi_sample_1(int global, const char *file, int line, int iters, double threshold)
{
SampleLocation loc(global, file, line);
-
- smpi_bench_end(); /* Take time from previous, unrelated computation into account */
- smpi_process()->set_sampling(1);
+ if (not smpi_process()->sampling()) { /* Only at first call when benchmarking, skip for next ones */
+ smpi_bench_end(); /* Take time from previous, unrelated computation into account */
+ smpi_process()->set_sampling(1);
+ }
auto insert = samples.emplace(loc, LocalData{
threshold, // threshold
auto insert = samples.emplace(loc, LocalData{
threshold, // threshold
if (data.benching) {
// we need to run a new bench
if (data.benching) {
// we need to run a new bench
- XBT_DEBUG("benchmarking: count:%d iter:%d stderr:%f thres:%f; mean:%f",
- data.count, data.iters, data.relstderr, data.threshold, data.mean);
+ XBT_DEBUG("benchmarking: count:%d iter:%d stderr:%f thres:%f; mean:%f; total:%f",
+ data.count, data.iters, data.relstderr, data.threshold, data.mean, data.sum);
+ smpi_bench_begin();
res = 1;
} else {
// Enough data, no more bench (either we got enough data from previous visits to this benched nest, or we just
//ran one bench and need to bail out now that our job is done). Just sleep instead
res = 1;
} else {
// Enough data, no more bench (either we got enough data from previous visits to this benched nest, or we just
//ran one bench and need to bail out now that our job is done). Just sleep instead
- if (not data.need_more_benchs())
- XBT_DEBUG("No benchmark (either no need, or just ran one): count >= iter (%d >= %d) or stderr<thres (%f<=%f)."
- " apply the %fs delay instead",
- data.count, data.iters, data.relstderr, data.threshold, data.mean);
- else
- XBT_DEBUG("Skipping - Benchmark already performed");
- smpi_execute(data.mean);
- smpi_process()->set_sampling(0);
+ if (not data.need_more_benchs()){
+ XBT_DEBUG("No benchmark (either no need, or just ran one): count >= iter (%d >= %d) or stderr<thres (%f<=%f). "
+ "Mean is %f %s",
+ data.count, data.iters, data.relstderr, data.threshold, data.mean,
+ (data.sum != 0.0 ? "Applying it count times because we finished benching count iterations": ""));
+ double sleep = data.mean;
+ if (data.sum != 0.0){ //we finished benching, sum is unecessary after the first injection, we can reset it.
+ sleep = data.sum;
+ data.sum = 0.0;
+ }
+ smpi_process()->set_sampling(0);
+ smpi_execute(sleep);
+ smpi_bench_begin();
+ } else {
+ XBT_DEBUG("Skipping - Benchmark already performed - accumulating time");
+ xbt_os_threadtimer_start(smpi_process()->timer());
+ }
res = 0; // prepare to capture future, unrelated computations
}
res = 0; // prepare to capture future, unrelated computations
}
double n = static_cast<double>(data.count);
data.mean = data.sum / n;
data.relstderr = sqrt((data.sum_pow2 / n - data.mean * data.mean) / n) / data.mean;
double n = static_cast<double>(data.count);
data.mean = data.sum / n;
data.relstderr = sqrt((data.sum_pow2 / n - data.mean * data.mean) / n) / data.mean;
- if (data.need_more_benchs()) {
- data.mean = period; // Still in benching process; We want sample_2 to simulate the exact time of this loop
- // occurrence before leaving, not the mean over the history
- }
XBT_DEBUG("Average mean after %d steps is %f, relative standard error is %f (sample was %f)",
data.count, data.mean, data.relstderr, period);
XBT_DEBUG("Average mean after %d steps is %f, relative standard error is %f (sample was %f)",
data.count, data.mean, data.relstderr, period);
else
fprintf(stderr, "(1)");
fprintf(stderr,
else
fprintf(stderr, "(1)");
fprintf(stderr,
- " [rank:%d] Run the first (locally benched) computation. It's locally benched, and I want the "
+ " [rank:%d] Run the second (locally benched) computation. It's locally benched, and I want the "
"standard error to go below 0.1 second (count is not >0)\n", rank);
}
d = compute(d);
"standard error to go below 0.1 second (count is not >0)\n", rank);
}
d = compute(d);
> (0) Run the first computation. It's globally benched, and I want no more than 4 benchmarks (thres<0)
> (0) Run the first computation. It's globally benched, and I want no more than 4 benchmarks (thres<0)
> (0) Run the first computation. It's globally benched, and I want no more than 4 benchmarks (thres<0)
> (0) Run the first computation. It's globally benched, and I want no more than 4 benchmarks (thres<0)
> (0) Run the first computation. It's globally benched, and I want no more than 4 benchmarks (thres<0)
> (0) Run the first computation. It's globally benched, and I want no more than 4 benchmarks (thres<0)
-> (1) [rank:0] Run the first (locally benched) computation. It's locally benched, and I want the standard error to go below 0.1 second (count is not >0)
-> (1) [rank:0] Run the first (locally benched) computation. It's locally benched, and I want the standard error to go below 0.1 second (count is not >0)
-> (1) [rank:1] Run the first (locally benched) computation. It's locally benched, and I want the standard error to go below 0.1 second (count is not >0)
-> (1) [rank:1] Run the first (locally benched) computation. It's locally benched, and I want the standard error to go below 0.1 second (count is not >0)
-> (1) [rank:2] Run the first (locally benched) computation. It's locally benched, and I want the standard error to go below 0.1 second (count is not >0)
-> (1) [rank:2] Run the first (locally benched) computation. It's locally benched, and I want the standard error to go below 0.1 second (count is not >0)
+> (1) [rank:0] Run the second (locally benched) computation. It's locally benched, and I want the standard error to go below 0.1 second (count is not >0)
+> (1) [rank:0] Run the second (locally benched) computation. It's locally benched, and I want the standard error to go below 0.1 second (count is not >0)
+> (1) [rank:1] Run the second (locally benched) computation. It's locally benched, and I want the standard error to go below 0.1 second (count is not >0)
+> (1) [rank:1] Run the second (locally benched) computation. It's locally benched, and I want the standard error to go below 0.1 second (count is not >0)
+> (1) [rank:2] Run the second (locally benched) computation. It's locally benched, and I want the standard error to go below 0.1 second (count is not >0)
+> (1) [rank:2] Run the second (locally benched) computation. It's locally benched, and I want the standard error to go below 0.1 second (count is not >0)
> (2) [rank:0] Done.
> (2) [rank:1] Done.
> (2) [rank:2] Done.
> (2) [rank:0] Done.
> (2) [rank:1] Done.
> (2) [rank:2] Done.