From 030f7802748d2e268d26355cba6f254c8995edcf Mon Sep 17 00:00:00 2001
From: degomme <adegomme@users.noreply.github.com>
Date: Tue, 19 Feb 2019 16:54:41 +0100
Subject: [PATCH] [SMPI] Change sampling behavior. Before, in sampling mode,
 each sampled loop was executed once, and the process would inject the time it
 took immediately. This led to a simcall being executed, which would then
 yield execution and possibly hand control to another process. This was
 destroying the caches, and resulted in over-estimated times.

This modification holds the injected time until the end of the sampling phase, trying to do as much of it as possible on one process, without swapping contexts in between.

TODO:
- check what happens if num sampling iters < num iters/process (mandatory swapping)
Issues might be triggered if a process leaves sampling before reaching the given bounds.
---
 src/smpi/internals/smpi_bench.cpp             | 44 +++++++++++--------
 teshsuite/smpi/macro-sample/macro-sample.c    |  2 +-
 teshsuite/smpi/macro-sample/macro-sample.tesh | 12 ++---
 3 files changed, 33 insertions(+), 25 deletions(-)

diff --git a/src/smpi/internals/smpi_bench.cpp b/src/smpi/internals/smpi_bench.cpp
index 9fbc6b0dc5..1c1de228ea 100644
--- a/src/smpi/internals/smpi_bench.cpp
+++ b/src/smpi/internals/smpi_bench.cpp
@@ -337,9 +337,10 @@ std::unordered_map<SampleLocation, LocalData, std::hash<std::string>> samples;
 void smpi_sample_1(int global, const char *file, int line, int iters, double threshold)
 {
   SampleLocation loc(global, file, line);
-
-  smpi_bench_end();     /* Take time from previous, unrelated computation into account */
-  smpi_process()->set_sampling(1);
+  if (not smpi_process()->sampling()) { /* Only at first call when benchmarking, skip for next ones */
+    smpi_bench_end();     /* Take time from previous, unrelated computation into account */
+    smpi_process()->set_sampling(1);
+  }
 
   auto insert = samples.emplace(loc, LocalData{
                                          threshold, // threshold
@@ -385,23 +386,33 @@ int smpi_sample_2(int global, const char *file, int line)
 
   if (data.benching) {
     // we need to run a new bench
-    XBT_DEBUG("benchmarking: count:%d iter:%d stderr:%f thres:%f; mean:%f",
-              data.count, data.iters, data.relstderr, data.threshold, data.mean);
+    XBT_DEBUG("benchmarking: count:%d iter:%d stderr:%f thres:%f; mean:%f; total:%f",
+              data.count, data.iters, data.relstderr, data.threshold, data.mean, data.sum);
+    smpi_bench_begin();
     res = 1;
   } else {
     // Enough data, no more bench (either we got enough data from previous visits to this benched nest, or we just
     //ran one bench and need to bail out now that our job is done). Just sleep instead
-    if (not data.need_more_benchs())
-      XBT_DEBUG("No benchmark (either no need, or just ran one): count >= iter (%d >= %d) or stderr<thres (%f<=%f)."
-              " apply the %fs delay instead",
-              data.count, data.iters, data.relstderr, data.threshold, data.mean);
-    else
-      XBT_DEBUG("Skipping - Benchmark already performed");
-    smpi_execute(data.mean);
-    smpi_process()->set_sampling(0);
+    if (not data.need_more_benchs()){
+      XBT_DEBUG("No benchmark (either no need, or just ran one): count >= iter (%d >= %d) or stderr<thres (%f<=%f). "
+              "Mean is %f %s",
+              data.count, data.iters, data.relstderr, data.threshold, data.mean, 
+              (data.sum != 0.0 ? "Applying it count times because we finished benching count iterations": ""));
+      double sleep = data.mean;
+      if (data.sum != 0.0){ //we finished benching, sum is unecessary after the first injection, we can reset it.
+        sleep = data.sum;
+        data.sum = 0.0;
+      }
+      smpi_process()->set_sampling(0);
+      smpi_execute(sleep);
+      smpi_bench_begin();
+    } else {
+      XBT_DEBUG("Skipping - Benchmark already performed - accumulating time");
+      xbt_os_threadtimer_start(smpi_process()->timer());
+    }
     res = 0; // prepare to capture future, unrelated computations
   }
-  smpi_bench_begin();
+
   return res;
 }
 
@@ -429,10 +440,7 @@ void smpi_sample_3(int global, const char *file, int line)
   double n       = static_cast<double>(data.count);
   data.mean      = data.sum / n;
   data.relstderr = sqrt((data.sum_pow2 / n - data.mean * data.mean) / n) / data.mean;
-  if (data.need_more_benchs()) {
-    data.mean = period; // Still in benching process; We want sample_2 to simulate the exact time of this loop
-    // occurrence before leaving, not the mean over the history
-  }
+
   XBT_DEBUG("Average mean after %d steps is %f, relative standard error is %f (sample was %f)",
             data.count, data.mean, data.relstderr, period);
 
diff --git a/teshsuite/smpi/macro-sample/macro-sample.c b/teshsuite/smpi/macro-sample/macro-sample.c
index 395b2b638a..29c3fd5917 100644
--- a/teshsuite/smpi/macro-sample/macro-sample.c
+++ b/teshsuite/smpi/macro-sample/macro-sample.c
@@ -57,7 +57,7 @@ int main(int argc, char *argv[])
         else
           fprintf(stderr, "(1)");
         fprintf(stderr,
-                " [rank:%d] Run the first (locally benched) computation. It's locally benched, and I want the "
+                " [rank:%d] Run the second (locally benched) computation. It's locally benched, and I want the "
                 "standard error to go below 0.1 second (count is not >0)\n", rank);
       }
       d = compute(d);
diff --git a/teshsuite/smpi/macro-sample/macro-sample.tesh b/teshsuite/smpi/macro-sample/macro-sample.tesh
index 563e68ba39..fc13419864 100644
--- a/teshsuite/smpi/macro-sample/macro-sample.tesh
+++ b/teshsuite/smpi/macro-sample/macro-sample.tesh
@@ -6,12 +6,12 @@ $ ${bindir:=.}/../../../smpi_script/bin/smpirun -hostfile ../hostfile -platform
 > (0) Run the first computation. It's globally benched, and I want no more than 4 benchmarks (thres<0)
 > (0) Run the first computation. It's globally benched, and I want no more than 4 benchmarks (thres<0)
 > (0) Run the first computation. It's globally benched, and I want no more than 4 benchmarks (thres<0)
-> (1) [rank:0] Run the first (locally benched) computation. It's locally benched, and I want the standard error to go below 0.1 second (count is not >0)
-> (1) [rank:0] Run the first (locally benched) computation. It's locally benched, and I want the standard error to go below 0.1 second (count is not >0)
-> (1) [rank:1] Run the first (locally benched) computation. It's locally benched, and I want the standard error to go below 0.1 second (count is not >0)
-> (1) [rank:1] Run the first (locally benched) computation. It's locally benched, and I want the standard error to go below 0.1 second (count is not >0)
-> (1) [rank:2] Run the first (locally benched) computation. It's locally benched, and I want the standard error to go below 0.1 second (count is not >0)
-> (1) [rank:2] Run the first (locally benched) computation. It's locally benched, and I want the standard error to go below 0.1 second (count is not >0)
+> (1) [rank:0] Run the second (locally benched) computation. It's locally benched, and I want the standard error to go below 0.1 second (count is not >0)
+> (1) [rank:0] Run the second (locally benched) computation. It's locally benched, and I want the standard error to go below 0.1 second (count is not >0)
+> (1) [rank:1] Run the second (locally benched) computation. It's locally benched, and I want the standard error to go below 0.1 second (count is not >0)
+> (1) [rank:1] Run the second (locally benched) computation. It's locally benched, and I want the standard error to go below 0.1 second (count is not >0)
+> (1) [rank:2] Run the second (locally benched) computation. It's locally benched, and I want the standard error to go below 0.1 second (count is not >0)
+> (1) [rank:2] Run the second (locally benched) computation. It's locally benched, and I want the standard error to go below 0.1 second (count is not >0)
 > (2) [rank:0] Done.
 > (2) [rank:1] Done.
 > (2) [rank:2] Done.
-- 
2.20.1