Changed optimizing metric to log error

[simgrid.git] / contrib / network_model / regress.py
diff --git a/contrib/network_model/regress.py b/contrib/network_model/regress.py

index 07636bb..60f0dc3 100755 (executable)
--- a/contrib/network_model/regress.py
+++ b/contrib/network_model/regress.py
@@ -1,7 +1,26 @@
  #!/usr/bin/env python
  
+#---------------------------------------------------------------------------------------------------
+# Given two vectors of same length n: message size S(.. s_i ..), and communication time T( .. t_i .. )
+# where t_i is the time associated to a mesage size s_i, computes the segmentation of the vectors   
+# in 3 segments such that linear regressions on the 3 segments maximize correlation.
+# The metric for correlation is now the cumulative, log error of the estimated time value given 
+# by regression, to the real time value.
+# 
+# Call r(X[i:j],Y[i:j]) = ax + b  the regression line for data of X,Y between indices i and j
+# Call E[i:j] = E( e_i, .., e_j) the vector of estimates from the regression, where e_i = a*S[i] + b
+# Call mean_logerr( T,T' ) the average log error of paired elements t_i and t'_i, of vectors len n T and T' resp.
+#                 i.e  mean_logerr( T,T' ) =  1/n * Sum_i^n ( exp(| ln(t_i)-ln(t'_i)|) - 1 )
+#
+# The script computes indices k and l, s.t.
+#   mean_logerr( r(S[0:k],T[0:k]) , E[0,k] ) + 
+#   mean_logerr( r(S[k:l],T[k:l]) , E[k,l] ) + 
+#   mean_logerr( r(S[l:n],T[l:n]) , E[l,n] ) 
+# is minimum.
+#---------------------------------------------------------------------------------------------------
+
  import sys
-from math import sqrt
+from math import sqrt,log,exp
  
  
  if len(sys.argv) != 2 and len(sys.argv) != 4:
@@ -31,7 +50,6 @@ def avg (l):
  ## = 1/n \Sum_{i=1}^n (x_i - avg(x)) * (y_i - avg(y))
  ##--------------------------------------------------
  def cov(X,Y):
-
         assert(len(X)==len(Y))
         n=len(X)   #  n=len(X)=len(Y)
         avg_X = avg( X )
@@ -46,7 +64,7 @@ def cov(X,Y):
  ##----------------------------------
  ## variance : variance 
  ## param X data vector ( ..x_i.. )
-## (S_X)^2 = (Sum ( x_i - avg(x) )^2 ) / n
+## (S_X)^2 = (Sum ( x_i - avg(X) )^2 ) / n
  ##----------------------------------
  def variance( X ):
         S_X2 = 0
@@ -57,6 +75,61 @@ def variance( X ):
  
         return (S_X2/n)
  
+##----------------------------------
+## mean log_error
+## param X data vector ( ..x_i.. ), length n
+## param Y data vector ( ..y_i.. ), length n
+## return mean(   1/n * Sum_i^n ( exp(| ln(x_i)-ln(y_i)|) - 1 ) 
+##----------------------------------
+def mean_logerr( X,Y ):
+       assert( len(X) == len(Y) )
+       E = list();                     # the list of errors
+       for i in range(len(X)):
+               E.append( exp(abs(log(X[i])-log(Y[i])))-1 ) 
+       return (avg( E ))
+
+
+##-----------------------------------------------------------------------------------------------
+## correl_split_weighted_logerr : compute regression on each segment and
+## return the weigthed sum of correlation coefficients
+## param X first data vector (..x_i..)
+## param Y second data vector (..x_i..)
+## param segments list of pairs (i,j) where i refers to the ith value in X, and  jth value in X
+## return (C,[(i1,j1,X[i1],X[j1]), (i2,j2,X[i2],X[j2]), ....]
+##    where i1,j1 is the first segment,  c1 the correlation coef on this segment, n1 the number of values
+##          i2,j2 is the second segment, c2 the correlation coef on this segment, n2 the number of values
+##          ...
+##    and C=c1/n1+c2/n2+...
+##-----------------------------------------------------------------------------------------------
+def correl_split_weighted_logerr( X , Y , segments ):
+       # expects segments = [(0,i1-1),(i1-1,i2-1),(i2,len-1)] 
+       correl = list()
+       interv = list()  # regr. line coeffs and range
+       glob_err=0
+       for (start,stop) in segments:
+               #if start==stop :
+               #       return 0
+               S_XY= cov( X [start:stop+1], Y [start:stop+1] )
+               S_X2 = variance(  X [start:stop+1] ) 
+               a = S_XY/S_X2                           # regr line coeffs
+               b = avg ( Y[start:stop+1] ) - a * avg( X[start:stop+1] )
+               # fill a vector (Z) with predicted values from regression
+               Z=list()
+               for i in range(start,stop+1):
+                       Z.append( a * X[i] + b )
+               # compare real values and computed values
+               e = mean_logerr( Y[start:stop+1] , Z )
+               #print("   range [%d,%d] err=%f‰ weight=%f" % (X[start],X[stop],e,(stop-start+1)/len(X))) 
+               correl.append( (e, stop-start+1) );   # store correl. coef + number of values (segment length)
+               interv.append( (a,b, X[start],X[stop],e) );
+       
+       for (e,l) in correl:
+               glob_err = glob_err + (e*l/len( X ))  # the average log err for this segment (e) is 
+                                                               # weighted by the number of values of the segment (l) out of the total number of values
+               
+       #print("-> glob_corr={}\n".format(glob_corr))
+       return (glob_err,interv);
+       
  ##-----------------------------------------------------------------------------------------------
  ## correl_split_weighted : compute regression on each segment and
  ## return the weigthed sum of correlation coefficients
@@ -87,15 +160,15 @@ def correl_split_weighted( X , Y , segments ):
                 c = S_XY/(sqrt(S_X2)*sqrt(S_Y2)) 
                 a = S_XY/S_X2                           # regr line coeffs
                 b= avg ( Y[start:stop+1] ) - a * avg( X[start:stop+1] )
-               print("   range [%d,%d] corr=%f, coeff det=%f [a=%f, b=%f]" % (X[start],X[stop],c,c**2,a, b)) 
+               #print("   range [%d,%d] corr=%f, coeff det=%f [a=%f, b=%f]" % (X[start],X[stop],c,c**2,a, b)) 
                 correl.append( (c, stop-start) );   # store correl. coef + number of values (segment length)
                 interv.append( (a,b, X[start],X[stop]) );
         
         for (c,l) in correl:
                 glob_corr = glob_corr + (l/sum_nb_val)*c  # weighted product of correlation 
-               print('-- %f * %f' % (c,l/sum_nb_val))
+               #print('-- %f * %f' % (c,l/sum_nb_val))
                 
-       print("-> glob_corr={}\n".format(glob_corr))
+       #print("-> glob_corr={}\n".format(glob_corr))
         return (glob_corr,interv);
         
  
@@ -129,13 +202,12 @@ def correl_split( X , Y , segments ):
                 c = S_XY/(sqrt(S_X2)*sqrt(S_Y2)) 
                 a = S_XY/S_X2                           # regr line coeffs
                 b= avg ( Y[start:stop+1] ) - a * avg( X[start:stop+1] )
-               print("   range [%d,%d] corr=%f, coeff det=%f [a=%f, b=%f]" % (X[start],X[stop],c,c**2,a, b)) 
+               #print("   range [%d,%d] corr=%f, coeff det=%f [a=%f, b=%f]" % (X[start],X[stop],c,c**2,a, b)) 
                 correl.append( (c, stop-start) );   # store correl. coef + number of values (segment length)
                 interv.append( (a,b, X[start],X[stop]) );
         
         for (c,l) in correl:
                 glob_corr = glob_corr * c  # product of correlation coeffs
-       print("-> glob_corr=%f\n" % glob_corr)
         return (glob_corr,interv);
         
  
@@ -166,6 +238,9 @@ for line in skampidat:
  sorteddata = sorted( readdata, key=lambda pair: pair[0])
  sizes,timings = zip(*sorteddata);
  
+# zip makes tuples; cast to lists for backward compatibility with python2.X
+sizes   = list(sizes)
+timings = list(timings)
  
  ##----------------------- search for best break points-----------------
  ## example
@@ -178,41 +253,58 @@ if len(sys.argv) == 4:
  
         p1inx = sizes.index( p1 );
         p2inx = sizes.index( p2 );
-       max_glob_corr = 0;
+       max_glob_corr = 999990;
         max_p1inx = p1inx
         max_p2inx = p2inx
  
         ## tweak parameters here to extend/reduce search
-       search_p1 = 30          # number of values to search +/- around p1
-       search_p2 = 65          # number of values to search +/- around p2
+       search_p1 = 70          # number of values to search +/- around p1
+       search_p2 = 70          # number of values to search +/- around p2
         min_seg_size = 3
+       if (search_p2 + min_seg_size > len(sizes)):  # reduce min segment sizes when points close to data extrema
+               min_seg_size = len(sizes)-search_p2
+       if (search_p1 - min_seg_size < 0):
+               min_seg_size = search_p1
  
-       lb1 = max(1, p1inx-search_p1)           
-       ub1 = min(p1inx+search_p1,search_p1, p2inx);
-       lb2 = max(p1inx,p2inx-search_p2)        # breakpoint +/- delta
-       ub2 = min(p2inx+search_p2,len(sizes)-1);    
+       lb1 = max( 1, p1inx-search_p1 )         
+       ub1 = min( p1inx+search_p1, p2inx);
+       lb2 = max( p1inx, p2inx-search_p2)      # breakpoint +/- delta
+       ub2 = min( p2inx+search_p2, len(sizes)-1);    
  
         print("** evaluating over \n");
         print("interv1:\t %d <--- %d ---> %d" % (sizes[lb1],p1,sizes[ub1]))
         print("rank:   \t (%d)<---(%d)--->(%d)\n" % (lb1,p1inx,ub1))
         print("interv2:\t\t %d <--- %d ---> %d" % (sizes[lb2],p2,sizes[ub2]))
         print("rank:   \t\t(%d)<---(%d)--->(%d)\n" % (lb2,p2inx,ub2))
+
+       result = list()
         for i in range(lb1,ub1+1):
                 for j in range(lb2,ub2+1):
                         if i<j:         # segments must not overlap
                                 if i+1 >=min_seg_size and j-i+1 >= min_seg_size and len(sizes)-1-j >= min_seg_size : # not too small segments
-                                       print("** i=%d,j=%d" % (i,j))
+                                       #print("** i=%d,j=%d" % (i,j))
                                         segments = [(0,i),(i,j),(j,len(sizes)-1)]
-                                       (glob_cor, interv) = correl_split( sizes, timings, segments)
-                                       if ( glob_cor > max_glob_corr):
-                                               max_glob_corr = glob_cor
-                                               max_interv = interv
+                                       result.append( correl_split_weighted_logerr( sizes, timings, segments) )  # add pair (metric,interval)
+
+       # sort results on ascending metric: ok for logerr. Add "reverse=true" for desc sort if you use  a correlation metric
+       result = sorted( result, key=lambda pair: pair[0])
+
+
+       top_n_sol=5;  # tweak to display top best n solution
  
         print("#-------------------- result summary ---------------------------------------------------------------------\n");
-       for (a,b,i,j) in max_interv:
-               print("** OPT: [%d .. %d] correl coef prod=%f           slope: %f x + %f" % (i,j,max_glob_corr,a,b))
  
-       print("#-------------------- cut here the gnuplot code -----------------------------------------------------------\n");
+       for k in range(top_n_sol):
+               (err,interval) = result[k]
+               print("\n   RANK {}\n-------".format(k))
+               print("** overall metric = {0}".format(err))
+               for (a,b,i,j,e) in interval:
+                       print("** OPT: [{0} .. {1}] segment_metric={2}          slope: {3} x + {4}".format(i,j,e,a,b))
+
+
+       print("\n\n\n")
+
+       print("#-------------------- Best Solution : cut here the gnuplot code -----------------------------------------------------------\n");
         preamble='set output "regr.eps"\n\
  set terminal postscript eps color\n\
  set key left\n\
@@ -224,7 +316,8 @@ set grid'
  
         print(preamble);
         print('plot "%s" u 3:4:($5) with errorbars title "skampi traces %s",\\' % (sys.argv[1],sys.argv[1]));
-       for (a,b,i,j) in max_interv:
+       (err,interval) = result[0]
+       for (a,b,i,j,e) in interval:
                 print('"%s" u (%d<=$3 && $3<=%d? $3:0/0):(%f*($3)+%f) w linespoints title "regress. %s-%s bytes",\\' % (sys.argv[1],i,j,a,b,i,j))
         
         print("#-------------------- /cut here the gnuplot code -----------------------------------------------------------\n");
@@ -241,7 +334,7 @@ else:
         S_X2 = variance(  sizes ) 
         S_Y2 = variance(  timings )  # to compute correlation
  
-       a = S_XY/S_X2;
+       a = S_XY/S_X2
         correl = S_XY/(sqrt(S_X2)*sqrt(S_Y2))  # corealation coeff (Bravais-Pearson)