contrib/network_model/regress.py

   1 #!/usr/bin/env python
   2
   3 #---------------------------------------------------------------------------------------------------
   4 # Given two vectors of same length n: message size S(.. s_i ..), and communication time T( .. t_i .. )
   5 # where t_i is the time associated to a mesage size s_i, computes the segmentation of the vectors
   6 # in 3 segments such that linear regressions on the 3 segments maximize correlation.
   7 # The metric for correlation is now the cumulative, log error of the estimated time value given
   8 # by regression, to the real time value.
   9 #
  10 # Call r(X[i:j],Y[i:j]) = ax + b  the regression line for data of X,Y between indices i and j
  11 # Call E[i:j] = E( e_i, .., e_j) the vector of estimates from the regression, where e_i = a*S[i] + b
  12 # Call mean_logerr( T,T' ) the average log error of paired elements t_i and t'_i, of vectors len n T and T' resp.
  13 #                 i.e  mean_logerr( T,T' ) =  1/n * Sum_i^n ( exp(| ln(t_i)-ln(t'_i)|) - 1 )
  14 #
  15 # The script computes indices k and l, s.t.
  16 #   mean_logerr( r(S[0:k],T[0:k]) , E[0,k] ) +
  17 #   mean_logerr( r(S[k:l],T[k:l]) , E[k,l] ) +
  18 #   mean_logerr( r(S[l:n],T[l:n]) , E[l,n] )
  19 # is minimum.
  20 #---------------------------------------------------------------------------------------------------
  21
  22 import sys
  23 from math import sqrt,log,exp
  24
  25
  26 if len(sys.argv) != 2 and len(sys.argv) != 4:
  27         print("Usage : {} datafile".format(sys.argv[0]))
  28         print("or    : {} datafile p1 p2".format(sys.argv[0]))
  29         print("where : p1 < p2 belongs to sizes in datafiles")
  30         sys.exit(-1)
  31
  32 if len(sys.argv) == 4:
  33         p1=int(sys.argv[2])
  34         p2=int(sys.argv[3])
  35
  36 ##-----------------------------------------
  37 ## avg : return average of a list of values
  38 ## param l list of values
  39 ##-----------------------------------------
  40 def avg (l):
  41         sum=0
  42         for e in l:
  43                 sum=sum+e;
  44         return sum/len(l)
  45
  46 ##-------------------------------------------------
  47 ## cov : covariance
  48 ## param X first data vector (..x_i..)
  49 ## param Y second data vector (..y_i..)
  50 ## = 1/n \Sum_{i=1}^n (x_i - avg(x)) * (y_i - avg(y))
  51 ##--------------------------------------------------
  52 def cov(X,Y):
  53         assert(len(X)==len(Y))
  54         n=len(X)   #  n=len(X)=len(Y)
  55         avg_X = avg( X )
  56         avg_Y = avg( Y )
  57         S_XY=0
  58         for i in range(n):
  59                 S_XY = S_XY + ((X[i]-avg_X)*(Y[i]-avg_Y))
  60
  61         return (S_XY/n)
  62
  63
  64 ##----------------------------------
  65 ## variance : variance
  66 ## param X data vector ( ..x_i.. )
  67 ## (S_X)^2 = (Sum ( x_i - avg(X) )^2 ) / n
  68 ##----------------------------------
  69 def variance( X ):
  70         S_X2 = 0
  71         n = len( X )
  72         avg_X = avg ( X )
  73         for i in range(n):
  74                 S_X2 = S_X2 + ((X[i] - avg_X)**2)
  75
  76         return (S_X2/n)
  77
  78 ##----------------------------------
  79 ## mean log_error
  80 ## param X data vector ( ..x_i.. ), length n
  81 ## param Y data vector ( ..y_i.. ), length n
  82 ## return mean(   1/n * Sum_i^n ( exp(| ln(x_i)-ln(y_i)|) - 1 )
  83 ##----------------------------------
  84 def mean_logerr( X,Y ):
  85         assert( len(X) == len(Y) )
  86         E = list();                     # the list of errors
  87         for i in range(len(X)):
  88                 E.append( exp(abs(log(X[i])-log(Y[i])))-1 )
  89         return (avg( E ))
  90
  91
  92 ##-----------------------------------------------------------------------------------------------
  93 ## correl_split_weighted_logerr : compute regression on each segment and
  94 ## return the weigthed sum of correlation coefficients
  95 ## param X first data vector (..x_i..)
  96 ## param Y second data vector (..x_i..)
  97 ## param segments list of pairs (i,j) where i refers to the ith value in X, and  jth value in X
  98 ## return (C,[(i1,j1,X[i1],X[j1]), (i2,j2,X[i2],X[j2]), ....]
  99 ##    where i1,j1 is the first segment,  c1 the correlation coef on this segment, n1 the number of values
 100 ##          i2,j2 is the second segment, c2 the correlation coef on this segment, n2 the number of values
 101 ##          ...
 102 ##    and C=c1/n1+c2/n2+...
 103 ##-----------------------------------------------------------------------------------------------
 104 def correl_split_weighted_logerr( X , Y , segments ):
 105         # expects segments = [(0,i1-1),(i1-1,i2-1),(i2,len-1)]
 106         correl = list()
 107         interv = list()  # regr. line coeffs and range
 108         glob_err=0
 109         for (start,stop) in segments:
 110                 #if start==stop :
 111                 #       return 0
 112                 S_XY= cov( X [start:stop+1], Y [start:stop+1] )
 113                 S_X2 = variance(  X [start:stop+1] )
 114                 a = S_XY/S_X2                           # regr line coeffs
 115                 b = avg ( Y[start:stop+1] ) - a * avg( X[start:stop+1] )
 116                 # fill a vector (Z) with predicted values from regression
 117                 Z=list()
 118                 for i in range(start,stop+1):
 119                         Z.append( a * X[i] + b )
 120                 # compare real values and computed values
 121                 e = mean_logerr( Y[start:stop+1] , Z )
 122                 #print("   range [%d,%d] err=%f‰ weight=%f" % (X[start],X[stop],e,(stop-start+1)/len(X)))
 123                 correl.append( (e, stop-start+1) );   # store correl. coef + number of values (segment length)
 124                 interv.append( (a,b, X[start],X[stop],e) );
 125
 126         for (e,l) in correl:
 127                 glob_err = glob_err + (e*l/len( X ))  # the average log err for this segment (e) is
 128                                                                 # weighted by the number of values of the segment (l) out of the total number of values
 129
 130         #print("-> glob_corr={}\n".format(glob_corr))
 131         return (glob_err,interv);
 132
 133 ##-----------------------------------------------------------------------------------------------
 134 ## correl_split_weighted : compute regression on each segment and
 135 ## return the weigthed sum of correlation coefficients
 136 ## param X first data vector (..x_i..)
 137 ## param Y second data vector (..x_i..)
 138 ## param segments list of pairs (i,j) where i refers to the ith value in X, and  jth value in X
 139 ## return (C,[(i1,j1,X[i1],X[j1]), (i2,j2,X[i2],X[j2]), ....]
 140 ##    where i1,j1 is the first segment,  c1 the correlation coef on this segment, n1 the number of values
 141 ##          i2,j2 is the second segment, c2 the correlation coef on this segment, n2 the number of values
 142 ##          ...
 143 ##    and C=c1/n1+c2/n2+...
 144 ##-----------------------------------------------------------------------------------------------
 145 def correl_split_weighted( X , Y , segments ):
 146         # expects segments = [(0,i1-1),(i1-1,i2-1),(i2,len-1)]
 147         correl = list();
 148         interv = list();  # regr. line coeffs and range
 149         glob_corr=0
 150         sum_nb_val=0
 151         for (start,stop) in segments:
 152                 sum_nb_val = sum_nb_val + stop - start;
 153                 #if start==stop :
 154                 #       return 0
 155                 S_XY= cov( X [start:stop+1], Y [start:stop+1] )
 156                 S_X2 = variance(  X [start:stop+1] )
 157                 S_Y2 = variance(  Y [start:stop+1] )  # to compute correlation
 158                 if S_X2*S_Y2 == 0:
 159                         return (0,[])
 160                 c = S_XY/(sqrt(S_X2)*sqrt(S_Y2))
 161                 a = S_XY/S_X2                           # regr line coeffs
 162                 b= avg ( Y[start:stop+1] ) - a * avg( X[start:stop+1] )
 163                 #print("   range [%d,%d] corr=%f, coeff det=%f [a=%f, b=%f]" % (X[start],X[stop],c,c**2,a, b))
 164                 correl.append( (c, stop-start) );   # store correl. coef + number of values (segment length)
 165                 interv.append( (a,b, X[start],X[stop]) );
 166
 167         for (c,l) in correl:
 168                 glob_corr = glob_corr + (l/sum_nb_val)*c  # weighted product of correlation
 169                 #print('-- %f * %f' % (c,l/sum_nb_val))
 170
 171         #print("-> glob_corr={}\n".format(glob_corr))
 172         return (glob_corr,interv);
 173
 174
 175
 176
 177 ##-----------------------------------------------------------------------------------------------
 178 ## correl_split : compute regression on each segment and
 179 ## return the product of correlation coefficient
 180 ## param X first data vector (..x_i..)
 181 ## param Y second data vector (..x_i..)
 182 ## param segments list of pairs (i,j) where i refers to the ith value in X, and  jth value in X
 183 ## return (C,[(i1,j1,X[i1],X[j1]), (i2,j2,X[i2],X[j2]), ....]
 184 ##    where i1,j1 is the first segment,  c1 the correlation coef on this segment,
 185 ##          i2,j2 is the second segment, c2 the correlation coef on this segment,
 186 ##          ...
 187 ##    and C=c1*c2*...
 188 ##-----------------------------------------------------------------------------------------------
 189 def correl_split( X , Y , segments ):
 190         # expects segments = [(0,i1-1),(i1-1,i2-1),(i2,len-1)]
 191         correl = list();
 192         interv = list();  # regr. line coeffs and range
 193         glob_corr=1
 194         for (start,stop) in segments:
 195                 #if start==stop :
 196                 #       return 0
 197                 S_XY= cov( X [start:stop+1], Y [start:stop+1] )
 198                 S_X2 = variance(  X [start:stop+1] )
 199                 S_Y2 = variance(  Y [start:stop+1] )  # to compute correlation
 200                 if S_X2*S_Y2 == 0:
 201                         return (0,[])
 202                 c = S_XY/(sqrt(S_X2)*sqrt(S_Y2))
 203                 a = S_XY/S_X2                           # regr line coeffs
 204                 b= avg ( Y[start:stop+1] ) - a * avg( X[start:stop+1] )
 205                 #print("   range [%d,%d] corr=%f, coeff det=%f [a=%f, b=%f]" % (X[start],X[stop],c,c**2,a, b))
 206                 correl.append( (c, stop-start) );   # store correl. coef + number of values (segment length)
 207                 interv.append( (a,b, X[start],X[stop]) );
 208
 209         for (c,l) in correl:
 210                 glob_corr = glob_corr * c  # product of correlation coeffs
 211         return (glob_corr,interv);
 212
 213
 214
 215 ##-----------------------------------------------------------------------------------------------
 216 ## main
 217 ##-----------------------------------------------------------------------------------------------
 218 sum=0
 219 nblines=0
 220 skampidat = open(sys.argv[1], "r")
 221
 222
 223 ## read data from skampi logs.
 224 timings = []
 225 sizes = []
 226 readdata =[]
 227 for line in skampidat:
 228         l = line.split();
 229         if line[0] != '#' and len(l) >= 3:   # is it a comment ?
 230         ## expected format
 231         ## ---------------
 232         #count= 8388608  8388608  144916.1       7.6       32  144916.1  143262.0
 233         #("%s %d %d %f %f %d %f %f\n" % (countlbl, count, countn, time, stddev, iter, mini, maxi)
 234                 readdata.append( (int(l[1]),float(l[3])) );
 235                 nblines=nblines+1
 236
 237 ## These may not be sorted so sort it by message size before processing.
 238 sorteddata = sorted( readdata, key=lambda pair: pair[0])
 239 sizes,timings = zip(*sorteddata);
 240
 241 # zip makes tuples; cast to lists for backward compatibility with python2.X
 242 sizes   = list(sizes)
 243 timings = list(timings)
 244
 245 ##----------------------- search for best break points-----------------
 246 ## example
 247 ## p1=2048  -> p1inx=11  delta=3 -> [8;14]
 248 ##                                              8 : segments[(0,7),(8,13),(13,..)]
 249 ##                                              ....
 250 ## p2=65536 -> p2inx=16  delta=3 -> [13;19]
 251
 252 if len(sys.argv) == 4:
 253
 254         p1inx = sizes.index( p1 );
 255         p2inx = sizes.index( p2 );
 256         max_glob_corr = 999990;
 257         max_p1inx = p1inx
 258         max_p2inx = p2inx
 259
 260         ## tweak parameters here to extend/reduce search
 261         search_p1 = 70          # number of values to search +/- around p1
 262         search_p2 = 70          # number of values to search +/- around p2
 263         min_seg_size = 3
 264         if (search_p2 + min_seg_size > len(sizes)):  # reduce min segment sizes when points close to data extrema
 265                 min_seg_size = len(sizes)-search_p2
 266         if (search_p1 - min_seg_size < 0):
 267                 min_seg_size = search_p1
 268
 269         lb1 = max( 1, p1inx-search_p1 )
 270         ub1 = min( p1inx+search_p1, p2inx);
 271         lb2 = max( p1inx, p2inx-search_p2)      # breakpoint +/- delta
 272         ub2 = min( p2inx+search_p2, len(sizes)-1);
 273
 274         print("** evaluating over \n");
 275         print("interv1:\t %d <--- %d ---> %d" % (sizes[lb1],p1,sizes[ub1]))
 276         print("rank:   \t (%d)<---(%d)--->(%d)\n" % (lb1,p1inx,ub1))
 277         print("interv2:\t\t %d <--- %d ---> %d" % (sizes[lb2],p2,sizes[ub2]))
 278         print("rank:   \t\t(%d)<---(%d)--->(%d)\n" % (lb2,p2inx,ub2))
 279
 280         result = list()
 281         for i in range(lb1,ub1+1):
 282                 for j in range(lb2,ub2+1):
 283                         if i<j:         # segments must not overlap
 284                                 if i+1 >=min_seg_size and j-i+1 >= min_seg_size and len(sizes)-1-j >= min_seg_size : # not too small segments
 285                                         #print("** i=%d,j=%d" % (i,j))
 286                                         segments = [(0,i),(i,j),(j,len(sizes)-1)]
 287                                         result.append( correl_split_weighted_logerr( sizes, timings, segments) )  # add pair (metric,interval)
 288
 289         # sort results on ascending metric: ok for logerr. Add "reverse=true" for desc sort if you use  a correlation metric
 290         result = sorted( result, key=lambda pair: pair[0])
 291
 292
 293         top_n_sol=5;  # tweak to display top best n solution
 294
 295         print("#-------------------- result summary ---------------------------------------------------------------------\n");
 296
 297         for k in range(top_n_sol):
 298                 (err,interval) = result[k]
 299                 print("\n   RANK {}\n-------".format(k))
 300                 print("** overall metric = {0}".format(err))
 301                 for (a,b,i,j,e) in interval:
 302                         print("** OPT: [{0} .. {1}] segment_metric={2}          slope: {3} x + {4}".format(i,j,e,a,b))
 303
 304
 305         print("\n\n\n")
 306
 307         print("#-------------------- Best Solution : cut here the gnuplot code -----------------------------------------------------------\n");
 308         preamble='set output "regr.eps"\n\
 309 set terminal postscript eps color\n\
 310 set key left\n\
 311 set xlabel "Each message size in bytes"\n\
 312 set ylabel "Time in us"\n\
 313 set logscale x\n\
 314 set logscale y\n\
 315 set grid'
 316
 317         print(preamble);
 318         print('plot "%s" u 3:4:($5) with errorbars title "skampi traces %s",\\' % (sys.argv[1],sys.argv[1]));
 319         (err,interval) = result[0]
 320         for (a,b,i,j,e) in interval:
 321                 print('"%s" u (%d<=$3 && $3<=%d? $3:0/0):(%f*($3)+%f) w linespoints title "regress. %s-%s bytes",\\' % (sys.argv[1],i,j,a,b,i,j))
 322
 323         print("#-------------------- /cut here the gnuplot code -----------------------------------------------------------\n");
 324
 325
 326 else:
 327         print('\n** Linear regression on %d values **\n' % (nblines))
 328         print('\n   sizes=',sizes,'\n\n')
 329         avg_sizes = avg( sizes )
 330         avg_timings = avg( timings )
 331         print("avg_timings=%f, avg_sizes=%f, nblines=%d\n" % (avg_timings,avg_sizes,nblines))
 332
 333         S_XY= cov( sizes, timings )
 334         S_X2 = variance(  sizes )
 335         S_Y2 = variance(  timings )  # to compute correlation
 336
 337         a = S_XY/S_X2
 338         correl = S_XY/(sqrt(S_X2)*sqrt(S_Y2))  # corealation coeff (Bravais-Pearson)
 339
 340
 341         b= avg_timings - a * avg_sizes
 342         print("[S_XY=%f, S_X2=%f]\n[correlation=%f, coeff det=%f]\n[a=%f, b=%f]\n" % (S_XY, S_X2, correl,correl**2,a, b))
 343
 344