contrib/network_model/regress.py

   1 #!/usr/bin/env python
   2
   3 #---------------------------------------------------------------------------------------------------
   4 # Example invokation:
   5 # % ./regress.py griffon_skampi_pt2pt.ski.dat 65536 120832
   6 #
   7 #
   8 # Given two vectors of same length n: message size S(.. s_i ..), and communication time T( .. t_i .. )
   9 # where t_i is the time associated to a mesage size s_i, computes the segmentation of the vectors
  10 # in 3 segments such that linear regressions on the 3 segments maximize correlation.
  11 # The metric for correlation is now the cumulative, log error of the estimated time value given
  12 # by regression, to the real time value.
  13 #
  14 # Call r(X[i:j],Y[i:j]) = ax + b  the regression line for data of X,Y between indices i and j
  15 # Call E[i:j] = E( e_i, .., e_j) the vector of estimates from the regression, where e_i = a*S[i] + b
  16 # Call mean_logerr( T,T' ) the average log error of paired elements t_i and t'_i, of vectors len n T and T' resp.
  17 #                 i.e  mean_logerr( T,T' ) =  1/n * Sum_i^n ( exp(| ln(t_i)-ln(t'_i)|) - 1 )
  18 #
  19 # The script computes indices k and l, s.t.
  20 #   mean_logerr( r(S[0:k],T[0:k]) , E[0,k] ) +
  21 #   mean_logerr( r(S[k:l],T[k:l]) , E[k,l] ) +
  22 #   mean_logerr( r(S[l:n],T[l:n]) , E[l,n] )
  23 # is minimum.
  24 #---------------------------------------------------------------------------------------------------
  25
  26 import sys
  27 from math import sqrt,log,exp
  28
  29
  30 if len(sys.argv) != 2 and len(sys.argv) != 4:
  31         print("Usage : {} datafile".format(sys.argv[0]))
  32         print("or    : {0} datafile p1 p2".format(sys.argv[0]))
  33         print("where : p1 < p2 belongs to sizes in datafiles")
  34         sys.exit(-1)
  35
  36 if len(sys.argv) == 4:
  37         p1=int(sys.argv[2])
  38         p2=int(sys.argv[3])
  39
  40 ##-----------------------------------------
  41 ## avg : return average of a list of values
  42 ## param l list of values
  43 ##-----------------------------------------
  44 def avg (l):
  45         sum=0
  46         for e in l:
  47                 sum=sum+e;
  48         return sum/len(l)
  49
  50 ##-------------------------------------------------
  51 ## cov : covariance
  52 ## param X first data vector (..x_i..)
  53 ## param Y second data vector (..y_i..)
  54 ## = 1/n \Sum_{i=1}^n (x_i - avg(x)) * (y_i - avg(y))
  55 ##--------------------------------------------------
  56 def cov(X,Y):
  57         assert(len(X)==len(Y))
  58         n=len(X)   #  n=len(X)=len(Y)
  59         avg_X = avg( X )
  60         avg_Y = avg( Y )
  61         S_XY=0
  62         for i in range(n):
  63                 S_XY = S_XY + ((X[i]-avg_X)*(Y[i]-avg_Y))
  64
  65         return (S_XY/n)
  66
  67
  68 ##----------------------------------
  69 ## variance : variance
  70 ## param X data vector ( ..x_i.. )
  71 ## (S_X)^2 = (Sum ( x_i - avg(X) )^2 ) / n
  72 ##----------------------------------
  73 def variance( X ):
  74         S_X2 = 0
  75         n = len( X )
  76         avg_X = avg ( X )
  77         for i in range(n):
  78                 S_X2 = S_X2 + ((X[i] - avg_X)**2)
  79
  80         return (S_X2/n)
  81
  82 ##----------------------------------
  83 ## mean log_error
  84 ## param X data vector ( ..x_i.. ), length n
  85 ## param Y data vector ( ..y_i.. ), length n
  86 ## return mean(   1/n * Sum_i^n ( exp(| ln(x_i)-ln(y_i)|) - 1 )
  87 ##----------------------------------
  88 def mean_logerr( X,Y ):
  89         assert( len(X) == len(Y) )
  90         E = list();                     # the list of errors
  91         for i in range(len(X)):
  92                 E.append( exp(abs(log(X[i])-log(Y[i])))-1 )
  93         return (avg( E ))
  94
  95
  96 ##-----------------------------------------------------------------------------------------------
  97 ## correl_split_weighted_logerr : compute regression on each segment and
  98 ## return the weigthed sum of correlation coefficients
  99 ## param X first data vector (..x_i..)
 100 ## param Y second data vector (..x_i..)
 101 ## param segments list of pairs (i,j) where i refers to the ith value in X, and  jth value in X
 102 ## return (C,[(i1,j1,X[i1],X[j1]), (i2,j2,X[i2],X[j2]), ....]
 103 ##    where i1,j1 is the first segment,  c1 the correlation coef on this segment, n1 the number of values
 104 ##          i2,j2 is the second segment, c2 the correlation coef on this segment, n2 the number of values
 105 ##          ...
 106 ##    and C=c1/n1+c2/n2+...
 107 ##-----------------------------------------------------------------------------------------------
 108 def correl_split_weighted_logerr( X , Y , segments ):
 109         # expects segments = [(0,i1-1),(i1-1,i2-1),(i2,len-1)]
 110         correl = list()
 111         interv = list()  # regr. line coeffs and range
 112         glob_err=0
 113         for (start,stop) in segments:
 114                 #if start==stop :
 115                 #       return 0
 116                 S_XY= cov( X [start:stop+1], Y [start:stop+1] )
 117                 S_X2 = variance(  X [start:stop+1] )
 118                 a = S_XY/S_X2                           # regr line coeffs
 119                 b = avg ( Y[start:stop+1] ) - a * avg( X[start:stop+1] )
 120                 # fill a vector (Z) with predicted values from regression
 121                 Z=list()
 122                 for i in range(start,stop+1):
 123                         Z.append( a * X[i] + b )
 124                 # compare real values and computed values
 125                 e = mean_logerr( Y[start:stop+1] , Z )
 126                 correl.append( (e, stop-start+1) );   # store correl. coef + number of values (segment length)
 127                 interv.append( (a,b, X[start],X[stop],e) );
 128
 129         for (e,l) in correl:
 130                 glob_err = glob_err + (e*l/len( X ))  # the average log err for this segment (e) is
 131                                                                 # weighted by the number of values of the segment (l) out of the total number of values
 132
 133         #print("-> glob_corr={}\n".format(glob_corr))
 134         return (glob_err,interv);
 135
 136 ##-----------------------------------------------------------------------------------------------
 137 ## correl_split_weighted : compute regression on each segment and
 138 ## return the weigthed sum of correlation coefficients
 139 ## param X first data vector (..x_i..)
 140 ## param Y second data vector (..x_i..)
 141 ## param segments list of pairs (i,j) where i refers to the ith value in X, and  jth value in X
 142 ## return (C,[(i1,j1,X[i1],X[j1]), (i2,j2,X[i2],X[j2]), ....]
 143 ##    where i1,j1 is the first segment,  c1 the correlation coef on this segment, n1 the number of values
 144 ##          i2,j2 is the second segment, c2 the correlation coef on this segment, n2 the number of values
 145 ##          ...
 146 ##    and C=c1/n1+c2/n2+...
 147 ##-----------------------------------------------------------------------------------------------
 148 def correl_split_weighted( X , Y , segments ):
 149         # expects segments = [(0,i1-1),(i1-1,i2-1),(i2,len-1)]
 150         correl = list();
 151         interv = list();  # regr. line coeffs and range
 152         glob_corr=0
 153         sum_nb_val=0
 154         for (start,stop) in segments:
 155                 sum_nb_val = sum_nb_val + stop - start;
 156                 #if start==stop :
 157                 #       return 0
 158                 S_XY= cov( X [start:stop+1], Y [start:stop+1] )
 159                 S_X2 = variance(  X [start:stop+1] )
 160                 S_Y2 = variance(  Y [start:stop+1] )  # to compute correlation
 161                 if S_X2*S_Y2 == 0:
 162                         return (0,[])
 163                 c = S_XY/(sqrt(S_X2)*sqrt(S_Y2))
 164                 a = S_XY/S_X2                           # regr line coeffs
 165                 b= avg ( Y[start:stop+1] ) - a * avg( X[start:stop+1] )
 166                 #print("   range [%d,%d] corr=%f, coeff det=%f [a=%f, b=%f]" % (X[start],X[stop],c,c**2,a, b))
 167                 correl.append( (c, stop-start) );   # store correl. coef + number of values (segment length)
 168                 interv.append( (a,b, X[start],X[stop]) );
 169
 170         for (c,l) in correl:
 171                 glob_corr = glob_corr + (l/sum_nb_val)*c  # weighted product of correlation
 172                 #print('-- %f * %f' % (c,l/sum_nb_val))
 173
 174         #print("-> glob_corr={}\n".format(glob_corr))
 175         return (glob_corr,interv);
 176
 177
 178
 179
 180 ##-----------------------------------------------------------------------------------------------
 181 ## correl_split : compute regression on each segment and
 182 ## return the product of correlation coefficient
 183 ## param X first data vector (..x_i..)
 184 ## param Y second data vector (..x_i..)
 185 ## param segments list of pairs (i,j) where i refers to the ith value in X, and  jth value in X
 186 ## return (C,[(i1,j1,X[i1],X[j1]), (i2,j2,X[i2],X[j2]), ....]
 187 ##    where i1,j1 is the first segment,  c1 the correlation coef on this segment,
 188 ##          i2,j2 is the second segment, c2 the correlation coef on this segment,
 189 ##          ...
 190 ##    and C=c1*c2*...
 191 ##-----------------------------------------------------------------------------------------------
 192 def correl_split( X , Y , segments ):
 193         # expects segments = [(0,i1-1),(i1-1,i2-1),(i2,len-1)]
 194         correl = list();
 195         interv = list();  # regr. line coeffs and range
 196         glob_corr=1
 197         for (start,stop) in segments:
 198                 #if start==stop :
 199                 #       return 0
 200                 S_XY= cov( X [start:stop+1], Y [start:stop+1] )
 201                 S_X2 = variance(  X [start:stop+1] )
 202                 S_Y2 = variance(  Y [start:stop+1] )  # to compute correlation
 203                 if S_X2*S_Y2 == 0:
 204                         return (0,[])
 205                 c = S_XY/(sqrt(S_X2)*sqrt(S_Y2))
 206                 a = S_XY/S_X2                           # regr line coeffs
 207                 b= avg ( Y[start:stop+1] ) - a * avg( X[start:stop+1] )
 208                 #print("   range [%d,%d] corr=%f, coeff det=%f [a=%f, b=%f]" % (X[start],X[stop],c,c**2,a, b))
 209                 correl.append( (c, stop-start) );   # store correl. coef + number of values (segment length)
 210                 interv.append( (a,b, X[start],X[stop]) );
 211
 212         for (c,l) in correl:
 213                 glob_corr = glob_corr * c  # product of correlation coeffs
 214         return (glob_corr,interv);
 215
 216
 217
 218 ##-----------------------------------------------------------------------------------------------
 219 ## main
 220 ##-----------------------------------------------------------------------------------------------
 221 sum=0
 222 nblines=0
 223 skampidat = open(sys.argv[1], "r")
 224
 225
 226 ## read data from skampi logs.
 227 timings = []
 228 sizes = []
 229 readdata =[]
 230 for line in skampidat:
 231         l = line.split();
 232         if line[0] != '#' and len(l) >= 3:   # is it a comment ?
 233         ## expected format
 234         ## ---------------
 235         #count= 8388608  8388608  144916.1       7.6       32  144916.1  143262.0
 236         #("%s %d %d %f %f %d %f %f\n" % (countlbl, count, countn, time, stddev, iter, mini, maxi)
 237                 readdata.append( (int(l[1]),float(l[3])) );
 238                 nblines=nblines+1
 239
 240 ## These may not be sorted so sort it by message size before processing.
 241 sorteddata = sorted( readdata, key=lambda pair: pair[0])
 242 sizes,timings = zip(*sorteddata);
 243
 244 # zip makes tuples; cast to lists for backward compatibility with python2.X
 245 sizes   = list(sizes)
 246 timings = list(timings)
 247
 248 ##----------------------- search for best break points-----------------
 249 ## example
 250 ## p1=2048  -> p1inx=11  delta=3 -> [8;14]
 251 ##                                              8 : segments[(0,7),(8,13),(13,..)]
 252 ##                                              ....
 253 ## p2=65536 -> p2inx=16  delta=3 -> [13;19]
 254
 255 if len(sys.argv) == 4:
 256
 257         p1inx = sizes.index( p1 );
 258         p2inx = sizes.index( p2 );
 259         max_glob_corr = 999990;
 260         max_p1inx = p1inx
 261         max_p2inx = p2inx
 262
 263         ## tweak parameters here to extend/reduce search
 264         search_p1 = 70          # number of values to search +/- around p1
 265         search_p2 = 70          # number of values to search +/- around p2
 266         min_seg_size = 3
 267         if (search_p2 + min_seg_size > len(sizes)):  # reduce min segment sizes when points close to data extrema
 268                 min_seg_size = len(sizes)-search_p2
 269         if (search_p1 - min_seg_size < 0):
 270                 min_seg_size = search_p1
 271
 272         lb1 = max( 1, p1inx-search_p1 )
 273         ub1 = min( p1inx+search_p1, p2inx);
 274         lb2 = max( p1inx, p2inx-search_p2)      # breakpoint +/- delta
 275         ub2 = min( p2inx+search_p2, len(sizes)-1);
 276
 277         print("** evaluating over \n");
 278         print("interv1:\t %d <--- %d ---> %d" % (sizes[lb1],p1,sizes[ub1]))
 279         print("rank:   \t (%d)<---(%d)--->(%d)\n" % (lb1,p1inx,ub1))
 280         print("interv2:\t\t %d <--- %d ---> %d" % (sizes[lb2],p2,sizes[ub2]))
 281         print("rank:   \t\t(%d)<---(%d)--->(%d)\n" % (lb2,p2inx,ub2))
 282
 283         result = list()
 284         for i in range(lb1,ub1+1):
 285                 for j in range(lb2,ub2+1):
 286                         if i<j:         # segments must not overlap
 287                                 if i+1 >=min_seg_size and j-i+1 >= min_seg_size and len(sizes)-1-j >= min_seg_size : # not too small segments
 288                                         #print("** i=%d,j=%d" % (i,j))
 289                                         segments = [(0,i),(i,j),(j,len(sizes)-1)]
 290                                         result.append( correl_split_weighted_logerr( sizes, timings, segments) )  # add pair (metric,interval)
 291
 292         # sort results on ascending metric: ok for logerr. Add "reverse=true" for desc sort if you use  a correlation metric
 293         result = sorted( result, key=lambda pair: pair[0])
 294
 295
 296         top_n_sol=5;  # tweak to display top best n solution
 297
 298         print("#-------------------- result summary ---------------------------------------------------------------------\n");
 299
 300         for k in range(top_n_sol):
 301                 (err,interval) = result[k]
 302                 print(k)
 303                 print("\n   RANK {0}\n-------".format(k))
 304                 print("** overall metric = {0}".format(err))
 305                 for (a,b,i,j,e) in interval:
 306                         print("** OPT: [{0} .. {1}] segment_metric={2}          slope: {3} x + {4}".format(i,j,e,a,b))
 307
 308
 309         print("\n\n\n")
 310
 311         print("#-------------------- Best Solution : cut here the gnuplot code -----------------------------------------------------------\n");
 312         preamble='set output "regr.eps"\n\
 313 set terminal postscript eps color\n\
 314 set key left\n\
 315 set xlabel "Each message size in bytes"\n\
 316 set ylabel "Time in us"\n\
 317 set logscale x\n\
 318 set logscale y\n\
 319 set grid'
 320
 321         print(preamble);
 322         print('plot "%s" u 3:4:($5) with errorbars title "skampi traces %s",\\' % (sys.argv[1],sys.argv[1]));
 323         (err,interval) = result[0]
 324         for (a,b,i,j,e) in interval:
 325                 print('"%s" u (%d<=$3 && $3<=%d? $3:0/0):(%f*($3)+%f) w linespoints title "regress. %s-%s bytes",\\' % (sys.argv[1],i,j,a,b,i,j))
 326
 327         print("#-------------------- /cut here the gnuplot code -----------------------------------------------------------\n");
 328
 329
 330 else:
 331         print('\n** Linear regression on %d values **\n' % (nblines))
 332         print('\n   sizes=',sizes,'\n\n')
 333         avg_sizes = avg( sizes )
 334         avg_timings = avg( timings )
 335         print("avg_timings=%f, avg_sizes=%f, nblines=%d\n" % (avg_timings,avg_sizes,nblines))
 336
 337         S_XY= cov( sizes, timings )
 338         S_X2 = variance(  sizes )
 339         S_Y2 = variance(  timings )  # to compute correlation
 340
 341         a = S_XY/S_X2
 342         correl = S_XY/(sqrt(S_X2)*sqrt(S_Y2))  # corealation coeff (Bravais-Pearson)
 343
 344
 345         b= avg_timings - a * avg_sizes
 346         print("[S_XY=%f, S_X2=%f]\n[correlation=%f, coeff det=%f]\n[a=%f, b=%f]\n" % (S_XY, S_X2, correl,correl**2,a, b))
 347
 348