contrib/network_model/regress.py

   1 #!/usr/bin/env python
   2
   3 import sys
   4 from math import sqrt
   5
   6
   7 if len(sys.argv) != 2 and len(sys.argv) != 4:
   8         print("Usage : {} datafile".format(sys.argv[0]))
   9         print("or    : {} datafile p1 p2".format(sys.argv[0]))
  10         print("where : p1 < p2 belongs to sizes in datafiles")
  11         sys.exit(-1)
  12
  13 if len(sys.argv) == 4:
  14         p1=int(sys.argv[2])
  15         p2=int(sys.argv[3])
  16
  17 ##-----------------------------------------
  18 ## avg : return average of a list of values
  19 ## param l list of values
  20 ##-----------------------------------------
  21 def avg (l):
  22         sum=0
  23         for e in l:
  24                 sum=sum+e;
  25         return sum/len(l)
  26
  27 ##-------------------------------------------------
  28 ## cov : covariance
  29 ## param X first data vector (..x_i..)
  30 ## param Y second data vector (..y_i..)
  31 ## = 1/n \Sum_{i=1}^n (x_i - avg(x)) * (y_i - avg(y))
  32 ##--------------------------------------------------
  33 def cov(X,Y):
  34
  35         assert(len(X)==len(Y))
  36         n=len(X)   #  n=len(X)=len(Y)
  37         avg_X = avg( X )
  38         avg_Y = avg( Y )
  39         S_XY=0
  40         for i in range(n):
  41                 S_XY = S_XY + ((X[i]-avg_X)*(Y[i]-avg_Y))
  42
  43         return (S_XY/n)
  44
  45
  46 ##----------------------------------
  47 ## variance : variance
  48 ## param X data vector ( ..x_i.. )
  49 ## (S_X)^2 = (Sum ( x_i - avg(x) )^2 ) / n
  50 ##----------------------------------
  51 def variance( X ):
  52         S_X2 = 0
  53         n = len( X )
  54         avg_X = avg ( X )
  55         for i in range(n):
  56                 S_X2 = S_X2 + ((X[i] - avg_X)**2)
  57
  58         return (S_X2/n)
  59
  60 ##-----------------------------------------------------------------------------------------------
  61 ## correl_split_weighted : compute regression on each segment and
  62 ## return the weigthed sum of correlation coefficients
  63 ## param X first data vector (..x_i..)
  64 ## param Y second data vector (..x_i..)
  65 ## param segments list of pairs (i,j) where i refers to the ith value in X, and  jth value in X
  66 ## return (C,[(i1,j1,X[i1],X[j1]), (i2,j2,X[i2],X[j2]), ....]
  67 ##    where i1,j1 is the first segment,  c1 the correlation coef on this segment, n1 the number of values
  68 ##          i2,j2 is the second segment, c2 the correlation coef on this segment, n2 the number of values
  69 ##          ...
  70 ##    and C=c1/n1+c2/n2+...
  71 ##-----------------------------------------------------------------------------------------------
  72 def correl_split_weighted( X , Y , segments ):
  73         # expects segments = [(0,i1-1),(i1-1,i2-1),(i2,len-1)]
  74         correl = list();
  75         interv = list();  # regr. line coeffs and range
  76         glob_corr=0
  77         sum_nb_val=0
  78         for (start,stop) in segments:
  79                 sum_nb_val = sum_nb_val + stop - start;
  80                 #if start==stop :
  81                 #       return 0
  82                 S_XY= cov( X [start:stop+1], Y [start:stop+1] )
  83                 S_X2 = variance(  X [start:stop+1] )
  84                 S_Y2 = variance(  Y [start:stop+1] )  # to compute correlation
  85                 if S_X2*S_Y2 == 0:
  86                         return (0,[])
  87                 c = S_XY/(sqrt(S_X2)*sqrt(S_Y2))
  88                 a = S_XY/S_X2                           # regr line coeffs
  89                 b= avg ( Y[start:stop+1] ) - a * avg( X[start:stop+1] )
  90                 print("   range [%d,%d] corr=%f, coeff det=%f [a=%f, b=%f]" % (X[start],X[stop],c,c**2,a, b))
  91                 correl.append( (c, stop-start) );   # store correl. coef + number of values (segment length)
  92                 interv.append( (a,b, X[start],X[stop]) );
  93
  94         for (c,l) in correl:
  95                 glob_corr = glob_corr + (l/sum_nb_val)*c  # weighted product of correlation
  96                 print('-- %f * %f' % (c,l/sum_nb_val))
  97
  98         print("-> glob_corr={}\n".format(glob_corr))
  99         return (glob_corr,interv);
 100
 101
 102
 103
 104 ##-----------------------------------------------------------------------------------------------
 105 ## correl_split : compute regression on each segment and
 106 ## return the product of correlation coefficient
 107 ## param X first data vector (..x_i..)
 108 ## param Y second data vector (..x_i..)
 109 ## param segments list of pairs (i,j) where i refers to the ith value in X, and  jth value in X
 110 ## return (C,[(i1,j1,X[i1],X[j1]), (i2,j2,X[i2],X[j2]), ....]
 111 ##    where i1,j1 is the first segment,  c1 the correlation coef on this segment,
 112 ##          i2,j2 is the second segment, c2 the correlation coef on this segment,
 113 ##          ...
 114 ##    and C=c1*c2*...
 115 ##-----------------------------------------------------------------------------------------------
 116 def correl_split( X , Y , segments ):
 117         # expects segments = [(0,i1-1),(i1-1,i2-1),(i2,len-1)]
 118         correl = list();
 119         interv = list();  # regr. line coeffs and range
 120         glob_corr=1
 121         for (start,stop) in segments:
 122                 #if start==stop :
 123                 #       return 0
 124                 S_XY= cov( X [start:stop+1], Y [start:stop+1] )
 125                 S_X2 = variance(  X [start:stop+1] )
 126                 S_Y2 = variance(  Y [start:stop+1] )  # to compute correlation
 127                 if S_X2*S_Y2 == 0:
 128                         return (0,[])
 129                 c = S_XY/(sqrt(S_X2)*sqrt(S_Y2))
 130                 a = S_XY/S_X2                           # regr line coeffs
 131                 b= avg ( Y[start:stop+1] ) - a * avg( X[start:stop+1] )
 132                 print("   range [%d,%d] corr=%f, coeff det=%f [a=%f, b=%f]" % (X[start],X[stop],c,c**2,a, b))
 133                 correl.append( (c, stop-start) );   # store correl. coef + number of values (segment length)
 134                 interv.append( (a,b, X[start],X[stop]) );
 135
 136         for (c,l) in correl:
 137                 glob_corr = glob_corr * c  # product of correlation coeffs
 138         print("-> glob_corr=%f\n" % glob_corr)
 139         return (glob_corr,interv);
 140
 141
 142
 143 ##-----------------------------------------------------------------------------------------------
 144 ## main
 145 ##-----------------------------------------------------------------------------------------------
 146 sum=0
 147 nblines=0
 148 skampidat = open(sys.argv[1], "r")
 149
 150
 151 ## read data from skampi logs.
 152 timings = []
 153 sizes = []
 154 readdata =[]
 155 for line in skampidat:
 156         l = line.split();
 157         if line[0] != '#' and len(l) >= 3:   # is it a comment ?
 158         ## expected format
 159         ## ---------------
 160         #count= 8388608  8388608  144916.1       7.6       32  144916.1  143262.0
 161         #("%s %d %d %f %f %d %f %f\n" % (countlbl, count, countn, time, stddev, iter, mini, maxi)
 162                 readdata.append( (int(l[1]),float(l[3])) );
 163                 nblines=nblines+1
 164
 165 ## These may not be sorted so sort it by message size before processing.
 166 sorteddata = sorted( readdata, key=lambda pair: pair[0])
 167 sizes,timings = zip(*sorteddata);
 168
 169
 170 ##----------------------- search for best break points-----------------
 171 ## example
 172 ## p1=2048  -> p1inx=11  delta=3 -> [8;14]
 173 ##                                              8 : segments[(0,7),(8,13),(13,..)]
 174 ##                                              ....
 175 ## p2=65536 -> p2inx=16  delta=3 -> [13;19]
 176
 177 if len(sys.argv) == 4:
 178
 179         p1inx = sizes.index( p1 );
 180         p2inx = sizes.index( p2 );
 181         max_glob_corr = 0;
 182         max_p1inx = p1inx
 183         max_p2inx = p2inx
 184
 185         ## tweak parameters here to extend/reduce search
 186         search_p1 = 30          # number of values to search +/- around p1
 187         search_p2 = 65          # number of values to search +/- around p2
 188         min_seg_size = 3
 189
 190         lb1 = max(1, p1inx-search_p1)
 191         ub1 = min(p1inx+search_p1,search_p1, p2inx);
 192         lb2 = max(p1inx,p2inx-search_p2)        # breakpoint +/- delta
 193         ub2 = min(p2inx+search_p2,len(sizes)-1);
 194
 195         print("** evaluating over \n");
 196         print("interv1:\t %d <--- %d ---> %d" % (sizes[lb1],p1,sizes[ub1]))
 197         print("rank:   \t (%d)<---(%d)--->(%d)\n" % (lb1,p1inx,ub1))
 198         print("interv2:\t\t %d <--- %d ---> %d" % (sizes[lb2],p2,sizes[ub2]))
 199         print("rank:   \t\t(%d)<---(%d)--->(%d)\n" % (lb2,p2inx,ub2))
 200         for i in range(lb1,ub1+1):
 201                 for j in range(lb2,ub2+1):
 202                         if i<j:         # segments must not overlap
 203                                 if i+1 >=min_seg_size and j-i+1 >= min_seg_size and len(sizes)-1-j >= min_seg_size : # not too small segments
 204                                         print("** i=%d,j=%d" % (i,j))
 205                                         segments = [(0,i),(i,j),(j,len(sizes)-1)]
 206                                         (glob_cor, interv) = correl_split( sizes, timings, segments)
 207                                         if ( glob_cor > max_glob_corr):
 208                                                 max_glob_corr = glob_cor
 209                                                 max_interv = interv
 210
 211         print("#-------------------- result summary ---------------------------------------------------------------------\n");
 212         for (a,b,i,j) in max_interv:
 213                 print("** OPT: [%d .. %d] correl coef prod=%f           slope: %f x + %f" % (i,j,max_glob_corr,a,b))
 214
 215         print("#-------------------- cut here the gnuplot code -----------------------------------------------------------\n");
 216         preamble='set output "regr.eps"\n\
 217 set terminal postscript eps color\n\
 218 set key left\n\
 219 set xlabel "Each message size in bytes"\n\
 220 set ylabel "Time in us"\n\
 221 set logscale x\n\
 222 set logscale y\n\
 223 set grid'
 224
 225         print(preamble);
 226         print('plot "%s" u 3:4:($5) with errorbars title "skampi traces %s",\\' % (sys.argv[1],sys.argv[1]));
 227         for (a,b,i,j) in max_interv:
 228                 print('"%s" u (%d<=$3 && $3<=%d? $3:0/0):(%f*($3)+%f) w linespoints title "regress. %s-%s bytes",\\' % (sys.argv[1],i,j,a,b,i,j))
 229
 230         print("#-------------------- /cut here the gnuplot code -----------------------------------------------------------\n");
 231
 232
 233 else:
 234         print('\n** Linear regression on %d values **\n' % (nblines))
 235         print('\n   sizes=',sizes,'\n\n')
 236         avg_sizes = avg( sizes )
 237         avg_timings = avg( timings )
 238         print("avg_timings=%f, avg_sizes=%f, nblines=%d\n" % (avg_timings,avg_sizes,nblines))
 239
 240         S_XY= cov( sizes, timings )
 241         S_X2 = variance(  sizes )
 242         S_Y2 = variance(  timings )  # to compute correlation
 243
 244         a = S_XY/S_X2;
 245         correl = S_XY/(sqrt(S_X2)*sqrt(S_Y2))  # corealation coeff (Bravais-Pearson)
 246
 247
 248         b= avg_timings - a * avg_sizes
 249         print("[S_XY=%f, S_X2=%f]\n[correlation=%f, coeff det=%f]\n[a=%f, b=%f]\n" % (S_XY, S_X2, correl,correl**2,a, b))
 250
 251