contrib/network_model/regress.py

   1 #!/usr/bin/env python
   2
   3 # Copyright (c) 2010-2011, 2014. The SimGrid Team.
   4 # All rights reserved.
   5
   6 # This program is free software; you can redistribute it and/or modify it
   7 # under the terms of the license (GNU LGPL) which comes with this package.
   8
   9 #---------------------------------------------------------------------------------------------------
  10 # Example invokation:
  11 # % ./regress.py griffon_skampi_pt2pt.ski.dat 65536 120832
  12 #
  13 #
  14 # Given two vectors of same length n: message size S(.. s_i ..), and communication time T( .. t_i .. )
  15 # where t_i is the time associated to a mesage size s_i, computes the segmentation of the vectors
  16 # in 3 segments such that linear regressions on the 3 segments maximize correlation.
  17 # The metric for correlation is now the cumulative, log error of the estimated time value given
  18 # by regression, to the real time value.
  19 #
  20 # Call r(X[i:j],Y[i:j]) = ax + b  the regression line for data of X,Y between indices i and j
  21 # Call E[i:j] = E( e_i, .., e_j) the vector of estimates from the regression, where e_i = a*S[i] + b
  22 # Call mean_logerr( T,T' ) the average log error of paired elements t_i and t'_i, of vectors len n T and T' resp.
  23 #                 i.e  mean_logerr( T,T' ) =  1/n * Sum_i^n ( exp(| ln(t_i)-ln(t'_i)|) - 1 )
  24 #
  25 # The script computes indices k and l, s.t.
  26 #   mean_logerr( r(S[0:k],T[0:k]) , E[0,k] ) +
  27 #   mean_logerr( r(S[k:l],T[k:l]) , E[k,l] ) +
  28 #   mean_logerr( r(S[l:n],T[l:n]) , E[l,n] )
  29 # is minimum.
  30 #---------------------------------------------------------------------------------------------------
  31
  32 import sys
  33 from math import sqrt,log,exp
  34
  35
  36 if len(sys.argv) != 2 and len(sys.argv) != 4:
  37         print("Usage : {} datafile".format(sys.argv[0]))
  38         print("or    : {0} datafile p1 p2".format(sys.argv[0]))
  39         print("where : p1 < p2 belongs to sizes in datafiles")
  40         sys.exit(-1)
  41
  42 if len(sys.argv) == 4:
  43         p1=int(sys.argv[2])
  44         p2=int(sys.argv[3])
  45
  46 ##-----------------------------------------
  47 ## avg : return average of a list of values
  48 ## param l list of values
  49 ##-----------------------------------------
  50 def avg (l):
  51         sum=0
  52         for e in l:
  53                 sum=sum+e;
  54         return sum/len(l)
  55
  56 ##-------------------------------------------------
  57 ## cov : covariance
  58 ## param X first data vector (..x_i..)
  59 ## param Y second data vector (..y_i..)
  60 ## = 1/n \Sum_{i=1}^n (x_i - avg(x)) * (y_i - avg(y))
  61 ##--------------------------------------------------
  62 def cov(X,Y):
  63         assert(len(X)==len(Y))
  64         n=len(X)   #  n=len(X)=len(Y)
  65         avg_X = avg( X )
  66         avg_Y = avg( Y )
  67         S_XY=0
  68         for i in range(n):
  69                 S_XY = S_XY + ((X[i]-avg_X)*(Y[i]-avg_Y))
  70
  71         return (S_XY/n)
  72
  73
  74 ##----------------------------------
  75 ## variance : variance
  76 ## param X data vector ( ..x_i.. )
  77 ## (S_X)^2 = (Sum ( x_i - avg(X) )^2 ) / n
  78 ##----------------------------------
  79 def variance( X ):
  80         S_X2 = 0
  81         n = len( X )
  82         avg_X = avg ( X )
  83         for i in range(n):
  84                 S_X2 = S_X2 + ((X[i] - avg_X)**2)
  85
  86         return (S_X2/n)
  87
  88 ##----------------------------------
  89 ## mean log_error
  90 ## param X data vector ( ..x_i.. ), length n
  91 ## param Y data vector ( ..y_i.. ), length n
  92 ## return mean(   1/n * Sum_i^n ( exp(| ln(x_i)-ln(y_i)|) - 1 )
  93 ##----------------------------------
  94 def mean_logerr( X,Y ):
  95         assert( len(X) == len(Y) )
  96         E = list();                     # the list of errors
  97         for i in range(len(X)):
  98                 E.append( exp(abs(log(X[i])-log(Y[i])))-1 )
  99         return (avg( E ))
 100
 101
 102 ##-----------------------------------------------------------------------------------------------
 103 ## correl_split_weighted_logerr : compute regression on each segment and
 104 ## return the weigthed sum of correlation coefficients
 105 ## param X first data vector (..x_i..)
 106 ## param Y second data vector (..x_i..)
 107 ## param segments list of pairs (i,j) where i refers to the ith value in X, and  jth value in X
 108 ## return (C,[(i1,j1,X[i1],X[j1]), (i2,j2,X[i2],X[j2]), ....]
 109 ##    where i1,j1 is the first segment,  c1 the correlation coef on this segment, n1 the number of values
 110 ##          i2,j2 is the second segment, c2 the correlation coef on this segment, n2 the number of values
 111 ##          ...
 112 ##    and C=c1/n1+c2/n2+...
 113 ##-----------------------------------------------------------------------------------------------
 114 def correl_split_weighted_logerr( X , Y , segments ):
 115         # expects segments = [(0,i1-1),(i1-1,i2-1),(i2,len-1)]
 116         correl = list()
 117         interv = list()  # regr. line coeffs and range
 118         glob_err=0
 119         for (start,stop) in segments:
 120                 #if start==stop :
 121                 #       return 0
 122                 S_XY= cov( X [start:stop+1], Y [start:stop+1] )
 123                 S_X2 = variance(  X [start:stop+1] )
 124                 a = S_XY/S_X2                           # regr line coeffs
 125                 b = avg ( Y[start:stop+1] ) - a * avg( X[start:stop+1] )
 126                 # fill a vector (Z) with predicted values from regression
 127                 Z=list()
 128                 for i in range(start,stop+1):
 129                         Z.append( a * X[i] + b )
 130                 # compare real values and computed values
 131                 e = mean_logerr( Y[start:stop+1] , Z )
 132                 correl.append( (e, stop-start+1) );   # store correl. coef + number of values (segment length)
 133                 interv.append( (a,b, X[start],X[stop],e) );
 134
 135         for (e,l) in correl:
 136                 glob_err = glob_err + (e*l/len( X ))  # the average log err for this segment (e) is
 137                                                                 # weighted by the number of values of the segment (l) out of the total number of values
 138
 139         #print("-> glob_corr={}\n".format(glob_corr))
 140         return (glob_err,interv);
 141
 142 ##-----------------------------------------------------------------------------------------------
 143 ## correl_split_weighted : compute regression on each segment and
 144 ## return the weigthed sum of correlation coefficients
 145 ## param X first data vector (..x_i..)
 146 ## param Y second data vector (..x_i..)
 147 ## param segments list of pairs (i,j) where i refers to the ith value in X, and  jth value in X
 148 ## return (C,[(i1,j1,X[i1],X[j1]), (i2,j2,X[i2],X[j2]), ....]
 149 ##    where i1,j1 is the first segment,  c1 the correlation coef on this segment, n1 the number of values
 150 ##          i2,j2 is the second segment, c2 the correlation coef on this segment, n2 the number of values
 151 ##          ...
 152 ##    and C=c1/n1+c2/n2+...
 153 ##-----------------------------------------------------------------------------------------------
 154 def correl_split_weighted( X , Y , segments ):
 155         # expects segments = [(0,i1-1),(i1-1,i2-1),(i2,len-1)]
 156         correl = list();
 157         interv = list();  # regr. line coeffs and range
 158         glob_corr=0
 159         sum_nb_val=0
 160         for (start,stop) in segments:
 161                 sum_nb_val = sum_nb_val + stop - start;
 162                 #if start==stop :
 163                 #       return 0
 164                 S_XY= cov( X [start:stop+1], Y [start:stop+1] )
 165                 S_X2 = variance(  X [start:stop+1] )
 166                 S_Y2 = variance(  Y [start:stop+1] )  # to compute correlation
 167                 if S_X2*S_Y2 == 0:
 168                         return (0,[])
 169                 c = S_XY/(sqrt(S_X2)*sqrt(S_Y2))
 170                 a = S_XY/S_X2                           # regr line coeffs
 171                 b= avg ( Y[start:stop+1] ) - a * avg( X[start:stop+1] )
 172                 #print("   range [%d,%d] corr=%f, coeff det=%f [a=%f, b=%f]" % (X[start],X[stop],c,c**2,a, b))
 173                 correl.append( (c, stop-start) );   # store correl. coef + number of values (segment length)
 174                 interv.append( (a,b, X[start],X[stop]) );
 175
 176         for (c,l) in correl:
 177                 glob_corr = glob_corr + (l/sum_nb_val)*c  # weighted product of correlation
 178                 #print('-- %f * %f' % (c,l/sum_nb_val))
 179
 180         #print("-> glob_corr={}\n".format(glob_corr))
 181         return (glob_corr,interv);
 182
 183
 184
 185
 186 ##-----------------------------------------------------------------------------------------------
 187 ## correl_split : compute regression on each segment and
 188 ## return the product of correlation coefficient
 189 ## param X first data vector (..x_i..)
 190 ## param Y second data vector (..x_i..)
 191 ## param segments list of pairs (i,j) where i refers to the ith value in X, and  jth value in X
 192 ## return (C,[(i1,j1,X[i1],X[j1]), (i2,j2,X[i2],X[j2]), ....]
 193 ##    where i1,j1 is the first segment,  c1 the correlation coef on this segment,
 194 ##          i2,j2 is the second segment, c2 the correlation coef on this segment,
 195 ##          ...
 196 ##    and C=c1*c2*...
 197 ##-----------------------------------------------------------------------------------------------
 198 def correl_split( X , Y , segments ):
 199         # expects segments = [(0,i1-1),(i1-1,i2-1),(i2,len-1)]
 200         correl = list();
 201         interv = list();  # regr. line coeffs and range
 202         glob_corr=1
 203         for (start,stop) in segments:
 204                 #if start==stop :
 205                 #       return 0
 206                 S_XY= cov( X [start:stop+1], Y [start:stop+1] )
 207                 S_X2 = variance(  X [start:stop+1] )
 208                 S_Y2 = variance(  Y [start:stop+1] )  # to compute correlation
 209                 if S_X2*S_Y2 == 0:
 210                         return (0,[])
 211                 c = S_XY/(sqrt(S_X2)*sqrt(S_Y2))
 212                 a = S_XY/S_X2                           # regr line coeffs
 213                 b= avg ( Y[start:stop+1] ) - a * avg( X[start:stop+1] )
 214                 #print("   range [%d,%d] corr=%f, coeff det=%f [a=%f, b=%f]" % (X[start],X[stop],c,c**2,a, b))
 215                 correl.append( (c, stop-start) );   # store correl. coef + number of values (segment length)
 216                 interv.append( (a,b, X[start],X[stop]) );
 217
 218         for (c,l) in correl:
 219                 glob_corr = glob_corr * c  # product of correlation coeffs
 220         return (glob_corr,interv);
 221
 222
 223
 224 ##-----------------------------------------------------------------------------------------------
 225 ## main
 226 ##-----------------------------------------------------------------------------------------------
 227 sum=0
 228 nblines=0
 229 skampidat = open(sys.argv[1], "r")
 230
 231
 232 ## read data from skampi logs.
 233 timings = []
 234 sizes = []
 235 readdata =[]
 236 for line in skampidat:
 237         l = line.split();
 238         if line[0] != '#' and len(l) >= 3:   # is it a comment ?
 239         ## expected format
 240         ## ---------------
 241         #count= 8388608  8388608  144916.1       7.6       32  144916.1  143262.0
 242         #("%s %d %d %f %f %d %f %f\n" % (countlbl, count, countn, time, stddev, iter, mini, maxi)
 243                 readdata.append( (int(l[1]),float(l[3])) );
 244                 nblines=nblines+1
 245
 246 ## These may not be sorted so sort it by message size before processing.
 247 sorteddata = sorted( readdata, key=lambda pair: pair[0])
 248 sizes,timings = zip(*sorteddata);
 249
 250 # zip makes tuples; cast to lists for backward compatibility with python2.X
 251 sizes   = list(sizes)
 252 timings = list(timings)
 253
 254 ##----------------------- search for best break points-----------------
 255 ## example
 256 ## p1=2048  -> p1inx=11  delta=3 -> [8;14]
 257 ##                                              8 : segments[(0,7),(8,13),(13,..)]
 258 ##                                              ....
 259 ## p2=65536 -> p2inx=16  delta=3 -> [13;19]
 260
 261 if len(sys.argv) == 4:
 262
 263         p1inx = sizes.index( p1 );
 264         p2inx = sizes.index( p2 );
 265         max_glob_corr = 999990;
 266         max_p1inx = p1inx
 267         max_p2inx = p2inx
 268
 269         ## tweak parameters here to extend/reduce search
 270         search_p1 = 70          # number of values to search +/- around p1
 271         search_p2 = 70          # number of values to search +/- around p2
 272         min_seg_size = 3
 273         if (search_p2 + min_seg_size > len(sizes)):  # reduce min segment sizes when points close to data extrema
 274                 min_seg_size = len(sizes)-search_p2
 275         if (search_p1 - min_seg_size < 0):
 276                 min_seg_size = search_p1
 277
 278         lb1 = max( 1, p1inx-search_p1 )
 279         ub1 = min( p1inx+search_p1, p2inx);
 280         lb2 = max( p1inx, p2inx-search_p2)      # breakpoint +/- delta
 281         ub2 = min( p2inx+search_p2, len(sizes)-1);
 282
 283         print("** evaluating over \n");
 284         print("interv1:\t %d <--- %d ---> %d" % (sizes[lb1],p1,sizes[ub1]))
 285         print("rank:   \t (%d)<---(%d)--->(%d)\n" % (lb1,p1inx,ub1))
 286         print("interv2:\t\t %d <--- %d ---> %d" % (sizes[lb2],p2,sizes[ub2]))
 287         print("rank:   \t\t(%d)<---(%d)--->(%d)\n" % (lb2,p2inx,ub2))
 288
 289         result = list()
 290         for i in range(lb1,ub1+1):
 291                 for j in range(lb2,ub2+1):
 292                         if i<j:         # segments must not overlap
 293                                 if i+1 >=min_seg_size and j-i+1 >= min_seg_size and len(sizes)-1-j >= min_seg_size : # not too small segments
 294                                         #print("** i=%d,j=%d" % (i,j))
 295                                         segments = [(0,i),(i,j),(j,len(sizes)-1)]
 296                                         result.append( correl_split_weighted_logerr( sizes, timings, segments) )  # add pair (metric,interval)
 297
 298         # sort results on ascending metric: ok for logerr. Add "reverse=true" for desc sort if you use  a correlation metric
 299         result = sorted( result, key=lambda pair: pair[0])
 300
 301
 302         top_n_sol=5;  # tweak to display top best n solution
 303
 304         print("#-------------------- result summary ---------------------------------------------------------------------\n");
 305
 306         for k in range(top_n_sol):
 307                 (err,interval) = result[k]
 308                 print(k)
 309                 print("\n   RANK {0}\n-------".format(k))
 310                 print("** overall metric = {0}".format(err))
 311                 for (a,b,i,j,e) in interval:
 312                         print("** OPT: [{0} .. {1}] segment_metric={2}          slope: {3} x + {4}".format(i,j,e,a,b))
 313
 314
 315         print("\n\n\n")
 316
 317         print("#-------------------- Best Solution : cut here the gnuplot code -----------------------------------------------------------\n");
 318         preamble='set output "regr.eps"\n\
 319 set terminal postscript eps color\n\
 320 set key left\n\
 321 set xlabel "Each message size in bytes"\n\
 322 set ylabel "Time in us"\n\
 323 set logscale x\n\
 324 set logscale y\n\
 325 set grid'
 326
 327         print(preamble);
 328         print('plot "%s" u 3:4:($5) with errorbars title "skampi traces %s",\\' % (sys.argv[1],sys.argv[1]));
 329         (err,interval) = result[0]
 330         for (a,b,i,j,e) in interval:
 331                 print('"%s" u (%d<=$3 && $3<=%d? $3:0/0):(%f*($3)+%f) w linespoints title "regress. %s-%s bytes",\\' % (sys.argv[1],i,j,a,b,i,j))
 332
 333         print("#-------------------- /cut here the gnuplot code -----------------------------------------------------------\n");
 334
 335
 336 else:
 337         print('\n** Linear regression on %d values **\n' % (nblines))
 338         print('\n   sizes=',sizes,'\n\n')
 339         avg_sizes = avg( sizes )
 340         avg_timings = avg( timings )
 341         print("avg_timings=%f, avg_sizes=%f, nblines=%d\n" % (avg_timings,avg_sizes,nblines))
 342
 343         S_XY= cov( sizes, timings )
 344         S_X2 = variance(  sizes )
 345         S_Y2 = variance(  timings )  # to compute correlation
 346
 347         a = S_XY/S_X2
 348         correl = S_XY/(sqrt(S_X2)*sqrt(S_Y2))  # corealation coeff (Bravais-Pearson)
 349
 350
 351         b= avg_timings - a * avg_sizes
 352         print("[S_XY=%f, S_X2=%f]\n[correlation=%f, coeff det=%f]\n[a=%f, b=%f]\n" % (S_XY, S_X2, correl,correl**2,a, b))
 353
 354