#------------------------------------------------------------------------------- # Copyright (c) 2012 University of Illinois, NCSA. # All rights reserved. This program and the accompanying materials # are made available under the terms of the # University of Illinois/NCSA Open Source License # which accompanies this distribution, and is available at # http://opensource.ncsa.illinois.edu/license.html #------------------------------------------------------------------------------- ##--------------------------------------------------------------------------------------------------# ##' Variable-width (dagonally cut) histogram ##' ##' ##' When constructing a histogram, it is common to make all bars the same width. ##' One could also choose to make them all have the same area. ##' These two options have complementary strengths and weaknesses; the equal-width histogram oversmooths in regions of high density, and is poor at identifying sharp peaks; the equal-area histogram oversmooths in regions of low density, and so does not identify outliers. ##' We describe a compromise approach which avoids both of these defects. We regard the histogram as an exploratory device, rather than as an estimate of a density. ##' @name dhist ##' @title Diagonally Cut Histogram ##' @param x is a numeric vector (the data) ##' @param a is the scaling factor, default is 5 * IQR ##' @param nbins is the number of bins, default is assigned by the Stuges method ##' @param rx is the range used for the left of the left-most bin to the right of the right-most bin ##' @param eps used to set artificial bound on min width / max height of bins as described in Denby and Mallows (2009) on page 24. ##' @param xlab is label for the x axis ##' @param plot = TRUE produces the plot, FALSE returns the heights, breaks and counts ##' @param lab.spikes = TRUE labels the \% of data in the spikes ##' @return list with two elements, heights of length n and breaks of length n+1 indicating the heights and break points of the histogram bars. ##' @author Lorraine Denby, Colin Mallows ##' @references Lorraine Denby, Colin Mallows. Journal of Computational and Graphical Statistics. March 1, 2009, 18(1): 21-31. doi:10.1198/jcgs.2009.0002. dhist <- function(x, a=5*iqr(x), nbins=nclass.Sturges(x), rx = range(x,na.rm = TRUE), eps=.15, xlab = "x", plot = TRUE,lab.spikes = TRUE) { if(is.character(nbins)) nbins <- switch(casefold(nbins), sturges = nclass.Sturges(x), fd = nclass.FD(x), scott = nclass.scott(x), stop("Nclass method not recognized")) else if(is.function(nbins)) nbins <- nbins(x) x <- sort(x[!is.na(x)]) if(a == 0) a <- diff(range(x))/100000000 if(a != 0 & a != Inf) { n <- length(x) h <- (rx[2] + a - rx[1])/nbins ybr <- rx[1] + h * (0:nbins) yupper <- x + (a * (1:n))/n # upper and lower corners in the ecdf ylower <- yupper - a/n # cmtx <- cbind(cut(yupper, breaks = ybr), cut(yupper, breaks = ybr, left.include = TRUE), cut(ylower, breaks = ybr), cut(ylower, breaks = ybr, left.include = TRUE)) cmtx[1, 3] <- cmtx[1, 4] <- 1 # to replace NAs when default r is used cmtx[n, 1] <- cmtx[n, 2] <- nbins # #checksum <- apply(cmtx, 1, sum) %% 4 checksum <- (cmtx[, 1] + cmtx[, 2] + cmtx[, 3] + cmtx[, 4]) %% 4 # will be 2 for obs. that straddle two bins straddlers <- (1:n)[checksum == 2] # to allow for zero counts if(length(straddlers) > 0) { counts <- table(c(1:nbins, cmtx[ - straddlers, 1])) } else { counts <- table(c(1:nbins, cmtx[, 1])) } counts <- counts - 1 # if(length(straddlers) > 0) { for(i in straddlers) { binno <- cmtx[i, 1] theta <- ((yupper[i] - ybr[binno]) * n)/a counts[binno - 1] <- counts[binno - 1] + ( 1 - theta) counts[binno] <- counts[binno] + theta } } xbr <- ybr xbr[-1] <- ybr[-1] - (a * cumsum(counts))/n spike<-eps*diff(rx)/nbins flag.vec<-c(diff(xbr)1) { xbr.new<-xbr counts.new<-counts diff.xbr<-abs(diff(xbr)) amt.spike<-diff.xbr[length(diff.xbr)] for (i in rev(2:length(diff.xbr))) { if (diff.xbr[i-1] <= spike&diff.xbr[i] <= spike & !is.na(diff.xbr[i])) { amt.spike <- amt.spike+diff.xbr[i-1] counts.new[i-1] <- counts.new[i-1]+counts.new[i] xbr.new[i] <- NA counts.new[i] <- NA flag.vec[i-1] <- T } else amt.spike<-diff.xbr[i-1] } flag.vec<-flag.vec[!is.na(xbr.new)] flag.vec<-flag.vec[-length(flag.vec)] counts<-counts.new[!is.na(counts.new)] xbr<-xbr.new[!is.na(xbr.new)] } else flag.vec<-flag.vec[-length(flag.vec)] widths <- abs(diff(xbr)) ## N.B. argument "widths" in barplot must be xbr heights <- counts/widths } bin.size <- length(x)/nbins cut.pt <- unique(c(min(x) - abs(min(x))/1000, approx(seq(length(x)), x, (1:(nbins - 1)) * bin.size, rule = 2)$y, max(x))) aa <- hist(x, breaks = cut.pt, plot = FALSE, probability = TRUE) if(a == Inf) { heights <- aa$counts xbr <- aa$breaks } amt.height<-3 q75<-quantile(heights,.75) if (sum(flag.vec)!=0) { amt<-max(heights[!flag.vec]) ylim.height<-amt*amt.height ind.h<-flag.vec&heights> ylim.height flag.vec[heights=1) { usr<-par('usr') for ( i in seq(length(xbr)-1)) { if (!flag.vec[i]) { amt.txt<-0 if (xbr[i]-xbr[1]