Log Related to Density Estimation & Nonparametric Regression ================================= 4/28/04 ## Examples use "galaxies" data (in web-page ## data-directory) along with "geyser" ## data supplied in standard Splus6.0 Selected statements/functions for class use. -------------------------------------------- > galaxies <- scan("galaxies.dat", skip=3)/1000 > summary(galaxies) Min. 1st Qu. Median Mean 3rd Qu. Max. 9.172 19.53 20.83 20.83 23.13 34.28 > hist(galaxies, nclass=16, prob=T, xlim=c(4,40), ylim=c(0,.18), xlab="Galaxy speed", ylab="Rel Freq", main= "Gaussian Window, Default") > dens1 <- density(galaxies) lines(dens1) > hist(galaxies, nclass=16, prob=T, xlim=c(4,40), ylim=c(0,.18), xlab="Galaxy speed", ylab="Rel Freq", main= "Rectang Window, Default") lines(density(galaxies,window="r")) > pts <- dens1$x kerMat <- outer(pts, galaxies, function(x,y) dlogis(x,y,.2) > dim(kerMat) [1] 50 82 > dens3 <- cbind(pts, c(kerMat %*% rep(1/82,82))) === Next a little bit on cross-validated bandwidth selection === WHAT WE DISCUSS HERE IS "LEAST-SQUARES CROSS-VALIDATION". More details can be found in: W Haerdle, SMOOTHING TECHNIQUES WITH IMPLEMENTATIION IN S (Springer-Verlage book, 1991), Sec 4.3.2 Scott, D. and Terrell, G. (1987) JASA v.82, p.1131 article. > LgisCV <- function(b, Pts) { ### little function to evaluate least-sq error criterion ### for galaxies data with bin-width b kertmp <- outer(Pts, galaxies, function(x,y,B) dlogis(x,y,B), B=b) denstmp <- cbind(Pts, c(kertmp %*% rep(1/82,82))) delta <- c(range(Pts) %*% c(-1,1))/(length(Pts)-1) intsq <- delta*(sum(denstmp[,2]^2) - 0.5*(denstmp[1,2]^2+ denstmp[length(Pts),2]^2)) ### The numerical integration here uses trapezoid rule. penalty <- numeric(82) for(i in 1:82) penalty[i] <- mean( dlogis(galaxies[i],galaxies[-i],b)) intsq - 2*mean(penalty) } ### NOTE: minimizing this function gives a ### "Cross-Validated Bandwidth" b . > LgisCV(.2,pts) -0.0998805 > LgisCV(.5,pts) -0.1042199 > LgisCV(.8,pts) -0.09922277 > optimize(LgisCV, c(.1,1.5), Pts=pts)[1:2] $minimum: [1] 0.3783803 $objective: [1] -0.1052062 ### Thus the optimized bandwidth is 0.3784: > lines(pts, c(outer(pts, galaxies, function(x,y) dlogis(x,y, 0.3784)) %*% rep(1/82,82)), lty=4) ### Adding this curve to either of the Gaussian or ### Rectangular default plots above shows a really ### good job of fitting !! ------------------------------------------------------------ ### Next: spline-based estimate, using all points as "knots". ### First get spline estimate of cdf, then density using ### deriv=1 in "predict.smooth.spline". > rnkx <- numeric(50) for (i in 1:50) rnkx[i] <- sum(galaxies<= pts[i])/82 > splgalx <- smooth.spline(pts, rnkx, spar=1.e-6) > plot(pts,rnkx) lines(pts, predict.smooth.spline(splgalx,pts)$y) > par(mfrow=c(2,2)) spars <- c(1.e-6,1.e-5,5.e-5, 1.e-4) for(i in 1:4) { hist(galaxies, nclass=16, prob=T, xlim=c(7,40), ylim=c(0,.18), xlab="Galaxy speed", ylab="Rel Freq", main= paste("Spline Fit, spar=",spars[i],sep="")) splgalx <- smooth.spline(pts, rnkx, spar=spars[i]) lines(pts, predict.smooth.spline(splgalx,pts,1)$y) } > printgraph(file="SplinDns.ps") ----------------------------------------------------------- ### Now let's compare with mixture of logistics: > lLk <- function(x, pwt, mn, scal, dens = dlogis) { invsc <- 1/scal dnsmat <- invsc*matrix(dens(invsc* outer(mn,x,"-")), ncol=length(x)) sum(log(pwt %*% dnsmat)) } > lLk(galaxies, c(.5,.5),c(20,30),c(2,2)) ## -268.8732 > tmpft <- nlmin(function(w) -lLk(galaxies,c(plogis(w[1]), 1-plogis(w[1])), w[2:3], exp(w[4:5])), c(0,20,30,2,2), print.level=1, max.fcal=300, max.iter=100) it nf f reldf preldf reldx 0 1 2.966e2 1 3 2.594e2 1.254e-1 1.395e-1 1.582e-2 2 4 2.421e2 6.679e-2 1.096e-1 1.424e-2 3 5 2.329e2 3.816e-2 5.112e-2 1.252e-2 ... 19 27 2.225e2 8.169e-11 8.239e-11 4.790e-6 ***** relative function convergence ***** function 2.225247e2 reldx 4.7895e-6 function evaluations: 27 gradient evaluations: 116 preldf 8.2388e-11 npreldf 8.2388e-11 > tmpft$x 1] 1.1663496 21.2954746 18.7628558 0.1574711 1.6160374 ### So assign mean 21.295 with weight plogis(1.1663) = 0.762 ============================================================ Remainder of this log has selected Nonparametric Regression and Smoothing steps for class 4/30/04 -------------------------------------. > gfram <- cbind.data.frame(Wait=geyser$waiting, Lgth=geyser$duration) > plot(gfram[,1], gfram[,2], xlab="Wait", ylab="Lgth", main= "Scatterplot of Geyser Data with Lowess, n=299") > for(i in 1:4) abline(v=42+i*12, lty=3) for(i in 1:5) { inds <- (1:299)[abs(gfram$Wait-42-12*i+6) < 6] itmp <- order(gfram$Wait[inds]) lines(gfram$Wait[inds[itmp]], lm(Lgth ~ Wait, data=gfram[ inds,])$fit[itmp], lty=6) } > tmplow <- lowess(gfram$Wait,gfram$Lgth) lines(tmplow$x,tmplow$y, lty=4) ### Now kernel-based nonparametric-regression fit: ### Take expectations near each of 50 evenly spaced points > xp <- seq(40,110, length=50) kermat <- outer(xp, gfram$Wait, function(x,y) dnorm(x,y,1)) ### Our bandwidth is 1, using Gaussian kernel. ### Now create conditional expectation using kernel as weights > cexp <- c((kermat %*% gfram$Lgth)/kermat %*% rep(1,299)) points(xp, cexp, pch=5) > tmpsp <- smooth.spline(gfram$Wait,gfram$Lgth, all.knots=F, spar=1e-5) lines(tmpsp$x, tmpsp$y, lty=1) legend(locator(), legend=c("Piecewise linear","Lowess","Spline"), lty=c(6,4,1)) text(locator(),paste("Hollow diamonds are \n", "kernel-regression points") > printgraph(file="geyserNPR.ps") ----- Remains to use Cross-Validation to check -------------- Predictive Success of all of these modelling Strategies ! Method: leave 30 points out (at random), fit using the rest by all of the NPR methods (lowess, spline, kernel-regression). Repeat 1000 times to get averaged mean-square prediction error per obs ! ## Let's try to do cross-validation using only spline and kernel regression, since with lowess or the related function loess.smooth, we do not automatically have smoothed-function evaluations at newly specified points. (The documentation just suggests to use "approx" for lowess fits, or linear interpolation, to get smoothed-function approximations at new points.) > splerr <- kererr <- numeric(100) for (i in 1:100) { leftout <- sample(299, 30) leftin <- setdiff(1:299,leftout) tmpsp <- smooth.spline(gfram$Wait[leftin],gfram$Lgth[leftin], all.knots=F, spar=1e-5) splpred <- predict.smooth.spline(tmpsp, gfram$Wait[leftout])$y kermat <- outer(gfram$Wait[leftout], gfram$Wait[leftin], function(x,y) dnorm(x,y,1)) yexp <- c((kermat %*% gfram$Lgth[leftin])/ kermat %*% rep(1,269)) splerr[i] <- mean((splpred-gfram$Lgth[leftout])^2) kererr[i] <- mean((yexp-gfram$Lgth[leftout])^2) } > summary(splerr) Min. 1st Qu. Median Mean 3rd Qu. Max. 0.439724 0.695004 0.818389 0.849251 0.940258 2.198689 > summary(kererr) Min. 1st Qu. Median Mean 3rd Qu. Max. 0.451945 0.685362 0.792912 0.810285 0.902269 1.354800 ### This comparison seems to show that overall, the kernel-density estimator is doing a little better than the spline. Maybe we can broaden the comparison a little by trying the spline again ,with smoothing parameter a little larger, and the kernel-density with a bandwidth optimized by least-squares cross-validation for the x-coordinate data alone ... > LgisCV <- function(b, Pts, datavec) { ### little function to evaluate least-sq error criterion ### for galaxies data with bin-width b nd <- length(datavec) kertmp <- outer(Pts, datavec, function(x,y,B) dlogis(x,y,B), B=b) denstmp <- cbind(Pts, c(kertmp %*% rep(1/nd,nd))) delta <- c(range(Pts) %*% c(-1,1))/(length(Pts)-1) intsq <- delta*(sum(denstmp[,2]^2) - 0.5*(denstmp[1,2]^2+ denstmp[length(Pts),2]^2)) ### The numerical integration here uses trapezoid rule. penalty <- numeric(nd) for(i in 1:nd) penalty[i] <- mean( dlogis(datavec[i],datavec[-i],b)) intsq - 2*mean(penalty) } > optimize(LgisCV, c(.1,2), Pts=45:105, datavec=gfram$Wait)[1:2] $minimum: [1] 1.25539 $objective: [1] -0.02346868 > splerr2 <- kererr2 <- numeric(100) for (i in 1:100) { leftout <- sample(299, 30) leftin <- setdiff(1:299,leftout) tmpsp <- smooth.spline(gfram$Wait[leftin],gfram$Lgth[leftin], all.knots=F, spar=5e-5) splpred <- predict.smooth.spline(tmpsp, gfram$Wait[leftout])$y kermat <- outer(gfram$Wait[leftout], gfram$Wait[leftin], function(x,y) dnorm(x,y,1.2554)) yexp <- c((kermat %*% gfram$Lgth[leftin])/ kermat %*% rep(1,269)) splerr2[i] <- mean((splpred-gfram$Lgth[leftout])^2) kererr2[i] <- mean((yexp-gfram$Lgth[leftout])^2) } > rbind(summary(splerr),summary(kererr), summary(splerr2), summary(kererr2)) Min. 1st Qu. Median Mean 3rd Qu. Max. [1,] 0.4397243 0.6950036 0.8183886 0.8492507 0.9402580 2.198689 [2,] 0.4519450 0.6853620 0.7929116 0.8102853 0.9022691 1.354800 [3,] 0.3766736 0.6528958 0.7617886 0.7814295 0.9017232 1.206226 [4,] 0.3849044 0.6522058 0.7497226 0.7686785 0.8710377 1.198971 ### Of these methods, the cross-validated-bandwidth kernel NPR method ### seems to be the clear winner. Perhaps we can learn via ### bootstrap, next time, what the resulting standard errors of ### prediction are as a function of x !!