LOG to summarize R Discriminant Analysis Steps on Iris Data ============================================= ### You will follow similar steps for your ### exercise with the banknotes. After loading package "base": > data(iris3) ### 50x4x3 array > dimnames(iris3)[2:3] [[1]] [1] "Sepal L." "Sepal W." "Petal L." "Petal W." [[2]] [1] "Setosa" "Versicolor" "Virginica" > apply(iris3,2:3,mean) Setosa Versicolor Virginica Sepal L. 5.006 5.936 6.588 Sepal W. 3.428 2.770 2.974 Petal L. 1.462 4.260 5.552 Petal W. 0.246 1.326 2.026 > round(apply(iris3,2:3,var)*49/50,4) Setosa Versicolor Virginica Sepal L. 0.1218 0.2611 0.3963 Sepal W. 0.1408 0.0965 0.1019 Petal L. 0.0296 0.2164 0.2985 Petal W. 0.0109 0.0383 0.0739 ### as in book pp.309ff > SSet = var(iris3[,1:2,1]) SVers = var(iris3[,1:2,2]) avec=solve(0.5*(SSet+SVers),apply(iris3[,1:2,1]- iris3[,1:2,2],2,mean)) > avec Sepal L. Sepal W. -11.43636 14.14303 ### To get picture in Fig 11.3.1: > plot(iris3[,1,1], iris3[,2,1], xlab="Sepal Lgth", ylab="Sepal Width", xlim=range(c(iris3[,1,1:2])), ylim = range(c(iris3[,2,1:2])), type="n", main="Iris Data Species 1:2 Discrimination") points(iris3[,1,1],iris3[,2,1], pch=5) points(iris3[,1,2],iris3[,2,2], pch=18) legend(locator(), legend=c("Setosa","Versic."), pch=c(5,18)) ### Discrimination line: region 1=Setosa assigned when avec'x > avec'(apply(iris3[,1:2,1]+iris3[,1:2,2],2,mean))/2 = -18.7391 i.e. -11.43636*x1 + 14.14303*x2 > -18.7391 > abline(-18.7391/14.1430,11.4364/14.1430, lty=3) ### Picture shows there is only one misclassified point !! ### Now for Fisher Linear Discriminant rule: > x1bar = apply(iris3[,1:2,1],2,mean) x2bar = apply(iris3[,1:2,2],2,mean) xbar = (x1bar+x2bar)/2 B = 25 * (x1bar-x2bar) %*% t(x1bar-x2bar) W = 49*(SSet + SVers) > eigen(solve(W)%*% B)$value[1] [1] 5.087225e+00 > avecF = eigen(solve(W)%*% B)$vec[,1] > avecF [1] 0.6287743 -0.7775879 ## Now Fisher discriminant region for Setosa is set ## of xv = (x1,x2) such that ## avecF %*% xv < (1/2)*(avecF %*% x1bar + avecF %*% x2bar) = 1.0303 > abline(-1.0303/.77759, .62877/.77759, lty=1) ### This is exactly the same as the ML discriminant region! ### Now for the quadratic discriminant rule based on ## ML sample-based rule with distinct variances. ## function whose root gives x2 as fcn of x1 for boundary ## of quadratic region > 0.5*(log(det(SSet))-log(det(SVers))) [1] -0.431362 > x2fcn = function(x2,x1) { xv = c(x1,x2) -0.431362 + 0.5 * (t(xv-x1bar) %*% solve(SSet,xv-x1bar) - t(xv-x2bar) %*% solve(SVers,xv-x2bar)) } ### When this function is negative, the quadratic discrimination ### rule decides "Setosa", otherwise "Versicolor" . > x2rts = numeric(51) > x1seq = seq(4.5,7.0,.05) for (i in 1:51) x2rts[i] = uniroot(x2fcn,c(1,8),x1=x1seq[i])$root > lines(x1seq,x2rts, lty=5) ### Although the variances are clearly different, the quadratic ### discrimination boundary is very little different and ### results in the same 1 misclassified point ! ## Cross-validation method and theoretical misclassification- ### probability calculation: come next. ## First for probability of multivariate-normal Y with Setosa ## parameters to give avecF %*% Y > 1.0303 > 1-pnorm((1.0303-c(avecF %*% x1bar))/sqrt(t(avecF) %*% SSet %*% avecF)) #### = 0.00275 theoretical misclass-prob for Setosa > pnorm((1.0303-c(avecF %*% x2bar))/sqrt(t(avecF) %*% SVers %*% avecF)) ### = 0.0275 theoretical misclass-prob for Versicolor ## Next to the quadratic-discrimination theoretical ## misclassification prob's based on simulation. > SSetInv = solve(SSet) SVersInv = solve(SVers) ## We modify the coding of the previous x2fcn function used ## in the quadratic discrimination so that it "parallelizes", ## i.e., allows 2-column matrix input xmat and vector output, ## with first column of xmat serving in place of the former x1 ## argument of x2fcn and the second in place of x2. ## (Without parallelizing in this way, tallying the misclassifications ## in the R-code simulation takes sveral minutes of CPU time !! ## But with this re-coding, it takes only a couple of seconds.) > x2Bfcn = function(xmat) { aux1 = xmat - outer(rep(1,nrow(xmat)), x1bar,"*") aux2 = xmat - outer(rep(1,nrow(xmat)), x2bar,"*") -0.431362 + 0.5 * ( (aux1 * (aux1 %*% SSetInv) - aux2 * (aux2 %*% SVersInv)) %*% c(1,1) ) } > x1sim = array(rnorm(2e5), c(1e5,2)) tmp = eigen(SSet) S1rt = tmp$vec %*% diag(sqrt(tmp$val)) %*% t(tmp$vec) x1sim = x1sim %*% S1rt + outer(rep(1,1e5),x1bar, "*") sum(x1sim %*% avecF > 1.0303)/1e5 ### did this as check on ### theoretical misclass prob above, got .00279 OK sum(x2Bfcn(x1sim) > 0)/1e5 ### .00851 ## So the estimated misclassification probability for the ## quadratic rule on data with Setosa mvnorm parameters is .0085. > x2sim = array(rnorm(2e5), c(1e5,2)) tmp = eigen(SVers) S2rt = tmp$vec %*% diag(sqrt(tmp$val)) %*% t(tmp$vec) x2sim = x2sim %*% S2rt + outer(rep(1,1e5),x2bar, "*") sum(x2sim %*% avecF < 1.0303)/1e5 ### did this as check on ### theoretical misclass prob above, got .00275 OK sum(x2Bfcn(x2sim) < 0)/1e5 ### .01417 ## This seems to indicate that the observaed misclassification ## rate of 1 out of 100 is something like the "theoretical" ## rate. ============================================================= Now we code the "cross-validation" approach to estimating 2-group misclassification probability. This is done by successively leaving out 1 of the 100 observations, fitting the linear and/or quadratic discriminant on the rest, and then applying it to classify the one left out and record whether the classification is incorrect. The function to do this once is as follows.The underlying dataset is > tmpDat = rbind(cbind(iris3[,1:2,1],Type=1), cbind(iris3[,1:2,2],Type=2)) > LeaveOut = function(indout, gpout, Dset ) { ## indout = index of omitted observation, 1:n_g ## gpout = group (1=Setosa, 2=Versicolor) in which obs ## is to be left out ## Dset = matrix of dimension n by (p+1) ## with p+1 column = "Type"= group ng = table(Dset[,3]) np = ncol(Dset)-1 iout = (gpout-1)*ng[1]+indout xnew = Dset[iout,1:np] Dset = Dset[setdiff(1:sum(ng),iout),] ng[gpout] = ng[gpout]-1 dat1 = Dset[Dset[,3]==1,1:np] dat2 = Dset[Dset[,3]==2,1:np] x1b = apply(dat1,2,mean) S1 = var(dat1) x2b = apply(dat2,2,mean) S2 = var(dat2) avec=solve(((ng[1]-1)*S1+(ng[2]-1)*S2)/(sum(ng)-2), x1b-x2b) Grp = 1+as.numeric(sum(avec*(xnew -(x1b+x2b)/2))) list(avec=avec, Gp=Grp, wrong=as.numeric(gpout==Grp)) } > LeaveOut(1,1,tmpDat) $avec Sepal L. Sepal W. -11.32617 13.99130 $Gp [1] 10.81194 $wrong [1] 0 > error = array(0,c(50,2)) for(i in 1:50) for (g in 1:2) error[i,g] = LeaveOut(i,g,tmpDat)$wrong > apply(error,2,sum) [1] 0 0 ### This way, get NO errors at all !!!