PCA — an unsupervised feature-extraction algorithm that aims to maximize the variance along the extracted features.
Why maximize variance?
# Demo: why maximizing variance matters for class separation.
# Two Gaussian clusters in 2-D: projecting onto Feature 1 keeps the
# classes separated, projecting onto Feature 2 mixes them.
set.seed(42)  # make the simulated data reproducible
each <- 200   # samples per class

# Use `each` consistently (original hard-coded 200 in some rnorm calls)
Ca <- cbind(rnorm(each, mean = 5, sd = 1.15), rnorm(each, mean = 4, sd = 1.05))
Cb <- cbind(rnorm(each, mean = 14, sd = 1.15), rnorm(each, mean = 7, sd = 1.05))
X <- rbind(Ca, Cb)

cat(" Glimpse of Dataset X: \n")
print(X[1:5, ])
cat("\n Dimension of Dataset: \t Samples:", dim(X)[1], "\t Features:", dim(X)[2])

# Class labels (1 = Ca, 2 = Cb) select point colour and shape
true <- c(rep(1, each), rep(2, each))
colvec <- c("coral3", "darkseagreen3")[true]
pchs <- c(22, 24)[true]
plot(X, col = "black", bg = colvec, pch = pchs,
     xlab = "Feature1", ylab = "Feature2", main = "Scatter plot of Data")

# Mean-center the data: PCA operates on centered data
X <- scale(X, center = TRUE, scale = FALSE)
plot(X, col = "black", bg = colvec, pch = pchs,
     xlab = "Feature1", ylab = "Feature2",
     main = "Scatter plot of Mean-Centered Data")

xm <- min(X[, 1])
ym <- min(X[, 2])

# Projection onto Feature 1: collapse y to a constant line; classes remain apart
ProjF1 <- cbind(X[, 1], rep(ym, nrow(X)))
plot(X, col = "black", bg = colvec, pch = pchs,
     xlab = "Feature1", ylab = "Feature2",
     main = "Projection of Data along Feature1")
points(ProjF1, col = "black", bg = colvec, pch = pchs)
text(0, ym + 1, "Well-separated", font = 2)

# Projection onto Feature 2: classes overlap heavily
ProjF2 <- cbind(rep(xm, nrow(X)), X[, 2])
plot(X, col = "black", bg = colvec, pch = pchs,
     xlab = "Feature1", ylab = "Feature2",
     main = "Projection of Data along Feature2")
points(ProjF2, col = "black", bg = colvec, pch = pchs)
text(xm + 1, 2, "poor separation", font = 2, srt = 90)
PCA Objective: Find a direction $v$ — a linear combination of Feature 1 and Feature 2 — that maximizes the variance of the projected data.

Projected data: $y = Xv$

Say $v = (v_1, v_2)^T$; then the projected data $y = v_1 x_1 + v_2 x_2$, i.e. $y$ is a linear combination of Feature 1 and Feature 2.

For mean-centered data $X$:
$$\mathrm{Var}(y) = \frac{1}{n-1}(Xv)^T(Xv) = v^T C v, \qquad C = \frac{1}{n-1} X^T X$$
where $C$ is the covariance matrix of $X$.

Maximize $v^T C v$ subject to $\lVert v \rVert = 1$ ($v$ is a unit vector), as only the direction of maximum variance matters.

Solution: the optimal $v$ is given by the eigenvector of $C$ corresponding to the largest eigenvalue.
# PCA on the iris data, computed from first principles:
# mean-center, form the covariance matrix, eigendecompose.
cat("IRIS dataset\n")
head(iris)
X <- iris[, -5]                        # four numeric features (drop Species)
cls <- as.numeric(iris$Species)        # class index 1..3; avoid masking base::class
cat("\n Samples: ", dim(X)[1], "\t Features: ", dim(X)[2],
    "\t Classes: ", levels(iris$Species))
colvec <- c("coral3", "darkseagreen3", "darkgoldenrod2")[cls]
pchs <- c(22, 23, 24)[cls]
pairs(X, col = colvec, pch = pchs)

# Mean-center, then sample covariance C = X'X / (n - 1)
X <- scale(X, center = TRUE, scale = FALSE)
n <- nrow(X)
C <- crossprod(X) / (n - 1)            # crossprod(X) == t(X) %*% X, but faster
cat("\n Covariance matrix of mean-centered X:\n\n")
print(C)
cat("\n Dimension of C: ", dim(C)[1], "x", dim(C)[2])

# Eigenvectors of C are the principal directions;
# eigen() returns eigenvalues in decreasing order.
eigC <- eigen(C)
cat(" Eigenvalues of C: ", eigC$values)
cat("\n Eigenvectors of C: \n\n")
print(eigC$vectors)
v1 <- eigC$vectors[, 1, drop = FALSE]  # direction of maximum variance
cat("\n Eigenvector v1= \n")
cat(" ", v1, sep = "\n")
First principal component
# Project the centered data onto the leading eigenvectors to obtain
# the principal-component scores, then cross-check against prcomp().
y1 <- X %*% v1                         # first principal component scores
cat("\n First principal component y1= \n ")
cat(" ", y1[1:8], sep = "\n")
cat("\n Largest eigenvalue= ", eigC$values[1])

# Visualize PC1 along a horizontal line (second coordinate fixed at 0)
pc1Mat <- cbind(y1, rep(0, n))
plot(pc1Mat, col = "black", bg = colvec, pch = pchs,
     xlab = "Principal Component 1", ylab = "",
     ylim = c(-1, 1), xlim = c(-4, 4),
     main = "First principal component")
text(0, -0.2, paste("variance= ", var(y1)), font = 2)

v2 <- eigC$vectors[, 2, drop = FALSE]  # second principal direction
y2 <- X %*% v2
cat("\n Second principal component y2= \n ")
cat(" ", y2[1:8], sep = "\n")
cat("\n Second Largest eigenvalue= ", eigC$values[2])

# Visualize PC2 along a vertical line.
# Fixed: the original reused the name pc1Mat here, which was misleading.
pc2Mat <- cbind(rep(0, n), y2)
plot(pc2Mat, col = "black", bg = colvec, pch = pchs,
     xlab = "", ylab = "Principal Component 2",
     ylim = c(-1, 1.5), xlim = c(-1, 1),
     main = "Second principal component")
text(0.2, 0.2, paste("variance= ", var(y2)), font = 2, srt = 90)

# Scatter of the top two components together
PC <- cbind(y1, y2)
plot(PC, col = "black", bg = colvec, pch = pchs,
     xlab = "Principal Component 1", ylab = "Principal Component 2",
     ylim = c(-1.2, 1.3), xlim = c(-3.5, 4.5),
     main = "Top Two Principal Components")

# Sanity check: prcomp() should reproduce the same directions
# (possibly with flipped signs) and variances.
pc <- prcomp(X, center = TRUE, scale = FALSE, retx = TRUE)
cat("\n Directions/Eigenvectors: \n")
print(pc$rotation)
cat("\n Principal Components: \n")
print(pc$x[1:5, ])
cat("\n Variance along principal components: ")
cat(pc$sdev^2)
# PCA on a real dataset: Pima Indians Diabetes from the mlbench package.
library(mlbench)
data(PimaIndiansDiabetes)
Dataset <- PimaIndiansDiabetes
cat("\n Predict the onset of diabetes in female Pima Indians from medical record data.")
cat("\n Dimension of dataset: ", dim(Dataset))
cat("\n Classes: ", levels(Dataset$diabetes))
head(Dataset)

cls <- as.numeric(Dataset$diabetes)    # class index; avoid masking base::class
X <- Dataset[, -ncol(Dataset)]         # drop the response column (last)
# Coerce every predictor column to numeric and collapse to a matrix
X <- as.matrix(as.data.frame(lapply(X, as.numeric)))
colvec <- c("cyan3", "plum3")[cls]
pchs <- c(22, 24)[cls]
pairs(X[, 1:4], col = colvec, pch = pchs)

pc <- prcomp(X, center = TRUE, scale = FALSE, retx = TRUE)
show <- 4
cat("\n Directions/Eigenvectors: \n")
print(pc$rotation[1:5, 1:show])
cat("\n Principal Components: \n")
print(pc$x[1:5, 1:show])
cat("\n Variance along principal components: ")
cat(pc$sdev^2)

# Scatter of the top two components.
# Fixed: removed the stray extra comma from the original plot() call.
PC <- pc$x[, 1:2]
plot(PC, col = "black", bg = colvec, pch = pchs,
     xlab = "Principal Component 1", ylab = "Principal Component 2",
     main = "Top Two Principal Components")
More machine-learning datasets are available in the "mlbench" package; see:
https://machinelearningmastery.com/machine-learning-datasets-in-r/