Data Preprocessing

Scaling

## Sample data set
set.seed(1410)
y <- matrix(rnorm(50), 10, 5, dimnames=list(paste("g", 1:10, sep=""), 
            paste("t", 1:5, sep="")))
dim(y)
## [1] 10  5
## Scaling
yscaled <- t(scale(t(y))) # Centers and scales y row-wise
apply(yscaled, 1, sd)
##  g1  g2  g3  g4  g5  g6  g7  g8  g9 g10 
##   1   1   1   1   1   1   1   1   1   1

Distance Matrices

Euclidean distance matrix

dist(y[1:4,], method = "euclidean")
##          g1       g2       g3
## g2 4.793697                  
## g3 4.932658 6.354978         
## g4 4.033789 4.788508 1.671968

Correlation-based distance matrix

Correlation matrix

c <- cor(t(y), method="pearson") 
as.matrix(c)[1:4,1:4]
##             g1         g2          g3         g4
## g1  1.00000000 -0.2965885 -0.00206139 -0.4042011
## g2 -0.29658847  1.0000000 -0.91661118 -0.4512912
## g3 -0.00206139 -0.9166112  1.00000000  0.7435892
## g4 -0.40420112 -0.4512912  0.74358925  1.0000000

Correlation-based distance matrix

d <- as.dist(1-c)
as.matrix(d)[1:4,1:4]
##          g1       g2        g3        g4
## g1 0.000000 1.296588 1.0020614 1.4042011
## g2 1.296588 0.000000 1.9166112 1.4512912
## g3 1.002061 1.916611 0.0000000 0.2564108
## g4 1.404201 1.451291 0.2564108 0.0000000

Hierarchical Clustering with hclust

Hierarchical clustering with complete linkage and basic tree plotting

hr <- hclust(d, method = "complete", members=NULL)
names(hr)
## [1] "merge"       "height"      "order"       "labels"      "method"      "call"       
## [7] "dist.method"
par(mfrow = c(1, 2)); plot(hr, hang = 0.1); plot(hr, hang = -1) 

Tree plotting I

plot(as.dendrogram(hr), edgePar=list(col=3, lwd=4), horiz=T) 

Tree plotting II

The ape library provides more advanced features for tree plotting

library(ape) 
plot.phylo(as.phylo(hr), type="p", edge.col=4, edge.width=2, 
           show.node.label=TRUE, no.margin=TRUE)

Tree Cutting

Accessing information in hclust objects

hr
## 
## Call:
## hclust(d = d, method = "complete", members = NULL)
## 
## Cluster method   : complete 
## Number of objects: 10
## Print row labels in the order they appear in the tree
hr$labels[hr$order] 
##  [1] "g10" "g3"  "g4"  "g2"  "g9"  "g6"  "g7"  "g1"  "g5"  "g8"

Tree cutting with cutree

mycl <- cutree(hr, h=max(hr$height)/2)
mycl[hr$labels[hr$order]] 
## g10  g3  g4  g2  g9  g6  g7  g1  g5  g8 
##   3   3   3   2   2   5   5   1   4   4

Heatmaps

With heatmap.2

All in one step: clustering and heatmap plotting

library(gplots)
heatmap.2(y, col=redgreen(75))

With pheatmap

All in one step: clustering and heatmap plotting

library(pheatmap); library("RColorBrewer")
pheatmap(y, color=brewer.pal(9,"Blues"))

Customizing heatmaps

Customizes row and column clustering and shows tree cutting result in row color bar. Additional color schemes can be found here.

hc <- hclust(as.dist(1-cor(y, method="spearman")), method="complete")
mycol <- colorpanel(40, "darkblue", "yellow", "white")
heatmap.2(y, Rowv=as.dendrogram(hr), Colv=as.dendrogram(hc), col=mycol,
          scale="row", density.info="none", trace="none", 
          RowSideColors=as.character(mycl))

K-Means Clustering with PAM

Runs K-means clustering with PAM (partitioning around medoids) algorithm and shows result in color bar of hierarchical clustering result from before.

library(cluster)
pamy <- pam(d, 4)
(kmcol <- pamy$clustering)
##  g1  g2  g3  g4  g5  g6  g7  g8  g9 g10 
##   1   2   3   3   4   4   4   4   2   3
heatmap.2(y, Rowv=as.dendrogram(hr), Colv=as.dendrogram(hc), col=mycol,
          scale="row", density.info="none", trace="none", 
          RowSideColors=as.character(kmcol))

K-Means Fuzzy Clustering

Performs k-means fuzzy clustering

library(cluster)
fannyy <- fanny(d, k=4, memb.exp = 1.5)
round(fannyy$membership, 2)[1:4,]
##    [,1] [,2] [,3] [,4]
## g1 1.00 0.00 0.00 0.00
## g2 0.00 0.99 0.00 0.00
## g3 0.02 0.01 0.95 0.03
## g4 0.00 0.00 0.99 0.01
fannyy$clustering 
##  g1  g2  g3  g4  g5  g6  g7  g8  g9 g10 
##   1   2   3   3   4   4   4   4   2   3
## Returns multiple cluster memberships for coefficient above a certain 
## value (here >0.1)
fannyyMA <- round(fannyy$membership, 2) > 0.10 
apply(fannyyMA, 1, function(x) paste(which(x), collapse="_"))
##    g1    g2    g3    g4    g5    g6    g7    g8    g9   g10 
##   "1"   "2"   "3"   "3"   "4"   "4"   "4" "2_4"   "2"   "3"

Multidimensional Scaling (MDS)

Performs MDS analysis on the geographic distances between European cities

loc <- cmdscale(eurodist) 
## Plots the MDS results in 2D plot. The minus is required in this example to 
## flip the plotting orientation.
plot(loc[,1], -loc[,2], type="n", xlab="", ylab="", main="cmdscale(eurodist)")
text(loc[,1], -loc[,2], rownames(loc), cex=0.8) 

Principal Component Analysis (PCA)

Performs PCA analysis after scaling the data. It returns a list with class prcomp that contains five components: (1) the standard deviations (sdev) of the principal components, (2) the matrix of eigenvectors (rotation), (3) the principal component data (x), (4) the centering (center) and (5) scaling (scale) used.

library(scatterplot3d)
pca <- prcomp(y, scale=TRUE)
names(pca)
## [1] "sdev"     "rotation" "center"   "scale"    "x"
summary(pca) # Prints variance summary for all principal components.
## Importance of components:
##                           PC1    PC2    PC3     PC4    PC5
## Standard deviation     1.3611 1.1777 1.0420 0.69264 0.4416
## Proportion of Variance 0.3705 0.2774 0.2172 0.09595 0.0390
## Cumulative Proportion  0.3705 0.6479 0.8650 0.96100 1.0000
scatterplot3d(pca$x[,1:3], pch=20, color="blue") 

Additional Exercises

See here

Jump to: next_page