User Tools

Site Tools


en:history:2015-04-24-anadatr

2015-04-24 Cluster analysis

anadatr24042015.R
## 1. Agglomerative cluster analysis using Vltava dataset

library (vegan)

# Read the species data (first column is used as row names) and apply the
# log1p transformation, i.e. log (x + 1), before calculating dissimilarities.
vltava.spe <- read.delim ('https://raw.githubusercontent.com/zdealveindy/anadat-r/master/data/vltava-spe.txt', row.names = 1)
vltava.spe.t <- log1p (vltava.spe)

# First, calculate the distance matrix (Bray-Curtis dissimilarities between all pairs of samples).
dis <- vegdist (vltava.spe.t, method = 'bray')

# Then apply the hclust function with three different clustering algorithms
# to the same distance matrix.
cluster.single <- hclust (dis, method = 'single')
cluster.complete <- hclust (dis, method = 'complete')
cluster.average <- hclust (dis, method = 'average')

# Draw the three dendrograms side by side in one figure.
# dev.new () opens a new graphical device on any platform (windows () exists
# only in R for Windows); you may not need it if you plot into the default device.
dev.new ()
par (mfrow = c(1, 3))  # one row with three columns of plotting panels
plot (cluster.single)
plot (cluster.complete)
plot (cluster.average)

# Draw only one result (average-linkage clustering) in a separate device,
# with rectangles around groups of samples for two alternative numbers of groups.
dev.new ()
plot (cluster.average)
rect.hclust (tree = cluster.average, k = 4)
rect.hclust (tree = cluster.average, k = 6, border = 'navy')

# Function cutree defines which sample belongs to which group (here for 4 groups).
result.cluster <- cutree (cluster.average, k = 4)

# Finally, project the result into an ordination diagram (NMDS) calculated from
# the same distance matrix (Bray-Curtis); colour and symbol both code the group.
NMDS <- metaMDS (comm = dis)
ordiplot (NMDS, type = 'n')
points (NMDS, col = result.cluster, pch = result.cluster)
 
 
 
## 2. TWINSPAN
# This function is available in R only in the form of an experimental library (twinspanR),
# which needs to be installed from GitHub. Both devtools and twinspanR are installed
# only when they are missing, so re-running the script does not reinstall them each time.
if (!requireNamespace ('devtools', quietly = TRUE)) {
  install.packages ('devtools')
}
library (devtools)
if (!requireNamespace ('twinspanR', quietly = TRUE)) {
  install_github ('zdealveindy/twinspanR')
}
library (twinspanR)

# Let's use the example data Danube.
data (danube)

# Calculate standard TWINSPAN with only two levels of cutting:
tw <- twinspan (danube$spe, levels = 2)
print (tw, what = 'table')  # this prints the resulting two-way sorted table
result.twinspan <- cut (tw) # and this generates the vector with assignment of samples into groups

# Finally, we may visualize this result using DCA, and compare it with the original
# subjective classification made by Ellenberg.

# First calculate DCA (decorana and ordiplot come from the vegan package).
DCA <- decorana (danube$spe)

# Then draw two ordination diagrams, each with a different meaning of colors.
# dev.new () opens a new graphical device on any platform (windows () exists
# only in R for Windows).
dev.new ()
par (mfrow = c(1, 2))
ordiplot (DCA, display = 'sites', type = 'n', main = 'Result of TWINSPAN')
points (DCA, col = result.twinspan, pch = result.twinspan)  # this shows results of TWINSPAN

ordiplot (DCA, display = 'sites', type = 'n', main = 'Ellenberg (1956)')
# This shows the original assignment of plots to vegetation types.
# NOTE(review): as.numeric () yields group codes only if veg.type is a factor
# (or numeric) - confirm against the danube dataset.
points (DCA, col = as.numeric (danube$env$veg.type),
        pch = as.numeric (danube$env$veg.type))
 
## 3. Example - clustering European countries according to their characteristics (see Example 2 in Hierarchical agglomerative)

# The data are read from the system clipboard: copy the comma-separated table
# (with a header row and country names in the first column) before running this line.
# NOTE(review): the "clipboard" source is platform-specific (it works in R for
# Windows); on other systems read the data from a file instead.
countries <- read.table ("clipboard", sep = ',', row.names = 1,
                         header = TRUE)

# Standardize all variables (zero mean, unit variance) so that they contribute
# equally to the Euclidean distances calculated by dist ().
countries.s <- scale (countries)
distance <- dist (countries.s)

# 'ward.D' is the current name of the algorithm formerly called 'ward'; using it
# gives the same result without the "method has been renamed" message.
# ('ward.D2' is the variant that implements Ward's criterion on unsquared distances.)
clust.countries <- hclust (distance, method = 'ward.D')

# Draw the dendrogram in its own device, so that it does not inherit a multi-panel
# layout left over from previous figures, and mark four groups of countries.
dev.new ()
plot (clust.countries)
rect.hclust (clust.countries, k = 4)
en/history/2015-04-24-anadatr.txt · Last modified: 2018/03/30 23:04 (external edit)