############################################################################

# Open Online R Stream (https://www.wvbauer.com/doku.php/live_streams)
#
# By: Wolfgang Viechtbauer (https://www.wvbauer.com)
# Date: 2023-01-26
#
# Topic(s):
# - An Introduction to Statistical Learning (https://www.statlearning.com)
# - Section(s): 12.4.1
#
# last updated: 2023-03-03

############################################################################

### 12.4.1: K-Means Clustering

# a small toy example; say there are n=4 people that we want to cluster into
# k=2 groups; then there are 7 possible ways of doing so:
#
# (1), (2,3,4)
# (2), (1,3,4)
# (3), (1,2,4)
# (4), (1,2,3)
# (1,2), (3,4)
# (1,3), (2,4)
# (1,4), (2,3)
#
# note that (1), (2,3,4) is the same as (2,3,4), (1), since what we consider
# to be the 'first' and the 'second' cluster is arbitrary

# the equation for computing the total number of such partitionings (a
# Stirling number of the second kind) is given here:
# https://en.wikipedia.org/wiki/Stirling_numbers_of_the_second_kind

n <- 4
k <- 2
1/factorial(k) * sum((-1)^(0:k) * choose(k,0:k) * (k-(0:k))^n)

# make up some data for the 4 individuals on p=2 features

x1 <- c(3,4,6,8)
x2 <- c(5,3,6,9)

# plot the data

plot(x1, x2, pch=19)
text(x1, x2, 1:4, pos=2)

# compute the total within-cluster variation for each of the 7 partitionings
# (the assignments are typed in by hand here; a sketch for generating them
# automatically follows below)

assign <- read.table(text = "
1 2 2 2
2 1 2 2
2 2 1 2
2 2 2 1
1 1 2 2
1 2 1 2
1 2 2 1")

totalWC <- rep(0, nrow(assign))

for (r in 1:nrow(assign)) {
   C1 <- which(assign[r,] == 1)
   C2 <- which(assign[r,] == 2)
   # within-cluster variation: sum of all pairwise squared differences (each
   # pair counted twice via outer()) divided by the cluster size
   distx1 <- outer(x1[C1], x1[C1], "-")
   distx2 <- outer(x2[C1], x2[C1], "-")
   WC1 <- (sum(distx1^2) + sum(distx2^2)) / length(C1)
   distx1 <- outer(x1[C2], x1[C2], "-")
   distx2 <- outer(x2[C2], x2[C2], "-")
   WC2 <- (sum(distx1^2) + sum(distx2^2)) / length(C2)
   totalWC[r] <- WC1 + WC2
}

totalWC
assign[which.min(totalWC),]

# so we see that the (1,2),(3,4) clustering leads to the smallest objective
# function value, although (4),(1,2,3) is just a little bit worse

# when n is small, we can examine all possible partitionings, but this quickly
# becomes infeasible when n is large; in that case, we have to use something
# like algorithm 12.2, which hopefully also finds the partitioning that leads
# to the best solution (the one with the smallest total within-cluster
# variation)
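# as an aside (a small sketch, not part of the analysis above): instead of
# typing the 7 assignments in by hand, we could also generate them; fixing
# person 1 in cluster 1 avoids counting label swaps twice, and we drop the
# one case where all persons end up in the same cluster ('allassign' is just
# a name chosen here for this example)

allassign <- cbind(1, expand.grid(rep(list(1:2), n-1)))
allassign <- allassign[rowSums(allassign == 2) > 0,]
allassign
nrow(allassign) # 2^(n-1) - 1 = 7, matching the Stirling number computed above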
############################################################################

# algorithm 12.2

X <- cbind(x1,x2)

# 1. assign the observations to the K clusters (the algorithm says to do this
#    randomly; here we use a fixed starting assignment for illustration)

assign <- c(1,1,1,2)
assign

k2means <- function(assign, X) {

   assign.old <- rep(0,length(assign))

   # 2. iterate until the cluster assignments stop changing

   while (sum(abs(assign.old - assign)) != 0) {

      print(assign.old) # show the previous assignment (all zeros initially)
      assign.old <- assign

      # (a) compute the cluster centroids
      centroid1 <- colMeans(X[assign==1,,drop=FALSE])
      centroid2 <- colMeans(X[assign==2,,drop=FALSE])

      # (b) assign each observation to the cluster whose centroid is closest
      distto1 <- apply(X, 1, function(x) sqrt(sum((x - centroid1)^2)))
      distto2 <- apply(X, 1, function(x) sqrt(sum((x - centroid2)^2)))
      assign <- ifelse(distto1 < distto2, 1, 2)

   }

   return(assign)

}

k2means(assign, X)

# when we start with the (1,2,3),(4) assignment, the algorithm doesn't find a
# better clustering; it gets stuck in a local optimum (we saw earlier that
# (1,2),(3,4) is the optimal solution)

# let's try out a different starting assignment

assign <- c(1,2,2,1)
k2means(assign, X)

# now the algorithm does find the optimal solution

# so the solution that the algorithm finds may not be the global optimum and
# can depend on the starting assignment!

############################################################################

# try out the algorithm on a larger dataset

# simulate some data where there are essentially two groups

set.seed(1234)
x1 <- c(rnorm(100, 6, 1), rnorm(50, 3, 1))
x2 <- c(rnorm(100, 3, 1), rnorm(50, 8, 1))
X <- cbind(x1,x2)

# plot the data

plot(x1, x2, pch=19, xlim=range(c(x1,x2)), ylim=range(c(x1,x2)))

# randomly assign the 150 people to two clusters

assign <- sample(c(1,2), 150, replace=TRUE)

# compute the centroids

centroid1 <- colMeans(X[assign == 1,])
centroid2 <- colMeans(X[assign == 2,])

# plot the data again, but with color to show the starting arrangement

plot(x1, x2, pch=19, xlim=range(c(x1,x2)), ylim=range(c(x1,x2)),
     col=ifelse(assign==1, "dodgerblue", "firebrick"))

# add the centroids to the plot

points(centroid1[1], centroid1[2], pch=21, cex=4, lwd=5, bg="dodgerblue")
points(centroid2[1], centroid2[2], pch=21, cex=4, lwd=5, bg="firebrick")

# run the algorithm

assign <- k2means(assign, X)

# recompute the centroids based on the solution found

centroid1 <- colMeans(X[assign == 1,])
centroid2 <- colMeans(X[assign == 2,])

# plot the data again, but also show the region where points will be assigned
# to the first versus the second cluster

plot(NA, xlim=range(c(x1,x2)), ylim=range(c(x1,x2)), xlab="x1", ylab="x2")
x1s <- seq(min(x1), max(x1), length=500)
x2s <- seq(min(x2), max(x2), length=500)
Xs <- expand.grid(x1s, x2s)
distto1 <- apply(Xs, 1, function(x) sqrt(sum((x - centroid1)^2)))
distto2 <- apply(Xs, 1, function(x) sqrt(sum((x - centroid2)^2)))
assign.grid <- ifelse(distto1 < distto2, 1, 2)
cols <- ifelse(assign.grid==1, rgb(.6,.9,1), rgb(1,.6,.6))
points(Xs[,1], Xs[,2], pch=15, cex=0.9, col=cols, bg=cols)
points(x1, x2, pch=19, col=ifelse(assign==1, "dodgerblue", "firebrick"))
points(centroid1[1], centroid1[2], pch=21, cex=4, lwd=5, bg="dodgerblue")
points(centroid2[1], centroid2[2], pch=21, cex=4, lwd=5, bg="firebrick")

############################################################################

# now do k-means clustering using the kmeans() function

res <- kmeans(X, centers=2)
res

# compare the solution found by kmeans() with the one we found earlier

table(assign, res$cluster)

# they are the same (except that cluster 1 is cluster 2 and vice-versa)

# use 20 random starts (the function then returns the best solution found)

res <- kmeans(X, centers=2, nstart=20)
res
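# as an aside: the objective function value is stored in res$tot.withinss,
# the total within-cluster sum of squares around the centroids; by a
# well-known identity (also noted in the book), the pairwise within-cluster
# variation we used in the toy example equals twice this quantity, so
# kmeans() is minimizing the same objective; below is a minimal sketch of
# what nstart=20 is doing (assuming it simply keeps the best of 20
# single-start runs)

runs <- replicate(20, kmeans(X, centers=2, nstart=1)$tot.withinss)
runs
min(runs) # with two well-separated groups, all runs will typically agree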
# note: when using kmeans(), remember to always use set.seed() to make the
# results fully reproducible; for example, we get two different solutions for
# k=3 clusters when running kmeans() twice

kmeans(X, centers=3)
kmeans(X, centers=3)

# but when fixing the seed before running kmeans(), the result will be the same

set.seed(1234)
kmeans(X, centers=3)
set.seed(1234)
kmeans(X, centers=3)

############################################################################
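# one more sketch to wrap up: combining set.seed() with a larger nstart gives
# a k=3 solution that is both reproducible and less dependent on any single
# random start ('res3' is just a name chosen here for this example)

set.seed(1234)
res3 <- kmeans(X, centers=3, nstart=20)
res3$tot.withinss

# plot the data colored by the three clusters found

plot(x1, x2, pch=19, xlim=range(c(x1,x2)), ylim=range(c(x1,x2)),
     col=c("dodgerblue","firebrick","forestgreen")[res3$cluster])

############################################################################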