Lecture 5 Demo: Clustering

#knitr::opts_chunk$set(echo = TRUE, fig.width = 7, fig.height = 4.5)

library(palmerpenguins)
library(cluster)
library(patchwork)
library(ggplot2)

1. Prepare data

The Palmer Penguins dataset contains measurements of penguins observed on islands in Antarctica. The data contains the true species of each penguin, but we treat this as unlabeled data and see whether clustering can recover the natural groupings.

# Keep complete cases for the four standard numeric measurements
raw_df <- penguins[, c("species", "island", "bill_length_mm", "bill_depth_mm", "flipper_length_mm", "body_mass_g")]
keep <- complete.cases(raw_df)
penguins_df <- raw_df[keep, ]

X <- as.matrix(penguins_df[, c("bill_length_mm", "bill_depth_mm", "flipper_length_mm", "body_mass_g")])
X_scaled <- scale(X)


head(penguins_df)

summary(X)

##  bill_length_mm  bill_depth_mm   flipper_length_mm  body_mass_g  
##  Min.   :32.10   Min.   :13.10   Min.   :172.0     Min.   :2700  
##  1st Qu.:39.23   1st Qu.:15.60   1st Qu.:190.0     1st Qu.:3550  
##  Median :44.45   Median :17.30   Median :197.0     Median :4050  
##  Mean   :43.92   Mean   :17.15   Mean   :200.9     Mean   :4202  
##  3rd Qu.:48.50   3rd Qu.:18.70   3rd Qu.:213.0     3rd Qu.:4750  
##  Max.   :59.60   Max.   :21.50   Max.   :231.0     Max.   :6300

The dataset records four quantitative measurements for each penguin: bill length (mm) and bill depth (mm), which describe the size and shape of the beak; flipper length (mm), which reflects overall body size and swimming adaptation; and body mass (g), which measures weight. Together, these variables capture different aspects of penguin morphology, and we use them as features to assess similarity between individuals when performing clustering.

2. Data visualization

Variable scales:

apply(X, 2, sd)

##    bill_length_mm     bill_depth_mm flipper_length_mm       body_mass_g 
##          5.459584          1.974793         14.061714        801.954536

body_mass_g is on a much larger scale than the other features, so Euclidean distance will be dominated by mass unless we scale.

Visualize the penguins in the PC space:

# PCA view only for visualization

pc <- prcomp(X_scaled)

# Convert to data frame
pc_df <- data.frame(
  PC1 = pc$x[, 1],
  PC2 = pc$x[, 2],
  species = penguins_df$species
)

# Plot
ggplot(pc_df, aes(x = PC1, y = PC2)) +
  geom_point() +
  labs(
    title = "PCA of penguin measurements",
    x = "PC1",
    y = "PC2"
  ) +
  theme_minimal()

The visualization does indicate some clustering structure of the data.

4. K-means on scaled data

4.1 Visualize the results when K = 3

set.seed(123)
km3 <- kmeans(X_scaled, centers = 3, nstart = 50)

pc_df$cluster_kmeans <- factor(km3$cluster)

# PCA plot colored by k-means clusters
p1 <- ggplot(pc_df, aes(x = PC1, y = PC2, color = cluster_kmeans)) +
  geom_point(size = 2, alpha = 0.8) +
  labs(
    title = "K-means clustering on penguins",
    subtitle = "Clusters shown in PCA space",
    x = "PC1",
    y = "PC2",
    color = "Cluster"
  ) +
  theme_minimal()

## Compare with true labels
p2 <- ggplot(pc_df, aes(x = PC1, y = PC2, color = species)) +
  geom_point(size = 2, alpha = 0.8) +
  labs(
    title = "Penguins in PCA space",
    subtitle = "Colored by true species",
    x = "PC1",
    y = "PC2"
  ) +
  theme_minimal()

p1 + p2

The cluster results generates cleaner boundaries of observations than the true labels.

4.2 Elbow plot for choosing K

We plot within-cluster sum squares (WCSS) v.s. K, and check if there is any elbow point.

set.seed(123)
k_grid <- 1:8
wcss <- sapply(k_grid, function(k) {
  kmeans(X_scaled, centers = k, nstart = 50)$tot.withinss
})

elbow_df <- data.frame(k = k_grid, wcss = wcss)

ggplot(elbow_df, aes(x = k, y = wcss)) +
  geom_line() +
  geom_point(size = 2) +
  scale_x_continuous(breaks = k_grid) +
  labs(
    title = "Elbow plot",
    x = "Number of clusters K",
    y = "Total within-cluster sum of squares"
  ) +
  theme_minimal()

Big drop from 1->2, smaller drop from 2-> 3, indicating that most of the structure is captured by 2 clusters, with diminishing returns after.

4.3 Compare the silhouette widths for K = 2,…,6

avg_sil <- sapply(2:6, function(k) {
  km <- kmeans(X_scaled, centers = k, nstart = 50)
  ss <- silhouette(km$cluster, dist(X_scaled))
  mean(ss[, 3])
})

sil_df <- data.frame(
  k = 2:6,
  avg_silhouette = avg_sil
)

ggplot(sil_df, aes(x = k, y = avg_silhouette)) +
  geom_line() +
  geom_point(size = 2) +
  scale_x_continuous(breaks = 2:6) +
  labs(
    title = "Average silhouette width",
    x = "Number of clusters K",
    y = "Average silhouette"
  ) +
  theme_minimal()

Highest sihouette at \(K = 2\), and then steadily decreases, indicating that the data is best separated into two well-separated groups.

Even though there are 3 species, the data geometry may only support 2 well-separated groups. This is consistent with what we see visually from the data.

5. Hierachical clustering

Let’s compare between using three different lineages

8. Hierarchical clustering: compare 3 linkages

dmat <- dist(X_scaled)

hc_complete <- hclust(dmat, method = "complete")
hc_average  <- hclust(dmat, method = "average")
hc_single     <- hclust(dmat, method = "single")

op <- par(mfrow = c(3, 1), mar = c(2, 4, 2.5, 1))
plot(hc_complete, labels = FALSE, main = "Complete linkage")
rect.hclust(hc_complete, k = 3, border = 2:4)

plot(hc_average, labels = FALSE, main = "Average linkage")
rect.hclust(hc_average, k = 3, border = 2:4)

plot(hc_single, labels = FALSE, main = "Single linkage")
rect.hclust(hc_single, k = 3, border = 2:4)

par(op)

Complete linkage seems to provide the most reasonable results (most balanced clusters).
The chain effect is obvious when using single linkage
Average linkage produces a big cluster and a tiny cluster, and does not work that well here.

## If you prefer to have all singletons positioned at 0, you can set hang = -1 in the plot function
plot(hc_single, labels = FALSE, main = "Single linkage", hang = -1)

Comparing accuracy with the true labels:

species <- penguins_df$species

tab_complete <- table(Cluster = cutree(hc_complete, k = 3), Species = species)
tab_average  <- table(Cluster = cutree(hc_average,  k = 3), Species = species)
tab_single     <- table(Cluster = cutree(hc_single,     k = 3), Species = species)

tab_complete

##        Species
## Cluster Adelie Chinstrap Gentoo
##       1    151        14      0
##       2      0         0    123
##       3      0        54      0

tab_average

##        Species
## Cluster Adelie Chinstrap Gentoo
##       1    151        68      0
##       2      0         0    119
##       3      0         0      4

tab_single

##        Species
## Cluster Adelie Chinstrap Gentoo
##       1    151        67      0
##       2      0         0    123
##       3      0         1      0