#knitr::opts_chunk$set(echo = TRUE, fig.width = 7, fig.height = 4.5)
library(palmerpenguins)
library(cluster)
library(patchwork)
library(ggplot2)
The Palmer Penguins dataset contains measurements of penguins observed on islands in Antarctica. The data contains the true species of each penguin, but we treat this as unlabeled data and see whether clustering can recover the natural groupings.
# Keep complete cases for the four standard numeric measurements
raw_df <- penguins[, c("species", "island", "bill_length_mm", "bill_depth_mm", "flipper_length_mm", "body_mass_g")]
keep <- complete.cases(raw_df)
penguins_df <- raw_df[keep, ]
X <- as.matrix(penguins_df[, c("bill_length_mm", "bill_depth_mm", "flipper_length_mm", "body_mass_g")])
X_scaled <- scale(X)
head(penguins_df)
summary(X)
## bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
## Min. :32.10 Min. :13.10 Min. :172.0 Min. :2700
## 1st Qu.:39.23 1st Qu.:15.60 1st Qu.:190.0 1st Qu.:3550
## Median :44.45 Median :17.30 Median :197.0 Median :4050
## Mean :43.92 Mean :17.15 Mean :200.9 Mean :4202
## 3rd Qu.:48.50 3rd Qu.:18.70 3rd Qu.:213.0 3rd Qu.:4750
## Max. :59.60 Max. :21.50 Max. :231.0 Max. :6300
The dataset records four quantitative measurements for each penguin: bill length (mm) and bill depth (mm), which describe the size and shape of the beak; flipper length (mm), which reflects overall body size and swimming adaptation; and body mass (g), which measures weight. Together, these variables capture different aspects of penguin morphology, and we use them as features to assess similarity between individuals when performing clustering.
apply(X, 2, sd)
## bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
## 5.459584 1.974793 14.061714 801.954536
body_mass_g is on a much larger scale than the other
features, so Euclidean distance will be dominated by mass unless we
scale.
# PCA view only for visualization
pc <- prcomp(X_scaled)
# Convert to data frame
pc_df <- data.frame(
PC1 = pc$x[, 1],
PC2 = pc$x[, 2],
species = penguins_df$species
)
# Plot
ggplot(pc_df, aes(x = PC1, y = PC2)) +
geom_point() +
labs(
title = "PCA of penguin measurements",
x = "PC1",
y = "PC2"
) +
theme_minimal()
The visualization does indicate some clustering structure of the data.
set.seed(123)
km3 <- kmeans(X_scaled, centers = 3, nstart = 50)
pc_df$cluster_kmeans <- factor(km3$cluster)
# PCA plot colored by k-means clusters
p1 <- ggplot(pc_df, aes(x = PC1, y = PC2, color = cluster_kmeans)) +
geom_point(size = 2, alpha = 0.8) +
labs(
title = "K-means clustering on penguins",
subtitle = "Clusters shown in PCA space",
x = "PC1",
y = "PC2",
color = "Cluster"
) +
theme_minimal()
## Compare with true labels
p2 <- ggplot(pc_df, aes(x = PC1, y = PC2, color = species)) +
geom_point(size = 2, alpha = 0.8) +
labs(
title = "Penguins in PCA space",
subtitle = "Colored by true species",
x = "PC1",
y = "PC2"
) +
theme_minimal()
p1 + p2
The cluster results generates cleaner boundaries of observations than the true labels.
We plot within-cluster sum squares (WCSS) v.s. K, and check if there is any elbow point.
set.seed(123)
k_grid <- 1:8
wcss <- sapply(k_grid, function(k) {
kmeans(X_scaled, centers = k, nstart = 50)$tot.withinss
})
elbow_df <- data.frame(k = k_grid, wcss = wcss)
ggplot(elbow_df, aes(x = k, y = wcss)) +
geom_line() +
geom_point(size = 2) +
scale_x_continuous(breaks = k_grid) +
labs(
title = "Elbow plot",
x = "Number of clusters K",
y = "Total within-cluster sum of squares"
) +
theme_minimal()
Big drop from 1->2, smaller drop from 2-> 3, indicating that most of the structure is captured by 2 clusters, with diminishing returns after.
avg_sil <- sapply(2:6, function(k) {
km <- kmeans(X_scaled, centers = k, nstart = 50)
ss <- silhouette(km$cluster, dist(X_scaled))
mean(ss[, 3])
})
sil_df <- data.frame(
k = 2:6,
avg_silhouette = avg_sil
)
ggplot(sil_df, aes(x = k, y = avg_silhouette)) +
geom_line() +
geom_point(size = 2) +
scale_x_continuous(breaks = 2:6) +
labs(
title = "Average silhouette width",
x = "Number of clusters K",
y = "Average silhouette"
) +
theme_minimal()
Highest sihouette at \(K = 2\), and then steadily decreases, indicating that the data is best separated into two well-separated groups.
Even though there are 3 species, the data geometry may only support 2 well-separated groups. This is consistent with what we see visually from the data.
Let’s compare between using three different lineages
dmat <- dist(X_scaled)
hc_complete <- hclust(dmat, method = "complete")
hc_average <- hclust(dmat, method = "average")
hc_single <- hclust(dmat, method = "single")
op <- par(mfrow = c(3, 1), mar = c(2, 4, 2.5, 1))
plot(hc_complete, labels = FALSE, main = "Complete linkage")
rect.hclust(hc_complete, k = 3, border = 2:4)
plot(hc_average, labels = FALSE, main = "Average linkage")
rect.hclust(hc_average, k = 3, border = 2:4)
plot(hc_single, labels = FALSE, main = "Single linkage")
rect.hclust(hc_single, k = 3, border = 2:4)
par(op)
## If you prefer to have all singletons positioned at 0, you can set hang = -1 in the plot function
plot(hc_single, labels = FALSE, main = "Single linkage", hang = -1)
species <- penguins_df$species
tab_complete <- table(Cluster = cutree(hc_complete, k = 3), Species = species)
tab_average <- table(Cluster = cutree(hc_average, k = 3), Species = species)
tab_single <- table(Cluster = cutree(hc_single, k = 3), Species = species)
tab_complete
## Species
## Cluster Adelie Chinstrap Gentoo
## 1 151 14 0
## 2 0 0 123
## 3 0 54 0
tab_average
## Species
## Cluster Adelie Chinstrap Gentoo
## 1 151 68 0
## 2 0 0 119
## 3 0 0 4
tab_single
## Species
## Cluster Adelie Chinstrap Gentoo
## 1 151 67 0
## 2 0 0 123
## 3 0 1 0