library(palmerpenguins)
library(mclust)

1) Prepare the same penguins data used in Lecture 5

We reuse the same four numeric measurements and keep complete cases.

# The four numeric measurements used as clustering features (same as Lecture 5).
feature_cols <- c(
  "bill_length_mm", "bill_depth_mm",
  "flipper_length_mm", "body_mass_g"
)

raw_df <- penguins[, c("species", "island", feature_cols)]

# Keep only rows with no missing values in any selected column.
keep <- complete.cases(raw_df)
penguins_df <- raw_df[keep, ]

# Numeric feature matrix; scale() centers each column to mean 0 and sd 1.
X <- as.matrix(penguins_df[, feature_cols])

X_scaled <- scale(X)

2) Quick baseline: K-means (hard assignments)

# Fix the RNG seed so the random k-means initializations are reproducible.
set.seed(32950)
# nstart = 20: run 20 random starts and keep the solution with the lowest
# total within-cluster sum of squares.
km3 <- kmeans(X_scaled, centers = 3, nstart = 20)

# Cluster sizes from the hard assignments.
table(km3$cluster)

  1   2   3 
132  87 123 
# Confusion table: true species labels vs. k-means cluster assignments.
table(TrueSpecies = penguins_df$species, KmeansCluster = km3$cluster)
           KmeansCluster
TrueSpecies   1   2   3
  Adelie    127  24   0
  Chinstrap   5  63   0
  Gentoo      0   0 123

3) Fit a GMM using Mclust()

Model selection with BIC surface

Though GMM can model scale differences, the solution may not be stable if scales across variables differ a lot. So we still tend to scale the data first before running GMM in practice.

We use the R function Mclust(). By default, it automatically selects both the number of components and the covariance structure by BIC.

# Seed for reproducibility of the model-based clustering fit.
set.seed(32950)
# Search G = 1..12 components; Mclust also searches over covariance
# structures and selects the combination with the best (highest) BIC.
# Use FALSE, not the reassignable shorthand F.
gmm_auto <- Mclust(X_scaled, G = 1:12, verbose = FALSE) ## G: the possible numbers of components

summary(gmm_auto)
---------------------------------------------------- 
Gaussian finite mixture model fitted by EM algorithm 
---------------------------------------------------- 

Mclust VEE (ellipsoidal, equal shape and orientation) model with 4 components: 

Clustering table:
  1   2   3   4 
154  66  57  65 

The BIC in mclust is defined as the negative of the conventionally defined BIC, so a higher BIC indicates a better model in mclust: \[\text{BIC}_{\text{mclust}} = -\text{BIC} = 2l(\hat \Theta) - d\log n\]

We can actually visualize how BIC changes across different modeling choices.

plot(gmm_auto, what = "BIC")

You may also specify a specific covariance structure and number of components (for instance, fixed G = 3).

# Fit a 3-component GMM with the VEE covariance structure
# (ellipsoidal clusters with equal shape and orientation).
# Use FALSE, not the reassignable shorthand F.
gmm_3 <- Mclust(X_scaled, G = 3, modelNames = "VEE", verbose = FALSE)
gmm_3$bic
[1] -2510.485

The output of Mclust

We have the fitted model parameters stored in gmm_auto$parameters, the estimated responsibilities stored in gmm_auto$z, the corresponding hard clustering labels in gmm_auto$classification, and the uncertainties stored in gmm_auto$uncertainty. \[\gamma_{ik} = P(Z_i=k\mid x_i), \quad \text{uncertainty}_i=1-\max_k\gamma_{ik}\]

# Responsibilities gamma_ik = P(Z_i = k | x_i): one row per observation,
# one column per mixture component; each row sums to 1.
print("First 6 entries of the responsibilities:")
[1] "First 6 entries of the responsibilities:"
head(gmm_auto$z)
          [,1]         [,2]         [,3]         [,4]
[1,] 0.9999859 7.831331e-42 1.088997e-27 1.405042e-05
[2,] 0.9997910 1.289681e-26 3.344178e-19 2.089817e-04
[3,] 0.9917237 7.758721e-30 1.436243e-21 8.276320e-03
[4,] 0.9999983 2.581199e-43 4.760017e-29 1.671648e-06
[5,] 0.9999928 1.510400e-52 3.022682e-33 7.230942e-06
[6,] 0.9999509 1.237485e-35 1.283305e-24 4.905485e-05
# Uncertainty_i = 1 - max_k gamma_ik: values near 0 mean confident assignments.
print("First 6 entries of the uncertainty:")
[1] "First 6 entries of the uncertainty:"
head(gmm_auto$uncertainty)
[1] 1.405042e-05 2.089817e-04 8.276320e-03 1.671648e-06 7.230942e-06 4.905485e-05
# Hard labels: the component with the largest responsibility for each row.
print("First 6 entries of hard clustering labels:")
[1] "First 6 entries of hard clustering labels:"
head(gmm_auto$classification)
[1] 1 1 1 1 1 1

4) Visualize the clustering results

mclust provides visualization of clustering results in the original data space.

# Pairwise scatter plots colored by the BIC-selected model's hard labels.
plot(gmm_auto, what = "classification")

# Same visualization for the fixed G = 3 fit, for comparison.
plot(gmm_3, what = "classification")

Though BIC prefers \(K = 4\), the additional cluster is not very well separated in the data compared with the \(K = 3\) solution, so \(K = 3\) seems a more reasonable choice.

# Larger symbols mark observations with higher classification uncertainty.
plot(gmm_3, what = "uncertainty")

5) Visualize clustering results in PC space

# Project the (scaled) 4-D data onto its first two principal components so
# the clustering results can be compared visually in 2-D.
pca <- prcomp(X_scaled)
score12 <- pca$x[, 1:2]

# 2 x 2 panel: ground truth, k-means labels, GMM labels, GMM uncertainty.
op <- par(mfrow = c(2, 2), mar = c(4, 4, 2, 1))

plot(score12,
     col = penguins_df$species,   # factor levels map to color codes 1, 2, 3
     pch = 19,
     xlab = "PC1", ylab = "PC2",
     main = "Ground truth (Species)")
legend("topright",
       legend = levels(as.factor(penguins_df$species)),
       col = seq_along(levels(as.factor(penguins_df$species))),
       pch = 19)

plot(score12,
     col = km3$cluster,
     pch = 19,
     xlab = "PC1", ylab = "PC2",
     main = "K-means clusters")

plot(score12,
     col = gmm_3$classification,
     pch = 19,
     xlab = "PC1", ylab = "PC2",
     main = paste("GMM:", gmm_3$modelName, "(G=", gmm_3$G, ")"))

plot(score12,
     col = "grey80",
     pch = 16,
     xlab = "PC1", ylab = "PC2",
     main = "GMM uncertainty (>0.1 highlighted)")

# Highlight observations whose classification uncertainty exceeds 0.1.
idx <- gmm_3$uncertainty > 0.1   # adjust threshold
# drop = FALSE keeps the subset a matrix even when exactly one row is
# selected; otherwise score12[idx, ] collapses to a plain vector and
# points() would misinterpret it as a series of x-values.
points(score12[idx, , drop = FALSE],
       col = "red",
       pch = 19)

par(op)

The GMM results are closer to the truth than K-means.

# True species labels for the retained (complete-case) penguins.
species <- penguins_df$species

# Cross-tabulate each method's hard labels against the ground truth.
label_sets <- list(kmeans = km3$cluster, gmm = gmm_3$classification)
confusions <- lapply(label_sets, function(lbl) {
  table(Cluster = lbl, Species = species)
})
tab_kmeans <- confusions$kmeans
tab_gmm  <- confusions$gmm

tab_kmeans
       Species
Cluster Adelie Chinstrap Gentoo
      1    127         5      0
      2     24        63      0
      3      0         0    123
tab_gmm
       Species
Cluster Adelie Chinstrap Gentoo
      1    151         5      0
      2      0         0    123
      3      0        63      0

6) How covariance constraints change clustering (fixed G = 3)

To isolate covariance assumptions, hold G = 3 fixed and compare BIC values.

# Candidate covariance structures, from most to least constrained.
cov_models <- c(
  "EII", # spherical, equal volume
  "VII", # spherical, unequal volume
  "VEE", # ellipsoidal, equal shape and orientation
  "VVV"  # ellipsoidal, varying covariance (most flexible)
)

# Fit one GMM per covariance structure, holding G = 3 fixed so any
# differences come from the covariance assumption alone.
fits_cov <- lapply(cov_models, function(m) {
  Mclust(X_scaled, G = 3, modelNames = m, verbose = FALSE)
})
names(fits_cov) <- cov_models

# vapply (unlike sapply) guarantees a numeric vector result.
bic_cov <- vapply(fits_cov, function(f) f$bic, numeric(1))
comparison_table <- data.frame(
  model = cov_models,
  BIC = round(as.numeric(bic_cov), 2)
)
# Sort best-first: in mclust, a higher BIC means a better model.
comparison_table <- comparison_table[order(-comparison_table$BIC), ]
comparison_table

Visualize how the covariance choices change the final clustering results

# One PC-space panel per covariance structure, all with G = 3.
op <- par(mfrow = c(2, 2), mar = c(4, 4, 2, 1))

for (model_name in cov_models) {
  fit <- fits_cov[[model_name]]
  plot(score12,
       col = fit$classification,
       pch = 19,
       xlab = "PC1", ylab = "PC2",
       main = paste0(model_name, " (G=3)"))
}

par(op)

9) Takeaways

LS0tCnRpdGxlOiAiTGVjdHVyZSA2IERlbW86IEdhdXNzaWFuIE1peHR1cmUgTW9kZWxzIG9uIFBlbmd1aW5zIgphdXRob3I6ICJTVEFUIDI0NjIwID0gU1RBVCAzMjkwMCwgRklOTSAzNDcwMCIKZGF0ZTogIlNwcmluZyAyMDI2IgpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sKLS0tCgpgYGB7ciBzZXR1cCwgbWVzc2FnZT1GQUxTRSwgd2FybmluZz1GQUxTRX0KCmxpYnJhcnkocGFsbWVycGVuZ3VpbnMpCmxpYnJhcnkobWNsdXN0KQpgYGAKCiMjIDEpIFByZXBhcmUgdGhlIHNhbWUgcGVuZ3VpbnMgZGF0YSB1c2VkIGluIExlY3R1cmUgNQoKV2UgcmV1c2UgdGhlIHNhbWUgZm91ciBudW1lcmljIG1lYXN1cmVtZW50cyBhbmQga2VlcCBjb21wbGV0ZSBjYXNlcy4KCmBgYHtyIGRhdGEtcHJlcH0KcmF3X2RmIDwtIHBlbmd1aW5zWywgYygKICAic3BlY2llcyIsICJpc2xhbmQiLAogICJiaWxsX2xlbmd0aF9tbSIsICJiaWxsX2RlcHRoX21tIiwKICAiZmxpcHBlcl9sZW5ndGhfbW0iLCAiYm9keV9tYXNzX2ciCildCgprZWVwIDwtIGNvbXBsZXRlLmNhc2VzKHJhd19kZikKcGVuZ3VpbnNfZGYgPC0gcmF3X2RmW2tlZXAsIF0KClggPC0gYXMubWF0cml4KHBlbmd1aW5zX2RmWywgYygKICAiYmlsbF9sZW5ndGhfbW0iLCAiYmlsbF9kZXB0aF9tbSIsCiAgImZsaXBwZXJfbGVuZ3RoX21tIiwgImJvZHlfbWFzc19nIgopXSkKClhfc2NhbGVkIDwtIHNjYWxlKFgpCgpgYGAKCiMjIDIpIFF1aWNrIGJhc2VsaW5lOiBLLW1lYW5zIChoYXJkIGFzc2lnbm1lbnRzKQoKYGBge3Iga21lYW5zLWZpdH0Kc2V0LnNlZWQoMzI5NTApCmttMyA8LSBrbWVhbnMoWF9zY2FsZWQsIGNlbnRlcnMgPSAzLCBuc3RhcnQgPSAyMCkKCnRhYmxlKGttMyRjbHVzdGVyKQp0YWJsZShUcnVlU3BlY2llcyA9IHBlbmd1aW5zX2RmJHNwZWNpZXMsIEttZWFuc0NsdXN0ZXIgPSBrbTMkY2x1c3RlcikKYGBgCgojIyAzKSBDb21wdXRlIEdNTSB1c2luZyBgTUNsdXN0KClgCgojIyMgTW9kZWwgc2VsZWN0aW9uIHdpdGggQklDIHN1cmZhY2UKClRob3VnaCBHTU0gY2FuIG1vZGVsIHNjYWxlIGRpZmZlcmVuY2VzLCB0aGUgc29sdXRpb24gbWF5IG5vdCBiZSBzdGFibGUgaWYgc2NhbGVzIGFjcm9zcyB2YXJpYWJsZXMgZGlmZmVyIGEgbG90LiBTbyB3ZSBzdGlsbCB0ZW5kIHRvIHNjYWxlIHRoZSBkYXRhIGZpcnN0IGJlZm9yZSBydW5uaW5nIEdNTSBpbiBwcmFjdGljZS4KCldlIHVzZSBSIGZ1bmN0aW9uIGBNY2x1c3QoKWAuIEJ5IGRlZmF1bHQsIGl0IGF1dG9tYXRpY2FsbHkgc2VsZWN0IGJvdGggbnVtYmVyIG9mIGNvbXBvbmVudHMgYW5kIGNvdmFyaWFuY2Ugc3RydWN0dXJlIGJ5IEJJQy4KCmBgYHtyIGdtbS1maXQtYXV0b30Kc2V0LnNlZWQoMzI5NTApCmdtbV9hdXRvIDwtIE1jbHVzdChYX3NjYWxlZCwgRyA9IDE6MTIsIHZlcmJvc2UgPSBGKSAjIyBHOiB0aGUgcG9zc2libGUgbnVtYmVycyBvZiBjb21wb25lbnRzCgpzdW1tYXJ5KGdtbV9hdXRvKQpgYGAKCgpUaGUgQklDIGluIGBNQ2x1
c3RgIGlzIGRlZmluZWQgYXMgdGhlIHJldmVyc2Ugb2YgcmVndWxhcmx5IGRlZmluZWQgQklDLCBzbyBoaWdoZXIgQklDIHJlcHJlc2VudHMgYmV0dGVyIG1vZGVsIGluIGBNQ2x1c3RgOgokJFx0ZXh0e0JJQ31fe1x0ZXh0e21jbHVzdH19ID0gLVx0ZXh0e0JJQ30gPTJsKFxoYXQgXFRoZXRhKSAtIGRcbG9nIG4kJAoKV2UgY2FuIGFjdHVhbGx5IHZpc3VhbGl6ZSBob3cgQklDIGNoYW5nZXMgYWNyb3NzIGRpZmZlcmVudCBtb2RlbGluZyBjaG9pY2VzLgoKYGBge3IsIGZpZy53aWR0aD00fQpwbG90KGdtbV9hdXRvLCB3aGF0ID0gIkJJQyIpCmBgYAoKWW91IG1heSBhbHNvIHNwZWNpZnkgYSBzcGVjaWZpYyBjb252YXJpYW5jZSBzdHJ1Y3R1cmUgYW5kIG51bWJlciBvZiBjb21wb25lbnRzIChmb3IgaW5zdGFuY2UsIGZpeGVkIEcgPSAzKS4KCmBgYHtyfQpnbW1fMyA8LSBNY2x1c3QoWF9zY2FsZWQsIEcgPSAzLCBtb2RlbE5hbWVzID0gIlZFRSIsIHZlcmJvc2UgPSBGKQpnbW1fMyRiaWMKYGBgCgojIyMgVGhlIG91dHB1dCBvZiBgTWNsdXN0YAoKV2UgaGF2ZSB0aGUgZml0dGVkIG1vZGVsIHN0b3JlZCBpbiBgZ21tX2F1dG8kcGFyYW10ZXJzYCwgdGhlIGVzdGltYXRlZCByZXNwb25zaWJpbGl0ZXMgc3RvcmVkIGluIGBnbW1fYXV0byR6YCwgYW5kIHRoZSBjb3JyZXNwb25kaW5nIGhhcmQgY2x1c3RlcmluZyByZXN1bHRzIGluIGBnbW1fYXV0byRjbGFzc2lmaWNhdGlvbmAgYW5kIHRoZSB1bmNlcnRhaW50aWVzIHN0b3JlZCBpbiBgZ21tX2F1dG8kdW5jZXJ0YWludHlgLgokJFxnYW1tYV97aWt9ID0gUChaX2k9a1xtaWQgeF9pKSwgXHF1YWQgXHRleHR7dW5jZXJ0YWludHl9X2k9MS1cbWF4X2tcZ2FtbWFfe2lrfSQkCgpgYGB7cn0KcHJpbnQoIkZpcnN0IDYgZW50cmllcyBvZiB0aGUgcmVzcG9uc2liaWxpdGllczoiKQpoZWFkKGdtbV9hdXRvJHopCnByaW50KCJGaXJzdCA2IGVudHJpZXMgb2YgdGhlIHVuY2VydGFpbnR5OiIpCmhlYWQoZ21tX2F1dG8kdW5jZXJ0YWludHkpCnByaW50KCJGaXJzdCA2IGVudHJpZXMgb2YgaGFyZCBjbHVzdGVyaW5nIGxhYmVsczoiKQpoZWFkKGdtbV9hdXRvJGNsYXNzaWZpY2F0aW9uKQpgYGAKCgojIyA0KSBWaXN1YWxpemUgdGhlIGNsdXN0ZXJpbmcgcmVzdWx0cwoKYE1DbHVzdGAgcHJvdmlkZXMgdmlzdWFsemF0aW9uIG9mIGNsdXN0ZXJpbmcgcmVzdWx0cyBpbiB0aGUgb3JpZ2luYWwgZGF0YSBzcGFjZSAKYGBge3IsIGZpZy5oZWlnaHQ9Mi41fQpwbG90KGdtbV9hdXRvLCB3aGF0ID0gImNsYXNzaWZpY2F0aW9uIikKYGBgCgpgYGB7ciwgZmlnLmhlaWdodD0yLjV9CnBsb3QoZ21tXzMsIHdoYXQgPSAiY2xhc3NpZmljYXRpb24iKQpgYGAKClRob3VnaCBCSUMgcHJlZmVycyAkSyA9IDQkLCBjb21wYXJlZCB0byB0aGUgcmVzdWx0cyB3aGVuICRLID0zJCwgdGhlIGFkZGl0aW9uYWwgY2x1c3RlciBpcyBub3QgdmVyeSB3ZWxsIHNlcGVyYXRlZCBpbiB0aGUgZGF0YSwgc28gJEs9MyQgc2VlbXMgYSBtb3Jl
IHJlYXNvbmFibGUgY2hvaWNlLiAKCmBgYHtyLCBmaWcuaGVpZ2h0PTJ9CnBsb3QoZ21tXzMsIHdoYXQgPSAidW5jZXJ0YWludHkiKQpgYGAKCiMjIDUpIFZpc3VhbGl6ZSBjbHVzdGVyaW5nIHJlc3VsdHMgaW4gUEMgc3BhY2UKCmBgYHtyIHBjYS1hbmQtcGxvdHMsIGZpZy5oZWlnaHQ9NSwgZmlnLndpZHRoPTV9CnBjYSA8LSBwcmNvbXAoWF9zY2FsZWQpCnNjb3JlMTIgPC0gcGNhJHhbLCAxOjJdCgpvcCA8LSBwYXIobWZyb3cgPSBjKDIsIDIpLCBtYXIgPSBjKDQsIDQsIDIsIDEpKQoKcGxvdChzY29yZTEyLAogICAgIGNvbCA9IHBlbmd1aW5zX2RmJHNwZWNpZXMsCiAgICAgcGNoID0gMTksCiAgICAgeGxhYiA9ICJQQzEiLCB5bGFiID0gIlBDMiIsCiAgICAgbWFpbiA9ICJHcm91bmQgdHJ1dGggKFNwZWNpZXMpIikKbGVnZW5kKCJ0b3ByaWdodCIsCiAgICAgICBsZWdlbmQgPSBsZXZlbHMoYXMuZmFjdG9yKHBlbmd1aW5zX2RmJHNwZWNpZXMpKSwKICAgICAgIGNvbCA9IDE6bGVuZ3RoKGxldmVscyhhcy5mYWN0b3IocGVuZ3VpbnNfZGYkc3BlY2llcykpKSwKICAgICAgIHBjaCA9IDE5KQoKcGxvdChzY29yZTEyLAogICAgIGNvbCA9IGttMyRjbHVzdGVyLAogICAgIHBjaCA9IDE5LAogICAgIHhsYWIgPSAiUEMxIiwgeWxhYiA9ICJQQzIiLAogICAgIG1haW4gPSAiSy1tZWFucyBjbHVzdGVycyIpCgpwbG90KHNjb3JlMTIsCiAgICAgY29sID0gZ21tXzMkY2xhc3NpZmljYXRpb24sCiAgICAgcGNoID0gMTksCiAgICAgeGxhYiA9ICJQQzEiLCB5bGFiID0gIlBDMiIsCiAgICAgbWFpbiA9IHBhc3RlKCJHTU06IiwgZ21tXzMkbW9kZWxOYW1lLCAiKEc9IiwgZ21tXzMkRywgIikiKSkKCnBsb3Qoc2NvcmUxMiwKICAgICBjb2wgPSAiZ3JleTgwIiwKICAgICBwY2ggPSAxNiwKICAgICB4bGFiID0gIlBDMSIsIHlsYWIgPSAiUEMyIiwKICAgICBtYWluID0gIkdNTSB1bmNlcnRhaW50eSAoPjAuMSBoaWdobGlnaHRlZCkiKQoKaWR4IDwtIGdtbV8zJHVuY2VydGFpbnR5ID4gMC4xICAgIyBhZGp1c3QgdGhyZXNob2xkCnBvaW50cyhzY29yZTEyW2lkeCwgXSwKICAgICAgIGNvbCA9ICJyZWQiLAogICAgICAgcGNoID0gMTkpCgpwYXIob3ApCmBgYAoKVGhlIEdNTSByZXN1bHRzIGFyZSBjbG9zZXIgdG8gdGhlIHRydXRoIHRoYW4gSy1tZWFucy4KCmBgYHtyfQpzcGVjaWVzIDwtIHBlbmd1aW5zX2RmJHNwZWNpZXMKCnRhYl9rbWVhbnMgPC0gdGFibGUoQ2x1c3RlciA9IGttMyRjbHVzdGVyLCBTcGVjaWVzID0gc3BlY2llcykKdGFiX2dtbSAgPC0gdGFibGUoQ2x1c3RlciA9IGdtbV8zJGNsYXNzaWZpY2F0aW9uLCBTcGVjaWVzID0gc3BlY2llcykKCnRhYl9rbWVhbnMKdGFiX2dtbQpgYGAKCgojIyA2KSBIb3cgY292YXJpYW5jZSBjb25zdHJhaW50cyBjaGFuZ2UgY2x1c3RlcmluZyAoZml4ZWQgRyA9IDMpCgoKVG8gaXNvbGF0ZSBjb3ZhcmlhbmNlIGFzc3VtcHRpb25zLCBob2xkIGBHID0gM2AgZml4ZWQgYW5kIGNvbXBhcmUgQklDIHZhbHVlcy4K
CmBgYHtyIGNvdmFyaWFuY2UtY29tcGFyZX0KY292X21vZGVscyA8LSBjKAogICJFSUkiLCAjIHNwaGVyaWNhbCwgZXF1YWwgdm9sdW1lCiAgIlZJSSIsICMgc3BoZXJpY2FsLCB1bmVxdWFsIHZvbHVtZQogICJWRUUiLCAjIGVsbGlwc29pZGFsLCBlcXVhbCBzaGFwZSBhbmQgb3JpZW50YXRpb24KICAiVlZWIiAgIyBlbGxpcHNvaWRhbCwgdmFyeWluZyBjb3ZhcmlhbmNlIChtb3N0IGZsZXhpYmxlKQopCgpmaXRzX2NvdiA8LSBsYXBwbHkoY292X21vZGVscywgZnVuY3Rpb24obSkgewogIE1jbHVzdChYX3NjYWxlZCwgRyA9IDMsIG1vZGVsTmFtZXMgPSBtLCB2ZXJib3NlID0gRikKfSkKbmFtZXMoZml0c19jb3YpIDwtIGNvdl9tb2RlbHMKCmJpY19jb3YgPC0gc2FwcGx5KGZpdHNfY292LCBmdW5jdGlvbihmKSBmJGJpYykKY29tcGFyaXNvbl90YWJsZSA8LSBkYXRhLmZyYW1lKAogIG1vZGVsID0gY292X21vZGVscywKICBCSUMgPSByb3VuZChhcy5udW1lcmljKGJpY19jb3YpLCAyKQopCmNvbXBhcmlzb25fdGFibGUgPC0gY29tcGFyaXNvbl90YWJsZVtvcmRlcigtY29tcGFyaXNvbl90YWJsZSRCSUMpLCBdCmNvbXBhcmlzb25fdGFibGUKYGBgCgpWaXN1YWxpemUgaG93IHRoZSBjb3ZhcmlhbmNlIGNob2ljZXMgY2hhbmdlIHRoZSBmaW5hbCBjbHVzdGVyaW5nIHJlc3VsdHMKYGBge3IgY292YXJpYW5jZS1jbGFzc2lmaWNhdGlvbi1wbG90cywgZmlnLmhlaWdodD01LCBmaWcud2lkdGg9NX0Kb3AgPC0gcGFyKG1mcm93ID0gYygyLCAyKSwgbWFyID0gYyg0LCA0LCAyLCAxKSkKCmZvciAobSBpbiBjb3ZfbW9kZWxzKSB7CiAgcGxvdChzY29yZTEyLAogICAgICAgY29sID0gZml0c19jb3ZbW21dXSRjbGFzc2lmaWNhdGlvbiwKICAgICAgIHBjaCA9IDE5LAogICAgICAgeGxhYiA9ICJQQzEiLCB5bGFiID0gIlBDMiIsCiAgICAgICBtYWluID0gcGFzdGUwKG0sICIgKEc9MykiKSkKfQoKcGFyKG9wKQpgYGAKCgoKCgojIyA5KSBUYWtlYXdheXMgCgotIEstbWVhbnMgZ2l2ZXMgKipoYXJkKiogYXNzaWdubWVudHMgb25seS4KLSBHTU0gZ2l2ZXMgKipzb2Z0KiogYXNzaWdubWVudHMgdmlhIHBvc3RlcmlvciBwcm9iYWJpbGl0aWVzLgotIERpZmZlcmVudCBjb3ZhcmlhbmNlIGNvbnN0cmFpbnRzIGltcGx5IGRpZmZlcmVudCBnZW9tZXRyaWMgYXNzdW1wdGlvbnMsIG1ha2luZyBHTU0gbW9yZSBmbGV4aWJsZSB0aGFuIEstbWVhbnMuCi0gQklDIGhlbHBzIGNvbXBhcmUgYm90aCBudW1iZXIgb2YgY29tcG9uZW50cyBhbmQgY292YXJpYW5jZSBzdHJ1Y3R1cmUuCg==