library(palmerpenguins)
library(mclust)
1) Prepare the same penguins data used in Lecture 5
We reuse the same four numeric measurements and keep complete cases.
# Keep the species/island labels plus the four numeric measurements,
# then restrict to rows with no missing values.
raw_df <- penguins[, c(
  "species", "island",
  "bill_length_mm", "bill_depth_mm",
  "flipper_length_mm", "body_mass_g"
)]
keep <- complete.cases(raw_df)
penguins_df <- raw_df[keep, ]

# Feature matrix: the four measurements only, standardized column-wise
# so no single variable dominates the distance/covariance structure.
feature_cols <- c(
  "bill_length_mm", "bill_depth_mm",
  "flipper_length_mm", "body_mass_g"
)
X <- as.matrix(penguins_df[, feature_cols])
X_scaled <- scale(X)
2) Quick baseline: K-means (hard assignments)
# Baseline: K-means with 3 centers (hard assignments only).
# nstart = 20 tries 20 random initializations and keeps the best fit.
set.seed(32950)
km3 <- kmeans(X_scaled, centers = 3, nstart = 20)
# Cluster sizes.
table(km3$cluster)
1 2 3
132 87 123
# Confusion table: clusters vs. true species labels.
table(TrueSpecies = penguins_df$species, KmeansCluster = km3$cluster)
KmeansCluster
TrueSpecies 1 2 3
Adelie 127 24 0
Chinstrap 5 63 0
Gentoo 0 0 123
3) Compute GMM using Mclust()
Model selection with BIC surface
Though GMM can model scale differences, the solution may not be stable if scales across variables differ a lot. So we still tend to scale the data first before running GMM in practice.
We use the R function Mclust(). By default, it automatically selects both the number of components and the covariance structure by BIC.
# Fit GMMs over 1-12 components; Mclust selects both the component
# count and the covariance structure by (mclust-signed) BIC.
# Use FALSE rather than F: F is an ordinary binding and can be reassigned.
set.seed(32950)
gmm_auto <- Mclust(X_scaled, G = 1:12, verbose = FALSE) ## G: the possible numbers of components
summary(gmm_auto)
----------------------------------------------------
Gaussian finite mixture model fitted by EM algorithm
----------------------------------------------------
Mclust VEE (ellipsoidal, equal shape and orientation) model with 4 components:
Clustering table:
1 2 3 4
154 66 57 65
The BIC in mclust is defined as the negative of the conventionally defined BIC, so a higher BIC represents a better model in mclust: \[\text{BIC}_{\text{mclust}} = -\text{BIC} = 2l(\hat \Theta) - d\log n\]
We can actually visualize how BIC changes across different modeling choices.
plot(gmm_auto, what = "BIC")

You may also specify a specific covariance structure and number of components (for instance, fixed G = 3).
# Fix G = 3 components with the VEE covariance structure.
# Use FALSE rather than F: F is an ordinary binding and can be reassigned.
gmm_3 <- Mclust(X_scaled, G = 3, modelNames = "VEE", verbose = FALSE)
gmm_3$bic
[1] -2510.485
The output of Mclust
We have the fitted model stored in gmm_auto$parameters, the estimated responsibilities stored in gmm_auto$z, the corresponding hard clustering results in gmm_auto$classification, and the uncertainties stored in gmm_auto$uncertainty. \[\gamma_{ik} = P(Z_i=k\mid x_i), \quad \text{uncertainty}_i=1-\max_k\gamma_{ik}\]
print("First 6 entries of the responsibilities:")
[1] "First 6 entries of the responsibilities:"
head(gmm_auto$z)
[,1] [,2] [,3] [,4]
[1,] 0.9999859 7.831331e-42 1.088997e-27 1.405042e-05
[2,] 0.9997910 1.289681e-26 3.344178e-19 2.089817e-04
[3,] 0.9917237 7.758721e-30 1.436243e-21 8.276320e-03
[4,] 0.9999983 2.581199e-43 4.760017e-29 1.671648e-06
[5,] 0.9999928 1.510400e-52 3.022682e-33 7.230942e-06
[6,] 0.9999509 1.237485e-35 1.283305e-24 4.905485e-05
print("First 6 entries of the uncertainty:")
[1] "First 6 entries of the uncertainty:"
head(gmm_auto$uncertainty)
[1] 1.405042e-05 2.089817e-04 8.276320e-03 1.671648e-06 7.230942e-06 4.905485e-05
print("First 6 entries of hard clustering labels:")
[1] "First 6 entries of hard clustering labels:"
head(gmm_auto$classification)
[1] 1 1 1 1 1 1
4) Visualize the clustering results
mclust provides visualization of clustering results in the original data space
plot(gmm_auto, what = "classification")

plot(gmm_3, what = "classification")

Though BIC prefers \(K = 4\), compared to the results when \(K = 3\), the additional cluster is not very well separated in the data, so \(K=3\) seems a more reasonable choice.
plot(gmm_3, what = "uncertainty")

5) Visualize clustering results in PC space
# Project the scaled features onto the first two principal components
# and compare ground truth, K-means, and GMM assignments side by side.
pca <- prcomp(X_scaled)
score12 <- pca$x[, 1:2]

op <- par(mfrow = c(2, 2), mar = c(4, 4, 2, 1))

# Panel 1: true species labels (factor levels map to palette colors).
species_fac <- as.factor(penguins_df$species)
plot(score12,
     col = species_fac,
     pch = 19,
     xlab = "PC1", ylab = "PC2",
     main = "Ground truth (Species)")
legend("topright",
       legend = levels(species_fac),
       # seq_along is safe even for an empty level set (1:length is not)
       col = seq_along(levels(species_fac)),
       pch = 19)

# Panel 2: K-means hard assignments.
plot(score12,
     col = km3$cluster,
     pch = 19,
     xlab = "PC1", ylab = "PC2",
     main = "K-means clusters")

# Panel 3: GMM hard assignments (fixed G = 3, VEE).
plot(score12,
     col = gmm_3$classification,
     pch = 19,
     xlab = "PC1", ylab = "PC2",
     main = paste("GMM:", gmm_3$modelName, "(G=", gmm_3$G, ")"))

# Panel 4: grey background with uncertain points highlighted in red.
plot(score12,
     col = "grey80",
     pch = 16,
     xlab = "PC1", ylab = "PC2",
     main = "GMM uncertainty (>0.1 highlighted)")
idx <- gmm_3$uncertainty > 0.1 # adjust threshold
# drop = FALSE keeps a 2-column matrix even when exactly one point
# exceeds the threshold (otherwise the subset collapses to a vector
# and points() would misread it).
points(score12[idx, , drop = FALSE],
       col = "red",
       pch = 19)

par(op)

The GMM results are closer to the truth than K-means.
# Cross-tabulate cluster assignments against true species for both
# methods; cluster IDs are arbitrary, so compare row patterns.
species <- penguins_df$species
tab_kmeans <- table(Cluster = km3$cluster, Species = species)
tab_gmm <- table(Cluster = gmm_3$classification, Species = species)
tab_kmeans
Species
Cluster Adelie Chinstrap Gentoo
1 127 5 0
2 24 63 0
3 0 0 123
tab_gmm
Species
Cluster Adelie Chinstrap Gentoo
1 151 5 0
2 0 0 123
3 0 63 0
6) How covariance constraints change clustering (fixed G = 3)
To isolate covariance assumptions, hold G = 3 fixed and compare BIC values.
# Compare covariance constraints at a fixed number of components (G = 3).
cov_models <- c(
  "EII", # spherical, equal volume
  "VII", # spherical, unequal volume
  "VEE", # ellipsoidal, equal shape and orientation
  "VVV"  # ellipsoidal, varying covariance (most flexible)
)

# One fit per covariance structure (FALSE, not F, which is reassignable).
fits_cov <- lapply(cov_models, function(m) {
  Mclust(X_scaled, G = 3, modelNames = m, verbose = FALSE)
})
names(fits_cov) <- cov_models

# vapply pins the result to one numeric BIC per fit; sapply's return
# type depends on its input and can silently change shape.
bic_cov <- vapply(fits_cov, function(f) f$bic, numeric(1))
comparison_table <- data.frame(
  model = cov_models,
  BIC = round(as.numeric(bic_cov), 2)
)
# Sort best first (higher mclust BIC is better).
comparison_table <- comparison_table[order(-comparison_table$BIC), ]
comparison_table
Visualize how the covariance choices change the final clustering results
# One PC-space panel of hard assignments per covariance structure,
# arranged in a 2 x 2 grid; graphics settings restored afterwards.
old_par <- par(mfrow = c(2, 2), mar = c(4, 4, 2, 1))
for (model_name in cov_models) {
  fit <- fits_cov[[model_name]]
  plot(score12,
       col = fit$classification,
       pch = 19,
       xlab = "PC1", ylab = "PC2",
       main = paste0(model_name, " (G=3)"))
}
par(old_par)

7) Takeaways
- K-means gives hard assignments only.
- GMM gives soft assignments via posterior probabilities.
- Different covariance constraints imply different geometric assumptions, making GMM more flexible than K-means.
- BIC helps compare both number of components and covariance structure.
LS0tCnRpdGxlOiAiTGVjdHVyZSA2IERlbW86IEdhdXNzaWFuIE1peHR1cmUgTW9kZWxzIG9uIFBlbmd1aW5zIgphdXRob3I6ICJTVEFUIDI0NjIwID0gU1RBVCAzMjkwMCwgRklOTSAzNDcwMCIKZGF0ZTogIlNwcmluZyAyMDI2IgpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sKLS0tCgpgYGB7ciBzZXR1cCwgbWVzc2FnZT1GQUxTRSwgd2FybmluZz1GQUxTRX0KCmxpYnJhcnkocGFsbWVycGVuZ3VpbnMpCmxpYnJhcnkobWNsdXN0KQpgYGAKCiMjIDEpIFByZXBhcmUgdGhlIHNhbWUgcGVuZ3VpbnMgZGF0YSB1c2VkIGluIExlY3R1cmUgNQoKV2UgcmV1c2UgdGhlIHNhbWUgZm91ciBudW1lcmljIG1lYXN1cmVtZW50cyBhbmQga2VlcCBjb21wbGV0ZSBjYXNlcy4KCmBgYHtyIGRhdGEtcHJlcH0KcmF3X2RmIDwtIHBlbmd1aW5zWywgYygKICAic3BlY2llcyIsICJpc2xhbmQiLAogICJiaWxsX2xlbmd0aF9tbSIsICJiaWxsX2RlcHRoX21tIiwKICAiZmxpcHBlcl9sZW5ndGhfbW0iLCAiYm9keV9tYXNzX2ciCildCgprZWVwIDwtIGNvbXBsZXRlLmNhc2VzKHJhd19kZikKcGVuZ3VpbnNfZGYgPC0gcmF3X2RmW2tlZXAsIF0KClggPC0gYXMubWF0cml4KHBlbmd1aW5zX2RmWywgYygKICAiYmlsbF9sZW5ndGhfbW0iLCAiYmlsbF9kZXB0aF9tbSIsCiAgImZsaXBwZXJfbGVuZ3RoX21tIiwgImJvZHlfbWFzc19nIgopXSkKClhfc2NhbGVkIDwtIHNjYWxlKFgpCgpgYGAKCiMjIDIpIFF1aWNrIGJhc2VsaW5lOiBLLW1lYW5zIChoYXJkIGFzc2lnbm1lbnRzKQoKYGBge3Iga21lYW5zLWZpdH0Kc2V0LnNlZWQoMzI5NTApCmttMyA8LSBrbWVhbnMoWF9zY2FsZWQsIGNlbnRlcnMgPSAzLCBuc3RhcnQgPSAyMCkKCnRhYmxlKGttMyRjbHVzdGVyKQp0YWJsZShUcnVlU3BlY2llcyA9IHBlbmd1aW5zX2RmJHNwZWNpZXMsIEttZWFuc0NsdXN0ZXIgPSBrbTMkY2x1c3RlcikKYGBgCgojIyAzKSBDb21wdXRlIEdNTSB1c2luZyBgTUNsdXN0KClgCgojIyMgTW9kZWwgc2VsZWN0aW9uIHdpdGggQklDIHN1cmZhY2UKClRob3VnaCBHTU0gY2FuIG1vZGVsIHNjYWxlIGRpZmZlcmVuY2VzLCB0aGUgc29sdXRpb24gbWF5IG5vdCBiZSBzdGFibGUgaWYgc2NhbGVzIGFjcm9zcyB2YXJpYWJsZXMgZGlmZmVyIGEgbG90LiBTbyB3ZSBzdGlsbCB0ZW5kIHRvIHNjYWxlIHRoZSBkYXRhIGZpcnN0IGJlZm9yZSBydW5uaW5nIEdNTSBpbiBwcmFjdGljZS4KCldlIHVzZSBSIGZ1bmN0aW9uIGBNY2x1c3QoKWAuIEJ5IGRlZmF1bHQsIGl0IGF1dG9tYXRpY2FsbHkgc2VsZWN0IGJvdGggbnVtYmVyIG9mIGNvbXBvbmVudHMgYW5kIGNvdmFyaWFuY2Ugc3RydWN0dXJlIGJ5IEJJQy4KCmBgYHtyIGdtbS1maXQtYXV0b30Kc2V0LnNlZWQoMzI5NTApCmdtbV9hdXRvIDwtIE1jbHVzdChYX3NjYWxlZCwgRyA9IDE6MTIsIHZlcmJvc2UgPSBGKSAjIyBHOiB0aGUgcG9zc2libGUgbnVtYmVycyBvZiBjb21wb25lbnRzCgpzdW1tYXJ5KGdtbV9hdXRvKQpgYGAKCgpUaGUgQklDIGluIGBNQ2x1
c3RgIGlzIGRlZmluZWQgYXMgdGhlIHJldmVyc2Ugb2YgcmVndWxhcmx5IGRlZmluZWQgQklDLCBzbyBoaWdoZXIgQklDIHJlcHJlc2VudHMgYmV0dGVyIG1vZGVsIGluIGBNQ2x1c3RgOgokJFx0ZXh0e0JJQ31fe1x0ZXh0e21jbHVzdH19ID0gLVx0ZXh0e0JJQ30gPTJsKFxoYXQgXFRoZXRhKSAtIGRcbG9nIG4kJAoKV2UgY2FuIGFjdHVhbGx5IHZpc3VhbGl6ZSBob3cgQklDIGNoYW5nZXMgYWNyb3NzIGRpZmZlcmVudCBtb2RlbGluZyBjaG9pY2VzLgoKYGBge3IsIGZpZy53aWR0aD00fQpwbG90KGdtbV9hdXRvLCB3aGF0ID0gIkJJQyIpCmBgYAoKWW91IG1heSBhbHNvIHNwZWNpZnkgYSBzcGVjaWZpYyBjb252YXJpYW5jZSBzdHJ1Y3R1cmUgYW5kIG51bWJlciBvZiBjb21wb25lbnRzIChmb3IgaW5zdGFuY2UsIGZpeGVkIEcgPSAzKS4KCmBgYHtyfQpnbW1fMyA8LSBNY2x1c3QoWF9zY2FsZWQsIEcgPSAzLCBtb2RlbE5hbWVzID0gIlZFRSIsIHZlcmJvc2UgPSBGKQpnbW1fMyRiaWMKYGBgCgojIyMgVGhlIG91dHB1dCBvZiBgTWNsdXN0YAoKV2UgaGF2ZSB0aGUgZml0dGVkIG1vZGVsIHN0b3JlZCBpbiBgZ21tX2F1dG8kcGFyYW10ZXJzYCwgdGhlIGVzdGltYXRlZCByZXNwb25zaWJpbGl0ZXMgc3RvcmVkIGluIGBnbW1fYXV0byR6YCwgYW5kIHRoZSBjb3JyZXNwb25kaW5nIGhhcmQgY2x1c3RlcmluZyByZXN1bHRzIGluIGBnbW1fYXV0byRjbGFzc2lmaWNhdGlvbmAgYW5kIHRoZSB1bmNlcnRhaW50aWVzIHN0b3JlZCBpbiBgZ21tX2F1dG8kdW5jZXJ0YWludHlgLgokJFxnYW1tYV97aWt9ID0gUChaX2k9a1xtaWQgeF9pKSwgXHF1YWQgXHRleHR7dW5jZXJ0YWludHl9X2k9MS1cbWF4X2tcZ2FtbWFfe2lrfSQkCgpgYGB7cn0KcHJpbnQoIkZpcnN0IDYgZW50cmllcyBvZiB0aGUgcmVzcG9uc2liaWxpdGllczoiKQpoZWFkKGdtbV9hdXRvJHopCnByaW50KCJGaXJzdCA2IGVudHJpZXMgb2YgdGhlIHVuY2VydGFpbnR5OiIpCmhlYWQoZ21tX2F1dG8kdW5jZXJ0YWludHkpCnByaW50KCJGaXJzdCA2IGVudHJpZXMgb2YgaGFyZCBjbHVzdGVyaW5nIGxhYmVsczoiKQpoZWFkKGdtbV9hdXRvJGNsYXNzaWZpY2F0aW9uKQpgYGAKCgojIyA0KSBWaXN1YWxpemUgdGhlIGNsdXN0ZXJpbmcgcmVzdWx0cwoKYE1DbHVzdGAgcHJvdmlkZXMgdmlzdWFsemF0aW9uIG9mIGNsdXN0ZXJpbmcgcmVzdWx0cyBpbiB0aGUgb3JpZ2luYWwgZGF0YSBzcGFjZSAKYGBge3IsIGZpZy5oZWlnaHQ9Mi41fQpwbG90KGdtbV9hdXRvLCB3aGF0ID0gImNsYXNzaWZpY2F0aW9uIikKYGBgCgpgYGB7ciwgZmlnLmhlaWdodD0yLjV9CnBsb3QoZ21tXzMsIHdoYXQgPSAiY2xhc3NpZmljYXRpb24iKQpgYGAKClRob3VnaCBCSUMgcHJlZmVycyAkSyA9IDQkLCBjb21wYXJlZCB0byB0aGUgcmVzdWx0cyB3aGVuICRLID0zJCwgdGhlIGFkZGl0aW9uYWwgY2x1c3RlciBpcyBub3QgdmVyeSB3ZWxsIHNlcGVyYXRlZCBpbiB0aGUgZGF0YSwgc28gJEs9MyQgc2VlbXMgYSBtb3Jl
IHJlYXNvbmFibGUgY2hvaWNlLiAKCmBgYHtyLCBmaWcuaGVpZ2h0PTJ9CnBsb3QoZ21tXzMsIHdoYXQgPSAidW5jZXJ0YWludHkiKQpgYGAKCiMjIDUpIFZpc3VhbGl6ZSBjbHVzdGVyaW5nIHJlc3VsdHMgaW4gUEMgc3BhY2UKCmBgYHtyIHBjYS1hbmQtcGxvdHMsIGZpZy5oZWlnaHQ9NSwgZmlnLndpZHRoPTV9CnBjYSA8LSBwcmNvbXAoWF9zY2FsZWQpCnNjb3JlMTIgPC0gcGNhJHhbLCAxOjJdCgpvcCA8LSBwYXIobWZyb3cgPSBjKDIsIDIpLCBtYXIgPSBjKDQsIDQsIDIsIDEpKQoKcGxvdChzY29yZTEyLAogICAgIGNvbCA9IHBlbmd1aW5zX2RmJHNwZWNpZXMsCiAgICAgcGNoID0gMTksCiAgICAgeGxhYiA9ICJQQzEiLCB5bGFiID0gIlBDMiIsCiAgICAgbWFpbiA9ICJHcm91bmQgdHJ1dGggKFNwZWNpZXMpIikKbGVnZW5kKCJ0b3ByaWdodCIsCiAgICAgICBsZWdlbmQgPSBsZXZlbHMoYXMuZmFjdG9yKHBlbmd1aW5zX2RmJHNwZWNpZXMpKSwKICAgICAgIGNvbCA9IDE6bGVuZ3RoKGxldmVscyhhcy5mYWN0b3IocGVuZ3VpbnNfZGYkc3BlY2llcykpKSwKICAgICAgIHBjaCA9IDE5KQoKcGxvdChzY29yZTEyLAogICAgIGNvbCA9IGttMyRjbHVzdGVyLAogICAgIHBjaCA9IDE5LAogICAgIHhsYWIgPSAiUEMxIiwgeWxhYiA9ICJQQzIiLAogICAgIG1haW4gPSAiSy1tZWFucyBjbHVzdGVycyIpCgpwbG90KHNjb3JlMTIsCiAgICAgY29sID0gZ21tXzMkY2xhc3NpZmljYXRpb24sCiAgICAgcGNoID0gMTksCiAgICAgeGxhYiA9ICJQQzEiLCB5bGFiID0gIlBDMiIsCiAgICAgbWFpbiA9IHBhc3RlKCJHTU06IiwgZ21tXzMkbW9kZWxOYW1lLCAiKEc9IiwgZ21tXzMkRywgIikiKSkKCnBsb3Qoc2NvcmUxMiwKICAgICBjb2wgPSAiZ3JleTgwIiwKICAgICBwY2ggPSAxNiwKICAgICB4bGFiID0gIlBDMSIsIHlsYWIgPSAiUEMyIiwKICAgICBtYWluID0gIkdNTSB1bmNlcnRhaW50eSAoPjAuMSBoaWdobGlnaHRlZCkiKQoKaWR4IDwtIGdtbV8zJHVuY2VydGFpbnR5ID4gMC4xICAgIyBhZGp1c3QgdGhyZXNob2xkCnBvaW50cyhzY29yZTEyW2lkeCwgXSwKICAgICAgIGNvbCA9ICJyZWQiLAogICAgICAgcGNoID0gMTkpCgpwYXIob3ApCmBgYAoKVGhlIEdNTSByZXN1bHRzIGFyZSBjbG9zZXIgdG8gdGhlIHRydXRoIHRoYW4gSy1tZWFucy4KCmBgYHtyfQpzcGVjaWVzIDwtIHBlbmd1aW5zX2RmJHNwZWNpZXMKCnRhYl9rbWVhbnMgPC0gdGFibGUoQ2x1c3RlciA9IGttMyRjbHVzdGVyLCBTcGVjaWVzID0gc3BlY2llcykKdGFiX2dtbSAgPC0gdGFibGUoQ2x1c3RlciA9IGdtbV8zJGNsYXNzaWZpY2F0aW9uLCBTcGVjaWVzID0gc3BlY2llcykKCnRhYl9rbWVhbnMKdGFiX2dtbQpgYGAKCgojIyA2KSBIb3cgY292YXJpYW5jZSBjb25zdHJhaW50cyBjaGFuZ2UgY2x1c3RlcmluZyAoZml4ZWQgRyA9IDMpCgoKVG8gaXNvbGF0ZSBjb3ZhcmlhbmNlIGFzc3VtcHRpb25zLCBob2xkIGBHID0gM2AgZml4ZWQgYW5kIGNvbXBhcmUgQklDIHZhbHVlcy4K
CmBgYHtyIGNvdmFyaWFuY2UtY29tcGFyZX0KY292X21vZGVscyA8LSBjKAogICJFSUkiLCAjIHNwaGVyaWNhbCwgZXF1YWwgdm9sdW1lCiAgIlZJSSIsICMgc3BoZXJpY2FsLCB1bmVxdWFsIHZvbHVtZQogICJWRUUiLCAjIGVsbGlwc29pZGFsLCBlcXVhbCBzaGFwZSBhbmQgb3JpZW50YXRpb24KICAiVlZWIiAgIyBlbGxpcHNvaWRhbCwgdmFyeWluZyBjb3ZhcmlhbmNlIChtb3N0IGZsZXhpYmxlKQopCgpmaXRzX2NvdiA8LSBsYXBwbHkoY292X21vZGVscywgZnVuY3Rpb24obSkgewogIE1jbHVzdChYX3NjYWxlZCwgRyA9IDMsIG1vZGVsTmFtZXMgPSBtLCB2ZXJib3NlID0gRikKfSkKbmFtZXMoZml0c19jb3YpIDwtIGNvdl9tb2RlbHMKCmJpY19jb3YgPC0gc2FwcGx5KGZpdHNfY292LCBmdW5jdGlvbihmKSBmJGJpYykKY29tcGFyaXNvbl90YWJsZSA8LSBkYXRhLmZyYW1lKAogIG1vZGVsID0gY292X21vZGVscywKICBCSUMgPSByb3VuZChhcy5udW1lcmljKGJpY19jb3YpLCAyKQopCmNvbXBhcmlzb25fdGFibGUgPC0gY29tcGFyaXNvbl90YWJsZVtvcmRlcigtY29tcGFyaXNvbl90YWJsZSRCSUMpLCBdCmNvbXBhcmlzb25fdGFibGUKYGBgCgpWaXN1YWxpemUgaG93IHRoZSBjb3ZhcmlhbmNlIGNob2ljZXMgY2hhbmdlIHRoZSBmaW5hbCBjbHVzdGVyaW5nIHJlc3VsdHMKYGBge3IgY292YXJpYW5jZS1jbGFzc2lmaWNhdGlvbi1wbG90cywgZmlnLmhlaWdodD01LCBmaWcud2lkdGg9NX0Kb3AgPC0gcGFyKG1mcm93ID0gYygyLCAyKSwgbWFyID0gYyg0LCA0LCAyLCAxKSkKCmZvciAobSBpbiBjb3ZfbW9kZWxzKSB7CiAgcGxvdChzY29yZTEyLAogICAgICAgY29sID0gZml0c19jb3ZbW21dXSRjbGFzc2lmaWNhdGlvbiwKICAgICAgIHBjaCA9IDE5LAogICAgICAgeGxhYiA9ICJQQzEiLCB5bGFiID0gIlBDMiIsCiAgICAgICBtYWluID0gcGFzdGUwKG0sICIgKEc9MykiKSkKfQoKcGFyKG9wKQpgYGAKCgoKCgojIyA5KSBUYWtlYXdheXMgCgotIEstbWVhbnMgZ2l2ZXMgKipoYXJkKiogYXNzaWdubWVudHMgb25seS4KLSBHTU0gZ2l2ZXMgKipzb2Z0KiogYXNzaWdubWVudHMgdmlhIHBvc3RlcmlvciBwcm9iYWJpbGl0aWVzLgotIERpZmZlcmVudCBjb3ZhcmlhbmNlIGNvbnN0cmFpbnRzIGltcGx5IGRpZmZlcmVudCBnZW9tZXRyaWMgYXNzdW1wdGlvbnMsIG1ha2luZyBHTU0gbW9yZSBmbGV4aWJsZSB0aGFuIEstbWVhbnMuCi0gQklDIGhlbHBzIGNvbXBhcmUgYm90aCBudW1iZXIgb2YgY29tcG9uZW50cyBhbmQgY292YXJpYW5jZSBzdHJ1Y3R1cmUuCg==