knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)
library(MASS)
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(ggplot2)
This is a short computational illustration for Lecture 9.
We use Pima.tr as the training data and Pima.te
as the test data, so the demo stays focused and self-contained.
# Pima Indians diabetes data shipped with MASS: Pima.tr (200 rows) is the
# training set, Pima.te (332 rows) is the held-out test set.
data(Pima.tr)
data(Pima.te)
# na.omit() guards against incomplete rows (these datasets appear complete —
# all 200 training rows survive — but this keeps the demo robust).
train <- na.omit(Pima.tr)
test <- na.omit(Pima.te)
# Every column except the response `type` is used as a predictor.
predictor_names <- setdiff(names(train), "type")
# Structure of the training set: 200 observations, seven numeric
# predictors, and the two-level factor response `type`.
str(train)
## 'data.frame': 200 obs. of 8 variables:
## $ npreg: int 5 7 5 0 0 5 3 1 3 2 ...
## $ glu : int 86 195 77 165 107 97 83 193 142 128 ...
## $ bp : int 68 70 82 76 60 76 58 50 80 78 ...
## $ skin : int 28 33 41 43 25 27 31 16 15 37 ...
## $ bmi : num 30.2 25.1 35.8 47.9 26.4 35.6 34.3 25.9 32.4 43.3 ...
## $ ped : num 0.364 0.163 0.156 0.259 0.133 ...
## $ age : int 24 55 35 26 23 52 25 24 63 31 ...
## $ type : Factor w/ 2 levels "No","Yes": 1 2 1 1 1 2 1 1 1 2 ...
# Class balance in the training set: roughly 2:1 No vs Yes.
table(train$type)
##
## No Yes
## 132 68
# The test set shows a similar class balance.
table(test$type)
##
## No Yes
## 223 109
The response is type, indicating whether the subject
shows diabetes (Yes) or not (No).
The predictors are:

- npreg: number of pregnancies
- glu: plasma glucose concentration
- bp: diastolic blood pressure
- skin: triceps skinfold thickness
- bmi: body mass index
- ped: diabetes pedigree function
- age: age in years

In an interactive R session, ?Pima.tr gives the original
dataset documentation.
# Marginal summaries of the training variables. Note the wide range of
# glu and age, and that npreg is right-skewed (mean 3.57 vs median 2).
summary(train)
## npreg glu bp skin
## Min. : 0.00 Min. : 56.0 Min. : 38.00 Min. : 7.00
## 1st Qu.: 1.00 1st Qu.:100.0 1st Qu.: 64.00 1st Qu.:20.75
## Median : 2.00 Median :120.5 Median : 70.00 Median :29.00
## Mean : 3.57 Mean :124.0 Mean : 71.26 Mean :29.21
## 3rd Qu.: 6.00 3rd Qu.:144.0 3rd Qu.: 78.00 3rd Qu.:36.00
## Max. :14.00 Max. :199.0 Max. :110.00 Max. :99.00
## bmi ped age type
## Min. :18.20 Min. :0.0850 Min. :21.00 No :132
## 1st Qu.:27.57 1st Qu.:0.2535 1st Qu.:23.00 Yes: 68
## Median :32.80 Median :0.3725 Median :28.00
## Mean :32.31 Mean :0.4608 Mean :32.11
## 3rd Qu.:36.50 3rd Qu.:0.6160 3rd Qu.:39.25
## Max. :47.90 Max. :2.2880 Max. :63.00
# Overview of the training data: glucose vs BMI, colored by diabetes
# status, with age mapped to point size.
ggplot(train, aes(glu, bmi, color = type, size = age)) +
  geom_point(alpha = 0.7) +
  scale_color_manual(values = c(No = "#1f77b4", Yes = "#d62728")) +
  theme_minimal(base_size = 14) +
  labs(
    title = "Training data overview",
    subtitle = "Glucose vs BMI, with age shown by point size",
    x = "Plasma glucose concentration",
    y = "Body mass index",
    color = "Diabetes",
    size = "Age"
  )
# Fit both classifiers on the training data with all seven predictors.
# Logistic regression models the log-odds of `type == "Yes"` directly;
# LDA assumes Gaussian class-conditional predictor distributions.
fit_glm <- glm(type ~ ., data = train, family = binomial)
fit_lda <- lda(type ~ ., data = train)
# Coefficient table for the logistic fit: glu and ped stand out as the
# clearly significant predictors here.
summary(fit_glm)
##
## Call:
## glm(formula = type ~ ., family = binomial, data = train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -9.773062 1.770386 -5.520 3.38e-08 ***
## npreg 0.103183 0.064694 1.595 0.11073
## glu 0.032117 0.006787 4.732 2.22e-06 ***
## bp -0.004768 0.018541 -0.257 0.79707
## skin -0.001917 0.022500 -0.085 0.93211
## bmi 0.083624 0.042827 1.953 0.05087 .
## ped 1.820410 0.665514 2.735 0.00623 **
## age 0.041184 0.022091 1.864 0.06228 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 256.41 on 199 degrees of freedom
## Residual deviance: 178.39 on 192 degrees of freedom
## AIC: 194.39
##
## Number of Fisher Scoring iterations: 5
# LDA fit: the priors equal the training class frequencies (132/200 and
# 68/200), and the discriminant coefficients broadly agree in sign with
# the logistic coefficients above.
fit_lda
## Call:
## lda(type ~ ., data = train)
##
## Prior probabilities of groups:
## No Yes
## 0.66 0.34
##
## Group means:
## npreg glu bp skin bmi ped age
## No 2.916667 113.1061 69.54545 27.20455 31.07424 0.4154848 29.23485
## Yes 4.838235 145.0588 74.58824 33.11765 34.70882 0.5486618 37.69118
##
## Coefficients of linear discriminants:
## LD1
## npreg 0.0794995781
## glu 0.0240316424
## bp -0.0018125857
## skin -0.0008317413
## bmi 0.0494891916
## ped 1.2530603130
## age 0.0314375125
# Predicted probabilities of the "Yes" class on the test set.
# For glm, type = "response" returns fitted probabilities on the response
# scale; for lda, predict() returns a posterior matrix with one column
# per class, from which we take the "Yes" column.
test$p_glm <- predict(fit_glm, newdata = test, type = "response")
test$p_lda <- predict(fit_lda, newdata = test)$posterior[, "Yes"]
# Peek at the first ten test cases alongside their predictors.
head(test[, c("type", predictor_names, "p_glm", "p_lda")], 10)
# Compare the distributions of predicted P(Y = Yes) by true class for
# both models, side by side. A single shared set of bin edges is used
# for every histogram: with `breaks = 15`, each subset would get its own
# data-driven bins, so the overlaid histograms would not be directly
# comparable.
old_par <- par(mfrow = c(1, 2))  # save state so we can restore it below
brks <- seq(0, 1, by = 0.05)
col_no <- rgb(0.12, 0.47, 0.71, 0.5)   # translucent blue for true "No"
col_yes <- rgb(0.84, 0.15, 0.16, 0.5)  # translucent red for true "Yes"
hist(test$p_glm[test$type == "No"],
breaks = brks, col = col_no, xlim = c(0, 1),
main = "Logistic regression", xlab = "Predicted P(Y = Yes)", freq = TRUE)
hist(test$p_glm[test$type == "Yes"],
breaks = brks, col = col_yes, add = TRUE)
legend("topright", legend = c("True No", "True Yes"),
fill = c(col_no, col_yes),
bty = "n")
hist(test$p_lda[test$type == "No"],
breaks = brks, col = col_no, xlim = c(0, 1),
main = "LDA", xlab = "Predicted P(Y = Yes)", freq = TRUE)
hist(test$p_lda[test$type == "Yes"],
breaks = brks, col = col_yes, add = TRUE)
legend("topright", legend = c("True No", "True Yes"),
fill = c(col_no, col_yes),
bty = "n")
par(old_par)  # restore whatever par settings were in effect before
# Classify with the conventional 0.5 probability threshold. Coercing the
# predictions to a factor with both levels fixed guarantees that the
# confusion tables below always show a row/column for each class, even
# if one class were never predicted.
test$pred_glm <- factor(ifelse(test$p_glm > 0.5, "Yes", "No"),
levels = c("No", "Yes"))
test$pred_lda <- factor(ifelse(test$p_lda > 0.5, "Yes", "No"),
levels = c("No", "Yes"))
# Test-set accuracy of each classifier.
acc_glm <- mean(test$pred_glm == test$type)
acc_lda <- mean(test$pred_lda == test$type)
# Test-set AUC. `direction = "<"` states explicitly that larger
# predicted probabilities indicate the "Yes" class; without it, pROC
# auto-orients the comparison and could silently flip it (overstating
# performance for an anti-correlated predictor).
auc_glm <- auc(response = test$type, predictor = test$p_glm,
levels = c("No", "Yes"), direction = "<")
auc_lda <- auc(response = test$type, predictor = test$p_lda,
levels = c("No", "Yes"), direction = "<")
# Summarise both metrics for both methods in one small table.
data.frame(
method = c("Logistic regression", "LDA"),
accuracy = c(acc_glm, acc_lda),
auc = c(as.numeric(auc_glm), as.numeric(auc_lda))
)
# Confusion matrix for logistic regression at the 0.5 threshold:
# most errors are false negatives (true Yes predicted No).
table(True = test$type, Predicted = test$pred_glm)
## Predicted
## True No Yes
## No 200 23
## Yes 43 66
# Confusion matrix for LDA: a very similar error pattern.
table(True = test$type, Predicted = test$pred_lda)
## Predicted
## True No Yes
## No 198 25
## Yes 42 67
# ROC curves for both models on the test set. `direction = "<"` states
# explicitly that larger predicted probabilities indicate the "Yes"
# class; without it, pROC auto-orients the comparison and could silently
# flip it for an anti-correlated predictor.
roc_glm <- roc(response = test$type, predictor = test$p_glm,
levels = c("No", "Yes"), direction = "<")
roc_lda <- roc(response = test$type, predictor = test$p_lda,
levels = c("No", "Yes"), direction = "<")
plot(roc_glm, col = "#1f77b4", lwd = 2, main = "ROC curves on the test set")
plot(roc_lda, col = "#d62728", lwd = 2, add = TRUE)
legend(
"bottomright",
legend = c("Logistic regression", "LDA"),
col = c("#1f77b4", "#d62728"),
lwd = 2,
bty = "n"
)
Both methods are easy to fit, both estimate class probabilities, and on this dataset their out-of-sample performance is fairly similar.