knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)
library(MASS)
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(ggplot2)

Goal

This is a short computational illustration for Lecture 9.

We use Pima.tr as training data and Pima.te as test data, so the demo stays focused and short.

Data

# Load the Pima Indians diabetes data shipped with MASS:
# Pima.tr is the training split, Pima.te the test split.
data(Pima.tr)
data(Pima.te)

# Defensive NA removal; these particular splits ship complete,
# so na.omit() is expected to be a no-op here.
train <- na.omit(Pima.tr)
test <- na.omit(Pima.te)

# Every column except the response "type" is a predictor.
predictor_names <- setdiff(names(train), "type")

# Structure: 7 numeric/integer predictors plus the binary factor
# response "type" with levels "No"/"Yes".
str(train)
## 'data.frame':    200 obs. of  8 variables:
##  $ npreg: int  5 7 5 0 0 5 3 1 3 2 ...
##  $ glu  : int  86 195 77 165 107 97 83 193 142 128 ...
##  $ bp   : int  68 70 82 76 60 76 58 50 80 78 ...
##  $ skin : int  28 33 41 43 25 27 31 16 15 37 ...
##  $ bmi  : num  30.2 25.1 35.8 47.9 26.4 35.6 34.3 25.9 32.4 43.3 ...
##  $ ped  : num  0.364 0.163 0.156 0.259 0.133 ...
##  $ age  : int  24 55 35 26 23 52 25 24 63 31 ...
##  $ type : Factor w/ 2 levels "No","Yes": 1 2 1 1 1 2 1 1 1 2 ...
# Class counts: both splits lean toward "No" (roughly 2:1).
table(train$type)
## 
##  No Yes 
## 132  68
table(test$type)
## 
##  No Yes 
## 223 109

The response is type, indicating whether the subject shows diabetes (Yes) or not (No).

The predictors are:

- npreg: number of pregnancies
- glu: plasma glucose concentration
- bp: diastolic blood pressure (mm Hg)
- skin: triceps skin fold thickness (mm)
- bmi: body mass index
- ped: diabetes pedigree function
- age: age in years

In an interactive R session, ?Pima.tr gives the original dataset documentation.

A Quick Data Overview

# Marginal summaries of every predictor, plus class counts for the
# factor response. Useful for spotting scale differences and outliers.
summary(train)
##      npreg            glu              bp              skin      
##  Min.   : 0.00   Min.   : 56.0   Min.   : 38.00   Min.   : 7.00  
##  1st Qu.: 1.00   1st Qu.:100.0   1st Qu.: 64.00   1st Qu.:20.75  
##  Median : 2.00   Median :120.5   Median : 70.00   Median :29.00  
##  Mean   : 3.57   Mean   :124.0   Mean   : 71.26   Mean   :29.21  
##  3rd Qu.: 6.00   3rd Qu.:144.0   3rd Qu.: 78.00   3rd Qu.:36.00  
##  Max.   :14.00   Max.   :199.0   Max.   :110.00   Max.   :99.00  
##       bmi             ped              age         type    
##  Min.   :18.20   Min.   :0.0850   Min.   :21.00   No :132  
##  1st Qu.:27.57   1st Qu.:0.2535   1st Qu.:23.00   Yes: 68  
##  Median :32.80   Median :0.3725   Median :28.00            
##  Mean   :32.31   Mean   :0.4608   Mean   :32.11            
##  3rd Qu.:36.50   3rd Qu.:0.6160   3rd Qu.:39.25            
##  Max.   :47.90   Max.   :2.2880   Max.   :63.00
# Scatter of glucose vs BMI on the training data, with diabetes status
# mapped to color and age mapped to point size.
class_palette <- c("No" = "#1f77b4", "Yes" = "#d62728")

ggplot(train, aes(x = glu, y = bmi, color = type, size = age)) +
  geom_point(alpha = 0.7) +
  scale_color_manual(values = class_palette) +
  theme_minimal(base_size = 14) +
  labs(
    title = "Training data overview",
    subtitle = "Glucose vs BMI, with age shown by point size",
    x = "Plasma glucose concentration",
    y = "Body mass index",
    color = "Diabetes",
    size = "Age"
  )

Fit Logistic Regression and LDA

# Fit the two classifiers on the same training data and formula:
# logistic regression (binomial GLM) and linear discriminant analysis.
fit_glm <- glm(type ~ ., data = train, family = binomial)
fit_lda <- lda(type ~ ., data = train)

# Coefficient table for the GLM; glu and ped are the clearly
# significant predictors here.
summary(fit_glm)
## 
## Call:
## glm(formula = type ~ ., family = binomial, data = train)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -9.773062   1.770386  -5.520 3.38e-08 ***
## npreg        0.103183   0.064694   1.595  0.11073    
## glu          0.032117   0.006787   4.732 2.22e-06 ***
## bp          -0.004768   0.018541  -0.257  0.79707    
## skin        -0.001917   0.022500  -0.085  0.93211    
## bmi          0.083624   0.042827   1.953  0.05087 .  
## ped          1.820410   0.665514   2.735  0.00623 ** 
## age          0.041184   0.022091   1.864  0.06228 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 256.41  on 199  degrees of freedom
## Residual deviance: 178.39  on 192  degrees of freedom
## AIC: 194.39
## 
## Number of Fisher Scoring iterations: 5
# Printing the lda fit shows the estimated priors (the training class
# proportions), per-class predictor means, and the LD1 coefficients.
fit_lda
## Call:
## lda(type ~ ., data = train)
## 
## Prior probabilities of groups:
##   No  Yes 
## 0.66 0.34 
## 
## Group means:
##        npreg      glu       bp     skin      bmi       ped      age
## No  2.916667 113.1061 69.54545 27.20455 31.07424 0.4154848 29.23485
## Yes 4.838235 145.0588 74.58824 33.11765 34.70882 0.5486618 37.69118
## 
## Coefficients of linear discriminants:
##                 LD1
## npreg  0.0794995781
## glu    0.0240316424
## bp    -0.0018125857
## skin  -0.0008317413
## bmi    0.0494891916
## ped    1.2530603130
## age    0.0314375125

Compare Test-Set Probabilities

# Test-set posterior probabilities of class "Yes" from each model.
test$p_glm <- predict(fit_glm, newdata = test, type = "response")
test$p_lda <- predict(fit_lda, newdata = test)$posterior[, "Yes"]

# Peek at the first test rows alongside both probability columns.
head(test[, c("type", predictor_names, "p_glm", "p_lda")], n = 10)

# Shared translucent colors: blue for true "No", red for true "Yes".
col_true_no <- rgb(0.12, 0.47, 0.71, 0.5)
col_true_yes <- rgb(0.84, 0.15, 0.16, 0.5)

# Overlay the predicted-probability histograms of the two true classes
# in a single panel.
overlay_prob_hist <- function(p, truth, panel_title) {
  hist(p[truth == "No"],
       breaks = 15, col = col_true_no, xlim = c(0, 1),
       main = panel_title, xlab = "Predicted P(Y = Yes)", freq = TRUE)
  hist(p[truth == "Yes"],
       breaks = 15, col = col_true_yes, add = TRUE)
  legend("topright", legend = c("True No", "True Yes"),
         fill = c(col_true_no, col_true_yes),
         bty = "n")
}

# Side-by-side panels, one per model; restore layout afterwards.
par(mfrow = c(1, 2))
overlay_prob_hist(test$p_glm, test$type, "Logistic regression")
overlay_prob_hist(test$p_lda, test$type, "LDA")
par(mfrow = c(1, 1))

Compare Test-Set Performance

# Hard-classify each observation with the usual 0.5 probability cutoff.
to_label <- function(p) ifelse(p > 0.5, "Yes", "No")
test$pred_glm <- to_label(test$p_glm)
test$pred_lda <- to_label(test$p_lda)

# Overall test-set accuracy of each classifier.
acc_glm <- mean(test$type == test$pred_glm)
acc_lda <- mean(test$type == test$pred_lda)

# Threshold-free comparison via AUC; "No" are the controls, "Yes" the
# cases, per the explicit levels argument.
auc_glm <- auc(response = test$type, predictor = test$p_glm, levels = c("No", "Yes"))
auc_lda <- auc(response = test$type, predictor = test$p_lda, levels = c("No", "Yes"))

# One-row-per-method comparison table.
data.frame(
  method = c("Logistic regression", "LDA"),
  accuracy = c(acc_glm, acc_lda),
  auc = c(as.numeric(auc_glm), as.numeric(auc_lda))
)
# Confusion matrix at the 0.5 cutoff for logistic regression:
# rows are true classes, columns are predicted classes.
table(True = test$type, Predicted = test$pred_glm)
##      Predicted
## True   No Yes
##   No  200  23
##   Yes  43  66
# Same confusion matrix for LDA — nearly identical error pattern.
table(True = test$type, Predicted = test$pred_lda)
##      Predicted
## True   No Yes
##   No  198  25
##   Yes  42  67

Takeaway

# Build full ROC objects (not just the AUC scalars) so the whole
# curves can be drawn.
roc_glm <- roc(response = test$type, predictor = test$p_glm, levels = c("No", "Yes"))
roc_lda <- roc(response = test$type, predictor = test$p_lda, levels = c("No", "Yes"))

# One color per method, reused by both the curves and the legend.
roc_colors <- c("#1f77b4", "#d62728")

plot(roc_glm, col = roc_colors[1], lwd = 2, main = "ROC curves on the test set")
plot(roc_lda, col = roc_colors[2], lwd = 2, add = TRUE)
legend("bottomright",
       legend = c("Logistic regression", "LDA"),
       col = roc_colors, lwd = 2, bty = "n")

Both methods are easy to fit, both estimate class probabilities, and on this dataset their out-of-sample performance is fairly similar.