Goal

In this notebook, we walk through a full factor-analysis workflow using the 10 stocks, downloaded through the quantmod package.

Packages

library(dplyr)
library(tidyr)
library(ggplot2)
library(tibble)
library(purrr)

library(quantmod)

Download the 10 stocks data and data preprocessing

We will get monthly returns (percent %) for 10 stocks (JPMorgan, BOA, WellsFargo, Exxon, Chevron, ConocoPhillips, Apple, Microsoft, NextEra Energy, Duke Energy) from 2010-2026. Each column is stock, and each row is one month.

tickers <- c(
  # Banks
  "JPM", "BAC", "WFC",
  # Energy
  "XOM", "CVX", "COP",
  # Tech
  "AAPL", "MSFT",
  # Utilities
  "NEE", "DUK"
)


getSymbols(tickers, from = "2010-01-01", auto.assign = TRUE)
 [1] "JPM"  "BAC"  "WFC"  "XOM"  "CVX"  "COP"  "AAPL" "MSFT" "NEE"  "DUK" 
prices_list <- lapply(tickers, function(tk) {
  Ad(get(tk))
})

prices <- do.call(merge, prices_list)
colnames(prices) <- tickers

returns_list <- lapply(colnames(prices), function(tk) {
  monthlyReturn(prices[, tk], type = "log")
})

returns <- do.call(merge, returns_list)
colnames(returns) <- colnames(prices)

returns <- na.omit(returns)

returns <- returns * 100

Time-series view

plot_df <- returns %>%
  as.data.frame() %>%
  rownames_to_column(var = "date") %>%
  mutate(date = as.Date(date)) %>%
  pivot_longer(
    cols = -date,
    names_to = "stock",
    values_to = "ret"
  )

ggplot(plot_df, aes(x = date, y = ret, color = stock)) +
  geom_line(alpha = 0.7, linewidth = 0.5) +
  theme_minimal() +
  theme(legend.position = "bottom") +
  labs(
    title = "Monthly stock returns",
    x = NULL,
    y = "Monthly return (%)",
    color = "Stock"
  )

Correlation matrix

Factor analysis is about explaining common dependence, so the correlation matrix is a natural starting point.

X <- returns

cor_mat <- cor(X)
cor_df <- as.data.frame(as.table(cor_mat))

p_cor <- ggplot(cor_df, aes(Var2, Var1, fill = Freq)) +
  geom_tile() +
  scale_fill_gradient2(low = "#2166AC", mid = "white", high = "#B2182B", midpoint = 0) +
  coord_equal() +
  theme_minimal() +
  labs(title = "Industry return correlations", x = NULL, y = NULL, fill = "Corr") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

p_cor

We see here that the correlations are mostly positive, and there are clustering structure as expected, indicating the presence of a few latent factors.

Visualize each stock marginally

X_long <- as.data.frame(X) %>%
  pivot_longer(cols = everything(),
               names_to = "Stock",
               values_to = "Return")

ggplot(X_long, aes(x = Stock, y = Return)) +
  geom_boxplot() +
  labs(title = "Monthly Returns by Stock",
       y = "Return (%)") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

vars <- apply(X, 2, var, na.rm = TRUE)
# variance table
var_df <- data.frame(
  Industry = colnames(X),
  Variance = vars
)

ggplot(var_df, aes(x = Industry, y = Variance)) +
  geom_col() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(title = "Variance by Stock")

We see moderate marginal volatilities differences across stocks.

Choosing the number of factors

Scree plot

We first look at the eigenvalues of the sample correlation matrix.

X_std <- scale(X, center = TRUE, scale = TRUE)
pca_full <- prcomp(X_std, center = FALSE, scale. = FALSE)
eig <- pca_full$sdev^2
scree_df <- tibble(component = seq_along(eig), eigenvalue = eig)

ggplot(scree_df, aes(component, eigenvalue)) +
  geom_line() +
  geom_point(size = 2) +
  theme_minimal() +
  scale_x_continuous(breaks = scree_df$component) +
  labs(title = "Scree plot (correlation matrix)",
       x = "Component / factor index",
       y = "Eigenvalue")

We do not see a clear elbow.

Parallel analysis

We compare the observed eigenvalues with those from random noise matrices having the same dimensions.

set.seed(24620)
B <- 500
n <- nrow(X_std)
p <- ncol(X_std)

rand_eigs <- replicate(B, {
  X_rand <- matrix(rnorm(n * p), nrow = n, ncol = p)
  eigen(cor(X_rand), symmetric = TRUE, only.values = TRUE)$values
})

pa_mean <- rowMeans(rand_eigs)
pa_q95 <- apply(rand_eigs, 1, quantile, probs = 0.95)

pa_df <- tibble(
  k = seq_len(p),
  observed = eig,
  null_mean = pa_mean,
  null_q95 = pa_q95
) %>%
  pivot_longer(-k, names_to = "series", values_to = "value")

ggplot(pa_df, aes(k, value, color = series)) +
  geom_line() +
  geom_point(size = 1.8) +
  theme_minimal() +
  scale_x_continuous(breaks = seq_len(p)) +
  labs(title = "Parallel analysis", x = "Index", y = "Eigenvalue", color = NULL)

A simple rule is to keep factors whose observed eigenvalues exceed the null benchmark.

pa_m <- sum(eig > pa_q95)
pa_m
[1] 3

PA suggests 3 factors.

Fit several FA models

We fit several candidate models and compare their residual fit. Here we use factanal() on the data matrix.

fit_grid <- function(X, m_values = 1:5) {
  map(m_values, ~ factanal(X, factors = .x, rotation = "none", scores = "regression")) %>%
    setNames(paste0("m", m_values))
}

fa_fits <- fit_grid(X, m_values = 1:5)
fa_fits$m2

Call:
factanal(x = X, factors = .x, scores = "regression", rotation = "none")

Uniquenesses:
  JPM   BAC   WFC   XOM   CVX   COP  AAPL  MSFT   NEE   DUK 
0.158 0.119 0.341 0.169 0.121 0.251 0.878 0.776 0.991 0.943 

Loadings:
     Factor1 Factor2
JPM   0.826   0.400 
BAC   0.807   0.480 
WFC   0.776   0.240 
XOM   0.797  -0.441 
CVX   0.841  -0.415 
COP   0.796  -0.340 
AAPL  0.325   0.129 
MSFT  0.394   0.262 
NEE                 
DUK   0.151  -0.186 

               Factor1 Factor2
SS loadings      4.203   1.050
Proportion Var   0.420   0.105
Cumulative Var   0.420   0.525

Test of the hypothesis that 2 factors are sufficient.
The chi square statistic is 182.2 on 26 degrees of freedom.
The p-value is 2.14e-25 

Residual correlation diagnostics

After fitting FA, we compare the observed covariance (or correlation) with the model-implied one: \[ \hat \Sigma = \hat L \hat L^\top + \hat\Psi. \]

To assess model fit, we examine the residual matrix: \[ \text{Residual} = R - \hat R. \] We convert covariance matrix to correlation matrix using the cov2cor function. We summarize residual fit using the off-diagonal RMSR.

fa_residual_cor <- function(fit, S) {
  L <- unclass(fit$loadings)
  Psi <- diag(fit$uniquenesses)
  Sigma_hat <- L %*% t(L) + Psi

  R_obs <- cov2cor(S)
  R_hat <- cov2cor(Sigma_hat)

  R_obs - R_hat
}

rmsr_offdiag <- function(M) {
  idx <- row(M) < col(M)
  sqrt(mean(M[idx]^2))
}

diag_tbl <- imap_dfr(fa_fits, function(fit, nm) {
  resid <- fa_residual_cor(fit, cor(X_std))
  tibble(
    model = nm,
    m = fit$factors,
    rmsr = rmsr_offdiag(resid),
    max_abs_resid = max(abs(resid[row(resid) < col(resid)]))
  )
})

diag_tbl

Choose a working model

Let’s first use 3 factors for FA.

Inspect the chosen FA model

The R output also shows you the p-value from likelihood-ratio test, testing whether having these factors are enough assuming data coming from a Gaussian factor analysis model.

m_star <- 3
fa_raw <- factanal(X, factors = m_star, rotation = "none", scores = "regression")
fa_rot <- factanal(X, factors = m_star, rotation = "varimax", scores = "regression")

fa_rot

Call:
factanal(x = X, factors = m_star, scores = "regression", rotation = "varimax")

Uniquenesses:
  JPM   BAC   WFC   XOM   CVX   COP  AAPL  MSFT   NEE   DUK 
0.156 0.120 0.333 0.172 0.113 0.228 0.844 0.744 0.069 0.535 

Loadings:
     Factor1 Factor2 Factor3
JPM   0.864   0.313         
BAC   0.903   0.251         
WFC   0.706   0.403         
XOM   0.240   0.874         
CVX   0.293   0.879   0.171 
COP   0.302   0.825         
AAPL  0.335   0.115   0.175 
MSFT  0.477           0.149 
NEE   0.141           0.952 
DUK           0.146   0.666 

               Factor1 Factor2 Factor3
SS loadings      2.653   2.585   1.447
Proportion Var   0.265   0.259   0.145
Cumulative Var   0.265   0.524   0.669

Test of the hypothesis that 3 factors are sufficient.
The chi square statistic is 49.49 on 18 degrees of freedom.
The p-value is 9.01e-05 

Uniquenesses and communalities

fa_summary <- tibble(
  industry = names(fa_rot$uniquenesses),
  uniqueness = as.numeric(fa_rot$uniquenesses),
  communality = 1 - uniqueness
) %>%
  arrange(desc(communality))

fa_summary

Interpretation:

  • A high communality means that the industry’s return variation is largely explained by the common factors.
  • A high uniqueness means that the industry retains a lot of idiosyncratic variation after accounting for common factors.

We can see that MSFT and AAPL still have large uniqueness, also the p-value is still very small. So maybe we actually need more factors, which is consistent with our intuition.

resid_mat <- fa_residual_cor(fa_rot, cor(X_std))
resid_df <- as.data.frame(as.table(resid_mat))

ggplot(resid_df, aes(Var2, Var1, fill = Freq)) +
  geom_tile() +
  scale_fill_gradient2(low = "#2166AC", mid = "white", high = "#B2182B", midpoint = 0) +
  coord_equal() +
  theme_minimal() +
  labs(title = paste("Residual correlations: FA with", m_star, "factors"),
       x = NULL, y = NULL, fill = "Residual") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

This plot helps decide whether the chosen factor model is still missing systematic structure. We see that MSFT and AAPL still have strong residual correlation, that is evidence that m may still be too small or that the factor model is not capturing the dependence pattern well.

Revised factor model: adding a fourth factor

m_star <- m_star + 1
fa_raw <- factanal(X, factors = m_star, rotation = "none", scores = "regression")
fa_rot <- factanal(X, factors = m_star, rotation = "varimax", scores = "regression")

fa_rot

Call:
factanal(x = X, factors = m_star, scores = "regression", rotation = "varimax")

Uniquenesses:
  JPM   BAC   WFC   XOM   CVX   COP  AAPL  MSFT   NEE   DUK 
0.166 0.104 0.333 0.173 0.109 0.229 0.732 0.005 0.037 0.545 

Loadings:
     Factor1 Factor2 Factor3 Factor4
JPM   0.304   0.817           0.267 
BAC   0.236   0.888           0.228 
WFC   0.393   0.693           0.174 
XOM   0.872   0.238                 
CVX   0.879   0.264   0.164   0.146 
COP   0.819   0.307                 
AAPL  0.113   0.203   0.138   0.442 
MSFT          0.211           0.972 
NEE                   0.966   0.153 
DUK   0.157           0.654         

               Factor1 Factor2 Factor3 Factor4
SS loadings      2.551   2.245   1.420   1.350
Proportion Var   0.255   0.225   0.142   0.135
Cumulative Var   0.255   0.480   0.622   0.757

Test of the hypothesis that 4 factors are sufficient.
The chi square statistic is 16.58 on 11 degrees of freedom.
The p-value is 0.121 
fa_summary <- tibble(
  industry = names(fa_rot$uniquenesses),
  uniqueness = as.numeric(fa_rot$uniquenesses),
  communality = 1 - uniqueness
) %>%
  arrange(desc(communality))

fa_summary
resid_mat <- fa_residual_cor(fa_rot, cor(X_std))
resid_df <- as.data.frame(as.table(resid_mat))

ggplot(resid_df, aes(Var2, Var1, fill = Freq)) +
  geom_tile() +
  scale_fill_gradient2(low = "#2166AC", mid = "white", high = "#B2182B", midpoint = 0) +
  coord_equal() +
  theme_minimal() +
  labs(title = paste("Residual correlations: FA with", m_star, "factors"),
       x = NULL, y = NULL, fill = "Residual") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

The fourth factor seems to mainly explain MSFT, but the test does become insignificant, and there are no remaining large correlations left. So we should still include this factor.

Rotation: unrotated vs varimax loadings

loadings_to_df <- function(fit, label) {
  as.data.frame(unclass(fit$loadings)) %>%
    tibble::rownames_to_column("stock") %>%
    pivot_longer(-stock, names_to = "factor", values_to = "loading") %>%
    mutate(method = label)
}

load_df <- bind_rows(
  loadings_to_df(fa_raw, "Unrotated FA"),
  loadings_to_df(fa_rot, "Varimax FA")
)

load_df$stock <- factor(load_df$stock, levels = rownames(fa_raw$loadings))
ggplot(load_df, aes(factor, stock, fill = loading)) +
  geom_tile() +
  scale_fill_gradient2(low = "#2166AC", mid = "white", high = "#B2182B", midpoint = 0) +
  facet_wrap(~ method, nrow = 1) +
  theme_minimal() +
  labs(title = "FA loadings: unrotated vs varimax",
       x = NULL, y = NULL, fill = "Loading")

In practice, the unrotated loadings often look diffuse: many stocks may load moderately on multiple factors. After varimax rotation, the pattern is usually easier to narrate because some factors become more concentrated on particular groups of stocks.

Factor scores

The regression scores provide estimated latent-factor realizations over time.

Fhat <- fa_rot$scores %>%
      as.data.frame() %>%
      rownames_to_column(var = "date") %>%
      mutate(date = as.Date(date))


Fhat_long <- Fhat %>%
  pivot_longer(-date, names_to = "factor", values_to = "score")

ggplot(Fhat_long, aes(date, score, color = factor)) +
  geom_line(linewidth = 0.5) +
  theme_minimal() +
  theme(legend.position = "bottom") +
  labs(title = "Estimated factor scores over time",
       x = NULL, y = "Score", color = "Factor")

Comparison with PCA

We now compare FA with PCA on the standardized matrix.

pca <- prcomp(X_std, center = FALSE, scale. = FALSE)
round(pca$rotation[, 1:m_star], 3)
       PC1    PC2    PC3    PC4
JPM  0.395 -0.061  0.228 -0.359
BAC  0.383 -0.092  0.273 -0.360
WFC  0.384 -0.140  0.115 -0.306
XOM  0.373 -0.075 -0.391  0.211
CVX  0.393 -0.008 -0.351  0.193
COP  0.375 -0.168 -0.319  0.224
AAPL 0.208  0.226  0.393  0.616
MSFT 0.238  0.195  0.487  0.266
NEE  0.078  0.683 -0.045 -0.165
DUK  0.098  0.616 -0.297 -0.196

PCA vs FA loadings heatmap

pca_load_df <- as.data.frame(pca$rotation[, 1:m_star, drop = FALSE]) %>%
  tibble::rownames_to_column("stock") %>%
  pivot_longer(-stock, names_to = "factor", values_to = "loading") %>%
  mutate(method = "PCA")

fa_rot_df <- loadings_to_df(fa_rot, "FA (varimax)")
plot_load_compare <- bind_rows(pca_load_df, fa_rot_df)
plot_load_compare$stock <- factor(plot_load_compare$stock, levels = rownames(fa_rot$loadings))

ggplot(plot_load_compare, aes(factor, stock, fill = loading)) +
  geom_tile() +
  scale_fill_gradient2(low = "#2166AC", mid = "white", high = "#B2182B", midpoint = 0) +
  facet_wrap(~ method, nrow = 1) +
  theme_minimal() +
  labs(title = "PCA loadings vs rotated FA loadings",
       x = NULL, y = NULL, fill = "Loading")

PCA loadings describe orthogonal variance-maximizing directions, while FA loadings describe how observed variables depend on latent common factors plus uniquenesses. After rotation, FA gives clearer clustering structure of the variables. In contrast, PCA learns the shared market effect as the first PC.

Key takeaways

---
title: "Lecture 4 Factor Analysis Live Demo: 10 Stocks"
author: "STAT 24620 = STAT 32900, FINM 34700"
date: "Spring 2026"
output: html_notebook
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(
  fig.width = 7.5,
  fig.height = 4.5,
  message = FALSE,
  warning = FALSE
)
options(digits = 4)
```

## Goal

In this notebook, we walk through a full factor-analysis workflow using the 10 stocks, downloaded through the `quantmod` package.
 
## Packages

```{r packages}
library(dplyr)
library(tidyr)
library(ggplot2)
library(tibble)
library(purrr)

library(quantmod)
```

## Download the 10 stocks data and data preprocessing


We will get monthly returns (percent \%) for 10 stocks (JPMorgan, BOA, WellsFargo, Exxon, Chevron, ConocoPhillips, Apple, Microsoft, NextEra Energy, Duke Energy) from 2010-2026. Each column is stock, and each row is one month. 


```{r get-data}
tickers <- c(
  # Banks
  "JPM", "BAC", "WFC",
  # Energy
  "XOM", "CVX", "COP",
  # Tech
  "AAPL", "MSFT",
  # Utilities
  "NEE", "DUK"
)


getSymbols(tickers, from = "2010-01-01", auto.assign = TRUE)

prices_list <- lapply(tickers, function(tk) {
  Ad(get(tk))
})

prices <- do.call(merge, prices_list)
colnames(prices) <- tickers

returns_list <- lapply(colnames(prices), function(tk) {
  monthlyReturn(prices[, tk], type = "log")
})

returns <- do.call(merge, returns_list)
colnames(returns) <- colnames(prices)

returns <- na.omit(returns)

returns <- returns * 100
```

### Time-series view

```{r time-series-long}
plot_df <- returns %>%
  as.data.frame() %>%
  rownames_to_column(var = "date") %>%
  mutate(date = as.Date(date)) %>%
  pivot_longer(
    cols = -date,
    names_to = "stock",
    values_to = "ret"
  )

ggplot(plot_df, aes(x = date, y = ret, color = stock)) +
  geom_line(alpha = 0.7, linewidth = 0.5) +
  theme_minimal() +
  theme(legend.position = "bottom") +
  labs(
    title = "Monthly stock returns",
    x = NULL,
    y = "Monthly return (%)",
    color = "Stock"
  )
```


### Correlation matrix

Factor analysis is about explaining common dependence, so the correlation matrix is a natural starting point.

```{r observed-correlations}
X <- returns

cor_mat <- cor(X)
cor_df <- as.data.frame(as.table(cor_mat))

p_cor <- ggplot(cor_df, aes(Var2, Var1, fill = Freq)) +
  geom_tile() +
  scale_fill_gradient2(low = "#2166AC", mid = "white", high = "#B2182B", midpoint = 0) +
  coord_equal() +
  theme_minimal() +
  labs(title = "Industry return correlations", x = NULL, y = NULL, fill = "Corr") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

p_cor
```
We see here that the correlations are mostly positive, and there are clustering structure as expected, indicating the presence of a few latent factors.


### Visualize each stock marginally

```{r}
X_long <- as.data.frame(X) %>%
  pivot_longer(cols = everything(),
               names_to = "Stock",
               values_to = "Return")

ggplot(X_long, aes(x = Stock, y = Return)) +
  geom_boxplot() +
  labs(title = "Monthly Returns by Stock",
       y = "Return (%)") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
```

```{r}
vars <- apply(X, 2, var, na.rm = TRUE)
# variance table
var_df <- data.frame(
  Industry = colnames(X),
  Variance = vars
)

ggplot(var_df, aes(x = Industry, y = Variance)) +
  geom_col() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(title = "Variance by Stock")
```
We see moderate marginal volatilities differences across stocks.

## Choosing the number of factors

### Scree plot

We first look at the eigenvalues of the sample correlation matrix. 

```{r scree}
X_std <- scale(X, center = TRUE, scale = TRUE)
pca_full <- prcomp(X_std, center = FALSE, scale. = FALSE)
eig <- pca_full$sdev^2
scree_df <- tibble(component = seq_along(eig), eigenvalue = eig)

ggplot(scree_df, aes(component, eigenvalue)) +
  geom_line() +
  geom_point(size = 2) +
  theme_minimal() +
  scale_x_continuous(breaks = scree_df$component) +
  labs(title = "Scree plot (correlation matrix)",
       x = "Component / factor index",
       y = "Eigenvalue")
```

We do not see a clear elbow.

### Parallel analysis

We compare the observed eigenvalues with those from random noise matrices having the same dimensions.

```{r parallel-analysis}
set.seed(24620)
B <- 500
n <- nrow(X_std)
p <- ncol(X_std)

rand_eigs <- replicate(B, {
  X_rand <- matrix(rnorm(n * p), nrow = n, ncol = p)
  eigen(cor(X_rand), symmetric = TRUE, only.values = TRUE)$values
})

pa_mean <- rowMeans(rand_eigs)
pa_q95 <- apply(rand_eigs, 1, quantile, probs = 0.95)

pa_df <- tibble(
  k = seq_len(p),
  observed = eig,
  null_mean = pa_mean,
  null_q95 = pa_q95
) %>%
  pivot_longer(-k, names_to = "series", values_to = "value")

ggplot(pa_df, aes(k, value, color = series)) +
  geom_line() +
  geom_point(size = 1.8) +
  theme_minimal() +
  scale_x_continuous(breaks = seq_len(p)) +
  labs(title = "Parallel analysis", x = "Index", y = "Eigenvalue", color = NULL)
```

A simple rule is to keep factors whose observed eigenvalues exceed the null benchmark.

```{r pa-count}
pa_m <- sum(eig > pa_q95)
pa_m
```

PA suggests 3 factors.

### Fit several FA models

We fit several candidate models and compare their residual fit. Here we use `factanal()` on the data matrix.

```{r fit-fa-grid}
fit_grid <- function(X, m_values = 1:5) {
  map(m_values, ~ factanal(X, factors = .x, rotation = "none", scores = "regression")) %>%
    setNames(paste0("m", m_values))
}

fa_fits <- fit_grid(X, m_values = 1:5)
fa_fits$m2
```

### Residual correlation diagnostics

After fitting FA, we compare the observed covariance (or correlation) with the model-implied one:
\[
\hat \Sigma = \hat L \hat L^\top + \hat\Psi.
\]

To assess model fit, we examine the residual matrix:
\[
\text{Residual} = R - \hat R.
\]
We convert covariance matrix to correlation matrix using the `cov2cor` function.
We summarize residual fit using the off-diagonal RMSR.

```{r residual-diagnostics}
fa_residual_cor <- function(fit, S) {
  L <- unclass(fit$loadings)
  Psi <- diag(fit$uniquenesses)
  Sigma_hat <- L %*% t(L) + Psi

  R_obs <- cov2cor(S)
  R_hat <- cov2cor(Sigma_hat)

  R_obs - R_hat
}

rmsr_offdiag <- function(M) {
  idx <- row(M) < col(M)
  sqrt(mean(M[idx]^2))
}

diag_tbl <- imap_dfr(fa_fits, function(fit, nm) {
  resid <- fa_residual_cor(fit, cor(X_std))
  tibble(
    model = nm,
    m = fit$factors,
    rmsr = rmsr_offdiag(resid),
    max_abs_resid = max(abs(resid[row(resid) < col(resid)]))
  )
})

diag_tbl
```

## Choose a working model


Let's first use 3 factors for FA.

### Inspect the chosen FA model

The R output also shows you the p-value from likelihood-ratio test, testing whether having these factors are enough assuming data coming from a Gaussian factor analysis model.

```{r fit-chosen}
m_star <- 3
fa_raw <- factanal(X, factors = m_star, rotation = "none", scores = "regression")
fa_rot <- factanal(X, factors = m_star, rotation = "varimax", scores = "regression")

fa_rot
```

### Uniquenesses and communalities

```{r uniquenesses-communalities}
fa_summary <- tibble(
  industry = names(fa_rot$uniquenesses),
  uniqueness = as.numeric(fa_rot$uniquenesses),
  communality = 1 - uniqueness
) %>%
  arrange(desc(communality))

fa_summary
```

Interpretation:

- A **high communality** means that the industry's return variation is largely explained by the common factors.
- A **high uniqueness** means that the industry retains a lot of idiosyncratic variation after accounting for common factors.

We can see that `MSFT` and `AAPL` still have large uniqueness, also the p-value is still very small. So maybe we actually need more factors, which is consistent with our intuition. 


```{r residual-heatmap}
resid_mat <- fa_residual_cor(fa_rot, cor(X_std))
resid_df <- as.data.frame(as.table(resid_mat))

ggplot(resid_df, aes(Var2, Var1, fill = Freq)) +
  geom_tile() +
  scale_fill_gradient2(low = "#2166AC", mid = "white", high = "#B2182B", midpoint = 0) +
  coord_equal() +
  theme_minimal() +
  labs(title = paste("Residual correlations: FA with", m_star, "factors"),
       x = NULL, y = NULL, fill = "Residual") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
```
This plot helps decide whether the chosen factor model is still missing systematic structure. We see that `MSFT` and `AAPL` still have strong residual correlation, that is evidence that `m` may still be too small or that the factor model is not capturing the dependence pattern well.

## Revised factor model: adding a fourth factor

```{r fit-new}
m_star <- m_star + 1
fa_raw <- factanal(X, factors = m_star, rotation = "none", scores = "regression")
fa_rot <- factanal(X, factors = m_star, rotation = "varimax", scores = "regression")

fa_rot
```

```{r uniquenesses-communalities-new}
fa_summary <- tibble(
  industry = names(fa_rot$uniquenesses),
  uniqueness = as.numeric(fa_rot$uniquenesses),
  communality = 1 - uniqueness
) %>%
  arrange(desc(communality))

fa_summary
```

```{r residual-heatmap-new}
resid_mat <- fa_residual_cor(fa_rot, cor(X_std))
resid_df <- as.data.frame(as.table(resid_mat))

ggplot(resid_df, aes(Var2, Var1, fill = Freq)) +
  geom_tile() +
  scale_fill_gradient2(low = "#2166AC", mid = "white", high = "#B2182B", midpoint = 0) +
  coord_equal() +
  theme_minimal() +
  labs(title = paste("Residual correlations: FA with", m_star, "factors"),
       x = NULL, y = NULL, fill = "Residual") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
```

The fourth factor seems to mainly explain `MSFT`, but the test does become insignificant, and there are no remaining large correlations left. So we should still include this factor.


## Rotation: unrotated vs varimax loadings

```{r loading-helpers}
loadings_to_df <- function(fit, label) {
  as.data.frame(unclass(fit$loadings)) %>%
    tibble::rownames_to_column("stock") %>%
    pivot_longer(-stock, names_to = "factor", values_to = "loading") %>%
    mutate(method = label)
}

load_df <- bind_rows(
  loadings_to_df(fa_raw, "Unrotated FA"),
  loadings_to_df(fa_rot, "Varimax FA")
)

load_df$stock <- factor(load_df$stock, levels = rownames(fa_raw$loadings))
```

```{r loadings-heatmap}
ggplot(load_df, aes(factor, stock, fill = loading)) +
  geom_tile() +
  scale_fill_gradient2(low = "#2166AC", mid = "white", high = "#B2182B", midpoint = 0) +
  facet_wrap(~ method, nrow = 1) +
  theme_minimal() +
  labs(title = "FA loadings: unrotated vs varimax",
       x = NULL, y = NULL, fill = "Loading")
```
In practice, the unrotated loadings often look diffuse: many stocks may load moderately on multiple factors. After varimax rotation, the pattern is usually easier to narrate because some factors become more concentrated on particular groups of stocks.

- Factor 1: Energy
- Factor 2: Bank
- Factor 3: Utility
- Factor 4: High Tech

## Factor scores

The regression scores provide estimated latent-factor realizations over time.

```{r factor-scores}
Fhat <- fa_rot$scores %>%
      as.data.frame() %>%
      rownames_to_column(var = "date") %>%
      mutate(date = as.Date(date))


Fhat_long <- Fhat %>%
  pivot_longer(-date, names_to = "factor", values_to = "score")

ggplot(Fhat_long, aes(date, score, color = factor)) +
  geom_line(linewidth = 0.5) +
  theme_minimal() +
  theme(legend.position = "bottom") +
  labs(title = "Estimated factor scores over time",
       x = NULL, y = "Score", color = "Factor")
```

## Comparison with PCA

We now compare FA with PCA on the standardized matrix.

```{r pca-fit}
pca <- prcomp(X_std, center = FALSE, scale. = FALSE)
round(pca$rotation[, 1:m_star], 3)
```

### PCA vs FA loadings heatmap

```{r pca-vs-fa-loadings}
pca_load_df <- as.data.frame(pca$rotation[, 1:m_star, drop = FALSE]) %>%
  tibble::rownames_to_column("stock") %>%
  pivot_longer(-stock, names_to = "factor", values_to = "loading") %>%
  mutate(method = "PCA")

fa_rot_df <- loadings_to_df(fa_rot, "FA (varimax)")
plot_load_compare <- bind_rows(pca_load_df, fa_rot_df)
plot_load_compare$stock <- factor(plot_load_compare$stock, levels = rownames(fa_rot$loadings))

ggplot(plot_load_compare, aes(factor, stock, fill = loading)) +
  geom_tile() +
  scale_fill_gradient2(low = "#2166AC", mid = "white", high = "#B2182B", midpoint = 0) +
  facet_wrap(~ method, nrow = 1) +
  theme_minimal() +
  labs(title = "PCA loadings vs rotated FA loadings",
       x = NULL, y = NULL, fill = "Loading")
```
PCA loadings describe orthogonal variance-maximizing directions, while FA loadings describe how observed variables depend on latent common factors plus uniquenesses. After rotation, FA gives clearer clustering structure of the variables. In contrast, PCA learns the shared market effect as the first PC.

## Key takeaways

- FA is a model for the covariance/correlation structure, not just a data-reduction device.
- The number of factors should be chosen using both **eigenvalue diagnostics** and **post-fit residual checks**.
- Rotation can be helpful for interpretation.
- PCA and FA may look similar in practice, but their objectives are different.
