Run this once, if needed, to install the packages used in this demo:
install.packages(c(
"ggplot2", "frenchdata",
"httr2", "glue", "patchwork", "scales"
))
This R demo studies one monthly finance dataset from July 1963 to
February 2026. There are two blocks of variables, X and Y:

- X = 4 Fama-French style factors plus 7 macro/financial predictors from FRED.
- Y = monthly value-weighted returns on the 10 Kenneth French industry portfolios.

X collects broad economic and market variables that may help explain common
movements in returns, while Y collects returns from different parts of the
economy. We use the same dataset throughout to compare three methods --
canonical correlation analysis (CCA), multivariate regression, and
reduced-rank regression (RRR) -- for relating or predicting Y from X.

# Main packages for the analysis itself:
# - ggplot2 for all lecture plots
# - frenchdata for the Kenneth French datasets
# The other packages listed above are helper packages used only in a few spots below.
# Attach the plotting and data-download packages used throughout the demo.
library(ggplot2)
library(frenchdata)
# One minimal theme keeps all lecture figures visually consistent.
theme_set(ggplot2::theme_minimal(base_size = 13))
# Fix the RNG so the random train/test split and the CV folds reproduce.
set.seed(32950)
We first download the data and treat each month as one observation.
# Convert a Kenneth French YYYYMM month label (string or number) into a
# Date anchored at the first day of that month.
parse_ff_month <- function(x) {
  as.Date(sprintf("%s01", x), format = "%Y%m%d")
}
download_fred_csv <- function(series_id) {
  # Download one FRED series and return a tidy data.frame with columns
  # date / value / series. Missing observations become NA.
  url <- glue::glue("https://fred.stlouisfed.org/graph/fredgraph.csv?id={series_id}")
  req <- httr2::request(url)
  req <- httr2::req_options(req, http_version = 2L)
  resp <- httr2::req_perform(req)
  fred_df <- read.csv(text = httr2::resp_body_string(resp), stringsAsFactors = FALSE)
  # Guard against a format change: we expect exactly a date column plus
  # one value column before blindly renaming.
  stopifnot(ncol(fred_df) == 2L)
  names(fred_df) <- c("date", "value")
  fred_df$date <- as.Date(fred_df$date)
  # FRED CSVs encode missing observations as "."; map them to NA explicitly
  # so as.numeric() does not emit "NAs introduced by coercion" warnings.
  fred_df$value[fred_df$value == "."] <- NA
  fred_df$value <- as.numeric(fred_df$value)
  fred_df$series <- series_id
  fred_df
}
# Kenneth French data -----------------------------------------------------
# Pull the five-factor file and the 10-industry portfolio returns, convert
# the YYYYMM labels to Dates, and clip both to the common sample window.
ff5_raw <- download_french_data("Fama/French 5 Factors (2x3)")
ind10_raw <- download_french_data("10 Industry Portfolios")

ff5_tbl <- as.data.frame(ff5_raw$subsets$data[[1]])
ff5_tbl <- ff5_tbl[, c("date", "Mkt-RF", "SMB", "HML", "RMW", "CMA")]
names(ff5_tbl) <- c("date", "Mkt_RF", "SMB", "HML", "RMW", "CMA")
ff5_tbl$date <- parse_ff_month(ff5_tbl$date)
ff5_in_window <- ff5_tbl$date >= as.Date("1963-07-01") &
  ff5_tbl$date <= as.Date("2026-02-01")
ff5_tbl <- ff5_tbl[ff5_in_window, ]

industry_tbl <- as.data.frame(ind10_raw$subsets$data[[1]])
industry_tbl <- industry_tbl[, c(
  "date", "NoDur", "Durbl", "Manuf", "Enrgy",
  "HiTec", "Telcm", "Shops", "Hlth", "Utils", "Other"
)]
industry_tbl$date <- parse_ff_month(industry_tbl$date)
industry_in_window <- industry_tbl$date >= as.Date("1963-07-01") &
  industry_tbl$date <= as.Date("2026-02-01")
industry_tbl <- industry_tbl[industry_in_window, ]
# FRED data ---------------------------------------------------------------
# Download each macro/financial series, stack them in long format, and
# pivot to one wide row per month.
fred_ids <- c(
  "UNRATE", "CPIAUCSL", "INDPRO", "PAYEMS", "FEDFUNDS",
  "GS10", "TB3MS", "BAA", "AAA"
)
fred_raw <- do.call(rbind, lapply(fred_ids, download_fred_csv))
fred_raw <- reshape(
  fred_raw[, c("date", "series", "value")],
  idvar = "date",
  timevar = "series",
  direction = "wide"
)
# reshape() names the widened columns "value.<series_id>"; drop the prefix
# so each column is just the FRED series id.
names(fred_raw) <- sub("^value\\.", "", names(fred_raw))
fred_raw <- fred_raw[order(fred_raw$date), ]
# Year-over-year log growth in percent. The 12 leading NAs keep the output
# aligned with the original monthly index. NOTE(review): this assumes the
# rows are contiguous months with no gaps -- confirm against the FRED pull.
yoy_log_growth <- function(x) {
  c(rep(NA_real_, 12), 100 * diff(log(x), lag = 12))
}

# Assemble the macro predictor block: rates/spreads in levels, price and
# activity series as 12-month log growth rates.
macro_tbl <- data.frame(
  date = fred_raw$date,
  unemployment_rate = fred_raw$UNRATE,
  inflation_yoy = yoy_log_growth(fred_raw$CPIAUCSL),
  industrial_production_yoy = yoy_log_growth(fred_raw$INDPRO),
  payrolls_yoy = yoy_log_growth(fred_raw$PAYEMS),
  fed_funds = fred_raw$FEDFUNDS,
  term_spread = fred_raw$GS10 - fred_raw$TB3MS,
  default_spread = fred_raw$BAA - fred_raw$AAA
)
macro_tbl <- macro_tbl[macro_tbl$date >= as.Date("1963-07-01") &
macro_tbl$date <= as.Date("2026-02-01"), ]
# Merge and clean ---------------------------------------------------------
# Inner-join all three sources on month, then keep only complete rows.
full_tbl <- Reduce(
  function(left, right) merge(left, right, by = "date"),
  list(ff5_tbl, macro_tbl, industry_tbl)
)
full_tbl <- full_tbl[order(full_tbl$date), ]
analysis_tbl <- full_tbl[complete.cases(full_tbl), ]
# Rows lost to missingness (mostly the initial 12-month growth-rate lags).
rows_dropped <- nrow(full_tbl) - nrow(analysis_tbl)

# Predictor block: four style factors plus seven macro/financial series.
x_vars <- c(
  "SMB", "HML", "RMW", "CMA",
  "unemployment_rate", "inflation_yoy", "industrial_production_yoy",
  "payrolls_yoy", "fed_funds", "term_spread", "default_spread"
)
# Response block: the ten industry portfolio returns.
y_vars <- c(
  "NoDur", "Durbl", "Manuf", "Enrgy", "HiTec",
  "Telcm", "Shops", "Hlth", "Utils", "Other"
)

X_raw <- as.matrix(analysis_tbl[, x_vars])
Y_raw <- as.matrix(analysis_tbl[, y_vars])
# Center only; variables keep their original scale throughout.
X_centered <- scale(X_raw, center = TRUE, scale = FALSE)
Y_centered <- scale(Y_raw, center = TRUE, scale = FALSE)

n <- nrow(X_centered)
p <- ncol(X_centered)
q <- ncol(Y_centered)
We center the variables before fitting the methods, but we do not rescale them.
The macro variables are transformed in a simple way: rates and spreads enter in levels, while CPI, industrial production, and payrolls enter as 12-month (year-over-year) log growth rates.
For CCA, centering is enough for the main canonical-correlation story
here. For regression and reduced-rank regression, keeping Y
on its original scale also makes the prediction interpretation more
direct.
Rows with missing values are dropped only after all transformations are created. In practice, most missingness here comes from the initial 12-month lags used in the growth-rate variables.
# One-row overview of the final analysis sample: date range, dimensions,
# and how many rows were lost to the 12-month growth-rate lags.
data.frame(
sample_start = min(analysis_tbl$date),
sample_end = max(analysis_tbl$date),
n = n,
p = p,
q = q,
rows_dropped_after_transformations = rows_dropped
)
# Human-readable descriptions of the predictor block, for display below.
x_variable_table <- data.frame(
variable = x_vars,
meaning = c(
"Size factor: small stocks minus big stocks",
"Value factor: high book-to-market minus low book-to-market",
"Profitability factor: robust minus weak profitability firms",
"Investment factor: conservative minus aggressive investment firms",
"Unemployment rate",
"Year-over-year CPI inflation rate",
"Year-over-year industrial production growth",
"Year-over-year payroll employment growth",
"Federal funds rate",
"Term spread: 10-year Treasury yield minus 3-month Treasury bill rate",
"Default spread: BAA corporate yield minus AAA corporate yield"
),
stringsAsFactors = FALSE
)
# Human-readable descriptions of the response block (industry portfolios).
y_variable_table <- data.frame(
variable = y_vars,
meaning = c(
"Consumer nondurables",
"Consumer durables",
"Manufacturing",
"Energy",
"High technology",
"Telecommunications",
"Shops / retail-wholesale",
"Healthcare",
"Utilities",
"Other industries"
),
stringsAsFactors = FALSE
)
# Display both lookup tables.
x_variable_table
y_variable_table
Some interpretation:

- X deliberately excludes the market factor Mkt-RF; leaving Mkt-RF out makes it easier to see how the remaining style factors and macro-financial variables contribute to shared structure across industries.
- CCA treats the two blocks symmetrically. It asks: which linear combination of X and which linear combination of Y are most strongly correlated?
This section first fits CCA on the full sample, then uses a train-test split to show that strong in-sample canonical correlations may weaken out of sample.
Before fitting CCA, it is useful to look at the ordinary pairwise
correlations between the variables in X and the variables
in Y.
# Flatten a (possibly named) matrix into long format with one row per cell,
# using caller-supplied column names. Row/column labels come back as
# character vectors, which is what the ggplot heatmaps below expect.
make_heatmap_df <- function(M, row_name = "row", col_name = "col", value_name = "value") {
  long <- as.data.frame(as.table(M), stringsAsFactors = FALSE)
  names(long) <- c(row_name, col_name, value_name)
  for (label_col in c(row_name, col_name)) {
    long[[label_col]] <- as.character(long[[label_col]])
  }
  long
}
# Cross-correlation matrix between the two blocks; inputs are centered,
# so this equals the correlation of the raw variables.
xy_cor_mat <- cor(X_centered, Y_centered)
xy_cor_df <- make_heatmap_df(xy_cor_mat, row_name = "x_variable", col_name = "y_variable", value_name = "correlation")
# Show the 15 strongest pairwise relationships (by absolute value) first.
xy_cor_top <- xy_cor_df[order(abs(xy_cor_df$correlation), decreasing = TRUE), ]
head(xy_cor_top, 15)
# Heatmap of all X-by-Y correlations; diverging palette centered at zero.
ggplot(xy_cor_df, aes(x = y_variable, y = x_variable, fill = correlation)) +
geom_tile(color = "white") +
scale_fill_gradient2(low = "#2166AC", mid = "white", high = "#B2182B") +
labs(
title = "Pairwise Correlations Between X and Y",
x = NULL,
y = NULL,
fill = "Corr."
) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
This helps explain why the first canonical variate on the
X
side may be dominated by one of the Fama-French factors: some of those
factors have visibly stronger direct correlation with the industry
returns than the macro variables do.
The output of R's CCA function, cancor(), is limited: it only provides the canonical correlations and the weight vectors, so we compute the canonical variates (scores) and the loadings ourselves.
# Fit CCA on the pre-centered blocks; cancor() returns the canonical
# correlations plus the weight (coefficient) matrices only.
cca_fit <- cancor(X_centered, Y_centered, xcenter = FALSE, ycenter = FALSE)

# Canonical variates (scores): one column per canonical pair.
U <- X_centered %*% cca_fit$xcoef
V <- Y_centered %*% cca_fit$ycoef

canonical_correlations <- data.frame(
  pair = seq_along(cca_fit$cor),
  canonical_correlation = cca_fit$cor
)

# Helper: convert a variables-by-pairs matrix into a data.frame with a
# `variable` column and Can1, Can2, ... columns. Replaces six nearly
# identical data.frame/rename stanzas from the original draft.
as_can_table <- function(mat) {
  out <- data.frame(variable = rownames(mat), mat,
                    row.names = NULL, check.names = FALSE)
  names(out)[-1] <- paste0("Can", seq_len(ncol(mat)))
  out
}

# Weights: the coefficients defining each canonical variate.
x_weights <- as_can_table(cca_fit$xcoef)
y_weights <- as_can_table(cca_fit$ycoef)

# Loadings: correlation of each variable with its own side's scores.
x_loadings_mat <- cor(X_centered, U)
y_loadings_mat <- cor(Y_centered, V)
x_loadings <- as_can_table(x_loadings_mat)
y_loadings <- as_can_table(y_loadings_mat)

# Cross-loadings: correlation with the opposite side's canonical score.
# Algebraically this equals loading * canonical correlation, so we just
# rescale the loading columns.
canonical_corr_vec <- cca_fit$cor
x_cross_loadings_mat <- sweep(x_loadings_mat, 2, canonical_corr_vec, "*")
y_cross_loadings_mat <- sweep(y_loadings_mat, 2, canonical_corr_vec, "*")
x_cross_loadings <- as_can_table(x_cross_loadings_mat)
y_cross_loadings <- as_can_table(y_cross_loadings_mat)
# Reorder a data.frame's rows by descending absolute value of one column.
sort_by_abs <- function(df, column_name) {
  ranking <- order(abs(df[[column_name]]), decreasing = TRUE)
  df[ranking, ]
}
# Round every numeric column of a data.frame for display; non-numeric
# columns pass through untouched.
round_table <- function(df, digits = 3) {
  result <- df
  is_num <- vapply(result, is.numeric, logical(1))
  result[is_num] <- lapply(result[is_num], function(col) round(col, digits = digits))
  result
}
# Canonical correlations for all pairs.
round_table(canonical_correlations, digits = 3)
# Raw weights (coefficients) defining the first four canonical variates.
round_table(x_weights[, c("variable", "Can1", "Can2", "Can3", "Can4")], digits = 3)
round_table(y_weights[, c("variable", "Can1", "Can2", "Can3", "Can4")], digits = 3)
# Loadings, sorted by absolute size on the first pair.
round_table(
sort_by_abs(x_loadings[, c("variable", "Can1", "Can2", "Can3", "Can4")], "Can1"),
digits = 3
)
round_table(
sort_by_abs(y_loadings[, c("variable", "Can1", "Can2", "Can3", "Can4")], "Can1"),
digits = 3
)
# Cross-loadings (correlation with the opposite side's score), same sort.
round_table(
sort_by_abs(x_cross_loadings[, c("variable", "Can1", "Can2", "Can3", "Can4")], "Can1"),
digits = 3
)
round_table(
sort_by_abs(y_cross_loadings[, c("variable", "Can1", "Can2", "Can3", "Can4")], "Can1"),
digits = 3
)
# Squared loadings have an R^2 interpretation: each squared loading is the
# share of one variable's variance carried by that canonical pair.
# Reshape the first four loading columns to long format for the heatmaps.
cca_x_loadings_long <- do.call(rbind, lapply(paste0("Can", 1:4), function(pair_name) {
  data.frame(
    variable = x_loadings$variable,
    pair = pair_name,
    loading = x_loadings[[pair_name]]
  )
}))
cca_y_loadings_long <- do.call(rbind, lapply(paste0("Can", 1:4), function(pair_name) {
  data.frame(
    variable = y_loadings$variable,
    pair = pair_name,
    loading = y_loadings[[pair_name]]
  )
}))
# Heatmap of X-side loadings; rows ordered by loading for readability.
cca_x_loadings_plot <- ggplot(cca_x_loadings_long, aes(x = pair, y = reorder(variable, loading), fill = loading)) +
geom_tile(color = "white") +
scale_fill_gradient2(low = "#2166AC", mid = "white", high = "#B2182B") +
labs(
title = "X Loadings",
x = NULL,
y = NULL,
fill = "Loading"
)
# Matching heatmap of Y-side loadings.
cca_y_loadings_plot <- ggplot(cca_y_loadings_long, aes(x = pair, y = reorder(variable, loading), fill = loading)) +
geom_tile(color = "white") +
scale_fill_gradient2(low = "#2166AC", mid = "white", high = "#B2182B") +
labs(
title = "Y Loadings",
x = NULL,
y = NULL,
fill = "Loading"
)
# Show both side by side.
patchwork::wrap_plots(cca_x_loadings_plot, cca_y_loadings_plot, ncol = 2)
Short interpretation of the first four canonical pairs:
Pair 1: HML and CMA on
the X side; HiTec on the Y
side.
This links value/investment variation to a technology-heavy return
direction.
Pair 2: RMW and SMB on
the X side; Other, Durbl,
HiTec, and Telcm on the Y
side.
This suggests a profitability/size direction tied to a broad
cyclical-growth industry pattern.
Pair 3: SMB and RMW on
the X side; Shops, NoDur,
Manuf, and Durbl on the Y
side.
This looks like a size/profitability direction linked to consumer and
manufacturing industries.
Pair 4: payroll growth, default spread, and
industrial production growth on the X side;
Durbl and Telcm on the Y
side.
This is more macro-financial, linking real activity and credit
conditions to these industries.
CCA is fitted to maximize correlation on the observed sample. A useful check is to see whether the same directions still have high correlation on held-out data.
# Random 70/30 train/test split of months. The seed is reset so this chunk
# reproduces on its own; all centering uses TRAINING means only, so no
# information leaks from the held-out months.
set.seed(32950)
train_id <- sample(c("train", "test"), size = n, replace = TRUE, prob = c(0.7, 0.3))
in_train <- train_id == "train"

X_train_raw <- X_raw[in_train, , drop = FALSE]
Y_train_raw <- Y_raw[in_train, , drop = FALSE]
X_test_raw <- X_raw[!in_train, , drop = FALSE]
Y_test_raw <- Y_raw[!in_train, , drop = FALSE]

x_train_mean <- colMeans(X_train_raw)
y_train_mean <- colMeans(Y_train_raw)
X_train <- sweep(X_train_raw, 2, x_train_mean, "-")
Y_train <- sweep(Y_train_raw, 2, y_train_mean, "-")
X_test <- sweep(X_test_raw, 2, x_train_mean, "-")
Y_test <- sweep(Y_test_raw, 2, y_train_mean, "-")

# Canonical directions are estimated on the training months only, then the
# same weights are applied to the test months.
cca_train <- cancor(X_train, Y_train, xcenter = FALSE, ycenter = FALSE)
U_train <- X_train %*% cca_train$xcoef
V_train <- Y_train %*% cca_train$ycoef
U_test <- X_test %*% cca_train$xcoef
V_test <- Y_test %*% cca_train$ycoef

train_corr <- diag(cor(U_train, V_train))
test_corr <- diag(cor(U_test, V_test))
cca_split_tbl <- data.frame(
  pair = seq_along(train_corr),
  train_correlation = train_corr,
  test_correlation = test_corr
)
round_table(cca_split_tbl, digits = 3)
The canonical correlations are generally smaller on test data. This does not mean CCA is wrong. It shows that sample canonical correlations are optimized in-sample, so some of the strength may be reduced on new data.
We may also split the data into historical months and future months and check if the canonical correlations are stable for the future.
# Chronological 70/30 split: early months train, late months test. This
# checks temporal stability rather than random-resampling stability.
split_point <- floor(0.7 * n)
early_rows <- 1:split_point
late_rows <- (split_point + 1):n

X_train_raw_time <- X_raw[early_rows, , drop = FALSE]
Y_train_raw_time <- Y_raw[early_rows, , drop = FALSE]
X_test_raw_time <- X_raw[late_rows, , drop = FALSE]
Y_test_raw_time <- Y_raw[late_rows, , drop = FALSE]

# Center both periods with TRAINING-period means only.
x_train_mean_time <- colMeans(X_train_raw_time)
y_train_mean_time <- colMeans(Y_train_raw_time)
X_train_time <- sweep(X_train_raw_time, 2, x_train_mean_time, "-")
Y_train_time <- sweep(Y_train_raw_time, 2, y_train_mean_time, "-")
X_test_time <- sweep(X_test_raw_time, 2, x_train_mean_time, "-")
Y_test_time <- sweep(Y_test_raw_time, 2, y_train_mean_time, "-")

cca_time <- cancor(X_train_time, Y_train_time, xcenter = FALSE, ycenter = FALSE)
U_train_time <- X_train_time %*% cca_time$xcoef
V_train_time <- Y_train_time %*% cca_time$ycoef
U_test_time <- X_test_time %*% cca_time$xcoef
V_test_time <- Y_test_time %*% cca_time$ycoef

train_corr_time <- diag(cor(U_train_time, V_train_time))
test_corr_time <- diag(cor(U_test_time, V_test_time))
cca_time_split_tbl <- data.frame(
  pair = seq_along(train_corr_time),
  train_correlation = train_corr_time,
  test_correlation = test_corr_time
)
round_table(cca_time_split_tbl, digits = 3)
# Stack the four train/test correlation profiles into long format so both
# splits can be drawn on one plot.
cca_compare_long <- rbind(
data.frame(
pair = cca_split_tbl$pair,
sample = "Random split: train",
correlation = cca_split_tbl$train_correlation
),
data.frame(
pair = cca_split_tbl$pair,
sample = "Random split: test",
correlation = cca_split_tbl$test_correlation
),
data.frame(
pair = cca_time_split_tbl$pair,
sample = "Time split: train",
correlation = cca_time_split_tbl$train_correlation
),
data.frame(
pair = cca_time_split_tbl$pair,
sample = "Time split: test",
correlation = cca_time_split_tbl$test_correlation
)
)
# Color distinguishes train vs test within each split; linetype doubles the
# encoding so the two splits remain distinguishable in grayscale.
ggplot(cca_compare_long, aes(x = pair, y = correlation, color = sample, linetype = sample)) +
geom_point(size = 2.5) +
geom_line() +
scale_x_continuous(breaks = cca_split_tbl$pair) +
scale_color_manual(
values = c(
"Random split: train" = "#2C7FB8",
"Random split: test" = "#D95F0E",
"Time split: train" = "#7BCCC4",
"Time split: test" = "#F16913"
)
) +
scale_linetype_manual(
values = c(
"Random split: train" = "solid",
"Random split: test" = "solid",
"Time split: train" = "dashed",
"Time split: test" = "dashed"
)
) +
labs(
title = "CCA: Train and Test Canonical Correlations",
# Subtitle spells out the actual date ranges of the chronological split.
subtitle = paste(
"Time split:",
format(min(analysis_tbl$date[1:split_point]), "%Y-%m"),
"to",
format(max(analysis_tbl$date[1:split_point]), "%Y-%m"),
"vs",
format(min(analysis_tbl$date[(split_point + 1):n]), "%Y-%m"),
"to",
format(max(analysis_tbl$date[(split_point + 1):n]), "%Y-%m")
),
x = "Canonical pair",
y = "Correlation",
color = NULL,
linetype = NULL
)
Now we switch from association to prediction. The question becomes:
How do we predict the full vector of industry returns from the predictor block \(X\)?
This is different from CCA. Here X is the predictor
block and Y is the response block.
We need to write the multivariate regression function ourselves.
## Here the data is already assumed centered
# Multivariate OLS via the normal equations: B = (X'X)^{-1} X'Y. All
# responses share the same projection, so a single solve() produces the
# whole p x q coefficient matrix at once.
fit_multivariate_ols <- function(X, Y) {
  gram <- crossprod(X)
  coef_mat <- solve(gram, crossprod(X, Y))
  list(B_ols = coef_mat, Yhat_ols = X %*% coef_mat)
}
# Fit the unrestricted multivariate regression on the full centered sample.
ols_fit <- fit_multivariate_ols(X_centered, Y_centered)
B_ols <- ols_fit$B_ols
Yhat_ols <- ols_fit$Yhat_ols

# Per-response in-sample R^2. Y is centered, so colSums(Y_centered^2) is
# each response's total sum of squares.
residual_ss <- colSums((Y_centered - Yhat_ols)^2)
response_r2 <- 1 - residual_ss / colSums(Y_centered^2)
response_r2_tbl <- data.frame(
  response = colnames(Y_centered),
  r_squared = response_r2
)

# Compare OLS R^2 with squared Y-side CCA cross-loadings.
# Each squared cross-loading is the share of variance in one response
# explained by a single X-side canonical score.
cca_y_cross_sq_tbl <- data.frame(
  response = y_cross_loadings$variable,
  CCA1_cross_loadings_sq = y_cross_loadings$Can1^2
)
regression_vs_cca_tbl <- merge(response_r2_tbl, cca_y_cross_sq_tbl, by = "response")
regression_vs_cca_tbl <- regression_vs_cca_tbl[
  order(regression_vs_cca_tbl$r_squared, decreasing = TRUE),
]
round_table(regression_vs_cca_tbl, digits = 3)
The regression \(R^2\) represents the squared correlation between the observed and predicted response for each outcome. In multivariate regression, this quantity is maximized separately for each response, since each outcome can use its own optimal linear combination of \(X\).
In contrast, the correlations between each \(Y\) and the first canonical variate on the \(X\) side are generally lower. This is because CCA has a different objective: it seeks a single linear combination of \(X\) that is maximally correlated with a corresponding linear combination of \(Y\). As a result, CCA captures shared structure across responses rather than optimizing prediction for each one individually.
OLS uses a full \(p \times q\)
coefficient matrix.
RRR instead imposes rank(B) <= r, so the responses are
predicted through only a few shared latent directions.
The key computational idea is simple: first compute the OLS fitted values, then keep only the top r fitted-value directions. So RRR is the best rank-r approximation to the OLS fitted values, and the main practical question is how to choose r.
We need to write our own function for RRR.
# Reduced-rank regression: take the SVD of the OLS fitted values and
# project the coefficient matrix onto the top `rank` right singular
# vectors. rank = 0 returns the all-zero fit as a baseline.
fit_rrr <- function(X, Y, rank) {
  ols <- fit_multivariate_ols(X, Y)
  if (rank == 0) {
    B_r <- matrix(0, nrow = ncol(X), ncol = ncol(Y))
    fitted_r <- matrix(0, nrow = nrow(X), ncol = ncol(Y))
    sing_vals <- rep(0, min(ncol(X), ncol(Y)))
  } else {
    decomp <- svd(ols$Yhat_ols)
    V_top <- decomp$v[, seq_len(rank), drop = FALSE]
    B_r <- ols$B_ols %*% V_top %*% t(V_top)
    fitted_r <- X %*% B_r
    sing_vals <- decomp$d
  }
  list(
    rank = rank,
    B_ols = ols$B_ols,
    Yhat_ols = ols$Yhat_ols,
    B_rrr = B_r,
    Yhat_rrr = fitted_r,
    fitted_singular_values = sing_vals
  )
}
For RRR, rank is a tuning parameter. A natural choice is to pick the rank that gives the best out-of-sample prediction accuracy.
We treat the monthly observations as independent when forming folds, so the focus stays on the multivariate methods rather than time-series dependence.
# Train an RRR on raw training data and predict the raw test responses.
# Centering always uses TRAINING means (no test leakage); predictions are
# shifted back onto the original Y scale before returning.
predict_rrr_from_train <- function(X_train_raw, Y_train_raw, X_test_raw, rank) {
  x_means <- colMeans(X_train_raw)
  y_means <- colMeans(Y_train_raw)
  fit <- fit_rrr(
    sweep(X_train_raw, 2, x_means, "-"),
    sweep(Y_train_raw, 2, y_means, "-"),
    rank = rank
  )
  centered_pred <- sweep(X_test_raw, 2, x_means, "-") %*% fit$B_rrr
  sweep(centered_pred, 2, y_means, "+")
}
# 10-fold CV over candidate ranks 0..max_rank. Fold labels are assigned
# randomly: months are treated as exchangeable here (see prose above).
k_folds <- 10
fold_id <- sample(rep(1:k_folds, length.out = n))
# The rank can never exceed the number of responses or the numerical rank
# of the predictor matrix.
max_rank <- min(q, qr(X_centered)$rank)
candidate_ranks <- 0:max_rank
# Held-out prediction error for one CV fold at a given rank: mean over
# test months of the squared error summed across the ten responses.
compute_rrr_fold_error <- function(rank, fold) {
  held_out <- which(fold_id == fold)
  kept <- setdiff(seq_len(n), held_out)
  Yhat_test <- predict_rrr_from_train(
    X_raw[kept, , drop = FALSE],
    Y_raw[kept, , drop = FALSE],
    X_raw[held_out, , drop = FALSE],
    rank = rank
  )
  mean(rowSums((Y_raw[held_out, , drop = FALSE] - Yhat_test)^2))
}
# Evaluate every candidate rank on every fold. vapply (not sapply) pins the
# per-fold result to a length-1 numeric, so an unexpected return shape
# fails loudly instead of silently changing the output type.
cv_error_list <- lapply(candidate_ranks, function(rank) {
  vapply(
    seq_len(k_folds),
    function(fold) compute_rrr_fold_error(rank, fold),
    numeric(1)
  )
})
# Mean CV error and its standard error (SD across folds / sqrt(k)).
cv_results <- data.frame(
  rank = candidate_ranks,
  cv_error = vapply(cv_error_list, mean, numeric(1)),
  cv_se = vapply(cv_error_list, function(errors) sd(errors) / sqrt(k_folds), numeric(1))
)
# Pick the rank with the smallest mean CV error.
best_row <- which.min(cv_results$cv_error)
best_rank <- cv_results$rank[best_row]
# The largest allowable rank imposes no constraint, so it reproduces the
# unrestricted OLS fitted values.
ols_rank <- max(candidate_ranks)
ols_row <- which(cv_results$rank == ols_rank)
# Two-row comparison: the CV-selected RRR vs the OLS-equivalent model.
cv_method_compare_tbl <- data.frame(
  method = c(paste0("RRR (rank ", best_rank, ")"), "Multivariate regression (OLS)"),
  rank = c(best_rank, ols_rank),
  cv_error = c(
    cv_results$cv_error[best_row],
    cv_results$cv_error[ols_row]
  ),
  cv_se = c(
    cv_results$cv_se[best_row],
    cv_results$cv_se[ols_row]
  )
)
# Full CV table: one row per candidate rank.
cv_results
# CV error curve with +/- 1 SE bars. The dashed vertical line marks the
# selected rank; the green point marks the max-rank (OLS-equivalent) model.
ggplot(cv_results, aes(x = rank, y = cv_error)) +
geom_point(size = 2.5, color = "#2C7FB8") +
geom_line(color = "#2C7FB8") +
geom_errorbar(aes(ymin = cv_error - cv_se, ymax = cv_error + cv_se), width = 0.15) +
geom_vline(xintercept = best_rank, linetype = "dashed", color = "#D95F0E") +
geom_point(
data = subset(cv_results, rank == ols_rank),
color = "#238B45",
size = 3.2
) +
scale_x_continuous(breaks = cv_results$rank) +
labs(
title = "Cross-Validated Rank Selection for RRR",
subtitle = glue::glue("Selected rank = {best_rank}; highest rank corresponds to OLS"),
x = "Candidate rank",
y = "Mean held-out prediction error"
)
CV suggests a rank of 3.
# Side-by-side CV comparison of the selected RRR and unrestricted OLS.
round_table(cv_method_compare_tbl, digits = 3)
# Refit RRR at the CV-selected rank on the full centered sample.
rrr_best_fit <- fit_rrr(X_centered, Y_centered, best_rank)
Yhat_rrr_best <- rrr_best_fit$Yhat_rrr
# Per-response in-sample R^2 of the rank-restricted fit (Y is centered,
# so colSums(Y_centered^2) is the total sum of squares).
rrr_response_r2 <- 1 - colSums((Y_centered - Yhat_rrr_best)^2) / colSums(Y_centered^2)
ols_vs_rrr_r2_tbl <- data.frame(
response = colnames(Y_centered),
ols_r_squared = response_r2,
rrr_r_squared = rrr_response_r2
)
ols_vs_rrr_r2_tbl <- ols_vs_rrr_r2_tbl[
order(ols_vs_rrr_r2_tbl$ols_r_squared, decreasing = TRUE),
]
round_table(ols_vs_rrr_r2_tbl, digits = 3)
The in-sample R^2 from the RRR is only slightly smaller than the OLS R^2 for each industry. Since the rank was chosen to minimize cross-validated prediction error, using RRR instead of the unrestricted multivariate regression yields a much simpler (low-rank) coefficient matrix at almost no in-sample cost, which tends to improve generalizability.
This dataset is useful because it gives a clear story:

- X collects broad financial and macroeconomic variables;
- Y collects industry-level stock returns;
- the three methods relate the two blocks, either symmetrically (CCA) or by using X to predict Y (multivariate regression and RRR).