Installation

Run this once if needed:

# Packages used anywhere in this document (loaded or called via ::).
required_pkgs <- c(
  "ggplot2", "frenchdata",
  "httr2", "glue", "patchwork", "scales"
)
install.packages(required_pkgs)

Introduction

This R demo studies one monthly finance dataset from July 1963 to February 2026. There are two blocks of variables X and Y.

X collects broad economic and market variables that may help explain common movements in returns, while Y collects returns from different parts of the economy. We use the same dataset throughout to compare three methods:

  1. CCA: association between two blocks
  2. Multivariate regression: prediction of Y from X
  3. Reduced-rank regression: prediction with shared low-dimensional structure across the responses

Packages and Helper Functions

# Main packages for the analysis itself:
# - ggplot2 for all lecture plots
# - frenchdata for the Kenneth French datasets
# The other packages listed above are helper packages used only in a few spots below.
library(ggplot2)
library(frenchdata)

# One minimal theme applied to every figure in this document.
theme_set(ggplot2::theme_minimal(base_size = 13))

# Fix the RNG so the random train/test split and the CV folds below
# are reproducible across renders.
set.seed(32950)

Data Construction

We first download the data and then treat each month as one observation.

parse_ff_month <- function(x) {
  # Kenneth French files label months as "YYYYMM"; append a day-of-month
  # so each label parses as a Date on the first of that month.
  as.Date(sprintf("%s01", x), format = "%Y%m%d")
}

download_fred_csv <- function(series_id) {
  # Download one FRED series and return it as a tidy data frame with
  # columns date, value, series.
  #
  # series_id: a FRED series code such as "UNRATE".
  # Returns a data.frame; `value` is numeric with NA for missing months.
  url <- glue::glue("https://fred.stlouisfed.org/graph/fredgraph.csv?id={series_id}")
  req <- httr2::request(url)
  req <- httr2::req_options(req, http_version = 2L)
  resp <- httr2::req_perform(req)

  # FRED encodes missing observations as "." in fredgraph.csv; declaring
  # that via na.strings keeps the numeric conversion below from emitting
  # coercion warnings on those rows.
  fred_df <- read.csv(
    text = httr2::resp_body_string(resp),
    stringsAsFactors = FALSE,
    na.strings = "."
  )
  # The renaming below assumes exactly one date and one value column.
  stopifnot(ncol(fred_df) == 2L)
  names(fred_df) <- c("date", "value")
  fred_df$date <- as.Date(fred_df$date)
  fred_df$value <- as.numeric(fred_df$value)
  fred_df$series <- series_id
  fred_df
}
# Kenneth French data -----------------------------------------------------
ff5_raw <- download_french_data("Fama/French 5 Factors (2x3)")
ind10_raw <- download_french_data("10 Industry Portfolios")

# Both French tables are trimmed to the same monthly window.
window_start <- as.Date("1963-07-01")
window_end <- as.Date("2026-02-01")

# Five-factor table: keep the factor columns and make the names syntactic.
ff5_tbl <- as.data.frame(ff5_raw$subsets$data[[1]])
ff5_tbl <- ff5_tbl[, c("date", "Mkt-RF", "SMB", "HML", "RMW", "CMA")]
names(ff5_tbl) <- c("date", "Mkt_RF", "SMB", "HML", "RMW", "CMA")
ff5_tbl$date <- parse_ff_month(ff5_tbl$date)
ff5_tbl <- ff5_tbl[ff5_tbl$date >= window_start & ff5_tbl$date <= window_end, ]

# Ten industry portfolio returns over the same window.
industry_tbl <- as.data.frame(ind10_raw$subsets$data[[1]])
industry_tbl <- industry_tbl[, c("date", "NoDur", "Durbl", "Manuf", "Enrgy",
                                 "HiTec", "Telcm", "Shops", "Hlth", "Utils", "Other")]
industry_tbl$date <- parse_ff_month(industry_tbl$date)
industry_tbl <- industry_tbl[industry_tbl$date >= window_start &
                               industry_tbl$date <= window_end, ]

# FRED data ---------------------------------------------------------------
fred_ids <- c(
  "UNRATE", "CPIAUCSL", "INDPRO", "PAYEMS", "FEDFUNDS",
  "GS10", "TB3MS", "BAA", "AAA"
)

# Download every series, stack them in long format, then spread to one
# column per series keyed by date.
fred_long <- do.call(rbind, lapply(fred_ids, download_fred_csv))
fred_raw <- reshape(
  fred_long[, c("date", "series", "value")],
  idvar = "date",
  timevar = "series",
  direction = "wide"
)
# reshape() names the spread columns "value.<series>"; strip the prefix.
names(fred_raw) <- sub("^value\\.", "", names(fred_raw))
fred_raw <- fred_raw[order(fred_raw$date), ]

# Macro block: rates and spreads enter in levels, trending series enter as
# 12-month log growth. A helper avoids repeating the NA-padding arithmetic.
yoy_log_growth <- function(x) {
  # 12-month log growth in percent, padded with 12 leading NAs so the
  # result aligns month-by-month with x.
  c(rep(NA_real_, 12), 100 * diff(log(x), lag = 12))
}

macro_tbl <- data.frame(
  date = fred_raw$date,
  unemployment_rate = fred_raw$UNRATE,
  inflation_yoy = yoy_log_growth(fred_raw$CPIAUCSL),
  industrial_production_yoy = yoy_log_growth(fred_raw$INDPRO),
  payrolls_yoy = yoy_log_growth(fred_raw$PAYEMS),
  fed_funds = fred_raw$FEDFUNDS,
  term_spread = fred_raw$GS10 - fred_raw$TB3MS,
  default_spread = fred_raw$BAA - fred_raw$AAA
)
macro_tbl <- macro_tbl[macro_tbl$date >= as.Date("1963-07-01") &
                         macro_tbl$date <= as.Date("2026-02-01"), ]

# Merge and clean ---------------------------------------------------------
# Inner-join the three monthly tables on date, then order chronologically.
full_tbl <- merge(merge(ff5_tbl, macro_tbl, by = "date"),
                  industry_tbl, by = "date")
full_tbl <- full_tbl[order(full_tbl$date), ]

# Keep only complete rows (missingness mostly comes from the 12-month lags).
analysis_tbl <- full_tbl[complete.cases(full_tbl), ]
rows_dropped <- nrow(full_tbl) - nrow(analysis_tbl)

# Predictor block: four Fama-French factors plus seven macro series.
x_vars <- c(
  "SMB", "HML", "RMW", "CMA",
  "unemployment_rate", "inflation_yoy", "industrial_production_yoy",
  "payrolls_yoy", "fed_funds", "term_spread", "default_spread"
)

# Response block: the ten industry portfolio returns.
y_vars <- c(
  "NoDur", "Durbl", "Manuf", "Enrgy", "HiTec",
  "Telcm", "Shops", "Hlth", "Utils", "Other"
)

X_raw <- as.matrix(analysis_tbl[, x_vars])
Y_raw <- as.matrix(analysis_tbl[, y_vars])

# Center each block column-wise; deliberately do not rescale (see text).
X_centered <- scale(X_raw, center = TRUE, scale = FALSE)
Y_centered <- scale(Y_raw, center = TRUE, scale = FALSE)

n <- nrow(X_centered)
p <- ncol(X_centered)
q <- ncol(Y_centered)

We center the variables before fitting the methods, but we do not rescale them.

The macro variables are transformed in a simple way:

  • inflation, industrial production, and payrolls enter as 12-month log growth rates;
  • yields and spreads enter in percentage points;
  • unemployment enters in levels.

For CCA, centering is enough for the main canonical-correlation story here. For regression and reduced-rank regression, keeping Y on its original scale also makes the prediction interpretation more direct.

Rows with missing values are dropped only after all transformations are created. In practice, most missingness here comes from the initial 12-month lags used in the growth-rate variables.

# One-row overview of the final analysis sample: date range, dimensions,
# and how many months the complete-case filter removed.
data.frame(
  sample_start = min(analysis_tbl$date),
  sample_end = max(analysis_tbl$date),
  n = n,
  p = p,
  q = q,
  rows_dropped_after_transformations = rows_dropped
)

What the Variables Mean

# Plain-English description of each predictor, in x_vars order.
x_variable_table <- data.frame(
  variable = x_vars,
  meaning = c(
    "Size factor: small stocks minus big stocks",
    "Value factor: high book-to-market minus low book-to-market",
    "Profitability factor: robust minus weak profitability firms",
    "Investment factor: conservative minus aggressive investment firms",
    "Unemployment rate",
    "Year-over-year CPI inflation rate",
    "Year-over-year industrial production growth",
    "Year-over-year payroll employment growth",
    "Federal funds rate",
    "Term spread: 10-year Treasury yield minus 3-month Treasury bill rate",
    "Default spread: BAA corporate yield minus AAA corporate yield"
  ),
  stringsAsFactors = FALSE
)

# Plain-English description of each industry portfolio, in y_vars order.
y_variable_table <- data.frame(
  variable = y_vars,
  meaning = c(
    "Consumer nondurables",
    "Consumer durables",
    "Manufacturing",
    "Energy",
    "High technology",
    "Telecommunications",
    "Shops / retail-wholesale",
    "Healthcare",
    "Utilities",
    "Other industries"
  ),
  stringsAsFactors = FALSE
)

# Print both tables in the rendered document.
x_variable_table
y_variable_table

The tables above give a short description of each variable; the sections below interpret how the two blocks relate.

Performing CCA

CCA treats the two blocks symmetrically. It asks:

Which linear combination of X and which linear combination of Y are most strongly correlated?

This section first fits CCA on the full sample, then uses a train-test split to show that strong in-sample canonical correlations may weaken out of sample.

Pairwise Correlations Between X and Y

Before fitting CCA, it is useful to look at the ordinary pairwise correlations between the variables in X and the variables in Y.

make_heatmap_df <- function(M, row_name = "row", col_name = "col", value_name = "value") {
  # Convert a (named) matrix into a long data frame suitable for
  # geom_tile(): one row per matrix cell, with character row/column labels.
  long_df <- as.data.frame(as.table(M), stringsAsFactors = FALSE)
  colnames(long_df) <- c(row_name, col_name, value_name)
  for (label_col in c(row_name, col_name)) {
    long_df[[label_col]] <- as.character(long_df[[label_col]])
  }
  long_df
}

# Cross-correlation matrix between the centered X and Y blocks, reshaped
# to long format for plotting.
xy_cor_mat <- cor(X_centered, Y_centered)
xy_cor_df <- make_heatmap_df(
  xy_cor_mat,
  row_name = "x_variable",
  col_name = "y_variable",
  value_name = "correlation"
)
# Strongest pairwise relationships first.
xy_cor_top <- xy_cor_df[order(abs(xy_cor_df$correlation), decreasing = TRUE), ]

head(xy_cor_top, 15)
ggplot(xy_cor_df, aes(x = y_variable, y = x_variable, fill = correlation)) +
  geom_tile(color = "white") +
  scale_fill_gradient2(low = "#2166AC", mid = "white", high = "#B2182B") +
  labs(
    title = "Pairwise Correlations Between X and Y",
    x = NULL,
    y = NULL,
    fill = "Corr."
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

This helps explain why the first canonical variate on the X side may be dominated by one of the Fama-French factors: some of those factors have visibly stronger direct correlation with the industry returns than the macro variables do.

CCA Output

The output from R's CCA function is limited: it provides only the canonical correlations and weights, so we must compute the canonical variates and loadings from those ourselves.

cca_fit <- cancor(X_centered, Y_centered, xcenter = FALSE, ycenter = FALSE)

# The number of canonical pairs is length(cca_fit$cor) = min(rank(X), rank(Y)).
# cancor()'s coefficient matrices can carry extra columns beyond that (here
# xcoef has one column per predictor, i.e. 11 columns vs 10 correlations).
# Truncate to the pairs that actually exist; otherwise the sweep() calls
# below recycle the length-10 correlation vector across 11 columns, emitting
# a warning and producing a meaningless final cross-loading column.
n_pairs <- length(cca_fit$cor)
x_coef <- cca_fit$xcoef[, seq_len(n_pairs), drop = FALSE]
y_coef <- cca_fit$ycoef[, seq_len(n_pairs), drop = FALSE]

# canonical variates (scores), one column per canonical pair
U <- X_centered %*% x_coef
V <- Y_centered %*% y_coef

canonical_correlations <- data.frame(
  pair = seq_along(cca_fit$cor),
  canonical_correlation = cca_fit$cor
)

# Helper: named matrix (variables x canonical dims) -> data frame with a
# `variable` column followed by Can1, Can2, ... columns.
canonical_mat_to_df <- function(M) {
  out <- data.frame(variable = rownames(M), M,
                    row.names = NULL, check.names = FALSE)
  names(out)[-1] <- paste0("Can", seq_len(ncol(M)))
  out
}

x_weights <- canonical_mat_to_df(x_coef)
y_weights <- canonical_mat_to_df(y_coef)

# loadings: correlation of each variable with its own side's canonical scores
x_loadings_mat <- cor(X_centered, U)
y_loadings_mat <- cor(Y_centered, V)

x_loadings <- canonical_mat_to_df(x_loadings_mat)
y_loadings <- canonical_mat_to_df(y_loadings_mat)

# cross-loadings: correlation with the opposite-side canonical score,
# which equals the own-side loading scaled by the canonical correlation
canonical_corr_vec <- cca_fit$cor
x_cross_loadings_mat <- sweep(x_loadings_mat, 2, canonical_corr_vec, "*")
y_cross_loadings_mat <- sweep(y_loadings_mat, 2, canonical_corr_vec, "*")

x_cross_loadings <- canonical_mat_to_df(x_cross_loadings_mat)
y_cross_loadings <- canonical_mat_to_df(y_cross_loadings_mat)


sort_by_abs <- function(df, column_name) {
  # Order rows by descending absolute value of the named column.
  magnitude <- abs(df[[column_name]])
  df[order(magnitude, decreasing = TRUE), ]
}

round_table <- function(df, digits = 3) {
  # Round every numeric column of df to `digits`; leave other columns alone.
  is_num <- vapply(df, is.numeric, logical(1))
  df[is_num] <- lapply(df[is_num], round, digits = digits)
  df
}

# Show the first four canonical dimensions of every table; loadings and
# cross-loadings are sorted by the magnitude of their first dimension.
display_cols <- c("variable", "Can1", "Can2", "Can3", "Can4")

round_table(canonical_correlations, digits = 3)
round_table(x_weights[, display_cols], digits = 3)
round_table(y_weights[, display_cols], digits = 3)
round_table(sort_by_abs(x_loadings[, display_cols], "Can1"), digits = 3)
round_table(sort_by_abs(y_loadings[, display_cols], "Can1"), digits = 3)
round_table(sort_by_abs(x_cross_loadings[, display_cols], "Can1"), digits = 3)
round_table(sort_by_abs(y_cross_loadings[, display_cols], "Can1"), digits = 3)
  • the weights show how the canonical variates are built from the original variables;
  • the loadings are often easier to interpret than the weights because they show correlation with the original variables.
  • the cross-loadings show correlation with the opposite-side canonical score; their squared values are closer in spirit to a univariate R^2 interpretation.
# Stack the Can1..Can4 columns of a wide loadings table into long format
# (variable, pair, loading) for the heatmaps below.
make_loadings_long <- function(loadings_df) {
  pieces <- lapply(paste0("Can", 1:4), function(pair_name) {
    data.frame(
      variable = loadings_df$variable,
      pair = pair_name,
      loading = loadings_df[[pair_name]]
    )
  })
  do.call(rbind, pieces)
}

cca_x_loadings_long <- make_loadings_long(x_loadings)
cca_y_loadings_long <- make_loadings_long(y_loadings)

# Shared heatmap recipe for the X-side and Y-side loadings.
plot_loading_heatmap <- function(long_df, plot_title) {
  ggplot(long_df, aes(x = pair, y = reorder(variable, loading), fill = loading)) +
    geom_tile(color = "white") +
    scale_fill_gradient2(low = "#2166AC", mid = "white", high = "#B2182B") +
    labs(
      title = plot_title,
      x = NULL,
      y = NULL,
      fill = "Loading"
    )
}

cca_x_loadings_plot <- plot_loading_heatmap(cca_x_loadings_long, "X Loadings")
cca_y_loadings_plot <- plot_loading_heatmap(cca_y_loadings_long, "Y Loadings")

patchwork::wrap_plots(cca_x_loadings_plot, cca_y_loadings_plot, ncol = 2)

Short interpretation of the first four canonical pairs:

  • Pair 1: HML and CMA on the X side; HiTec on the Y side.
    This links value/investment variation to a technology-heavy return direction.

  • Pair 2: RMW and SMB on the X side; Other, Durbl, HiTec, and Telcm on the Y side.
    This suggests a profitability/size direction tied to a broad cyclical-growth industry pattern.

  • Pair 3: SMB and RMW on the X side; Shops, NoDur, Manuf, and Durbl on the Y side.
    This looks like a size/profitability direction linked to consumer and manufacturing industries.

  • Pair 4: payroll growth, default spread, and industrial production growth on the X side; Durbl and Telcm on the Y side.
    This is more macro-financial, linking real activity and credit conditions to these industries.

Train-Test Split for CCA

CCA is fitted to maximize correlation on the observed sample. A useful check is to see whether the same directions still have high correlation on held-out data.

set.seed(32950)
# Randomly label each month as train (~70%) or test (~30%).
train_id <- sample(c("train", "test"), size = n, replace = TRUE, prob = c(0.7, 0.3))
in_train <- train_id == "train"

X_train_raw <- X_raw[in_train, , drop = FALSE]
Y_train_raw <- Y_raw[in_train, , drop = FALSE]
X_test_raw <- X_raw[!in_train, , drop = FALSE]
Y_test_raw <- Y_raw[!in_train, , drop = FALSE]

# Center both splits with means estimated on the training months only,
# so no test-set information leaks into the fitted directions.
x_train_mean <- colMeans(X_train_raw)
y_train_mean <- colMeans(Y_train_raw)
X_train <- sweep(X_train_raw, 2, x_train_mean, "-")
Y_train <- sweep(Y_train_raw, 2, y_train_mean, "-")
X_test <- sweep(X_test_raw, 2, x_train_mean, "-")
Y_test <- sweep(Y_test_raw, 2, y_train_mean, "-")

cca_train <- cancor(X_train, Y_train, xcenter = FALSE, ycenter = FALSE)

# Score BOTH splits with the training-sample canonical weights.
U_train <- X_train %*% cca_train$xcoef
V_train <- Y_train %*% cca_train$ycoef
U_test <- X_test %*% cca_train$xcoef
V_test <- Y_test %*% cca_train$ycoef

train_corr <- diag(cor(U_train, V_train))
test_corr <- diag(cor(U_test, V_test))

cca_split_tbl <- data.frame(
  pair = seq_along(train_corr),
  train_correlation = train_corr,
  test_correlation = test_corr
)

round_table(cca_split_tbl, digits = 3)

The canonical correlations are generally smaller on test data. This does not mean CCA is wrong. It shows that sample canonical correlations are optimized in-sample, so some of the strength may be reduced on new data.

Contiguous Time Split for CCA

We may also split the data into historical months and future months and check whether the canonical correlations estimated on the historical period remain stable in the future period.

# Use the first 70% of months as "history" and the remainder as "future".
split_point <- floor(0.7 * n)
hist_rows <- seq_len(split_point)
future_rows <- (split_point + 1):n

X_train_raw_time <- X_raw[hist_rows, , drop = FALSE]
Y_train_raw_time <- Y_raw[hist_rows, , drop = FALSE]
X_test_raw_time <- X_raw[future_rows, , drop = FALSE]
Y_test_raw_time <- Y_raw[future_rows, , drop = FALSE]

# As before, center both periods with training-period means only.
x_train_mean_time <- colMeans(X_train_raw_time)
y_train_mean_time <- colMeans(Y_train_raw_time)
X_train_time <- sweep(X_train_raw_time, 2, x_train_mean_time, "-")
Y_train_time <- sweep(Y_train_raw_time, 2, y_train_mean_time, "-")
X_test_time <- sweep(X_test_raw_time, 2, x_train_mean_time, "-")
Y_test_time <- sweep(Y_test_raw_time, 2, y_train_mean_time, "-")

cca_time <- cancor(X_train_time, Y_train_time, xcenter = FALSE, ycenter = FALSE)

# Project both periods onto the historical canonical directions.
U_train_time <- X_train_time %*% cca_time$xcoef
V_train_time <- Y_train_time %*% cca_time$ycoef
U_test_time <- X_test_time %*% cca_time$xcoef
V_test_time <- Y_test_time %*% cca_time$ycoef

train_corr_time <- diag(cor(U_train_time, V_train_time))
test_corr_time <- diag(cor(U_test_time, V_test_time))

cca_time_split_tbl <- data.frame(
  pair = seq_along(train_corr_time),
  train_correlation = train_corr_time,
  test_correlation = test_corr_time
)

round_table(cca_time_split_tbl, digits = 3)
# Reshape one split table into long format with labelled train/test rows.
split_to_long <- function(tbl, label_train, label_test) {
  rbind(
    data.frame(pair = tbl$pair, sample = label_train,
               correlation = tbl$train_correlation),
    data.frame(pair = tbl$pair, sample = label_test,
               correlation = tbl$test_correlation)
  )
}

cca_compare_long <- rbind(
  split_to_long(cca_split_tbl, "Random split: train", "Random split: test"),
  split_to_long(cca_time_split_tbl, "Time split: train", "Time split: test")
)

# Date ranges of the two contiguous periods, for the subtitle.
hist_dates <- analysis_tbl$date[1:split_point]
future_dates <- analysis_tbl$date[(split_point + 1):n]

ggplot(cca_compare_long, aes(x = pair, y = correlation, color = sample, linetype = sample)) +
  geom_point(size = 2.5) +
  geom_line() +
  scale_x_continuous(breaks = cca_split_tbl$pair) +
  scale_color_manual(
    values = c(
      "Random split: train" = "#2C7FB8",
      "Random split: test" = "#D95F0E",
      "Time split: train" = "#7BCCC4",
      "Time split: test" = "#F16913"
    )
  ) +
  scale_linetype_manual(
    values = c(
      "Random split: train" = "solid",
      "Random split: test" = "solid",
      "Time split: train" = "dashed",
      "Time split: test" = "dashed"
    )
  ) +
  labs(
    title = "CCA: Train and Test Canonical Correlations",
    subtitle = paste(
      "Time split:",
      format(min(hist_dates), "%Y-%m"),
      "to",
      format(max(hist_dates), "%Y-%m"),
      "vs",
      format(min(future_dates), "%Y-%m"),
      "to",
      format(max(future_dates), "%Y-%m")
    ),
    x = "Canonical pair",
    y = "Correlation",
    color = NULL,
    linetype = NULL
  )

Multivariate Regression

Now we switch from association to prediction. The question becomes:

How do we predict the full vector of industry returns from the predictor block \(X\)?

This is different from CCA. Here X is the predictor block and Y is the response block.

We need to write the multivariate regression function ourselves.

## Here the data is already assumed centered
fit_multivariate_ols <- function(X, Y) {
  # Multivariate least squares of the response matrix Y on the predictor
  # matrix X (no intercept; both blocks are assumed pre-centered).
  #
  # X: n x p numeric matrix. Y: n x q numeric matrix with matching rows.
  # Returns list(B_ols = p x q coefficient matrix, Yhat_ols = n x q fitted values).
  stopifnot(nrow(X) == nrow(Y))

  # Solve via the QR decomposition of X instead of the normal equations
  # solve(crossprod(X), crossprod(X, Y)): forming X'X squares the condition
  # number, so QR is numerically safer when predictors are nearly collinear.
  qr_x <- qr(X)
  B_ols <- qr.coef(qr_x, Y)
  Yhat_ols <- X %*% B_ols

  list(B_ols = B_ols, Yhat_ols = Yhat_ols)
}
ols_fit <- fit_multivariate_ols(X_centered, Y_centered)
B_ols <- ols_fit$B_ols
Yhat_ols <- ols_fit$Yhat_ols

# Per-response in-sample R^2; no intercept adjustment needed since Y is centered.
resid_ss <- colSums((Y_centered - Yhat_ols)^2)
total_ss <- colSums(Y_centered^2)
response_r2 <- 1 - resid_ss / total_ss

response_r2_tbl <- data.frame(
  response = colnames(Y_centered),
  r_squared = response_r2
)

# Compare OLS R^2 with squared Y-side CCA cross-loadings.
# Each squared cross-loading is the share of variance in one response
# explained by a single X-side canonical score.
cca_y_cross_sq_tbl <- data.frame(
  response = y_cross_loadings$variable,
  CCA1_cross_loadings_sq = y_cross_loadings$Can1^2
)

regression_vs_cca_tbl <- merge(response_r2_tbl, cca_y_cross_sq_tbl, by = "response")
regression_vs_cca_tbl <- regression_vs_cca_tbl[
  order(regression_vs_cca_tbl$r_squared, decreasing = TRUE),
]

round_table(regression_vs_cca_tbl, digits = 3)

Reduced-Rank Regression

OLS uses a full \(p \times q\) coefficient matrix.
RRR instead imposes \(\operatorname{rank}(B) \le r\), so the responses are predicted through only a few shared latent directions.

The key computational idea is simple:

  1. fit multivariate OLS and obtain \(\hat Y_{\text{OLS}} = X \hat B_{\text{OLS}}\);
  2. take the SVD of \(\hat Y_{\text{OLS}}\);
  3. keep only the top r fitted-value directions.

So RRR is the best rank-r approximation to the OLS fitted values, and the main practical question is how to choose r.

We need to write our own function for RRR.

fit_rrr <- function(X, Y, rank) {
  # Reduced-rank regression of Y on X (both assumed centered).
  #
  # Fits unrestricted multivariate OLS, then projects the fitted values onto
  # their top `rank` right singular directions — the best rank-`rank`
  # approximation of the OLS fit.
  #
  # X: n x p matrix. Y: n x q matrix. rank: single integer in [0, min(p, q)].
  # Returns OLS and reduced-rank coefficients/fitted values plus the singular
  # values of the OLS fitted-value matrix (all zeros when rank = 0).
  stopifnot(length(rank) == 1, rank >= 0, rank <= min(ncol(X), ncol(Y)))

  ols_fit <- fit_multivariate_ols(X, Y)
  B_ols <- ols_fit$B_ols
  Yhat_ols <- ols_fit$Yhat_ols

  if (rank == 0) {
    # The rank-0 model predicts the (zero) mean of the centered responses.
    B_rrr <- matrix(0, nrow = ncol(X), ncol = ncol(Y))
    Yhat_rrr <- matrix(0, nrow = nrow(X), ncol = ncol(Y))
    d <- rep(0, min(ncol(X), ncol(Y)))
  } else {
    svd_fit <- svd(Yhat_ols)
    V_r <- svd_fit$v[, seq_len(rank), drop = FALSE]
    # Project the OLS solution onto the top-`rank` response directions.
    B_rrr <- B_ols %*% V_r %*% t(V_r)
    Yhat_rrr <- X %*% B_rrr
    d <- svd_fit$d
  }

  list(
    rank = rank,
    B_ols = B_ols,
    Yhat_ols = Yhat_ols,
    B_rrr = B_rrr,
    Yhat_rrr = Yhat_rrr,
    fitted_singular_values = d
  )
}

Rank Selection by Cross-Validation

For RRR, rank is a tuning parameter. A natural choice is to pick the rank that gives the best out-of-sample prediction accuracy.

We treat the monthly observations as independent when forming folds, so the focus stays on the multivariate methods rather than time-series dependence.

predict_rrr_from_train <- function(X_train_raw, Y_train_raw, X_test_raw, rank) {
  # Fit RRR on the training block (centered with training means only) and
  # return test-set predictions on the original response scale.
  center_x <- colMeans(X_train_raw)
  center_y <- colMeans(Y_train_raw)

  fit <- fit_rrr(
    sweep(X_train_raw, 2, center_x, "-"),
    sweep(Y_train_raw, 2, center_y, "-"),
    rank = rank
  )

  # Center test predictors with TRAINING means, predict, then add the
  # training response means back.
  Yhat_centered <- sweep(X_test_raw, 2, center_x, "-") %*% fit$B_rrr
  sweep(Yhat_centered, 2, center_y, "+")
}

k_folds <- 10
# Random fold labels of length n, as balanced as possible across folds.
fold_id <- sample(rep(1:k_folds, length.out = n))
# Candidate ranks run from 0 (predict the mean) up to the largest usable rank.
max_rank <- min(q, qr(X_centered)$rank)
candidate_ranks <- 0:max_rank

compute_rrr_fold_error <- function(rank, fold) {
  # Mean (over held-out months) of the squared prediction error summed
  # across the q responses, for one fold at one candidate rank.
  test_idx <- which(fold_id == fold)
  train_idx <- setdiff(seq_len(n), test_idx)

  X_train <- X_raw[train_idx, , drop = FALSE]
  Y_train <- Y_raw[train_idx, , drop = FALSE]
  X_test <- X_raw[test_idx, , drop = FALSE]
  Y_test <- Y_raw[test_idx, , drop = FALSE]

  Yhat_test <- predict_rrr_from_train(X_train, Y_train, X_test, rank = rank)
  mean(rowSums((Y_test - Yhat_test)^2))
}

# One vector of k fold errors per candidate rank. vapply (rather than
# sapply) pins the return type so the result cannot silently change shape.
cv_error_list <- lapply(candidate_ranks, function(rank) {
  vapply(seq_len(k_folds),
         function(fold) compute_rrr_fold_error(rank, fold),
         numeric(1))
})

cv_results <- data.frame(
  rank = candidate_ranks,
  cv_error = vapply(cv_error_list, mean, numeric(1)),
  cv_se = vapply(cv_error_list,
                 function(errors) sd(errors) / sqrt(k_folds),
                 numeric(1))
)

# Rank with the smallest cross-validated error.
best_idx <- which.min(cv_results$cv_error)
best_rank <- cv_results$rank[best_idx]

# The largest allowable rank reproduces the unrestricted OLS fitted values.
ols_rank <- max(candidate_ranks)
ols_row <- cv_results[cv_results$rank == ols_rank, ]
ols_cv_error <- ols_row$cv_error
ols_cv_se <- ols_row$cv_se

# Side-by-side CV comparison of the selected RRR rank and full-rank OLS.
cv_method_compare_tbl <- data.frame(
  method = c(paste0("RRR (rank ", best_rank, ")"), "Multivariate regression (OLS)"),
  rank = c(best_rank, ols_rank),
  cv_error = c(cv_results$cv_error[best_idx], ols_cv_error),
  cv_se = c(cv_results$cv_se[best_idx], ols_cv_se)
)

cv_results
# CV error curve across candidate ranks, with +/- 1 SE bars; the dashed
# line marks the CV-selected rank and the green point marks full-rank OLS.
ggplot(cv_results, aes(x = rank, y = cv_error)) +
  geom_point(size = 2.5, color = "#2C7FB8") +
  geom_line(color = "#2C7FB8") +
  geom_errorbar(aes(ymin = cv_error - cv_se, ymax = cv_error + cv_se), width = 0.15) +
  geom_vline(xintercept = best_rank, linetype = "dashed", color = "#D95F0E") +
  geom_point(
    data = subset(cv_results, rank == ols_rank),
    color = "#238B45",
    size = 3.2
  ) +
  scale_x_continuous(breaks = cv_results$rank) +
  labs(
    title = "Cross-Validated Rank Selection for RRR",
    subtitle = glue::glue("Selected rank = {best_rank}; highest rank corresponds to OLS"),
    x = "Candidate rank",
    y = "Mean held-out prediction error"
  )

CV suggests a rank of 3.

round_table(cv_method_compare_tbl, digits = 3)

# Refit RRR at the CV-selected rank on the full centered sample.
rrr_best_fit <- fit_rrr(X_centered, Y_centered, best_rank)
Yhat_rrr_best <- rrr_best_fit$Yhat_rrr

# Per-response in-sample R^2 for the reduced-rank fit.
rrr_response_r2 <- 1 - colSums((Y_centered - Yhat_rrr_best)^2) / colSums(Y_centered^2)

# OLS vs RRR in-sample fit, strongest OLS responses first.
ols_vs_rrr_r2_tbl <- data.frame(
  response = colnames(Y_centered),
  ols_r_squared = response_r2,
  rrr_r_squared = rrr_response_r2
)
ols_vs_rrr_r2_tbl <- ols_vs_rrr_r2_tbl[
  order(ols_vs_rrr_r2_tbl$ols_r_squared, decreasing = TRUE),
]

round_table(ols_vs_rrr_r2_tbl, digits = 3)

The in-sample R^2 from the RRR is only slightly smaller than from OLS, while the cross-validation results above show comparable or better out-of-sample error. The rank constraint therefore sacrifices very little fit while reducing the number of effective parameters, which is what helps generalization.

Brief Summary

This dataset is useful because it gives a clear story: