library(dsem)
# Convert arbitrary stock labels into safe column-name fragments.
safe_names <- function(x) {
x %>%
gsub("[^A-Za-z0-9]+", "_", ., perl = TRUE) %>%
gsub("^_|_$", "", ., perl = TRUE)
}
# Return vertically centered node positions for DAG plots.
centered_positions <- function(n) {
if (n <= 1) {
0
} else {
seq(from = (n - 1) / 2, to = -(n - 1) / 2, length.out = n)
}
}
# Build a static or dynamic DAG showing ABC-to-TAC links for the selected stocks.
plot_dsem_dag <- function(dep_species, indep_species, dynamic = FALSE) {
dep_nodes <- paste0("TAC_", safe_names(dep_species))
indep_nodes <- paste0("ABC_", safe_names(indep_species))
static_formulas <- lapply(dep_nodes, function(dep_node) {
reformulate(indep_nodes, response = dep_node)
})
x_coords <- setNames(rep(0, length(indep_nodes)), indep_nodes)
y_coords <- setNames(centered_positions(length(indep_nodes)), indep_nodes)
if (dynamic) {
lag_nodes <- paste0(dep_nodes, "_lag")
dynamic_formulas <- Map(function(dep_node, lag_node) {
reformulate(c(indep_nodes, lag_node), response = dep_node)
}, dep_nodes, lag_nodes)
labels <- c(
setNames(paste0("ABC ", indep_species, " (t)"), indep_nodes),
setNames(paste0("TAC ", dep_species, " (t-1)"), lag_nodes),
setNames(paste0("TAC ", dep_species, " (t)"), dep_nodes)
)
x_coords <- c(
x_coords,
setNames(rep(1, length(lag_nodes)), lag_nodes),
setNames(rep(2, length(dep_nodes)), dep_nodes)
)
dep_y <- centered_positions(length(dep_nodes))
y_coords <- c(
y_coords,
setNames(dep_y, lag_nodes),
setNames(dep_y, dep_nodes)
)
dag_obj <- do.call(
ggdag::dagify,
c(dynamic_formulas, list(labels = labels, coords = list(x = x_coords, y = y_coords)))
)
} else {
labels <- c(
setNames(paste("ABC:", indep_species), indep_nodes),
setNames(paste("TAC:", dep_species), dep_nodes)
)
x_coords <- c(x_coords, setNames(rep(2, length(dep_nodes)), dep_nodes))
y_coords <- c(y_coords, setNames(centered_positions(length(dep_nodes)), dep_nodes))
dag_obj <- do.call(
ggdag::dagify,
c(static_formulas, list(labels = labels, coords = list(x = x_coords, y = y_coords)))
)
}
dag_tidy <- ggdag::tidy_dagitty(dag_obj)
ggplot(dag_tidy, aes(x = x, y = y, xend = xend, yend = yend)) +
ggdag::geom_dag_edges() +
ggdag::geom_dag_node() +
ggdag::geom_dag_label_repel(aes(label = label), color = "black", size = 3) +
ggdag::theme_dag()
}
# dsem has changed allowed values for gmrf_parameterization across versions.
# Choose a value that works for the installed package (GitHub Actions may differ
# from local).
# Find a `gmrf_parameterization` value accepted by the installed dsem version.
choose_gmrf_param <- function() {
candidates <- c(
"full", "project", "gmrf_project", "mvn_project", # seen in newer versions
"separable", "projection" # seen in older versions
)
for (x in candidates) {
ok <- tryCatch({
dsem::dsem_control(gmrf_parameterization = x, quiet = TRUE)
TRUE
}, error = function(e) FALSE)
if (ok) return(x)
}
stop("Could not find a supported gmrf_parameterization value for dsem")
}
# Apply stored centering and scaling constants to new data.
scale_with <- function(x, center, scale) {
sweep(sweep(x, 2, center, "-"), 2, scale, "/")
}
# Return the last finite value in a numeric vector.
last_finite <- function(x) {
x <- x[is.finite(x)]
if (length(x) == 0) NA_real_ else tail(x, 1)
}
# Fit a regional DSEM, compute leave-one-out diagnostics, and generate an
# illustrative forward TAC forecast from observed or supplied ABC scenarios.
fit_dsem_region <- function(
data,
dep_species,
indep_species,
n_future = 5,
seed = 1,
future_abc_raw = NULL,
future_years = NULL,
compute_loo = TRUE
) {
dsem_df <- data %>%
filter(Species %in% unique(c(dep_species, indep_species))) %>%
group_by(Year, Species) %>%
summarise(
TAC = sum(TAC, na.rm = TRUE),
ABC = sum(ABC, na.rm = TRUE),
.groups = "drop"
) %>%
pivot_wider(
names_from = Species,
values_from = c(TAC, ABC),
names_sep = "_"
) %>%
arrange(Year)
names(dsem_df) <- safe_names(names(dsem_df))
dep_safe <- safe_names(dep_species)
indep_safe <- safe_names(indep_species)
y_cols <- paste0("TAC_", dep_safe)
x_cols <- paste0("ABC_", indep_safe)
ts_raw <- dsem_df[, c(y_cols, x_cols)]
ts_scaled <- scale(ts_raw)
ts_center <- attr(ts_scaled, "scaled:center")
ts_scale <- attr(ts_scaled, "scaled:scale")
ts_center[is.na(ts_center)] <- 0
ts_scale[is.na(ts_scale) | ts_scale == 0] <- 1
tsdata <- ts(ts_scaled, start = min(dsem_df$Year))
ar_lines <- paste0(y_cols, " -> ", y_cols, ", 1, ar_", dep_safe)
abc_lines <- unlist(lapply(y_cols, function(y) {
paste0(x_cols, " -> ", y, ", 0, b_", gsub("ABC_", "", x_cols), "_to_", gsub("TAC_", "", y))
}))
sem <- paste(c(ar_lines, abc_lines), collapse = "\n")
gmrf_param <- choose_gmrf_param()
# Try one specific DSEM configuration so callers can loop over fallbacks.
fit_dsem_once <- function(gmrf, family_opt = NULL) {
args <- list(
sem = sem,
tsdata = tsdata,
estimate_delta0 = TRUE,
control = dsem_control(
quiet = TRUE,
gmrf_parameterization = gmrf,
newton_loops = 0,
extra_convergence_checks = FALSE,
getsd = TRUE
)
)
if (!is.null(family_opt)) args$family <- family_opt
do.call(dsem::dsem, args)
}
gmrf_candidates <- unique(c(
gmrf_param,
"full", "project", "gmrf_project", "mvn_project", "separable", "projection"
))
fit <- NULL
fit_notes <- character()
for (g in gmrf_candidates) {
for (fam in list(NULL, "gaussian")) {
fam_tag <- if (is.null(fam)) "default" else fam
res <- tryCatch(
fit_dsem_once(g, fam),
error = function(e) e
)
if (!inherits(res, "error")) {
fit <- res
fit_notes <- c(fit_notes, paste0("fit succeeded with gmrf=", g, ", family=", fam_tag))
break
} else {
fit_notes <- c(
fit_notes,
paste0("fit failed with gmrf=", g, ", family=", fam_tag, ": ", conditionMessage(res))
)
}
}
if (!is.null(fit)) break
}
if (is.null(fit)) {
stop(
paste(
"DSEM fit failed for all gmrf/family combinations.",
paste(fit_notes, collapse = " | ")
)
)
}
dsem_summary <- summary(fit)
if (isTRUE(compute_loo)) {
loo_tbl <- dsem::loo_residuals(fit, what = "loo", track_progress = FALSE) %>%
as_tibble() %>%
rename(
Year = 1,
Series = 2,
Observed_scaled = obs,
Predicted_scaled = est,
SE_scaled = se
) %>%
mutate(
Year = as.numeric(Year),
Series = as.character(Series)
)
} else {
loo_tbl <- tibble(
Year = numeric(),
Series = character(),
Observed_scaled = numeric(),
Predicted_scaled = numeric(),
SE_scaled = numeric()
)
}
if (is.null(future_abc_raw)) {
set.seed(seed)
last_abc <- vapply(dsem_df[x_cols], last_finite, numeric(1))
growth <- setNames(rep(-0.005, length(x_cols)), x_cols)
dep_abc_cols <- intersect(paste0("ABC_", dep_safe), x_cols)
if (length(dep_abc_cols) >= 1) growth[dep_abc_cols[1]] <- 0.02
if (length(dep_abc_cols) >= 2) growth[dep_abc_cols[2]] <- 0.01
noise_sd <- 0.03
new_abc_raw <- sapply(seq_len(n_future), function(h) {
eps <- rnorm(length(x_cols), mean = 0, sd = noise_sd)
pmax(last_abc * (1 + growth) ^ h * (1 + eps), 1)
})
new_abc_raw <- t(new_abc_raw)
colnames(new_abc_raw) <- x_cols
forecast_years <- max(dsem_df$Year) + seq_len(n_future)
} else {
new_abc_raw <- future_abc_raw %>%
as.data.frame() %>%
select(all_of(x_cols))
n_future <- nrow(new_abc_raw)
forecast_years <- if (is.null(future_years)) {
max(dsem_df$Year) + seq_len(n_future)
} else {
future_years
}
}
new_abc_scaled <- scale_with(new_abc_raw, ts_center[x_cols], ts_scale[x_cols])
newdata <- as.data.frame(tsdata)
newdata_future <- as.data.frame(new_abc_scaled)
newdata_future[, y_cols] <- NA_real_
newdata <- bind_rows(newdata, newdata_future)
newdata <- newdata[, colnames(as.data.frame(tsdata))]
newdata_ts <- ts(as.matrix(newdata), start = start(tsdata), frequency = frequency(tsdata))
pred_scaled <- predict(fit, newdata = newdata_ts, type = "response")
colnames(pred_scaled) <- colnames(newdata_ts)
pred <- sweep(pred_scaled, 2, ts_scale[colnames(pred_scaled)], "*")
pred <- sweep(pred, 2, ts_center[colnames(pred)], "+")
pred_tac_future <- tail(pred[, y_cols], n_future)
forecast_tbl <- as_tibble(new_abc_raw) %>%
mutate(Year = forecast_years) %>%
bind_cols(as_tibble(pred_tac_future))
list(
fit = fit,
summary = dsem_summary,
dsem_df = dsem_df,
dep_species = dep_species,
y_cols = y_cols,
x_cols = x_cols,
pred = pred,
ts_center = ts_center,
ts_scale = ts_scale,
loo_tbl = loo_tbl,
n_future = n_future,
forecast_years = forecast_years,
forecast_tbl = forecast_tbl,
fit_notes = fit_notes
)
}
# Refit the DSEM sequentially through time and score one-step-ahead forecasts.
run_dsem_retrospective <- function(data, dep_species, indep_species, start_year = 2015, seed = 1) {
full_df <- data %>%
filter(Species %in% unique(c(dep_species, indep_species))) %>%
group_by(Year, Species) %>%
summarise(
TAC = sum(TAC, na.rm = TRUE),
ABC = sum(ABC, na.rm = TRUE),
.groups = "drop"
) %>%
pivot_wider(
names_from = Species,
values_from = c(TAC, ABC),
names_sep = "_"
) %>%
arrange(Year)
names(full_df) <- safe_names(names(full_df))
dep_safe <- safe_names(dep_species)
indep_safe <- safe_names(indep_species)
y_cols <- paste0("TAC_", dep_safe)
x_cols <- paste0("ABC_", indep_safe)
target_years <- full_df %>%
filter(Year >= start_year) %>%
pull(Year)
purrr::map_dfr(seq_along(target_years), function(i) {
target_year <- target_years[[i]]
target_row <- full_df %>%
filter(Year == target_year)
if (nrow(target_row) != 1 || any(!is.finite(unlist(target_row[, x_cols, drop = FALSE])))) {
return(tibble(
Year = target_year,
Training_end = target_year - 1,
Species = dep_species,
Observed = as.numeric(target_row[, y_cols, drop = FALSE]),
Predicted = NA_real_,
Error = NA_real_,
Abs_Error = NA_real_,
APE = NA_real_,
Status = "missing_future_abc"
))
}
fit_i <- tryCatch(
fit_dsem_region(
data = data %>% filter(Year < target_year),
dep_species = dep_species,
indep_species = indep_species,
n_future = 1,
seed = seed + i,
future_abc_raw = target_row[, x_cols, drop = FALSE],
future_years = target_year,
compute_loo = FALSE
),
error = function(e) e
)
if (inherits(fit_i, "error")) {
return(tibble(
Year = target_year,
Training_end = target_year - 1,
Species = dep_species,
Observed = as.numeric(target_row[, y_cols, drop = FALSE]),
Predicted = NA_real_,
Error = NA_real_,
Abs_Error = NA_real_,
APE = NA_real_,
Status = paste0("fit_failed: ", conditionMessage(fit_i))
))
}
predicted <- fit_i$forecast_tbl %>%
select(all_of(y_cols))
tibble(
Year = target_year,
Training_end = target_year - 1,
Species = dep_species,
Observed = as.numeric(target_row[, y_cols, drop = FALSE]),
Predicted = as.numeric(predicted[1, , drop = FALSE]),
Status = "ok"
) %>%
mutate(
Error = Observed - Predicted,
Abs_Error = abs(Error),
APE = if_else(Observed != 0, 100 * Abs_Error / abs(Observed), NA_real_)
)
})
}
# Re-label raw DSEM coefficient output into management-facing term groups.
summarize_dsem_results <- function(dsem_summary) {
dsem_summary %>%
mutate(
Term = if_else(lag > 0, paste0(path, " (lag ", lag, ")"), path),
Group = case_when(
lag > 0 ~ "Autoregressive (lagged TAC)",
str_detect(first, "^ABC_") ~ "ABC -> TAC",
TRUE ~ "Other"
)
) %>%
arrange(Group, Term) %>%
select(Group, Term, Estimate, Std_Error, z_value, p_value)
}
# Render the future ABC inputs and forecast TAC outputs as a gt table.
render_dsem_forecast_table <- function(region_fit) {
region_fit$forecast_tbl %>%
select(Year, starts_with("ABC_"), starts_with("TAC_")) %>%
gt::gt() %>%
gt::fmt_number(columns = starts_with("ABC_"), decimals = 0) %>%
gt::fmt_number(columns = starts_with("TAC_"), decimals = 0)
}
# Render the grouped DSEM coefficient summary as a gt table.
render_dsem_summary_table <- function(region_fit) {
summarize_dsem_results(region_fit$summary) %>%
gt::gt(rowname_col = "Term", groupname_col = "Group") %>%
gt::cols_label(
Estimate = "Estimate",
Std_Error = "Std. Error",
z_value = "z",
p_value = "p"
) %>%
gt::fmt_number(columns = c(Estimate, Std_Error), decimals = 3) %>%
gt::fmt_number(columns = z_value, decimals = 2) %>%
gt::fmt_number(columns = p_value, decimals = 3)
}
# Back-transform leave-one-out residuals into TAC units for one fitted region.
extract_dsem_loo_residuals <- function(region_fit, region_name) {
region_fit$loo_tbl %>%
filter(Series %in% region_fit$y_cols) %>%
mutate(
Region = region_name,
Species = region_fit$dep_species[match(Series, region_fit$y_cols)],
Observed = Observed_scaled * region_fit$ts_scale[Series] + region_fit$ts_center[Series],
Predicted = Predicted_scaled * region_fit$ts_scale[Series] + region_fit$ts_center[Series],
SE = SE_scaled * region_fit$ts_scale[Series],
Residual = Observed - Predicted,
Std_Residual = if_else(SE_scaled > 0, (Observed_scaled - Predicted_scaled) / SE_scaled, NA_real_)
) %>%
select(Region, Year, Species, Observed, Predicted, SE, Residual, Std_Residual)
}
# Bind leave-one-out residual diagnostics across all fitted regions.
collect_dsem_loo_residuals <- function(region_fits) {
purrr::imap_dfr(region_fits, extract_dsem_loo_residuals)
}
# Summarize leave-one-out residual performance metrics by region and species.
render_dsem_residual_comparison_table <- function(region_fits) {
collect_dsem_loo_residuals(region_fits) %>%
group_by(Region, Species) %>%
summarise(
Bias = mean(Residual, na.rm = TRUE),
MAE = mean(abs(Residual), na.rm = TRUE),
RMSE = sqrt(mean(Residual^2, na.rm = TRUE)),
Mean_abs_z = mean(abs(Std_Residual), na.rm = TRUE),
Max_abs_z = max(abs(Std_Residual), na.rm = TRUE),
.groups = "drop"
) %>%
arrange(Region, Species) %>%
gt::gt(rowname_col = "Species", groupname_col = "Region") %>%
gt::cols_label(
Bias = "Bias",
MAE = "MAE",
RMSE = "RMSE",
Mean_abs_z = "Mean |z|",
Max_abs_z = "Max |z|"
) %>%
gt::fmt_number(columns = c(Bias, MAE, RMSE), decimals = 0) %>%
gt::fmt_number(columns = c(Mean_abs_z, Max_abs_z), decimals = 2)
}
# Plot standardized leave-one-out residuals through time by species and region.
plot_dsem_loo_residuals <- function(region_fits) {
plot_df <- collect_dsem_loo_residuals(region_fits)
ggplot(plot_df, aes(x = Year, y = Std_Residual)) +
geom_hline(yintercept = 0, color = "grey60", linewidth = 0.4) +
geom_hline(yintercept = c(-2, 2), color = "firebrick", linetype = "dashed", linewidth = 0.4) +
geom_line(color = "steelblue", linewidth = 0.8) +
geom_point(color = "steelblue", size = 1.5, alpha = 0.85) +
facet_grid(Species ~ Region) +
labs(x = NULL, y = "Standardized leave-one-out residual")
}
# Combine retrospective forecast results from each region into one data frame.
collect_dsem_retrospective <- function(retro_fits) {
purrr::imap_dfr(retro_fits, ~ mutate(.x, Region = .y), .id = NULL)
}
# Summarize one-step-ahead retrospective forecast accuracy by region and species.
render_dsem_retrospective_summary_table <- function(retro_fits) {
collect_dsem_retrospective(retro_fits) %>%
filter(Status == "ok") %>%
group_by(Region, Species) %>%
summarise(
n_years = n(),
Bias = mean(Error, na.rm = TRUE),
MAE = mean(Abs_Error, na.rm = TRUE),
RMSE = sqrt(mean(Error^2, na.rm = TRUE)),
MAPE = mean(APE, na.rm = TRUE),
.groups = "drop"
) %>%
arrange(Region, Species) %>%
gt::gt(rowname_col = "Species", groupname_col = "Region") %>%
gt::cols_label(
n_years = "Years",
Bias = "Bias",
MAE = "MAE",
RMSE = "RMSE",
MAPE = "MAPE (%)"
) %>%
gt::fmt_number(columns = n_years, decimals = 0) %>%
gt::fmt_number(columns = c(Bias, MAE, RMSE), decimals = 0) %>%
gt::fmt_number(columns = MAPE, decimals = 2)
}
# Plot observed versus retrospective one-step-ahead TAC predictions.
plot_dsem_retrospective <- function(retro_fits) {
plot_df <- collect_dsem_retrospective(retro_fits) %>%
filter(Status == "ok") %>%
pivot_longer(c(Observed, Predicted), names_to = "Series", values_to = "TAC")
ggplot(plot_df, aes(x = Year, y = TAC, color = Series)) +
geom_line(linewidth = 0.9) +
geom_point(size = 1.5, alpha = 0.85) +
facet_grid(Species ~ Region, scales = "free_y") +
scale_color_manual(values = c(Observed = "grey40", Predicted = "steelblue")) +
scale_y_continuous(labels = comma) +
labs(x = NULL, y = "TAC", color = NULL)
}
# Plot in-sample fitted TAC values plus the forward forecast extension.
plot_dsem_predictions <- function(region_fit) {
pred_years <- c(region_fit$dsem_df$Year, region_fit$forecast_years)
pred_df <- as_tibble(region_fit$pred[, region_fit$y_cols]) %>%
mutate(Year = pred_years) %>%
pivot_longer(-Year, names_to = "Species", values_to = "Predicted")
obs_df <- region_fit$dsem_df %>%
select(Year, all_of(region_fit$y_cols)) %>%
pivot_longer(-Year, names_to = "Species", values_to = "Observed")
plot_df <- left_join(pred_df, obs_df, by = c("Year", "Species"))
ggplot(plot_df, aes(x = Year)) +
geom_line(aes(y = Predicted, color = "Predicted"), linewidth = 1) +
geom_point(aes(y = Observed, color = "Observed"), size = 1.4, alpha = 0.8) +
facet_grid(Species ~ ., scales = "free_y") +
scale_color_manual(values = c(Predicted = "steelblue", Observed = "grey40")) +
scale_y_continuous(labels = comma) +
labs(x = NULL, y = "TAC", color = NULL)
}
bsai_dsem_dep <- c("Pollock", "Pacific cod")
bsai_dsem_indep <- mainspp
bsai_dsem <- fit_dsem_region(
data = dfOY %>% filter(lag == 1, Species %in% mainspp),
dep_species = bsai_dsem_dep,
indep_species = bsai_dsem_indep
)
goa_dsem_dep <- c("Pollock", "Pacific cod")
goa_dsem_indep <- goaraw %>%
filter(lag == 1, OY == 1, Area == "Total") %>%
group_by(Species) %>%
summarise(total_TAC = sum(TAC, na.rm = TRUE), .groups = "drop") %>%
filter(is.finite(total_TAC), total_TAC > 0) %>%
arrange(desc(total_TAC)) %>%
slice_head(n = 7) %>%
pull(Species)
goa_dsem <- fit_dsem_region(
data = goaOY %>% filter(lag == 1, Species %in% goa_dsem_indep),
dep_species = goa_dsem_dep,
indep_species = goa_dsem_indep
)
bsai_dsem_retro <- run_dsem_retrospective(
data = dfOY %>% filter(lag == 1, Species %in% mainspp),
dep_species = bsai_dsem_dep,
indep_species = bsai_dsem_indep,
start_year = 2015
)
goa_dsem_retro <- run_dsem_retrospective(
data = goaOY %>% filter(lag == 1, Species %in% goa_dsem_indep),
dep_species = goa_dsem_dep,
indep_species = goa_dsem_indep,
start_year = 2015
)