initial commit

This commit is contained in:
2026-02-10 04:52:37 -05:00
commit 0476f6f8f8
65 changed files with 15368 additions and 0 deletions

5
.Rbuildignore Normal file
View File

@@ -0,0 +1,5 @@
^renv$
^renv\.lock$
^LICENSE\.md$
^.*\.Rproj$
^\.Rproj\.user$

40
.Rprofile Normal file
View File

@@ -0,0 +1,40 @@
# Project startup profile.
# Activates renv for this project, then defines short aliases for the
# targets pipeline and for renv housekeeping.
source("renv/activate.R")

# Attach targets for interactive work only, and only when it is installed,
# so a session started on an empty library (before renv::restore()) does not
# abort the rest of this profile.
if (interactive() && requireNamespace("targets", quietly = TRUE)) {
  suppressMessages(library(targets))
}

# --- Targets Aliases ---
# Thin wrappers: `targets::` is resolved when an alias is called, not while
# the profile is being sourced. Arguments are forwarded untouched via `...`.
tm <- function(...) targets::tar_make(...)
ti <- function(...) targets::tar_invalidate(...)
tr <- function(...) targets::tar_read(...)
to <- function(...) targets::tar_outdated(...)

# Specific Macros
tmr <- function() targets::tar_make(report)
tir <- function() targets::tar_invalidate(report)

# --- renv Aliases ---
rs <- function(...) renv::status(...) # check health
ri <- function(...) renv::install(...) # install packages
rsp <- function(...) renv::snapshot(...) # save library state (snapshot)
rr <- function(...) renv::restore(...) # revert to lockfile

# --- The "Cheat Sheet" Startup Message ---
# Shown only to a human at the console; scripted sessions stay quiet.
if (interactive()) {
  message(
    "\n---------------------------------------------",
    "\n SHORTCUTS LOADED",
    "\n---------------------------------------------",
    "\n [Targets]",
    "\n tm = tar_make()",
    "\n ti = tar_invalidate()",
    "\n tr = tar_read()",
    "\n to = tar_outdated()",
    "\n tmr = tar_make(report)",
    "\n tir = tar_invalidate(report)",
    "\n",
    "\n [renv]",
    "\n rs = renv::status()",
    "\n ri = renv::install()",
    "\n rsp = renv::snapshot()",
    "\n rr = renv::restore()",
    "\n---------------------------------------------\n"
  )
}

7
.dockerignore Normal file
View File

@@ -0,0 +1,7 @@
# Version control and IDE state
.git
.gitignore
.Rproj.user
# Pipeline store and rendered output
_targets/
_site/
*.html
# macOS Finder metadata (root-level pattern kept, plus any depth)
*.DS_Store
**/.DS_Store
# Host renv library/staging: the image restores its own library under
# /renv/library, so the host copies would only add bulk to `COPY . .`
renv/library/
renv/staging/

26
.gitignore vendored Normal file
View File

@@ -0,0 +1,26 @@
# --- Data and Pipeline ---
data/
_targets/
_targets.user
# --- R Environment ---
.Rproj.user/
.Rhistory
.RData
.Renviron
.Ruserdata
# Keep the lockfile, ignore the library
renv/library/
renv/staging/
renv/python/
# --- Quarto and Output ---
# Since you are hosting via Caddy/Rsync,
# stop tracking these in Git to avoid bloat.
.quarto/
*_cache/
*_files/
index.html
# --- System ---
.DS_Store

30
DESCRIPTION Normal file
View File

@@ -0,0 +1,30 @@
Package: forestedAnalysis
Title: Spatial Cross-Validation and AOA Analysis of Forest Cover
Version: 0.0.0.9000
Authors@R:
person("Rob", "Wiederstein", , "khuon68@gmail.com", role = c("aut", "cre"))
Description: A research compendium analyzing forest cover data in Washington and Georgia.
It evaluates the Area of Applicability (AOA) and demonstrates model failure
during spatial extrapolation.
License: MIT + file LICENSE
Encoding: UTF-8
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.3.3
Depends:
tidymodels,
tidyverse
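Imports:
    colorspace,
    cowplot,
    dplyr,
    forested,
    ggplot2,
    ggrepel,
    gt,
    knitr,
    patchwork,
    psych,
    quarto,
    ranger,
    rmapshaper,
    rmarkdown,
    scales,
    sf,
    showtext,
    spdep,
    stats,
    stringr,
    sysfonts,
    targets,
    terra,
    tibble,
    tidyterra,
    tigris,
    utils,
    vip,
    waywiser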

52
Dockerfile Normal file
View File

@@ -0,0 +1,52 @@
# Development image for the forest-cover analysis: rocker/tidyverse plus the
# system libraries and editors listed below, with the renv library restored
# from renv.lock at build time.
FROM rocker/tidyverse:4.4.0
# Editors/CLI tools, build tooling, and development headers (fonts/text
# shaping, image formats, GDAL/PROJ/GEOS/udunits) in a single layer; the apt
# lists are removed at the end of the same layer.
RUN apt-get update && apt-get install -y \
nano \
neovim \
git \
bash-completion \
openssh-client \
cmake \
libglpk-dev \
libcurl4-openssl-dev \
libssl-dev \
libxml2-dev \
libfontconfig1-dev \
libfreetype6-dev \
libharfbuzz-dev \
libfribidi-dev \
libpng-dev \
libjpeg-dev \
libtiff-dev \
libwebp-dev \
gdal-bin \
libgdal-dev \
libproj-dev \
libgeos-dev \
libudunits2-dev \
&& rm -rf /var/lib/apt/lists/*
# --- renv configuration ---
# 1. Keep the renv library outside the project folder.
#    NOTE(review): presumably so a volume mounted over the project directory
#    cannot hide the restored packages - confirm against the run/compose setup.
ENV RENV_PATHS_LIBRARY=/renv/library
# 2. Tell renv not to symlink packages from its cache into the library.
#    NOTE(review): described by the author as a fix for a root-permission
#    problem - confirm it is still required.
ENV RENV_CONFIG_CACHE_SYMLINKS=FALSE
# World-writable (mode 777) library directory.
RUN mkdir -p /renv/library && chmod 777 /renv/library
WORKDIR /home/rstudio/project
# Copy only the files renv needs before the restore step, ahead of the full
# project copy further down.
COPY renv.lock renv.lock
COPY .Rprofile .Rprofile
COPY renv/activate.R renv/activate.R
COPY renv/settings.json renv/settings.json
# Install renv and restore the lockfile, with CRAN pointed at the Posit
# Package Manager "__linux__/jammy" repository URL.
RUN R -e "options(repos = c(CRAN = 'https://packagemanager.posit.co/cran/__linux__/jammy/latest')); install.packages('renv'); renv::restore()"
COPY . .
# Open up everything under /renv (mode 777, recursively), including the
# packages installed by the restore step.
RUN chmod -R 777 /renv
CMD ["R"]

2
LICENSE Normal file
View File

@@ -0,0 +1,2 @@
YEAR: 2026
COPYRIGHT HOLDER: Rob Wiederstein

21
LICENSE.md Normal file
View File

@@ -0,0 +1,21 @@
# MIT License
Copyright (c) 2026 forested authors
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

135
NAMESPACE Normal file
View File

@@ -0,0 +1,135 @@
# Generated by roxygen2: do not edit by hand
export(calculate_ga_aoa)
export(combine_forest)
export(create_stats_summary)
export(get_epa_ecoregions)
export(helper_save_fig)
export(plot_ecoregion_comparison)
export(plot_failure_mechanism)
export(plot_ga_comparison_map)
export(plot_georgia_aoa)
export(plot_precip_hex_comparison)
export(plot_regional_comparison)
export(plot_rf_importance)
export(plot_spatial_exploration)
export(plot_state_topo)
export(plot_theme_diagnostic)
export(plot_us_map)
export(process_ecoregions)
export(save_combined_topo)
export(save_error_map_png)
export(save_outlier_map_png)
export(setup_forestry_fonts)
export(style_audit_table)
export(theme_forestry_plot)
export(theme_forestry_spatial)
export(theme_forestry_void)
importFrom(colorspace,scale_color_discrete_qualitative)
importFrom(colorspace,scale_fill_discrete_qualitative)
importFrom(cowplot,background_grid)
importFrom(cowplot,theme_cowplot)
importFrom(dplyr,"%>%")
importFrom(dplyr,all_of)
importFrom(dplyr,any_of)
importFrom(dplyr,arrange)
importFrom(dplyr,bind_cols)
importFrom(dplyr,case_when)
importFrom(dplyr,desc)
importFrom(dplyr,filter)
importFrom(dplyr,group_by)
importFrom(dplyr,if_any)
importFrom(dplyr,if_else)
importFrom(dplyr,inner_join)
importFrom(dplyr,mutate)
importFrom(dplyr,rename)
importFrom(dplyr,row_number)
importFrom(dplyr,select)
importFrom(dplyr,summarize)
importFrom(dplyr,where)
importFrom(ggplot2,aes)
importFrom(ggplot2,after_stat)
importFrom(ggplot2,coord_sf)
importFrom(ggplot2,element_blank)
importFrom(ggplot2,element_line)
importFrom(ggplot2,element_rect)
importFrom(ggplot2,element_text)
importFrom(ggplot2,geom_hline)
importFrom(ggplot2,geom_point)
importFrom(ggplot2,geom_segment)
importFrom(ggplot2,geom_sf)
importFrom(ggplot2,geom_smooth)
importFrom(ggplot2,geom_vline)
importFrom(ggplot2,ggplot)
importFrom(ggplot2,ggsave)
importFrom(ggplot2,guide_colorbar)
importFrom(ggplot2,guide_legend)
importFrom(ggplot2,guide_none)
importFrom(ggplot2,guides)
importFrom(ggplot2,labs)
importFrom(ggplot2,margin)
importFrom(ggplot2,rel)
importFrom(ggplot2,scale_alpha)
importFrom(ggplot2,scale_color_manual)
importFrom(ggplot2,scale_color_viridis_c)
importFrom(ggplot2,scale_fill_discrete)
importFrom(ggplot2,scale_fill_distiller)
importFrom(ggplot2,scale_fill_manual)
importFrom(ggplot2,scale_shape_manual)
importFrom(ggplot2,scale_x_continuous)
importFrom(ggplot2,scale_y_continuous)
importFrom(ggplot2,theme)
importFrom(ggplot2,theme_minimal)
importFrom(ggplot2,theme_void)
importFrom(ggplot2,unit)
importFrom(ggrepel,geom_label_repel)
importFrom(gt,fmt_number)
importFrom(gt,gt)
importFrom(gt,opt_row_striping)
importFrom(gt,px)
importFrom(gt,tab_header)
importFrom(gt,tab_options)
importFrom(patchwork,plot_annotation)
importFrom(patchwork,plot_layout)
importFrom(patchwork,wrap_plots)
importFrom(psych,describe)
importFrom(ranger,ranger)
importFrom(rmapshaper,ms_filter_islands)
importFrom(rmapshaper,ms_simplify)
importFrom(scales,squish)
importFrom(sf,read_sf)
importFrom(sf,sf_use_s2)
importFrom(sf,st_as_sf)
importFrom(sf,st_coordinates)
importFrom(sf,st_crs)
importFrom(sf,st_drop_geometry)
importFrom(sf,st_filter)
importFrom(sf,st_intersection)
importFrom(sf,st_join)
importFrom(sf,st_make_grid)
importFrom(sf,st_make_valid)
importFrom(sf,st_point_on_surface)
importFrom(sf,st_read)
importFrom(sf,st_transform)
importFrom(sf,st_union)
importFrom(showtext,showtext_auto)
importFrom(spdep,card)
importFrom(spdep,dnearneigh)
importFrom(spdep,lag.listw)
importFrom(spdep,nb2listw)
importFrom(stats,predict)
importFrom(stringr,str_wrap)
importFrom(sysfonts,font_add_google)
importFrom(terra,rast)
importFrom(terra,shade)
importFrom(terra,terrain)
importFrom(tibble,rownames_to_column)
importFrom(tibble,tibble)
importFrom(tidyterra,geom_spatraster)
importFrom(tidyterra,scale_fill_hypso_c)
importFrom(tigris,shift_geometry)
importFrom(tigris,states)
importFrom(utils,download.file)
importFrom(utils,unzip)
importFrom(vip,vi)
importFrom(waywiser,ww_area_of_applicability)

View File

@@ -0,0 +1,6 @@
#' @keywords internal
"_PACKAGE"
## usethis namespace: start
## usethis namespace: end
NULL

2288
R/functions.R Normal file

File diff suppressed because it is too large Load Diff

5
README.md Normal file
View File

@@ -0,0 +1,5 @@
# Optimism Bias in Ecological Modeling
## Project Overview
This project explores the hazards of ignoring **spatial autocorrelation** in ecological modeling. Using the `forested` package and forest structure data from Washington State, this Quarto presentation demonstrates how standard random cross-validation yields overly optimistic performance estimates by allowing models to "cheat" via nearby neighbors. The analysis utilizes the `spatialsample` package to visualize and compare three distinct validation strategies—**Random** (the baseline), **Spatial Blocking** (geographic separation), and **Environmental Clustering** (ecological separation)—to establish robust, geographically transferable model performance metrics.

481
_targets.R Normal file
View File

@@ -0,0 +1,481 @@
library(targets)
library(tarchetypes)

# 1. Options ----
# Packages made available to every target. The order is the same as in the
# original list.
pipeline_packages <- c(
  "colorspace", "elevatr", "forested", "ggcorrplot", "ggrepel", "ggspatial",
  "gt", "magrittr", "patchwork", "processx", "psych", "quarto", "ranger",
  "rmapshaper", "sf", "showtext", "spatialsample", "stringr", "terra",
  "tidyterra", "tidymodels", "tidyverse", "tigris",
  "xgboost", # For XGBoost
  "earth", "withr"
)

# Target return values are stored as RDS unless a target sets its own format.
tar_option_set(packages = pipeline_packages, format = "rds")

# 2. Functions ----
tar_source("R/functions.R")
# 3. The Pipeline ----
list(
# Constants ----
# Number of folds; passed as `v` to all three resampling targets below.
tar_target(n_folds, 10),
# Data Ingestion ----
# Washington and Georgia datasets shipped with the forested package.
tar_target(forested_wa, forested::forested_wa),
tar_target(forested_ga, forested::forested_ga),
# Washington rows as sf points built from lon/lat with CRS 4326;
# remove = FALSE keeps the lon and lat columns alongside the geometry.
tar_target(
wa_sf,
forested_wa %>%
sf::st_as_sf(coords = c("lon", "lat"), crs = 4326, remove = FALSE)
),
# Remote ecoregion archive (us_eco_l3.zip), declared as a "url" target.
tar_target(
name = eco_url,
command = "https://dmap-prod-oms-edc.s3.us-east-1.amazonaws.com/ORD/Ecoregions/us/us_eco_l3.zip",
format = "url"
),
# Destination directory for the EPA ecoregion download.
# A "file" target has to return a path that exists, and data/ is git-ignored,
# so the directory is created here instead of being assumed to exist.
# dir.create() with showWarnings = FALSE does nothing if it is already there.
# NOTE(review): because the whole directory is tracked, files written into it
# by downstream targets may mark this target as outdated on the next run -
# confirm this is intended.
tar_target(
  name = data_dir,
  command = {
    dir.create("data/epa", recursive = TRUE, showWarnings = FALSE)
    "data/epa"
  },
  format = "file" # Tracks the directory
),
# Download data ----
# Calls get_epa_ecoregions() with the URL and destination directory targets;
# the returned path is tracked as a file.
# NOTE(review): helper functions in this pipeline presumably come from
# R/functions.R (sourced above) - that file is not visible here.
tar_target(
name = eco_zip_file,
command = get_epa_ecoregions(url = eco_url, dest_dir = data_dir),
format = "file"
),
# Data Processing ----
# Combined WA + GA data, and one boundary object per state.
tar_target(forested_us, combine_forest(wa_data = forested_wa, ga_data = forested_ga)),
tar_target(boundary_wa_sf, fetch_state_boundary(state = "Washington")),
tar_target(boundary_ga_sf, fetch_state_boundary(state = "Georgia")),
# Ecoregions for the two study states, processed from the downloaded zip
# with a simplification tolerance of 0.05.
tar_target(
name = eco_data,
command = process_ecoregions(
zip_path = eco_zip_file,
target_states = c("Washington", "Georgia"),
simplify_tol = 0.05
)
),
# Raster File Targets ----
# One elevation GeoTIFF per state under data/, tracked as files.
tar_target(wa_elev_file,
create_elevation_raster(boundary_wa_sf, "data/wa_elevation.tif"),
format = "file"),
tar_target(ga_elev_file,
create_elevation_raster(boundary_ga_sf, "data/ga_elevation.tif"),
format = "file"),
# Maps ----
# Pattern used below: one target builds the plot object, and a companion
# `*_file` target hands it to helper_save_fig() and tracks the result with
# format = "file".
# NOTE(review): helper_save_fig() presumably returns the path of the image it
# writes (e.g. figs/map_<name>.png) - confirm in R/functions.R.
tar_target(fig_us_map, plot_us_map()),
tar_target(
name = fig_us_map_file,
command = helper_save_fig(
plot_obj = fig_us_map,
name = "us_forests",
width = 10,
height = 5.25,
type = "map"
),
format = "file"
),
# Study-area geometry for both states, shared by the regional and
# precipitation maps.
tar_target(wa_ga_map, fetch_study_area(c("Washington", "Georgia"))),
tar_target(map_wa_ga_regional, plot_regional_comparison(forested_us, wa_ga_map)),
tar_target(
name = fig_wa_ga_regional_file,
command = helper_save_fig(
plot_obj = map_wa_ga_regional,
name = "wa_ga_forests",
width = 9.2,
height = 4.25,
type = "map"
),
format = "file"
),
# Ecoregion comparison plot and its saved file.
tar_target(
name = ecoregion_plot,
command = plot_ecoregion_comparison(eco_data)
),
tar_target(
name = ecoregion_plot_file,
command = helper_save_fig(
plot_obj = ecoregion_plot,
name = "wa_ga_ecoregions",
width = 10,
height = 4.25,
type = "map"
),
format = "file" # Tells targets to watch the actual .png file
),
# Combined topographic figure built from both states' data, boundaries and
# elevation rasters; written to figs/combined_topo.png and tracked as a file.
tar_target(
name = combined_topo_map,
command = save_combined_topo(
wa_data = forested_wa,
ga_data = forested_ga,
wa_boundary = boundary_wa_sf,
ga_boundary = boundary_ga_sf,
wa_raster_path = wa_elev_file,
ga_raster_path = ga_elev_file,
output_path = "figs/combined_topo.png"
),
format = "file"
),
# Precipitation hex-bin comparison (bins = 50) over the study area.
tar_target(
map_precip_hex,
plot_precip_hex_comparison(
wa_data = forested_wa,
ga_data = forested_ga,
boundaries = wa_ga_map,
bins = 50
)
),
# Cross-validation illustrations ----
tar_target(plot_cv_comparison, plot_cv_strategies(forested_wa)),
# fold mechanics (uses the sf points and the Washington boundary)
tar_target(
fig_fold_mechanics,
plot_fold_mechanics(wa_sf, boundary_wa_sf)
),
# fold diagram
tar_target(fig_classic_cv, plot_classic_kfold_diagram()),
# Exploratory Analysis (Washington data) ----
tar_target(tbl_forest_wa, format_summary_table(forested_wa)),
tar_target(plot_distrib_wa, plot_forest_distributions(forested_wa)),
tar_target(plt_outliers, identify_outliers(forested_wa)),
# Outlier map written to figs/wa_outliers.png and tracked as a file.
tar_target(map_wa_outliers,
save_outlier_map_png(forested_wa, boundary_wa_sf, wa_elev_file, "figs/wa_outliers.png"),
format = "file"),
tar_target(plt_wa_pca, plot_wa_pca(forested_wa)),
# spatial exploration plot
tar_target(
name = p_moran_exploration,
command = plot_spatial_exploration(forested_wa)
),
# correlogram
tar_target(plt_correlogram, plot_correlations(forested_wa)),
# vip plot
tar_target(plt_vip, plot_rf_importance(forested_wa)),
# umap plot
tar_target(umap_plot, plot_umap_forested(forested_wa)),
# 1. Data Splitting -------------------------------------------------
# Define the split (80% Train, 20% Test), stratified on the outcome
tar_target(splits, initial_split(wa_sf, prop = 0.80, strata = forested)),
# Extract the Training Set (Used for Resampling/Modeling)
tar_target(train_data, training(splits)),
# Extract the Test Set (Locked away until the very end)
tar_target(test_data, testing(splits)),
# 2. Recipes ----
# All three recipes end with the same four steps: step_novel, step_dummy,
# step_zv, step_normalize.
## A: Base (Includes Lat/Lon) ----
# Only geometry is given the "id" role, so lat and lon stay predictors.
tar_target(
recipe_base,
recipe(forested ~ ., data = train_data) %>%
update_role(geometry, new_role = "id") %>%
step_novel(all_nominal_predictors()) %>%
step_dummy(all_nominal_predictors()) %>%
step_zv(all_predictors()) %>%
step_normalize(all_numeric_predictors())
),
## B: Non-Spatial (Bio Only) ----
# geometry, lat and lon are all given the "id" role.
tar_target(
recipe_non_spatial,
recipe(forested ~ ., data = train_data) %>%
update_role(geometry, lat, lon, new_role = "id") %>%
step_novel(all_nominal_predictors()) %>%
step_dummy(all_nominal_predictors()) %>%
step_zv(all_predictors()) %>%
step_normalize(all_numeric_predictors())
),
## C: Extensible (Feature Engineered) ----
# Same id roles as B; additionally removes northness, county and year, adds a
# precip_annual / temp_annual_max ratio and two range features, and applies a
# Yeo-Johnson transform to elevation before the shared closing steps.
tar_target(
recipe_extensible,
recipe(forested ~ ., data = train_data) %>%
update_role(geometry, lat, lon, new_role = "id") %>%
step_rm(northness, county, year) %>%
step_ratio(precip_annual, denom = denom_vars(temp_annual_max)) %>%
step_mutate(
temp_range = temp_annual_max - temp_annual_min,
vpd_range = vapor_max - vapor_min
) %>%
step_YeoJohnson(elevation) %>%
step_novel(all_nominal_predictors()) %>%
step_dummy(all_nominal_predictors()) %>%
step_zv(all_predictors()) %>%
step_normalize(all_numeric_predictors())
),
# Yeo-Johnson illustration plot built from the Washington data.
tar_target(
plot_yeo,
plot_yeo_johnson(forested_wa)
),
# 3. Engines ----
# Four classification model specifications; hyperparameters are fixed values,
# not tuning placeholders.
## Logistic Regression ----
tar_target(
spec_logistic,
logistic_reg() %>%
set_engine("glm") %>%
set_mode("classification")
),
## MARS ----
tar_target(
spec_mars,
mars(num_terms = 10, prod_degree = 2) %>%
set_engine("earth", nfold = 1) %>% # nfold = 1 is meant to skip earth's internal CV (speed) - TODO confirm against earth docs
set_mode("classification")
),
## Random Forest ----
tar_target(
spec_rf,
rand_forest(trees = 1000, min_n = 10) %>%
set_engine("ranger",
importance = "impurity", # Calculate variable importance
num.threads = 1) %>% # single thread per fit (author's "server safety lock")
set_mode("classification")
),
## XGBoost ----
tar_target(
spec_xgb,
boost_tree(trees = 1000, tree_depth = 6, learn_rate = 0.01) %>%
set_engine("xgboost",
nthread = 1) %>% # single thread per fit (author's "server safety lock")
set_mode("classification")
),
# 4. The Workflow Set ----
# Crosses every recipe with every model (3 recipes x 4 models = 12 workflows)
tar_target(
model_set,
workflow_set(
preproc = list(base = recipe_base,
non_spatial = recipe_non_spatial,
extensible = recipe_extensible),
models = list(
log = spec_logistic,
rf = spec_rf,
xgb = spec_xgb,
mars = spec_mars
),
cross = TRUE
)
),
# 5. Resampling Strategies -----
# All three strategies resample train_data into n_folds folds.
## A. Random Folds ----
# Stratified on the outcome.
tar_target(
folds_random,
vfold_cv(train_data, v = n_folds, strata = forested)
),
## B. Spatial Blocks ----
tar_target(
folds_block,
spatial_block_cv(train_data, v = n_folds)
),
## C. Spatial Clustering ----
tar_target(
folds_cluster,
spatial_clustering_cv(train_data, v = n_folds)
),
# 6. Fit Models -----
# Each branch runs fit_resamples over the whole workflow set with the same
# metric set (roc_auc, accuracy, pr_auc); only the resamples differ.
## Branch 1: Random CV ----
tar_target(
results_random,
workflow_map(
model_set,
"fit_resamples",
resamples = folds_random,
metrics = metric_set(roc_auc, accuracy, pr_auc),
verbose = TRUE
)
),
## Branch 2: Block CV ----
tar_target(
results_block,
workflow_map(
model_set,
"fit_resamples",
resamples = folds_block,
metrics = metric_set(roc_auc, accuracy, pr_auc),
verbose = TRUE
)
),
## Branch 3: Cluster CV ----
tar_target(
results_cluster,
workflow_map(
model_set,
"fit_resamples",
resamples = folds_cluster,
metrics = metric_set(roc_auc, accuracy, pr_auc),
verbose = TRUE
)
),
# 7. Results ----
# Comparison of the three resampling strategies.
tar_target(
fig_cv_comparison,
plot_spatial_cv_comparison(results_random, results_block, results_cluster)
),
# Uses best_model_id, which is declared further down; targets orders the
# build from the dependency graph, not from position in this list.
tar_target(
fig_model_stability,
plot_model_stability(results_random, results_block, results_cluster, best_model_id)
),
# 8. Select the Best Model ----
# Workflow id in the first row of the cluster-CV results ranked by roc_auc.
tar_target(
best_model_id,
results_cluster %>%
rank_results(rank_metric = "roc_auc", select_best = TRUE) %>%
slice(1) %>%
pull(wflow_id)
),
# Cluster-CV ranking table, keeping only the roc_auc rows.
tar_target(
tbl_model_performance,
results_cluster %>%
rank_results(rank_metric = "roc_auc", select_best = TRUE) %>%
filter(.metric == "roc_auc")
),
# 9. Final Fit ----
# last_fit() on the selected workflow with the initial split; metrics are
# roc_auc and accuracy.
tar_target(
final_fit_results,
last_fit(
extract_workflow(model_set, best_model_id),
split = splits, # the 80/20 initial split
metrics = metric_set(roc_auc, accuracy)
)
),
# 10. Test Set Performance Plot ----
tar_target(
fig_final_performance,
plot_final_test_results(final_fit_results) # Use the specific plotting function
),
tar_target(
tbl_performance,
create_performance_table(results_cluster, final_fit_results)
),
# 11. Confusion Matrix ----
tar_target(
fig_confusion_matrix,
plot_final_confusion_matrix(final_fit_results)
),
# Test-set predictions with lat/lon attached column-wise.
# NOTE(review): bind_cols() matches rows by position, so this assumes
# collect_predictions() returns rows in the same order as testing(splits) -
# confirm.
tar_target(
test_predictions,
collect_predictions(final_fit_results) %>%
dplyr::bind_cols(
rsample::testing(splits) %>%
dplyr::select(lat, lon)
)
),
# Error map written to figs/wa_errors.png and tracked as a file.
tar_target(
map_wa_errors,
save_error_map_png(
data = test_predictions, # predictions joined with lat/lon (target above)
boundary_sf = boundary_wa_sf,
raster_path = wa_elev_file,
output_path = "figs/wa_errors.png"
),
format = "file"
),
# Georgia ----
# Predictor columns passed to the area-of-applicability calculation.
tar_target(
model_predictors,
c("elevation", "precip_annual", "temp_annual_mean", "roughness")
),
# AOA computed with Washington as the training data and Georgia as the
# test data.
tar_target(
ga_aoa_data,
calculate_ga_aoa(
train_data = forested_wa,
test_data = forested_ga,
predictors = model_predictors
)
),
tar_target(
plot_aoa_ga,
plot_georgia_aoa(
aoa_sf = ga_aoa_data
)
),
# Predict on Georgia using the Washington model
tar_target(
ga_predictions,
predict_external_region(
final_fit = final_fit_results,
new_data = forested_ga
)
),
# Map the predictions
tar_target(
map_ga_probs,
plot_ga_comparison_map(
pred_data = ga_predictions,
boundaries = boundary_ga_sf # Georgia state boundary target
)
),
# Confusion matrix for Georgia
tar_target(
ga_conf_mat,
plot_ga_confusion_matrix(ga_predictions)
),
# Failure-mechanism map built from the AOA data and the Georgia predictions
tar_target(
map_failure_mechanism,
plot_failure_mechanism(
aoa_data = ga_aoa_data, # same AOA target that plot_aoa_ga reads
pred_data = ga_predictions,
boundaries = boundary_ga_sf
)
),
# Report ----
# Renders index.qmd with Quarto and tracks the resulting index.html.
# NOTE(review): the command references neither upstream targets nor index.qmd
# as symbols, so targets may not rerun it when they change;
# tarchetypes::tar_quarto() is the usual way to declare those - confirm.
tar_target(
  name = report,
  command = {
    # 1. Temporarily disable renv auto-loader so Quarto uses system libs
    if (file.exists(".Rprofile")) file.rename(".Rprofile", "hold_Rprofile")
    # 2. Render. `finally` puts .Rprofile back on success, on error, and on
    #    interrupt; an error from the render still propagates to targets.
    tryCatch(
      quarto::quarto_render("index.qmd", quiet = FALSE),
      finally = {
        if (file.exists("hold_Rprofile")) {
          file.rename("hold_Rprofile", ".Rprofile")
        }
      }
    )
    # 3. Path of the rendered file, tracked via format = "file"
    "index.html"
  },
  format = "file"
)
)

BIN
assets/ecoregions_map.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 550 KiB

43
assets/fonts.css Normal file
View File

@@ -0,0 +1,43 @@
/* assets/fonts.css */
/* Local font faces. Weights use the numeric equivalents of the keywords:
   400 = normal, 700 = bold. */

/* Atkinson Hyperlegible Next: regular, bold, italic, bold italic */
@font-face {
  font-family: 'Atkinson Hyperlegible Next';
  font-style: normal;
  font-weight: 400;
  src: url('fonts/AtkinsonHyperlegibleNext-Regular.ttf') format('truetype');
}
@font-face {
  font-family: 'Atkinson Hyperlegible Next';
  font-style: normal;
  font-weight: 700;
  src: url('fonts/AtkinsonHyperlegibleNext-Bold.ttf') format('truetype');
}
@font-face {
  font-family: 'Atkinson Hyperlegible Next';
  font-style: italic;
  font-weight: 400;
  src: url('fonts/AtkinsonHyperlegibleNext-Italic.ttf') format('truetype');
}
@font-face {
  font-family: 'Atkinson Hyperlegible Next';
  font-style: italic;
  font-weight: 700;
  src: url('fonts/AtkinsonHyperlegibleNext-BoldItalic.ttf') format('truetype');
}

/* Atkinson Hyperlegible Mono: regular, bold */
@font-face {
  font-family: 'Atkinson Hyperlegible Mono';
  font-style: normal;
  font-weight: 400;
  src: url('fonts/AtkinsonHyperlegibleMono-Regular.ttf') format('truetype');
}
@font-face {
  font-family: 'Atkinson Hyperlegible Mono';
  font-style: normal;
  font-weight: 700;
  src: url('fonts/AtkinsonHyperlegibleMono-Bold.ttf') format('truetype');
}

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 286 KiB

31
custom.scss Normal file
View File

@@ -0,0 +1,31 @@
/* custom.scss */
/* Quarto theme file. Variables go under the scss:defaults region and CSS
   rules under the scss:rules region, so nothing is left outside a region. */

/*-- scss:defaults --*/
/* Use the names exactly as defined in fonts.css */
$font-family-sans-serif: "Atkinson Hyperlegible Next", sans-serif !default;
$font-family-monospace: "Atkinson Hyperlegible Mono", monospace !default;
$presentation-heading-font: "Atkinson Hyperlegible Next", sans-serif !default;
/* Other tweaks */
$presentation-font-size-root: 40px;

/*-- scss:rules --*/
/* Title slide heading size */
#title-slide h1 {
  font-size: 1.4em !important;
  line-height: 1.2 !important;
}
/* Fallback with a more specific selector; where both rules match, this one
   takes precedence for font-size */
.reveal .slides section#title-slide h1 {
  font-size: 1.2em !important;
}
.reveal .slides section.title-slide {
  background-color: #ffffff !important; /* Force white background */
}
.reveal .slides section.title-slide h1 {
  color: #000000 !important;
}
.reveal .slides section.title-slide p,
.reveal .slides section.title-slide .quarto-title-author-name {
  color: #333333 !important;
}

BIN
figs/combined_topo.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.4 MiB

BIN
figs/map_us_forests.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 359 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 430 KiB

BIN
figs/map_wa_ga_forests.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 2.9 MiB

BIN
figs/wa_errors.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.0 MiB

BIN
figs/wa_outliers.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 980 KiB

18
forested.Rproj Normal file
View File

@@ -0,0 +1,18 @@
Version: 1.0
RestoreWorkspace: Default
SaveWorkspace: Default
AlwaysSaveHistory: Default
EnableCodeIndexing: Yes
UseSpacesForTab: Yes
NumSpacesForTab: 2
Encoding: UTF-8
RnwWeave: Sweave
LaTeX: pdfLaTeX
BuildType: Package
PackageUseDevtools: Yes
PackageInstallArgs: --no-multiarch --with-keep.source
PackageRoxygenize: rd,collate,namespace

17
ieee-access.csl Normal file
View File

@@ -0,0 +1,17 @@
<?xml version="1.0" encoding="utf-8"?>
<style xmlns="http://purl.org/net/xbiblio/csl" version="1.0" default-locale="en-US">
<!-- Generated with https://github.com/citation-style-language/utilities/tree/master/generate_dependent_styles/data/ieee -->
<info>
<title>IEEE Access</title>
<id>http://www.zotero.org/styles/ieee-access</id>
<link href="http://www.zotero.org/styles/ieee-access" rel="self"/>
<link href="http://www.zotero.org/styles/ieee" rel="independent-parent"/>
<link href="http://ieeexplore.ieee.org/servlet/opac?punumber=6287639" rel="documentation"/>
<category citation-format="numeric"/>
<category field="engineering"/>
<category field="communications"/>
<issn>2169-3536</issn>
<updated>2014-05-15T02:20:32+00:00</updated>
<rights license="http://creativecommons.org/licenses/by-sa/3.0/">This work is licensed under a Creative Commons Attribution-ShareAlike 3.0 License</rights>
</info>
</style>

172
images/resampling.svg Normal file
View File

@@ -0,0 +1,172 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
<svg xmlns:dc="http://purl.org/dc/elements/1.1/" version="1.1" xmlns:xl="http://www.w3.org/1999/xlink" xmlns="http://www.w3.org/2000/svg" viewBox="-535.5 -701.5 869.75 527.75" width="869.75" height="527.75">
<defs>
<filter id="Shadow" filterUnits="userSpaceOnUse" x="-535.5" y="-701.5">
<feGaussianBlur in="SourceAlpha" result="blur" stdDeviation="1.308"/>
<feOffset in="blur" result="offset" dx="0" dy="2"/>
<feFlood flood-color="black" flood-opacity=".5" result="flood"/>
<feComposite in="flood" in2="offset" operator="in" result="color"/>
<feMerge>
<feMergeNode in="color"/>
<feMergeNode in="SourceGraphic"/>
</feMerge>
</filter>
<font-face font-family="Helvetica Neue" font-size="16" panose-1="2 0 5 3 0 0 0 2 0 4" units-per-em="1000" underline-position="-100" underline-thickness="50" slope="0" x-height="517" cap-height="714" ascent="951.9958" descent="-212.99744" font-weight="400">
<font-face-src>
<font-face-name name="HelveticaNeue"/>
</font-face-src>
</font-face>
<marker orient="auto" overflow="visible" markerUnits="strokeWidth" id="FilledArrow_Marker" stroke-linejoin="miter" stroke-miterlimit="10" viewBox="-1 -4 10 8" markerWidth="10" markerHeight="8" color="black">
<g>
<path d="M 8 0 L 0 -3 L 0 3 Z" fill="currentColor" stroke="currentColor" stroke-width="1"/>
</g>
</marker>
<font-face font-family="Helvetica Neue" font-size="16" panose-1="2 0 5 3 0 0 0 9 0 4" units-per-em="1000" underline-position="-100" underline-thickness="50" slope="-750" x-height="517" cap-height="714" ascent="957.0007" descent="-212.99744" font-style="italic" font-weight="400">
<font-face-src>
<font-face-name name="HelveticaNeue-Italic"/>
</font-face-src>
</font-face>
</defs>
<metadata> Produced by OmniGraffle 7.13.1
<dc:date>2020-03-15 00:14:09 +0000</dc:date>
</metadata>
<g id="Canvas_1" stroke="none" stroke-opacity="1" fill-opacity="1" stroke-dasharray="none" fill="none">
<title>Canvas 1</title>
<g id="Canvas_1: Layer 1">
<title>Layer 1</title>
<g id="Graphic_724" filter="url(#Shadow)">
<ellipse cx="-43.5" cy="-641.25" rx="57.7500922788345" ry="58.7500938767363" fill="white"/>
<ellipse cx="-43.5" cy="-641.25" rx="57.7500922788345" ry="58.7500938767363" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
<text transform="translate(-84.7 -650.474)" fill="black">
<tspan font-family="Helvetica Neue" font-size="16" font-weight="400" fill="black" x="13.496" y="15">All Data</tspan>
</text>
</g>
<g id="Graphic_723" filter="url(#Shadow)">
<path d="M -107.25 -529.5 L -48.99782 -488.9047 L -71.24811 -423.2203 L -143.2519 -423.2203 L -165.50218 -488.9047 Z" fill="#ffeabb"/>
<path d="M -107.25 -529.5 L -48.99782 -488.9047 L -71.24811 -423.2203 L -143.2519 -423.2203 L -165.50218 -488.9047 Z" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
<text transform="translate(-151.25 -474.099)" fill="black">
<tspan font-family="Helvetica Neue" font-size="16" font-weight="400" fill="black" x="16.144" y="15">Training</tspan>
</text>
</g>
<g id="Graphic_722" filter="url(#Shadow)">
<path d="M 207.25 -529.5 L 265.50218 -488.9047 L 243.2519 -423.2203 L 171.2481 -423.2203 L 148.99782 -488.9047 Z" fill="#e5e6ff"/>
<path d="M 207.25 -529.5 L 265.50218 -488.9047 L 243.2519 -423.2203 L 171.2481 -423.2203 L 148.99782 -488.9047 Z" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
<text transform="translate(163.25 -474.099)" fill="black">
<tspan font-family="Helvetica Neue" font-size="16" font-weight="400" fill="black" x="18.664" y="15">Testing</tspan>
</text>
</g>
<g id="Line_721">
<line x1="-64.64317" y1="-586.56304" x2="-87.49556" y2="-527.45516" marker-end="url(#FilledArrow_Marker)" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
</g>
<g id="Line_720">
<line x1="5.009894" y1="-609.35054" x2="159.74542" y2="-507.5985" marker-end="url(#FilledArrow_Marker)" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
</g>
<g id="Graphic_719" filter="url(#Shadow)">
<ellipse cx="-335.25" cy="-214.75" rx="61.2500978714911" ry="35.5000567255173" fill="#e5e6ff"/>
<ellipse cx="-335.25" cy="-214.75" rx="61.2500978714911" ry="35.5000567255173" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
<text transform="translate(-379.25 -223.974)" fill="black">
<tspan font-family="Helvetica Neue" font-size="16" font-weight="400" fill="black" x=".432" y="15">Assessment</tspan>
</text>
</g>
<g id="Graphic_718" filter="url(#Shadow)">
<ellipse cx="-468" cy="-214.75" rx="64.0001022657214" ry="35.5000567255173" fill="#ffeabb"/>
<ellipse cx="-468" cy="-214.75" rx="64.0001022657214" ry="35.5000567255173" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
<text transform="translate(-514.2 -223.974)" fill="black">
<tspan font-family="Helvetica Neue" font-size="16" font-weight="400" fill="black" x="16.72" y="15">Analysis</tspan>
</text>
</g>
<g id="Graphic_717" filter="url(#Shadow)">
<rect x="-469.75" y="-359" width="139" height="56.5" fill="white"/>
<rect x="-469.75" y="-359" width="139" height="56.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
<text transform="translate(-464.75 -339.974)" fill="black">
<tspan font-family="Helvetica Neue" font-size="16" font-weight="400" fill="black" x="22.116" y="15">Resample 1</tspan>
</text>
</g>
<g id="Line_716">
<line x1="-153.47164" y1="-453.3897" x2="-334.53915" y2="-363.40586" marker-end="url(#FilledArrow_Marker)" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
</g>
<g id="Line_715">
<line x1="-416.74946" y1="-302.5" x2="-443.27834" y2="-257.07786" marker-end="url(#FilledArrow_Marker)" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
</g>
<g id="Line_714">
<line x1="-384.42026" y1="-302.5" x2="-359.01294" y2="-257.1577" marker-end="url(#FilledArrow_Marker)" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
</g>
<g id="Graphic_713" filter="url(#Shadow)">
<ellipse cx="-37.25" cy="-214.75" rx="64.0001022657214" ry="35.5000567255173" fill="#e5e6ff"/>
<ellipse cx="-37.25" cy="-214.75" rx="64.0001022657214" ry="35.5000567255173" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
<text transform="translate(-83.45 -223.974)" fill="black">
<tspan font-family="Helvetica Neue" font-size="16" font-weight="400" fill="black" x="2.6320008" y="15">Assessment</tspan>
</text>
</g>
<g id="Graphic_712" filter="url(#Shadow)">
<ellipse cx="-172.75" cy="-214.75" rx="64.0001022657214" ry="35.5000567255173" fill="#ffeabb"/>
<ellipse cx="-172.75" cy="-214.75" rx="64.0001022657214" ry="35.5000567255173" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
<text transform="translate(-218.95 -223.974)" fill="black">
<tspan font-family="Helvetica Neue" font-size="16" font-weight="400" fill="black" x="16.72" y="15">Analysis</tspan>
</text>
</g>
<g id="Graphic_711" filter="url(#Shadow)">
<rect x="-174.5" y="-359" width="139" height="56.5" fill="white"/>
<rect x="-174.5" y="-359" width="139" height="56.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
<text transform="translate(-169.5 -339.974)" fill="black">
<tspan font-family="Helvetica Neue" font-size="16" font-weight="400" fill="black" x="22.116" y="15">Resample 2</tspan>
</text>
</g>
<g id="Line_710">
<line x1="-106.42887" y1="-423.2203" x2="-105.58948" y2="-368.8988" marker-end="url(#FilledArrow_Marker)" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
</g>
<g id="Line_709">
<line x1="-121.49946" y1="-302.5" x2="-148.02834" y2="-257.07786" marker-end="url(#FilledArrow_Marker)" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
</g>
<g id="Line_708">
<line x1="-88.50054" y1="-302.5" x2="-61.97166" y2="-257.07786" marker-end="url(#FilledArrow_Marker)" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
</g>
<g id="Graphic_707" filter="url(#Shadow)">
<ellipse cx="266.75" cy="-214.75" rx="64.0001022657214" ry="35.5000567255173" fill="#e5e6ff"/>
<ellipse cx="266.75" cy="-214.75" rx="64.0001022657214" ry="35.5000567255173" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
<text transform="translate(220.55 -223.974)" fill="black">
<tspan font-family="Helvetica Neue" font-size="16" font-weight="400" fill="black" x="2.6320008" y="15">Assessment</tspan>
</text>
</g>
<g id="Graphic_706" filter="url(#Shadow)">
<ellipse cx="131.25" cy="-214.75" rx="64.0001022657214" ry="35.5000567255173" fill="#ffeabb"/>
<ellipse cx="131.25" cy="-214.75" rx="64.0001022657214" ry="35.5000567255173" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
<text transform="translate(85.05 -223.974)" fill="black">
<tspan font-family="Helvetica Neue" font-size="16" font-weight="400" fill="black" x="16.72" y="15">Analysis</tspan>
</text>
</g>
<g id="Graphic_705" filter="url(#Shadow)">
<rect x="129.5" y="-359" width="139" height="56.5" fill="white"/>
<rect x="129.5" y="-359" width="139" height="56.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
<text transform="translate(134.5 -339.974)" fill="black">
<tspan font-family="Helvetica Neue" font-size="16" font-weight="400" fill="black" x="21.084" y="15">Resample </tspan>
<tspan font-family="Helvetica Neue" font-size="16" font-style="italic" font-weight="400" fill="black" y="15">B</tspan>
</text>
</g>
<g id="Line_704">
<line x1="-60.738406" y1="-454.24567" x2="130.64323" y2="-363.25103" marker-end="url(#FilledArrow_Marker)" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
</g>
<g id="Line_703">
<line x1="182.50054" y1="-302.5" x2="155.97166" y2="-257.07786" marker-end="url(#FilledArrow_Marker)" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
</g>
<g id="Line_702">
<line x1="215.49946" y1="-302.5" x2="242.02834" y2="-257.07786" marker-end="url(#FilledArrow_Marker)" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
</g>
<g id="Group_698">
<g id="Graphic_701">
<ellipse cx="24.25" cy="-330.75" rx="4.75000759003401" ry="4.00000639160761" fill="black"/>
<ellipse cx="24.25" cy="-330.75" rx="4.75000759003401" ry="4.00000639160761" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
</g>
<g id="Graphic_700">
<ellipse cx="42.75" cy="-330.75" rx="4.75000759003401" ry="4.00000639160761" fill="black"/>
<ellipse cx="42.75" cy="-330.75" rx="4.75000759003401" ry="4.00000639160761" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
</g>
<g id="Graphic_699">
<ellipse cx="61.25" cy="-330.75" rx="4.75000759003401" ry="4.00000639160761" fill="black"/>
<ellipse cx="61.25" cy="-330.75" rx="4.75000759003401" ry="4.00000639160761" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
</g>
</g>
</g>
</g>
</svg>

After

Width:  |  Height:  |  Size: 12 KiB

919
index.qmd Normal file
View File

@@ -0,0 +1,919 @@
---
title: "From Mt. Olympus to the Okefenokee"
subtitle: "A Case Study in Spatial Modeling"
author: "Rob Wiederstein"
lang: en-US
smart: false
format:
revealjs:
from: markdown-smart
theme: [default, custom.scss]
css: assets/fonts.css
embed-resources: true
title-slide-attributes:
data-background-image: assets/study_sites_globe.png
data-background-size: 150%
data-background-position: center
style: "color: #222222;"
transition: fade
slide-number: true
scrollable: true
chalkboard: false
tbl-cap-location: bottom
toc: true
toc-depth: 1
toc-title: "Order"
fig-dpi: 300
fig-width: 10
fig-asp: 0.5
fig-align: center
resources:
- assets/fonts
execute:
echo: false
cache: false
bibliography: references.bib
csl: ieee-access.csl
nocite: |
@*
---
```{r}
#| label: setup
#| include: false
# Attach the packages that the chunks in this document call.
# here: here::here() builds the figure paths passed to include_graphics().
library(here)
# targets: tar_read() retrieves stored target values for display.
library(targets)
# NOTE(review): gt and ggplot2 are not called directly in this document;
# presumably attached so objects returned by tar_read() print correctly — confirm.
library(gt)
library(ggplot2)
# NOTE(review): presumably required by setup_forestry_fonts() — confirm.
library(showtext)
# Load the project's helper functions. The relative path is resolved against
# the working directory at render time.
source("R/functions.R")
# NOTE(review): presumably defined in R/functions.R and registers the fonts
# used by the slides — confirm against that file.
setup_forestry_fonts()
```
# Introduction
## The Core Problem
::: {.incremental}
1. **Stationarity:** Do rules hold constant across different places?
2. **Spatial Leakage:** When location is added to a model, does it make it more accurate?
3. **Extrapolation:** How does a model perform when trained with location data and applied in a new location?
:::
::: {.notes}
* **Goal:** Test if "location" features trick the model into high accuracy that fails elsewhere.
:::
## The `forested` Package
:::: {.columns}
::: {.column width="25%"}
![](https://github.com/simonpcouch/forested/blob/main/inst/logo.png?raw=true){fig-align="center" width="80%"}
:::
::: {.column width="75%"}
- The `forested` data are from people who looked at a place to see if it was a forest.
- They work for the Forest Inventory and Analysis (FIA) program, part of the USDA.
- It would be cheaper if a forest could be predicted from weather data and land characteristics.
- Forests are in GA and WA.
:::
::::
::: {.notes}
**Speaker Notes:**
- The `forested` package is our primary data source, containing the raw measurements for both the Washington and Georgia "islands".
- We are auditing this package's features to see how well they predict the 'forested' outcome in two geographically distant locations.
:::
## The First Law of Geography
<br>
<br>
>"Everything is related to everything else, but near things are more related than distant things."[@tobler_computer_1970]
::: {.notes}
- This is the "First Law of Geography" and explains why our Random Forest "cheats" using Lat/Lon.
- Proximity Bias (Spatial Autocorrelation) creates high local accuracy but zero portability.
- We are testing for **Stationarity**: Do the biophysical rules of Washington still work in Georgia?
:::
## Caveat
<br/>
<br/>
>"It is not and has never been the case that Tobler's first law of geography . . . always holds absolutely. This is and has always been an oversimplification, disguising possible underlying entitation, support, and other misspecification problems."[@pebesma_spatial_2025]
::: {.notes}
:::
## Forest Locations
```{r}
#| label: fig-us-map-forest-locations
#| fig-cap: "Map shows the geographic distance separating Washington and Georgia."
knitr::include_graphics(here::here("figs", "map_us_forests.png"))
```
:::{.notes}
- Washington is approximately 15 degrees north of Georgia and 30 degrees west.
- The sheer distance suggests that the respective forests are different.
:::
## Regional Forestation
```{r}
#| label: fig-map-wa-ga
#| fig-cap: "Washington (a) and Georgia (b) showing forested areas. Note that the states are rescaled independently to maximize clarity."
#tar_read(map_wa_ga)
knitr::include_graphics(here::here("figs", "map_wa_ga_forests.png"))
```
## Regional Topography
```{r}
#| label: fig-topo-compare
#| echo: false
#| fig-align: "center"
#| out-width: "100%"
#| fig-cap: "Topographic relief map of Washington (a) and Georgia (b). Note: Regions are not to scale and elevation ramps are independent (WA range is ~3x GA)."
knitr::include_graphics(
here::here("figs", "combined_topo.png")
)
```
:::{.notes}
- **Scale Disparity:** Remind the audience that WA peaks reach ~4,400m
while GA peaks reach ~1,450m. The color ramps are local.
- **Rain Shadow:** Point out the Cascade barrier in WA; this is the
primary driver for the precipitation variance in the model.
- **Modeling Link:** This extreme relief is why we use a Yeo-Johnson
transformation on elevation in our tidymodels recipe—a linear
scale would over-emphasize alpine peaks while flattening
the Georgia Piedmont.
:::
## Regional Rainfall
```{r}
#| label: fig-precip-hex
#| fig-cap: "Mean annual precipitation (mm). Note the extreme gradient in WA (training) vs. the relative uniformity of GA (target)."
targets::tar_read(map_precip_hex)
```
## Level III Ecoregions
```{r}
#| label: fig-ecoregion-comparison
#| fig-cap: "Washington (a) has nine distinct regions while Georgia (b) has seven. Data sourced from U.S. EPA Level III Ecoregions [@epa_ecoregions_2013; @omernik_ecoregions_1987]."
knitr::include_graphics(here::here("figs", "map_wa_ga_ecoregions.png"))
```
:::{.notes}
- Ecoregions denote areas with similar ecosystems and resources.
- The EPA defines 105 Level III regions for management.
- James Omernik drew these lines using holistic expert synthesis.
- Washington and Georgia share zero common ecoregions.
- Washington transitions rapidly from rainforests to arid deserts.
- This extreme heterogeneity makes random spatial modeling difficult.
:::
# Explore
## Descriptive Summary
```{r}
#| label: display-summary
#| echo: false
tar_read(tbl_forest_wa)
```
## Distributions
```{r}
#| label: fig-distributions
#| fig-cap: "Comparison of environmental variable distributions for forested vs. non-forested areas."
targets::tar_read(plot_distrib_wa)
```
::: {.notes}
**1. Topic Introduction**
- This slide presents a univariate audit of our numeric predictors to identify which biophysical features provide the strongest signal for forestation.
- By comparing the "fingerprints" of forested (green) and non-forested (brown) plots, we can visually assess the potential for classification before we begin training models on our EPYC VM.
**2. Axis Definitions**
- **The X-Axis (Value)**: Represents the measurement for each specific biophysical variable, such as millimeters of rain or degrees Celsius.
- **The Y-Axis (Density)**: Represents the probability density for a given value; higher peaks indicate a higher frequency of observations at that specific value within the dataset.
**3. Significant Variables (High Contrast)**
- **Precipitation (`precip_annual`)**: This is a primary driver; forested plots are heavily concentrated in higher rainfall zones, while non-forested plots dominate the dry end of the spectrum.
- **Elevation**: There is a distinct "Forestation Window"; plots between 1,000 and 2,000 meters show a significant green peak, whereas non-forested plots cluster at lower elevations.
- **Temperature (`temp_annual_max` & `mean`)**: Forested plots consistently peak at lower maximum and mean temperatures compared to non-forested areas, suggesting a thermal threshold for forest growth.
- **Vapor Pressure (`vapor_max` & `min`)**: We see a strong bimodal separation; forested areas occupy a specific atmospheric moisture niche distinctly different from non-forested regions.
**4. Non-Significant Variables (High Overlap)**
- **Orientation (`eastness` & `northness`)**: These distributions are nearly identical for both classes, suggesting that cardinal direction alone is a weak predictor in this regional regime.
- **Roughness**: While there is a slight lean toward forested plots being in rougher terrain, the massive overlap indicates surface texture is not a primary discriminator.
:::
## Outliers
```{r}
#| label: outliers
tar_read(plt_outliers)
```
## Map Outliers
```{r}
#| label: fig-wa-outliers
#| fig-cap: "Map of observations with a value greater than three standard deviations from mean."
#| out-width: "100%"
# Build the path from the project root with here::here(), as the other
# figure chunks in this document do, so rendering does not depend on the
# working directory.
knitr::include_graphics(here::here("figs", "wa_outliers.png"))
```
::: {.notes}
**1. Topic Introduction**
* This map visualizes our "3-Sigma" outliers, which are heavily concentrated along the mountainous west side of the state.
**2. The Orographic Factor**
* The concentration on the west is driven by the Cascades and Olympics. These regions host our most extreme biophysical values for precipitation and elevation.
**3. Intermixed Extremes**
* Note the intermixing of green and brown points. In these high-volatility alpine zones, a forest and a barren ridge often share the same coordinates.
* This proves that local "nearness" is not enough to predict forestation here; the model must rely on the specific biophysical drivers we identified in our density distributions.
**4. The Audit Link**
* These outliers represent the "edge cases" of our Washington model. Their intermixed nature makes them the hardest points to classify, serving as a preview for our Georgia transfer test.
:::
## Principal Component Analysis
```{r}
#| label: pca
tar_read(plt_wa_pca)
```
::: {.notes}
**1. What is PCA?**
PCA (Principal Component Analysis) is a dimensionality reduction tool that takes our 16 variables—including latitude, longitude, and climate data—and compresses them into two primary axes called Principal Components. It allows us to view the 'shape' of the entire Washington dataset in a single 2D space.
**2. Why use it here?**
We use it to explore the structural integrity of our data. Before building a model, we need to know if the environment of 'Forested' plots is actually mathematically different from 'Non-Forested' plots. By including lat and lon, we are seeing the combined power of geography and biophysics.
**3. Does it show anything?**
It shows that the data is not a random cloud; it has a clear orientation. The spread along PC1 captures the primary environmental gradient of Washington—likely moving from the moist coast to the arid east.
**4. Is there good separation on the outcome variable?**
Yes, the separation is significant. We see a distinct 'No' (Non-Forested) cluster forming a tail on the right and a dense 'Yes' (Forested) cluster on the left. While there is 'Alpine Mixing' in the center where categories overlap, the two groups occupy mostly different regions of the feature space.
**5. What does it foreshadow for modeling?**
This separation foreshadows high accuracy for our local Washington model. Because the classes are so distinct in this space, a logistic regression should have no trouble drawing a boundary between them. However, the tight coupling of biophysics with coordinates (lat/lon) here warns us that the model might 'memorize' Washington's map, which will be the primary challenge when we attempt to transfer it to Georgia.
:::
## Correlogram
```{r}
#| label: correlogram
tar_read(plt_correlogram)
```
::: {.notes}
**1. Orientation**
- If you look at the very first column on the left, we can see exactly what drives our "Forested" classification.
- Blue means "More Forest," Orange means "Less Forest."
**2. The Sanity Check**
- First, look at the bottom square: **Canopy Cover (0.75)**.
- This is our sanity check. Obviously, forests have high canopy cover. If this wasn't blue, our data would be broken.
**3. The Biophysical Story: Water vs. Heat**
- The real story is the battle between water and heat.
- **Precipitation (0.52)** is a strong blue driver. In Washington, rain equals trees.
- **Vapor Pressure (-0.64)** is a deep orange driver. High vapor pressure—which correlates with hot, dry valleys—effectively kills the forest probability.
**4. The Terrain Factor**
- Look at **Roughness (0.39)**.
- Rugged, difficult terrain is more likely to be forested. This is likely a mix of biophysics (mountains catch rain) and human history (flat land gets cleared for farming).
**5. The Surprise**
- Finally, look at **Northness and Eastness**. They are near **zero**.
- This tells us that while the *direction* a slope faces might change *which* trees grow there, it doesn't determine *if* trees grow there.
:::
## VIP
```{r}
#| label: variable-importance-plt
tar_read(plt_vip)
```
::: {.notes}
**1. The Comparison**
- "We just looked at Correlations (linear relationships). Now let's look at Variable Importance via Random Forest. This is what the model actually uses to make decisions."
**2. The Consistency**
- "The top three are the same: Canopy Cover, Rain, and Aridity (Vapor Pressure). This confirms our model is learning real biophysics."
**3. Spatial Factors**
- "But look at number 4: **Longitude**."
- "In the correlation chart, Longitude was just a moderate factor. Here, it is massive."
- "The model has learned that Washington is divided into two distinct climate zones—West and East."
- "Instead of learning the physics of *why* trees grow there, it's partially just memorizing *where* they grow. This confirms our hypothesis: the model is using geography as a shortcut. That is worth remembering when we apply it to the Georgia data."
:::
## UMAP
```{r}
#| label: umap-plot
tar_read(umap_plot)
```
## Spatial Dependency Analysis
```{r}
#| label: fig-moran
#| echo: false
#| fig-align: "center"
#| fig-cap: "<b>Moran Scatterplot.</b> The strong positive slope confirms significant spatial autocorrelation ($I > 0.6$)."
tar_read(p_moran_exploration)
```
:::{.notes}
SPEAKER NOTES:
1. THE VISUAL EVIDENCE: Point out the steep, positive slope of the
red dashed line. This slope is a visual representation of the
Global Moran's I. A positive slope confirms that high-elevation
plots are surrounded by other high-elevation plots (Top Right
Quadrant), while low-elevation areas are also clustered (Bottom Left).
2. THE "CHEATING" PROBLEM: Explain that this clustering is why
standard Random Cross-Validation is insufficient. If a training
point and a testing point are only 5km apart, the model can
effectively "cheat" by using local similarities rather than
learning the broader ecological relationships.
3. THE JUSTIFICATION: This plot is the primary justification for:
- Using Spatial Block Cross-Validation to force the model to
predict on entirely unseen regions.
- Removing "Northness" and "County" as predictors to prevent the
model from simply memorizing regional averages.
- Applying the Yeo-Johnson transformation to normalize the extreme
elevation variance seen in these clustered Cascade peaks.
4. THE SCALE: Note that we used a 5km fixed-distance neighborhood
transformed into Washington State Plane North (meters) to ensure
the spatial relationships are geographically accurate.
:::
# Resampling
## Spatial Autocorrelation
<br/>
<br/>
>"When data are not independent (e.g. due to spatial autocorrelation), random cross-validation yields optimistic estimates of predictive performance because training and test sets are not independent."[@roberts_crossvalidation_2017]
:::{.notes}
**1. Translation of the Quote**
This quote describes the "Golden Rule" of geography: "Everything is related to everything else, but near things are more related than distant things."
**2. Definition: Spatial Autocorrelation**
Spatial Autocorrelation just means that data points close to each other are practically clones. If it's raining at your house, it's probably raining at your neighbor's house.
**3. The `forested` Dataset**
Forests are "clumpy." If you stand next to a Douglas Fir in Washington and take one step to the left, you are almost certainly still in a forest. The elevation, soil, and rain are identical.
**4. Why Random CV is "Optimistic" (The Cheating)**
- When the standard **Random Cross-Validation** is used, the first tree is assigned to the "Study Group" and the second tree (one step away) to the "Test Group."
- The model doesn't learn ecology. It just looks at the neighbor (lat and lon) and copies the answer.
- This gives us an **"Optimistic Estimate"**—a fancy way of saying our high score was fake because the model was cheating off its neighbor.
:::
## The Mechanics of Resampling
```{r}
#| label: fig-resampling
#| echo: false
#| fig-cap: "Visualizing the resampling process [@kuhn_tidy_2022]"
#| fig-align: "center"
#| out-width: "75%"
#| out-extra: 'style="width:75%;"'
knitr::include_graphics(here::here("images", "resampling.svg"))
```
:::{.notes}
- **The Concept:** Resampling methods (like cross-validation and bootstrapping) are **empirical simulation systems**. They generate different versions of our training set to simulate how the model handles new data.
- **The Golden Rule:** It is critical to remember: Resampling is *always* used with the **Training set**. The **Test set** is not involved.
- **The Vocabulary:** To avoid confusion with our initial Train/Test split, we use specific language for these internal loops:
- **Analysis Set:** The subset used to **fit** the model.
- **Assessment Set:** The subset used to **evaluate** performance.
- **The Mechanism:** In every iteration, these two sets are **mutually exclusive**. We fit on the Analysis set, and we measure performance on the Assessment set.
- **The Why:** As we discussed, simply re-predicting the training set is problematic (it leads to optimism bias). Resampling allows us to get a realistic appraisal using the training set without ever touching the final test data.
:::
## Random K-Fold Cross-Validation
```{r}
#| label: fig-classic-cv
#| fig-cap: "Conceptual diagram showing the random assignment of observations to the analysis and assessment groups."
# Pass the target name as a bare symbol, the form tar_read() documents and
# every other chunk in this document uses.
tar_read(fig_classic_cv)
```
## Cross Validation Strategies
```{r}
#| label: fig-cv-strategies
#| echo: false
#| fig-width: 14
#| fig-height: 5
#| out-width: "100%"
#| fig-cap: "Three validation strategies. **Left:** Random splitting mixes train/test points. **Middle:** Spatial blocking forces geographic separation. **Right:** Clustering blocks by environmental similarity. (Note the outline for the Columbia Plateau. See @fig-ecoregion-comparison.)"
tar_read(plot_cv_comparison)
```
::: {.notes}
**1. Left Panel: Random CV (The Illusion of Accuracy)**
- This visualizes why Random CV yields **over-optimistic estimates**.
- Because the colors are mixed (Random), the model can accurately predict a "Red" point simply by memorizing the "Blue" point next to it.
- This isn't "true" predictive power; it is **autocorrelation leakage**. The model is interpolating neighbors rather than learning the underlying ecological rules.
**2. Middle Panel: Spatial Blocking (Forcing Independence)**
- To get a **realistic assessment**, we must enforce spatial independence.
- The grid structure ensures that the test data (Red blocks) is geographically distinct from the training data.
- The performance score will likely drop compared to the first map, but that lower score is **more accurate**. It reflects how the model will actually perform on a new, unvisited site.
**3. Right Panel: Environmental Clustering (Testing Generalization)**
- This strategy tests for **ecological generalization**.
- Notice the large red area in the southeast—the algorithm identified the **Columbia Plateau** as a distinct environment.
- By holding out entire environments (e.g., training on "Wet Coastal" to predict "Dry Plateau"), we test if the model captures the fundamental biological relationships (e.g., how temp/rain affect trees) rather than just memorizing geographic trends.
:::
## Analysis vs. Assessment
```{r}
#| label: fig-mechanics
#| fig-cap: "Visualization of Fold 1 across three cross-validation strategies. Magenta points represent the held-out assessment set."
tar_read(fig_fold_mechanics)
```
::: {.notes}
- **Visualizing the Split**: This slide illustrates Fold 1 of 5; gray points represent the "Analysis" set used for training, while magenta points represent the "Assessment" set the model must predict.
- **Confetti vs. Blocks**: The Random split (left) creates a "confetti" effect where every test point is surrounded by nearby training points, leading to the spatial autocorrelation and "optimism bias" we discussed earlier.
- **Geographic and Ecological Isolation**: The middle and right maps show how we force the model to predict across geographic and ecological gaps.
- **The Columbia Plateau Test**: Specifically in the Environmental Clustering map (right), the entire Columbia Plateau is isolated as a test set.
- **Validating Results**: Because the model had to "learn" forests in the mountains to predict the Plateau, we gained high confidence in its performance there, which was later confirmed by the near-zero error rate in that region.
- **Preparation for Georgia**: This level of isolation is a direct rehearsal for our next step, where we move from the Washington ecoregions to the completely unfamiliar landscapes of Georgia.
:::
# Models
## Engines
::: {.incremental}
1. Logistic Regression
2. MARS
3. Random Forest
4. XGBoost
:::
::: {.notes}
**Logistic Regression:**
Simple, interpretable baseline. Captures linear relationships efficiently.
**MARS:**
Models non-linearities automatically. Good balance between linear and trees.
**Random Forest:**
Robust ensemble method. Reduces overfitting through averaging.
**XGBoost:**
High-performance gradient boosting. Often dominates on tabular data.
:::
## Recipe A: With Coords
```{.r code-line-numbers="2|3"}
recipe(forested ~ ., data = train_data) %>%
# geometry is ID, but lat/lon remain as predictors
update_role(geometry, new_role = "id") %>%
step_novel(all_nominal_predictors()) %>%
step_dummy(all_nominal_predictors()) %>%
step_zv(all_predictors()) %>%
step_normalize(all_numeric_predictors())
```
::: {.notes}
**Base Strategy:** Standard approach uses latitude and longitude as predictive features. Risk is the model memorizing locations instead of learning rules.
:::
## Recipe B: No Coords
```{.r code-line-numbers="2|3"}
recipe(forested ~ ., data = train_data) %>%
# Explicitly remove lat/lon from training
update_role(geometry, lat, lon, new_role = "id") %>%
step_novel(all_nominal_predictors()) %>%
step_dummy(all_nominal_predictors()) %>%
step_zv(all_predictors()) %>%
step_normalize(all_numeric_predictors())
```
::: {.notes}
**Non-Spatial:** Removes explicit coordinates to prevent spatial overfitting. Forces the model to rely solely on biological environmental signals.
:::
## Recipe C: Extensible
```{.r code-line-numbers="4-10|12"}
recipe(forested ~ ., data = train_data) %>%
update_role(geometry, lat, lon, new_role = "id") %>%
# 1. Remove political/time markers
step_rm(northness, county, year) %>%
# 2. Add Physics (Aridity & Temp Range)
step_ratio(precip_annual, denom = denom_vars(temp_annual_max)) %>%
step_mutate(
temp_range = temp_annual_max - temp_annual_min,
vpd_range = vapor_max - vapor_min
) %>%
# 3. Fix Skew (Critical for Logistic Regression)
step_YeoJohnson(elevation) %>%
step_dummy(all_nominal_predictors()) %>%
step_normalize(all_numeric_predictors())
```
::: {.notes}
**Extensible:** Engineers physics-based features like aridity and temperature range. Transforms skewed variables to help linear models extrapolate to new regions.
:::
## YeoJohnson Transformation
```{r}
#| echo: false
#| fig-align: center
#| fig-width: 10
#| fig-height: 5
#| fig-cap: "<b>Normalizing Elevation via Yeo-Johnson Transformation.</b> The raw elevation data (left) exhibits strong right-skewness, which can degrade linear model performance. Applying a Yeo-Johnson transformation with λ=0.49 (right) successfully normalizes the distribution, satisfying the linearity assumptions required for the Extensible Logistic Regression model."
tar_read(plot_yeo)
```
:::{.notes}
Why this matters:
- **The Problem (Left):** Raw elevation data is highly skewed. Linear models (like Logistic Regression) struggle with this because they assume a consistent relationship across the range.
- **The Solution (Right):** The Yeo-Johnson transformation normalizes the distribution (bell curve).
- **The Result:** This allows the model to "see" the signal clearly, improving stability when moving to new regions like Georgia.
:::
## Resampling Strategies
```{.r code-line-numbers="2|5|8"}
# A. Random Folds (Standard)
vfold_cv(train_data, v = 10, strata = forested)
# B. Spatial Blocks (Grid-based)
spatial_block_cv(train_data, v = 10)
# C. Spatial Clustering (Region-based)
spatial_clustering_cv(train_data, v = 10)
```
::: {.notes}
- **Random Folds:** Standard approach. Randomly shuffles data. Dangerous here because it allows "cheating" via nearby pixels.
- **Spatial Blocks:** Divides the map into a checkerboard. Forces the model to predict on a blind grid square.
- **Spatial Clustering:** Uses K-means to create distinct ecological zones. The hardest test—simulates moving to a totally new region.
:::
# Results
## Spatial Validation Analysis
```{r}
#| label: fig-spatial-results
#| fig-cap: "Comparison of Model Performance (ROC AUC) across three spatial validation strategies. Benchmark (0.96 ROC AUC) indicated by the horizontal dashed line."
#| echo: false
#| message: false
tar_read(fig_cv_comparison)
```
::: {.notes}
- **Optimism Bias:** Notice the "Random CV" column. It shows nearly perfect performance (>0.95 ROC AUC). This is often a "spatial mirage" where the model is simply memorizing locations (autocorrelation) rather than learning environmental drivers.
- **Spatial Honesty:** The "Block" and "Cluster" columns provide a more realistic estimate of how the model will perform on new, geographically distant areas. This represents the "true" performance we should expect for out-of-sample prediction.
- **Feature Leakage:** Compare the "With Coords" vs. "No Coords" rows. If the "With Coords" model crashes in performance during Block CV but holds steady in Random CV, it is a clear sign of overfitting to spatial coordinates (Lat/Lon) rather than the underlying forest ecology.
- **Performance Benchmark:** The dashed line at 0.96 represents an established, high-performance baseline for forest classification. It serves as a "line in the sand" to determine if our machine learning approach provides a meaningful improvement over traditional methods; a model is only truly successful if it can exceed this 0.96 threshold under the pressure of spatial cross-validation.
:::
## Performance Stability
```{r}
#| label: fig-stability
#| fig-cap: "Distribution of ROC AUC scores across individual cross-validation folds. Note the variance in scores by resampling method."
#| echo: false
#| message: false
tar_read(fig_model_stability)
```
::: {.notes}
- **Falsely Confident:** In **Random CV**, notice how tightly clustered the points are at the top; the model's performance is artificially stable because every fold contains a representative "sprinkling" of the entire dataset.
- **The Reality of Variance:** As we transition to **Cluster CV**, the "violin" stretches out, indicating that the model performs significantly better in some geographic regions than others.
- **Identifying Weak Spots:** Each point in the Cluster CV column represents a specific geographic area; the points near the bottom of the violin represent "hard-to-predict" regions where the model's current features might be insufficient.
- **Predictive Risk:** While Random CV suggests the model is ~98% accurate everywhere, this plot proves that in some clusters, performance may actually dip toward 85%.
- **Stakeholder Transparency:** This variance is a critical insight for stakeholders, as it defines the geographic boundaries of where the model's predictions can be most (and least) trusted.
:::
## Predict on Test Set
```{r}
#| label: fig-final-test
tar_read(fig_final_performance)
```
::: {.notes}
- **Beyond the Fold:** This result represents the model's performance on the 20% test set that was "locked away" at the beginning of the project.
- **The Spatial Paradox:** You will notice our Test AUC (0.97) is actually *higher* than our Validation AUC (0.927). In standard AI, this is rare. In forestry, this tells us two things:
1. **Interpolation Power:** The high test score proves the model is excellent at "filling in gaps" within Washington, where it can leverage the patterns of nearby trees.
2. **Extrapolation Power:** The lower (0.927) validation score is our "honest" baseline for new regions, where we stripped away those spatial clues.
- **Classification Nuance:** The 91% Accuracy vs. 97% ROC suggests our model is a better "ranker" than a "classifier." It understands the *gradient* of forest probability better than the hard binary of "Tree vs. No Tree."
- **Validation Success:** The fact that our "Honest" Spatial CV score (0.93) is so high confirms that the 0.97 on the test set isn't just a fluke of spatial memory—it's built on a solid foundation of learning spectral signatures.
:::
## Test vs. Resample Performance
<br/>
<br/>
<br/>
```{r}
#| label: tbl-performance
#| echo: false
#| tbl-cap: "Comparison of model performance on resamples versus on the test set. Note that ROC increased."
targets::tar_read(tbl_performance)
```
## Confusion Matrix
```{r}
#| label: fig-confusion
#| fig-cap: "Confusion matrix showing the classification performance of the final model on the 20% held-out Washington test set."
tar_read(fig_confusion_matrix)
```
::: {.notes}
- **Anatomy of Error:** This matrix breaks down our 91.1% accuracy into specific types of successes and failures, helping us move beyond a single aggregate number.
- **Symmetry of Mistakes:** We are looking for balance between the off-diagonal squares; a heavy skew toward one side would indicate the model has a systematic bias toward over-predicting or under-predicting forest cover.
- **False Positives vs. Negatives:** In ecological terms, False Positives often represent "ghost forests" where the structure exists but the classification differs, while False Negatives are "missed forests" where the model failed to detect the canopy signal.
- **Probability Sensitivity:** Since our ROC AUC is a high 0.97, most of these errors likely occur at the "decision boundary"—meaning the model was nearly correct (e.g., 48% probability) but the hard 50% cutoff forced an error.
- **Production Readiness:** The high density in the True Positive and True Negative quadrants confirms that the model is robust enough for regional mapping, despite the inherent complexity of transition zones.
:::
## Benchmarks
::: {style="font-size: 75%;"}
| Authority | Study Context | Accuracy |
| :--- | :--- | :--- |
| **Ismail et al. (2013)** | Ideal Conditions (Sclerophyll Forest) | **96%** |
| **USGS NLCD** | Federal Standard (US Gov) | **91%** |
| **Our Model (WA)** | Pacific Northwest Training | **90.7%** |
| **Complex Boreal** | Difficult Terrain (Alaska) | **~78%** |
:::
## Spatial Error Analysis
```{r}
#| label: fig-map-wa-errors
#| out-width: "100%"
#| fig-cap: "<b>Map showing Type I and II errors from model.</b> Points are shaded from purple to bright yellow based upon the absolute error of the prediction probability. Note the lack of errors in the Columbia Basin."
knitr::include_graphics("figs/wa_errors.png")
```
::: {.notes}
- **The "Hallucinations":** We are looking at the ~130 mistakes the model made on the test set.
- **Confidence vs. Confusion:** The brightest (yellow) points are where the model was "confidently wrong" (high error magnitude). These aren't just close calls; the model was >90% sure based on the physics features (like elevation/aridity) but missed the biological reality.
- **Geography of Error:** Notice the clustering. The errors aren't random; they hug the alpine transition zones and the rugged coastline, suggesting the model struggles most at the "biophysical edges" where the rules of the forest change rapidly.
:::
# Extrapolation
## The Goal of Prediction {style="text-align: center;"}
<br/>
<br/>
> "The fundamental goal of a model is not to describe the data we have, but to predict the data we don't." [@kuhn_applied_2013]
::: {.notes}
- This quote from Kuhn and Johnson is the foundation of our entire project.
- If our model doesn't generalize to the "second island" (Georgia), it has failed its fundamental goal.
:::
## Assessing Domain Applicability
```{r}
#| label: fig-aoa-georgia
#| echo: false
#| out-width: "100%"
#| fig-cap: "Area of Applicability (AOA) Analysis. The Dissimilarity Index (DI) measures how different the Georgia environment is from Washington's. Note the similarity to the Level III Ecoregion plot @fig-ecoregion-comparison."
targets::tar_read(plot_aoa_ga)
```
::: {.notes}
- Before we even attempt to predict forests in Georgia, we have to ask a fundamental question: **Is it fair to ask a Washington model to understand Georgia?** We can't just assume the rules of nature are the same. We need to measure the mathematical distance between these two worlds.
**What am I looking at?**
- This map **does not** show predictions. It shows **familiarity**.
- We calculated a **Dissimilarity Index** for every pixel in Georgia. Essentially, we asked the model: *"Have you seen conditions like this before?"*
- **Dark Purple/Black:** These areas are the "safe zones." The elevation, temperature, and precipitation here fall within the ranges the model learned in the Cascades.
- **Bright Yellow:** These are the "alien" zones. The combination of variables here (likely the hot, humid lowlands) is completely outside the model's experience. This is pure extrapolation.
**The Takeaway:**
- This creates a **Risk Map**. If our model fails, we expect it to fail *here* [gesture to yellow areas].
- It tells us where our confidence should be high (the purple) and where any prediction is just a wild guess (the yellow).
:::
## External Validation
```{r}
#| label: fig-ga-predictions
#| echo: false
#| out-width: "100%"
#| fig-cap: "The model predictions of forests in Georgia (a) versus the true forest inventory (b)."
targets::tar_read(map_ga_probs)
```
::: {.notes}
- We took the model trained in the Pacific Northwest and asked it: "Where are the forests in Georgia?"
- The Map: This shows the model's raw probability output.
- The Pattern: You can see it identifying the Blue Ridge Mountains (yellow/green) in the northeast.
- The Question: Does this match reality? Or is it seeing "forests" in places that are actually agricultural fields or swamps?
:::
## Quantifying the Error
```{r}
#| label: fig-ga-confusion
#| echo: false
#| out-width: "80%"
#| fig-align: "center"
#| fig-cap: "<b>Confusion Matrix (Georgia).</b> The model accuracy drops significantly compared to Washington. Note the high number of false negatives (Prediction: No / Truth: Yes)."
targets::tar_read(ga_conf_mat)
```
## Mapping the Failures
```{r}
#| label: fig-ga-errors
#| echo: false
#| out-width: "100%"
#| fig-cap: "Spatial Distribution of Errors. (a) shows the dissimilarity of Georgia from Washington. (b) shows error density increasing in southern Georgia."
targets::tar_read(map_failure_mechanism)
```
::: {.notes}
**Visualizing the "Phantom Forests"**
- This map only shows the mistakes. And unlike Washington, where we had a handful of dots, here the map is lit up.
- **Orange Points (False Positives):** Look at the massive cluster in the South/Southeast.
- These are the **"Phantom Forests."**
- Notice how they perfectly overlap with the "Yellow Zone" (high dissimilarity) we identified at the start. The model saw crops and scrubland and hallucinated trees.
- **The Verdict:** The error isn't random. It is geographically structured. We broke the model exactly where the AOA predicted it would break.
:::
## Lessons Learned
::: {.incremental}
* **Accuracy Collapse:** ~89% (WA) $\to$ ~54% (GA).
* **AOA Validation:** The "Yellow Zone" correctly flagged the risk.
* **The Trap:** High confidence in "Phantom Forests."
* **The Fix:** Quantify domain distance *before* deployment.
:::
::: {.notes}
**1. The Numbers Don't Lie**
We witnessed a catastrophic failure in performance. In Washington, we had a precision instrument (90% accuracy). In Georgia, we essentially flipped a coin (54%). If we had deployed this model in production without validation, we would be generating random noise.
**2. The "Yellow Zone" Was a Warning, Not a Bug**
Remember that bright yellow map? That wasn't just a pretty picture. The Area of Applicability (AOA) screamed at us that the Southeastern Plains were alien territory. The model failed exactly where the AOA said it would—predicting forests in flat, hot agricultural zones it didn't understand.
**3. The "Phantom Forest" Problem**
Our confusion matrix showed a massive spike in False Positives. This is dangerous. The model didn't say "I don't know"; it confidently declared "Yes, there is a forest here." We call these "Phantom Forests." In a real-world scenario—like carbon credit monitoring or fire risk assessment—phantom forests cost millions of dollars.
**4. The Ultimate Takeaway**
The model failed, but the **workflow succeeded**. By calculating the multidimensional distance between our training data and our target data, we predicted *where* the model would break before we even ran it.
**Conclusion:** In spatial data science, you cannot simply "train and deploy." You must respect the ecological boundaries of your training data.
:::
# References
::: {#refs}
:::

22
man/calculate_ga_aoa.Rd Normal file
View File

@@ -0,0 +1,22 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/functions.R
\name{calculate_ga_aoa}
\alias{calculate_ga_aoa}
\title{Calculate Area of Applicability Data}
\usage{
calculate_ga_aoa(train_data, test_data, predictors)
}
\arguments{
\item{train_data}{Dataframe. The training data from Washington.}
\item{test_data}{Dataframe. The extrapolation data from Georgia.}
\item{predictors}{Character vector. The list of predictor variable names.}
}
\value{
An \code{sf} object containing the Georgia data with an added 'di' (Dissimilarity Index) column.
}
\description{
Generates the Area of Applicability (AOA) scores (Dissimilarity Index)
for the Georgia extrapolation dataset based on the Washington training data.
}

28
man/combine_forest.Rd Normal file
View File

@@ -0,0 +1,28 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/functions.R
\name{combine_forest}
\alias{combine_forest}
\title{Combine Washington and Georgia Forest Data}
\usage{
combine_forest(wa_data, ga_data)
}
\arguments{
\item{wa_data}{A data frame containing the Washington forest inventory data.}
\item{ga_data}{A data frame containing the Georgia forest inventory data.}
}
\value{
A single combined data frame with an additional column \code{.id}
(renamed to "state") indicating the source ("WA" or "GA").
}
\description{
Merges the Washington and Georgia datasets into a single data frame, adding a
column to identify the source state.
}
\examples{
\dontrun{
combined <- combine_forest(wa_raw, ga_raw)
table(combined$state)
}
}

View File

@@ -0,0 +1,20 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/functions.R
\name{create_stats_summary}
\alias{create_stats_summary}
\title{Create Statistical Summary of Forest Data}
\usage{
create_stats_summary(data)
}
\arguments{
\item{data}{A data frame or sf object containing the forest data.}
}
\value{
A data frame with descriptive statistics (mean, sd, min, max, etc.),
sorted by descending absolute kurtosis.
}
\description{
Generates descriptive statistics for numeric variables in the dataset,
excluding spatial coordinates (lat/lon) and year. It sorts the results
by kurtosis to highlight non-normal distributions.
}

View File

@@ -0,0 +1,15 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/forestedAnalysis-package.R
\docType{package}
\name{forestedAnalysis-package}
\alias{forestedAnalysis}
\alias{forestedAnalysis-package}
\title{forestedAnalysis: Spatial Cross-Validation and AOA Analysis of Forest Cover}
\description{
A research compendium analyzing forest cover data in Washington and Georgia. It evaluates the Area of Applicability (AOA) and demonstrates model failure during spatial extrapolation.
}
\author{
\strong{Maintainer}: Rob Wiederstein \email{khuon68@gmail.com}
}
\keyword{internal}

23
man/get_epa_ecoregions.Rd Normal file
View File

@@ -0,0 +1,23 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/functions.R
\name{get_epa_ecoregions}
\alias{get_epa_ecoregions}
\title{Download EPA Level III Ecoregions Data}
\usage{
get_epa_ecoregions(url, dest_dir = "data/epa")
}
\arguments{
\item{url}{Character string. The direct URL to the EPA Ecoregions zip file.}
\item{dest_dir}{Character string. The local directory where the data should
be saved. Defaults to "data/epa".}
}
\value{
A character string containing the full file path to the downloaded zip file.
This return value is designed to be tracked by \code{targets}.
}
\description{
Downloads the EPA Level III Ecoregions shapefile (zip format)
to a local directory. Implements a caching check to avoid re-downloading
if the file already exists.
}

35
man/helper_save_fig.Rd Normal file
View File

@@ -0,0 +1,35 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/functions.R
\name{helper_save_fig}
\alias{helper_save_fig}
\title{Helper: Save Plot for Quarto Slide}
\usage{
helper_save_fig(
plot_obj,
name,
type = c("map", "plot"),
width = 10,
height = 6.18,
dpi = 300
)
}
\arguments{
\item{plot_obj}{The ggplot object to save.}
\item{name}{A short descriptive name (e.g., "wa_ecoregions").}
\item{type}{Either "map" or "plot". Adds this prefix to the filename.}
\item{width}{Width in inches (default: 10).}
\item{height}{Height in inches (default: 6.18).}
\item{dpi}{Resolution (default: 300).}
}
\value{
The full file path (invisibly).
}
\description{
Saves a ggplot object as a PNG, sized to fit comfortably
below a standard slide title, with robust font handling.
}

View File

@@ -0,0 +1,20 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/functions.R
\name{plot_ecoregion_comparison}
\alias{plot_ecoregion_comparison}
\title{Plot Ecoregion Complexity Comparison (WA vs GA)}
\usage{
plot_ecoregion_comparison(eco_data)
}
\arguments{
\item{eco_data}{An \code{sf} object containing ecoregion polygons. Must contain
columns \code{STATE_NAME} and \code{US_L3NAME}.}
}
\value{
A \code{patchwork} object containing the combined plot.
}
\description{
Generates a side-by-side comparison of Level III ecoregions for Washington
and Georgia. It uses a "void" theme, qualitative colors, and carefully tuned
label repulsion settings to avoid overlapping text.
}

View File

@@ -0,0 +1,23 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/functions.R
\name{plot_failure_mechanism}
\alias{plot_failure_mechanism}
\title{Plot Failure Mechanism Comparison}
\usage{
plot_failure_mechanism(aoa_data, pred_data, boundaries)
}
\arguments{
\item{aoa_data}{Dataframe containing the AOA results (must have 'di', 'lon', 'lat').}
\item{pred_data}{Dataframe containing prediction results (columns: .pred_class, forested, lon, lat).}
\item{boundaries}{An \code{sf} object containing state boundaries (must include "GA" or "Georgia").}
}
\value{
A \code{patchwork} object containing the side-by-side comparison.
}
\description{
Creates a side-by-side diagnostic plot returning a patchwork object.
(a) The Area of Applicability (Dissimilarity Index) showing where the model is extrapolating.
(b) The spatial distribution of actual classification errors.
}

View File

@@ -0,0 +1,21 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/functions.R
\name{plot_ga_comparison_map}
\alias{plot_ga_comparison_map}
\title{Plot Georgia Forest Comparison}
\usage{
plot_ga_comparison_map(pred_data, boundaries)
}
\arguments{
\item{pred_data}{Dataframe containing prediction results (columns: .pred_class, forested, lon, lat).}
\item{boundaries}{An \code{sf} object containing state boundaries (must include "GA" or "Georgia").}
}
\value{
A \code{patchwork} object containing the labeled comparison plot.
}
\description{
Creates a side-by-side comparison of forest cover for Georgia.
The left plot (a) is the Model Prediction, and the right plot (b) is the Actual Data.
Features a shared right-side legend and standardized spatial styling.
}

17
man/plot_georgia_aoa.Rd Normal file
View File

@@ -0,0 +1,17 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/functions.R
\name{plot_georgia_aoa}
\alias{plot_georgia_aoa}
\title{Plot Georgia Area of Applicability (AOA)}
\usage{
plot_georgia_aoa(aoa_sf)
}
\arguments{
\item{aoa_sf}{An \code{sf} object containing the 'di' column (output of \code{calculate_ga_aoa}).}
}
\value{
A \code{ggplot} object showing the Dissimilarity Index map.
}
\description{
Plots the pre-calculated Dissimilarity Index (DI) for Georgia.
}

View File

@@ -0,0 +1,34 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/functions.R
\name{plot_precip_hex_comparison}
\alias{plot_precip_hex_comparison}
\title{Plot Annual Rainfall Comparison (Clipped Hexes)}
\usage{
plot_precip_hex_comparison(
wa_data,
ga_data,
boundaries,
bins = 30,
max_limit = 2500
)
}
\arguments{
\item{wa_data}{Dataframe containing Washington data (requires 'precip_annual', 'lat', 'lon').}
\item{ga_data}{Dataframe containing Georgia data (requires 'precip_annual', 'lat', 'lon').}
\item{boundaries}{An \code{sf} object containing state boundaries.}
\item{bins}{Integer. Number of hexes across the state width. Default is 30.}
\item{max_limit}{Numeric. The visual cap for rainfall (mm) to ensure comparable scales. Default is 2500.}
}
\value{
A \code{patchwork} object containing the side-by-side comparison.
}
\description{
Creates a polished side-by-side comparison of annual precipitation.
Hexagons are spatially generated and clipped to the exact state boundaries
to eliminate "bleeding" edges. Uses \code{theme_forestry_void} with explicit
font sizing to match topographic maps.
}

View File

@@ -0,0 +1,23 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/functions.R
\name{plot_regional_comparison}
\alias{plot_regional_comparison}
\title{Plot Regional Comparison of Forested Data (WA vs GA)}
\usage{
plot_regional_comparison(data, boundaries)
}
\arguments{
\item{data}{A data frame containing the forest point data. Must contain
columns \code{lon}, \code{lat}, \code{state}, and \code{forested}.}
\item{boundaries}{An \code{sf} object containing state boundaries. Must contain
a \code{NAME} column.}
}
\value{
A \code{patchwork} object containing the combined side-by-side maps.
}
\description{
Generates a side-by-side comparison of forest cover for Washington
and Georgia. It handles font registration (Atkinson Hyperlegible), spatial
transformations, and creates a combined plot with a shared legend.
}

17
man/plot_rf_importance.Rd Normal file
View File

@@ -0,0 +1,17 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/functions.R
\name{plot_rf_importance}
\alias{plot_rf_importance}
\title{Plot Random Forest Variable Importance}
\usage{
plot_rf_importance(data)
}
\arguments{
\item{data}{An sf object or data frame containing the 'forested' target and predictors.}
}
\description{
Fits a ranger Random Forest model to the provided data, calculates
permutation importance, and generates a lollipop chart. It distinguishes
between spatial (lat/lon) and biophysical predictors.
Uses the project's 'Atkinson' font theme via theme_forestry_plot().
}

View File

@@ -0,0 +1,18 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/functions.R
\name{plot_spatial_exploration}
\alias{plot_spatial_exploration}
\title{Plot Spatial Autocorrelation Exploration}
\usage{
plot_spatial_exploration(wa_data)
}
\arguments{
\item{wa_data}{A dataframe or tibble containing elevation, lat, and lon columns.}
}
\value{
A ggplot object showing standardized elevation vs. spatially lagged elevation.
}
\description{
Generates a Moran Scatterplot to visualize spatial
autocorrelation in elevation data using a 5km neighborhood.
}

11
man/plot_state_topo.Rd Normal file
View File

@@ -0,0 +1,11 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/functions.R
\name{plot_state_topo}
\alias{plot_state_topo}
\title{Create Single State Topo Plot}
\usage{
plot_state_topo(data, boundary_sf, raster_path, state_name)
}
\description{
Generates a topo map with manually tuned label placement for Georgia.
}

View File

@@ -0,0 +1,11 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/functions.R
\name{plot_theme_diagnostic}
\alias{plot_theme_diagnostic}
\title{Simplified Theme Diagnostic}
\usage{
plot_theme_diagnostic()
}
\description{
Uses built-in NC data to verify theme_forestry_spatial.
}

11
man/plot_us_map.Rd Normal file
View File

@@ -0,0 +1,11 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/functions.R
\name{plot_us_map}
\alias{plot_us_map}
\title{Plot US Map with Forestry Theme}
\usage{
plot_us_map()
}
\description{
Highlights Washington and Georgia using standardized presentation fonts.
}

32
man/process_ecoregions.Rd Normal file
View File

@@ -0,0 +1,32 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/functions.R
\name{process_ecoregions}
\alias{process_ecoregions}
\title{Process and Clip EPA Ecoregions}
\usage{
process_ecoregions(
zip_path,
target_states = c("Washington", "Georgia"),
simplify_tol = 0.05
)
}
\arguments{
\item{zip_path}{Character string. The file path to the zipped EPA shapefile.}
\item{target_states}{Character vector. The names of the states to clip the
ecoregions to. Defaults to \code{c("Washington", "Georgia")}.}
\item{simplify_tol}{Numeric. The simplification tolerance passed to
\code{rmapshaper::ms_simplify}. Range is 0-1, where higher numbers remove more detail.
Defaults to 0.05.}
}
\value{
An \code{sf} object containing the processed ecoregions with standardized
columns \code{US_L3NAME} and \code{STATE_NAME}.
}
\description{
Extracts EPA Level III ecoregion data from a zipped shapefile,
standardizes column names, and clips the geometry to specified state boundaries.
It includes robust steps for geometry repair (handling spherical validity),
small island removal, and simplification for optimized plotting.
}

19
man/save_combined_topo.Rd Normal file
View File

@@ -0,0 +1,19 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/functions.R
\name{save_combined_topo}
\alias{save_combined_topo}
\title{Save Combined Side-by-Side Topo Plot}
\usage{
save_combined_topo(
wa_data,
ga_data,
wa_boundary,
ga_boundary,
wa_raster_path,
ga_raster_path,
output_path
)
}
\description{
Save Combined Side-by-Side Topo Plot
}

26
man/save_error_map_png.Rd Normal file
View File

@@ -0,0 +1,26 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/functions.R
\name{save_error_map_png}
\alias{save_error_map_png}
\title{Save Model Error Diagnostic Map}
\usage{
save_error_map_png(data, boundary_sf, raster_path, output_path)
}
\arguments{
\item{data}{A data frame containing model predictions (must include '.pred_class',
'forested', '.pred_Yes', 'lon', and 'lat').}
\item{boundary_sf}{An \code{sf} object representing the state boundary.}
\item{raster_path}{Character string. File path to the elevation raster (.tif).}
\item{output_path}{Character string. File path where the PNG will be saved.}
}
\value{
The \code{output_path} (invisible), for integration with \code{targets}.
}
\description{
Generates a diagnostic map highlighting prediction errors. It plots
misclassified points colored by the magnitude of the error (confidence in the wrong answer)
over a hillshaded elevation background.
}

View File

@@ -0,0 +1,24 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/functions.R
\name{save_outlier_map_png}
\alias{save_outlier_map_png}
\title{Save Outlier Diagnostic Map}
\usage{
save_outlier_map_png(data, boundary_sf, raster_path, output_path)
}
\arguments{
\item{data}{A data frame containing the analysis dataset (must include numeric columns and 'forested' factor).}
\item{boundary_sf}{An \code{sf} object representing the state boundary (e.g., Washington).}
\item{raster_path}{Character string. File path to the elevation raster (.tif).}
\item{output_path}{Character string. File path where the PNG will be saved.}
}
\value{
The \code{output_path} (invisible), for integration with \code{targets}.
}
\description{
Generates a diagnostic map highlighting multivariate outliers (Z > 3)
overlaid on a hillshaded elevation raster. Uses the standardized forestry theme.
}

View File

@@ -0,0 +1,15 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/functions.R
\name{setup_forestry_fonts}
\alias{setup_forestry_fonts}
\title{Register Project Fonts}
\usage{
setup_forestry_fonts()
}
\value{
NULL (called for side effects)
}
\description{
Registers 'Atkinson Hyperlegible Next' (Sans) and 'Atkinson Hyperlegible Mono'
(Monospace) with the sysfonts package for use in R graphics.
}

21
man/style_audit_table.Rd Normal file
View File

@@ -0,0 +1,21 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/functions.R
\name{style_audit_table}
\alias{style_audit_table}
\title{Style Audit Table (GT)}
\usage{
style_audit_table(data, title = NULL, subtitle = NULL)
}
\arguments{
\item{data}{A data frame to be formatted.}
\item{title}{Character string. The title of the table (optional).}
\item{subtitle}{Character string. The subtitle of the table (optional).}
}
\value{
A \code{gt_tbl} object ready for rendering.
}
\description{
Converts a data frame into a formatted \code{gt} table with consistent
styling for audit reports. Includes row striping, numeric formatting, and
standardized font sizes.
}

View File

@@ -0,0 +1,18 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/functions.R
\name{theme_forestry_plot}
\alias{theme_forestry_plot}
\title{Standard Forestry Plot Theme (Cowplot + Atkinson)}
\usage{
theme_forestry_plot(font_size = 14, grid = TRUE)
}
\arguments{
\item{font_size}{Integer. Base font size. Default is 14 (good for slides).}
\item{grid}{Logical. If TRUE, adds a light gray grid (useful for presentations).}
}
\description{
A standardized theme for non-spatial plots (scatter, bar, line).
Based on cowplot::theme_cowplot(), it includes clean axes and a minimalist look.
Uses 'Atkinson Hyperlegible Next' for all text.
}

View File

@@ -0,0 +1,11 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/functions.R
\name{theme_forestry_spatial}
\alias{theme_forestry_spatial}
\title{Standardized Spatial Theme (Atkinson)}
\usage{
theme_forestry_spatial(base_size = 16)
}
\description{
High-visibility map theme for presentations using Atkinson fonts.
}

View File

@@ -0,0 +1,11 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/functions.R
\name{theme_forestry_void}
\alias{theme_forestry_void}
\title{Standardized Void Theme (Maximal Data Ink)}
\usage{
theme_forestry_void(base_size = 16)
}
\description{
Removes axes/grids for shape-focused maps, but keeps project fonts.
}

140
references.bib Normal file
View File

@@ -0,0 +1,140 @@
@misc{epa_ecoregions_2013,
title = {Level {{III}} and {{IV Ecoregions}} of the {{Continental United States}}},
author = {{U.S. Environmental Protection Agency}},
year = 2013,
address = {Corvallis, OR},
urldate = {2026-01-13}
}
@article{frescino_fiesta_2023,
title = {`{{FIESTA}}': A Forest Inventory Estimation and Analysis {{R}} Package},
shorttitle = {`{{FIESTA}}'},
author = {Frescino, Tracey S. and Moisen, Gretchen G. and Patterson, Paul L. and Toney, Chris and White, Grayson W.},
year = 2023,
month = jul,
journal = {Ecography},
volume = {2023},
number = {7},
pages = {e06428},
issn = {0906-7590, 1600-0587},
doi = {10.1111/ecog.06428},
urldate = {2026-01-14},
abstract = {Ecologists are increasingly relying on national forest inventories to address a wide variety of issues. The `FIESTA' R package (Forest Inventory ESTimation and Analysis) is a tool that enables customized investigations using the extensive sample-based inventory data collected across all lands in the US by the US Dept of Agriculture, Forest Service, Forest Inventory and Analysis (FIA) Program. To date, the complex nature of the FIA inventory constrains many users to conduct only limited analyses through existing tools with pre-specified geographic boundaries, timeframes, and auxiliary data under a single statistical estimation process. Yet, the rapid evolution of available remotely sensed data and statistical methods present the opportunity to conduct spatial and temporal analyses of forest attributes that are much more relevant to many pressing ecological, environmental, economic, and social issues in the US, The `FIESTA' package was developed to augment the current set of available tools by providing a flexible platform that accommodates evolving technologies and leading-edge estimation techniques. The package contains a collection of functions that can query FIA databases, summarize sample-based inventory data, extract and aggregate auxiliary spatial data, and generate estimates with associated variances. The `FIESTA' R package is available on CRAN ( https://cran.r-project.org/package=FIESTA ).},
langid = {english}
}
@book{kuhn_applied_2013,
title = {Applied {{Predictive Modeling}}},
author = {Kuhn, Max and Johnson, Kjell},
year = 2013,
publisher = {Springer},
address = {New York, NY},
doi = {10.1007/978-1-4614-6849-3},
urldate = {2026-01-22},
copyright = {http://www.springer.com/tdm},
isbn = {978-1-4614-6848-6 978-1-4614-6849-3},
langid = {english},
keywords = {Model,Non-Linear,Predictive Models,R,Regression Models,Regression Trees}
}
@book{kuhn_tidy_2022,
title = {Tidy {{Modeling}} with {{R}}},
author = {Kuhn, Max},
year = 2022,
publisher = {O'Reilly Media, Incorporated},
address = {Sebastopol},
urldate = {2026-01-13},
collaborator = {Silge, Julia},
isbn = {978-1-4920-9648-1 978-1-4920-9644-3},
langid = {english}
}
@article{omernik_ecoregions_1987,
title = {Ecoregions of the {{Conterminous United States}}},
author = {Omernik, James M.},
year = 1987,
month = mar,
journal = {Annals of the Association of American Geographers},
volume = {77},
number = {1},
pages = {118--125},
issn = {0004-5608, 1467-8306},
doi = {10.1111/j.1467-8306.1987.tb00149.x},
urldate = {2026-01-13},
langid = {english}
}
@misc{pebesma_spatial_2025,
title = {Spatial {{Data Science}}},
author = {Pebesma, Edzer and Bivand, Roger},
year = 2025,
month = jan,
urldate = {2026-01-17},
howpublished = {https://r-spatial.org/book/},
langid = {english},
file = {/home/rkw/Zotero/storage/ZNFK3H6Q/book.html}
}
@article{roberts_crossvalidation_2017,
title = {Cross-validation Strategies for Data with Temporal, Spatial, Hierarchical, or Phylogenetic Structure},
shorttitle = {Cross-Validation},
author = {Roberts, David R. and Bahn, Volker and Ciuti, Simone and Boyce, Mark S. and Elith, Jane and Guillera-Arroita, Gurutzeta and Hauenstein, Severin and Lahoz-Monfort, Jos{\'e} J. and Schr{\"o}der, Boris and Thuiller, Wilfried and Warton, David I. and Wintle, Brendan A. and Hartig, Florian and Dormann, Carsten F.},
year = 2017,
month = aug,
journal = {Ecography},
volume = {40},
number = {8},
pages = {913--929},
issn = {0906-7590, 1600-0587},
doi = {10.1111/ecog.02881},
urldate = {2026-01-11},
abstract = {Ecological data often show temporal, spatial, hierarchical (random effects), or phylogenetic structure. Modern statistical approaches are increasingly accounting for such dependencies. However, when performing cross-validation, these structures are regularly ignored, resulting in serious underestimation of predictive error. One cause for the poor performance of uncorrected (random) cross-validation, noted often by modellers, are dependence structures in the data that persist as dependence structures in model residuals, violating the assumption of independence. Even more concerning, because often overlooked, is that structured data also provides ample opportunity for overfitting with non-causal predictors. This problem can persist even if remedies such as autoregressive models, generalized least squares, or mixed models are used. Block cross-validation, where data are split strategically rather than randomly, can address these issues. However, the blocking strategy must be carefully considered. Blocking in space, time, random effects or phylogenetic distance, while accounting for dependencies in the data, may also unwittingly induce extrapolations by restricting the ranges or combinations of predictor variables available for model training, thus overestimating interpolation errors. On the other hand, deliberate blocking in predictor space may also improve error estimates when extrapolation is the modelling goal. Here, we review the ecological literature on non-random and blocked cross-validation approaches. We also provide a series of simulations and case studies, in which we show that, for all instances tested, block cross-validation is nearly universally more appropriate than random cross-validation if the goal is predicting to new data or predictor space, or for selecting causal predictors. 
We recommend that block cross-validation be used wherever dependence structures exist in a dataset, even if no correlation structure is visible in the fitted model residuals, or if the fitted models account for such correlations.},
langid = {english},
file = {/home/rkw/Zotero/storage/JFMJE6FR/Roberts et al. - 2017 - Crossvalidation strategies for data with temporal, spatial, hierarchical, or phylogenetic structure.pdf}
}
@article{tobler_computer_1970,
title = {A {{Computer Movie Simulating Urban Growth}} in the {{Detroit Region}}},
author = {Tobler, W. R.},
year = 1970,
month = jun,
journal = {Economic Geography},
publisher = {Routledge},
urldate = {2026-01-22},
abstract = {(1970). A Computer Movie Simulating Urban Growth in the Detroit Region. Economic Geography: Vol. 46, PROCEEDINGS International Geographical Union Commission on Quantitative Methods, pp. 234-240.},
copyright = {\copyright{} 1970 Taylor and Francis Group, LLC},
langid = {english},
file = {/home/rkw/Zotero/storage/75EV82QZ/143141.html}
}
@article{white_method_2025,
title = {A Method for Empirically Assessing Small Area Estimators via Bootstrap-Weighted k-Nearest-Neighbor Artificial Populations, with Applications to Forest Inventory},
author = {White, Grayson W and Wieczorek, Jerzy A and Cody, Zachariah W and Tan, Emily X and Chistolini, Jacqueline O and McConville, Kelly S and Frescino, Tracey S and Moisen, Gretchen G},
editor = {Fassnacht, Fabian},
year = 2025,
month = nov,
journal = {Forestry: An International Journal of Forest Research},
pages = {cpaf071},
issn = {0015-752X, 1464-3626},
doi = {10.1093/forestry/cpaf071},
urldate = {2026-01-14},
abstract = {National Forest Inventories monitor forest attributes across a variety of spatial and temporal scales in a given country. Increased interest in reporting and management at smaller scales has driven National Forest Inventories to investigate and adopt small area estimation (SAE) due to the promise of increased precision at these scales. However, comparing and evaluating SAE models for a given application is inherently difficult. Typically, many areas lack enough data to check unit-level modeling assumptions or to assess unit-level predictions empirically; and no ground truth is available for checking area-level estimates. Design-based simulation from artificial populations can help with each of these issues, but only if the artificial populations realistically represent the application at hand and are not built using assumptions that inherently favor one SAE model over another. In this paper, we borrow ideas from random hot deck, approximate Bayesian bootstrap, and \$k\$ nearest neighbor imputation methods to propose a \$k\$ nearest neighbor-based approximation to approximate Bayesian bootstrap, for generating an artificial population when rich unit-level auxiliary data are available. We introduce diagnostic checks on the process of building the artificial population, and demonstrate how to use it for design-based simulation studies to compare and evaluate SAE models, using real data from the Forest Inventory and Analysis program of the United States Department of Agriculture Forest Service (the National Forest Inventory of the United States).},
copyright = {https://creativecommons.org/licenses/by/4.0/},
langid = {english}
}
@article{white_small_2025,
title = {Small Area Estimation of Forest Biomass via a Two-Stage Model for Continuous Zero-Inflated Data},
author = {White, Grayson W. and Yamamoto, Josh K. and Elsyad, Dinan H. and Schmitt, Julian F. and Korsgaard, Niels H. and Hu, Jie Kate and Gaines, George C. and Frescino, Tracey S. and McConville, Kelly S.},
year = 2025,
month = jan,
journal = {Canadian Journal of Forest Research},
volume = {55},
pages = {1--19},
issn = {0045-5067, 1208-6037},
doi = {10.1139/cjfr-2024-0149},
urldate = {2026-01-14},
abstract = {Nationwide Forest Inventories (NFIs) collect data on and monitor the trends of forests across the globe. Users of NFI data are increasingly interested in monitoring forest attributes such as biomass at fine geographic and temporal scales, resulting in a need for assessment and development of small area estimation techniques in forest inventory. We implement a small area estimator and parametric bootstrap estimator that account for zero-inflation in biomass data via a two-stage model-based approach and compare the performance to a Horvitz--Thompson estimator, a post-stratified estimator, and to the unit- and area-level empirical best linear unbiased prediction (EBLUP) estimators. We conduct a simulation study in Nevada with data from the United States NFI, the Forest Inventory and Analysis Program, and remote sensing data products. Results show the zero-inflated estimator has the lowest relative bias and the smallest empirical root mean square error. Moreover, the 95\% confidence interval coverages of the zero-inflated estimator and the unit-level EBLUP are more accurate than the other two estimators. To further illustrate the practical utility, we employ a data application across the 2019 measurement year in Nevada. We introduce the R package, saeczi, which efficiently implements the zero-inflated estimator and its mean squared error estimator.},
langid = {english},
file = {/home/rkw/Zotero/storage/VSX6A8MF/White et al. - 2025 - Small area estimation of forest biomass via a two-stage model for continuous zero-inflated data.pdf}
}

8995
renv.lock Normal file

File diff suppressed because one or more lines are too long

7
renv/.gitignore vendored Normal file
View File

@@ -0,0 +1,7 @@
library/
local/
cellar/
lock/
python/
sandbox/
staging/

1334
renv/activate.R Normal file

File diff suppressed because it is too large Load Diff

19
renv/settings.json Normal file
View File

@@ -0,0 +1,19 @@
{
"bioconductor.version": null,
"external.libraries": [],
"ignored.packages": [],
"package.dependency.fields": [
"Imports",
"Depends",
"LinkingTo"
],
"ppm.enabled": null,
"ppm.ignored.urls": [],
"r.version": null,
"snapshot.type": "implicit",
"use.cache": true,
"vcs.ignore.cellar": true,
"vcs.ignore.library": true,
"vcs.ignore.local": true,
"vcs.manage.ignores": true
}

View File

@@ -0,0 +1,49 @@
# scripts/make_title_globe.R
#
# Renders the title-page globe: an orthographic view of the contiguous US with
# Washington and Georgia highlighted, written to assets/study_sites_globe.png.
# Run from the project root; the output path is relative to the working
# directory.
library(ggplot2)
library(sf)
library(dplyr)
library(maps)

# 1. Setup Data ----
# State polygons from the `maps` package, converted to an sf object. The
# resulting `ID` column holds lower-case state names (used for the filters
# below).
states_sf <- sf::st_as_sf(maps::map("state", plot = FALSE, fill = TRUE))

# 2. Colors ----
wa_color <- "#D95F0E"
ga_color <- "#00A88F"
bg_fill <- "#f0f0f0"
borders <- "#ffffff"
ocean_col <- "#ffffff"
graticule_col <- "#e0e0e0"

# 3. Projection centre ----
# Earlier attempts: -102 was the original centre and -72 shifted the view too
# far. -87 (15 degrees east of -102) is the intended central meridian. The
# previous revision of this script documented -87 but still passed -102 to
# coord_sf(), so the rotation announced by the final message never happened.
center_lat <- 40
center_lon <- -87
globe_crs <- sprintf("+proj=ortho +lat_0=%s +lon_0=%s", center_lat, center_lon)

# 4. Create the Plot ----
graticule <- sf::st_graticule(
  lat = seq(-90, 90, 10),
  lon = seq(-180, 180, 10)
)
washington <- dplyr::filter(states_sf, ID == "washington")
georgia <- dplyr::filter(states_sf, ID == "georgia")

p <- ggplot() +
  # Graticules
  geom_sf(data = graticule, color = graticule_col, linewidth = 0.1) +
  # Background states
  geom_sf(
    data = states_sf,
    fill = bg_fill, color = borders, linewidth = 0.3
  ) +
  # Highlights, drawn last so they sit on top of the background layer
  geom_sf(
    data = washington,
    fill = wa_color, color = borders, linewidth = 0.3
  ) +
  geom_sf(
    data = georgia,
    fill = ga_color, color = borders, linewidth = 0.3
  ) +
  # Orthographic ("globe") projection centred on the chosen point
  coord_sf(crs = globe_crs) +
  # Theme
  theme_void() +
  theme(
    panel.background = element_rect(fill = ocean_col, color = NA),
    plot.background = element_rect(fill = ocean_col, color = NA)
  )

# 5. Save ----
if (!dir.exists("assets")) {
  dir.create("assets")
}
ggsave(
  "assets/study_sites_globe.png",
  plot = p, width = 10, height = 10, dpi = 300
)
message("Success! Globe rotated 15 degrees.")