initial commit

2026-02-10 04:52:37 -05:00
commit 0476f6f8f8
65 changed files with 15368 additions and 0 deletions
--- a/.Rbuildignore
+++ b/.Rbuildignore
@@ -0,0 +1,5 @@
 ^renv$
 ^renv\.lock$
 ^LICENSE\.md$
 ^.*\.Rproj$
 ^\.Rproj\.user$
--- a/.Rprofile
+++ b/.Rprofile
@@ -0,0 +1,40 @@
 # Project startup profile: activates renv, then defines short interactive
 # aliases for common {targets} and {renv} calls.
 source("renv/activate.R")
 # The aliases below capture functions from {targets} and {renv} at startup.
 # If a package is not installed yet (e.g. a fresh checkout before
 # renv::restore()), an unguarded `targets::tar_make` raises an error and the
 # rest of this profile is skipped, so each group is defined only when its
 # package namespace can be loaded.
 # --- Targets Aliases ---
 if (requireNamespace("targets", quietly = TRUE)) {
  if (interactive()) {
    suppressMessages(library(targets))
  }
  tm  <- targets::tar_make
  ti  <- targets::tar_invalidate
  tr  <- targets::tar_read
  to  <- targets::tar_outdated
  # Specific Macros
  tmr <- function() targets::tar_make(report)
  tir <- function() targets::tar_invalidate(report)
 }
 # --- renv Aliases ---
 if (requireNamespace("renv", quietly = TRUE)) {
  rs  <- renv::status    # check health
  ri  <- renv::install   # install packages
  rsp <- renv::snapshot  # save library state (snapshot)
  rr  <- renv::restore   # revert to lockfile
 }
 # --- The "Cheat Sheet" Startup Message ---
 # Only shown at an interactive prompt, so scripted R sessions that read this
 # profile do not get the banner in their logs.
 if (interactive()) {
  if (requireNamespace("targets", quietly = TRUE) &&
      requireNamespace("renv", quietly = TRUE)) {
    message(
      "\n---------------------------------------------",
      "\n SHORTCUTS LOADED",
      "\n---------------------------------------------",
      "\n [Targets]",
      "\n   tm   = tar_make()",
      "\n   ti   = tar_invalidate()",
      "\n   tr   = tar_read()",
      "\n   to   = tar_outdated()",
      "\n   tmr  = tar_make(report)",
      "\n   tir  = tar_invalidate(report)",
      "\n",
      "\n [renv]",
      "\n   rs   = renv::status()",
      "\n   ri   = renv::install()",
      "\n   rsp  = renv::snapshot()",
      "\n   rr   = renv::restore()",
      "\n---------------------------------------------\n"
    )
  } else {
    message(
      "\n Some shortcuts were not loaded because {targets} and/or {renv}",
      "\n are not installed yet. Run renv::restore() and restart R.\n"
    )
  }
 }
--- a/.dockerignore
+++ b/.dockerignore
@@ -0,0 +1,7 @@
 .git
 .gitignore
 .Rproj.user
 _targets/
 _site/
 *.html
 *.DS_Store
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,26 @@
 # --- Data and Pipeline ---
 data/
 _targets/
 _targets.user
 # --- R Environment ---
 .Rproj.user/
 .Rhistory
 .RData
 .Renviron
 .Ruserdata
 # Keep the lockfile, ignore the library
 renv/library/
 renv/staging/
 renv/python/
 # --- Quarto and Output ---
 # Since you are hosting via Caddy/Rsync, 
 # stop tracking these in Git to avoid bloat.
 .quarto/
 *_cache/
 *_files/
 index.html
 # --- System ---
 .DS_Store
--- a/30
+++ b/30
@@ -0,0 +1,30 @@
 Package: forestedAnalysis
 Title: Spatial Cross-Validation and AOA Analysis of Forest Cover
 Version: 0.0.0.9000
 Authors@R: 
    person("Rob", "Wiederstein", , "khuon68@gmail.com", role = c("aut", "cre"))
 Description: A research compendium analyzing forest cover data in Washington and Georgia. 
    It evaluates the Area of Applicability (AOA) and demonstrates model failure 
    during spatial extrapolation.
 License: MIT + file LICENSE
 Encoding: UTF-8
 Roxygen: list(markdown = TRUE)
 RoxygenNote: 7.3.3
 Depends: 
    tidymodels,
    tidyverse
 Imports: 
    cowplot,
    forested,
    ggplot2,
    ggrepel,
    knitr,
    patchwork,
    quarto,
    rmarkdown,
    sf,
    showtext,
    sysfonts,
    targets,
    terra,
    waywiser
--- a/52
+++ b/52
@@ -0,0 +1,52 @@
 # Image for running the analysis pipeline: R 4.4.0 with the tidyverse
 # preinstalled, plus the system libraries and renv-managed R packages below.
 FROM rocker/tidyverse:4.4.0
 # System packages: editors and shell tooling (nano, neovim, git,
 # bash-completion, openssh-client), build tooling (cmake), and development
 # headers for networking/XML, fonts and text shaping, image formats, and the
 # geospatial stack (GDAL, PROJ, GEOS, udunits2). The apt package lists are
 # removed in the same layer once installation finishes.
 RUN apt-get update && apt-get install -y \
    nano \
    neovim \
    git \
    bash-completion \
    openssh-client \
    cmake \
    libglpk-dev \
    libcurl4-openssl-dev \
    libssl-dev \
    libxml2-dev \
    libfontconfig1-dev \
    libfreetype6-dev \
    libharfbuzz-dev \
    libfribidi-dev \
    libpng-dev \
    libjpeg-dev \
    libtiff-dev \
    libwebp-dev \
    gdal-bin \
    libgdal-dev \
    libproj-dev \
    libgeos-dev \
    libudunits2-dev \
    && rm -rf /var/lib/apt/lists/*
 # --- renv configuration ---
 # 1. Keep the renv package library at /renv/library, outside the project
 #    directory (presumably so that mounting a volume over the project does
 #    not hide the installed packages -- confirm against how the container
 #    is run).
 ENV RENV_PATHS_LIBRARY=/renv/library
 # 2. Turn off renv's cache symlinks so packages are placed directly in the
 #    library instead of being linked from the renv cache.
 ENV RENV_CONFIG_CACHE_SYMLINKS=FALSE
 # NOTE(review): mode 777 makes the library writable by every user in the
 # container; consider a narrower mode/owner if only one user needs it.
 RUN mkdir -p /renv/library && chmod 777 /renv/library
 WORKDIR /home/rstudio/project
 # Copy only the files renv needs before restoring, ahead of the rest of the
 # project sources (copied further below).
 COPY renv.lock renv.lock
 COPY .Rprofile .Rprofile
 COPY renv/activate.R renv/activate.R
 COPY renv/settings.json renv/settings.json
 # Install renv and restore the lockfile from Posit Package Manager's
 # Ubuntu "jammy" repository.
 # NOTE(review): the repository URL assumes the base image is Ubuntu 22.04
 # (jammy) -- confirm if the FROM tag changes.
 RUN R -e "options(repos = c(CRAN = 'https://packagemanager.posit.co/cran/__linux__/jammy/latest')); install.packages('renv'); renv::restore()"
 # Copy the remaining project files (filtered by .dockerignore).
 COPY . .
 # Make everything under /renv readable, writable and executable by all users.
 RUN chmod -R 777 /renv
 CMD ["R"]
--- a/2
+++ b/2
@@ -0,0 +1,2 @@
 YEAR: 2026
 COPYRIGHT HOLDER: Rob Wiederstein
--- a/LICENSE.md
+++ b/LICENSE.md
@@ -0,0 +1,21 @@
 # MIT License
 Copyright (c) 2026 forested authors
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
 in the Software without restriction, including without limitation the rights
 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 copies of the Software, and to permit persons to whom the Software is
 furnished to do so, subject to the following conditions:
 The above copyright notice and this permission notice shall be included in all
 copies or substantial portions of the Software.
 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 SOFTWARE.
--- a/135
+++ b/135
@@ -0,0 +1,135 @@
 # Generated by roxygen2: do not edit by hand
 export(calculate_ga_aoa)
 export(combine_forest)
 export(create_stats_summary)
 export(get_epa_ecoregions)
 export(helper_save_fig)
 export(plot_ecoregion_comparison)
 export(plot_failure_mechanism)
 export(plot_ga_comparison_map)
 export(plot_georgia_aoa)
 export(plot_precip_hex_comparison)
 export(plot_regional_comparison)
 export(plot_rf_importance)
 export(plot_spatial_exploration)
 export(plot_state_topo)
 export(plot_theme_diagnostic)
 export(plot_us_map)
 export(process_ecoregions)
 export(save_combined_topo)
 export(save_error_map_png)
 export(save_outlier_map_png)
 export(setup_forestry_fonts)
 export(style_audit_table)
 export(theme_forestry_plot)
 export(theme_forestry_spatial)
 export(theme_forestry_void)
 importFrom(colorspace,scale_color_discrete_qualitative)
 importFrom(colorspace,scale_fill_discrete_qualitative)
 importFrom(cowplot,background_grid)
 importFrom(cowplot,theme_cowplot)
 importFrom(dplyr,"%>%")
 importFrom(dplyr,all_of)
 importFrom(dplyr,any_of)
 importFrom(dplyr,arrange)
 importFrom(dplyr,bind_cols)
 importFrom(dplyr,case_when)
 importFrom(dplyr,desc)
 importFrom(dplyr,filter)
 importFrom(dplyr,group_by)
 importFrom(dplyr,if_any)
 importFrom(dplyr,if_else)
 importFrom(dplyr,inner_join)
 importFrom(dplyr,mutate)
 importFrom(dplyr,rename)
 importFrom(dplyr,row_number)
 importFrom(dplyr,select)
 importFrom(dplyr,summarize)
 importFrom(dplyr,where)
 importFrom(ggplot2,aes)
 importFrom(ggplot2,after_stat)
 importFrom(ggplot2,coord_sf)
 importFrom(ggplot2,element_blank)
 importFrom(ggplot2,element_line)
 importFrom(ggplot2,element_rect)
 importFrom(ggplot2,element_text)
 importFrom(ggplot2,geom_hline)
 importFrom(ggplot2,geom_point)
 importFrom(ggplot2,geom_segment)
 importFrom(ggplot2,geom_sf)
 importFrom(ggplot2,geom_smooth)
 importFrom(ggplot2,geom_vline)
 importFrom(ggplot2,ggplot)
 importFrom(ggplot2,ggsave)
 importFrom(ggplot2,guide_colorbar)
 importFrom(ggplot2,guide_legend)
 importFrom(ggplot2,guide_none)
 importFrom(ggplot2,guides)
 importFrom(ggplot2,labs)
 importFrom(ggplot2,margin)
 importFrom(ggplot2,rel)
 importFrom(ggplot2,scale_alpha)
 importFrom(ggplot2,scale_color_manual)
 importFrom(ggplot2,scale_color_viridis_c)
 importFrom(ggplot2,scale_fill_discrete)
 importFrom(ggplot2,scale_fill_distiller)
 importFrom(ggplot2,scale_fill_manual)
 importFrom(ggplot2,scale_shape_manual)
 importFrom(ggplot2,scale_x_continuous)
 importFrom(ggplot2,scale_y_continuous)
 importFrom(ggplot2,theme)
 importFrom(ggplot2,theme_minimal)
 importFrom(ggplot2,theme_void)
 importFrom(ggplot2,unit)
 importFrom(ggrepel,geom_label_repel)
 importFrom(gt,fmt_number)
 importFrom(gt,gt)
 importFrom(gt,opt_row_striping)
 importFrom(gt,px)
 importFrom(gt,tab_header)
 importFrom(gt,tab_options)
 importFrom(patchwork,plot_annotation)
 importFrom(patchwork,plot_layout)
 importFrom(patchwork,wrap_plots)
 importFrom(psych,describe)
 importFrom(ranger,ranger)
 importFrom(rmapshaper,ms_filter_islands)
 importFrom(rmapshaper,ms_simplify)
 importFrom(scales,squish)
 importFrom(sf,read_sf)
 importFrom(sf,sf_use_s2)
 importFrom(sf,st_as_sf)
 importFrom(sf,st_coordinates)
 importFrom(sf,st_crs)
 importFrom(sf,st_drop_geometry)
 importFrom(sf,st_filter)
 importFrom(sf,st_intersection)
 importFrom(sf,st_join)
 importFrom(sf,st_make_grid)
 importFrom(sf,st_make_valid)
 importFrom(sf,st_point_on_surface)
 importFrom(sf,st_read)
 importFrom(sf,st_transform)
 importFrom(sf,st_union)
 importFrom(showtext,showtext_auto)
 importFrom(spdep,card)
 importFrom(spdep,dnearneigh)
 importFrom(spdep,lag.listw)
 importFrom(spdep,nb2listw)
 importFrom(stats,predict)
 importFrom(stringr,str_wrap)
 importFrom(sysfonts,font_add_google)
 importFrom(terra,rast)
 importFrom(terra,shade)
 importFrom(terra,terrain)
 importFrom(tibble,rownames_to_column)
 importFrom(tibble,tibble)
 importFrom(tidyterra,geom_spatraster)
 importFrom(tidyterra,scale_fill_hypso_c)
 importFrom(tigris,shift_geometry)
 importFrom(tigris,states)
 importFrom(utils,download.file)
 importFrom(utils,unzip)
 importFrom(vip,vi)
 importFrom(waywiser,ww_area_of_applicability)
--- a/R/forestedAnalysis-package.R
+++ b/R/forestedAnalysis-package.R
@@ -0,0 +1,6 @@
 #' @keywords internal
 "_PACKAGE"
 ## usethis namespace: start
 ## usethis namespace: end
 NULL
--- a/R/functions.R
+++ b/R/functions.R
--- a/README.md
+++ b/README.md
@@ -0,0 +1,5 @@
 # Optimism Bias in Ecological Modeling
 ## Project Overview
 This project explores the hazards of ignoring **spatial autocorrelation** in ecological modeling. Using the `forested` package and forest structure data from Washington State, this Quarto presentation demonstrates how standard random cross-validation yields overly optimistic performance estimates by allowing models to "cheat" via nearby neighbors. The analysis utilizes the `spatialsample` package to visualize and compare three distinct validation strategies—**Random** (the baseline), **Spatial Blocking** (geographic separation), and **Environmental Clustering** (ecological separation)—to establish robust, geographically transferable model performance metrics.
--- a/_targets.R
+++ b/_targets.R
@@ -0,0 +1,481 @@
 # _targets.R: pipeline definition read by the {targets} package.
 library(targets)
 # NOTE(review): no tarchetypes function appears in the pipeline visible in
 # this file -- confirm whether this attachment is still needed.
 library(tarchetypes)
 # 1. Options ----
 # Global target settings: `packages` are the packages made available to the
 # target commands, and `format` is the storage format for target values.
 # The order of `packages` is left as-is on purpose, because reordering can
 # change which package's function masks another's.
 tar_option_set(
  packages = c(
    "colorspace",
    "elevatr", 
    "forested", 
    "ggcorrplot", 
    "ggrepel",
    "ggspatial",
    "gt", 
    "magrittr",
    "patchwork", 
    "processx",
    "psych", 
    "quarto",
    "ranger", 
    "rmapshaper",
    "sf",
    "showtext",
    "spatialsample",
    "stringr",
    "terra", 
    "tidyterra",
    "tidymodels",
    "tidyverse",
    "tigris",
    "xgboost",   # engine used by spec_xgb
    "earth",     # engine used by spec_mars
    "withr"
  ),
  format = "rds"
 )
 # 2. Functions ----
 # Load the project's helper functions that the target commands call.
 tar_source("R/functions.R")
 # 3. The Pipeline ----
 list(
  # constants 
  tar_target(n_folds, 10),
  # Data Ingestion
  tar_target(forested_wa, forested::forested_wa),
  tar_target(forested_ga, forested::forested_ga),
  tar_target(
    wa_sf,
    forested_wa %>% 
      sf::st_as_sf(coords = c("lon", "lat"), crs = 4326, remove = FALSE)
  ),
  tar_target(
    name = eco_url,
    command = "https://dmap-prod-oms-edc.s3.us-east-1.amazonaws.com/ORD/Ecoregions/us/us_eco_l3.zip",
    format = "url" 
  ),
  tar_target(
    name = data_dir,
    command = "data/epa",
    format = "file" # Tracks the directory
  ),
  # Download data
  tar_target(
    name = eco_zip_file,
    command = get_epa_ecoregions(url = eco_url, dest_dir = data_dir),
    format = "file"
  ),
  # Data Processing
  tar_target(forested_us, combine_forest(wa_data = forested_wa, ga_data = forested_ga)),
  tar_target(boundary_wa_sf, fetch_state_boundary(state = "Washington")),
  tar_target(boundary_ga_sf, fetch_state_boundary(state = "Georgia")),
  tar_target(
    name = eco_data,
    command = process_ecoregions(
      zip_path = eco_zip_file, 
      target_states = c("Washington", "Georgia"),
      simplify_tol = 0.05
    )
  ),
  # Raster File Target
  tar_target(wa_elev_file, 
             create_elevation_raster(boundary_wa_sf, "data/wa_elevation.tif"), 
             format = "file"),
  tar_target(ga_elev_file, 
             create_elevation_raster(boundary_ga_sf, "data/ga_elevation.tif"), 
             format = "file"),
  # Maps
  tar_target(fig_us_map, plot_us_map()),
  tar_target(
    name = fig_us_map_file,
    command = helper_save_fig(
      plot_obj = fig_us_map,
      name = "us_forests",
      width = 10,
      height = 5.25,
      type = "map"
    ),
    format = "file"
  ),
  tar_target(wa_ga_map, fetch_study_area(c("Washington", "Georgia"))),
  tar_target(map_wa_ga_regional, plot_regional_comparison(forested_us, wa_ga_map)),
  tar_target(
    name = fig_wa_ga_regional_file,
    command = helper_save_fig(
      plot_obj = map_wa_ga_regional,
      name = "wa_ga_forests",
      width = 9.2,
      height = 4.25,
      type = "map"
    ),
    format = "file"
  ),
  tar_target(
    name = ecoregion_plot,
    command = plot_ecoregion_comparison(eco_data)
  ),
  tar_target(
    name = ecoregion_plot_file,
    command = helper_save_fig(
      plot_obj = ecoregion_plot, 
      name = "wa_ga_ecoregions",
      width = 10,
      height = 4.25,
      type = "map"
    ),
    format = "file" # Tells targets to watch the actual .png file
  ),
  tar_target(
    name = combined_topo_map,
    command = save_combined_topo(
      wa_data = forested_wa,
      ga_data = forested_ga,
      wa_boundary = boundary_wa_sf,
      ga_boundary = boundary_ga_sf,
      wa_raster_path = wa_elev_file,
      ga_raster_path = ga_elev_file,
      output_path = "figs/combined_topo.png"
    ),
    format = "file"
  ),
  tar_target(
    map_precip_hex,
    plot_precip_hex_comparison(
      wa_data = forested_wa,
      ga_data = forested_ga,
      boundaries = wa_ga_map,
      bins = 50
    )
  ),
  tar_target(plot_cv_comparison, plot_cv_strategies(forested_wa)),
  # fold mechanics
  tar_target(
    fig_fold_mechanics,
    plot_fold_mechanics(wa_sf, boundary_wa_sf)
  ),
  # fold diagram
  tar_target(fig_classic_cv, plot_classic_kfold_diagram()),
  # Analysis
  tar_target(tbl_forest_wa, format_summary_table(forested_wa)),
  tar_target(plot_distrib_wa, plot_forest_distributions(forested_wa)),
  tar_target(plt_outliers, identify_outliers(forested_wa)),
  tar_target(map_wa_outliers, 
             save_outlier_map_png(forested_wa, boundary_wa_sf, wa_elev_file, "figs/wa_outliers.png"),
             format = "file"),
  tar_target(plt_wa_pca, plot_wa_pca(forested_wa)),
  tar_target(
    name = p_moran_exploration,
    command = plot_spatial_exploration(forested_wa)
  ),
  # correlogram
  tar_target(plt_correlogram, plot_correlations(forested_wa)),
  # vip plot
  tar_target(plt_vip, plot_rf_importance(forested_wa)),
  # umap plot
  tar_target(umap_plot, plot_umap_forested(forested_wa)),
  # 1. Data Splitting -------------------------------------------------
  # Define the split (80% Train, 20% Test)
  tar_target(splits, initial_split(wa_sf, prop = 0.80, strata = forested)),
  # Extract the Training Set (Used for Resampling/Modeling)
  tar_target(train_data, training(splits)),
  # Extract the Test Set (Locked away until the very end)
  tar_target(test_data, testing(splits)),
  # 2. Recipes ----
  ## A: Base (Includes Lat/Lon) ----
  tar_target(
    recipe_base,
    recipe(forested ~ ., data = train_data) %>%
      update_role(geometry, new_role = "id") %>%
      step_novel(all_nominal_predictors()) %>%
      step_dummy(all_nominal_predictors()) %>%
      step_zv(all_predictors()) %>%
      step_normalize(all_numeric_predictors())
  ),
  ## B: Non-Spatial (Bio Only) ----
  tar_target(
    recipe_non_spatial,
    recipe(forested ~ ., data = train_data) %>%
      update_role(geometry, lat, lon, new_role = "id") %>%
      step_novel(all_nominal_predictors()) %>%
      step_dummy(all_nominal_predictors()) %>%
      step_zv(all_predictors()) %>%
      step_normalize(all_numeric_predictors())
  ),
  ## C: Extensible (Feature Engineered) ----
  tar_target(
    recipe_extensible,
    recipe(forested ~ ., data = train_data) %>%
      update_role(geometry, lat, lon, new_role = "id") %>%
      step_rm(northness, county, year) %>%
      step_ratio(precip_annual, denom = denom_vars(temp_annual_max)) %>%
      step_mutate(
        temp_range = temp_annual_max - temp_annual_min,
        vpd_range = vapor_max - vapor_min
      ) %>%
      step_YeoJohnson(elevation) %>%
      step_novel(all_nominal_predictors()) %>%
      step_dummy(all_nominal_predictors()) %>%
      step_zv(all_predictors()) %>%
      step_normalize(all_numeric_predictors())
  ),
  tar_target(
    plot_yeo,
    plot_yeo_johnson(forested_wa)
  ),
  # 3. Engines ----
  ## Logistic Regression ----
  tar_target(
    spec_logistic,
    logistic_reg() %>% 
      set_engine("glm") %>% 
      set_mode("classification")
  ),
  ## MARS ----
  tar_target(
    spec_mars,
    mars(num_terms = 10, prod_degree = 2) %>% 
      set_engine("earth", nfold = 1) %>%  # nfold=1 prevents internal CV (speed)
      set_mode("classification")
  ),
  ## Random Forest ----
  tar_target(
    spec_rf,
    rand_forest(trees = 1000, min_n = 10) %>% 
      set_engine("ranger", 
                 importance = "impurity", # Calculate variable importance
                 num.threads = 1) %>%     # <--- Server Safety Lock
      set_mode("classification")
  ),
  ## XGBoost ----
  tar_target(
    spec_xgb,
    boost_tree(trees = 1000, tree_depth = 6, learn_rate = 0.01) %>% 
      set_engine("xgboost", 
                 nthread = 1) %>%         # <--- Server Safety Lock
      set_mode("classification")
  ),
  # 4. The Workflow Set ----
  # Crosses every recipe with every model (3 x 4 = 12 workflows)
  tar_target(
    model_set,
    workflow_set(
      preproc = list(base = recipe_base, 
                     non_spatial = recipe_non_spatial,
                     extensible = recipe_extensible),
      models = list(
        log = spec_logistic, 
        rf = spec_rf, 
        xgb = spec_xgb, 
        mars = spec_mars
      ),
      cross = TRUE
    )
  ),
  # 5. Resampling Strategies -----
  ## A. Random Folds ----
  tar_target(
    folds_random,
    vfold_cv(train_data, v = n_folds, strata = forested)
  ),
  ## B. Spatial Blocks ----
  tar_target(
    folds_block,
    spatial_block_cv(train_data, v = n_folds) 
  ),
  ## C. Spatial Clustering ----
  tar_target(
    folds_cluster,
    spatial_clustering_cv(train_data, v = n_folds) 
  ),
  # 6. Fit Models -----
  ## Branch 1: Random CV ----
  tar_target(
    results_random,
    workflow_map(
      model_set, 
      "fit_resamples", 
      resamples = folds_random,
      metrics = metric_set(roc_auc, accuracy, pr_auc),
      verbose = TRUE
    )
  ),
  ## Branch 2: Block CV ----
  tar_target(
    results_block,
    workflow_map(
      model_set, 
      "fit_resamples", 
      resamples = folds_block,
      metrics = metric_set(roc_auc, accuracy, pr_auc),
      verbose = TRUE
    )
  ),
  ## Branch 3: Cluster CV ----
  tar_target(
    results_cluster,
    workflow_map(
      model_set, 
      "fit_resamples", 
      resamples = folds_cluster,
      metrics = metric_set(roc_auc, accuracy, pr_auc),
      verbose = TRUE
    )
  ),
  # 7. Results ----
  tar_target(
    fig_cv_comparison,
    plot_spatial_cv_comparison(results_random, results_block, results_cluster)
  ),
  tar_target(
    fig_model_stability,
    plot_model_stability(results_random, results_block, results_cluster, best_model_id)
  ),
  # 8. Select and Tune the Best Model ----
  tar_target(
    best_model_id,
    results_cluster %>% 
      rank_results(rank_metric = "roc_auc", select_best = TRUE) %>% 
      slice(1) %>% 
      pull(wflow_id)
  ),
  tar_target(
    tbl_model_performance,
    results_cluster %>% 
      rank_results(rank_metric = "roc_auc", select_best = TRUE) %>% 
      filter(.metric == "roc_auc")
  ),
  # 9. Final Fit ----
  tar_target(
    final_fit_results,
    last_fit(
      extract_workflow(model_set, best_model_id),
      split = splits, # Your original 80/20 split
      metrics = metric_set(roc_auc, accuracy)
    )
  ),
  # 10. Test Set Performance Plot ----
  tar_target(
    fig_final_performance,
    plot_final_test_results(final_fit_results) # Use the specific plotting function
  ),
  tar_target(
    tbl_performance,
    create_performance_table(results_cluster, final_fit_results)
  ),
  # 11. Confusion Matrix ----
  tar_target(
    fig_confusion_matrix,
    plot_final_confusion_matrix(final_fit_results)
  ),
  tar_target(
    test_predictions,
    collect_predictions(final_fit_results) %>%
      dplyr::bind_cols(
        rsample::testing(splits) %>% 
          dplyr::select(lat, lon)
      )
  ),
  tar_target(
    map_wa_errors,
    save_error_map_png(
      data = test_predictions,  # <--- Use the extracted data here
      boundary_sf = boundary_wa_sf,
      raster_path = wa_elev_file,
      output_path = "figs/wa_errors.png"
    ),
    format = "file"
  ),
  # Georgia ----
  # External-region check: the Washington-trained model is applied to the
  # Georgia data and its predictions/errors are mapped.
  # 1. Predictor columns passed to the Area of Applicability (AOA) helper
  tar_target(
    model_predictors,
    c("elevation", "precip_annual", "temp_annual_mean", "roughness")
  ),
  # 2. AOA of Georgia relative to Washington
  # NOTE(review): this passes the full forested_wa data as `train_data`, not
  # the `train_data` split target used for modeling -- confirm intended.
  tar_target(
    ga_aoa_data,
    calculate_ga_aoa(
      train_data = forested_wa,
      test_data = forested_ga,
      predictors = model_predictors
    )
  ),
  # Plot of the AOA result
  tar_target(
    plot_aoa_ga,
    plot_georgia_aoa(
      aoa_sf = ga_aoa_data 
    )
  ),
  # 3. Predict on Georgia using the Washington model (the final fit)
  tar_target(
    ga_predictions,
    predict_external_region(
      final_fit = final_fit_results, 
      new_data = forested_ga         
    )
  ),
  # 4. Map the predictions
  tar_target(
    map_ga_probs,
    plot_ga_comparison_map(
      pred_data = ga_predictions,
      boundaries = boundary_ga_sf # Georgia boundary target defined earlier in this pipeline
    )
  ),
  # 5. Confusion matrix for Georgia
  tar_target(
    ga_conf_mat,
    plot_ga_confusion_matrix(ga_predictions)
  ),
  # 6. Map of errors (false positives + false negatives)
  tar_target(
    map_failure_mechanism,
    plot_failure_mechanism(
      aoa_data = ga_aoa_data,      # same AOA target that plot_aoa_ga reads
      pred_data = ga_predictions,  
      boundaries = boundary_ga_sf
    )
  ),
  # Report ----
  # Renders index.qmd with Quarto and tracks the resulting index.html as a
  # file target.
  tar_target(
    name = report,
    command = {
      # 1. Temporarily move .Rprofile aside so the R session Quarto starts
      #    does not run the renv auto-loader.
      if (file.exists(".Rprofile")) file.rename(".Rprofile", "hold_Rprofile")
      # 2. Render. `finally` runs whether the render succeeds, fails, or is
      #    interrupted, so .Rprofile is always put back; any render error
      #    still propagates unchanged and fails the target.
      tryCatch(
        quarto::quarto_render("index.qmd", quiet = FALSE),
        finally = {
          if (file.exists("hold_Rprofile")) {
            file.rename("hold_Rprofile", ".Rprofile")
          }
        }
      )
      # 3. Path of the rendered file, returned for format = "file" tracking
      "index.html"
    },
    format = "file"
  )
 )
--- a/assets/ecoregions_map.png
+++ b/assets/ecoregions_map.png
--- a/assets/fonts.css
+++ b/assets/fonts.css
@@ -0,0 +1,43 @@
 /* assets/fonts.css */
@font-face {
  font-family: 'Atkinson Hyperlegible Next';
  src: url('fonts/AtkinsonHyperlegibleNext-Regular.ttf') format('truetype');
  font-weight: normal;
  font-style: normal;
 }
@font-face {
  font-family: 'Atkinson Hyperlegible Next';
  src: url('fonts/AtkinsonHyperlegibleNext-Bold.ttf') format('truetype');
  font-weight: bold;
  font-style: normal;
 }
@font-face {
  font-family: 'Atkinson Hyperlegible Next';
  src: url('fonts/AtkinsonHyperlegibleNext-Italic.ttf') format('truetype');
  font-weight: normal;
  font-style: italic;
 }
@font-face {
  font-family: 'Atkinson Hyperlegible Next';
  src: url('fonts/AtkinsonHyperlegibleNext-BoldItalic.ttf') format('truetype');
  font-weight: bold;
  font-style: italic;
 }
@font-face {
  font-family: 'Atkinson Hyperlegible Mono';
  src: url('fonts/AtkinsonHyperlegibleMono-Regular.ttf') format('truetype');
  font-weight: normal;
  font-style: normal;
 }
@font-face {
  font-family: 'Atkinson Hyperlegible Mono';
  src: url('fonts/AtkinsonHyperlegibleMono-Bold.ttf') format('truetype');
  font-weight: bold;
  font-style: normal;
 }
--- a/assets/fonts/AtkinsonHyperlegibleMono-Bold.ttf
+++ b/assets/fonts/AtkinsonHyperlegibleMono-Bold.ttf
--- a/assets/fonts/AtkinsonHyperlegibleMono-Regular.ttf
+++ b/assets/fonts/AtkinsonHyperlegibleMono-Regular.ttf
--- a/assets/fonts/AtkinsonHyperlegibleNext-Bold.ttf
+++ b/assets/fonts/AtkinsonHyperlegibleNext-Bold.ttf
--- a/assets/fonts/AtkinsonHyperlegibleNext-BoldItalic.ttf
+++ b/assets/fonts/AtkinsonHyperlegibleNext-BoldItalic.ttf
--- a/assets/fonts/AtkinsonHyperlegibleNext-Italic.ttf
+++ b/assets/fonts/AtkinsonHyperlegibleNext-Italic.ttf
--- a/assets/fonts/AtkinsonHyperlegibleNext-Regular.ttf
+++ b/assets/fonts/AtkinsonHyperlegibleNext-Regular.ttf
--- a/assets/study_sites_globe.png
+++ b/assets/study_sites_globe.png
--- a/custom.scss
+++ b/custom.scss
@@ -0,0 +1,31 @@
 /* custom.scss */
 /* Quarto theme file. Sass variables belong in the scss:defaults region and
    CSS rules in the scss:rules region, so each ends up in the matching
    layer of the compiled theme. */
 /*-- scss:defaults --*/
 /* Use the name exactly as defined in fonts.css */
 $font-family-sans-serif: "Atkinson Hyperlegible Next", sans-serif !default;
 $font-family-monospace: "Atkinson Hyperlegible Mono", monospace !default;
 $presentation-heading-font: "Atkinson Hyperlegible Next", sans-serif !default;
 /* Other tweaks */
 $presentation-font-size-root: 40px;
 /*-- scss:rules --*/
 /* Title slide heading size */
 #title-slide h1 {
  font-size: 1.4em !important;
  line-height: 1.2 !important;
 }
 /* Fallback: Make it "nuclear" if the above fails.
    This selector is more specific than the one above, so where both match,
    its 1.2em font-size is the value that applies. */
 .reveal .slides section#title-slide h1 {
  font-size: 1.2em !important;
 }
 /* Title slide colours */
 .reveal .slides section.title-slide {
  background-color: #ffffff !important; /* Force white background */
 }
 .reveal .slides section.title-slide h1 {
  color: #000000 !important;
 }
 .reveal .slides section.title-slide p,
 .reveal .slides section.title-slide .quarto-title-author-name {
  color: #333333 !important;
 }
--- a/figs/combined_topo.png
+++ b/figs/combined_topo.png
--- a/figs/map_us_forests.png
+++ b/figs/map_us_forests.png
--- a/figs/map_wa_ga_ecoregions.png
+++ b/figs/map_wa_ga_ecoregions.png
--- a/figs/map_wa_ga_forests.png
+++ b/figs/map_wa_ga_forests.png
--- a/figs/wa_errors.png
+++ b/figs/wa_errors.png
--- a/figs/wa_outliers.png
+++ b/figs/wa_outliers.png
--- a/forested.Rproj
+++ b/forested.Rproj
@@ -0,0 +1,18 @@
 Version: 1.0
 RestoreWorkspace: Default
 SaveWorkspace: Default
 AlwaysSaveHistory: Default
 EnableCodeIndexing: Yes
 UseSpacesForTab: Yes
 NumSpacesForTab: 2
 Encoding: UTF-8
 RnwWeave: Sweave
 LaTeX: pdfLaTeX
 BuildType: Package
 PackageUseDevtools: Yes
 PackageInstallArgs: --no-multiarch --with-keep.source
 PackageRoxygenize: rd,collate,namespace
--- a/ieee-access.csl
+++ b/ieee-access.csl
@@ -0,0 +1,17 @@
 <?xml version="1.0" encoding="utf-8"?>
 <style xmlns="http://purl.org/net/xbiblio/csl" version="1.0" default-locale="en-US">
  <!-- Generated with https://github.com/citation-style-language/utilities/tree/master/generate_dependent_styles/data/ieee -->
  <info>
    <title>IEEE Access</title>
    <id>http://www.zotero.org/styles/ieee-access</id>
    <link href="http://www.zotero.org/styles/ieee-access" rel="self"/>
    <link href="http://www.zotero.org/styles/ieee" rel="independent-parent"/>
    <link href="http://ieeexplore.ieee.org/servlet/opac?punumber=6287639" rel="documentation"/>
    <category citation-format="numeric"/>
    <category field="engineering"/>
    <category field="communications"/>
    <issn>2169-3536</issn>
    <updated>2014-05-15T02:20:32+00:00</updated>
    <rights license="http://creativecommons.org/licenses/by-sa/3.0/">This work is licensed under a Creative Commons Attribution-ShareAlike 3.0 License</rights>
  </info>
 </style>
--- a/images/resampling.svg
+++ b/images/resampling.svg
@@ -0,0 +1,172 @@
 <?xml version="1.0" encoding="UTF-8" standalone="no"?>
 <!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
 <svg xmlns:dc="http://purl.org/dc/elements/1.1/" version="1.1" xmlns:xl="http://www.w3.org/1999/xlink" xmlns="http://www.w3.org/2000/svg" viewBox="-535.5 -701.5 869.75 527.75" width="869.75" height="527.75">
  <defs>
    <filter id="Shadow" filterUnits="userSpaceOnUse" x="-535.5" y="-701.5">
      <feGaussianBlur in="SourceAlpha" result="blur" stdDeviation="1.308"/>
      <feOffset in="blur" result="offset" dx="0" dy="2"/>
      <feFlood flood-color="black" flood-opacity=".5" result="flood"/>
      <feComposite in="flood" in2="offset" operator="in" result="color"/>
      <feMerge>
        <feMergeNode in="color"/>
        <feMergeNode in="SourceGraphic"/>
      </feMerge>
    </filter>
    <font-face font-family="Helvetica Neue" font-size="16" panose-1="2 0 5 3 0 0 0 2 0 4" units-per-em="1000" underline-position="-100" underline-thickness="50" slope="0" x-height="517" cap-height="714" ascent="951.9958" descent="-212.99744" font-weight="400">
      <font-face-src>
        <font-face-name name="HelveticaNeue"/>
      </font-face-src>
    </font-face>
    <marker orient="auto" overflow="visible" markerUnits="strokeWidth" id="FilledArrow_Marker" stroke-linejoin="miter" stroke-miterlimit="10" viewBox="-1 -4 10 8" markerWidth="10" markerHeight="8" color="black">
      <g>
        <path d="M 8 0 L 0 -3 L 0 3 Z" fill="currentColor" stroke="currentColor" stroke-width="1"/>
      </g>
    </marker>
    <font-face font-family="Helvetica Neue" font-size="16" panose-1="2 0 5 3 0 0 0 9 0 4" units-per-em="1000" underline-position="-100" underline-thickness="50" slope="-750" x-height="517" cap-height="714" ascent="957.0007" descent="-212.99744" font-style="italic" font-weight="400">
      <font-face-src>
        <font-face-name name="HelveticaNeue-Italic"/>
      </font-face-src>
    </font-face>
  </defs>
  <metadata> Produced by OmniGraffle 7.13.1 
    <dc:date>2020-03-15 00:14:09 +0000</dc:date>
  </metadata>
  <g id="Canvas_1" stroke="none" stroke-opacity="1" fill-opacity="1" stroke-dasharray="none" fill="none">
    <title>Canvas 1</title>
    <g id="Canvas_1: Layer 1">
      <title>Layer 1</title>
      <g id="Graphic_724" filter="url(#Shadow)">
        <ellipse cx="-43.5" cy="-641.25" rx="57.7500922788345" ry="58.7500938767363" fill="white"/>
        <ellipse cx="-43.5" cy="-641.25" rx="57.7500922788345" ry="58.7500938767363" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
        <text transform="translate(-84.7 -650.474)" fill="black">
          <tspan font-family="Helvetica Neue" font-size="16" font-weight="400" fill="black" x="13.496" y="15">All Data</tspan>
        </text>
      </g>
      <g id="Graphic_723" filter="url(#Shadow)">
        <path d="M -107.25 -529.5 L -48.99782 -488.9047 L -71.24811 -423.2203 L -143.2519 -423.2203 L -165.50218 -488.9047 Z" fill="#ffeabb"/>
        <path d="M -107.25 -529.5 L -48.99782 -488.9047 L -71.24811 -423.2203 L -143.2519 -423.2203 L -165.50218 -488.9047 Z" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
        <text transform="translate(-151.25 -474.099)" fill="black">
          <tspan font-family="Helvetica Neue" font-size="16" font-weight="400" fill="black" x="16.144" y="15">Training</tspan>
        </text>
      </g>
      <g id="Graphic_722" filter="url(#Shadow)">
        <path d="M 207.25 -529.5 L 265.50218 -488.9047 L 243.2519 -423.2203 L 171.2481 -423.2203 L 148.99782 -488.9047 Z" fill="#e5e6ff"/>
        <path d="M 207.25 -529.5 L 265.50218 -488.9047 L 243.2519 -423.2203 L 171.2481 -423.2203 L 148.99782 -488.9047 Z" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
        <text transform="translate(163.25 -474.099)" fill="black">
          <tspan font-family="Helvetica Neue" font-size="16" font-weight="400" fill="black" x="18.664" y="15">Testing</tspan>
        </text>
      </g>
      <g id="Line_721">
        <line x1="-64.64317" y1="-586.56304" x2="-87.49556" y2="-527.45516" marker-end="url(#FilledArrow_Marker)" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
      </g>
      <g id="Line_720">
        <line x1="5.009894" y1="-609.35054" x2="159.74542" y2="-507.5985" marker-end="url(#FilledArrow_Marker)" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
      </g>
      <g id="Graphic_719" filter="url(#Shadow)">
        <ellipse cx="-335.25" cy="-214.75" rx="61.2500978714911" ry="35.5000567255173" fill="#e5e6ff"/>
        <ellipse cx="-335.25" cy="-214.75" rx="61.2500978714911" ry="35.5000567255173" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
        <text transform="translate(-379.25 -223.974)" fill="black">
          <tspan font-family="Helvetica Neue" font-size="16" font-weight="400" fill="black" x=".432" y="15">Assessment</tspan>
        </text>
      </g>
      <g id="Graphic_718" filter="url(#Shadow)">
        <ellipse cx="-468" cy="-214.75" rx="64.0001022657214" ry="35.5000567255173" fill="#ffeabb"/>
        <ellipse cx="-468" cy="-214.75" rx="64.0001022657214" ry="35.5000567255173" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
        <text transform="translate(-514.2 -223.974)" fill="black">
          <tspan font-family="Helvetica Neue" font-size="16" font-weight="400" fill="black" x="16.72" y="15">Analysis</tspan>
        </text>
      </g>
      <g id="Graphic_717" filter="url(#Shadow)">
        <rect x="-469.75" y="-359" width="139" height="56.5" fill="white"/>
        <rect x="-469.75" y="-359" width="139" height="56.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
        <text transform="translate(-464.75 -339.974)" fill="black">
          <tspan font-family="Helvetica Neue" font-size="16" font-weight="400" fill="black" x="22.116" y="15">Resample 1</tspan>
        </text>
      </g>
      <g id="Line_716">
        <line x1="-153.47164" y1="-453.3897" x2="-334.53915" y2="-363.40586" marker-end="url(#FilledArrow_Marker)" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
      </g>
      <g id="Line_715">
        <line x1="-416.74946" y1="-302.5" x2="-443.27834" y2="-257.07786" marker-end="url(#FilledArrow_Marker)" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
      </g>
      <g id="Line_714">
        <line x1="-384.42026" y1="-302.5" x2="-359.01294" y2="-257.1577" marker-end="url(#FilledArrow_Marker)" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
      </g>
      <g id="Graphic_713" filter="url(#Shadow)">
        <ellipse cx="-37.25" cy="-214.75" rx="64.0001022657214" ry="35.5000567255173" fill="#e5e6ff"/>
        <ellipse cx="-37.25" cy="-214.75" rx="64.0001022657214" ry="35.5000567255173" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
        <text transform="translate(-83.45 -223.974)" fill="black">
          <tspan font-family="Helvetica Neue" font-size="16" font-weight="400" fill="black" x="2.6320008" y="15">Assessment</tspan>
        </text>
      </g>
      <g id="Graphic_712" filter="url(#Shadow)">
        <ellipse cx="-172.75" cy="-214.75" rx="64.0001022657214" ry="35.5000567255173" fill="#ffeabb"/>
        <ellipse cx="-172.75" cy="-214.75" rx="64.0001022657214" ry="35.5000567255173" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
        <text transform="translate(-218.95 -223.974)" fill="black">
          <tspan font-family="Helvetica Neue" font-size="16" font-weight="400" fill="black" x="16.72" y="15">Analysis</tspan>
        </text>
      </g>
      <g id="Graphic_711" filter="url(#Shadow)">
        <rect x="-174.5" y="-359" width="139" height="56.5" fill="white"/>
        <rect x="-174.5" y="-359" width="139" height="56.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
        <text transform="translate(-169.5 -339.974)" fill="black">
          <tspan font-family="Helvetica Neue" font-size="16" font-weight="400" fill="black" x="22.116" y="15">Resample 2</tspan>
        </text>
      </g>
      <g id="Line_710">
        <line x1="-106.42887" y1="-423.2203" x2="-105.58948" y2="-368.8988" marker-end="url(#FilledArrow_Marker)" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
      </g>
      <g id="Line_709">
        <line x1="-121.49946" y1="-302.5" x2="-148.02834" y2="-257.07786" marker-end="url(#FilledArrow_Marker)" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
      </g>
      <g id="Line_708">
        <line x1="-88.50054" y1="-302.5" x2="-61.97166" y2="-257.07786" marker-end="url(#FilledArrow_Marker)" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
      </g>
      <g id="Graphic_707" filter="url(#Shadow)">
        <ellipse cx="266.75" cy="-214.75" rx="64.0001022657214" ry="35.5000567255173" fill="#e5e6ff"/>
        <ellipse cx="266.75" cy="-214.75" rx="64.0001022657214" ry="35.5000567255173" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
        <text transform="translate(220.55 -223.974)" fill="black">
          <tspan font-family="Helvetica Neue" font-size="16" font-weight="400" fill="black" x="2.6320008" y="15">Assessment</tspan>
        </text>
      </g>
      <g id="Graphic_706" filter="url(#Shadow)">
        <ellipse cx="131.25" cy="-214.75" rx="64.0001022657214" ry="35.5000567255173" fill="#ffeabb"/>
        <ellipse cx="131.25" cy="-214.75" rx="64.0001022657214" ry="35.5000567255173" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
        <text transform="translate(85.05 -223.974)" fill="black">
          <tspan font-family="Helvetica Neue" font-size="16" font-weight="400" fill="black" x="16.72" y="15">Analysis</tspan>
        </text>
      </g>
      <g id="Graphic_705" filter="url(#Shadow)">
        <rect x="129.5" y="-359" width="139" height="56.5" fill="white"/>
        <rect x="129.5" y="-359" width="139" height="56.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
        <text transform="translate(134.5 -339.974)" fill="black">
          <tspan font-family="Helvetica Neue" font-size="16" font-weight="400" fill="black" x="21.084" y="15">Resample </tspan>
          <tspan font-family="Helvetica Neue" font-size="16" font-style="italic" font-weight="400" fill="black" y="15">B</tspan>
        </text>
      </g>
      <g id="Line_704">
        <line x1="-60.738406" y1="-454.24567" x2="130.64323" y2="-363.25103" marker-end="url(#FilledArrow_Marker)" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
      </g>
      <g id="Line_703">
        <line x1="182.50054" y1="-302.5" x2="155.97166" y2="-257.07786" marker-end="url(#FilledArrow_Marker)" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
      </g>
      <g id="Line_702">
        <line x1="215.49946" y1="-302.5" x2="242.02834" y2="-257.07786" marker-end="url(#FilledArrow_Marker)" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
      </g>
      <g id="Group_698">
        <g id="Graphic_701">
          <ellipse cx="24.25" cy="-330.75" rx="4.75000759003401" ry="4.00000639160761" fill="black"/>
          <ellipse cx="24.25" cy="-330.75" rx="4.75000759003401" ry="4.00000639160761" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
        </g>
        <g id="Graphic_700">
          <ellipse cx="42.75" cy="-330.75" rx="4.75000759003401" ry="4.00000639160761" fill="black"/>
          <ellipse cx="42.75" cy="-330.75" rx="4.75000759003401" ry="4.00000639160761" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
        </g>
        <g id="Graphic_699">
          <ellipse cx="61.25" cy="-330.75" rx="4.75000759003401" ry="4.00000639160761" fill="black"/>
          <ellipse cx="61.25" cy="-330.75" rx="4.75000759003401" ry="4.00000639160761" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
        </g>
      </g>
    </g>
  </g>
 </svg>
--- a/index.qmd
+++ b/index.qmd
@@ -0,0 +1,919 @@
 ---
 title: "From Mt. Olympus to the Okefenokee"
 subtitle: "A Case Study in Spatial Modeling"
 author: "Rob Wiederstein"
 lang: en-US
 smart: false
 format: 
  revealjs:
    from: markdown-smart
    theme: [default, custom.scss]
    css: assets/fonts.css
    embed-resources: true
    title-slide-attributes:
      data-background-image: assets/study_sites_globe.png
      data-background-size: 150%
      data-background-position: center
      style: "color: #222222;"
    transition: fade
    slide-number: true
    scrollable: true 
    chalkboard: false
    tbl-cap-location: bottom
    toc: true
    toc-depth: 1
    toc-title: "Order"
    fig-dpi: 300
    fig-width: 10
    fig-asp: 0.5
    fig-align: center
    resources: 
      - assets/fonts
 execute:
  echo: false
  cache: false
 bibliography: references.bib
 csl: ieee-access.csl
 nocite: |
  @*
 ---
 ```{r}
 #| label: setup
 #| include: false
 library(here)
 library(targets)
 library(gt)
 library(ggplot2)
 library(showtext)
 source("R/functions.R")
 setup_forestry_fonts()
 ```
 # Introduction
 ## The Core Problem
 ::: {.incremental}
 1.  **Stationarity:** Do rules hold constant across different places?
 2.  **Spatial Leakage:** When location is added to a model, does the model become more accurate?
 3. **Extrapolation:** How does a model perform when trained with location data and applied in a new location?
 :::
 ::: {.notes}
 * **Goal:** Test if "location" features trick the model into high accuracy that fails elsewhere.
 :::
 ## The `forested` Package
 :::: {.columns}
 ::: {.column width="25%"}
 ![](https://github.com/simonpcouch/forested/blob/main/inst/logo.png?raw=true){fig-align="center" width="80%"}
 :::
 ::: {.column width="75%"}
 - The `forested` data are from people who looked at a place to see if it was a forest. 
 - They work for the Forest Inventory and Analysis (FIA) program, part of the USDA.
 - It would be cheaper if a forest could be predicted from weather data and land characteristics.
 - The surveyed plots are in Georgia (GA) and Washington (WA).
 :::
 ::::
 ::: {.notes}
 **Speaker Notes:**
 - The `forested` package is our primary data source, containing the raw measurements for both the Washington and Georgia "islands".
 - We are auditing this package's features to see how well they predict the 'forested' outcome in two geographically distant locations.
 :::
 ## The First Law of Geography
 <br>
 <br>
 >"Everything is related to everything else, but near things are more related than distant things."[@tobler_computer_1970]
 ::: {.notes}
 - This is the "First Law of Geography" and explains why our Random Forest "cheats" using Lat/Lon.
 - Proximity Bias (Spatial Autocorrelation) creates high local accuracy but zero portability.
 - We are testing for **Stationarity**: Do the biophysical rules of Washington still work in Georgia?
 :::
 ## Caveat
 <br/>
 <br/>
 >"It is not and has never been the case that Tobler’s first law of geography . . . always holds absolutely. This is and has always been an oversimplification, disguising possible underlying entitation, support, and other misspecification problems."[@pebesma_spatial_2025]
 ::: {.notes}
 :::
 ## Forest Locations
 ```{r}
 #| label: fig-us-map-forest-locations
 #| fig-cap: "Map shows the geographic distance separating Washington and Georgia."
 knitr::include_graphics(here::here("figs", "map_us_forests.png"))
 ```
 :::{.notes}
 - Washington is approximately 15 degrees north of Georgia and 30 degrees west.
 - The sheer distance suggests that the respective forests are different.
 :::
 ## Regional Forestation
 ```{r}
 #| label: fig-map-wa-ga
 #| fig-cap: "Washington (a) and Georgia (b) showing forested areas. Note that the states are rescaled independently to maximize clarity."
 #tar_read(map_wa_ga)
 knitr::include_graphics(here::here("figs", "map_wa_ga_forests.png"))
 ```
 ## Regional Topography
 ```{r}
 #| label: fig-topo-compare
 #| echo: false
 #| fig-align: "center"
 #| out-width: "100%"
 #| fig-cap: "Topographic relief map of Washington (a) and Georgia (b). Note: Regions are not to scale and elevation ramps are independent (WA range is ~3x GA)."
 knitr::include_graphics(
  here::here("figs", "combined_topo.png")
 )
 ```
 :::{.notes}
 - **Scale Disparity:** Remind the audience that WA peaks reach ~4,400m 
   while GA peaks reach ~1,450m. The color ramps are local.
 - **Rain Shadow:** Point out the Cascade barrier in WA; this is the 
   primary driver for the precipitation variance in the model.
 - **Modeling Link:** This extreme relief is why we use a Yeo-Johnson 
   transformation on elevation in our tidymodels recipe—a linear 
   scale would over-emphasize alpine peaks while flattening 
   the Georgia Piedmont.
 :::
 ## Regional Rainfall
 ```{r}
 #| label: fig-precip-hex
 #| fig-cap: "Mean annual precipitation (mm). Note the extreme gradient in WA (training) vs. the relative uniformity of GA (target)."
 targets::tar_read(map_precip_hex)
 ```
 ## Level III Ecoregions
 ```{r}
 #| label: fig-ecoregion-comparison
 #| fig-cap: "Washington (a) has nine distinct regions while Georgia (b) has seven. Data sourced from U.S. EPA Level III Ecoregions [@epa_ecoregions_2013; @omernik_ecoregions_1987]."
 knitr::include_graphics(here::here("figs", "map_wa_ga_ecoregions.png"))
 ```
 :::{.notes}
 - Ecoregions denote areas with similar ecosystems and resources.
 - The EPA defines 105 Level III regions for management.
 - James Omernik drew these lines using holistic expert synthesis.
 - Washington and Georgia share zero common ecoregions.
 - Washington transitions rapidly from rainforests to arid deserts.
 - This extreme heterogeneity makes random spatial modeling difficult.
 :::
 # Explore
 ## Descriptive Summary
 ```{r}
 #| label: display-summary
 #| echo: false
 tar_read(tbl_forest_wa)
 ```
 ## Distributions
 ```{r}
 #| label: fig-distributions
 #| fig-cap: "Comparison of environmental variable distributions for forested vs. non-forested areas."
 targets::tar_read(plot_distrib_wa)
 ```
 ::: {.notes}
 **1. Topic Introduction**
 - This slide presents a univariate audit of our numeric predictors to identify which biophysical features provide the strongest signal for forestation.
 - By comparing the "fingerprints" of forested (green) and non-forested (brown) plots, we can visually assess the potential for classification before we begin training models on our EPYC VM.
 **2. Axis Definitions**
 - **The X-Axis (Value)**: Represents the measurement for each specific biophysical variable, such as millimeters of rain or degrees Celsius.
 - **The Y-Axis (Density)**: Represents the probability density for a given value; higher peaks indicate a higher frequency of observations at that specific value within the dataset.
 **3. Significant Variables (High Contrast)**
 - **Precipitation (`precip_annual`)**: This is a primary driver; forested plots are heavily concentrated in higher rainfall zones, while non-forested plots dominate the dry end of the spectrum.
 - **Elevation**: There is a distinct "Forestation Window"; plots between 1,000 and 2,000 meters show a significant green peak, whereas non-forested plots cluster at lower elevations.
 - **Temperature (`temp_annual_max` & `mean`)**: Forested plots consistently peak at lower maximum and mean temperatures compared to non-forested areas, suggesting a thermal threshold for forest growth.
 - **Vapor Pressure (`vapor_max` & `min`)**: We see a strong bimodal separation; forested areas occupy a specific atmospheric moisture niche distinctly different from non-forested regions.
 **4. Non-Significant Variables (High Overlap)**
 - **Orientation (`eastness` & `northness`)**: These distributions are nearly identical for both classes, suggesting that cardinal direction alone is a weak predictor in this regional regime.
 - **Roughness**: While there is a slight lean toward forested plots being in rougher terrain, the massive overlap indicates surface texture is not a primary discriminator.
 :::
 ## Outliers
 ```{r}
 #| label: outliers
 tar_read(plt_outliers)
 ```
 ## Map Outliers
 ```{r}
 #| label: fig-wa-outliers
 #| fig-cap: "Map of observations with a value greater than three standard deviations from the mean."
 #| out-width: "100%"
 knitr::include_graphics("figs/wa_outliers.png")
 ```
 ::: {.notes}
 **1. Topic Introduction**
 * This map visualizes our "3-Sigma" outliers, which are heavily concentrated along the mountainous west side of the state.
 **2. The Orographic Factor**
 * The concentration on the west is driven by the Cascades and Olympics. These regions host our most extreme biophysical values for precipitation and elevation.
 **3. Intermixed Extremes**
 * Note the intermixing of green and brown points. In these high-volatility alpine zones, a forest and a barren ridge often share the same coordinates.
 * This proves that local "nearness" is not enough to predict forestation here; the model must rely on the specific biophysical drivers we identified in our density distributions.
 **4. The Audit Link**
 * These outliers represent the "edge cases" of our Washington model. Their intermixed nature makes them the hardest points to classify, serving as a preview for our Georgia transfer test.
 :::
 ## Principal Component Analysis
 ```{r}
 #| label: pca
 tar_read(plt_wa_pca)
 ```
 ::: {.notes}
 **1. What is PCA?**
 PCA (Principal Component Analysis) is a dimensionality reduction tool that takes our 16 variables—including latitude, longitude, and climate data—and compresses them into two primary axes called Principal Components. It allows us to view the 'shape' of the entire Washington dataset in a single 2D space.
 **2. Why use it here?**
 We use it to explore the structural integrity of our data. Before building a model, we need to know if the environment of 'Forested' plots is actually mathematically different from 'Non-Forested' plots. By including lat and lon, we are seeing the combined power of geography and biophysics.
 **3. Does it show anything?**
 It shows that the data is not a random cloud; it has a clear orientation. The spread along PC1 captures the primary environmental gradient of Washington—likely moving from the moist coast to the arid east. 
 **4. Is there good separation on the outcome variable?**
 Yes, the separation is significant. We see a distinct 'No' (Non-Forested) cluster forming a tail on the right and a dense 'Yes' (Forested) cluster on the left. While there is 'Alpine Mixing' in the center where categories overlap, the two groups occupy mostly different regions of the feature space.
 **5. What does it foreshadow for modeling?**
 This separation foreshadows high accuracy for our local Washington model. Because the classes are so distinct in this space, a logistic regression should have no trouble drawing a boundary between them. However, the tight coupling of biophysics with coordinates (lat/lon) here warns us that the model might 'memorize' Washington's map, which will be the primary challenge when we attempt to transfer it to Georgia.
 :::
 ## Correlogram
 ```{r}
 #| label: correlogram
 tar_read(plt_correlogram)
 ```
 ::: {.notes}
 **1. Orientation**
 - If you look at the very first column on the left, we can see exactly what drives our "Forested" classification.
 - Blue means "More Forest," Orange means "Less Forest."
 **2. The Sanity Check**
 - First, look at the bottom square: **Canopy Cover (0.75)**.
 - This is our sanity check. Obviously, forests have high canopy cover. If this wasn't blue, our data would be broken.
 **3. The Biophysical Story: Water vs. Heat**
 - The real story is the battle between water and heat.
 - **Precipitation (0.52)** is a strong blue driver. In Washington, rain equals trees.
 - **Vapor Pressure (-0.64)** is a deep orange driver. High vapor pressure—which correlates with hot, dry valleys—effectively kills the forest probability.
 **4. The Terrain Factor**
 - Look at **Roughness (0.39)**.
 - Rugged, difficult terrain is more likely to be forested. This is likely a mix of biophysics (mountains catch rain) and human history (flat land gets cleared for farming).
 **5. The Surprise**
 - Finally, look at **Northness and Eastness**. They are near **zero**.
 - This tells us that while the *direction* a slope faces might change *which* trees grow there, it doesn't determine *if* trees grow there.
 :::
 ## VIP
 ```{r}
 #| label: variable-importance-plt
 tar_read(plt_vip)
 ```
 ::: {.notes}
 **1. The Comparison**
 - "We just looked at Correlations (linear relationships). Now let's look at Variable Importance via Random Forest. This is what the model actually uses to make decisions."
 **2. The Consistency**
 - "The top three are the same: Canopy Cover, Rain, and Aridity (Vapor Pressure). This confirms our model is learning real biophysics."
 **3. Spatial Factors**
 - "But look at number 4: **Longitude**."
 - "In the correlation chart, Longitude was just a moderate factor. Here, it is massive."
 - "The model has learned that Washington is divided into two distinct climate zones—West and East."
 - "Instead of learning the physics of *why* trees grow there, it's partially just memorizing *where* they grow. This confirms our hypothesis: the model is using geography as a shortcut. That is worth remembering when we apply it to the Georgia data."
 :::
 ## UMAP
 ```{r}
 #| label: umap-plot
 tar_read(umap_plot)
 ```
 ## Spatial Dependency Analysis
 ```{r}
 #| label: fig-moran
 #| echo: false
 #| fig-align: "center"
 #| fig-cap: "<b>Moran Scatterplot.</b> The strong positive slope confirms significant spatial autocorrelation ($I > 0.6$)."
 tar_read(p_moran_exploration)
 ```
 :::{.notes}
 SPEAKER NOTES:
 1. THE VISUAL EVIDENCE: Point out the steep, positive slope of the 
   red dashed line. This slope is a visual representation of the 
   Global Moran’s I. A positive slope confirms that high-elevation 
   plots are surrounded by other high-elevation plots (Top Right 
   Quadrant), while low-elevation areas are also clustered (Bottom Left).
 2. THE "CHEATING" PROBLEM: Explain that this clustering is why 
   standard Random Cross-Validation is insufficient. If a training 
   point and a testing point are only 5km apart, the model can 
   effectively "cheat" by using local similarities rather than 
   learning the broader ecological relationships.
 3. THE JUSTIFICATION: This plot is the primary justification for:
   - Using Spatial Block Cross-Validation to force the model to 
     predict on entirely unseen regions.
   - Removing "Northness" and "County" as predictors to prevent the 
     model from simply memorizing regional averages.
   - Applying the Yeo-Johnson transformation to normalize the extreme 
     elevation variance seen in these clustered Cascade peaks.
 4. THE SCALE: Note that we used a 5km fixed-distance neighborhood, 
   with coordinates transformed into Washington State Plane North 
   (meters), to ensure the spatial relationships are geographically accurate.
 :::
 # Resampling
 ## Spatial Autocorrelation
 <br/>
 <br/>
 >"When data are not independent (e.g. due to spatial autocorrelation), random cross-validation yields optimistic estimates of predictive performance because training and test sets are not independent."[@roberts_crossvalidation_2017]
 :::{.notes}
 **1. Translation of the Quote**
 This quote is a consequence of the "First Law of Geography" introduced earlier: "Everything is related to everything else, but near things are more related than distant things."
 **2. Definition: Spatial Autocorrelation**
 Spatial Autocorrelation just means that data points close to each other are practically clones. If it's raining at your house, it's probably raining at your neighbor's house.
 **3. The `forested` Dataset**
 Forests are "clumpy." If you stand next to a Douglas Fir in Washington and take one step to the left, you are almost certainly still in a forest. The elevation, soil, and rain are identical.
 **4. Why Random CV is "Optimistic" (The Cheating)**
 - When the standard **Random Cross-Validation** is used, the first tree is assigned to the "Study Group" and the second tree (one step away) to the "Test Group."
 - The model doesn't learn ecology. It just looks at the neighbor (lat and lon) and copies the answer.
 - This gives us an **"Optimistic Estimate"**—a fancy way of saying our high score was fake because the model was cheating off its neighbor.
 :::
 ## The Mechanics of Resampling
 ```{r}
 #| label: fig-resampling
 #| echo: false
 #| fig-cap: "Visualizing the resampling process [@kuhn_tidy_2022]"
 #| fig-align: "center"
 #| out-width: "75%"
 #| out-extra: 'style="width:75%;"'
 knitr::include_graphics(here::here("images", "resampling.svg"))
 ```
 :::{.notes}
 - **The Concept:** Resampling methods (like cross-validation and bootstrapping) are **empirical simulation systems**. They generate different versions of our training set to simulate how the model handles new data.
 - **The Golden Rule:** It is critical to remember: Resampling is *always* used with the **Training set**. The **Test set** is not involved.
 - **The Vocabulary:** To avoid confusion with our initial Train/Test split, we use specific language for these internal loops:
    - **Analysis Set:** The subset used to **fit** the model.
    - **Assessment Set:** The subset used to **evaluate** performance.
 - **The Mechanism:** In every iteration, these two sets are **mutually exclusive**. We fit on the Analysis set, and we measure performance on the Assessment set.
 - **The Why:** As we discussed, simply re-predicting the training set is problematic (it leads to optimism bias). Resampling allows us to get a realistic appraisal using the training set without ever touching the final test data.
 :::
 ## Random K-Fold Cross-Validation
 ```{r}
 #| label: fig-classic-cv
 #| fig-cap: "Conceptual diagram showing the random assignment of observations to the analysis and assessment groups."
 tar_read("fig_classic_cv")
 ```
 ## Cross Validation Strategies
 ```{r}
 #| label: fig-cv-strategies
 #| echo: false
 #| fig-width: 14
 #| fig-height: 5
 #| out-width: "100%"
 #| fig-cap: "Three validation strategies. **Left:** Random splitting mixes train/test points. **Middle:** Spatial blocking forces geographic separation. **Right:** Clustering blocks by environmental similarity. (Note the outline for the Columbia Plateau. See @fig-ecoregion-comparison.)"
 tar_read(plot_cv_comparison)
 ```
 ::: {.notes}
 **1. Left Panel: Random CV (The Illusion of Accuracy)**
 - This visualizes why Random CV yields **over-optimistic estimates**.
 - Because the colors are mixed (Random), the model can accurately predict a "Red" point simply by memorizing the "Blue" point next to it.
 - This isn't "true" predictive power; it is **autocorrelation leakage**. The model is interpolating neighbors rather than learning the underlying ecological rules.
 **2. Middle Panel: Spatial Blocking (Forcing Independence)**
 - To get a **realistic assessment**, we must enforce spatial independence.
 - The grid structure ensures that the test data (Red blocks) is geographically distinct from the training data.
 - The performance score will likely drop compared to the first map, but that lower score is **more accurate**. It reflects how the model will actually perform on a new, unvisited site.
 **3. Right Panel: Environmental Clustering (Testing Generalization)**
 - This strategy tests for **ecological generalization**.
 - Notice the large red area in the southeast—the algorithm identified the **Columbia Plateau** as a distinct environment.
 - By holding out entire environments (e.g., training on "Wet Coastal" to predict "Dry Plateau"), we test if the model captures the fundamental biological relationships (e.g., how temp/rain affect trees) rather than just memorizing geographic trends.
 :::
 ## Analysis vs. Assessment
 ```{r}
 #| label: fig-mechanics
 #| fig-cap: "Visualization of Fold 1 across three cross-validation strategies. Magenta points represent the held-out assessment set."
 tar_read(fig_fold_mechanics)
 ```
 ::: {.notes}
 - **Visualizing the Split**: This slide illustrates Fold 1 of 5; gray points represent the "Analysis" set used for training, while magenta points represent the "Assessment" set the model must predict.
 - **Confetti vs. Blocks**: The Random split (left) creates a "confetti" effect where every test point is surrounded by nearby training points, leading to the spatial autocorrelation and "optimism bias" we discussed earlier.
 - **Geographic and Ecological Isolation**: The middle and right maps show how we force the model to predict across geographic and ecological gaps. 
 - **The Columbia Plateau Test**: Specifically in the Environmental Clustering map (right), the entire Columbia Plateau is isolated as a test set.
 - **Validating Results**: Because the model had to "learn" forests in the mountains to predict the Plateau, we gained high confidence in its performance there, which was later confirmed by the near-zero error rate in that region.
 - **Preparation for Georgia**: This level of isolation is a direct rehearsal for our next step, where we move from the Washington ecoregions to the completely unfamiliar landscapes of Georgia.
 :::
 # Models
 ## Engines
 ::: {.incremental}
 1. Logistic Regression
 2. MARS
 3. Random Forest
 4. XGBoost
 :::
 ::: {.notes}
 **Logistic Regression:**
 Simple, interpretable baseline. Captures linear relationships efficiently.
 **MARS:**
 Models non-linearities automatically. Good balance between linear models and tree-based methods.
 **Random Forest:**
 Robust ensemble method. Reduces overfitting through averaging.
 **XGBoost:**
 High-performance gradient boosting. Often dominates on tabular data.
 :::
 ## Recipe A: With Coords
 ```{.r code-line-numbers="2|3"}
 recipe(forested ~ ., data = train_data) %>%
  # geometry is ID, but lat/lon remain as predictors
  update_role(geometry, new_role = "id") %>%
  step_novel(all_nominal_predictors()) %>%
  step_dummy(all_nominal_predictors()) %>%
  step_zv(all_predictors()) %>%
  step_normalize(all_numeric_predictors())
 ```
 ::: {.notes} 
 **Base Strategy:** Standard approach uses latitude and longitude as predictive features. Risk is the model memorizing locations instead of learning rules. 
 :::
 ## Recipe B: No Coords
 ```{.r code-line-numbers="2|3"}
 recipe(forested ~ ., data = train_data) %>%
  # Explicitly remove lat/lon from training
  update_role(geometry, lat, lon, new_role = "id") %>%
  step_novel(all_nominal_predictors()) %>%
  step_dummy(all_nominal_predictors()) %>%
  step_zv(all_predictors()) %>%
  step_normalize(all_numeric_predictors())
 ```
 ::: {.notes} 
 **Non-Spatial:** Removes explicit coordinates to prevent spatial overfitting. Forces the model to rely solely on biological environmental signals. 
 :::
 ## Recipe C: Extensible
 ```{.r code-line-numbers="4-10|12"}
 recipe(forested ~ ., data = train_data) %>%
  update_role(geometry, lat, lon, new_role = "id") %>%
  # 1. Remove political/time markers
  step_rm(northness, county, year) %>%
  # 2. Add Physics (Aridity & Temp Range)
  step_ratio(precip_annual, denom = denom_vars(temp_annual_max)) %>%
  step_mutate(
    temp_range = temp_annual_max - temp_annual_min,
    vpd_range = vapor_max - vapor_min
  ) %>%
  # 3. Fix Skew (Critical for Logistic Regression)
  step_YeoJohnson(elevation) %>%
  step_dummy(all_nominal_predictors()) %>% 
  step_normalize(all_numeric_predictors())
 ```
 ::: {.notes} 
 **Extensible:** Engineers physics-based features like aridity and temperature range. Transforms skewed variables to help linear models extrapolate to new regions. 
 :::
 ## YeoJohnson Transformation
 ```{r}
 #| echo: false
 #| fig-align: center
 #| fig-width: 10
 #| fig-height: 5
 #| fig-cap: "<b>Normalizing Elevation via Yeo-Johnson Transformation.</b> The raw elevation data (left) exhibits strong right-skewness, which can degrade linear model performance. Applying a Yeo-Johnson transformation with λ=0.49 (right) successfully normalizes the distribution, satisfying the linearity assumptions required for the Extensible Logistic Regression model."
 tar_read(plot_yeo)
 ```
 :::{.notes} 
 Why this matters:
 - **The Problem (Left):** Raw elevation data is highly skewed. Linear models (like Logistic Regression) struggle with this because they assume a consistent relationship across the range.
 - **The Solution (Right):** The Yeo-Johnson transformation normalizes the distribution (bell curve).
 - **The Result:** This allows the model to "see" the signal clearly, improving stability when moving to new regions like Georgia. 
 :::
 ## Resampling Strategies
 ```{.r code-line-numbers="2|5|8"}
 # A. Random Folds (Standard)
 vfold_cv(train_data, v = 10, strata = forested)
 # B. Spatial Blocks (Grid-based)
 spatial_block_cv(train_data, v = 10)
 # C. Spatial Clustering (Region-based)
 spatial_clustering_cv(train_data, v = 10)
 ```
 ::: {.notes} 
 - **Random Folds:** Standard approach. Randomly shuffles data. Dangerous here because it allows "cheating" via nearby pixels.
 - **Spatial Blocks:** Divides the map into a checkerboard. Forces the model to predict on a blind grid square.
 - **Spatial Clustering:** Uses K-means to create distinct ecological zones. The hardest test—simulates moving to a totally new region.
 :::
 # Results
 ## Spatial Validation Analysis
 ```{r}
 #| label: fig-spatial-results
 #| fig-cap: "Comparison of Model Performance (ROC AUC) across three spatial validation strategies. Benchmark (0.96 ROC AUC) indicated by the horizontal dashed line."
 #| echo: false
 #| message: false
 tar_read(fig_cv_comparison)
 ```
 ::: {.notes}
 - **Optimism Bias:** Notice the "Random CV" column. It shows nearly perfect performance (>0.95 ROC AUC). This is often a "spatial mirage" where the model is simply memorizing locations (autocorrelation) rather than learning environmental drivers.
 - **Spatial Honesty:** The "Block" and "Cluster" columns provide a more realistic estimate of how the model will perform on new, geographically distant areas. This represents the "true" performance we should expect for out-of-sample prediction.
 - **Feature Leakage:** Compare the "With Coords" vs. "No Coords" rows. If the "With Coords" model crashes in performance during Block CV but holds steady in Random CV, it is a clear sign of overfitting to spatial coordinates (Lat/Lon) rather than the underlying forest ecology.
 - **Performance Benchmark:** The dashed line at 0.96 represents an established, high-performance baseline for forest classification. It serves as a "line in the sand" to determine if our machine learning approach provides a meaningful improvement over traditional methods; a model is only truly successful if it can exceed this 0.96 threshold under the pressure of spatial cross-validation.
 :::
 ## Performance Stability
 ```{r}
 #| label: fig-stability
 #| fig-cap: "Distribution of ROC AUC scores across individual cross-validation folds. Note the variance in scores by resampling method."
 #| echo: false
 #| message: false
 tar_read(fig_model_stability)
 ```
 ::: {.notes}
 - **Falsely Confident:** In **Random CV**, notice how tightly clustered the points are at the top; the model's performance is artificially stable because every fold contains a representative "sprinkling" of the entire dataset.
 - **The Reality of Variance:** As we transition to **Cluster CV**, the "violin" stretches out, indicating that the model performs significantly better in some geographic regions than others.
 - **Identifying Weak Spots:** Each point in the Cluster CV column represents a specific geographic area; the points near the bottom of the violin represent "hard-to-predict" regions where the model's current features might be insufficient.
 - **Predictive Risk:** While Random CV suggests a ROC AUC of ~0.98 everywhere, this plot shows that in some clusters, performance may actually dip toward 0.85.
 - **Stakeholder Transparency:** This variance is a critical insight for stakeholders, as it defines the geographic boundaries of where the model's predictions can be most (and least) trusted.
 :::
 ## Predict on Test Set
 ```{r}
 #| label: fig-final-test
 tar_read(fig_final_performance)
 ```
 ::: {.notes}
 - **Beyond the Fold:** This result represents the model's performance on the 20% test set that was "locked away" at the beginning of the project.
 - **The Spatial Paradox:** You will notice our Test AUC (0.97) is actually *higher* than our Validation AUC (0.927). In standard AI, this is rare. In forestry, this tells us two things:
    1.  **Interpolation Power:** The high test score proves the model is excellent at "filling in gaps" within Washington, where it can leverage the patterns of nearby trees.
    2.  **Extrapolation Power:** The lower (0.927) validation score is our "honest" baseline for new regions, where we stripped away those spatial clues.
 - **Classification Nuance:** The 91% Accuracy vs. 97% ROC suggests our model is a better "ranker" than a "classifier." It understands the *gradient* of forest probability better than the hard binary of "Tree vs. No Tree."
 - **Validation Success:** The fact that our "Honest" Spatial CV score (0.93) is so high confirms that the 0.97 on the test set isn't just a fluke of spatial memory—it's built on a solid foundation of learning spectral signatures.
 :::
 ## Test vs. Resample Performance
 <br/>
 <br/>
 <br/>
 ```{r}
 #| label: tbl-performance
 #| echo: false
 #| tbl-cap: "Comparison of model performance on resamples versus on the test set. Note that ROC increased."
 targets::tar_read(tbl_performance)
 ```
 ## Confusion Matrix
 ```{r}
 #| label: fig-confusion
 #| fig-cap: "Confusion matrix showing the classification performance of the final model on the 20% held-out Washington test set."
 tar_read(fig_confusion_matrix)
 ```
 ::: {.notes}
 - **Anatomy of Error:** This matrix breaks down our 91.1% accuracy into specific types of successes and failures, helping us move beyond a single aggregate number.
 - **Symmetry of Mistakes:** We are looking for balance between the off-diagonal squares; a heavy skew toward one side would indicate the model has a systematic bias toward over-predicting or under-predicting forest cover.
 - **False Positives vs. Negatives:** In ecological terms, False Positives often represent "ghost forests" where the structure exists but the classification differs, while False Negatives are "missed forests" where the model failed to detect the canopy signal.
 - **Probability Sensitivity:** Since our ROC AUC is a high 0.97, most of these errors likely occur at the "decision boundary"—meaning the model was nearly correct (e.g., 48% probability) but the hard 50% cutoff forced an error.
 - **Production Readiness:** The high density in the True Positive and True Negative quadrants confirms that the model is robust enough for regional mapping, despite the inherent complexity of transition zones.
 :::
 ## Benchmarks
 ::: {style="font-size: 75%;"}
 | Authority | Study Context | Accuracy |
 | :--- | :--- | :--- |
 | **Ismail et al. (2013)** | Ideal Conditions (Sclerophyll Forest) | **96%** |
 | **USGS NLCD** | Federal Standard (US Gov) | **91%** |
 | **Our Model (WA)** | Pacific Northwest Training | **90.7%** |
 | **Complex Boreal** | Difficult Terrain (Alaska) | **~78%** |
 :::
 ## Spatial Error Analysis
 ```{r}
 #| label: fig-map-wa-errors
 #| out-width: "100%"
 #| fig-cap: "<b>Map showing Type I and II errors from the model.</b> Points are shaded from purple to bright yellow based upon the absolute error of the prediction probability. Note the lack of errors in the Columbia Basin."
 knitr::include_graphics("figs/wa_errors.png")
 ```
 ::: {.notes} 
 - **The "Hallucinations":** We are looking at the ~130 mistakes the model made on the test set.
 - **Confidence vs. Confusion:** The bright yellow points are where the model was "confidently wrong" (high error magnitude). These aren't just close calls; the model was >90% sure based on the physics features (like elevation/aridity) but missed the biological reality.
 - **Geography of Error:** Notice the clustering. The errors aren't random; they hug the alpine transition zones and the rugged coastline, suggesting the model struggles most at the "biophysical edges" where the rules of the forest change rapidly. 
 :::
 # Extrapolation
 ## The Goal of Prediction {style="text-align: center;"}
 <br/>
 <br/>
 > "The fundamental goal of a model is not to describe the data we have, but to predict the data we don't."[@kuhn_applied_2013]
 ::: {.notes}
 - This quote from Kuhn and Johnson is the foundation of our entire project.
 - If our model doesn't generalize to the "second island" (Georgia), it has failed its fundamental goal.
 :::
 ## Assessing Domain Applicability
 ```{r}
 #| label: fig-aoa-georgia
 #| echo: false
 #| out-width: "100%"
 #| fig-cap: "Area of Applicability (AOA) Analysis. The Dissimilarity Index (DI) measures how different the Georgia environment is from Washington's. Note the similarity to the Level III Ecoregion plot in @fig-ecoregion-comparison."
 targets::tar_read(plot_aoa_ga)
 ```
 ::: {.notes}
 - Before we even attempt to predict forests in Georgia, we have to ask a fundamental question: **Is it fair to ask a Washington model to understand Georgia?** We can't just assume the rules of nature are the same. We need to measure the mathematical distance between these two worlds.
 **What am I looking at?**
 - This map **does not** show predictions. It shows **familiarity**.
 - We calculated a **Dissimilarity Index** for every pixel in Georgia. Essentially, we asked the model: *"Have you seen conditions like this before?"*
 - **Dark Purple/Black:** These areas are the "safe zones." The elevation, temperature, and precipitation here fall within the ranges the model learned in the Cascades.
 - **Bright Yellow:** These are the "alien" zones. The combination of variables here (likely the hot, humid lowlands) is completely outside the model's experience. This is pure extrapolation.
 **The Takeaway:**
 - This creates a **Risk Map**. If our model fails, we expect it to fail *here* [gesture to yellow areas].
 - It tells us where our confidence should be high (the purple) and where any prediction is just a wild guess (the yellow).
 :::
 ## External Validation
 ```{r}
 #| label: fig-ga-predictions
 #| echo: false
 #| out-width: "100%"
 #| fig-cap: "The model predictions of forests in Georgia (a) versus the true forest inventory (b)."
 targets::tar_read(map_ga_probs)
 ```
 ::: {.notes}
 - We took the model trained in the Pacific Northwest and asked it: "Where are the forests in Georgia?"
 - The Map: This shows the model's raw probability output.
 - The Pattern: You can see it identifying the Blue Ridge Mountains (yellow/green) in the northeast.
 - The Question: Does this match reality? Or is it seeing "forests" in places that are actually agricultural fields or swamps?
 :::
 ## Quantifying the Error
 ```{r}
 #| label: fig-ga-confusion
 #| echo: false
 #| out-width: "80%"
 #| fig-align: "center"
 #| fig-cap: "<b>Confusion Matrix (Georgia).</b> The model accuracy drops significantly compared to Washington. Note the high number of false negatives (Prediction: No / Truth: Yes)."
 targets::tar_read(ga_conf_mat)
 ```
 ## Mapping the Failures
 ```{r}
 #| label: fig-ga-errors
 #| echo: false
 #| out-width: "100%"
 #| fig-cap: "Spatial Distribution of Errors. (a) shows the dissimilarity of Georgia from Washington. (b) shows error density increasing in southern Georgia."
 targets::tar_read(map_failure_mechanism)
 ```
 ::: {.notes}
 **Visualizing the "Phantom Forests"**
 - This map only shows the mistakes. And unlike Washington, where we had a handful of dots, here the map is lit up.
 - **Orange Points (False Positives):** Look at the massive cluster in the South/Southeast.
    - These are the **"Phantom Forests."**
    - Notice how they perfectly overlap with the "Yellow Zone" (high dissimilarity) we identified at the start. The model saw crops and scrubland and hallucinated trees.
 - **The Verdict:** The error isn't random. It is geographically structured. We broke the model exactly where the AOA predicted it would break.
 :::
 ## Lessons Learned
 ::: {.incremental}
 * **Accuracy Collapse:** ~89% (WA) $\to$ ~54% (GA).
 * **AOA Validation:** The "Yellow Zone" correctly flagged the risk.
 * **The Trap:** High confidence in "Phantom Forests."
 * **The Fix:** Quantify domain distance *before* deployment.
 :::
 ::: {.notes}
 **1. The Numbers Don't Lie**
 We witnessed a catastrophic failure in performance. In Washington, we had a precision instrument (90% accuracy). In Georgia, we essentially flipped a coin (54%). If we had deployed this model in production without validation, we would be generating random noise.
 **2. The "Yellow Zone" Was a Warning, Not a Bug**
 Remember that bright yellow map? That wasn't just a pretty picture. The Area of Applicability (AOA) screamed at us that the Southeastern Plains were alien territory. The model failed exactly where the AOA said it would—predicting forests in flat, hot agricultural zones it didn't understand.
 **3. The "Phantom Forest" Problem**
 Our confusion matrix showed a massive spike in False Positives. This is dangerous. The model didn't say "I don't know"; it confidently declared "Yes, there is a forest here." We call these "Phantom Forests." In a real-world scenario—like carbon credit monitoring or fire risk assessment—phantom forests cost millions of dollars.
 **4. The Ultimate Takeaway**
 The model failed, but the **workflow succeeded**. By calculating the multidimensional distance between our training data and our target data, we predicted *where* the model would break before we even ran it.
 **Conclusion:** In spatial data science, you cannot simply "train and deploy." You must respect the ecological boundaries of your training data.
 :::
 # References
 ::: {#refs}
 :::
--- a/man/calculate_ga_aoa.Rd
+++ b/man/calculate_ga_aoa.Rd
@@ -0,0 +1,22 @@
 % Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/functions.R
 \name{calculate_ga_aoa}
 \alias{calculate_ga_aoa}
 \title{Calculate Area of Applicability Data}
 \usage{
 calculate_ga_aoa(train_data, test_data, predictors)
 }
 \arguments{
 \item{train_data}{Dataframe. The training data from Washington.}
 \item{test_data}{Dataframe. The extrapolation data from Georgia.}
 \item{predictors}{Character vector. The list of predictor variable names.}
 }
 \value{
 An \code{sf} object containing the Georgia data with an added 'di' (Dissimilarity Index) column.
 }
 \description{
 Generates the Area of Applicability (AOA) scores (Dissimilarity Index)
 for the Georgia extrapolation dataset based on the Washington training data.
 }
--- a/man/combine_forest.Rd
+++ b/man/combine_forest.Rd
@@ -0,0 +1,28 @@
 % Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/functions.R
 \name{combine_forest}
 \alias{combine_forest}
 \title{Combine Washington and Georgia Forest Data}
 \usage{
 combine_forest(wa_data, ga_data)
 }
 \arguments{
 \item{wa_data}{A data frame containing the Washington forest inventory data.}
 \item{ga_data}{A data frame containing the Georgia forest inventory data.}
 }
 \value{
 A single combined data frame with an additional column \code{.id}
 (renamed to "state") indicating the source ("WA" or "GA").
 }
 \description{
 Merges the Washington and Georgia datasets into a single data frame, adding a
 column to identify the source state.
 }
 \examples{
 \dontrun{
  combined <- combine_forest(wa_raw, ga_raw)
  table(combined$state)
 }
 }
--- a/man/create_stats_summary.Rd
+++ b/man/create_stats_summary.Rd
@@ -0,0 +1,20 @@
 % Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/functions.R
 \name{create_stats_summary}
 \alias{create_stats_summary}
 \title{Create Statistical Summary of Forest Data}
 \usage{
 create_stats_summary(data)
 }
 \arguments{
 \item{data}{A data frame or sf object containing the forest data.}
 }
 \value{
 A data frame with descriptive statistics (mean, sd, min, max, etc.),
 sorted by descending absolute kurtosis.
 }
 \description{
 Generates descriptive statistics for numeric variables in the dataset,
 excluding spatial coordinates (lat/lon) and year. It sorts the results
 by kurtosis to highlight non-normal distributions.
 }
--- a/man/forestedAnalysis-package.Rd
+++ b/man/forestedAnalysis-package.Rd
@@ -0,0 +1,15 @@
 % Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/forestedAnalysis-package.R
 \docType{package}
 \name{forestedAnalysis-package}
 \alias{forestedAnalysis}
 \alias{forestedAnalysis-package}
 \title{forestedAnalysis: Spatial Cross-Validation and AOA Analysis of Forest Cover}
 \description{
 A research compendium analyzing forest cover data in Washington and Georgia. It evaluates the Area of Applicability (AOA) and demonstrates model failure during spatial extrapolation.
 }
 \author{
 \strong{Maintainer}: Rob Wiederstein \email{khuon68@gmail.com}
 }
 \keyword{internal}
--- a/man/get_epa_ecoregions.Rd
+++ b/man/get_epa_ecoregions.Rd
@@ -0,0 +1,23 @@
 % Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/functions.R
 \name{get_epa_ecoregions}
 \alias{get_epa_ecoregions}
 \title{Download EPA Level III Ecoregions Data}
 \usage{
 get_epa_ecoregions(url, dest_dir = "data/epa")
 }
 \arguments{
 \item{url}{Character string. The direct URL to the EPA Ecoregions zip file.}
 \item{dest_dir}{Character string. The local directory where the data should
 be saved. Defaults to "data/epa".}
 }
 \value{
 A character string containing the full file path to the downloaded zip file.
 This return value is designed to be tracked by \code{targets}.
 }
 \description{
 Downloads the EPA Level III Ecoregions shapefile (zip format)
 to a local directory. Implements a caching check to avoid re-downloading
 if the file already exists.
 }
--- a/man/helper_save_fig.Rd
+++ b/man/helper_save_fig.Rd
@@ -0,0 +1,35 @@
 % Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/functions.R
 \name{helper_save_fig}
 \alias{helper_save_fig}
 \title{Helper: Save Plot for Quarto Slide}
 \usage{
 helper_save_fig(
  plot_obj,
  name,
  type = c("map", "plot"),
  width = 10,
  height = 6.18,
  dpi = 300
 )
 }
 \arguments{
 \item{plot_obj}{The ggplot object to save.}
 \item{name}{A short descriptive name (e.g., "wa_ecoregions").}
 \item{type}{Either "map" or "plot". Adds this prefix to the filename.}
 \item{width}{Width in inches (default: 10).}
 \item{height}{Height in inches (default: 6.18).}
 \item{dpi}{Resolution (default: 300).}
 }
 \value{
 The full file path (invisibly).
 }
 \description{
 Saves a ggplot object as a PNG, sized to fit comfortably
 below a standard slide title, with robust font handling.
 }
--- a/man/plot_ecoregion_comparison.Rd
+++ b/man/plot_ecoregion_comparison.Rd
@@ -0,0 +1,20 @@
 % Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/functions.R
 \name{plot_ecoregion_comparison}
 \alias{plot_ecoregion_comparison}
 \title{Plot Ecoregion Complexity Comparison (WA vs GA)}
 \usage{
 plot_ecoregion_comparison(eco_data)
 }
 \arguments{
 \item{eco_data}{An \code{sf} object containing ecoregion polygons. Must contain
 columns \code{STATE_NAME} and \code{US_L3NAME}.}
 }
 \value{
 A \code{patchwork} object containing the combined plot.
 }
 \description{
 Generates a side-by-side comparison of Level III ecoregions for Washington
 and Georgia. It uses a "void" theme, qualitative colors, and carefully tuned
 label repulsion settings to avoid overlapping text.
 }
--- a/man/plot_failure_mechanism.Rd
+++ b/man/plot_failure_mechanism.Rd
@@ -0,0 +1,23 @@
 % Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/functions.R
 \name{plot_failure_mechanism}
 \alias{plot_failure_mechanism}
 \title{Plot Failure Mechanism Comparison}
 \usage{
 plot_failure_mechanism(aoa_data, pred_data, boundaries)
 }
 \arguments{
 \item{aoa_data}{Dataframe containing the AOA results (must have 'di', 'lon', 'lat').}
 \item{pred_data}{Dataframe containing prediction results (columns: .pred_class, forested, lon, lat).}
 \item{boundaries}{An \code{sf} object containing state boundaries (must include "GA" or "Georgia").}
 }
 \value{
 A \code{patchwork} object containing the side-by-side comparison.
 }
 \description{
 Creates a side-by-side diagnostic plot returning a patchwork object.
 (a) The Area of Applicability (Dissimilarity Index) showing where the model is extrapolating.
 (b) The spatial distribution of actual classification errors.
 }
--- a/man/plot_ga_comparison_map.Rd
+++ b/man/plot_ga_comparison_map.Rd
@@ -0,0 +1,21 @@
 % Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/functions.R
 \name{plot_ga_comparison_map}
 \alias{plot_ga_comparison_map}
 \title{Plot Georgia Forest Comparison}
 \usage{
 plot_ga_comparison_map(pred_data, boundaries)
 }
 \arguments{
 \item{pred_data}{Dataframe containing prediction results (columns: .pred_class, forested, lon, lat).}
 \item{boundaries}{An \code{sf} object containing state boundaries (must include "GA" or "Georgia").}
 }
 \value{
 A \code{patchwork} object containing the labeled comparison plot.
 }
 \description{
 Creates a side-by-side comparison of forest cover for Georgia.
 The left plot (a) is the Model Prediction, and the right plot (b) is the Actual Data.
 Features a shared right-side legend and standardized spatial styling.
 }
--- a/man/plot_georgia_aoa.Rd
+++ b/man/plot_georgia_aoa.Rd
@@ -0,0 +1,17 @@
 % Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/functions.R
 \name{plot_georgia_aoa}
 \alias{plot_georgia_aoa}
 \title{Plot Georgia Area of Applicability (AOA)}
 \usage{
 plot_georgia_aoa(aoa_sf)
 }
 \arguments{
 \item{aoa_sf}{An \code{sf} object containing the 'di' column (output of \code{calculate_ga_aoa}).}
 }
 \value{
 A \code{ggplot} object showing the Dissimilarity Index map.
 }
 \description{
 Plots the pre-calculated Dissimilarity Index (DI) for Georgia.
 }
--- a/man/plot_precip_hex_comparison.Rd
+++ b/man/plot_precip_hex_comparison.Rd
@@ -0,0 +1,34 @@
 % Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/functions.R
 \name{plot_precip_hex_comparison}
 \alias{plot_precip_hex_comparison}
 \title{Plot Annual Rainfall Comparison (Clipped Hexes)}
 \usage{
 plot_precip_hex_comparison(
  wa_data,
  ga_data,
  boundaries,
  bins = 30,
  max_limit = 2500
 )
 }
 \arguments{
 \item{wa_data}{Dataframe containing Washington data (requires 'precip_annual', 'lat', 'lon').}
 \item{ga_data}{Dataframe containing Georgia data (requires 'precip_annual', 'lat', 'lon').}
 \item{boundaries}{An \code{sf} object containing state boundaries.}
 \item{bins}{Integer. Number of hexes across the state width. Default is 30.}
 \item{max_limit}{Numeric. The visual cap for rainfall (mm) to ensure comparable scales. Default is 2500.}
 }
 \value{
 A \code{patchwork} object containing the side-by-side comparison.
 }
 \description{
 Creates a polished side-by-side comparison of annual precipitation.
 Hexagons are spatially generated and clipped to the exact state boundaries
 to eliminate "bleeding" edges. Uses \code{theme_forestry_void} with explicit
 font sizing to match topographic maps.
 }
--- a/man/plot_regional_comparison.Rd
+++ b/man/plot_regional_comparison.Rd
@@ -0,0 +1,23 @@
 % Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/functions.R
 \name{plot_regional_comparison}
 \alias{plot_regional_comparison}
 \title{Plot Regional Comparison of Forested Data (WA vs GA)}
 \usage{
 plot_regional_comparison(data, boundaries)
 }
 \arguments{
 \item{data}{A data frame containing the forest point data. Must contain
 columns \code{lon}, \code{lat}, \code{state}, and \code{forested}.}
 \item{boundaries}{An \code{sf} object containing state boundaries. Must contain
 a \code{NAME} column.}
 }
 \value{
 A \code{patchwork} object containing the combined side-by-side maps.
 }
 \description{
 Generates a side-by-side comparison of forest cover for Washington
 and Georgia. It handles font registration (Atkinson Hyperlegible), spatial
 transformations, and creates a combined plot with a shared legend.
 }
--- a/man/plot_rf_importance.Rd
+++ b/man/plot_rf_importance.Rd
@@ -0,0 +1,17 @@
 % Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/functions.R
 \name{plot_rf_importance}
 \alias{plot_rf_importance}
 \title{Plot Random Forest Variable Importance}
 \usage{
 plot_rf_importance(data)
 }
 \arguments{
 \item{data}{An sf object or data frame containing the 'forested' target and predictors.}
 }
 \description{
 Fits a ranger Random Forest model to the provided data, calculates
 permutation importance, and generates a lollipop chart. It distinguishes
 between spatial (lat/lon) and biophysical predictors.
 Uses the project's 'Atkinson' font theme via theme_forestry_plot().
 }
--- a/man/plot_spatial_exploration.Rd
+++ b/man/plot_spatial_exploration.Rd
@@ -0,0 +1,18 @@
 % Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/functions.R
 \name{plot_spatial_exploration}
 \alias{plot_spatial_exploration}
 \title{Plot Spatial Autocorrelation Exploration}
 \usage{
 plot_spatial_exploration(wa_data)
 }
 \arguments{
 \item{wa_data}{A dataframe or tibble containing elevation, lat, and lon columns.}
 }
 \value{
 A ggplot object showing standardized elevation vs. spatially lagged elevation.
 }
 \description{
 Generates a Moran Scatterplot to visualize spatial
 autocorrelation in elevation data using a 5km neighborhood.
 }
--- a/man/plot_state_topo.Rd
+++ b/man/plot_state_topo.Rd
@@ -0,0 +1,11 @@
 % Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/functions.R
 \name{plot_state_topo}
 \alias{plot_state_topo}
 \title{Create Single State Topo Plot}
 \usage{
 plot_state_topo(data, boundary_sf, raster_path, state_name)
 }
 \description{
 Generates a topo map with manually tuned label placement for Georgia.
 }
--- a/man/plot_theme_diagnostic.Rd
+++ b/man/plot_theme_diagnostic.Rd
@@ -0,0 +1,11 @@
 % Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/functions.R
 \name{plot_theme_diagnostic}
 \alias{plot_theme_diagnostic}
 \title{Simplified Theme Diagnostic}
 \usage{
 plot_theme_diagnostic()
 }
 \description{
 Uses built-in NC data to verify theme_forestry_spatial.
 }
--- a/man/plot_us_map.Rd
+++ b/man/plot_us_map.Rd
@@ -0,0 +1,11 @@
 % Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/functions.R
 \name{plot_us_map}
 \alias{plot_us_map}
 \title{Plot US Map with Forestry Theme}
 \usage{
 plot_us_map()
 }
 \description{
 Highlights Washington and Georgia using standardized presentation fonts.
 }
--- a/man/process_ecoregions.Rd
+++ b/man/process_ecoregions.Rd
@@ -0,0 +1,32 @@
 % Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/functions.R
 \name{process_ecoregions}
 \alias{process_ecoregions}
 \title{Process and Clip EPA Ecoregions}
 \usage{
 process_ecoregions(
  zip_path,
  target_states = c("Washington", "Georgia"),
  simplify_tol = 0.05
 )
 }
 \arguments{
 \item{zip_path}{Character string. The file path to the zipped EPA shapefile.}
 \item{target_states}{Character vector. The names of the states to clip the
 ecoregions to. Defaults to \code{c("Washington", "Georgia")}.}
 \item{simplify_tol}{Numeric. The simplification tolerance passed to
 \code{rmapshaper::ms_simplify}. Range is 0-1, where higher numbers remove more detail.
 Defaults to 0.05.}
 }
 \value{
 An \code{sf} object containing the processed ecoregions with standardized
 columns \code{US_L3NAME} and \code{STATE_NAME}.
 }
 \description{
 Extracts EPA Level III ecoregion data from a zipped shapefile,
 standardizes column names, and clips the geometry to specified state boundaries.
 It includes robust steps for geometry repair (handling spherical validity),
 small island removal, and simplification for optimized plotting.
 }
--- a/man/save_combined_topo.Rd
+++ b/man/save_combined_topo.Rd
@@ -0,0 +1,19 @@
 % Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/functions.R
 \name{save_combined_topo}
 \alias{save_combined_topo}
 \title{Save Combined Side-by-Side Topo Plot}
 \usage{
 save_combined_topo(
  wa_data,
  ga_data,
  wa_boundary,
  ga_boundary,
  wa_raster_path,
  ga_raster_path,
  output_path
 )
 }
 \description{
 Save Combined Side-by-Side Topo Plot
 }
--- a/man/save_error_map_png.Rd
+++ b/man/save_error_map_png.Rd
@@ -0,0 +1,26 @@
 % Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/functions.R
 \name{save_error_map_png}
 \alias{save_error_map_png}
 \title{Save Model Error Diagnostic Map}
 \usage{
 save_error_map_png(data, boundary_sf, raster_path, output_path)
 }
 \arguments{
 \item{data}{A data frame containing model predictions (must include '.pred_class',
 'forested', '.pred_Yes', 'lon', and 'lat').}
 \item{boundary_sf}{An \code{sf} object representing the state boundary.}
 \item{raster_path}{Character string. File path to the elevation raster (.tif).}
 \item{output_path}{Character string. File path where the PNG will be saved.}
 }
 \value{
 The \code{output_path} (invisible), for integration with \code{targets}.
 }
 \description{
 Generates a diagnostic map highlighting prediction errors. It plots
 misclassified points colored by the magnitude of the error (confidence in the wrong answer)
 over a hillshaded elevation background.
 }
--- a/man/save_outlier_map_png.Rd
+++ b/man/save_outlier_map_png.Rd
@@ -0,0 +1,24 @@
 % Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/functions.R
 \name{save_outlier_map_png}
 \alias{save_outlier_map_png}
 \title{Save Outlier Diagnostic Map}
 \usage{
 save_outlier_map_png(data, boundary_sf, raster_path, output_path)
 }
 \arguments{
 \item{data}{A data frame containing the analysis dataset (must include numeric columns and 'forested' factor).}
 \item{boundary_sf}{An \code{sf} object representing the state boundary (e.g., Washington).}
 \item{raster_path}{Character string. File path to the elevation raster (.tif).}
 \item{output_path}{Character string. File path where the PNG will be saved.}
 }
 \value{
 The \code{output_path} (invisible), for integration with \code{targets}.
 }
 \description{
 Generates a diagnostic map highlighting multivariate outliers (Z > 3)
 overlaid on a hillshaded elevation raster. Uses the standardized forestry theme.
 }
--- a/man/setup_forestry_fonts.Rd
+++ b/man/setup_forestry_fonts.Rd
@@ -0,0 +1,15 @@
 % Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/functions.R
 \name{setup_forestry_fonts}
 \alias{setup_forestry_fonts}
 \title{Register Project Fonts}
 \usage{
 setup_forestry_fonts()
 }
 \value{
 NULL (called for side effects)
 }
 \description{
 Registers 'Atkinson Hyperlegible Next' (Sans) and 'Atkinson Hyperlegible Mono'
 (Monospace) with the sysfonts package for use in R graphics.
 }
--- a/man/style_audit_table.Rd
+++ b/man/style_audit_table.Rd
@@ -0,0 +1,21 @@
 % Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/functions.R
 \name{style_audit_table}
 \alias{style_audit_table}
 \title{Style Audit Table (GT)}
 \usage{
 style_audit_table(data, title = NULL, subtitle = NULL)
 }
 \arguments{
 \item{data}{A data frame to be formatted.}
 \item{title}{Character string. The title of the table (optional).}
 \item{subtitle}{Character string. The subtitle of the table (optional).}
 }
 \value{
 A \code{gt_tbl} object ready for rendering.
 }
 \description{
 Converts a data frame into a formatted \code{gt} table with consistent
 styling for audit reports. Includes row striping, numeric formatting, and
 standardized font sizes.
 }
--- a/man/theme_forestry_plot.Rd
+++ b/man/theme_forestry_plot.Rd
@@ -0,0 +1,18 @@
 % Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/functions.R
 \name{theme_forestry_plot}
 \alias{theme_forestry_plot}
 \title{Standard Forestry Plot Theme (Cowplot + Atkinson)}
 \usage{
 theme_forestry_plot(font_size = 14, grid = TRUE)
 }
 \arguments{
 \item{font_size}{Integer. Base font size. Default is 14 (good for slides).}
 \item{grid}{Logical. If TRUE, adds a light gray grid (useful for presentations).}
 }
 \description{
 A standardized theme for non-spatial plots (scatter, bar, line).
 Based on cowplot::theme_cowplot(), it includes clean axes and a minimalist look.
 Uses 'Atkinson Hyperlegible Next' for all text.
 }
--- a/man/theme_forestry_spatial.Rd
+++ b/man/theme_forestry_spatial.Rd
@@ -0,0 +1,11 @@
 % Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/functions.R
 \name{theme_forestry_spatial}
 \alias{theme_forestry_spatial}
 \title{Standardized Spatial Theme (Atkinson)}
 \usage{
 theme_forestry_spatial(base_size = 16)
 }
 \description{
 High-visibility map theme for presentations using Atkinson fonts.
 }
--- a/man/theme_forestry_void.Rd
+++ b/man/theme_forestry_void.Rd
@@ -0,0 +1,11 @@
 % Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/functions.R
 \name{theme_forestry_void}
 \alias{theme_forestry_void}
 \title{Standardized Void Theme (Maximal Data Ink)}
 \usage{
 theme_forestry_void(base_size = 16)
 }
 \description{
 Removes axes/grids for shape-focused maps, but keeps project fonts.
 }
--- a/references.bib
+++ b/references.bib
@@ -0,0 +1,140 @@
@misc{epa_ecoregions_2013,
  title = {Level {{III}} and {{IV Ecoregions}} of the {{Continental United States}}},
  author = {{U.S. Environmental Protection Agency}},
  year = 2013,
  address = {Corvallis, OR},
  urldate = {2026-01-13}
 }
@article{frescino_fiesta_2023,
  title = {`{{FIESTA}}': A Forest Inventory Estimation and Analysis {{R}} Package},
  shorttitle = {`{{FIESTA}}'},
  author = {Frescino, Tracey S. and Moisen, Gretchen G. and Patterson, Paul L. and Toney, Chris and White, Grayson W.},
  year = 2023,
  month = jul,
  journal = {Ecography},
  volume = {2023},
  number = {7},
  pages = {e06428},
  issn = {0906-7590, 1600-0587},
  doi = {10.1111/ecog.06428},
  urldate = {2026-01-14},
  abstract = {Ecologists are increasingly relying on national forest inventories to address a wide variety of issues. The `FIESTA' R package (Forest Inventory ESTimation and Analysis) is a tool that enables customized investigations using the extensive sample-based inventory data collected across all lands in the US by the US Dept of Agriculture, Forest Service, Forest Inventory and Analysis (FIA) Program. To date, the complex nature of the FIA inventory constrains many users to conduct only limited analyses through existing tools with pre-specified geographic boundaries, timeframes, and auxiliary data under a single statistical estimation process. Yet, the rapid evolution of available remotely sensed data and statistical methods present the opportunity to conduct spatial and temporal analyses of forest attributes that are much more relevant to many pressing ecological, environmental, economic, and social issues in the US. The `FIESTA' package was developed to augment the current set of available tools by providing a flexible platform that accommodates evolving technologies and leading-edge estimation techniques. The package contains a collection of functions that can query FIA databases, summarize sample-based inventory data, extract and aggregate auxiliary spatial data, and generate estimates with associated variances. The `FIESTA' R package is available on CRAN (https://cran.r-project.org/package=FIESTA).},
  langid = {english}
 }
@book{kuhn_applied_2013,
  title = {Applied {{Predictive Modeling}}},
  author = {Kuhn, Max and Johnson, Kjell},
  year = 2013,
  publisher = {Springer},
  address = {New York, NY},
  doi = {10.1007/978-1-4614-6849-3},
  urldate = {2026-01-22},
  copyright = {http://www.springer.com/tdm},
  isbn = {978-1-4614-6848-6 978-1-4614-6849-3},
  langid = {english},
  keywords = {Model,Non-Linear,Predictive Models,R,Regression Models,Regression Trees}
 }
@book{kuhn_tidy_2022,
  title = {Tidy {{Modeling}} with {{R}}},
  author = {Kuhn, Max},
  year = 2022,
  publisher = {O'Reilly Media, Incorporated},
  address = {Sebastopol},
  urldate = {2026-01-13},
  collaborator = {Silge, Julia},
  isbn = {978-1-4920-9648-1 978-1-4920-9644-3},
  langid = {english}
 }
@article{omernik_ecoregions_1987,
  title = {Ecoregions of the {{Conterminous United States}}},
  author = {Omernik, James M.},
  year = 1987,
  month = mar,
  journal = {Annals of the Association of American Geographers},
  volume = {77},
  number = {1},
  pages = {118--125},
  issn = {0004-5608, 1467-8306},
  doi = {10.1111/j.1467-8306.1987.tb00149.x},
  urldate = {2026-01-13},
  langid = {english}
 }
@misc{pebesma_spatial_2025,
  title = {Spatial {{Data Science}}},
  author = {Pebesma, Edzer and Bivand, Roger},
  year = 2025,
  month = jan,
  urldate = {2026-01-17},
  howpublished = {https://r-spatial.org/book/},
  langid = {english},
  file = {/home/rkw/Zotero/storage/ZNFK3H6Q/book.html}
 }
@article{roberts_crossvalidation_2017,
  title = {Cross-validation Strategies for Data with Temporal, Spatial, Hierarchical, or Phylogenetic Structure},
  shorttitle = {Cross-Validation},
  author = {Roberts, David R. and Bahn, Volker and Ciuti, Simone and Boyce, Mark S. and Elith, Jane and Guillera-Arroita, Gurutzeta and Hauenstein, Severin and Lahoz-Monfort, Jos{\'e} J. and Schr{\"o}der, Boris and Thuiller, Wilfried and Warton, David I. and Wintle, Brendan A. and Hartig, Florian and Dormann, Carsten F.},
  year = 2017,
  month = aug,
  journal = {Ecography},
  volume = {40},
  number = {8},
  pages = {913--929},
  issn = {0906-7590, 1600-0587},
  doi = {10.1111/ecog.02881},
  urldate = {2026-01-11},
  abstract = {Ecological data often show temporal, spatial, hierarchical (random effects), or phylogenetic structure. Modern statistical approaches are increasingly accounting for such dependencies. However, when performing cross-validation, these structures are regularly ignored, resulting in serious underestimation of predictive error. One cause for the poor performance of uncorrected (random) cross-validation, noted often by modellers, are dependence structures in the data that persist as dependence structures in model residuals, violating the assumption of independence. Even more concerning, because often overlooked, is that structured data also provides ample opportunity for overfitting with non-causal predictors. This problem can persist even if remedies such as autoregressive models, generalized least squares, or mixed models are used. Block cross-validation, where data are split strategically rather than randomly, can address these issues. However, the blocking strategy must be carefully considered. Blocking in space, time, random effects or phylogenetic distance, while accounting for dependencies in the data, may also unwittingly induce extrapolations by restricting the ranges or combinations of predictor variables available for model training, thus overestimating interpolation errors. On the other hand, deliberate blocking in predictor space may also improve error estimates when extrapolation is the modelling goal. Here, we review the ecological literature on non-random and blocked cross-validation approaches. We also provide a series of simulations and case studies, in which we show that, for all instances tested, block cross-validation is nearly universally more appropriate than random cross-validation if the goal is predicting to new data or predictor space, or for selecting causal predictors. 
We recommend that block cross-validation be used wherever dependence structures exist in a dataset, even if no correlation structure is visible in the fitted model residuals, or if the fitted models account for such correlations.},
  langid = {english},
  file = {/home/rkw/Zotero/storage/JFMJE6FR/Roberts et al. - 2017 - Cross‐validation strategies for data with temporal, spatial, hierarchical, or phylogenetic structure.pdf}
 }
@article{tobler_computer_1970,
  title = {A {{Computer Movie Simulating Urban Growth}} in the {{Detroit Region}}},
  author = {Tobler, W. R.},
  year = 1970,
  month = jun,
  journal = {Economic Geography},
  publisher = {Routledge},
  urldate = {2026-01-22},
  abstract = {(1970). A Computer Movie Simulating Urban Growth in the Detroit Region. Economic Geography: Vol. 46, PROCEEDINGS International Geographical Union Commission on Quantitative Methods, pp. 234-240.},
  copyright = {\copyright{} 1970 Taylor and Francis Group, LLC},
  langid = {english},
  file = {/home/rkw/Zotero/storage/75EV82QZ/143141.html}
 }
@article{white_method_2025,
  title = {A Method for Empirically Assessing Small Area Estimators via Bootstrap-Weighted k-Nearest-Neighbor Artificial Populations, with Applications to Forest Inventory},
  author = {White, Grayson W and Wieczorek, Jerzy A and Cody, Zachariah W and Tan, Emily X and Chistolini, Jacqueline O and McConville, Kelly S and Frescino, Tracey S and Moisen, Gretchen G},
  editor = {Fassnacht, Fabian},
  year = 2025,
  month = nov,
  journal = {Forestry: An International Journal of Forest Research},
  pages = {cpaf071},
  issn = {0015-752X, 1464-3626},
  doi = {10.1093/forestry/cpaf071},
  urldate = {2026-01-14},
  abstract = {National Forest Inventories monitor forest attributes across a variety of spatial and temporal scales in a given country. Increased interest in reporting and management at smaller scales has driven National Forest Inventories to investigate and adopt small area estimation (SAE) due to the promise of increased precision at these scales. However, comparing and evaluating SAE models for a given application is inherently difficult. Typically, many areas lack enough data to check unit-level modeling assumptions or to assess unit-level predictions empirically; and no ground truth is available for checking area-level estimates. Design-based simulation from artificial populations can help with each of these issues, but only if the artificial populations realistically represent the application at hand and are not built using assumptions that inherently favor one SAE model over another. In this paper, we borrow ideas from random hot deck, approximate Bayesian bootstrap, and \$k\$ nearest neighbor imputation methods to propose a \$k\$ nearest neighbor-based approximation to approximate Bayesian bootstrap, for generating an artificial population when rich unit-level auxiliary data are available. We introduce diagnostic checks on the process of building the artificial population, and demonstrate how to use it for design-based simulation studies to compare and evaluate SAE models, using real data from the Forest Inventory and Analysis program of the United States Department of Agriculture Forest Service (the National Forest Inventory of the United States).},
  copyright = {https://creativecommons.org/licenses/by/4.0/},
  langid = {english}
 }
@article{white_small_2025,
  title = {Small Area Estimation of Forest Biomass via a Two-Stage Model for Continuous Zero-Inflated Data},
  author = {White, Grayson W. and Yamamoto, Josh K. and Elsyad, Dinan H. and Schmitt, Julian F. and Korsgaard, Niels H. and Hu, Jie Kate and Gaines, George C. and Frescino, Tracey S. and McConville, Kelly S.},
  year = 2025,
  month = jan,
  journal = {Canadian Journal of Forest Research},
  volume = {55},
  pages = {1--19},
  issn = {0045-5067, 1208-6037},
  doi = {10.1139/cjfr-2024-0149},
  urldate = {2026-01-14},
  abstract = {Nationwide Forest Inventories (NFIs) collect data on and monitor the trends of forests across the globe. Users of NFI data are increasingly interested in monitoring forest attributes such as biomass at fine geographic and temporal scales, resulting in a need for assessment and development of small area estimation techniques in forest inventory. We implement a small area estimator and parametric bootstrap estimator that account for zero-inflation in biomass data via a two-stage model-based approach and compare the performance to a Horvitz--Thompson estimator, a post-stratified estimator, and to the unit- and area-level empirical best linear unbiased prediction (EBLUP) estimators. We conduct a simulation study in Nevada with data from the United States NFI, the Forest Inventory and Analysis Program, and remote sensing data products. Results show the zero-inflated estimator has the lowest relative bias and the smallest empirical root mean square error. Moreover, the 95\% confidence interval coverages of the zero-inflated estimator and the unit-level EBLUP are more accurate than the other two estimators. To further illustrate the practical utility, we employ a data application across the 2019 measurement year in Nevada. We introduce the R package, saeczi, which efficiently implements the zero-inflated estimator and its mean squared error estimator.},
  langid = {english},
  file = {/home/rkw/Zotero/storage/VSX6A8MF/White et al. - 2025 - Small area estimation of forest biomass via a two-stage model for continuous zero-inflated data.pdf}
 }
--- a/renv.lock
+++ b/renv.lock
--- a/renv/.gitignore
+++ b/renv/.gitignore
@@ -0,0 +1,7 @@
 library/
 local/
 cellar/
 lock/
 python/
 sandbox/
 staging/
--- a/renv/activate.R
+++ b/renv/activate.R
--- a/renv/settings.json
+++ b/renv/settings.json
@@ -0,0 +1,19 @@
 {
  "bioconductor.version": null,
  "external.libraries": [],
  "ignored.packages": [],
  "package.dependency.fields": [
    "Imports",
    "Depends",
    "LinkingTo"
  ],
  "ppm.enabled": null,
  "ppm.ignored.urls": [],
  "r.version": null,
  "snapshot.type": "implicit",
  "use.cache": true,
  "vcs.ignore.cellar": true,
  "vcs.ignore.library": true,
  "vcs.ignore.local": true,
  "vcs.manage.ignores": true
 }
--- a/scripts/make_title_globe.R
+++ b/scripts/make_title_globe.R
@@ -0,0 +1,49 @@
 # scripts/make_title_globe.R
 library(ggplot2)
 library(sf)
 library(dplyr)
 library(maps)
 # 1. Setup Data ----
 # State polygons from the `maps` package, converted to an sf object. The
 # highlight layers below select rows through the `ID` column, which holds
 # lower-case state names ("washington", "georgia").
 states_sf <- sf::st_as_sf(maps::map("state", plot = FALSE, fill = TRUE))
 # 2. Colors ----
 wa_color <- "#D95F0E"
 ga_color <- "#00A88F"
 bg_fill <- "#f0f0f0"
 borders <- "#ffffff"
 ocean_col <- "#ffffff"
 # 3. Projection ----
 # Central meridian of the orthographic view. -102 centered the globe and
 # -72 pushed the states too far left; -87 (15 degrees east of -102) is the
 # chosen framing. The script previously documented -87 but still passed
 # -102 to coord_sf(), so the rotation was never applied.
 center_lon <- -87
 globe_crs <- paste0("+proj=ortho +lat_0=40 +lon_0=", center_lon)
 # 4. Create the Plot ----
 p <- ggplot() +
  # Graticules every 10 degrees
  geom_sf(
    data = sf::st_graticule(lat = seq(-90, 90, 10), lon = seq(-180, 180, 10)),
    color = "#e0e0e0", linewidth = 0.1
  ) +
  # Background states
  geom_sf(
    data = states_sf,
    fill = bg_fill, color = borders, linewidth = 0.3
  ) +
  # Highlights: drawn after the background layer so they sit on top
  geom_sf(
    data = states_sf %>% filter(ID == "washington"),
    fill = wa_color, color = borders, linewidth = 0.3
  ) +
  geom_sf(
    data = states_sf %>% filter(ID == "georgia"),
    fill = ga_color, color = borders, linewidth = 0.3
  ) +
  coord_sf(crs = globe_crs) +
  # Theme: no axes or grid; panel and plot backgrounds share the ocean color
  theme_void() +
  theme(
    panel.background = element_rect(fill = ocean_col, color = NA),
    plot.background = element_rect(fill = ocean_col, color = NA)
  )
 # 5. Save ----
 if (!dir.exists("assets")) {
  dir.create("assets")
 }
 ggsave(
  "assets/study_sites_globe.png",
  plot = p, width = 10, height = 10, dpi = 300
 )
 message("Success! Globe rotated 15 degrees.")
		`@@ -0,0 +1,2 @@`
							`YEAR: 2026`
							`COPYRIGHT HOLDER: Rob Wiederstein`