initial commit

2026-02-10 04:52:37 -05:00
commit 0476f6f8f8
65 changed files with 15368 additions and 0 deletions
--- a/_targets.R
+++ b/_targets.R
@@ -0,0 +1,481 @@
+library(targets)
+library(tarchetypes)
+
+# 1. Options ----
+tar_option_set(
+  packages = c(
+    "colorspace",
+    "elevatr", 
+    "forested", 
+    "ggcorrplot", 
+    "ggrepel",
+    "ggspatial",
+    "gt", 
+    "magrittr",
+    "patchwork", 
+    "processx",
+    "psych", 
+    "quarto",
+    "ranger", 
+    "rmapshaper",
+    "sf",
+    "showtext",
+    "spatialsample",
+    "stringr",
+    "terra", 
+    "tidyterra",
+    "tidymodels",
+    "tidyverse",
+    "tigris",
+    "xgboost",   # For XGBoost
+    "earth",
+    "withr"
+  ),
+  format = "rds"
+)
+
+tar_source("R/functions.R")
+
+# 3. The Pipeline ----
+list(
+  # constants 
+  tar_target(n_folds, 10),
+  # Data Ingestion
+  tar_target(forested_wa, forested::forested_wa),
+  tar_target(forested_ga, forested::forested_ga),
+  tar_target(
+    wa_sf,
+    forested_wa %>% 
+      sf::st_as_sf(coords = c("lon", "lat"), crs = 4326, remove = FALSE)
+  ),
+  tar_target(
+    name = eco_url,
+    command = "https://dmap-prod-oms-edc.s3.us-east-1.amazonaws.com/ORD/Ecoregions/us/us_eco_l3.zip",
+    format = "url" 
+  ),
+  tar_target(
+    name = data_dir,
+    command = "data/epa",
+    format = "file" # Tracks the directory
+  ),
+  # Download data
+  tar_target(
+    name = eco_zip_file,
+    command = get_epa_ecoregions(url = eco_url, dest_dir = data_dir),
+    format = "file"
+  ),
+  # Data Processing
+  tar_target(forested_us, combine_forest(wa_data = forested_wa, ga_data = forested_ga)),
+  tar_target(boundary_wa_sf, fetch_state_boundary(state = "Washington")),
+  tar_target(boundary_ga_sf, fetch_state_boundary(state = "Georgia")),
+  tar_target(
+    name = eco_data,
+    command = process_ecoregions(
+      zip_path = eco_zip_file, 
+      target_states = c("Washington", "Georgia"),
+      simplify_tol = 0.05
+    )
+  ),
+  # Raster File Target
+  tar_target(wa_elev_file, 
+             create_elevation_raster(boundary_wa_sf, "data/wa_elevation.tif"), 
+             format = "file"),
+  tar_target(ga_elev_file, 
+             create_elevation_raster(boundary_ga_sf, "data/ga_elevation.tif"), 
+             format = "file"),
+  
+  # Maps
+  tar_target(fig_us_map, plot_us_map()),
+  tar_target(
+    name = fig_us_map_file,
+    command = helper_save_fig(
+      plot_obj = fig_us_map,
+      name = "us_forests",
+      width = 10,
+      height = 5.25,
+      type = "map"
+    ),
+    format = "file"
+  ),
+  tar_target(wa_ga_map, fetch_study_area(c("Washington", "Georgia"))),
+  tar_target(map_wa_ga_regional, plot_regional_comparison(forested_us, wa_ga_map)),
+  tar_target(
+    name = fig_wa_ga_regional_file,
+    command = helper_save_fig(
+      plot_obj = map_wa_ga_regional,
+      name = "wa_ga_forests",
+      width = 9.2,
+      height = 4.25,
+      type = "map"
+    ),
+    format = "file"
+  ),
+  tar_target(
+    name = ecoregion_plot,
+    command = plot_ecoregion_comparison(eco_data)
+  ),
+  tar_target(
+    name = ecoregion_plot_file,
+    command = helper_save_fig(
+      plot_obj = ecoregion_plot, 
+      name = "wa_ga_ecoregions",
+      width = 10,
+      height = 4.25,
+      type = "map"
+    ),
+    format = "file" # Tells targets to watch the actual .png file
+  ),
+  tar_target(
+    name = combined_topo_map,
+    command = save_combined_topo(
+      wa_data = forested_wa,
+      ga_data = forested_ga,
+      wa_boundary = boundary_wa_sf,
+      ga_boundary = boundary_ga_sf,
+      wa_raster_path = wa_elev_file,
+      ga_raster_path = ga_elev_file,
+      output_path = "figs/combined_topo.png"
+    ),
+    format = "file"
+  ),
+  tar_target(
+    map_precip_hex,
+    plot_precip_hex_comparison(
+      wa_data = forested_wa,
+      ga_data = forested_ga,
+      boundaries = wa_ga_map,
+      bins = 50
+    )
+  ),
+  tar_target(plot_cv_comparison, plot_cv_strategies(forested_wa)),
+  # fold mechanics
+  tar_target(
+    fig_fold_mechanics,
+    plot_fold_mechanics(wa_sf, boundary_wa_sf)
+  ),
+  # fold diagram
+  tar_target(fig_classic_cv, plot_classic_kfold_diagram()),
+  # Analysis
+  tar_target(tbl_forest_wa, format_summary_table(forested_wa)),
+  
+  tar_target(plot_distrib_wa, plot_forest_distributions(forested_wa)),
+  tar_target(plt_outliers, identify_outliers(forested_wa)),
+  
+  tar_target(map_wa_outliers, 
+             save_outlier_map_png(forested_wa, boundary_wa_sf, wa_elev_file, "figs/wa_outliers.png"),
+             format = "file"),
+  tar_target(plt_wa_pca, plot_wa_pca(forested_wa)),
+  tar_target(
+    name = p_moran_exploration,
+    command = plot_spatial_exploration(forested_wa)
+  ),
+  # correlogram
+  tar_target(plt_correlogram, plot_correlations(forested_wa)),
+  
+  # vip plot
+  tar_target(plt_vip, plot_rf_importance(forested_wa)),
+  # umap plot
+  tar_target(umap_plot, plot_umap_forested(forested_wa)),
+  
+  # 1. Data Splitting -------------------------------------------------
+  
+  # Define the split (80% Train, 20% Test)
+  tar_target(splits, initial_split(wa_sf, prop = 0.80, strata = forested)),
+  
+  # Extract the Training Set (Used for Resampling/Modeling)
+  tar_target(train_data, training(splits)),
+  
+  # Extract the Test Set (Locked away until the very end)
+  tar_target(test_data, testing(splits)),
+  # 2. Recipes ----
+  ## A: Base (Includes Lat/Lon) ----
+  tar_target(
+    recipe_base,
+    recipe(forested ~ ., data = train_data) %>%
+      update_role(geometry, new_role = "id") %>%
+      step_novel(all_nominal_predictors()) %>%
+      step_dummy(all_nominal_predictors()) %>%
+      step_zv(all_predictors()) %>%
+      step_normalize(all_numeric_predictors())
+  ),
+  
+  ## B: Non-Spatial (Bio Only) ----
+  tar_target(
+    recipe_non_spatial,
+    recipe(forested ~ ., data = train_data) %>%
+      update_role(geometry, lat, lon, new_role = "id") %>%
+      step_novel(all_nominal_predictors()) %>%
+      step_dummy(all_nominal_predictors()) %>%
+      step_zv(all_predictors()) %>%
+      step_normalize(all_numeric_predictors())
+  ),
+  
+  ## C: Extensible (Feature Engineered) ----
+  tar_target(
+    recipe_extensible,
+    recipe(forested ~ ., data = train_data) %>%
+      update_role(geometry, lat, lon, new_role = "id") %>%
+      step_rm(northness, county, year) %>%
+      step_ratio(precip_annual, denom = denom_vars(temp_annual_max)) %>%
+      step_mutate(
+        temp_range = temp_annual_max - temp_annual_min,
+        vpd_range = vapor_max - vapor_min
+      ) %>%
+      step_YeoJohnson(elevation) %>%
+      step_novel(all_nominal_predictors()) %>%
+      step_dummy(all_nominal_predictors()) %>%
+      step_zv(all_predictors()) %>%
+      step_normalize(all_numeric_predictors())
+  ),
+  tar_target(
+    plot_yeo,
+    plot_yeo_johnson(forested_wa)
+  ),
+  # 3. Engines ----
+  ## Logistic Regression ----
+  tar_target(
+    spec_logistic,
+    logistic_reg() %>% 
+      set_engine("glm") %>% 
+      set_mode("classification")
+  ),
+  
+  ## MARS ----
+  tar_target(
+    spec_mars,
+    mars(num_terms = 10, prod_degree = 2) %>% 
+      set_engine("earth", nfold = 1) %>%  # nfold=1 prevents internal CV (speed)
+      set_mode("classification")
+  ),
+  
+  ## Random Forest ----
+  tar_target(
+    spec_rf,
+    rand_forest(trees = 1000, min_n = 10) %>% 
+      set_engine("ranger", 
+                 importance = "impurity", # Calculate variable importance
+                 num.threads = 1) %>%     # <--- Server Safety Lock
+      set_mode("classification")
+  ),
+  
+  ## XGBoost ----
+  tar_target(
+    spec_xgb,
+    boost_tree(trees = 1000, tree_depth = 6, learn_rate = 0.01) %>% 
+      set_engine("xgboost", 
+                 nthread = 1) %>%         # <--- Server Safety Lock
+      set_mode("classification")
+  ),
+  # 4. The Workflow Set ----
+  # Crosses every recipe with every model (2 x 4 = 8 workflows)
+  tar_target(
+    model_set,
+    workflow_set(
+      preproc = list(base = recipe_base, 
+                     non_spatial = recipe_non_spatial,
+                     extensible = recipe_extensible),
+      models = list(
+        log = spec_logistic, 
+        rf = spec_rf, 
+        xgb = spec_xgb, 
+        mars = spec_mars
+      ),
+      cross = TRUE
+    )
+  ),
+  # 5. Resampling Strategies -----
+  
+  ## A. Random Folds ----
+  tar_target(
+    folds_random,
+    vfold_cv(train_data, v = n_folds, strata = forested)
+  ),
+  
+  ## B. Spatial Blocks ----
+  tar_target(
+    folds_block,
+    spatial_block_cv(train_data, v = n_folds) 
+  ),
+  
+  ## C. Spatial Clustering ----
+  tar_target(
+    folds_cluster,
+    spatial_clustering_cv(train_data, v = n_folds) 
+  ),
+  # 6. Fit Models -----
+  
+  ## Branch 1: Random CV ----
+  tar_target(
+    results_random,
+    workflow_map(
+      model_set, 
+      "fit_resamples", 
+      resamples = folds_random,
+      metrics = metric_set(roc_auc, accuracy, pr_auc),
+      verbose = TRUE
+    )
+  ),
+  
+  ## Branch 2: Block CV ----
+  tar_target(
+    results_block,
+    workflow_map(
+      model_set, 
+      "fit_resamples", 
+      resamples = folds_block,
+      metrics = metric_set(roc_auc, accuracy, pr_auc),
+      verbose = TRUE
+    )
+  ),
+  
+  ## Branch 3: Cluster CV ----
+  tar_target(
+    results_cluster,
+    workflow_map(
+      model_set, 
+      "fit_resamples", 
+      resamples = folds_cluster,
+      metrics = metric_set(roc_auc, accuracy, pr_auc),
+      verbose = TRUE
+    )
+  ),
+  # 7. Results ----
+  tar_target(
+    fig_cv_comparison,
+    plot_spatial_cv_comparison(results_random, results_block, results_cluster)
+  ),
+  tar_target(
+    fig_model_stability,
+    plot_model_stability(results_random, results_block, results_cluster, best_model_id)
+  ),
+  # 8. Select and Tune the Best Model ----
+  tar_target(
+    best_model_id,
+    results_cluster %>% 
+      rank_results(rank_metric = "roc_auc", select_best = TRUE) %>% 
+      slice(1) %>% 
+      pull(wflow_id)
+  ),
+  tar_target(
+    tbl_model_performance,
+    results_cluster %>% 
+      rank_results(rank_metric = "roc_auc", select_best = TRUE) %>% 
+      filter(.metric == "roc_auc")
+  ),
+  
+  # 9. Final Fit ----
+  tar_target(
+    final_fit_results,
+    last_fit(
+      extract_workflow(model_set, best_model_id),
+      split = splits, # Your original 80/20 split
+      metrics = metric_set(roc_auc, accuracy)
+    )
+  ),
+  
+  # 10. Test Set Performance Plot ----
+  tar_target(
+    fig_final_performance,
+    plot_final_test_results(final_fit_results) # Use the specific plotting function
+  ),
+  tar_target(
+    tbl_performance,
+    create_performance_table(results_cluster, final_fit_results)
+  ),
+  # 11. Confusion Matrix ----
+  tar_target(
+    fig_confusion_matrix,
+    plot_final_confusion_matrix(final_fit_results)
+  ),
+  tar_target(
+    test_predictions,
+    collect_predictions(final_fit_results) %>%
+      dplyr::bind_cols(
+        rsample::testing(splits) %>% 
+          dplyr::select(lat, lon)
+      )
+  ),
+  tar_target(
+    map_wa_errors,
+    save_error_map_png(
+      data = test_predictions,  # <--- Use the extracted data here
+      boundary_sf = boundary_wa_sf,
+      raster_path = wa_elev_file,
+      output_path = "figs/wa_errors.png"
+    ),
+    format = "file"
+  ),
+  # Georgia ----
+  tar_target(
+    model_predictors,
+    c("elevation", "precip_annual", "temp_annual_mean", "roughness")
+  ),
+  tar_target(
+    ga_aoa_data,
+    calculate_ga_aoa(
+      train_data = forested_wa,
+      test_data = forested_ga,
+      predictors = model_predictors
+    )
+  ),
+  tar_target(
+    plot_aoa_ga,
+    plot_georgia_aoa(
+      aoa_sf = ga_aoa_data 
+    )
+  ),
+  
+  # 3. Predict on Georgia using the Washington Model
+  tar_target(
+    ga_predictions,
+    predict_external_region(
+      final_fit = final_fit_results, 
+      new_data = forested_ga         
+    )
+  ),
+  
+  # 4. Map the Predictions
+  tar_target(
+    map_ga_probs,
+    plot_ga_comparison_map(
+      pred_data = ga_predictions,
+      boundaries = boundary_ga_sf # <--- CHECK THIS NAME
+    )
+  ),
+  # 5. Confusion Matrix for Georgia
+  tar_target(
+    ga_conf_mat,
+    plot_ga_confusion_matrix(ga_predictions)
+  ),
+  # 6. Map of Errors (False Positives + False Negatives)
+  tar_target(
+    map_failure_mechanism,
+    plot_failure_mechanism(
+      aoa_data = ga_aoa_data,      # <--- Reads the SAME data target
+      pred_data = ga_predictions,  
+      boundaries = boundary_ga_sf
+    )
+  ),
+  # Report ----
+  tar_target(
+    name = report,
+    command = {
+      # 1. Temporarily disable renv auto-loader so Quarto uses system libs
+      if (file.exists(".Rprofile")) file.rename(".Rprofile", "hold_Rprofile")
+      
+      # 2. Use a 'tryCatch' to ensure the .Rprofile is restored even if render fails
+      res <- tryCatch({
+        quarto::quarto_render("index.qmd", quiet = FALSE)
+      }, error = function(e) {
+        if (file.exists("hold_Rprofile")) file.rename("hold_Rprofile", ".Rprofile")
+        stop(e)
+      })
+      
+      # 3. Restore the .Rprofile
+      if (file.exists("hold_Rprofile")) file.rename("hold_Rprofile", ".Rprofile")
+      
+      "index.html"
+    },
+    format = "file"
+  )
+)