From b38892f49ecde9796f28c9f417f9d87bace87792 Mon Sep 17 00:00:00 2001
From: Rob Wiederstein <khuon68@gmail.com>
Date: Sun, 22 Feb 2026 03:52:34 -0500
Subject: [PATCH] Refactor: consistent naming across functions, targets, and
 pkgdown

Functions: prepare_eda_recipe -> build_eda_recipe,
           create_efficiency_plot -> plot_efficiency,
           format_class_imbalance_tourney_gt -> format_tournament_gt

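Targets: model_inputs_prefix -> baf_model_input_prefix,
         tbl_fraud_by_month_data -> fraud_by_month_summary,
         model_diag -> diag_fit, winning_params -> best_params,
         production_recipe_blueprint -> prod_recipe,
         final_eval_data -> test_predictions.
         Also regrouped _targets.R by pipeline stage and gathered the
         save_report_figure()/save_report_table() file targets into
         two blocks ahead of report assembly.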

pkgdown: restructured reference index into 6 logical sections,
         removed stale names and development comments, and replaced
         the starts_with() selectors with explicit topic lists.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 NAMESPACE                                     |   4 +-
 R/functions.R                                 |  14 +-
 _pkgdown.yml                                  |  53 ++--
 _targets.R                                    | 272 +++++++-----------
 ...pare_eda_recipe.Rd => build_eda_recipe.Rd} |  10 +-
 ..._tourney_gt.Rd => format_tournament_gt.Rd} |   8 +-
 ..._efficiency_plot.Rd => plot_efficiency.Rd} |  10 +-
 7 files changed, 159 insertions(+), 212 deletions(-)
 rename man/{prepare_eda_recipe.Rd => build_eda_recipe.Rd} (55%)
 rename man/{format_class_imbalance_tourney_gt.Rd => format_tournament_gt.Rd} (68%)
 rename man/{create_efficiency_plot.Rd => plot_efficiency.Rd} (50%)

diff --git a/NAMESPACE b/NAMESPACE
index 7aa014e..4aec14e 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -1,14 +1,15 @@
 # Generated by roxygen2: do not edit by hand
 
 export(build_baf_recipe)
+export(build_eda_recipe)
 export(clean_baf_base)
 export(compute_fraud_by_month)
 export(connect_baf)
 export(convert_to_parquet)
 export(engineer_features)
 export(evaluate_final_model)
-export(format_class_imbalance_tourney_gt)
 export(format_fraud_by_month_gt)
+export(format_tournament_gt)
 export(generate_model_inputs)
 export(plot_conf_mat_heatmap)
 export(plot_fraud_by_month)
@@ -16,7 +17,6 @@ export(plot_hexbin_interaction)
 export(plot_missingness)
 export(plot_num_cor)
 export(plot_var_imp)
-export(prepare_eda_recipe)
 export(render_slides)
 export(run_imbalance_tournament)
 export(save_report_figure)
diff --git a/R/functions.R b/R/functions.R
index 316e25c..ef5777d 100644
--- a/R/functions.R
+++ b/R/functions.R
@@ -580,9 +580,9 @@ run_imbalance_tournament <- function(
   return(results_df)
 }
 
-#' Format Class Imbalance Tournament Table
+#' Format Tournament Results Table
 #'
-#' Aggregates results from the model tournament and performs paired t-tests 
+#' Aggregates results from the model tournament and performs paired t-tests
 #' against the 'Standard' model to determine statistical significance.
 #'
 #' @param results_df The tibble output from `run_imbalance_tournament`.
@@ -593,7 +593,7 @@ run_imbalance_tournament <- function(
 #'
 #' @return A formatted gt table object.
 #' @export
-format_class_imbalance_tourney_gt <- function(results_df) {
+format_tournament_gt <- function(results_df) {
   
   # Extract scores for the 'Standard' recipe to use as the baseline for t-tests
   standard_scores <- results_df |> 
@@ -648,12 +648,12 @@ format_class_imbalance_tourney_gt <- function(results_df) {
     )
 }
 
-#' Create Effectiveness vs Efficiency Plot
+#' Plot Effectiveness vs Efficiency
 #' @param results_df Tibble from run_imbalance_tournament
 #' @importFrom ggplot2 ggplot aes geom_point scale_color_manual labs theme_minimal
 #' @importFrom ggrepel geom_text_repel
 #' @importFrom cowplot theme_half_open background_grid
-create_efficiency_plot <- function(results_df) {
+plot_efficiency <- function(results_df) {
   # Aggregate by recipe
   plot_data <- results_df |>
     dplyr::group_by(recipe) |>
@@ -677,11 +677,11 @@ create_efficiency_plot <- function(results_df) {
     cowplot::theme_half_open(font_family = "Atkinson Hyperlegible") +
     cowplot::background_grid(major = "y")
 }
-#' Prepare EDA Recipe
+#' Build EDA Recipe
 #' @param eda_data Raw EDA data
 #' @importFrom recipes recipe update_role step_novel step_unknown step_impute_median step_dummy all_nominal_predictors all_numeric_predictors prep
 #' @export
-prepare_eda_recipe <- function(eda_data) {
+build_eda_recipe <- function(eda_data) {
   recipe(outcome ~ ., data = eda_data) |>
     update_role(month, new_role = "ID") |>
     step_novel(all_nominal_predictors()) |>
diff --git a/_pkgdown.yml b/_pkgdown.yml
index cae461f..89d3ae3 100644
--- a/_pkgdown.yml
+++ b/_pkgdown.yml
@@ -2,7 +2,7 @@ url: https://docs.robwiederstein.org/baflakehouse
 
 template:
   bootstrap: 5
-  bootswatch: flatly # Clean, professional look
+  bootswatch: flatly
 
 navbar:
   structure:
@@ -15,44 +15,51 @@ navbar:
 
 reference:
   - title: "Data Ingestion & Lakehouse Setup"
-    desc: "Functions for moving data from CSV to partitioned Parquet in MinIO."
+    desc: "Functions for moving raw CSV data into the MinIO Lakehouse as partitioned Parquet."
     contents:
       - baflakehouse-package
       - convert_to_parquet
       - connect_baf
       - clean_baf_base
-      
+
   - title: "Feature Engineering & Preprocessing"
-    desc: "The 'Recipes' layer of the pipeline."
+    desc: "Recipes and transformations applied across the pipeline layers."
     contents:
       - engineer_features
-      - prepare_eda_recipe
-      - build_baf_recipe          # NEW: Untrained blueprint for production
       - generate_model_inputs
+      - build_eda_recipe
+      - build_baf_recipe
 
-  - title: "The Tournament (Model Selection)"
-    desc: "Cross-validation and imbalance strategy testing."
+  - title: "Exploratory Data Analysis"
+    desc: "Diagnostic model and visualizations for understanding the fraud signal."
+    contents:
+      - train_diag_model
+      - plot_var_imp
+      - plot_hexbin_interaction
+      - plot_missingness
+      - plot_num_cor
+
+  - title: "Model Selection & Tuning"
+    desc: "Imbalance strategy tournament, hyperparameter tuning, and results formatting."
     contents:
       - run_imbalance_tournament
       - tune_lgbm
-      - train_diag_model
-      - create_efficiency_plot    # Moved here: Belongs with the tournament
+      - format_tournament_gt
+      - plot_efficiency
 
   - title: "Final Evaluation & Production Deployment"
-    desc: "Results on unseen data (Months 6-7) and MinIO artifact serialization."
+    desc: "Holdout evaluation on months 6-7 and MinIO model artifact serialization."
     contents:
       - evaluate_final_model
-      - train_production_model    # NEW: The final deployment function
+      - train_production_model
 
-  - title: "Reporting: Tables & Visualizations"
-    desc: "Generating ggplot2 figures and gt tables for Quarto."
+  - title: "Reporting"
+    desc: "Figures, tables, and slide rendering for the Quarto presentation."
     contents:
-      - starts_with("plot_")
-      - starts_with("compute_")
-      - starts_with("format_")    # Neatly catches all your gt table formatters
-
-  - title: "Pipeline Utilities"
-    desc: "Internal helpers for the targets workflow and slide generation."
-    contents:
-      - starts_with("save_report_")
-      - render_slides             # Consolidated here
\ No newline at end of file
+      - plot_fraud_by_month
+      - plot_conf_mat_heatmap
+      - compute_fraud_by_month
+      - format_fraud_by_month_gt
+      - save_report_figure
+      - save_report_table
+      - render_slides
diff --git a/_targets.R b/_targets.R
index 7dc43d3..6ba8db5 100644
--- a/_targets.R
+++ b/_targets.R
@@ -19,9 +19,9 @@ tar_option_set(
     "scales",
     "ggplot2",
     "quarto",
-    "corrr",  
-    "recipes",  
-    "themis", 
+    "corrr",
+    "recipes",
+    "themis",
     "tidyselect"
   )
 )
@@ -37,7 +37,7 @@ list(
       bucket_name = "baf-fraud"
     )
   ),
-  
+
   tar_target(
     baf_primary_prefix,
     clean_baf_base(
@@ -49,7 +49,7 @@ list(
       verbose = TRUE
     )
   ),
-  
+
   tar_target(
     baf_feature_prefix,
     engineer_features(
@@ -61,98 +61,60 @@ list(
       verbose = TRUE
     )
   ),
-  
-  # ---- Figure objects ----
+
+  # ---- 05_model_input Generation ----
   tar_target(
-    fig_fraud_by_month,
-    plot_fraud_by_month(baf_primary_prefix, bucket_name = "baf-fraud")
+    baf_model_input_prefix,
+    generate_model_inputs(
+      feature_prefix = baf_feature_prefix,
+      out_prefix = "05_model_input",
+      bucket_name = "baf-fraud"
+    )
   ),
-  
-  # ---- Saved figure path (file target) ----
-  tar_target(
-    fig_fraud_by_month_path,
-    save_report_figure(
-      fig_fraud_by_month,
-      filename = "fig_fraud_by_month.png",
-      out_dir = "reports/figures"
-    ),
-    format = "file"
-  ),
-  tar_target(
-    tbl_fraud_by_month_data,
-    compute_fraud_by_month(baf_primary_prefix)
-  ),
-  
-  tar_target(
-    tbl_fraud_by_month_gt,
-    format_fraud_by_month_gt(tbl_fraud_by_month_data)
-  ),
-  
-  tar_target(
-    tbl_fraud_by_month_path,
-    save_report_table(tbl_fraud_by_month_gt, filename = "tbl_fraud_by_month.rds"),
-    format = "file"
-  ),
-  
-  # ---- Exploratory Data Analysis (EDA) Layer ----
+
+  # ---- EDA Layer ----
   tar_target(
     data_eda_m0,
     connect_baf(baf_primary_prefix, use_duckdb = TRUE) |>
       filter(month == 0) |>
       collect()
   ),
-  
+
   tar_target(
     eda_recipe,
-    prepare_eda_recipe(data_eda_m0)
+    build_eda_recipe(data_eda_m0)
   ),
-  
+
   tar_target(
     data_baked_eda_m0,
     bake(eda_recipe, new_data = data_eda_m0)
   ),
-  
+
   tar_target(
-    model_diag,
+    diag_fit,
     train_diag_model(data_baked_eda_m0)
   ),
-  
+
   # ---- EDA Figures ----
-  tar_target(fig_var_imp, plot_var_imp(model_diag)),
+  tar_target(fig_var_imp,            plot_var_imp(diag_fit)),
   tar_target(fig_hexbin_interaction, plot_hexbin_interaction(data_baked_eda_m0)),
-  tar_target(fig_missingness, plot_missingness(data_eda_m0)),
-  tar_target(fig_num_cor, plot_num_cor(data_eda_m0)),
-  
-  # ---- Saved EDA Figure Paths ----
+  tar_target(fig_missingness,        plot_missingness(data_eda_m0)),
+  tar_target(fig_num_cor,            plot_num_cor(data_eda_m0)),
+
+  # ---- Fraud Prevalence ----
   tar_target(
-    fig_var_imp_path, 
-    save_report_figure(fig_var_imp, "fig_var_imp.png"), 
-    format = "file"
+    fig_fraud_by_month,
+    plot_fraud_by_month(baf_primary_prefix, bucket_name = "baf-fraud")
   ),
+
   tar_target(
-    fig_hexbin_interaction_path, 
-    save_report_figure(fig_hexbin_interaction, "fig_hexbin_interaction.png"), 
-    format = "file"
+    fraud_by_month_summary,
+    compute_fraud_by_month(baf_primary_prefix)
   ),
+
   tar_target(
-    fig_missingness_path, 
-    save_report_figure(fig_missingness, "fig_missingness.png"), 
-    format = "file"
-  ),
-  tar_target(
-    fig_num_cor_path, 
-    save_report_figure(fig_num_cor, "fig_num_cor.png"), 
-    format = "file"
-  ),
-  
-  # ---- 05_model_input Generation ----
-  tar_target(
-    model_inputs_prefix,
-    generate_model_inputs(
-      feature_prefix = baf_feature_prefix, 
-      out_prefix = "05_model_input",
-      bucket_name = "baf-fraud"
-    )
+    tbl_fraud_by_month_gt,
+    format_fraud_by_month_gt(fraud_by_month_summary)
   ),
 
   # ---- Tournament Inputs ----
@@ -161,14 +123,14 @@ list(
     tibble::tribble(
       ~recipe_name, ~data_folder, ~scale_pos_weight,
       "Standard",   "baseline",   1,
-      "Weighted",   "baseline",   4,  
+      "Weighted",   "baseline",   4,
       "Under",      "under",      1,
       "Smote",      "smote",      1,
       "Adasyn",     "adasyn",     1,
       "Tomek",      "tomek",      1
     )
   ),
-  
+
   tar_target(
     imbalance_windows,
     tibble::tribble(
@@ -178,110 +140,112 @@ list(
       "Window 3",  c(2, 3, 4),   5
     )
   ),
-  
-  # ---- 1. Data Layer (The Tournament Results) ----
+
+  # ---- Hyperparameter Tuning ----
+  tar_target(
+    best_params,
+    tune_lgbm(imbalance_windows)
+  ),
+
+  # ---- Tournament Results ----
   tar_target(
     tbl_strategy_showdown,
     {
-      # Force DAG to wait for the folders to be generated
-      force(model_inputs_prefix)
-      # Pass baf_feature_prefix so it tracks the latest layer
+      force(baf_model_input_prefix)
       run_imbalance_tournament(imbalance_tasks, imbalance_windows, baf_feature_prefix)
     }
   ),
-  
-  # ---- 2. Figure Layer ----
+
   tar_target(
     fig_strategy_showdown,
-    create_efficiency_plot(tbl_strategy_showdown) 
+    plot_efficiency(tbl_strategy_showdown)
   ),
-  
-  tar_target(
-    fig_strategy_showdown_path,
-    save_report_figure(
-      fig_strategy_showdown,
-      filename = "fig_strategy_showdown.png",
-      out_dir = "reports/figures"
-    ),
-    format = "file"
-  ),
-  
-  # ---- 3. Table Layer (gt object) ----
+
   tar_target(
     tbl_strategy_showdown_gt,
-    format_class_imbalance_tourney_gt(tbl_strategy_showdown)
+    format_tournament_gt(tbl_strategy_showdown)
   ),
-  
+
+  # ---- Final Evaluation (Months 6-7) ----
   tar_target(
-    tbl_strategy_showdown_path,
-    save_report_table(
-      tbl_strategy_showdown_gt, 
-      filename = "tbl_strategy_showdown.rds",
-      out_dir = "reports/tables"
-    ),
-    format = "file"
+    test_predictions,
+    evaluate_final_model(params = best_params)
   ),
-  
-  # ---- Final Production Evaluation ----
+
   tar_target(
-    final_eval_data,
-    evaluate_final_model(params = winning_params)
+    final_conf_mat,
+    yardstick::conf_mat(test_predictions, truth, pred_class)
   ),
-  
+
   tar_target(
-    final_conf_mat, 
-    yardstick::conf_mat(final_eval_data, truth, pred_class)
+    final_roc_curve,
+    yardstick::roc_curve(test_predictions, truth, prob)
   ),
-  
+
   tar_target(
-    final_roc_curve, 
-    yardstick::roc_curve(final_eval_data, truth, prob)
+    final_pr_curve,
+    yardstick::pr_curve(test_predictions, truth, prob)
   ),
-  
-  tar_target(
-    final_pr_curve, 
-    yardstick::pr_curve(final_eval_data, truth, prob)
-  ),
-  
-  # ---- Save Final Assets ----
+
   tar_target(
     fig_final_curves,
     {
       p1 <- ggplot2::autoplot(final_roc_curve) + ggplot2::labs(title = "ROC Curve (Months 6-7)")
-      p2 <- ggplot2::autoplot(final_pr_curve) + ggplot2::labs(title = "PR Curve (Months 6-7)")
+      p2 <- ggplot2::autoplot(final_pr_curve)  + ggplot2::labs(title = "PR Curve (Months 6-7)")
       cowplot::plot_grid(p1, p2)
     }
   ),
-  
-  tar_target(
-    fig_final_curves_path,
-    save_report_figure(fig_final_curves, "fig_final_curves.png"),
-    format = "file"
-  ),
-  
-  tar_target(
-    tbl_final_conf_mat_path,
-    save_report_table(final_conf_mat, "tbl_final_conf_mat.rds", out_dir = "reports/tables"),
-    format = "file"
-  ),
-  # ---- Generate and Save Heatmap ----
+
   tar_target(
     fig_final_conf_mat,
     plot_conf_mat_heatmap(final_conf_mat)
   ),
-  
+
+  # ---- Production Deployment ----
   tar_target(
-    fig_final_conf_mat_path,
-    save_report_figure(fig_final_conf_mat, "fig_final_conf_mat.png"),
-    format = "file"
+    data_full,
+    connect_baf(baf_feature_prefix, use_duckdb = TRUE) |>
+      collect()
   ),
-  # ---- Report Dependency Update ----
+
+  tar_target(
+    prod_recipe,
+    build_baf_recipe(data_full)
+  ),
+
+  tar_target(
+    production_model_uri,
+    train_production_model(
+      data          = data_full,
+      recipe        = prod_recipe,
+      best_params   = best_params,
+      model_filename = "baf_lgbm_prod_v1.txt"
+    ),
+    format = "rds"
+  ),
+
+  # ---- Saved Figure Paths ----
+  tar_target(fig_fraud_by_month_path,     save_report_figure(fig_fraud_by_month,     "fig_fraud_by_month.png"),     format = "file"),
+  tar_target(fig_var_imp_path,            save_report_figure(fig_var_imp,            "fig_var_imp.png"),            format = "file"),
+  tar_target(fig_hexbin_interaction_path, save_report_figure(fig_hexbin_interaction, "fig_hexbin_interaction.png"), format = "file"),
+  tar_target(fig_missingness_path,        save_report_figure(fig_missingness,        "fig_missingness.png"),        format = "file"),
+  tar_target(fig_num_cor_path,            save_report_figure(fig_num_cor,            "fig_num_cor.png"),            format = "file"),
+  tar_target(fig_strategy_showdown_path,  save_report_figure(fig_strategy_showdown,  "fig_strategy_showdown.png",  out_dir = "reports/figures"), format = "file"),
+  tar_target(fig_final_conf_mat_path,     save_report_figure(fig_final_conf_mat,     "fig_final_conf_mat.png"),     format = "file"),
+  tar_target(fig_final_curves_path,       save_report_figure(fig_final_curves,       "fig_final_curves.png"),       format = "file"),
+
+  # ---- Saved Table Paths ----
+  tar_target(tbl_fraud_by_month_path,    save_report_table(tbl_fraud_by_month_gt,    "tbl_fraud_by_month.rds"),                          format = "file"),
+  tar_target(tbl_strategy_showdown_path, save_report_table(tbl_strategy_showdown_gt, "tbl_strategy_showdown.rds", out_dir = "reports/tables"), format = "file"),
+  tar_target(tbl_final_conf_mat_path,    save_report_table(final_conf_mat,           "tbl_final_conf_mat.rds",    out_dir = "reports/tables"), format = "file"),
+
+  # ---- Report Assembly ----
   tar_target(
     report_assets,
     c(
       fig_fraud_by_month_path,
       tbl_fraud_by_month_path,
-      fig_strategy_showdown_path, 
+      fig_strategy_showdown_path,
       tbl_strategy_showdown_path,
       fig_var_imp_path,
       fig_hexbin_interaction_path,
@@ -290,33 +254,9 @@ list(
     ),
     format = "file"
   ),
-  
+
   tar_quarto(
     report_slides,
     path = "index.qmd"
-  ),
-  # production model deployment
-  tar_target(
-    data_full,
-    connect_baf(baf_feature_prefix, use_duckdb = TRUE) |> 
-      collect()
-  ),
-  tar_target(
-    production_recipe_blueprint,
-    build_baf_recipe(data_full)
-  ),
-  tar_target(
-    winning_params,
-    tune_lgbm(imbalance_windows)
-  ),
-  tar_target(
-    production_model_uri,
-    train_production_model(
-      data = data_full,
-      recipe = production_recipe_blueprint, # <--- Pass the untrained blueprint!
-      best_params = winning_params,
-      model_filename = "baf_lgbm_prod_v1.txt"
-    ),
-    format = "rds" 
   )
-)
\ No newline at end of file
+)
diff --git a/man/prepare_eda_recipe.Rd b/man/build_eda_recipe.Rd
similarity index 55%
rename from man/prepare_eda_recipe.Rd
rename to man/build_eda_recipe.Rd
index 7eae699..a959617 100644
--- a/man/prepare_eda_recipe.Rd
+++ b/man/build_eda_recipe.Rd
@@ -1,14 +1,14 @@
 % Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/functions.R
-\name{prepare_eda_recipe}
-\alias{prepare_eda_recipe}
-\title{Prepare EDA Recipe}
+\name{build_eda_recipe}
+\alias{build_eda_recipe}
+\title{Build EDA Recipe}
 \usage{
-prepare_eda_recipe(eda_data)
+build_eda_recipe(eda_data)
 }
 \arguments{
 \item{eda_data}{Raw EDA data}
 }
 \description{
-Prepare EDA Recipe
+Build EDA Recipe
 }
diff --git a/man/format_class_imbalance_tourney_gt.Rd b/man/format_tournament_gt.Rd
similarity index 68%
rename from man/format_class_imbalance_tourney_gt.Rd
rename to man/format_tournament_gt.Rd
index fd46aef..be0ecfd 100644
--- a/man/format_class_imbalance_tourney_gt.Rd
+++ b/man/format_tournament_gt.Rd
@@ -1,10 +1,10 @@
 % Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/functions.R
-\name{format_class_imbalance_tourney_gt}
-\alias{format_class_imbalance_tourney_gt}
-\title{Format Class Imbalance Tournament Table}
+\name{format_tournament_gt}
+\alias{format_tournament_gt}
+\title{Format Tournament Results Table}
 \usage{
-format_class_imbalance_tourney_gt(results_df)
+format_tournament_gt(results_df)
 }
 \arguments{
 \item{results_df}{The tibble output from \code{run_imbalance_tournament}.}
diff --git a/man/create_efficiency_plot.Rd b/man/plot_efficiency.Rd
similarity index 50%
rename from man/create_efficiency_plot.Rd
rename to man/plot_efficiency.Rd
index 47ae669..733e809 100644
--- a/man/create_efficiency_plot.Rd
+++ b/man/plot_efficiency.Rd
@@ -1,14 +1,14 @@
 % Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/functions.R
-\name{create_efficiency_plot}
-\alias{create_efficiency_plot}
-\title{Create Effectiveness vs Efficiency Plot}
+\name{plot_efficiency}
+\alias{plot_efficiency}
+\title{Plot Effectiveness vs Efficiency}
 \usage{
-create_efficiency_plot(results_df)
+plot_efficiency(results_df)
 }
 \arguments{
 \item{results_df}{Tibble from run_imbalance_tournament}
 }
 \description{
-Create Effectiveness vs Efficiency Plot
+Plot Effectiveness vs Efficiency
 }