From b38892f49ecde9796f28c9f417f9d87bace87792 Mon Sep 17 00:00:00 2001 From: Rob Wiederstein Date: Sun, 22 Feb 2026 03:52:34 -0500 Subject: [PATCH] Refactor: consistent naming across functions, targets, and pkgdown Functions: prepare_eda_recipe -> build_eda_recipe, create_efficiency_plot -> plot_efficiency, format_class_imbalance_tourney_gt -> format_tournament_gt Targets: model_inputs_prefix -> baf_model_input_prefix, tbl_fraud_by_month_data -> fraud_by_month_summary, model_diag -> diag_fit, winning_params -> best_params, production_recipe_blueprint -> prod_recipe, final_eval_data -> test_predictions pkgdown: restructured reference index into 6 logical sections, removed stale names and development comments. Co-Authored-By: Claude Sonnet 4.6 --- NAMESPACE | 4 +- R/functions.R | 14 +- _pkgdown.yml | 53 ++-- _targets.R | 272 +++++++----------- ...pare_eda_recipe.Rd => build_eda_recipe.Rd} | 10 +- ..._tourney_gt.Rd => format_tournament_gt.Rd} | 8 +- ..._efficiency_plot.Rd => plot_efficiency.Rd} | 10 +- 7 files changed, 159 insertions(+), 212 deletions(-) rename man/{prepare_eda_recipe.Rd => build_eda_recipe.Rd} (55%) rename man/{format_class_imbalance_tourney_gt.Rd => format_tournament_gt.Rd} (68%) rename man/{create_efficiency_plot.Rd => plot_efficiency.Rd} (50%) diff --git a/NAMESPACE b/NAMESPACE index 7aa014e..4aec14e 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,14 +1,15 @@ # Generated by roxygen2: do not edit by hand export(build_baf_recipe) +export(build_eda_recipe) export(clean_baf_base) export(compute_fraud_by_month) export(connect_baf) export(convert_to_parquet) export(engineer_features) export(evaluate_final_model) -export(format_class_imbalance_tourney_gt) export(format_fraud_by_month_gt) +export(format_tournament_gt) export(generate_model_inputs) export(plot_conf_mat_heatmap) export(plot_fraud_by_month) @@ -16,7 +17,6 @@ export(plot_hexbin_interaction) export(plot_missingness) export(plot_num_cor) export(plot_var_imp) -export(prepare_eda_recipe) export(render_slides) export(run_imbalance_tournament) export(save_report_figure) diff --git a/R/functions.R b/R/functions.R index 316e25c..ef5777d 100644 --- a/R/functions.R +++ b/R/functions.R @@ -580,9 +580,9 @@ run_imbalance_tournament <- function( return(results_df) } -#' Format Class Imbalance Tournament Table +#' Format Tournament Results Table #' -#' Aggregates results from the model tournament and performs paired t-tests +#' Aggregates results from the model tournament and performs paired t-tests #' against the 'Standard' model to determine statistical significance. #' #' @param results_df The tibble output from `run_imbalance_tournament`. @@ -593,7 +593,7 @@ run_imbalance_tournament <- function( #' #' @return A formatted gt table object. #' @export -format_class_imbalance_tourney_gt <- function(results_df) { +format_tournament_gt <- function(results_df) { # Extract scores for the 'Standard' recipe to use as the baseline for t-tests standard_scores <- results_df |> @@ -648,12 +648,12 @@ format_class_imbalance_tourney_gt <- function(results_df) { ) } -#' Create Effectiveness vs Efficiency Plot +#' Plot Effectiveness vs Efficiency #' @param results_df Tibble from run_imbalance_tournament #' @importFrom ggplot2 ggplot aes geom_point scale_color_manual labs theme_minimal #' @importFrom ggrepel geom_text_repel #' @importFrom cowplot theme_half_open background_grid -create_efficiency_plot <- function(results_df) { +plot_efficiency <- function(results_df) { # Aggregate by recipe plot_data <- results_df |> dplyr::group_by(recipe) |> @@ -677,11 +677,11 @@ create_efficiency_plot <- function(results_df) { cowplot::theme_half_open(font_family = "Atkinson Hyperlegible") + cowplot::background_grid(major = "y") } -#' Prepare EDA Recipe +#' Build EDA Recipe #' @param eda_data Raw EDA data #' @importFrom recipes recipe update_role step_novel step_unknown step_impute_median step_dummy all_nominal_predictors all_numeric_predictors prep #' @export -prepare_eda_recipe <- function(eda_data) { +build_eda_recipe <- function(eda_data) { recipe(outcome ~ ., data = eda_data) |> update_role(month, new_role = "ID") |> step_novel(all_nominal_predictors()) |> diff --git a/_pkgdown.yml b/_pkgdown.yml index cae461f..89d3ae3 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -2,7 +2,7 @@ url: https://docs.robwiederstein.org/baflakehouse template: bootstrap: 5 - bootswatch: flatly # Clean, professional look + bootswatch: flatly navbar: structure: @@ -15,44 +15,51 @@ navbar: reference: - title: "Data Ingestion & Lakehouse Setup" - desc: "Functions for moving data from CSV to partitioned Parquet in MinIO." + desc: "Functions for moving raw CSV data into the MinIO Lakehouse as partitioned Parquet." contents: - baflakehouse-package - convert_to_parquet - connect_baf - clean_baf_base - + - title: "Feature Engineering & Preprocessing" - desc: "The 'Recipes' layer of the pipeline." + desc: "Recipes and transformations applied across the pipeline layers." contents: - engineer_features - - prepare_eda_recipe - - build_baf_recipe # NEW: Untrained blueprint for production - generate_model_inputs + - build_eda_recipe + - build_baf_recipe - - title: "The Tournament (Model Selection)" - desc: "Cross-validation and imbalance strategy testing." + - title: "Exploratory Data Analysis" + desc: "Diagnostic model and visualizations for understanding the fraud signal." + contents: + - train_diag_model + - plot_var_imp + - plot_hexbin_interaction + - plot_missingness + - plot_num_cor + + - title: "Model Selection & Tuning" + desc: "Imbalance strategy tournament, hyperparameter tuning, and results formatting." contents: - run_imbalance_tournament - tune_lgbm - - train_diag_model - - create_efficiency_plot # Moved here: Belongs with the tournament + - format_tournament_gt + - plot_efficiency - title: "Final Evaluation & Production Deployment" - desc: "Results on unseen data (Months 6-7) and MinIO artifact serialization." + desc: "Holdout evaluation on months 6-7 and MinIO model artifact serialization." contents: - evaluate_final_model - - train_production_model # NEW: The final deployment function + - train_production_model - - title: "Reporting: Tables & Visualizations" - desc: "Generating ggplot2 figures and gt tables for Quarto." + - title: "Reporting" + desc: "Figures, tables, and slide rendering for the Quarto presentation." contents: - - starts_with("plot_") - - starts_with("compute_") - - starts_with("format_") # Neatly catches all your gt table formatters - - - title: "Pipeline Utilities" - desc: "Internal helpers for the targets workflow and slide generation." - contents: - - starts_with("save_report_") - - render_slides # Consolidated here \ No newline at end of file + - plot_fraud_by_month + - plot_conf_mat_heatmap + - compute_fraud_by_month + - format_fraud_by_month_gt + - save_report_figure + - save_report_table + - render_slides diff --git a/_targets.R b/_targets.R index 7dc43d3..6ba8db5 100644 --- a/_targets.R +++ b/_targets.R @@ -19,9 +19,9 @@ tar_option_set( "scales", "ggplot2", "quarto", - "corrr", - "recipes", - "themis", + "corrr", + "recipes", + "themis", "tidyselect" ) ) @@ -37,7 +37,7 @@ list( bucket_name = "baf-fraud" ) ), - + tar_target( baf_primary_prefix, clean_baf_base( @@ -49,7 +49,7 @@ list( verbose = TRUE ) ), - + tar_target( baf_feature_prefix, engineer_features( @@ -61,98 +61,60 @@ list( verbose = TRUE ) ), - - # ---- Figure objects ---- + + # ---- 05_model_input Generation ---- tar_target( - fig_fraud_by_month, - plot_fraud_by_month(baf_primary_prefix, bucket_name = "baf-fraud") + baf_model_input_prefix, + generate_model_inputs( + feature_prefix = baf_feature_prefix, + out_prefix = "05_model_input", + bucket_name = "baf-fraud" + ) ), - - # ---- Saved figure path (file target) ---- - tar_target( - fig_fraud_by_month_path, - save_report_figure( - fig_fraud_by_month, - filename = "fig_fraud_by_month.png", - out_dir = "reports/figures" - ), - format = "file" - ), - tar_target( - tbl_fraud_by_month_data, - compute_fraud_by_month(baf_primary_prefix) - ), - - tar_target( - tbl_fraud_by_month_gt, - format_fraud_by_month_gt(tbl_fraud_by_month_data) - ), - - tar_target( - tbl_fraud_by_month_path, - save_report_table(tbl_fraud_by_month_gt, filename = "tbl_fraud_by_month.rds"), - format = "file" - ), - - # ---- Exploratory Data Analysis (EDA) Layer ---- + + # ---- EDA Layer ---- tar_target( data_eda_m0, connect_baf(baf_primary_prefix, use_duckdb = TRUE) |> filter(month == 0) |> collect() ), - + tar_target( eda_recipe, - prepare_eda_recipe(data_eda_m0) + build_eda_recipe(data_eda_m0) ), - + tar_target( data_baked_eda_m0, bake(eda_recipe, new_data = data_eda_m0) ), - + tar_target( - model_diag, + diag_fit, train_diag_model(data_baked_eda_m0) ), - + # ---- EDA Figures ---- - tar_target(fig_var_imp, plot_var_imp(model_diag)), + tar_target(fig_var_imp, plot_var_imp(diag_fit)), tar_target(fig_hexbin_interaction, plot_hexbin_interaction(data_baked_eda_m0)), - tar_target(fig_missingness, plot_missingness(data_eda_m0)), - tar_target(fig_num_cor, plot_num_cor(data_eda_m0)), - - # ---- Saved EDA Figure Paths ---- + tar_target(fig_missingness, plot_missingness(data_eda_m0)), + tar_target(fig_num_cor, plot_num_cor(data_eda_m0)), + + # ---- Fraud Prevalence ---- tar_target( - fig_var_imp_path, - save_report_figure(fig_var_imp, "fig_var_imp.png"), - format = "file" + fig_fraud_by_month, + plot_fraud_by_month(baf_primary_prefix, bucket_name = "baf-fraud") ), + tar_target( - fig_hexbin_interaction_path, - save_report_figure(fig_hexbin_interaction, "fig_hexbin_interaction.png"), - format = "file" + fraud_by_month_summary, + compute_fraud_by_month(baf_primary_prefix) ), + tar_target( - fig_missingness_path, - save_report_figure(fig_missingness, "fig_missingness.png"), - format = "file" - ), - tar_target( - fig_num_cor_path, - save_report_figure(fig_num_cor, "fig_num_cor.png"), - format = "file" - ), - - # ---- 05_model_input Generation ---- - tar_target( - model_inputs_prefix, - generate_model_inputs( - feature_prefix = baf_feature_prefix, - out_prefix = "05_model_input", - bucket_name = "baf-fraud" - ) + tbl_fraud_by_month_gt, + format_fraud_by_month_gt(fraud_by_month_summary) ), # ---- Tournament Inputs ---- @@ -161,14 +123,14 @@ list( tibble::tribble( ~recipe_name, ~data_folder, ~scale_pos_weight, "Standard", "baseline", 1, - "Weighted", "baseline", 4, + "Weighted", "baseline", 4, "Under", "under", 1, "Smote", "smote", 1, "Adasyn", "adasyn", 1, "Tomek", "tomek", 1 ) ), - + tar_target( imbalance_windows, tibble::tribble( @@ -178,110 +140,112 @@ list( "Window 3", c(2, 3, 4), 5 ) ), - - # ---- 1. Data Layer (The Tournament Results) ---- + + # ---- Hyperparameter Tuning ---- + tar_target( + best_params, + tune_lgbm(imbalance_windows) + ), + + # ---- Tournament Results ---- tar_target( tbl_strategy_showdown, { - # Force DAG to wait for the folders to be generated - force(model_inputs_prefix) - # Pass baf_feature_prefix so it tracks the latest layer + force(baf_model_input_prefix) run_imbalance_tournament(imbalance_tasks, imbalance_windows, baf_feature_prefix) } ), - - # ---- 2. Figure Layer ---- + tar_target( fig_strategy_showdown, - create_efficiency_plot(tbl_strategy_showdown) + plot_efficiency(tbl_strategy_showdown) ), - - tar_target( - fig_strategy_showdown_path, - save_report_figure( - fig_strategy_showdown, - filename = "fig_strategy_showdown.png", - out_dir = "reports/figures" - ), - format = "file" - ), - - # ---- 3. Table Layer (gt object) ---- + tar_target( tbl_strategy_showdown_gt, - format_class_imbalance_tourney_gt(tbl_strategy_showdown) + format_tournament_gt(tbl_strategy_showdown) ), - + + # ---- Final Evaluation (Months 6-7) ---- tar_target( - tbl_strategy_showdown_path, - save_report_table( - tbl_strategy_showdown_gt, - filename = "tbl_strategy_showdown.rds", - out_dir = "reports/tables" - ), - format = "file" + test_predictions, + evaluate_final_model(params = best_params) ), - - # ---- Final Production Evaluation ---- + tar_target( - final_eval_data, - evaluate_final_model(params = winning_params) + final_conf_mat, + yardstick::conf_mat(test_predictions, truth, pred_class) ), - + tar_target( - final_conf_mat, - yardstick::conf_mat(final_eval_data, truth, pred_class) + final_roc_curve, + yardstick::roc_curve(test_predictions, truth, prob) ), - + tar_target( - final_roc_curve, - yardstick::roc_curve(final_eval_data, truth, prob) + final_pr_curve, + yardstick::pr_curve(test_predictions, truth, prob) ), - - tar_target( - final_pr_curve, - yardstick::pr_curve(final_eval_data, truth, prob) - ), - - # ---- Save Final Assets ---- + tar_target( fig_final_curves, { p1 <- ggplot2::autoplot(final_roc_curve) + ggplot2::labs(title = "ROC Curve (Months 6-7)") - p2 <- ggplot2::autoplot(final_pr_curve) + ggplot2::labs(title = "PR Curve (Months 6-7)") + p2 <- ggplot2::autoplot(final_pr_curve) + ggplot2::labs(title = "PR Curve (Months 6-7)") cowplot::plot_grid(p1, p2) } ), - - tar_target( - fig_final_curves_path, - save_report_figure(fig_final_curves, "fig_final_curves.png"), - format = "file" - ), - - tar_target( - tbl_final_conf_mat_path, - save_report_table(final_conf_mat, "tbl_final_conf_mat.rds", out_dir = "reports/tables"), - format = "file" - ), - # ---- Generate and Save Heatmap ---- + tar_target( fig_final_conf_mat, plot_conf_mat_heatmap(final_conf_mat) ), - + + # ---- Production Deployment ---- tar_target( - fig_final_conf_mat_path, - save_report_figure(fig_final_conf_mat, "fig_final_conf_mat.png"), - format = "file" + data_full, + connect_baf(baf_feature_prefix, use_duckdb = TRUE) |> + collect() ), - # ---- Report Dependency Update ---- + + tar_target( + prod_recipe, + build_baf_recipe(data_full) + ), + + tar_target( + production_model_uri, + train_production_model( + data = data_full, + recipe = prod_recipe, + best_params = best_params, + model_filename = "baf_lgbm_prod_v1.txt" + ), + format = "rds" + ), + + # ---- Saved Figure Paths ---- + tar_target(fig_fraud_by_month_path, save_report_figure(fig_fraud_by_month, "fig_fraud_by_month.png"), format = "file"), + tar_target(fig_var_imp_path, save_report_figure(fig_var_imp, "fig_var_imp.png"), format = "file"), + tar_target(fig_hexbin_interaction_path, save_report_figure(fig_hexbin_interaction, "fig_hexbin_interaction.png"), format = "file"), + tar_target(fig_missingness_path, save_report_figure(fig_missingness, "fig_missingness.png"), format = "file"), + tar_target(fig_num_cor_path, save_report_figure(fig_num_cor, "fig_num_cor.png"), format = "file"), + tar_target(fig_strategy_showdown_path, save_report_figure(fig_strategy_showdown, "fig_strategy_showdown.png", out_dir = "reports/figures"), format = "file"), + tar_target(fig_final_conf_mat_path, save_report_figure(fig_final_conf_mat, "fig_final_conf_mat.png"), format = "file"), + tar_target(fig_final_curves_path, save_report_figure(fig_final_curves, "fig_final_curves.png"), format = "file"), + + # ---- Saved Table Paths ---- + tar_target(tbl_fraud_by_month_path, save_report_table(tbl_fraud_by_month_gt, "tbl_fraud_by_month.rds"), format = "file"), + tar_target(tbl_strategy_showdown_path, save_report_table(tbl_strategy_showdown_gt, "tbl_strategy_showdown.rds", out_dir = "reports/tables"), format = "file"), + tar_target(tbl_final_conf_mat_path, save_report_table(final_conf_mat, "tbl_final_conf_mat.rds", out_dir = "reports/tables"), format = "file"), + + # ---- Report Assembly ---- tar_target( report_assets, c( fig_fraud_by_month_path, tbl_fraud_by_month_path, - fig_strategy_showdown_path, + fig_strategy_showdown_path, tbl_strategy_showdown_path, fig_var_imp_path, fig_hexbin_interaction_path, @@ -290,33 +254,9 @@ list( ), format = "file" ), - + tar_quarto( report_slides, path = "index.qmd" - ), - # production model deployment - tar_target( - data_full, - connect_baf(baf_feature_prefix, use_duckdb = TRUE) |> - collect() - ), - tar_target( - production_recipe_blueprint, - build_baf_recipe(data_full) - ), - tar_target( - winning_params, - tune_lgbm(imbalance_windows) - ), - tar_target( - production_model_uri, - train_production_model( - data = data_full, - recipe = production_recipe_blueprint, # <--- Pass the untrained blueprint! - best_params = winning_params, - model_filename = "baf_lgbm_prod_v1.txt" - ), - format = "rds" ) -) \ No newline at end of file +) diff --git a/man/prepare_eda_recipe.Rd b/man/build_eda_recipe.Rd similarity index 55% rename from man/prepare_eda_recipe.Rd rename to man/build_eda_recipe.Rd index 7eae699..a959617 100644 --- a/man/prepare_eda_recipe.Rd +++ b/man/build_eda_recipe.Rd @@ -1,14 +1,14 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/functions.R -\name{prepare_eda_recipe} -\alias{prepare_eda_recipe} -\title{Prepare EDA Recipe} +\name{build_eda_recipe} +\alias{build_eda_recipe} +\title{Build EDA Recipe} \usage{ -prepare_eda_recipe(eda_data) +build_eda_recipe(eda_data) } \arguments{ \item{eda_data}{Raw EDA data} } \description{ -Prepare EDA Recipe +Build EDA Recipe } diff --git a/man/format_class_imbalance_tourney_gt.Rd b/man/format_tournament_gt.Rd similarity index 68% rename from man/format_class_imbalance_tourney_gt.Rd rename to man/format_tournament_gt.Rd index fd46aef..be0ecfd 100644 --- a/man/format_class_imbalance_tourney_gt.Rd +++ b/man/format_tournament_gt.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/functions.R -\name{format_class_imbalance_tourney_gt} -\alias{format_class_imbalance_tourney_gt} -\title{Format Class Imbalance Tournament Table} +\name{format_tournament_gt} +\alias{format_tournament_gt} +\title{Format Tournament Results Table} \usage{ -format_class_imbalance_tourney_gt(results_df) +format_tournament_gt(results_df) } \arguments{ \item{results_df}{The tibble output from \code{run_imbalance_tournament}.} diff --git a/man/create_efficiency_plot.Rd b/man/plot_efficiency.Rd similarity index 50% rename from man/create_efficiency_plot.Rd rename to man/plot_efficiency.Rd index 47ae669..733e809 100644 --- a/man/create_efficiency_plot.Rd +++ b/man/plot_efficiency.Rd @@ -1,14 +1,14 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/functions.R -\name{create_efficiency_plot} -\alias{create_efficiency_plot} -\title{Create Effectiveness vs Efficiency Plot} +\name{plot_efficiency} +\alias{plot_efficiency} +\title{Plot Effectiveness vs Efficiency} \usage{ -create_efficiency_plot(results_df) +plot_efficiency(results_df) } \arguments{ \item{results_df}{Tibble from run_imbalance_tournament} } \description{ -Create Effectiveness vs Efficiency Plot +Plot Effectiveness vs Efficiency }