Refactor: use consistent naming across functions, targets, and pkgdown

Functions: prepare_eda_recipe -> build_eda_recipe,
           create_efficiency_plot -> plot_efficiency,
           format_class_imbalance_tourney_gt -> format_tournament_gt

Targets: model_inputs_prefix -> baf_model_input_prefix,
         tbl_fraud_by_month_data -> fraud_by_month_summary,
         model_diag -> diag_fit, winning_params -> best_params,
         production_recipe_blueprint -> prod_recipe,
         final_eval_data -> test_predictions;
         also regrouped the pipeline by stage and consolidated the
         saved figure/table path targets ahead of report assembly.

pkgdown: restructured reference index into 6 logical sections,
         removed stale names and development comments.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-22 03:52:34 -05:00
parent f47b2e1be2
commit b38892f49e
7 changed files with 159 additions and 212 deletions

View File

@@ -1,14 +1,15 @@
# Generated by roxygen2: do not edit by hand # Generated by roxygen2: do not edit by hand
export(build_baf_recipe) export(build_baf_recipe)
export(build_eda_recipe)
export(clean_baf_base) export(clean_baf_base)
export(compute_fraud_by_month) export(compute_fraud_by_month)
export(connect_baf) export(connect_baf)
export(convert_to_parquet) export(convert_to_parquet)
export(engineer_features) export(engineer_features)
export(evaluate_final_model) export(evaluate_final_model)
export(format_class_imbalance_tourney_gt)
export(format_fraud_by_month_gt) export(format_fraud_by_month_gt)
export(format_tournament_gt)
export(generate_model_inputs) export(generate_model_inputs)
export(plot_conf_mat_heatmap) export(plot_conf_mat_heatmap)
export(plot_fraud_by_month) export(plot_fraud_by_month)
@@ -16,7 +17,6 @@ export(plot_hexbin_interaction)
export(plot_missingness) export(plot_missingness)
export(plot_num_cor) export(plot_num_cor)
export(plot_var_imp) export(plot_var_imp)
export(prepare_eda_recipe)
export(render_slides) export(render_slides)
export(run_imbalance_tournament) export(run_imbalance_tournament)
export(save_report_figure) export(save_report_figure)

View File

@@ -580,9 +580,9 @@ run_imbalance_tournament <- function(
return(results_df) return(results_df)
} }
#' Format Class Imbalance Tournament Table #' Format Tournament Results Table
#' #'
#' Aggregates results from the model tournament and performs paired t-tests #' Aggregates results from the model tournament and performs paired t-tests
#' against the 'Standard' model to determine statistical significance. #' against the 'Standard' model to determine statistical significance.
#' #'
#' @param results_df The tibble output from `run_imbalance_tournament`. #' @param results_df The tibble output from `run_imbalance_tournament`.
@@ -593,7 +593,7 @@ run_imbalance_tournament <- function(
#' #'
#' @return A formatted gt table object. #' @return A formatted gt table object.
#' @export #' @export
format_class_imbalance_tourney_gt <- function(results_df) { format_tournament_gt <- function(results_df) {
# Extract scores for the 'Standard' recipe to use as the baseline for t-tests # Extract scores for the 'Standard' recipe to use as the baseline for t-tests
standard_scores <- results_df |> standard_scores <- results_df |>
@@ -648,12 +648,12 @@ format_class_imbalance_tourney_gt <- function(results_df) {
) )
} }
#' Create Effectiveness vs Efficiency Plot #' Plot Effectiveness vs Efficiency
#' @param results_df Tibble from run_imbalance_tournament #' @param results_df Tibble from run_imbalance_tournament
#' @importFrom ggplot2 ggplot aes geom_point scale_color_manual labs theme_minimal #' @importFrom ggplot2 ggplot aes geom_point scale_color_manual labs theme_minimal
#' @importFrom ggrepel geom_text_repel #' @importFrom ggrepel geom_text_repel
#' @importFrom cowplot theme_half_open background_grid #' @importFrom cowplot theme_half_open background_grid
create_efficiency_plot <- function(results_df) { plot_efficiency <- function(results_df) {
# Aggregate by recipe # Aggregate by recipe
plot_data <- results_df |> plot_data <- results_df |>
dplyr::group_by(recipe) |> dplyr::group_by(recipe) |>
@@ -677,11 +677,11 @@ create_efficiency_plot <- function(results_df) {
cowplot::theme_half_open(font_family = "Atkinson Hyperlegible") + cowplot::theme_half_open(font_family = "Atkinson Hyperlegible") +
cowplot::background_grid(major = "y") cowplot::background_grid(major = "y")
} }
#' Prepare EDA Recipe #' Build EDA Recipe
#' @param eda_data Raw EDA data #' @param eda_data Raw EDA data
#' @importFrom recipes recipe update_role step_novel step_unknown step_impute_median step_dummy all_nominal_predictors all_numeric_predictors prep #' @importFrom recipes recipe update_role step_novel step_unknown step_impute_median step_dummy all_nominal_predictors all_numeric_predictors prep
#' @export #' @export
prepare_eda_recipe <- function(eda_data) { build_eda_recipe <- function(eda_data) {
recipe(outcome ~ ., data = eda_data) |> recipe(outcome ~ ., data = eda_data) |>
update_role(month, new_role = "ID") |> update_role(month, new_role = "ID") |>
step_novel(all_nominal_predictors()) |> step_novel(all_nominal_predictors()) |>

View File

@@ -2,7 +2,7 @@ url: https://docs.robwiederstein.org/baflakehouse
template: template:
bootstrap: 5 bootstrap: 5
bootswatch: flatly # Clean, professional look bootswatch: flatly
navbar: navbar:
structure: structure:
@@ -15,44 +15,51 @@ navbar:
reference: reference:
- title: "Data Ingestion & Lakehouse Setup" - title: "Data Ingestion & Lakehouse Setup"
desc: "Functions for moving data from CSV to partitioned Parquet in MinIO." desc: "Functions for moving raw CSV data into the MinIO Lakehouse as partitioned Parquet."
contents: contents:
- baflakehouse-package - baflakehouse-package
- convert_to_parquet - convert_to_parquet
- connect_baf - connect_baf
- clean_baf_base - clean_baf_base
- title: "Feature Engineering & Preprocessing" - title: "Feature Engineering & Preprocessing"
desc: "The 'Recipes' layer of the pipeline." desc: "Recipes and transformations applied across the pipeline layers."
contents: contents:
- engineer_features - engineer_features
- prepare_eda_recipe
- build_baf_recipe # NEW: Untrained blueprint for production
- generate_model_inputs - generate_model_inputs
- build_eda_recipe
- build_baf_recipe
- title: "The Tournament (Model Selection)" - title: "Exploratory Data Analysis"
desc: "Cross-validation and imbalance strategy testing." desc: "Diagnostic model and visualizations for understanding the fraud signal."
contents:
- train_diag_model
- plot_var_imp
- plot_hexbin_interaction
- plot_missingness
- plot_num_cor
- title: "Model Selection & Tuning"
desc: "Imbalance strategy tournament, hyperparameter tuning, and results formatting."
contents: contents:
- run_imbalance_tournament - run_imbalance_tournament
- tune_lgbm - tune_lgbm
- train_diag_model - format_tournament_gt
- create_efficiency_plot # Moved here: Belongs with the tournament - plot_efficiency
- title: "Final Evaluation & Production Deployment" - title: "Final Evaluation & Production Deployment"
desc: "Results on unseen data (Months 6-7) and MinIO artifact serialization." desc: "Holdout evaluation on months 6-7 and MinIO model artifact serialization."
contents: contents:
- evaluate_final_model - evaluate_final_model
- train_production_model # NEW: The final deployment function - train_production_model
- title: "Reporting: Tables & Visualizations" - title: "Reporting"
desc: "Generating ggplot2 figures and gt tables for Quarto." desc: "Figures, tables, and slide rendering for the Quarto presentation."
contents: contents:
- starts_with("plot_") - plot_fraud_by_month
- starts_with("compute_") - plot_conf_mat_heatmap
- starts_with("format_") # Neatly catches all your gt table formatters - compute_fraud_by_month
- format_fraud_by_month_gt
- title: "Pipeline Utilities" - save_report_figure
desc: "Internal helpers for the targets workflow and slide generation." - save_report_table
contents: - render_slides
- starts_with("save_report_")
- render_slides # Consolidated here

View File

@@ -19,9 +19,9 @@ tar_option_set(
"scales", "scales",
"ggplot2", "ggplot2",
"quarto", "quarto",
"corrr", "corrr",
"recipes", "recipes",
"themis", "themis",
"tidyselect" "tidyselect"
) )
) )
@@ -37,7 +37,7 @@ list(
bucket_name = "baf-fraud" bucket_name = "baf-fraud"
) )
), ),
tar_target( tar_target(
baf_primary_prefix, baf_primary_prefix,
clean_baf_base( clean_baf_base(
@@ -49,7 +49,7 @@ list(
verbose = TRUE verbose = TRUE
) )
), ),
tar_target( tar_target(
baf_feature_prefix, baf_feature_prefix,
engineer_features( engineer_features(
@@ -61,98 +61,60 @@ list(
verbose = TRUE verbose = TRUE
) )
), ),
# ---- Figure objects ---- # ---- 05_model_input Generation ----
tar_target( tar_target(
fig_fraud_by_month, baf_model_input_prefix,
plot_fraud_by_month(baf_primary_prefix, bucket_name = "baf-fraud") generate_model_inputs(
feature_prefix = baf_feature_prefix,
out_prefix = "05_model_input",
bucket_name = "baf-fraud"
)
), ),
# ---- Saved figure path (file target) ---- # ---- EDA Layer ----
tar_target(
fig_fraud_by_month_path,
save_report_figure(
fig_fraud_by_month,
filename = "fig_fraud_by_month.png",
out_dir = "reports/figures"
),
format = "file"
),
tar_target(
tbl_fraud_by_month_data,
compute_fraud_by_month(baf_primary_prefix)
),
tar_target(
tbl_fraud_by_month_gt,
format_fraud_by_month_gt(tbl_fraud_by_month_data)
),
tar_target(
tbl_fraud_by_month_path,
save_report_table(tbl_fraud_by_month_gt, filename = "tbl_fraud_by_month.rds"),
format = "file"
),
# ---- Exploratory Data Analysis (EDA) Layer ----
tar_target( tar_target(
data_eda_m0, data_eda_m0,
connect_baf(baf_primary_prefix, use_duckdb = TRUE) |> connect_baf(baf_primary_prefix, use_duckdb = TRUE) |>
filter(month == 0) |> filter(month == 0) |>
collect() collect()
), ),
tar_target( tar_target(
eda_recipe, eda_recipe,
prepare_eda_recipe(data_eda_m0) build_eda_recipe(data_eda_m0)
), ),
tar_target( tar_target(
data_baked_eda_m0, data_baked_eda_m0,
bake(eda_recipe, new_data = data_eda_m0) bake(eda_recipe, new_data = data_eda_m0)
), ),
tar_target( tar_target(
model_diag, diag_fit,
train_diag_model(data_baked_eda_m0) train_diag_model(data_baked_eda_m0)
), ),
# ---- EDA Figures ---- # ---- EDA Figures ----
tar_target(fig_var_imp, plot_var_imp(model_diag)), tar_target(fig_var_imp, plot_var_imp(diag_fit)),
tar_target(fig_hexbin_interaction, plot_hexbin_interaction(data_baked_eda_m0)), tar_target(fig_hexbin_interaction, plot_hexbin_interaction(data_baked_eda_m0)),
tar_target(fig_missingness, plot_missingness(data_eda_m0)), tar_target(fig_missingness, plot_missingness(data_eda_m0)),
tar_target(fig_num_cor, plot_num_cor(data_eda_m0)), tar_target(fig_num_cor, plot_num_cor(data_eda_m0)),
# ---- Saved EDA Figure Paths ---- # ---- Fraud Prevalence ----
tar_target( tar_target(
fig_var_imp_path, fig_fraud_by_month,
save_report_figure(fig_var_imp, "fig_var_imp.png"), plot_fraud_by_month(baf_primary_prefix, bucket_name = "baf-fraud")
format = "file"
), ),
tar_target( tar_target(
fig_hexbin_interaction_path, fraud_by_month_summary,
save_report_figure(fig_hexbin_interaction, "fig_hexbin_interaction.png"), compute_fraud_by_month(baf_primary_prefix)
format = "file"
), ),
tar_target( tar_target(
fig_missingness_path, tbl_fraud_by_month_gt,
save_report_figure(fig_missingness, "fig_missingness.png"), format_fraud_by_month_gt(fraud_by_month_summary)
format = "file"
),
tar_target(
fig_num_cor_path,
save_report_figure(fig_num_cor, "fig_num_cor.png"),
format = "file"
),
# ---- 05_model_input Generation ----
tar_target(
model_inputs_prefix,
generate_model_inputs(
feature_prefix = baf_feature_prefix,
out_prefix = "05_model_input",
bucket_name = "baf-fraud"
)
), ),
# ---- Tournament Inputs ---- # ---- Tournament Inputs ----
@@ -161,14 +123,14 @@ list(
tibble::tribble( tibble::tribble(
~recipe_name, ~data_folder, ~scale_pos_weight, ~recipe_name, ~data_folder, ~scale_pos_weight,
"Standard", "baseline", 1, "Standard", "baseline", 1,
"Weighted", "baseline", 4, "Weighted", "baseline", 4,
"Under", "under", 1, "Under", "under", 1,
"Smote", "smote", 1, "Smote", "smote", 1,
"Adasyn", "adasyn", 1, "Adasyn", "adasyn", 1,
"Tomek", "tomek", 1 "Tomek", "tomek", 1
) )
), ),
tar_target( tar_target(
imbalance_windows, imbalance_windows,
tibble::tribble( tibble::tribble(
@@ -178,110 +140,112 @@ list(
"Window 3", c(2, 3, 4), 5 "Window 3", c(2, 3, 4), 5
) )
), ),
# ---- 1. Data Layer (The Tournament Results) ---- # ---- Hyperparameter Tuning ----
tar_target(
best_params,
tune_lgbm(imbalance_windows)
),
# ---- Tournament Results ----
tar_target( tar_target(
tbl_strategy_showdown, tbl_strategy_showdown,
{ {
# Force DAG to wait for the folders to be generated force(baf_model_input_prefix)
force(model_inputs_prefix)
# Pass baf_feature_prefix so it tracks the latest layer
run_imbalance_tournament(imbalance_tasks, imbalance_windows, baf_feature_prefix) run_imbalance_tournament(imbalance_tasks, imbalance_windows, baf_feature_prefix)
} }
), ),
# ---- 2. Figure Layer ----
tar_target( tar_target(
fig_strategy_showdown, fig_strategy_showdown,
create_efficiency_plot(tbl_strategy_showdown) plot_efficiency(tbl_strategy_showdown)
), ),
tar_target(
fig_strategy_showdown_path,
save_report_figure(
fig_strategy_showdown,
filename = "fig_strategy_showdown.png",
out_dir = "reports/figures"
),
format = "file"
),
# ---- 3. Table Layer (gt object) ----
tar_target( tar_target(
tbl_strategy_showdown_gt, tbl_strategy_showdown_gt,
format_class_imbalance_tourney_gt(tbl_strategy_showdown) format_tournament_gt(tbl_strategy_showdown)
), ),
# ---- Final Evaluation (Months 6-7) ----
tar_target( tar_target(
tbl_strategy_showdown_path, test_predictions,
save_report_table( evaluate_final_model(params = best_params)
tbl_strategy_showdown_gt,
filename = "tbl_strategy_showdown.rds",
out_dir = "reports/tables"
),
format = "file"
), ),
# ---- Final Production Evaluation ----
tar_target( tar_target(
final_eval_data, final_conf_mat,
evaluate_final_model(params = winning_params) yardstick::conf_mat(test_predictions, truth, pred_class)
), ),
tar_target( tar_target(
final_conf_mat, final_roc_curve,
yardstick::conf_mat(final_eval_data, truth, pred_class) yardstick::roc_curve(test_predictions, truth, prob)
), ),
tar_target( tar_target(
final_roc_curve, final_pr_curve,
yardstick::roc_curve(final_eval_data, truth, prob) yardstick::pr_curve(test_predictions, truth, prob)
), ),
tar_target(
final_pr_curve,
yardstick::pr_curve(final_eval_data, truth, prob)
),
# ---- Save Final Assets ----
tar_target( tar_target(
fig_final_curves, fig_final_curves,
{ {
p1 <- ggplot2::autoplot(final_roc_curve) + ggplot2::labs(title = "ROC Curve (Months 6-7)") p1 <- ggplot2::autoplot(final_roc_curve) + ggplot2::labs(title = "ROC Curve (Months 6-7)")
p2 <- ggplot2::autoplot(final_pr_curve) + ggplot2::labs(title = "PR Curve (Months 6-7)") p2 <- ggplot2::autoplot(final_pr_curve) + ggplot2::labs(title = "PR Curve (Months 6-7)")
cowplot::plot_grid(p1, p2) cowplot::plot_grid(p1, p2)
} }
), ),
tar_target(
fig_final_curves_path,
save_report_figure(fig_final_curves, "fig_final_curves.png"),
format = "file"
),
tar_target(
tbl_final_conf_mat_path,
save_report_table(final_conf_mat, "tbl_final_conf_mat.rds", out_dir = "reports/tables"),
format = "file"
),
# ---- Generate and Save Heatmap ----
tar_target( tar_target(
fig_final_conf_mat, fig_final_conf_mat,
plot_conf_mat_heatmap(final_conf_mat) plot_conf_mat_heatmap(final_conf_mat)
), ),
# ---- Production Deployment ----
tar_target( tar_target(
fig_final_conf_mat_path, data_full,
save_report_figure(fig_final_conf_mat, "fig_final_conf_mat.png"), connect_baf(baf_feature_prefix, use_duckdb = TRUE) |>
format = "file" collect()
), ),
# ---- Report Dependency Update ----
tar_target(
prod_recipe,
build_baf_recipe(data_full)
),
tar_target(
production_model_uri,
train_production_model(
data = data_full,
recipe = prod_recipe,
best_params = best_params,
model_filename = "baf_lgbm_prod_v1.txt"
),
format = "rds"
),
# ---- Saved Figure Paths ----
tar_target(fig_fraud_by_month_path, save_report_figure(fig_fraud_by_month, "fig_fraud_by_month.png"), format = "file"),
tar_target(fig_var_imp_path, save_report_figure(fig_var_imp, "fig_var_imp.png"), format = "file"),
tar_target(fig_hexbin_interaction_path, save_report_figure(fig_hexbin_interaction, "fig_hexbin_interaction.png"), format = "file"),
tar_target(fig_missingness_path, save_report_figure(fig_missingness, "fig_missingness.png"), format = "file"),
tar_target(fig_num_cor_path, save_report_figure(fig_num_cor, "fig_num_cor.png"), format = "file"),
tar_target(fig_strategy_showdown_path, save_report_figure(fig_strategy_showdown, "fig_strategy_showdown.png", out_dir = "reports/figures"), format = "file"),
tar_target(fig_final_conf_mat_path, save_report_figure(fig_final_conf_mat, "fig_final_conf_mat.png"), format = "file"),
tar_target(fig_final_curves_path, save_report_figure(fig_final_curves, "fig_final_curves.png"), format = "file"),
# ---- Saved Table Paths ----
tar_target(tbl_fraud_by_month_path, save_report_table(tbl_fraud_by_month_gt, "tbl_fraud_by_month.rds"), format = "file"),
tar_target(tbl_strategy_showdown_path, save_report_table(tbl_strategy_showdown_gt, "tbl_strategy_showdown.rds", out_dir = "reports/tables"), format = "file"),
tar_target(tbl_final_conf_mat_path, save_report_table(final_conf_mat, "tbl_final_conf_mat.rds", out_dir = "reports/tables"), format = "file"),
# ---- Report Assembly ----
tar_target( tar_target(
report_assets, report_assets,
c( c(
fig_fraud_by_month_path, fig_fraud_by_month_path,
tbl_fraud_by_month_path, tbl_fraud_by_month_path,
fig_strategy_showdown_path, fig_strategy_showdown_path,
tbl_strategy_showdown_path, tbl_strategy_showdown_path,
fig_var_imp_path, fig_var_imp_path,
fig_hexbin_interaction_path, fig_hexbin_interaction_path,
@@ -290,33 +254,9 @@ list(
), ),
format = "file" format = "file"
), ),
tar_quarto( tar_quarto(
report_slides, report_slides,
path = "index.qmd" path = "index.qmd"
),
# production model deployment
tar_target(
data_full,
connect_baf(baf_feature_prefix, use_duckdb = TRUE) |>
collect()
),
tar_target(
production_recipe_blueprint,
build_baf_recipe(data_full)
),
tar_target(
winning_params,
tune_lgbm(imbalance_windows)
),
tar_target(
production_model_uri,
train_production_model(
data = data_full,
recipe = production_recipe_blueprint, # <--- Pass the untrained blueprint!
best_params = winning_params,
model_filename = "baf_lgbm_prod_v1.txt"
),
format = "rds"
) )
) )

View File

@@ -1,14 +1,14 @@
% Generated by roxygen2: do not edit by hand % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/functions.R % Please edit documentation in R/functions.R
\name{prepare_eda_recipe} \name{build_eda_recipe}
\alias{prepare_eda_recipe} \alias{build_eda_recipe}
\title{Prepare EDA Recipe} \title{Build EDA Recipe}
\usage{ \usage{
prepare_eda_recipe(eda_data) build_eda_recipe(eda_data)
} }
\arguments{ \arguments{
\item{eda_data}{Raw EDA data} \item{eda_data}{Raw EDA data}
} }
\description{ \description{
Prepare EDA Recipe Build EDA Recipe
} }

View File

@@ -1,10 +1,10 @@
% Generated by roxygen2: do not edit by hand % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/functions.R % Please edit documentation in R/functions.R
\name{format_class_imbalance_tourney_gt} \name{format_tournament_gt}
\alias{format_class_imbalance_tourney_gt} \alias{format_tournament_gt}
\title{Format Class Imbalance Tournament Table} \title{Format Tournament Results Table}
\usage{ \usage{
format_class_imbalance_tourney_gt(results_df) format_tournament_gt(results_df)
} }
\arguments{ \arguments{
\item{results_df}{The tibble output from \code{run_imbalance_tournament}.} \item{results_df}{The tibble output from \code{run_imbalance_tournament}.}

View File

@@ -1,14 +1,14 @@
% Generated by roxygen2: do not edit by hand % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/functions.R % Please edit documentation in R/functions.R
\name{create_efficiency_plot} \name{plot_efficiency}
\alias{create_efficiency_plot} \alias{plot_efficiency}
\title{Create Effectiveness vs Efficiency Plot} \title{Plot Effectiveness vs Efficiency}
\usage{ \usage{
create_efficiency_plot(results_df) plot_efficiency(results_df)
} }
\arguments{ \arguments{
\item{results_df}{Tibble from run_imbalance_tournament} \item{results_df}{Tibble from run_imbalance_tournament}
} }
\description{ \description{
Create Effectiveness vs Efficiency Plot Plot Effectiveness vs Efficiency
} }