Files
bank-fraud-baf-lakehouse/_targets.R
Rob Wiederstein df978d042f Refactor bucket structure: baf-fraud/ prefix under lake bucket
All functions now default to bucket_name = "lake" with "baf-fraud/"
prepended to all layer prefixes, matching the contemporary lakehouse
naming convention (one bucket per environment, project as prefix).

Migration: copy baf-fraud/ data to lake/baf-fraud/ on analyticsvm,
update BAF_BUCKET env var from "baf-fraud" to "lake".

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-02-22 05:36:25 -05:00

263 lines
6.7 KiB
R

library(targets)
library(tarchetypes)
# Packages made available to every target command. The order is kept exactly
# as written because package load order can change which package wins when
# two of them export the same function name.
pipeline_packages <- c(
  "arrow", "bonsai", "duckdb", "glue", "gt",
  "here", "lightgbm", "lubridate", "tidymodels", "tidyverse",
  "cowplot", "colorspace", "readr", "scales", "ggplot2",
  "quarto", "corrr", "recipes", "themis", "tidyselect"
)
tar_option_set(packages = pipeline_packages)

# Bring the project's helper functions into scope; the target commands in
# the pipeline call functions that are not defined in this file.
tar_source("./R/functions.R")
# The pipeline is declared as one named list of targets per stage and the
# stages are combined, in their original order, at the bottom of the file.
# Every target name and command is unchanged; only the grouping differs.

# ---- Lakehouse Layers (01_raw -> 05_model_input) ----
# Each layer's command receives the previous layer's target as its input
# prefix, which is what orders the layers.
# (presumably each helper returns the prefix it wrote to -- confirm in
# R/functions.R)
lakehouse_targets <- list(
  tar_target(
    baf_parquet_prefix,
    convert_to_parquet(
      from_prefix = "baf-fraud/01_raw",
      to_prefix = "baf-fraud/02_intermediate",
      bucket_name = "lake"
    )
  ),
  tar_target(
    baf_primary_prefix,
    clean_baf_base(
      in_prefix = baf_parquet_prefix,
      out_prefix = "baf-fraud/03_primary/variant=Base",
      bucket_name = "lake",
      partitioning = "month",
      existing_data_behavior = "delete_matching",
      verbose = TRUE
    )
  ),
  tar_target(
    baf_feature_prefix,
    engineer_features(
      in_prefix = baf_primary_prefix,
      out_prefix = "baf-fraud/04_feature/variant=Base",
      bucket_name = "lake",
      partitioning = "month",
      existing_data_behavior = "delete_matching",
      verbose = TRUE
    )
  ),
  # ---- 05_model_input Generation ----
  tar_target(
    baf_model_input_prefix,
    generate_model_inputs(
      feature_prefix = baf_feature_prefix,
      out_prefix = "baf-fraud/05_model_input",
      bucket_name = "lake"
    )
  )
)

# ---- EDA Layer and Figures ----
# All EDA targets work from the month == 0 slice of the primary layer.
eda_targets <- list(
  tar_target(
    data_eda_m0,
    connect_baf(baf_primary_prefix, use_duckdb = TRUE) |>
      filter(month == 0) |>
      collect()
  ),
  tar_target(
    eda_recipe,
    build_eda_recipe(data_eda_m0)
  ),
  tar_target(
    data_baked_eda_m0,
    bake(eda_recipe, new_data = data_eda_m0)
  ),
  tar_target(
    diag_fit,
    train_diag_model(data_baked_eda_m0)
  ),
  # ---- EDA Figures ----
  tar_target(
    fig_var_imp,
    plot_var_imp(diag_fit)
  ),
  tar_target(
    fig_hexbin_interaction,
    plot_hexbin_interaction(data_baked_eda_m0)
  ),
  tar_target(
    fig_missingness,
    plot_missingness(data_eda_m0)
  ),
  tar_target(
    fig_num_cor,
    plot_num_cor(data_eda_m0)
  )
)

# ---- Fraud Prevalence ----
# The figure call names the bucket explicitly while the summary call relies
# on the helper's default bucket.
prevalence_targets <- list(
  tar_target(
    fig_fraud_by_month,
    plot_fraud_by_month(baf_primary_prefix, bucket_name = "lake")
  ),
  tar_target(
    fraud_by_month_summary,
    compute_fraud_by_month(baf_primary_prefix)
  ),
  tar_target(
    tbl_fraud_by_month_gt,
    format_fraud_by_month_gt(fraud_by_month_summary)
  )
)

# ---- Imbalance Tournament ----
tournament_targets <- list(
  # ---- Tournament Inputs ----
  # One row per imbalance-handling strategy entered in the tournament.
  tar_target(
    imbalance_tasks,
    tibble::tribble(
      ~recipe_name, ~data_folder, ~scale_pos_weight,
      "Standard",   "baseline",   1,
      "Weighted",   "baseline",   4,
      "Under",      "under",      1,
      "Smote",      "smote",      1,
      "Adasyn",     "adasyn",     1,
      "Tomek",      "tomek",      1
    )
  ),
  # Rolling windows: three consecutive training months, then the next month
  # as the test month.
  tar_target(
    imbalance_windows,
    tibble::tribble(
      ~window_id, ~train_months, ~test_month,
      "Window 1", c(0, 1, 2),    3,
      "Window 2", c(1, 2, 3),    4,
      "Window 3", c(2, 3, 4),    5
    )
  ),
  # ---- Hyperparameter Tuning ----
  # NOTE(review): this command references only imbalance_windows, not any
  # lakehouse target, so a change in the lake data alone will not mark
  # best_params as outdated -- confirm that is intended.
  tar_target(
    best_params,
    tune_lgbm(imbalance_windows)
  ),
  # ---- Tournament Results ----
  # force() mentions baf_model_input_prefix so that the 05_model_input layer
  # is built before the tournament runs, even though the tournament call is
  # only passed the feature prefix.
  tar_target(
    tbl_strategy_showdown,
    {
      force(baf_model_input_prefix)
      run_imbalance_tournament(
        imbalance_tasks,
        imbalance_windows,
        baf_feature_prefix
      )
    }
  ),
  tar_target(
    fig_strategy_showdown,
    plot_efficiency(tbl_strategy_showdown)
  ),
  tar_target(
    tbl_strategy_showdown_gt,
    format_tournament_gt(tbl_strategy_showdown)
  )
)

# ---- Final Evaluation (Months 6-7) ----
# NOTE(review): evaluate_final_model() is given only best_params here; which
# data it reads is not visible in this file -- confirm it tracks the lake
# data it depends on.
evaluation_targets <- list(
  tar_target(
    test_predictions,
    evaluate_final_model(params = best_params)
  ),
  tar_target(
    final_conf_mat,
    yardstick::conf_mat(test_predictions, truth, pred_class)
  ),
  tar_target(
    final_roc_curve,
    yardstick::roc_curve(test_predictions, truth, prob)
  ),
  tar_target(
    final_pr_curve,
    yardstick::pr_curve(test_predictions, truth, prob)
  ),
  # ROC and PR curves drawn as two panels of a single figure.
  tar_target(
    fig_final_curves,
    {
      p1 <- ggplot2::autoplot(final_roc_curve) +
        ggplot2::labs(title = "ROC Curve (Months 6-7)")
      p2 <- ggplot2::autoplot(final_pr_curve) +
        ggplot2::labs(title = "PR Curve (Months 6-7)")
      cowplot::plot_grid(p1, p2)
    }
  ),
  tar_target(
    fig_final_conf_mat,
    plot_conf_mat_heatmap(final_conf_mat)
  )
)

# ---- Production Deployment ----
# The production model is fit on the whole feature layer, collected into
# memory, using the tuned parameters.
production_targets <- list(
  tar_target(
    data_full,
    connect_baf(baf_feature_prefix, use_duckdb = TRUE) |>
      collect()
  ),
  tar_target(
    prod_recipe,
    build_baf_recipe(data_full)
  ),
  # The return value of train_production_model() is stored as an rds object.
  tar_target(
    production_model_uri,
    train_production_model(
      data = data_full,
      recipe = prod_recipe,
      best_params = best_params,
      model_filename = "baf_lgbm_prod_v1.txt"
    ),
    format = "rds"
  )
)

# ---- Saved Figure and Table Paths ----
# format = "file" targets: each command must return the path(s) of the
# file(s) it wrote so targets can track them on disk.
# NOTE(review): only some calls pass out_dir explicitly; the others rely on
# the helper's default -- confirm they resolve to the same directories.
artifact_targets <- list(
  # ---- Saved Figure Paths ----
  tar_target(
    fig_fraud_by_month_path,
    save_report_figure(fig_fraud_by_month, "fig_fraud_by_month.png"),
    format = "file"
  ),
  tar_target(
    fig_var_imp_path,
    save_report_figure(fig_var_imp, "fig_var_imp.png"),
    format = "file"
  ),
  tar_target(
    fig_hexbin_interaction_path,
    save_report_figure(fig_hexbin_interaction, "fig_hexbin_interaction.png"),
    format = "file"
  ),
  tar_target(
    fig_missingness_path,
    save_report_figure(fig_missingness, "fig_missingness.png"),
    format = "file"
  ),
  tar_target(
    fig_num_cor_path,
    save_report_figure(fig_num_cor, "fig_num_cor.png"),
    format = "file"
  ),
  tar_target(
    fig_strategy_showdown_path,
    save_report_figure(
      fig_strategy_showdown,
      "fig_strategy_showdown.png",
      out_dir = "reports/figures"
    ),
    format = "file"
  ),
  tar_target(
    fig_final_conf_mat_path,
    save_report_figure(fig_final_conf_mat, "fig_final_conf_mat.png"),
    format = "file"
  ),
  tar_target(
    fig_final_curves_path,
    save_report_figure(fig_final_curves, "fig_final_curves.png"),
    format = "file"
  ),
  # ---- Saved Table Paths ----
  tar_target(
    tbl_fraud_by_month_path,
    save_report_table(tbl_fraud_by_month_gt, "tbl_fraud_by_month.rds"),
    format = "file"
  ),
  tar_target(
    tbl_strategy_showdown_path,
    save_report_table(
      tbl_strategy_showdown_gt,
      "tbl_strategy_showdown.rds",
      out_dir = "reports/tables"
    ),
    format = "file"
  ),
  # Unlike the other tables, this one saves the conf_mat object itself
  # rather than a *_gt formatted table.
  tar_target(
    tbl_final_conf_mat_path,
    save_report_table(
      final_conf_mat,
      "tbl_final_conf_mat.rds",
      out_dir = "reports/tables"
    ),
    format = "file"
  )
)

# ---- Report Assembly ----
report_targets <- list(
  # Bundles the saved artifact paths into a single file-tracked target.
  # NOTE(review): fig_final_conf_mat_path, fig_final_curves_path and
  # tbl_final_conf_mat_path are not listed here -- confirm the omission is
  # deliberate.
  tar_target(
    report_assets,
    c(
      fig_fraud_by_month_path,
      tbl_fraud_by_month_path,
      fig_strategy_showdown_path,
      tbl_strategy_showdown_path,
      fig_var_imp_path,
      fig_hexbin_interaction_path,
      fig_missingness_path,
      fig_num_cor_path
    ),
    format = "file"
  ),
  # Renders the Quarto document as a pipeline target.
  tar_quarto(
    report_slides,
    path = "index.qmd"
  )
)

# The script's final value is the target list; the per-stage lists are
# nested here in the same order in which the targets were originally declared.
list(
  lakehouse_targets,
  eda_targets,
  prevalence_targets,
  tournament_targets,
  evaluation_targets,
  production_targets,
  artifact_targets,
  report_targets
)