# Pipeline definition for the BAF fraud project, built with {targets} and
# {tarchetypes}. The script sets global target options, sources the project
# helpers, and ends with the list of targets that makes up the pipeline.
#
# Layout matters here: each call sits on its own line and every section
# header is a standalone comment line, so that no `#` header can swallow the
# target definitions that follow it.

library(targets)
library(tarchetypes)

# Packages made available to every target's command.
tar_option_set(
  packages = c(
    "arrow",
    "bonsai",
    "duckdb",
    "glue",
    "gt",
    "here",
    "lightgbm",
    "lubridate",
    "tidymodels",
    "tidyverse",
    "cowplot",
    "colorspace",
    "readr",
    "scales",
    "ggplot2",
    "quarto",
    "corrr",
    "recipes",
    "themis",
    "tidyselect"
  )
)

# Helper functions called by the targets below are defined in this file.
tar_source("./R/functions.R")

list(
  # ---- Data Layers (01_raw -> 04_feature) ----
  # Each layer target takes the previous layer's target as its input prefix,
  # which chains the stages together in the dependency graph.
  tar_target(
    baf_parquet_prefix,
    convert_to_parquet(
      from_prefix = "baf-fraud/01_raw",
      to_prefix = "baf-fraud/02_intermediate",
      bucket_name = "lake"
    )
  ),
  tar_target(
    baf_primary_prefix,
    clean_baf_base(
      in_prefix = baf_parquet_prefix,
      out_prefix = "baf-fraud/03_primary/variant=Base",
      bucket_name = "lake",
      partitioning = "month",
      existing_data_behavior = "delete_matching",
      verbose = TRUE
    )
  ),
  tar_target(
    baf_feature_prefix,
    engineer_features(
      in_prefix = baf_primary_prefix,
      out_prefix = "baf-fraud/04_feature/variant=Base",
      bucket_name = "lake",
      partitioning = "month",
      existing_data_behavior = "delete_matching",
      verbose = TRUE
    )
  ),

  # ---- 05_model_input Generation ----
  tar_target(
    baf_model_input_prefix,
    generate_model_inputs(
      feature_prefix = baf_feature_prefix,
      out_prefix = "baf-fraud/05_model_input",
      bucket_name = "lake"
    )
  ),

  # ---- EDA Layer ----
  # Only the rows with month == 0 of the primary layer are collected for EDA.
  tar_target(
    data_eda_m0,
    connect_baf(baf_primary_prefix, use_duckdb = TRUE) |>
      filter(month == 0) |>
      collect()
  ),
  tar_target(
    eda_recipe,
    build_eda_recipe(data_eda_m0)
  ),
  tar_target(
    data_baked_eda_m0,
    bake(eda_recipe, new_data = data_eda_m0)
  ),
  tar_target(
    diag_fit,
    train_diag_model(data_baked_eda_m0)
  ),

  # ---- EDA Figures ----
  tar_target(fig_var_imp, plot_var_imp(diag_fit)),
  tar_target(
    fig_hexbin_interaction,
    plot_hexbin_interaction(data_baked_eda_m0)
  ),
  tar_target(fig_missingness, plot_missingness(data_eda_m0)),
  tar_target(fig_num_cor, plot_num_cor(data_eda_m0)),

  # ---- Fraud Prevalence ----
  tar_target(
    fig_fraud_by_month,
    plot_fraud_by_month(baf_primary_prefix, bucket_name = "lake")
  ),
  tar_target(
    fraud_by_month_summary,
    compute_fraud_by_month(baf_primary_prefix)
  ),
  tar_target(
    tbl_fraud_by_month_gt,
    format_fraud_by_month_gt(fraud_by_month_summary)
  ),

  # ---- Tournament Inputs ----
  # One row per imbalance-handling strategy entered in the tournament.
  tar_target(
    imbalance_tasks,
    tibble::tribble(
      ~recipe_name, ~data_folder, ~scale_pos_weight,
      "Standard",   "baseline",   1,
      "Weighted",   "baseline",   4,
      "Under",      "under",      1,
      "Smote",      "smote",      1,
      "Adasyn",     "adasyn",     1,
      "Tomek",      "tomek",      1
    )
  ),
  # Rolling windows: three consecutive training months, then one test month.
  tar_target(
    imbalance_windows,
    tibble::tribble(
      ~window_id, ~train_months, ~test_month,
      "Window 1", c(0, 1, 2),    3,
      "Window 2", c(1, 2, 3),    4,
      "Window 3", c(2, 3, 4),    5
    )
  ),

  # ---- Hyperparameter Tuning ----
  # NOTE(review): the command names no data target, so a change in the
  # upstream data layers will not invalidate `best_params` -- confirm that
  # tune_lgbm() is meant to be independent of them.
  tar_target(
    best_params,
    tune_lgbm(imbalance_windows)
  ),

  # ---- Tournament Results ----
  tar_target(
    tbl_strategy_showdown,
    {
      # Mentioning `baf_model_input_prefix` makes it an upstream dependency
      # of this target even though it is not passed to the tournament call.
      # NOTE(review): the tournament receives `baf_feature_prefix`, not the
      # model-input prefix -- verify against run_imbalance_tournament().
      force(baf_model_input_prefix)
      run_imbalance_tournament(
        imbalance_tasks,
        imbalance_windows,
        baf_feature_prefix
      )
    }
  ),
  tar_target(
    fig_strategy_showdown,
    plot_efficiency(tbl_strategy_showdown)
  ),
  tar_target(
    tbl_strategy_showdown_gt,
    format_tournament_gt(tbl_strategy_showdown)
  ),

  # ---- Final Evaluation (Months 6-7) ----
  # NOTE(review): like `best_params`, this command names no data target --
  # confirm how evaluate_final_model() locates its evaluation data.
  tar_target(
    test_predictions,
    evaluate_final_model(params = best_params)
  ),
  tar_target(
    final_conf_mat,
    yardstick::conf_mat(test_predictions, truth, pred_class)
  ),
  tar_target(
    final_roc_curve,
    yardstick::roc_curve(test_predictions, truth, prob)
  ),
  tar_target(
    final_pr_curve,
    yardstick::pr_curve(test_predictions, truth, prob)
  ),
  # ROC and PR curves combined into a single figure.
  tar_target(
    fig_final_curves,
    {
      p1 <- ggplot2::autoplot(final_roc_curve) +
        ggplot2::labs(title = "ROC Curve (Months 6-7)")
      p2 <- ggplot2::autoplot(final_pr_curve) +
        ggplot2::labs(title = "PR Curve (Months 6-7)")
      cowplot::plot_grid(p1, p2)
    }
  ),
  tar_target(
    fig_final_conf_mat,
    plot_conf_mat_heatmap(final_conf_mat)
  ),

  # ---- Production Deployment ----
  # The production recipe and model are built from the full feature layer.
  tar_target(
    data_full,
    connect_baf(baf_feature_prefix, use_duckdb = TRUE) |>
      collect()
  ),
  tar_target(
    prod_recipe,
    build_baf_recipe(data_full)
  ),
  tar_target(
    production_model_uri,
    train_production_model(
      data = data_full,
      recipe = prod_recipe,
      best_params = best_params,
      model_filename = "baf_lgbm_prod_v1.txt"
    ),
    format = "rds"
  ),

  # ---- Saved Figure Paths ----
  # format = "file" targets: each command is expected to return the path of
  # the file it wrote, which {targets} then tracks.
  tar_target(
    fig_fraud_by_month_path,
    save_report_figure(fig_fraud_by_month, "fig_fraud_by_month.png"),
    format = "file"
  ),
  tar_target(
    fig_var_imp_path,
    save_report_figure(fig_var_imp, "fig_var_imp.png"),
    format = "file"
  ),
  tar_target(
    fig_hexbin_interaction_path,
    save_report_figure(
      fig_hexbin_interaction,
      "fig_hexbin_interaction.png"
    ),
    format = "file"
  ),
  tar_target(
    fig_missingness_path,
    save_report_figure(fig_missingness, "fig_missingness.png"),
    format = "file"
  ),
  tar_target(
    fig_num_cor_path,
    save_report_figure(fig_num_cor, "fig_num_cor.png"),
    format = "file"
  ),
  tar_target(
    fig_strategy_showdown_path,
    save_report_figure(
      fig_strategy_showdown,
      "fig_strategy_showdown.png",
      out_dir = "reports/figures"
    ),
    format = "file"
  ),
  tar_target(
    fig_final_conf_mat_path,
    save_report_figure(fig_final_conf_mat, "fig_final_conf_mat.png"),
    format = "file"
  ),
  tar_target(
    fig_final_curves_path,
    save_report_figure(fig_final_curves, "fig_final_curves.png"),
    format = "file"
  ),

  # ---- Saved Table Paths ----
  tar_target(
    tbl_fraud_by_month_path,
    save_report_table(tbl_fraud_by_month_gt, "tbl_fraud_by_month.rds"),
    format = "file"
  ),
  tar_target(
    tbl_strategy_showdown_path,
    save_report_table(
      tbl_strategy_showdown_gt,
      "tbl_strategy_showdown.rds",
      out_dir = "reports/tables"
    ),
    format = "file"
  ),
  tar_target(
    tbl_final_conf_mat_path,
    save_report_table(
      final_conf_mat,
      "tbl_final_conf_mat.rds",
      out_dir = "reports/tables"
    ),
    format = "file"
  ),

  # ---- Report Assembly ----
  # NOTE(review): the final-evaluation outputs (fig_final_conf_mat_path,
  # fig_final_curves_path, tbl_final_conf_mat_path) are not listed here --
  # confirm whether the report is meant to track them as assets too.
  tar_target(
    report_assets,
    c(
      fig_fraud_by_month_path,
      tbl_fraud_by_month_path,
      fig_strategy_showdown_path,
      tbl_strategy_showdown_path,
      fig_var_imp_path,
      fig_hexbin_interaction_path,
      fig_missingness_path,
      fig_num_cor_path
    ),
    format = "file"
  ),
  tar_quarto(
    report_slides,
    path = "index.qmd"
  )
)