Initial commit: BAF Lakehouse fraud detection pipeline
End-to-end LightGBM fraud detection pipeline built as an R package, orchestrated by targets with data stored in MinIO via Apache Arrow. Includes 6-layer Lakehouse architecture, class imbalance tournament, formally tuned hyperparameters (PR-AUC 0.198), and Quarto RevealJS slides. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
33
.Rbuildignore
Normal file
@@ -0,0 +1,33 @@
|
||||
^\.Rproj\.user$
|
||||
^.*\.Rproj$
|
||||
^\.Rhistory$
|
||||
^\.RData$
|
||||
^\.Renviron$
|
||||
^\.Rprofile$
|
||||
^Rprofile$
|
||||
|
||||
^renv$
|
||||
^renv\.lock$
|
||||
|
||||
^\.git$
|
||||
^\.github$
|
||||
|
||||
^_targets$
|
||||
^_targets\.R$
|
||||
|
||||
^reports$
|
||||
|
||||
^index\.qmd$
|
||||
^_quarto\.yml$
|
||||
^\.quarto$
|
||||
|
||||
^README\.Rmd$
|
||||
|
||||
^LICENSE\.md$
|
||||
^scratch$
|
||||
^TODO\.md$
|
||||
^dev$
|
||||
|
||||
^_pkgdown\.yml$
|
||||
^docs$
|
||||
^pkgdown$
^deploy\.R$
^ieee\.csl$
^references\.bib$
^\.dockerignore$
|
||||
1
.dockerignore
Normal file
@@ -0,0 +1 @@
|
||||
/dev/
|
||||
32
.gitignore
vendored
Normal file
@@ -0,0 +1,32 @@
|
||||
# --- RStudio & System Basics ---
|
||||
.Rproj.user
|
||||
.Rhistory
|
||||
.RData
|
||||
.Ruserdata
|
||||
.DS_Store
|
||||
|
||||
# --- SECURITY: Never commit these ---
|
||||
.Renviron
|
||||
.Rprofile
|
||||
*.secret
|
||||
credentials/
|
||||
|
||||
# --- DATA: The "Lake" Rule ---
|
||||
# Since data lives on Smaug/MinIO, we ignore any local data attempts
|
||||
data/*
|
||||
!data/README.md
|
||||
*.csv
|
||||
*.parquet
|
||||
*.rds
|
||||
|
||||
# --- WORKFLOW: targets pipeline store (local cache, never committed) ---
|
||||
_targets/
|
||||
_targets.user/data/
|
||||
.Renviron
|
||||
|
||||
/.quarto/
|
||||
**/*.quarto_ipynb
|
||||
docs
|
||||
|
||||
/dev/
|
||||
/scratch/
|
||||
34
DESCRIPTION
Normal file
@@ -0,0 +1,34 @@
|
||||
Package: baflakehouse
|
||||
Title: Lakehouse Workflow for the Bank Account Fraud Dataset
|
||||
Version: 0.0.0.9000
|
||||
Authors@R:
|
||||
person("Rob", "Wiederstein", role = c("aut", "cre"),
|
||||
email = "REPLACE_ME@example.com")
|
||||
Description: Tools to ingest the Bank Account Fraud (BAF) Base dataset into a
|
||||
MinIO/S3-backed lakehouse, clean encoded missing values, and produce
|
||||
reproducible reporting artifacts (tables, figures, slides) orchestrated with
|
||||
targets.
|
||||
License: MIT + file LICENSE
|
||||
Encoding: UTF-8
|
||||
Roxygen: list(markdown = TRUE)
|
||||
RoxygenNote: 7.3.3
|
||||
Imports:
|
||||
arrow,
|
||||
colorspace,
|
||||
cowplot,
|
||||
dplyr,
|
||||
tidyr,
|
||||
stringr,
|
||||
readr,
|
||||
gt,
|
||||
quarto,
|
||||
ggplot2,
|
||||
bonsai,
corrr,
ggrepel,
glue,
lightgbm,
lubridate,
parsnip,
recipes,
scales,
stats,
themis,
tidyselect,
utils,
workflows,
yardstick
Suggests:
duckdb,
targets,
tarchetypes,
knitr
|
||||
URL: https://docs.robwiederstein.org/baflakehouse
|
||||
BugReports: https://git.robwiederstein.org/rkw/bank-fraud-baf-lakehouse/issues
|
||||
21
LICENSE.md
Normal file
@@ -0,0 +1,21 @@
|
||||
# MIT License
|
||||
|
||||
Copyright (c) 2026 Rob Wiederstein
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
139
NAMESPACE
Normal file
@@ -0,0 +1,139 @@
|
||||
# Generated by roxygen2: do not edit by hand
|
||||
|
||||
export(build_baf_recipe)
|
||||
export(clean_baf_base)
|
||||
export(compute_fraud_by_month)
|
||||
export(connect_baf)
|
||||
export(convert_to_parquet)
|
||||
export(engineer_features)
|
||||
export(evaluate_final_model)
|
||||
export(format_class_imbalance_tourney_gt)
|
||||
export(format_fraud_by_month_gt)
|
||||
export(generate_model_inputs)
|
||||
export(plot_conf_mat_heatmap)
|
||||
export(plot_fraud_by_month)
|
||||
export(plot_hexbin_interaction)
|
||||
export(plot_missingness)
|
||||
export(plot_num_cor)
|
||||
export(plot_var_imp)
|
||||
export(prepare_eda_recipe)
|
||||
export(render_slides)
|
||||
export(run_imbalance_tournament)
|
||||
export(save_report_figure)
|
||||
export(save_report_table)
|
||||
export(train_diag_model)
|
||||
export(train_production_model)
|
||||
importFrom(arrow,S3FileSystem)
|
||||
importFrom(arrow,open_dataset)
|
||||
importFrom(arrow,read_csv_arrow)
|
||||
importFrom(arrow,s3_bucket)
|
||||
importFrom(arrow,to_duckdb)
|
||||
importFrom(arrow,write_dataset)
|
||||
importFrom(arrow,write_parquet)
|
||||
importFrom(colorspace,qualitative_hcl)
|
||||
importFrom(colorspace,scale_color_discrete_qualitative)
|
||||
importFrom(colorspace,scale_fill_continuous_diverging)
|
||||
importFrom(colorspace,scale_fill_continuous_sequential)
|
||||
importFrom(corrr,correlate)
|
||||
importFrom(corrr,rearrange)
|
||||
importFrom(corrr,shave)
|
||||
importFrom(corrr,stretch)
|
||||
importFrom(cowplot,background_grid)
|
||||
importFrom(cowplot,theme_cowplot)
|
||||
importFrom(cowplot,theme_half_open)
|
||||
importFrom(cowplot,theme_minimal_grid)
|
||||
importFrom(cowplot,theme_minimal_vgrid)
|
||||
importFrom(dplyr,`%>%`)
|
||||
importFrom(dplyr,across)
|
||||
importFrom(dplyr,any_of)
|
||||
importFrom(dplyr,arrange)
|
||||
importFrom(dplyr,bind_rows)
|
||||
importFrom(dplyr,case_when)
|
||||
importFrom(dplyr,collect)
|
||||
importFrom(dplyr,desc)
|
||||
importFrom(dplyr,everything)
|
||||
importFrom(dplyr,filter)
|
||||
importFrom(dplyr,group_by)
|
||||
importFrom(dplyr,if_else)
|
||||
importFrom(dplyr,mutate)
|
||||
importFrom(dplyr,n)
|
||||
importFrom(dplyr,pull)
|
||||
importFrom(dplyr,rename)
|
||||
importFrom(dplyr,select)
|
||||
importFrom(dplyr,slice_max)
|
||||
importFrom(dplyr,slice_sample)
|
||||
importFrom(dplyr,summarise)
|
||||
importFrom(dplyr,summarize)
|
||||
importFrom(dplyr,tbl_vars)
|
||||
importFrom(dplyr,ungroup)
|
||||
importFrom(ggplot2,aes)
|
||||
importFrom(ggplot2,autoplot)
|
||||
importFrom(ggplot2,coord_flip)
|
||||
importFrom(ggplot2,element_blank)
|
||||
importFrom(ggplot2,element_text)
|
||||
importFrom(ggplot2,expansion)
|
||||
importFrom(ggplot2,geom_line)
|
||||
importFrom(ggplot2,geom_linerange)
|
||||
importFrom(ggplot2,geom_point)
|
||||
importFrom(ggplot2,geom_segment)
|
||||
importFrom(ggplot2,geom_text)
|
||||
importFrom(ggplot2,geom_tile)
|
||||
importFrom(ggplot2,ggplot)
|
||||
importFrom(ggplot2,ggsave)
|
||||
importFrom(ggplot2,labs)
|
||||
importFrom(ggplot2,position_dodge)
|
||||
importFrom(ggplot2,scale_color_manual)
|
||||
importFrom(ggplot2,scale_fill_gradient)
|
||||
importFrom(ggplot2,scale_y_continuous)
|
||||
importFrom(ggplot2,scale_y_log10)
|
||||
importFrom(ggplot2,stat_summary_hex)
|
||||
importFrom(ggplot2,theme)
|
||||
importFrom(ggplot2,theme_minimal)
|
||||
importFrom(ggrepel,geom_text_repel)
|
||||
importFrom(glue,glue)
|
||||
importFrom(gt,cols_label)
|
||||
importFrom(gt,data_color)
|
||||
importFrom(gt,fmt_number)
|
||||
importFrom(gt,gt)
|
||||
importFrom(gt,tab_header)
|
||||
importFrom(gt,tab_options)
|
||||
importFrom(lightgbm,lgb.Dataset)
|
||||
importFrom(lightgbm,lgb.importance)
|
||||
importFrom(lightgbm,lgb.save)
|
||||
importFrom(lightgbm,lgb.train)
|
||||
importFrom(lubridate,"%m+%")
|
||||
importFrom(parsnip,boost_tree)
|
||||
importFrom(parsnip,set_engine)
|
||||
importFrom(parsnip,set_mode)
|
||||
importFrom(quarto,quarto_render)
|
||||
importFrom(readr,write_rds)
|
||||
importFrom(recipes,all_nominal_predictors)
|
||||
importFrom(recipes,all_numeric_predictors)
|
||||
importFrom(recipes,all_predictors)
|
||||
importFrom(recipes,bake)
|
||||
importFrom(recipes,prep)
|
||||
importFrom(recipes,recipe)
|
||||
importFrom(recipes,step_dummy)
|
||||
importFrom(recipes,step_impute_median)
|
||||
importFrom(recipes,step_indicate_na)
|
||||
importFrom(recipes,step_novel)
|
||||
importFrom(recipes,step_unknown)
|
||||
importFrom(recipes,step_zv)
|
||||
importFrom(recipes,update_role)
|
||||
importFrom(scales,percent)
|
||||
importFrom(stats,reorder)
|
||||
importFrom(stats,sd)
|
||||
importFrom(stats,t.test)
|
||||
importFrom(stringr,str_remove)
|
||||
importFrom(stringr,str_replace_all)
|
||||
importFrom(themis,adasyn)
|
||||
importFrom(themis,smote)
|
||||
importFrom(themis,step_tomek)
|
||||
importFrom(tidyr,pivot_longer)
|
||||
importFrom(tidyselect,where)
|
||||
importFrom(workflows,add_model)
|
||||
importFrom(workflows,add_recipe)
|
||||
importFrom(workflows,extract_fit_engine)
|
||||
importFrom(workflows,fit)
|
||||
importFrom(workflows,workflow)
|
||||
importFrom(yardstick,pr_auc)
|
||||
9
R/baflakehouse-package.R
Normal file
@@ -0,0 +1,9 @@
|
||||
#' baflakehouse: Lakehouse Workflow for the Bank Account Fraud Dataset
#'
#' Tools to ingest the Bank Account Fraud (BAF) Base dataset into a MinIO/S3-backed
#' lakehouse, clean encoded missing values, and produce reproducible reporting
#' artifacts orchestrated with targets.
#'
# Documenting the "_PACKAGE" sentinel lets roxygen2 generate the
# `baflakehouse-package` topic (name, alias and docType) on its own, replacing
# the invalid `@docType _PACKAGE` tag and the hand-written `@name`.
"_PACKAGE"
|
||||
1210
R/functions.R
Normal file
10
R/zzz.R
Normal file
@@ -0,0 +1,10 @@
|
||||
# Register the bare column names that the package refers to through dplyr's
# non-standard evaluation, so R CMD check does not flag them as undefined
# global variables.
utils::globalVariables(
  c(
    "fraud_bool",
    "outcome",
    "month",
    "Fraud",
    "Legit",
    "Total",
    "Pct_Fraud",
    "Outcome",
    "Month",
    "prev_address_months_count",
    "current_address_months_count",
    "bank_months_count",
    "session_length_in_minutes",
    "device_distinct_emails_8w",
    "device_distinct_emails",
    "intended_balcon_amount"
  )
)
|
||||
76
README.md
Normal file
@@ -0,0 +1,76 @@
|
||||
---
|
||||
output: github_document
|
||||
---
|
||||
|
||||
- [baflakehouse](#baflakehouse)
|
||||
- [About](#about)
|
||||
- [Results](#results)
|
||||
- [Clone](#clone)
|
||||
- [Acknowledgements](#acknowledgements)
|
||||
- [Citation](#citation)
|
||||
|
||||
# baflakehouse
|
||||
|
||||
## About
|
||||
|
||||
The baflakehouse package is an end-to-end machine learning pipeline built to detect fraudulent bank account applications. Rather than relying on static local files, it implements a modern Lakehouse architecture. It ingests a 1-million-row dataset, partitions it into Parquet files via Apache Arrow, stores it on a MinIO object server, and trains a production-ready LightGBM model orchestrated entirely by the targets package.
|
||||
### Significance
|
||||
|
||||
Financial fraud datasets suffer from extreme class imbalance, making traditional accuracy metrics highly misleading. This pipeline is engineered specifically to handle that imbalance without aggressive synthetic oversampling.
|
||||
|
||||
## Pipeline
|
||||
|
||||
The pipeline is orchestrated by the `targets` package and executes as a reproducible DAG. All data is stored remotely in MinIO and accessed via Apache Arrow — no local CSVs or intermediate files on disk.
|
||||
|
||||
**Layer 01 → 02 | Ingest**
|
||||
Raw CSVs are read from `baf-fraud/01_raw` and converted to Hive-partitioned Parquet files in `02_intermediate` using Arrow's `write_dataset()`.
|
||||
|
||||
**Layer 02 → 03 | Clean**
|
||||
Sentinel values (`-1`) are recoded to `NA`, the binary outcome is relabelled from `fraud_bool` to `outcome` ("Fraud"/"Legit"), and the cleaned data is written to `03_primary` partitioned by month.
|
||||
|
||||
**Layer 03 → 04 | Feature Engineering**
|
||||
A missingness count feature (`n_missing`) is computed out-of-memory via Arrow compute and written to `04_feature`.
|
||||
|
||||
**Layer 04 → 05 | Resampling**
|
||||
Five versions of each monthly slice are generated — Baseline, Undersampling, SMOTE, ADASYN, and Tomek Links — and saved to `05_model_input`.
|
||||
|
||||
**Imbalance Tournament**
|
||||
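LightGBM models are trained across six strategies — one per dataset above (Standard uses the Baseline data) plus a Weighted variant that fits the Baseline data with `scale_pos_weight = 4` — using three sliding time windows (train on months t, t+1, t+2; test on t+3). Strategies are ranked by PR-AUC and evaluated for statistical significance via paired t-test against the Standard baseline.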
|
||||
|
||||
| Strategy | PR-AUC | Avg Train Time (s) | Sig. vs Standard |
|
||||
|---|---|---|---|
|
||||
| Standard | 0.1650 | 2.19 | — |
|
||||
| ADASYN | 0.1629 | 3.87 | No (p = 0.37) |
|
||||
| SMOTE | 0.1617 | 3.79 | No (p = 0.15) |
|
||||
| Weighted | 0.1577 | 2.18 | No (p = 0.15) |
|
||||
| Tomek | 0.1483 | 2.16 | **Yes (p = 0.009)** |
|
||||
| Undersampling | 0.1394 | 0.92 | **Yes (p = 0.029)** |
|
||||
|
||||
The Standard baseline achieves the highest PR-AUC. ADASYN, SMOTE, and Weighted are not significantly different from it, and SMOTE and ADASYN nearly double training time for no gain. Tomek Links and Undersampling significantly *hurt* performance and are discarded.
|
||||
|
||||
**Layer 05 → 06 | Production**
|
||||
The winning Standard strategy is retrained on months 0–5, evaluated on the held-out months 6–7, serialised to `baf_lgbm_prod_v1.txt`, and uploaded to `baf-fraud/06_models` in MinIO.
|
||||
|
||||
**Reporting**
|
||||
All figures and tables are written to `reports/` and assembled into a Quarto RevealJS slide deck via `tar_quarto()`.
|
||||
|
||||
## Results
|
||||
|
||||
By leveraging LightGBM's native cost-sensitive learning (scale_pos_weight) and leaf-wise tree growth, the production model achieves an elite ~49.1% Recall at a strict 5% False Positive Rate (FPR). It maximizes the detection of fraudulent applications while minimizing the number of legitimate customers flagged for manual review.
|
||||
|
||||
## Clone
|
||||
|
||||
To replicate this pipeline locally, you will need to clone the repository and set up your MinIO environment variables.
|
||||
|
||||
```
|
||||
git clone https://git.robwiederstein.org/rkw/bank-fraud-baf-lakehouse.git
|
||||
```
|
||||
|
||||
Once your `.Renviron` is configured with your `BAF_KEY`, `BAF_SECRET`, and `BAF_ENDPOINT`, you can execute the entire DAG:

```r
targets::tar_make()
```
|
||||
|
||||
|
||||
## Acknowledgements
|
||||
|
||||
This project utilizes the Bank Account Fraud (BAF) dataset, originally published and presented at NeurIPS 2022. It is a massive, privacy-preserving suite of realistic tabular data designed specifically for evaluating fairness and performance in machine learning fraud detection.
|
||||
|
||||
## Citation
|
||||
57
_pkgdown.yml
Normal file
@@ -0,0 +1,57 @@
|
||||
url: https://docs.robwiederstein.org/baflakehouse
|
||||
|
||||
template:
|
||||
bootstrap: 5
|
||||
bootswatch: flatly # Clean, professional look
|
||||
|
||||
navbar:
|
||||
structure:
|
||||
left: [intro, reference, articles, presentation]
|
||||
components:
|
||||
presentation:
|
||||
text: "Slides"
|
||||
icon: fa-person-chalkboard
|
||||
href: slides/index.html
|
||||
|
||||
reference:
|
||||
- title: "Data Ingestion & Lakehouse Setup"
|
||||
desc: "Functions for moving data from CSV to partitioned Parquet in MinIO."
|
||||
contents:
|
||||
- baflakehouse-package
|
||||
- convert_to_parquet
|
||||
- connect_baf
|
||||
- clean_baf_base
|
||||
|
||||
- title: "Feature Engineering & Preprocessing"
|
||||
desc: "The 'Recipes' layer of the pipeline."
|
||||
contents:
|
||||
- engineer_features
|
||||
- prepare_eda_recipe
|
||||
- build_baf_recipe # NEW: Untrained blueprint for production
|
||||
- generate_model_inputs
|
||||
|
||||
- title: "The Tournament (Model Selection)"
|
||||
desc: "Cross-validation and imbalance strategy testing."
|
||||
contents:
|
||||
- run_imbalance_tournament
|
||||
- train_diag_model
|
||||
- create_efficiency_plot # Moved here: Belongs with the tournament
|
||||
|
||||
- title: "Final Evaluation & Production Deployment"
|
||||
desc: "Results on unseen data (Months 6-7) and MinIO artifact serialization."
|
||||
contents:
|
||||
- evaluate_final_model
|
||||
- train_production_model # NEW: The final deployment function
|
||||
|
||||
- title: "Reporting: Tables & Visualizations"
|
||||
desc: "Generating ggplot2 figures and gt tables for Quarto."
|
||||
contents:
|
||||
- starts_with("plot_")
|
||||
- starts_with("compute_")
|
||||
- starts_with("format_") # Neatly catches all your gt table formatters
|
||||
|
||||
- title: "Pipeline Utilities"
|
||||
desc: "Internal helpers for the targets workflow and slide generation."
|
||||
contents:
|
||||
- starts_with("save_report_")
|
||||
- render_slides # Consolidated here
|
||||
24
_quarto.yml
Normal file
@@ -0,0 +1,24 @@
|
||||
project:
|
||||
type: default
|
||||
output-dir: reports/slides
|
||||
render:
|
||||
- index.qmd
|
||||
|
||||
format:
|
||||
revealjs:
|
||||
theme: simple
|
||||
slide-number: true
|
||||
incremental: true
|
||||
controls: true
|
||||
bibliography: references.bib
|
||||
csl: ieee.csl
|
||||
|
||||
execute:
|
||||
echo: false
|
||||
warning: false
|
||||
message: false
|
||||
|
||||
knitr:
|
||||
opts_chunk:
|
||||
out.width: "100%"
|
||||
fig.align: "center"
|
||||
327
_targets.R
Normal file
@@ -0,0 +1,327 @@
|
||||
library(targets)
|
||||
library(tarchetypes)
|
||||
|
||||
tar_option_set(
|
||||
packages = c(
|
||||
"arrow",
|
||||
"bonsai",
|
||||
"duckdb",
|
||||
"glue",
|
||||
"gt",
|
||||
"here",
|
||||
"lightgbm",
|
||||
"lubridate",
|
||||
"tidymodels",
|
||||
"tidyverse",
|
||||
"cowplot",
|
||||
"colorspace",
|
||||
"readr",
|
||||
"scales",
|
||||
"ggplot2",
|
||||
"quarto",
|
||||
"corrr",
|
||||
"recipes",
|
||||
"themis",
|
||||
"tidyselect"
|
||||
)
|
||||
)
|
||||
|
||||
tar_source("./R/functions.R")
|
||||
|
||||
list(
|
||||
tar_target(
|
||||
baf_parquet_prefix,
|
||||
convert_to_parquet(
|
||||
from_prefix = "01_raw",
|
||||
to_prefix = "02_intermediate",
|
||||
bucket_name = "baf-fraud"
|
||||
)
|
||||
),
|
||||
|
||||
tar_target(
|
||||
baf_primary_prefix,
|
||||
clean_baf_base(
|
||||
in_prefix = baf_parquet_prefix,
|
||||
out_prefix = "03_primary/variant=Base",
|
||||
bucket_name = "baf-fraud",
|
||||
partitioning = "month",
|
||||
existing_data_behavior = "delete_matching",
|
||||
verbose = TRUE
|
||||
)
|
||||
),
|
||||
|
||||
tar_target(
|
||||
baf_feature_prefix,
|
||||
engineer_features(
|
||||
in_prefix = baf_primary_prefix,
|
||||
out_prefix = "04_feature/variant=Base",
|
||||
bucket_name = "baf-fraud",
|
||||
partitioning = "month",
|
||||
existing_data_behavior = "delete_matching",
|
||||
verbose = TRUE
|
||||
)
|
||||
),
|
||||
|
||||
# ---- Figure objects ----
|
||||
tar_target(
|
||||
fig_fraud_by_month,
|
||||
plot_fraud_by_month(baf_primary_prefix, bucket_name = "baf-fraud")
|
||||
),
|
||||
|
||||
# ---- Saved figure path (file target) ----
|
||||
tar_target(
|
||||
fig_fraud_by_month_path,
|
||||
save_report_figure(
|
||||
fig_fraud_by_month,
|
||||
filename = "fig_fraud_by_month.png",
|
||||
out_dir = "reports/figures"
|
||||
),
|
||||
format = "file"
|
||||
),
|
||||
tar_target(
|
||||
tbl_fraud_by_month_data,
|
||||
compute_fraud_by_month(baf_primary_prefix)
|
||||
),
|
||||
|
||||
tar_target(
|
||||
tbl_fraud_by_month_gt,
|
||||
format_fraud_by_month_gt(tbl_fraud_by_month_data)
|
||||
),
|
||||
|
||||
tar_target(
|
||||
tbl_fraud_by_month_path,
|
||||
save_report_table(tbl_fraud_by_month_gt, filename = "tbl_fraud_by_month.rds"),
|
||||
format = "file"
|
||||
),
|
||||
|
||||
# ---- Exploratory Data Analysis (EDA) Layer ----
|
||||
tar_target(
|
||||
data_eda_m0,
|
||||
connect_baf(baf_primary_prefix, use_duckdb = TRUE) |>
|
||||
filter(month == 0) |>
|
||||
collect()
|
||||
),
|
||||
|
||||
tar_target(
|
||||
eda_recipe,
|
||||
prepare_eda_recipe(data_eda_m0)
|
||||
),
|
||||
|
||||
tar_target(
|
||||
data_baked_eda_m0,
|
||||
bake(eda_recipe, new_data = data_eda_m0)
|
||||
),
|
||||
|
||||
tar_target(
|
||||
model_diag,
|
||||
train_diag_model(data_baked_eda_m0)
|
||||
),
|
||||
|
||||
# ---- EDA Figures ----
|
||||
tar_target(fig_var_imp, plot_var_imp(model_diag)),
|
||||
tar_target(fig_hexbin_interaction, plot_hexbin_interaction(data_baked_eda_m0)),
|
||||
tar_target(fig_missingness, plot_missingness(data_eda_m0)),
|
||||
tar_target(fig_num_cor, plot_num_cor(data_eda_m0)),
|
||||
|
||||
# ---- Saved EDA Figure Paths ----
|
||||
tar_target(
|
||||
fig_var_imp_path,
|
||||
save_report_figure(fig_var_imp, "fig_var_imp.png"),
|
||||
format = "file"
|
||||
),
|
||||
tar_target(
|
||||
fig_hexbin_interaction_path,
|
||||
save_report_figure(fig_hexbin_interaction, "fig_hexbin_interaction.png"),
|
||||
format = "file"
|
||||
),
|
||||
tar_target(
|
||||
fig_missingness_path,
|
||||
save_report_figure(fig_missingness, "fig_missingness.png"),
|
||||
format = "file"
|
||||
),
|
||||
tar_target(
|
||||
fig_num_cor_path,
|
||||
save_report_figure(fig_num_cor, "fig_num_cor.png"),
|
||||
format = "file"
|
||||
),
|
||||
|
||||
# ---- 05_model_input Generation ----
|
||||
tar_target(
|
||||
model_inputs_prefix,
|
||||
generate_model_inputs(
|
||||
feature_prefix = baf_feature_prefix,
|
||||
out_prefix = "05_model_input",
|
||||
bucket_name = "baf-fraud"
|
||||
)
|
||||
),
|
||||
|
||||
# ---- Tournament Inputs ----
|
||||
tar_target(
|
||||
imbalance_tasks,
|
||||
tibble::tribble(
|
||||
~recipe_name, ~data_folder, ~scale_pos_weight,
|
||||
"Standard", "baseline", 1,
|
||||
"Weighted", "baseline", 4,
|
||||
"Under", "under", 1,
|
||||
"Smote", "smote", 1,
|
||||
"Adasyn", "adasyn", 1,
|
||||
"Tomek", "tomek", 1
|
||||
)
|
||||
),
|
||||
|
||||
tar_target(
|
||||
imbalance_windows,
|
||||
tibble::tribble(
|
||||
~window_id, ~train_months, ~test_month,
|
||||
"Window 1", c(0, 1, 2), 3,
|
||||
"Window 2", c(1, 2, 3), 4,
|
||||
"Window 3", c(2, 3, 4), 5
|
||||
)
|
||||
),
|
||||
|
||||
# ---- 1. Data Layer (The Tournament Results) ----
|
||||
tar_target(
|
||||
tbl_strategy_showdown,
|
||||
{
|
||||
# Force DAG to wait for the folders to be generated
|
||||
force(model_inputs_prefix)
|
||||
# Pass baf_feature_prefix so it tracks the latest layer
|
||||
run_imbalance_tournament(imbalance_tasks, imbalance_windows, baf_feature_prefix)
|
||||
}
|
||||
),
|
||||
|
||||
# ---- 2. Figure Layer ----
|
||||
tar_target(
|
||||
fig_strategy_showdown,
|
||||
create_efficiency_plot(tbl_strategy_showdown)
|
||||
),
|
||||
|
||||
tar_target(
|
||||
fig_strategy_showdown_path,
|
||||
save_report_figure(
|
||||
fig_strategy_showdown,
|
||||
filename = "fig_strategy_showdown.png",
|
||||
out_dir = "reports/figures"
|
||||
),
|
||||
format = "file"
|
||||
),
|
||||
|
||||
# ---- 3. Table Layer (gt object) ----
|
||||
tar_target(
|
||||
tbl_strategy_showdown_gt,
|
||||
format_class_imbalance_tourney_gt(tbl_strategy_showdown)
|
||||
),
|
||||
|
||||
tar_target(
|
||||
tbl_strategy_showdown_path,
|
||||
save_report_table(
|
||||
tbl_strategy_showdown_gt,
|
||||
filename = "tbl_strategy_showdown.rds",
|
||||
out_dir = "reports/tables"
|
||||
),
|
||||
format = "file"
|
||||
),
|
||||
|
||||
# ---- Final Production Evaluation ----
|
||||
tar_target(
|
||||
final_eval_data,
|
||||
evaluate_final_model(params = winning_params)
|
||||
),
|
||||
|
||||
tar_target(
|
||||
final_conf_mat,
|
||||
yardstick::conf_mat(final_eval_data, truth, pred_class)
|
||||
),
|
||||
|
||||
tar_target(
|
||||
final_roc_curve,
|
||||
yardstick::roc_curve(final_eval_data, truth, prob)
|
||||
),
|
||||
|
||||
tar_target(
|
||||
final_pr_curve,
|
||||
yardstick::pr_curve(final_eval_data, truth, prob)
|
||||
),
|
||||
|
||||
# ---- Save Final Assets ----
|
||||
tar_target(
|
||||
fig_final_curves,
|
||||
{
|
||||
p1 <- ggplot2::autoplot(final_roc_curve) + ggplot2::labs(title = "ROC Curve (Months 6-7)")
|
||||
p2 <- ggplot2::autoplot(final_pr_curve) + ggplot2::labs(title = "PR Curve (Months 6-7)")
|
||||
cowplot::plot_grid(p1, p2)
|
||||
}
|
||||
),
|
||||
|
||||
tar_target(
|
||||
fig_final_curves_path,
|
||||
save_report_figure(fig_final_curves, "fig_final_curves.png"),
|
||||
format = "file"
|
||||
),
|
||||
|
||||
tar_target(
|
||||
tbl_final_conf_mat_path,
|
||||
save_report_table(final_conf_mat, "tbl_final_conf_mat.rds", out_dir = "reports/tables"),
|
||||
format = "file"
|
||||
),
|
||||
# ---- Generate and Save Heatmap ----
|
||||
tar_target(
|
||||
fig_final_conf_mat,
|
||||
plot_conf_mat_heatmap(final_conf_mat)
|
||||
),
|
||||
|
||||
tar_target(
|
||||
fig_final_conf_mat_path,
|
||||
save_report_figure(fig_final_conf_mat, "fig_final_conf_mat.png"),
|
||||
format = "file"
|
||||
),
|
||||
# ---- Report Dependency Update ----
|
||||
# Aggregate the files written under reports/ into a single file-format target.
# With format = "file", targets tracks the files at the returned paths, so this
# vector should list every saved figure and table.
tar_target(
  report_assets,
  c(
    fig_fraud_by_month_path,
    tbl_fraud_by_month_path,
    fig_strategy_showdown_path,
    tbl_strategy_showdown_path,
    fig_var_imp_path,
    fig_hexbin_interaction_path,
    fig_missingness_path,
    fig_num_cor_path,
    # Final-evaluation assets: saved by their own targets above but previously
    # missing from this aggregate.
    fig_final_curves_path,
    tbl_final_conf_mat_path,
    fig_final_conf_mat_path
  ),
  format = "file"
),
|
||||
|
||||
tar_quarto(
|
||||
report_slides,
|
||||
path = "index.qmd"
|
||||
),
|
||||
# production model deployment
|
||||
tar_target(
|
||||
data_full,
|
||||
connect_baf(baf_feature_prefix, use_duckdb = TRUE) |>
|
||||
collect()
|
||||
),
|
||||
tar_target(
|
||||
production_recipe_blueprint,
|
||||
build_baf_recipe(data_full)
|
||||
),
|
||||
tar_target(
|
||||
winning_params,
|
||||
list(
|
||||
trees = 844,
|
||||
tree_depth = 3,
|
||||
learn_rate = 0.0204,
|
||||
min_n = 389
|
||||
)
|
||||
),
|
||||
tar_target(
|
||||
production_model_uri,
|
||||
train_production_model(
|
||||
data = data_full,
|
||||
recipe = production_recipe_blueprint, # <--- Pass the untrained blueprint!
|
||||
best_params = winning_params,
|
||||
model_filename = "baf_lgbm_prod_v1.txt"
|
||||
),
|
||||
format = "rds"
|
||||
)
|
||||
)
|
||||
22
deploy.R
Normal file
@@ -0,0 +1,22 @@
|
||||
# deploy.R
#
# Release script, run top to bottom:
#   1. regenerate Rd files and NAMESPACE,
#   2. run the targets pipeline,
#   3. build the pkgdown site,
#   4. copy the rendered slides and their figures into docs/.

# ---- Step 1: documentation -------------------------------------------------
message("📝 1. Updating package documentation and namespace...")
devtools::document()

# ---- Step 2: pipeline ------------------------------------------------------
message("🚀 2. Running targets pipeline...")
targets::tar_make()

# ---- Step 3: website -------------------------------------------------------
message("🏗️ 3. Building pkgdown website...")
pkgdown::build_site()

# ---- Step 4: slides --------------------------------------------------------
message("📦 4. Injecting slides into public docs/ folder...")

# Make sure the nested figures directory exists before anything is copied.
fs::dir_create(path = "docs/slides/reports/figures")

# Rendered slide deck (HTML plus its index_files support directory).
fs::dir_copy(
  path = "reports/slides/",
  new_path = "docs/slides/",
  overwrite = TRUE
)

# Figures, copied under docs/slides/ so the slides can see them.
fs::dir_copy(
  path = "reports/figures/",
  new_path = "docs/slides/reports/figures/",
  overwrite = TRUE
)

message("✅ DONE: Site and slides successfully deployed to docs/")
|
||||
519
ieee.csl
Normal file
@@ -0,0 +1,519 @@
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<style xmlns="http://purl.org/net/xbiblio/csl" class="in-text" version="1.0" demote-non-dropping-particle="sort-only">
|
||||
<info>
|
||||
<title>IEEE Reference Guide version 11.29.2023</title>
|
||||
<title-short>Institute of Electrical and Electronics Engineers</title-short>
|
||||
<id>http://www.zotero.org/styles/ieee</id>
|
||||
<link href="http://www.zotero.org/styles/ieee" rel="self"/>
|
||||
<link href="https://journals.ieeeauthorcenter.ieee.org/your-role-in-article-production/ieee-editorial-style-manual/" rel="documentation"/>
|
||||
<author>
|
||||
<name>Michael Berkowitz</name>
|
||||
<email>mberkowi@gmu.edu</email>
|
||||
</author>
|
||||
<contributor>
|
||||
<name>Julian Onions</name>
|
||||
<email>julian.onions@gmail.com</email>
|
||||
</contributor>
|
||||
<contributor>
|
||||
<name>Rintze Zelle</name>
|
||||
<uri>http://twitter.com/rintzezelle</uri>
|
||||
</contributor>
|
||||
<contributor>
|
||||
<name>Stephen Frank</name>
|
||||
<uri>http://www.zotero.org/sfrank</uri>
|
||||
</contributor>
|
||||
<contributor>
|
||||
<name>Sebastian Karcher</name>
|
||||
</contributor>
|
||||
<contributor>
|
||||
<name>Giuseppe Silano</name>
|
||||
<email>g.silano89@gmail.com</email>
|
||||
<uri>http://giuseppesilano.net</uri>
|
||||
</contributor>
|
||||
<contributor>
|
||||
<name>Patrick O'Brien</name>
|
||||
</contributor>
|
||||
<contributor>
|
||||
<name>Brenton M. Wiernik</name>
|
||||
</contributor>
|
||||
<contributor>
|
||||
<name>Oliver Couch</name>
|
||||
<email>oliver.couch@gmail.com</email>
|
||||
</contributor>
|
||||
<contributor>
|
||||
<name>Andrew Dunning</name>
|
||||
<uri>https://orcid.org/0000-0003-0464-5036</uri>
|
||||
</contributor>
|
||||
<category citation-format="numeric"/>
|
||||
<category field="engineering"/>
|
||||
<category field="generic-base"/>
|
||||
<summary>IEEE style as per the 2023 guidelines.</summary>
|
||||
<updated>2024-03-27T11:41:27+00:00</updated>
|
||||
<rights license="http://creativecommons.org/licenses/by-sa/3.0/">This work is licensed under a Creative Commons Attribution-ShareAlike 3.0 License</rights>
|
||||
</info>
|
||||
<locale xml:lang="en">
|
||||
<date form="text">
|
||||
<date-part name="month" form="short" suffix=" "/>
|
||||
<date-part name="day" form="numeric-leading-zeros" suffix=", "/>
|
||||
<date-part name="year"/>
|
||||
</date>
|
||||
<terms>
|
||||
<term name="chapter" form="short">ch.</term>
|
||||
<term name="chapter-number" form="short">ch.</term>
|
||||
<term name="presented at">presented at the</term>
|
||||
<term name="available at">available</term>
|
||||
<!-- always use three-letter abbreviations for months -->
|
||||
<term name="month-06" form="short">Jun.</term>
|
||||
<term name="month-07" form="short">Jul.</term>
|
||||
<term name="month-09" form="short">Sep.</term>
|
||||
</terms>
|
||||
</locale>
|
||||
<!-- Macros -->
|
||||
<macro name="status">
|
||||
<choose>
|
||||
<if variable="page issue volume" match="none">
|
||||
<text variable="status" text-case="capitalize-first" suffix="" font-weight="bold"/>
|
||||
</if>
|
||||
</choose>
|
||||
</macro>
|
||||
<macro name="edition">
|
||||
<choose>
|
||||
<if type="bill book chapter graphic legal_case legislation motion_picture paper-conference report song" match="any">
|
||||
<choose>
|
||||
<if is-numeric="edition">
|
||||
<group delimiter=" ">
|
||||
<number variable="edition" form="ordinal"/>
|
||||
<text term="edition" form="short"/>
|
||||
</group>
|
||||
</if>
|
||||
<else>
|
||||
<text variable="edition" text-case="capitalize-first" suffix="."/>
|
||||
</else>
|
||||
</choose>
|
||||
</if>
|
||||
</choose>
|
||||
</macro>
|
||||
<macro name="issued">
|
||||
<choose>
|
||||
<if type="article-journal report" match="any">
|
||||
<date variable="issued">
|
||||
<date-part name="month" form="short" suffix=" "/>
|
||||
<date-part name="year" form="long"/>
|
||||
</date>
|
||||
</if>
|
||||
<else-if type="bill book chapter graphic legal_case legislation song thesis" match="any">
|
||||
<date variable="issued">
|
||||
<date-part name="year" form="long"/>
|
||||
</date>
|
||||
</else-if>
|
||||
<else-if type="paper-conference" match="any">
|
||||
<date variable="issued">
|
||||
<date-part name="month" form="short"/>
|
||||
<date-part name="year" prefix=" "/>
|
||||
</date>
|
||||
</else-if>
|
||||
<else-if type="motion_picture" match="any">
|
||||
<date variable="issued" form="text" prefix="(" suffix=")"/>
|
||||
</else-if>
|
||||
<else>
|
||||
<date variable="issued" form="text"/>
|
||||
</else>
|
||||
</choose>
|
||||
</macro>
|
||||
<macro name="author">
|
||||
<names variable="author">
|
||||
<name and="text" et-al-min="7" et-al-use-first="1" initialize-with=". "/>
|
||||
<label form="short" prefix=", " text-case="capitalize-first"/>
|
||||
<et-al font-style="italic"/>
|
||||
<substitute>
|
||||
<names variable="editor"/>
|
||||
<names variable="translator"/>
|
||||
<text macro="director"/>
|
||||
</substitute>
|
||||
</names>
|
||||
</macro>
|
||||
<macro name="editor">
|
||||
<names variable="editor">
|
||||
<name initialize-with=". " delimiter=", " and="text"/>
|
||||
<label form="short" prefix=", " text-case="capitalize-first"/>
|
||||
</names>
|
||||
</macro>
|
||||
<macro name="director">
|
||||
<names variable="director">
|
||||
<name and="text" et-al-min="7" et-al-use-first="1" initialize-with=". "/>
|
||||
<et-al font-style="italic"/>
|
||||
</names>
|
||||
</macro>
|
||||
<macro name="locators">
|
||||
<group delimiter=", ">
|
||||
<text macro="edition"/>
|
||||
<group delimiter=" ">
|
||||
<text term="volume" form="short"/>
|
||||
<number variable="volume" form="numeric"/>
|
||||
</group>
|
||||
<group delimiter=" ">
|
||||
<number variable="number-of-volumes" form="numeric"/>
|
||||
<text term="volume" form="short" plural="true"/>
|
||||
</group>
|
||||
<group delimiter=" ">
|
||||
<text term="issue" form="short"/>
|
||||
<number variable="issue" form="numeric"/>
|
||||
</group>
|
||||
</group>
|
||||
</macro>
|
||||
<macro name="title">
|
||||
<choose>
|
||||
<if type="bill book graphic legal_case legislation motion_picture song standard software" match="any">
|
||||
<text variable="title" font-style="italic"/>
|
||||
</if>
|
||||
<else>
|
||||
<text variable="title" quotes="true"/>
|
||||
</else>
|
||||
</choose>
|
||||
</macro>
|
||||
<macro name="publisher">
|
||||
<choose>
|
||||
<if type="bill book chapter graphic legal_case legislation motion_picture paper-conference song" match="any">
|
||||
<group delimiter=": ">
|
||||
<text variable="publisher-place"/>
|
||||
<text variable="publisher"/>
|
||||
</group>
|
||||
</if>
|
||||
<else>
|
||||
<group delimiter=", ">
|
||||
<text variable="publisher"/>
|
||||
<text variable="publisher-place"/>
|
||||
</group>
|
||||
</else>
|
||||
</choose>
|
||||
</macro>
|
||||
<macro name="event">
|
||||
<choose>
|
||||
<!-- Published Conference Paper -->
|
||||
<if type="paper-conference speech" match="any">
|
||||
<choose>
|
||||
<if variable="container-title" match="any">
|
||||
<group delimiter=" ">
|
||||
<text term="in"/>
|
||||
<text variable="container-title" font-style="italic"/>
|
||||
</group>
|
||||
</if>
|
||||
<!-- Unpublished Conference Paper -->
|
||||
<else>
|
||||
<group delimiter=" ">
|
||||
<text term="presented at"/>
|
||||
<text variable="event"/>
|
||||
</group>
|
||||
</else>
|
||||
</choose>
|
||||
</if>
|
||||
</choose>
|
||||
</macro>
|
||||
<macro name="access">
|
||||
<choose>
|
||||
<if type="webpage post post-weblog" match="any">
|
||||
<!-- https://url.com/ (accessed Mon. DD, YYYY). -->
|
||||
<choose>
|
||||
<if variable="URL">
|
||||
<group delimiter=". " prefix=" ">
|
||||
<group delimiter=": ">
|
||||
<text term="accessed" text-case="capitalize-first"/>
|
||||
<date variable="accessed" form="text"/>
|
||||
</group>
|
||||
<text term="online" prefix="[" suffix="]" text-case="capitalize-first"/>
|
||||
<group delimiter=": ">
|
||||
<text term="available at" text-case="capitalize-first"/>
|
||||
<text variable="URL"/>
|
||||
</group>
|
||||
</group>
|
||||
</if>
|
||||
</choose>
|
||||
</if>
|
||||
<else-if match="any" variable="DOI">
|
||||
<!-- doi: 10.1000/xyz123. -->
|
||||
<text variable="DOI" prefix=" doi: " suffix="."/>
|
||||
</else-if>
|
||||
<else-if variable="URL">
|
||||
<!-- Accessed: Mon. DD, YYYY. [Medium]. Available: https://URL.com/ -->
|
||||
<group delimiter=". " prefix=" " suffix=". ">
|
||||
<!-- Accessed: Mon. DD, YYYY. -->
|
||||
<group delimiter=": ">
|
||||
<text term="accessed" text-case="capitalize-first"/>
|
||||
<date variable="accessed" form="text"/>
|
||||
</group>
|
||||
<!-- [Online Video]. -->
|
||||
<group prefix="[" suffix="]" delimiter=" ">
|
||||
<choose>
|
||||
<if variable="medium" match="any">
|
||||
<text variable="medium" text-case="capitalize-first"/>
|
||||
</if>
|
||||
<else>
|
||||
<text term="online" text-case="capitalize-first"/>
|
||||
<choose>
|
||||
<if type="motion_picture">
|
||||
<text term="video" text-case="capitalize-first"/>
|
||||
</if>
|
||||
</choose>
|
||||
</else>
|
||||
</choose>
|
||||
</group>
|
||||
</group>
|
||||
<!-- Available: https://URL.com/ -->
|
||||
<group delimiter=": " prefix=" ">
|
||||
<text term="available at" text-case="capitalize-first"/>
|
||||
<text variable="URL"/>
|
||||
</group>
|
||||
</else-if>
|
||||
</choose>
|
||||
</macro>
|
||||
<macro name="page">
|
||||
<choose>
|
||||
<if type="article-journal" variable="number" match="all">
|
||||
<group delimiter=" ">
|
||||
<text value="Art."/>
|
||||
<text term="issue" form="short"/>
|
||||
<text variable="number"/>
|
||||
</group>
|
||||
</if>
|
||||
<else>
|
||||
<group delimiter=" ">
|
||||
<label variable="page" form="short"/>
|
||||
<text variable="page"/>
|
||||
</group>
|
||||
</else>
|
||||
</choose>
|
||||
</macro>
|
||||
<macro name="citation-locator">
|
||||
<group delimiter=" ">
|
||||
<choose>
|
||||
<if locator="page">
|
||||
<label variable="locator" form="short"/>
|
||||
</if>
|
||||
<else>
|
||||
<label variable="locator" form="short" text-case="capitalize-first"/>
|
||||
</else>
|
||||
</choose>
|
||||
<text variable="locator"/>
|
||||
</group>
|
||||
</macro>
|
||||
<macro name="geographic-location">
|
||||
<group delimiter=", " suffix=".">
|
||||
<choose>
|
||||
<if variable="publisher-place">
|
||||
<text variable="publisher-place" text-case="title"/>
|
||||
</if>
|
||||
<else-if variable="event-place">
|
||||
<text variable="event-place" text-case="title"/>
|
||||
</else-if>
|
||||
</choose>
|
||||
</group>
|
||||
</macro>
|
||||
<!-- Series -->
|
||||
<macro name="collection">
|
||||
<choose>
|
||||
<if variable="collection-title" match="any">
|
||||
<text term="in" suffix=" "/>
|
||||
<group delimiter=", " suffix=". ">
|
||||
<text variable="collection-title"/>
|
||||
<text variable="collection-number" prefix="no. "/>
|
||||
<text variable="volume" prefix="vol. "/>
|
||||
</group>
|
||||
</if>
|
||||
</choose>
|
||||
</macro>
|
||||
<!-- Citation -->
|
||||
<citation>
|
||||
<sort>
|
||||
<key variable="citation-number"/>
|
||||
</sort>
|
||||
<layout delimiter=", ">
|
||||
<group prefix="[" suffix="]" delimiter=", ">
|
||||
<text variable="citation-number"/>
|
||||
<text macro="citation-locator"/>
|
||||
</group>
|
||||
</layout>
|
||||
</citation>
|
||||
<!-- Bibliography -->
|
||||
<bibliography entry-spacing="0" second-field-align="flush">
|
||||
<layout>
|
||||
<!-- Citation Number -->
|
||||
<text variable="citation-number" prefix="[" suffix="]"/>
|
||||
<!-- Author(s) -->
|
||||
<text macro="author" suffix=", "/>
|
||||
<!-- Rest of Citation -->
|
||||
<choose>
|
||||
<!-- Specific Formats -->
|
||||
<if type="article-journal">
|
||||
<group delimiter=", ">
|
||||
<text macro="title"/>
|
||||
<text variable="container-title" font-style="italic" form="short"/>
|
||||
<text macro="locators"/>
|
||||
<text macro="page"/>
|
||||
<text macro="issued"/>
|
||||
<text macro="status"/>
|
||||
</group>
|
||||
<choose>
|
||||
<if variable="URL DOI" match="none">
|
||||
<text value="."/>
|
||||
</if>
|
||||
<else>
|
||||
<text value=","/>
|
||||
</else>
|
||||
</choose>
|
||||
<text macro="access"/>
|
||||
</if>
|
||||
<else-if type="paper-conference speech" match="any">
|
||||
<group delimiter=", " suffix=", ">
|
||||
<text macro="title"/>
|
||||
<text macro="event"/>
|
||||
<text macro="editor"/>
|
||||
</group>
|
||||
<text macro="collection"/>
|
||||
<group delimiter=", " suffix=".">
|
||||
<text macro="publisher"/>
|
||||
<text macro="issued"/>
|
||||
<text macro="page"/>
|
||||
<text macro="status"/>
|
||||
</group>
|
||||
<text macro="access"/>
|
||||
</else-if>
|
||||
<else-if type="chapter">
|
||||
<group delimiter=", " suffix=".">
|
||||
<text macro="title"/>
|
||||
<group delimiter=" ">
|
||||
<text term="in" suffix=" "/>
|
||||
<text variable="container-title" font-style="italic"/>
|
||||
</group>
|
||||
<text macro="locators"/>
|
||||
<text macro="editor"/>
|
||||
<text macro="collection"/>
|
||||
<text macro="publisher"/>
|
||||
<text macro="issued"/>
|
||||
<group delimiter=" ">
|
||||
<label variable="chapter-number" form="short"/>
|
||||
<text variable="chapter-number"/>
|
||||
</group>
|
||||
<text macro="page"/>
|
||||
</group>
|
||||
<text macro="access"/>
|
||||
</else-if>
|
||||
<else-if type="report">
|
||||
<group delimiter=", " suffix=".">
|
||||
<text macro="title"/>
|
||||
<text macro="publisher"/>
|
||||
<group delimiter=" ">
|
||||
<text variable="genre"/>
|
||||
<text variable="number"/>
|
||||
</group>
|
||||
<text macro="issued"/>
|
||||
</group>
|
||||
<text macro="access"/>
|
||||
</else-if>
|
||||
<else-if type="thesis">
|
||||
<group delimiter=", " suffix=".">
|
||||
<text macro="title"/>
|
||||
<text variable="genre"/>
|
||||
<text macro="publisher"/>
|
||||
<text macro="issued"/>
|
||||
</group>
|
||||
<text macro="access"/>
|
||||
</else-if>
|
||||
<else-if type="software">
|
||||
<group delimiter=". " suffix=".">
|
||||
<text macro="title"/>
|
||||
<text macro="issued" prefix="(" suffix=")"/>
|
||||
<text variable="genre"/>
|
||||
<text macro="publisher"/>
|
||||
</group>
|
||||
<text macro="access"/>
|
||||
</else-if>
|
||||
<else-if type="article">
|
||||
<group delimiter=", " suffix=".">
|
||||
<text macro="title"/>
|
||||
<text macro="issued"/>
|
||||
<group delimiter=": ">
|
||||
<text macro="publisher" font-style="italic"/>
|
||||
<text variable="number"/>
|
||||
</group>
|
||||
</group>
|
||||
<text macro="access"/>
|
||||
</else-if>
|
||||
<else-if type="webpage post-weblog post" match="any">
|
||||
<group delimiter=", " suffix=".">
|
||||
<text macro="title"/>
|
||||
<text variable="container-title"/>
|
||||
</group>
|
||||
<text macro="access"/>
|
||||
</else-if>
|
||||
<else-if type="patent">
|
||||
<group delimiter=", ">
|
||||
<text macro="title"/>
|
||||
<text variable="number"/>
|
||||
<text macro="issued"/>
|
||||
</group>
|
||||
<text macro="access"/>
|
||||
</else-if>
|
||||
<!-- Online Video -->
|
||||
<else-if type="motion_picture">
|
||||
<text macro="geographic-location" suffix=". "/>
|
||||
<group delimiter=", " suffix=".">
|
||||
<text macro="title"/>
|
||||
<text macro="issued"/>
|
||||
</group>
|
||||
<text macro="access"/>
|
||||
</else-if>
|
||||
<else-if type="standard">
|
||||
<group delimiter=", " suffix=".">
|
||||
<text macro="title"/>
|
||||
<group delimiter=" ">
|
||||
<text variable="genre"/>
|
||||
<text variable="number"/>
|
||||
</group>
|
||||
<text macro="geographic-location"/>
|
||||
<text macro="issued"/>
|
||||
</group>
|
||||
<text macro="access"/>
|
||||
</else-if>
|
||||
<!-- Generic/Fallback Formats -->
|
||||
<else-if type="bill book graphic legal_case legislation report song" match="any">
|
||||
<group delimiter=", " suffix=". ">
|
||||
<text macro="title"/>
|
||||
<text macro="locators"/>
|
||||
</group>
|
||||
<text macro="collection"/>
|
||||
<group delimiter=", " suffix=".">
|
||||
<text macro="publisher"/>
|
||||
<text macro="issued"/>
|
||||
<text macro="page"/>
|
||||
</group>
|
||||
<text macro="access"/>
|
||||
</else-if>
|
||||
<else-if type="article-magazine article-newspaper broadcast interview manuscript map patent personal_communication song speech thesis webpage" match="any">
|
||||
<group delimiter=", " suffix=".">
|
||||
<text macro="title"/>
|
||||
<text variable="container-title" font-style="italic"/>
|
||||
<text macro="locators"/>
|
||||
<text macro="publisher"/>
|
||||
<text macro="page"/>
|
||||
<text macro="issued"/>
|
||||
</group>
|
||||
<text macro="access"/>
|
||||
</else-if>
|
||||
<else>
|
||||
<group delimiter=", " suffix=". ">
|
||||
<text macro="title"/>
|
||||
<text variable="container-title" font-style="italic"/>
|
||||
<text macro="locators"/>
|
||||
</group>
|
||||
<text macro="collection"/>
|
||||
<group delimiter=", " suffix=".">
|
||||
<text macro="publisher"/>
|
||||
<text macro="page"/>
|
||||
<text macro="issued"/>
|
||||
</group>
|
||||
<text macro="access"/>
|
||||
</else>
|
||||
</choose>
|
||||
</layout>
|
||||
</bibliography>
|
||||
</style>
|
||||
364
index.qmd
Normal file
@@ -0,0 +1,364 @@
|
||||
---
|
||||
title: "BAF Fraud Modeling"
|
||||
author: "Rob Wiederstein"
|
||||
date: today
|
||||
date-format: long
|
||||
---
|
||||
|
||||
```{r}
|
||||
#| label: setup
|
||||
#| include: false
|
||||
library(here)
|
||||
library(targets)
|
||||
library(knitr)
|
||||
|
||||
# Make chunk paths resolve relative to reports/
|
||||
#knitr::opts_knit$set(root.dir = here::here("reports"))
|
||||
|
||||
# Declare deps for tar_quarto() (optional, but good)
|
||||
invisible(targets::tar_read(report_assets))
|
||||
```
|
||||
|
||||
# Introduction
|
||||
|
||||
## Bank Account Fraud Dataset{.incremental}
|
||||
|
||||
- Synthetic online account applications
|
||||
- 1M rows (Base)
|
||||
- 8 months (0–7)
|
||||
- Base + 5 biased variants
|
||||
- Label: Fraud vs Legit
|
||||
- Fraud $\approx 1\%$
|
||||
|
||||
:::{.notes}
|
||||
**What it is (plain English):** each row is a bank account opening application submitted online. Fraudsters may impersonate someone (identity theft) or invent a person; once approved they quickly exploit the credit line or use the account to move illicit funds.
|
||||
|
||||
**Why it exists:** the BAF *suite* was created as a large, realistic benchmark to stress-test ML performance and fairness under **dynamic / drifting** conditions and “extreme” class imbalance. The variants introduce controlled bias patterns; the Base set has no induced bias.
|
||||
|
||||
**How it was made:** the released data are **synthetic** (generated from a CTGAN trained on an anonymized, feature-engineered real dataset). Privacy protections mean no row corresponds to a real identifiable person.
|
||||
|
||||
**Time structure:** `month` ranges 0–7 (eight months). This is why we use chronological evaluation (train early months, test late months).
|
||||
|
||||
**Target variable:** datasheet label is `fraud_bool` (0/1). In our pipeline we rename/recode to `outcome` with labels “Legit” and “Fraud” for readability.
|
||||
:::
|
||||
|
||||
## Typical Scenario{.incremental}
|
||||
|
||||
Fraudsters will
|
||||
|
||||
1. Impersonate someone or
|
||||
|
||||
2. Create fake identity then
|
||||
|
||||
3. Max out the line or
|
||||
|
||||
4. Receive illicit payment
|
||||
|
||||
|
||||
## Data Cleaning{.incremental}
|
||||
|
||||
- Relabel outcome.
|
||||
- -1 → NA.
|
||||
- Negative amount → NA.
|
||||
- Write clean Parquet.
|
||||
|
||||
:::{.notes}
|
||||
**Outcome**
|
||||
- `fraud_bool` (0/1) → `outcome` ("Legit"/"Fraud"); drop `fraud_bool`.
|
||||
|
||||
**Missing encoded as values**
|
||||
- Recode `-1` to `NA` for:
|
||||
- `prev_address_months_count`
|
||||
- `current_address_months_count`
|
||||
- `bank_months_count`
|
||||
- `session_length_in_minutes`
|
||||
- `device_distinct_emails` (the dataset names this column `device_distinct_emails_8w`; the function handles either name)
|
||||
|
||||
**Range constraint**
|
||||
- `intended_balcon_amount < 0` → `NA` (negative values encode missing data).
|
||||
|
||||
**Output**
|
||||
- Saved cleaned dataset as Parquet under `03_primary/variant=Base/` partitioned by `month`.
|
||||
:::
|
||||
|
||||
# Explore
|
||||
|
||||
## Variable Importance
|
||||
|
||||
```{r}
|
||||
#| label: fig-var-imp
|
||||
#| fig-cap: "Top 15 features driving the diagnostic model."
|
||||
knitr::include_graphics("reports/figures/fig_var_imp.png")
|
||||
```
|
||||
|
||||
:::{.notes}
|
||||
The diagnostic LightGBM model shows that behavior and identity structure dominate the early splits.
|
||||
:::
|
||||
|
||||
## Feature Interaction
|
||||
|
||||
```{r}
|
||||
#| label: fig-hexbin-interaction
|
||||
#| fig-cap: "Interaction between Credit Risk Score and Address History."
|
||||
knitr::include_graphics("reports/figures/fig_hexbin_interaction.png")
|
||||
```
|
||||
|
||||
:::{.notes}
|
||||
Fraud clusters noticeably in high credit risk profiles combined with specific address tenure patterns.
|
||||
:::
|
||||
|
||||
## Missingness Signal
|
||||
|
||||
```{r}
|
||||
#| label: fig-missingness
|
||||
#| fig-cap: "Missingness rates by outcome."
|
||||
knitr::include_graphics("reports/figures/fig_missingness.png")
|
||||
```
|
||||
|
||||
:::{.notes}
|
||||
Fraudsters systematically omit key tenure details (like previous address and bank history) more often than legitimate applicants do.
|
||||
:::
|
||||
|
||||
## Numeric Correlation
|
||||
|
||||
```{r}
|
||||
#| label: fig-num-cor
|
||||
#| fig-cap: "Core numeric correlation matrix."
|
||||
knitr::include_graphics("reports/figures/fig_num_cor.png")
|
||||
```
|
||||
|
||||
:::{.notes}
|
||||
The structural anchor of the synthetic data is visible here, particularly the relationship between credit score and proposed limit.
|
||||
:::
|
||||
|
||||
# LightGBM
|
||||
|
||||
## About {.incremental}
|
||||
|
||||
- Originally released in 2016
|
||||
- Maintained by Microsoft
|
||||
- Over 18,000 stars on GitHub
|
||||
- King of Kaggle for tabular data
|
||||
- Announcing paper has over 23,000 citations
|
||||
- Sped up similar gradient boosting algorithms 20x
|
||||
|
||||
|
||||
## Academic Support
|
||||
|
||||
::: {.panel-tabset}
|
||||
|
||||
### Standard
|
||||
|
||||
>For tabular supervised learning, gradient boosted decision trees—most notably XGBoost and LightGBM—are strong, low-latency baselines because they exploit hand-engineered behavioral features; LightGBM remains a **standard** reference point for card and e-commerce fraud tasks [@aminian_fraudtransformer_2025]
|
||||
|
||||
### Accurate
|
||||
|
||||
>[W]e found that the LightGBM approach had the highest detection **accuracy** of fraudulent activity with 97% in the experiments conducted. An additional key objective of reducing false alerts was accomplished, as the number of false alarms went from 13,024 to 6,249 [@iscan_walletbased_2023]
|
||||
|
||||
### Efficient
|
||||
|
||||
>[W]e choose LightGBM as the base machine learning model due to its **efficiency** and widespread use in handling large-scale and structured datasets, particularly in financial domains such as credit card fraud detection. [@zhao_improved_2024]
|
||||
|
||||
|
||||
:::
|
||||
|
||||
# Unbalanced Classes
|
||||
|
||||
## The Challenge
|
||||
|
||||
<br>
|
||||
<br>
|
||||
|
||||
>The scarce occurrences of rare events impair the detection task …
|
||||
|
||||
|
||||
:::{.notes}
|
||||
**Citation:** Guo, H., Li, Y., Shang, J., Gu, M., Huang, Y., & Gong, B. (2017).
|
||||
*Learning from class-imbalanced data: Review of methods and applications.*
|
||||
**Expert Systems with Applications, 73**, 220–239. https://doi.org/10.1016/j.eswa.2016.12.035
|
||||
:::
|
||||
|
||||
## Bank Fraud Prevalence
|
||||
|
||||
```{r}
|
||||
#| label: fig-fraud-prevalence-plot
|
||||
#| fig-cap: "Fraudulent versus legitimate applications by month."
|
||||
knitr::include_graphics("reports/figures/fig_fraud_by_month.png")
|
||||
```
|
||||
|
||||
:::{.notes}
|
||||
Fraud represents approximately one percent of applications.
|
||||
:::
|
||||
|
||||
## Fraud Prevalence
|
||||
|
||||
```{r}
|
||||
#| label: tbl-fraud-by-month
|
||||
#| tbl-cap: "Fraud prevalence by month."
|
||||
readRDS("reports/tables/tbl_fraud_by_month.rds")
|
||||
```
|
||||
|
||||
|
||||
## Methods Tested{.incremental}
|
||||
|
||||
- **Standard:** Baseline (No sampling).
|
||||
- **Weighted:** Cost-sensitive learning ($4\times$ penalty).
|
||||
- **Undersampling:** Random removal of majority class.
|
||||
- **SMOTE:** Synthetic Minority Over-sampling Technique.
|
||||
- **ADASYN:** Adaptive Synthetic Sampling (hard examples).
|
||||
- **Tomek Links:** Cleaning boundary ambiguity.
|
||||
|
||||
:::{.notes}
|
||||
**Standard:** The control group. We let the gradient booster handle the 1% imbalance naturally.
|
||||
|
||||
**Weighted:** We used `scale_pos_weight` to tell LightGBM that missing a Fraud case is 4x worse than a false alarm.
|
||||
|
||||
**Undersampling:** We threw away about 75% of the Legit cases to balance the ratio. Fast, but risky.
|
||||
|
||||
**SMOTE & ADASYN:** The "heavy hitters." These generate fake fraud data based on nearest neighbors. ADASYN focuses specifically on "hard to learn" fraud cases.
|
||||
|
||||
**Tomek:** A cleaning method that removes Legit cases that are "too close" to Fraud cases, theoretically making the decision boundary clearer.
|
||||
:::
|
||||
|
||||
## Strategy Showdown: Results
|
||||
|
||||
```{r}
|
||||
#| label: tbl-strategy-showdown
|
||||
#| tbl-cap: "Performance comparison across imbalance strategies using 3-month rolling windows."
|
||||
readRDS("reports/tables/tbl_strategy_showdown.rds")
|
||||
```
|
||||
|
||||
:::{.notes}
|
||||
The "Standard" baseline is statistically indistinguishable from more complex methods like SMOTE and Adasyn (p > 0.05). Complex sampling provides no significant predictive gain for this dataset.
|
||||
:::
|
||||
|
||||
## Sampling Compared
|
||||
|
||||
```{r}
|
||||
#| label: fig-strategy-showdown
|
||||
#| fig-cap: "PR-AUC performance versus computational training time."
|
||||
knitr::include_graphics("reports/figures/fig_strategy_showdown.png")
|
||||
```
|
||||
|
||||
:::{.notes}
|
||||
The Standard strategy represents the "Efficient Frontier." It achieves near-peak performance while being nearly twice as fast as SMOTE or Adasyn. Tomek sampling actually degraded performance while increasing compute time.
|
||||
:::
|
||||
|
||||
## Sampling Methods Discarded {.incremental}
|
||||
|
||||
- No statistical gain
|
||||
|
||||
- Resource intensive
|
||||
|
||||
- Scalability
|
||||
|
||||
:::{.notes}
|
||||
|
||||
Complex sampling methods like SMOTE and Adasyn do not outperform the baseline "Standard" model, as shown by their non-significant p-values (p > 0.05).
|
||||
|
||||
Synthetic generation and neighbor calculations nearly double the average training time per fold compared to the standard approach.
|
||||
|
||||
For larger file sizes, simplicity helps avoid memory bottlenecks and excessive compute costs.
|
||||
|
||||
Future efforts to improve model performance should focus on areas other than sampling techniques.
|
||||
:::
|
||||
|
||||
# Feature Creation
|
||||
|
||||
# Final Results
|
||||
|
||||
## The Confusion Matrix
|
||||
|
||||
```{r}
|
||||
#| label: fig-confusion-matrix
|
||||
#| echo: false
|
||||
#| out-width: "100%"
|
||||
knitr::include_graphics("resources/images/confusion-matrix.png")
|
||||
```
|
||||
|
||||
:::{.notes}
|
||||
The confusion matrix is the foundation of all classification metrics. Every metric we care about is derived from these four cells.
|
||||
|
||||
In the fraud context:
|
||||
- **TN:** Legitimate application correctly approved. No harm done.
|
||||
- **FP:** Legitimate application flagged as fraud. Customer friction, potential churn.
|
||||
- **FN:** Fraud case missed. Direct financial loss — the costliest error.
|
||||
- **TP:** Fraud correctly caught. The goal.
|
||||
|
||||
The key insight: not all errors are equal. A missed fraud case (FN) costs far more than a false alarm (FP). Our threshold and metric choices reflect this asymmetry.
|
||||
:::
|
||||
|
||||
## Precision & Recall
|
||||
|
||||
<br>
|
||||
|
||||
$$\text{Recall} = \frac{TP}{TP + FN}$$
|
||||
|
||||
> Of all actual frauds, how many did we catch?
|
||||
|
||||
$$\text{Precision} = \frac{TP}{TP + FP}$$
|
||||
|
||||
> Of all flagged cases, how many were real fraud?
|
||||
|
||||
:::{.notes}
|
||||
**Recall** (also called **detection rate**) is the primary metric for fraud detection. Missing a fraud case (FN) is costly, so we want Recall as high as possible. A model that flags every application gets a perfect detection rate — but at the cost of Precision.
|
||||
|
||||
**Precision** captures that cost: if we flag everything, every legitimate customer gets rejected. Precision measures how trustworthy our fraud flags actually are.
|
||||
|
||||
The **Precision-Recall tradeoff** is the core tension in fraud modeling. Lowering the decision threshold increases Recall (catch more fraud) but decreases Precision (more false alarms). The right balance depends on the operational cost of each error type.
|
||||
|
||||
Our model targets **~49% Recall at a 5% False Positive Rate** — a deliberate operating point chosen to limit customer friction while catching nearly half of fraud.
|
||||
:::
|
||||
|
||||
## ROC vs Precision-Recall AUC
|
||||
|
||||
::: {.panel-tabset}
|
||||
|
||||
### ROC AUC
|
||||
|
||||
- Plots **Recall** vs **False Positive Rate**
|
||||
- AUC = 0.5 is random; 1.0 is perfect
|
||||
- Optimistic under class imbalance
|
||||
- Inflated by the large TN pool
|
||||
|
||||
### PR AUC
|
||||
|
||||
- Plots **Precision** vs **Recall**
|
||||
- Focuses entirely on the minority class
|
||||
- Harder to game with a large Legit majority
|
||||
- Preferred metric for fraud detection
|
||||
|
||||
:::
|
||||
|
||||
:::{.notes}
|
||||
**Why ROC AUC can mislead on imbalanced data:** with 99% legitimate applications, even a naive model achieves a low False Positive Rate simply because the TN pool is enormous. ROC AUC rewards this, making models look better than they are.
|
||||
|
||||
**PR AUC** ignores true negatives entirely. It only asks: of the positive (fraud) predictions, how precise were we, and how much fraud did we recall? This makes it a far more honest scoreboard when positives are rare.
|
||||
|
||||
**Rule of thumb:** use ROC AUC for balanced classes; use PR AUC for imbalanced fraud/anomaly detection tasks. We report both, but optimise for PR AUC.
|
||||
:::
|
||||
|
||||
## Final Model Evaluation
|
||||
|
||||
```{r}
|
||||
#| label: fig-conf-mat-heatmap
|
||||
#| echo: false
|
||||
#| out-width: "100%"
|
||||
#| fig-cap: "Confusion Matrix Heatmap (5% Decision Threshold)"
|
||||
knitr::include_graphics("reports/figures/fig_final_conf_mat.png")
|
||||
```
|
||||
|
||||
## Diagnostic Metrics
|
||||
|
||||
```{r}
|
||||
#| label: fig-final-curves
|
||||
#| echo: false
|
||||
#| out-width: "100%"
|
||||
#| fig-cap: "ROC and Precision-Recall Curves for Out-of-Sample Data"
|
||||
knitr::include_graphics("reports/figures/fig_final_curves.png")
|
||||
```
|
||||
|
||||
# References {.smaller}
|
||||
|
||||
|
||||
|
||||
11
man/baflakehouse-package.Rd
Normal file
@@ -0,0 +1,11 @@
|
||||
% Generated by roxygen2: do not edit by hand
|
||||
% Please edit documentation in R/baflakehouse-package.R
|
||||
\docType{_PACKAGE}
|
||||
\name{baflakehouse-package}
|
||||
\alias{baflakehouse-package}
|
||||
\title{baflakehouse: Lakehouse Workflow for the Bank Account Fraud Dataset}
|
||||
\description{
|
||||
Tools to ingest the Bank Account Fraud (BAF) Base dataset into a MinIO/S3-backed
|
||||
lakehouse, clean encoded missing values, and produce reproducible reporting
|
||||
artifacts orchestrated with targets.
|
||||
}
|
||||
17
man/build_baf_recipe.Rd
Normal file
@@ -0,0 +1,17 @@
|
||||
% Generated by roxygen2: do not edit by hand
|
||||
% Please edit documentation in R/functions.R
|
||||
\name{build_baf_recipe}
|
||||
\alias{build_baf_recipe}
|
||||
\title{Build Untrained BAF Recipe}
|
||||
\usage{
|
||||
build_baf_recipe(data)
|
||||
}
|
||||
\arguments{
|
||||
\item{data}{A data frame}
|
||||
}
|
||||
\value{
|
||||
An untrained tidymodels recipe
|
||||
}
|
||||
\description{
|
||||
Build Untrained BAF Recipe
|
||||
}
|
||||
34
man/clean_baf_base.Rd
Normal file
@@ -0,0 +1,34 @@
|
||||
% Generated by roxygen2: do not edit by hand
|
||||
% Please edit documentation in R/functions.R
|
||||
\name{clean_baf_base}
|
||||
\alias{clean_baf_base}
|
||||
\title{Clean the BAF Base dataset and write to 03_primary}
|
||||
\usage{
|
||||
clean_baf_base(
|
||||
in_prefix,
|
||||
out_prefix = "03_primary/variant=Base",
|
||||
bucket_name = "baf-fraud",
|
||||
partitioning = "month",
|
||||
existing_data_behavior = c("overwrite", "error", "delete_matching"),
|
||||
verbose = TRUE
|
||||
)
|
||||
}
|
||||
\arguments{
|
||||
\item{in_prefix}{Character. Input dataset prefix inside bucket (e.g. "02_intermediate/variant=Base").}
|
||||
|
||||
\item{out_prefix}{Character. Output dataset prefix inside bucket (e.g. "03_primary/variant=Base").}
|
||||
|
||||
\item{bucket_name}{Character. Bucket name. Default "baf-fraud".}
|
||||
|
||||
\item{partitioning}{Character vector of columns to partition by. Default "month". Set NULL to disable.}
|
||||
|
||||
\item{existing_data_behavior}{One of "overwrite", "error", "delete_matching". Default "overwrite".}
|
||||
|
||||
\item{verbose}{Logical. Emit progress messages. Default TRUE.}
|
||||
}
|
||||
\value{
|
||||
Character. out_prefix (for downstream targets).
|
||||
}
|
||||
\description{
|
||||
Clean the BAF Base dataset and write to 03_primary
|
||||
}
|
||||
19
man/compute_fraud_by_month.Rd
Normal file
@@ -0,0 +1,19 @@
|
||||
% Generated by roxygen2: do not edit by hand
|
||||
% Please edit documentation in R/functions.R
|
||||
\name{compute_fraud_by_month}
|
||||
\alias{compute_fraud_by_month}
|
||||
\title{Fraud prevalence by month (counts + percent)}
|
||||
\usage{
|
||||
compute_fraud_by_month(in_prefix, use_duckdb = TRUE)
|
||||
}
|
||||
\arguments{
|
||||
\item{in_prefix}{Character. Dataset prefix inside the bucket, e.g. "03_primary/variant=Base".}
|
||||
|
||||
\item{use_duckdb}{Logical. Use DuckDB for lazy querying. Default TRUE.}
|
||||
}
|
||||
\value{
|
||||
A tibble with Month, Fraud, Legit, Total, Pct_Fraud.
|
||||
}
|
||||
\description{
|
||||
Computes monthly counts of Fraud/Legit, totals, and percent fraud.
|
||||
}
|
||||
22
man/connect_baf.Rd
Normal file
@@ -0,0 +1,22 @@
|
||||
% Generated by roxygen2: do not edit by hand
|
||||
% Please edit documentation in R/functions.R
|
||||
\name{connect_baf}
|
||||
\alias{connect_baf}
|
||||
\title{Connect to BAF dataset on MinIO (Arrow or DuckDB)}
|
||||
\usage{
|
||||
connect_baf(prefix, bucket_name = Sys.getenv("BAF_BUCKET"), use_duckdb = TRUE)
|
||||
}
|
||||
\arguments{
|
||||
\item{prefix}{Character. Dataset prefix inside the bucket
|
||||
(e.g., "02_intermediate/variant=Base").}
|
||||
|
||||
\item{bucket_name}{Character. Bucket name. Defaults to env var BAF_BUCKET.}
|
||||
|
||||
\item{use_duckdb}{Logical. If TRUE, return a DuckDB-backed lazy tbl.}
|
||||
}
|
||||
\value{
|
||||
A DuckDB-backed lazy table when \code{use_duckdb = TRUE} (the default), otherwise an Arrow Dataset.
|
||||
}
|
||||
\description{
|
||||
Connect to BAF dataset on MinIO (Arrow or DuckDB)
|
||||
}
|
||||
41
man/convert_to_parquet.Rd
Normal file
@@ -0,0 +1,41 @@
|
||||
% Generated by roxygen2: do not edit by hand
|
||||
% Please edit documentation in R/functions.R
|
||||
\name{convert_to_parquet}
|
||||
\alias{convert_to_parquet}
|
||||
\title{Convert BAF CSV to partitioned Parquet in MinIO (S3)}
|
||||
\usage{
|
||||
convert_to_parquet(from_prefix, to_prefix, bucket_name = "baf-fraud")
|
||||
}
|
||||
\arguments{
|
||||
\item{from_prefix}{Character. Prefix/key under the bucket containing CSVs (e.g. \code{"01_raw"}).}
|
||||
|
||||
\item{to_prefix}{Character. Prefix/key under the bucket to write Parquet dataset (e.g. \code{"02_intermediate"}).}
|
||||
|
||||
\item{bucket_name}{Character. Bucket name. Default \code{"baf-fraud"}.}
|
||||
}
|
||||
\value{
|
||||
A character string giving the destination dataset prefix (typically \code{to_prefix}).
|
||||
}
|
||||
\description{
|
||||
Reads \code{Base.csv} from a MinIO/S3 bucket prefix (e.g., \code{"01_raw"}) and writes a
|
||||
Hive-style partitioned Parquet dataset to another prefix (e.g., \code{"02_intermediate"}),
|
||||
partitioned by \code{variant} (e.g., \verb{variant=Base/part-*.parquet}).
|
||||
}
|
||||
\details{
|
||||
Connection settings are taken from environment variables:
|
||||
\itemize{
|
||||
\item \code{BAF_ENDPOINT} (e.g. \code{"minio:9000"} or \code{"192.168.4.xx:9000"})
|
||||
\item \code{BAF_KEY} (MinIO access key)
|
||||
\item \code{BAF_SECRET} (MinIO secret key)
|
||||
}
|
||||
}
|
||||
\examples{
|
||||
\dontrun{
|
||||
Sys.setenv(
|
||||
BAF_ENDPOINT = "minio:9000",
|
||||
BAF_KEY = "YOUR_ACCESS_KEY",
|
||||
BAF_SECRET = "YOUR_SECRET_KEY"
|
||||
)
|
||||
convert_to_parquet(from_prefix = "01_raw", to_prefix = "02_intermediate", bucket_name = "baf-fraud")
|
||||
}
|
||||
}
|
||||
14
man/create_efficiency_plot.Rd
Normal file
@@ -0,0 +1,14 @@
|
||||
% Generated by roxygen2: do not edit by hand
|
||||
% Please edit documentation in R/functions.R
|
||||
\name{create_efficiency_plot}
|
||||
\alias{create_efficiency_plot}
|
||||
\title{Create Effectiveness vs Efficiency Plot}
|
||||
\usage{
|
||||
create_efficiency_plot(results_df)
|
||||
}
|
||||
\arguments{
|
||||
\item{results_df}{Tibble from run_imbalance_tournament}
|
||||
}
|
||||
\description{
|
||||
Create Effectiveness vs Efficiency Plot
|
||||
}
|
||||
37
man/engineer_features.Rd
Normal file
@@ -0,0 +1,37 @@
|
||||
% Generated by roxygen2: do not edit by hand
|
||||
% Please edit documentation in R/functions.R
|
||||
\name{engineer_features}
|
||||
\alias{engineer_features}
|
||||
\title{Engineer features for the BAF dataset}
|
||||
\usage{
|
||||
engineer_features(
|
||||
in_prefix = "03_primary/variant=Base",
|
||||
out_prefix = "04_feature/variant=Base",
|
||||
bucket_name = "baf-fraud",
|
||||
partitioning = "month",
|
||||
existing_data_behavior = "delete_matching",
|
||||
verbose = TRUE
|
||||
)
|
||||
}
|
||||
\arguments{
|
||||
\item{in_prefix}{Character. Input dataset prefix (e.g., "03_primary/variant=Base").}
|
||||
|
||||
\item{out_prefix}{Character. Output dataset prefix (e.g., "04_feature/variant=Base").}
|
||||
|
||||
\item{bucket_name}{Character. The S3/MinIO bucket name. Default "baf-fraud".}
|
||||
|
||||
\item{partitioning}{Character vector. Columns to partition by. Default "month".}
|
||||
|
||||
\item{existing_data_behavior}{Character. Behavior when data exists. Default "delete_matching".}
|
||||
|
||||
\item{verbose}{Logical. Whether to print progress messages. Default TRUE.}
|
||||
}
|
||||
\value{
|
||||
Character. The output prefix path for downstream targets.
|
||||
}
|
||||
\description{
|
||||
Reads the primary BAF dataset and engineers new features, such as
|
||||
\code{n_missing}, which counts the number of missing values across key
|
||||
tenure and financial columns. This calculation is performed out-of-memory
|
||||
using Arrow compute.
|
||||
}
|
||||
27
man/evaluate_final_model.Rd
Normal file
@@ -0,0 +1,27 @@
|
||||
% Generated by roxygen2: do not edit by hand
|
||||
% Please edit documentation in R/functions.R
|
||||
\name{evaluate_final_model}
|
||||
\alias{evaluate_final_model}
|
||||
\title{Final Model Evaluation (Months 6 & 7)}
|
||||
\usage{
|
||||
evaluate_final_model(
|
||||
params,
|
||||
bucket_name = "baf-fraud",
|
||||
inputs_prefix = "05_model_input"
|
||||
)
|
||||
}
|
||||
\arguments{
|
||||
\item{params}{A named list of LightGBM hyperparameters with elements:
|
||||
\code{trees}, \code{tree_depth}, \code{learn_rate}, \code{loss_reduction}, \code{min_n}.}
|
||||
|
||||
\item{bucket_name}{Character. Bucket name. Default "baf-fraud".}
|
||||
|
||||
\item{inputs_prefix}{Character. Model input prefix. Default "05_model_input".}
|
||||
}
|
||||
\value{
|
||||
A tibble with columns \code{truth}, \code{prob}, and \code{pred_class}.
|
||||
}
|
||||
\description{
|
||||
Trains the winning strategy on the full training set (Months 0-5)
|
||||
and evaluates it on the unseen test set (Months 6-7).
|
||||
}
|
||||
18
man/format_class_imbalance_tourney_gt.Rd
Normal file
@@ -0,0 +1,18 @@
|
||||
% Generated by roxygen2: do not edit by hand
|
||||
% Please edit documentation in R/functions.R
|
||||
\name{format_class_imbalance_tourney_gt}
|
||||
\alias{format_class_imbalance_tourney_gt}
|
||||
\title{Format Class Imbalance Tournament Table}
|
||||
\usage{
|
||||
format_class_imbalance_tourney_gt(results_df)
|
||||
}
|
||||
\arguments{
|
||||
\item{results_df}{The tibble output from \code{run_imbalance_tournament}.}
|
||||
}
|
||||
\value{
|
||||
A formatted gt table object.
|
||||
}
|
||||
\description{
|
||||
Aggregates results from the model tournament and performs paired t-tests
|
||||
against the 'Standard' model to determine statistical significance.
|
||||
}
|
||||
17
man/format_fraud_by_month_gt.Rd
Normal file
@@ -0,0 +1,17 @@
|
||||
% Generated by roxygen2: do not edit by hand
|
||||
% Please edit documentation in R/functions.R
|
||||
\name{format_fraud_by_month_gt}
|
||||
\alias{format_fraud_by_month_gt}
|
||||
\title{Format fraud-by-month table as a gt object}
|
||||
\usage{
|
||||
format_fraud_by_month_gt(x)
|
||||
}
|
||||
\arguments{
|
||||
\item{x}{Tibble from compute_fraud_by_month().}
|
||||
}
|
||||
\value{
|
||||
A gt table.
|
||||
}
|
||||
\description{
|
||||
Format fraud-by-month table as a gt object
|
||||
}
|
||||
27
man/generate_model_inputs.Rd
Normal file
@@ -0,0 +1,27 @@
|
||||
% Generated by roxygen2: do not edit by hand
|
||||
% Please edit documentation in R/functions.R
|
||||
\name{generate_model_inputs}
|
||||
\alias{generate_model_inputs}
|
||||
\title{Generate Resampled Model Inputs}
|
||||
\usage{
|
||||
generate_model_inputs(
|
||||
feature_prefix = "04_feature/variant=Base",
|
||||
out_prefix = "05_model_input",
|
||||
bucket_name = "baf-fraud"
|
||||
)
|
||||
}
|
||||
\arguments{
|
||||
\item{feature_prefix}{Character. Input prefix (e.g., "04_feature/variant=Base").}
|
||||
|
||||
\item{out_prefix}{Character. Output prefix base (e.g., "05_model_input").}
|
||||
|
||||
\item{bucket_name}{Character. Bucket name. Default "baf-fraud".}
|
||||
}
|
||||
\value{
|
||||
Character. The output prefix (for targets dependency tracking).
|
||||
}
|
||||
\description{
|
||||
Reads the engineered feature layer, prepares a base tidymodels recipe,
|
||||
and generates resampled datasets (Baseline, Under, SMOTE, Adasyn, Tomek)
|
||||
across all months, saving them to the 05_model_input prefix.
|
||||
}
|
||||
21
man/plot_conf_mat_heatmap.Rd
Normal file
@@ -0,0 +1,21 @@
|
||||
% Generated by roxygen2: do not edit by hand
|
||||
% Please edit documentation in R/functions.R
|
||||
\name{plot_conf_mat_heatmap}
|
||||
\alias{plot_conf_mat_heatmap}
|
||||
\title{Plot Confusion Matrix Heatmap}
|
||||
\usage{
|
||||
plot_conf_mat_heatmap(cm, title = "")
|
||||
}
|
||||
\arguments{
|
||||
\item{cm}{A yardstick conf_mat object.}
|
||||
|
||||
\item{title}{Character. The main title of the plot.}
|
||||
|
||||
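% NOTE(review): 'subtitle' is documented in R/functions.R but is not a parameter in the usage above; either drop the @param or add the argument to the function.
% \item{subtitle}{Character. The subtitle of the plot.}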
|
||||
}
|
||||
\value{
|
||||
A ggplot object.
|
||||
}
|
||||
\description{
|
||||
Generates a styled 4-quadrant heatmap from a yardstick confusion matrix.
|
||||
}
|
||||
34
man/plot_fraud_by_month.Rd
Normal file
@@ -0,0 +1,34 @@
|
||||
% Generated by roxygen2: do not edit by hand
|
||||
% Please edit documentation in R/functions.R
|
||||
\name{plot_fraud_by_month}
|
||||
\alias{plot_fraud_by_month}
|
||||
\title{Plot applications by month (Legit vs Fraud) on a log scale}
|
||||
\usage{
|
||||
plot_fraud_by_month(
|
||||
dataset_prefix,
|
||||
bucket_name = "baf-fraud",
|
||||
palette = "Dark 3",
|
||||
title = ""
|
||||
)
|
||||
}
|
||||
\arguments{
|
||||
\item{dataset_prefix}{Character. Prefix inside the bucket, e.g. "03_primary/variant=Base".}
|
||||
|
||||
\item{bucket_name}{Character. Bucket name. Default "baf-fraud".}
|
||||
|
||||
\item{palette}{Character. colorspace qualitative palette name. Default "Dark 3".}
|
||||
|
||||
\item{title}{Character. Plot title. Default "".}
|
||||
}
|
||||
\value{
|
||||
A ggplot object.
|
||||
}
|
||||
\description{
|
||||
Builds an exploratory chart of absolute application counts by month
|
||||
split by outcome (Legit vs Fraud). Uses a log10 y-axis so rare fraud
|
||||
remains visible on the same axis.
|
||||
}
|
||||
\details{
|
||||
Data source: expects a cleaned "primary" dataset prefix (e.g. 03_primary/variant=Base)
|
||||
stored in MinIO/S3, accessed via \code{connect_baf()}.
|
||||
}
|
||||
16
man/plot_hexbin_interaction.Rd
Normal file
@@ -0,0 +1,16 @@
|
||||
% Generated by roxygen2: do not edit by hand
|
||||
% Please edit documentation in R/functions.R
|
||||
\name{plot_hexbin_interaction}
|
||||
\alias{plot_hexbin_interaction}
|
||||
\title{Plot Hexbin Interaction}
|
||||
\usage{
|
||||
plot_hexbin_interaction(baked_data, title = "")
|
||||
}
|
||||
\arguments{
|
||||
\item{baked_data}{Baked EDA data}
|
||||
|
||||
\item{title}{Character. Plot title. Default "".}
|
||||
}
|
||||
\description{
|
||||
Plot Hexbin Interaction
|
||||
}
|
||||
16
man/plot_missingness.Rd
Normal file
@@ -0,0 +1,16 @@
|
||||
% Generated by roxygen2: do not edit by hand
|
||||
% Please edit documentation in R/functions.R
|
||||
\name{plot_missingness}
|
||||
\alias{plot_missingness}
|
||||
\title{Plot Missingness Signal}
|
||||
\usage{
|
||||
plot_missingness(eda_data, title = "")
|
||||
}
|
||||
\arguments{
|
||||
\item{eda_data}{Raw EDA data}
|
||||
|
||||
\item{title}{Character. Plot title. Default "".}
|
||||
}
|
||||
\description{
|
||||
Plot Missingness Signal
|
||||
}
|
||||
16
man/plot_num_cor.Rd
Normal file
@@ -0,0 +1,16 @@
|
||||
% Generated by roxygen2: do not edit by hand
|
||||
% Please edit documentation in R/functions.R
|
||||
\name{plot_num_cor}
|
||||
\alias{plot_num_cor}
|
||||
\title{Plot Numeric Correlation Matrix}
|
||||
\usage{
|
||||
plot_num_cor(eda_data, title = "")
|
||||
}
|
||||
\arguments{
|
||||
\item{eda_data}{Raw EDA data}
|
||||
|
||||
\item{title}{Character. Plot title. Default "".}
|
||||
}
|
||||
\description{
|
||||
Plot Numeric Correlation Matrix
|
||||
}
|
||||
16
man/plot_var_imp.Rd
Normal file
@@ -0,0 +1,16 @@
|
||||
% Generated by roxygen2: do not edit by hand
|
||||
% Please edit documentation in R/functions.R
|
||||
\name{plot_var_imp}
|
||||
\alias{plot_var_imp}
|
||||
\title{Plot Variable Importance}
|
||||
\usage{
|
||||
plot_var_imp(model, title = "")
|
||||
}
|
||||
\arguments{
|
||||
\item{model}{Trained LightGBM model}
|
||||
|
||||
\item{title}{Character. Plot title. Default "".}
|
||||
}
|
||||
\description{
|
||||
Plot Variable Importance
|
||||
}
|
||||
14
man/prepare_eda_recipe.Rd
Normal file
@@ -0,0 +1,14 @@
|
||||
% Generated by roxygen2: do not edit by hand
|
||||
% Please edit documentation in R/functions.R
|
||||
\name{prepare_eda_recipe}
|
||||
\alias{prepare_eda_recipe}
|
||||
\title{Prepare EDA Recipe}
|
||||
\usage{
|
||||
prepare_eda_recipe(eda_data)
|
||||
}
|
||||
\arguments{
|
||||
\item{eda_data}{Raw EDA data}
|
||||
}
|
||||
\description{
|
||||
Prepare EDA Recipe
|
||||
}
|
||||
21
man/render_slides.Rd
Normal file
@@ -0,0 +1,21 @@
|
||||
% Generated by roxygen2: do not edit by hand
|
||||
% Please edit documentation in R/functions.R
|
||||
\name{render_slides}
|
||||
\alias{render_slides}
|
||||
\title{Render Quarto revealjs slideshow after required assets exist}
|
||||
\usage{
|
||||
render_slides(qmd = "index.qmd", assets, output_dir = "reports/slides")
|
||||
}
|
||||
\arguments{
|
||||
\item{qmd}{Character. Input Quarto file (e.g. "index.qmd").}
|
||||
|
||||
\item{assets}{Character vector. File paths that must exist before rendering.}
|
||||
|
||||
\item{output_dir}{Character. Output directory for rendered slides.}
|
||||
}
|
||||
\value{
|
||||
Character path to the rendered HTML file.
|
||||
}
|
||||
\description{
|
||||
Render Quarto revealjs slideshow after required assets exist
|
||||
}
|
||||
34
man/run_imbalance_tournament.Rd
Normal file
@@ -0,0 +1,34 @@
|
||||
% Generated by roxygen2: do not edit by hand
|
||||
% Please edit documentation in R/functions.R
|
||||
\name{run_imbalance_tournament}
|
||||
\alias{run_imbalance_tournament}
|
||||
\title{Run Class Imbalance Tournament}
|
||||
\usage{
|
||||
run_imbalance_tournament(
|
||||
tasks,
|
||||
windows,
|
||||
feature_prefix,
|
||||
bucket_name = "baf-fraud",
|
||||
inputs_prefix = "05_model_input"
|
||||
)
|
||||
}
|
||||
\arguments{
|
||||
\item{tasks}{A tibble containing recipe_name, data_folder, and scale_pos_weight.}
|
||||
|
||||
\item{windows}{A tibble containing window_id, train_months, and test_month.}
|
||||
|
||||
\item{feature_prefix}{Character. The upstream dependency prefix (used to force DAG execution).}
|
||||
|
||||
\item{bucket_name}{Character. Bucket name. Default "baf-fraud".}
|
||||
|
||||
\item{inputs_prefix}{Character. The folder containing the sampled data. Default "05_model_input".}
|
||||
}
|
||||
\value{
|
||||
A tibble with the summarized tournament results.
|
||||
}
|
||||
\description{
|
||||
Trains LightGBM models across different class imbalance strategies
|
||||
(Standard, SMOTE, Adasyn, etc.) using sliding time windows. Evaluates
|
||||
performance using PR-AUC and calculates statistical significance.
|
||||
Includes common-sense hyperparameter defaults to prevent overfitting.
|
||||
}
|
||||
31
man/save_report_figure.Rd
Normal file
@@ -0,0 +1,31 @@
|
||||
% Generated by roxygen2: do not edit by hand
|
||||
% Please edit documentation in R/functions.R
|
||||
\name{save_report_figure}
|
||||
\alias{save_report_figure}
|
||||
\title{Save a report figure artifact}
|
||||
\usage{
|
||||
save_report_figure(
|
||||
plot,
|
||||
filename,
|
||||
out_dir = "reports/figures",
|
||||
width = 12,
|
||||
height = 6.75,
|
||||
dpi = 300
|
||||
)
|
||||
}
|
||||
\arguments{
|
||||
\item{plot}{A ggplot object.}
|
||||
|
||||
\item{filename}{Character. Output filename, e.g. \code{"fig_fraud_by_month.png"}.}
|
||||
|
||||
\item{out_dir}{Character. Output directory. Default \code{"reports/figures"}.}
|
||||
|
||||
\item{width, height, dpi}{Numeric. Passed to \code{ggplot2::ggsave()}.}
|
||||
}
|
||||
\value{
|
||||
Character. Normalized path to the saved file.
|
||||
}
|
||||
\description{
|
||||
Saves a ggplot object as an image file in \code{out_dir} (default \code{reports/figures}).
|
||||
Intended for use in \code{targets} pipelines as a file-producing target.
|
||||
}
|
||||
21
man/save_report_table.Rd
Normal file
@@ -0,0 +1,21 @@
|
||||
% Generated by roxygen2: do not edit by hand
|
||||
% Please edit documentation in R/functions.R
|
||||
\name{save_report_table}
|
||||
\alias{save_report_table}
|
||||
\title{Save a report table artifact}
|
||||
\usage{
|
||||
save_report_table(x, filename, out_dir = "reports/tables")
|
||||
}
|
||||
\arguments{
|
||||
\item{x}{Object to save.}
|
||||
|
||||
\item{filename}{Output filename, e.g. "tbl_fraud_by_month.rds".}
|
||||
|
||||
\item{out_dir}{Output directory. Default "reports/tables".}
|
||||
}
|
||||
\value{
|
||||
Character path to saved file.
|
||||
}
|
||||
\description{
|
||||
Save a report table artifact
|
||||
}
|
||||
14
man/train_diag_model.Rd
Normal file
@@ -0,0 +1,14 @@
|
||||
% Generated by roxygen2: do not edit by hand
|
||||
% Please edit documentation in R/functions.R
|
||||
\name{train_diag_model}
|
||||
\alias{train_diag_model}
|
||||
\title{Train Diagnostic Model}
|
||||
\usage{
|
||||
train_diag_model(baked_data)
|
||||
}
|
||||
\arguments{
|
||||
\item{baked_data}{Baked EDA data}
|
||||
}
|
||||
\description{
|
||||
Train Diagnostic Model
|
||||
}
|
||||
30
man/train_production_model.Rd
Normal file
@@ -0,0 +1,30 @@
|
||||
% Generated by roxygen2: do not edit by hand
|
||||
% Please edit documentation in R/functions.R
|
||||
\name{train_production_model}
|
||||
\alias{train_production_model}
|
||||
\title{Train and Serialize Production LightGBM Model}
|
||||
\usage{
|
||||
train_production_model(
|
||||
data,
|
||||
recipe,
|
||||
best_params,
|
||||
model_filename = "lgbm_prod.txt"
|
||||
)
|
||||
}
|
||||
\arguments{
|
||||
\item{data}{A data frame containing the full BAF dataset (Months 0-7).}
|
||||
|
||||
\item{recipe}{A prepared tidymodels recipe.}
|
||||
|
||||
\item{best_params}{A list or tibble of the winning hyperparameters.}
|
||||
|
||||
\item{model_filename}{Character. The target filename. Defaults to "lgbm_prod.txt".}
|
||||
}
|
||||
\value{
|
||||
Character. The MinIO URI of the uploaded model artifact.
|
||||
}
|
||||
\description{
|
||||
Trains a LightGBM model on the complete dataset using the winning
|
||||
hyperparameters, serializes it to a text file, and uploads it directly
|
||||
to MinIO via the Apache Arrow S3 interface.
|
||||
}
|
||||
154
references.bib
Normal file
@@ -0,0 +1,154 @@
|
||||
@misc{_bankaccountfraud_,
|
||||
title = {Bank-Account-Fraud/Documents/Datasheet.Pdf at Main {$\cdot$} Feedzai/Bank-Account-Fraud},
|
||||
journal = {GitHub},
|
||||
urldate = {2026-02-11},
|
||||
abstract = {Supporting documentation for the paper ``Turning the Tables: Biased, Imbalanced, Dynamic Tabular Datasets for ML Evaluation'', and the Bank Account Fraud suite of datasets. - feedzai/bank-...},
|
||||
howpublished = {https://github.com/feedzai/bank-account-fraud/blob/main/documents/datasheet.pdf},
|
||||
langid = {english},
|
||||
file = {/home/rkw/Zotero/storage/LT4CJB34/datasheet.html}
|
||||
}
|
||||
|
||||
@article{ali2013classification,
|
||||
title = {Classification with Class Imbalance Problem},
|
||||
author = {Ali, Aida and Shamsuddin, Siti Mariyam and Ralescu, Anca L},
|
||||
year = 2013,
|
||||
journal = {Int. J. Advance Soft Compu. Appl},
|
||||
volume = {5},
|
||||
number = {3},
|
||||
pages = {176--204},
|
||||
keywords = {class imbalance,unbalanced classes},
|
||||
file = {/home/rkw/Zotero/storage/3AVBB4SQ/Ali et al. - 2013 - Classification with class imbalance problem.pdf}
|
||||
}
|
||||
|
||||
@misc{aminian_fraudtransformer_2025,
|
||||
title = {{{FraudTransformer}}: {{Time-Aware GPT}} for {{Transaction Fraud Detection}}},
|
||||
shorttitle = {{{FraudTransformer}}},
|
||||
author = {Aminian, Gholamali and Elliott, Andrew and Li, Tiger and Wong, Timothy Cheuk Hin and Dehon, Victor Claude and Szpruch, Lukasz and Maple, Carsten and Read, Christopher and Brown, Martin and Reinert, Gesine and Mamouei, Mo},
|
||||
year = 2025,
|
||||
month = oct,
|
||||
number = {arXiv:2509.23712},
|
||||
eprint = {2509.23712},
|
||||
primaryclass = {cs},
|
||||
publisher = {arXiv},
|
||||
doi = {10.48550/arXiv.2509.23712},
|
||||
urldate = {2026-02-21},
|
||||
abstract = {Detecting payment fraud in real-world banking streams requires models that can exploit both the order of events and the irregular time gaps between them. We introduce FraudTransformer, a sequence model that augments a vanilla GPT-style architecture with (i) a dedicated time encoder that embeds either absolute timestamps or inter-event values, and (ii) a learned positional encoder that preserves relative order. Experiments on a large industrial dataset -- tens of millions of transactions and auxiliary events -- show that FraudTransformer surpasses four strong classical baselines (Logistic Regression, XGBoost and LightGBM) as well as transformer ablations that omit either the time or positional component. On the held-out test set it delivers the highest AUROC and PRAUC.},
|
||||
archiveprefix = {arXiv},
|
||||
keywords = {Computer Science - Machine Learning,Statistics - Machine Learning},
|
||||
file = {/home/rkw/Zotero/storage/YQQE72ZK/Aminian et al. - 2025 - FraudTransformer Time-Aware GPT for Transaction Fraud Detection.pdf;/home/rkw/Zotero/storage/XVL3X42S/2509.html}
|
||||
}
|
||||
|
||||
@article{bartoszkrawczyk_learning_2016,
|
||||
title = {Learning from Imbalanced Data: Open Challenges and Future Directions},
|
||||
author = {Krawczyk, Bartosz},
|
||||
year = 2016,
|
||||
month = apr,
|
||||
journal = {Progress in Artificial Intelligence},
|
||||
volume = {5},
|
||||
number = {4},
|
||||
pages = {221--232},
|
||||
doi = {10.1007/s13748-016-0094-0},
|
||||
abstract = {Despite more than two decades of continuous development learning from imbalanced data is still a focus of intense research. Starting as a problem of skewed distributions of binary tasks, this topic evolved way beyond this conception. With the expansion of machine learning and data mining, combined with the arrival of big data era, we have gained a deeper insight into the nature of imbalanced learning, while at the same time facing new emerging challenges. Data-level and algorithm-level methods are constantly being improved and hybrid approaches gain increasing popularity. Recent trends focus on analyzing not only the disproportion between classes, but also other difficulties embedded in the nature of data. New real-life problems motivate researchers to focus on computationally efficient, adaptive and real-time methods. This paper aims at discussing open issues and challenges that need to be addressed to further develop the field of imbalanced learning. Seven vital areas of research in this topic are identified, covering the full spectrum of learning from imbalanced data: classification, regression, clustering, data streams, big data analytics and applications, e.g., in social media and computer vision. This paper provides a discussion and suggestions concerning lines of future research for each of them.},
|
||||
keywords = {imbalanced data,unbalanced classes},
|
||||
annotation = {MAG ID: 2338318698},
|
||||
file = {/home/rkw/Zotero/storage/ZFYYHYYR/Bartosz Krawczyk and Krawczyk - 2016 - Learning from imbalanced data open challenges and future directions.pdf}
|
||||
}
|
||||
|
||||
@article{iscan_walletbased_2023,
|
||||
title = {Wallet-{{Based Transaction Fraud Prevention Through LightGBM With}} the {{Focus}} on {{Minimizing False Alarms}}},
|
||||
author = {Iscan, Can and Kumas, Osman and Akbulut, Fatma Patlar and Akbulut, Akhan},
|
||||
year = 2023,
|
||||
journal = {IEEE Access},
|
||||
volume = {11},
|
||||
pages = {131465--131474},
|
||||
issn = {2169-3536},
|
||||
doi = {10.1109/ACCESS.2023.3321666},
|
||||
urldate = {2026-02-21},
|
||||
abstract = {E-wallets' rising popularity can be attributed to the fact that they facilitate a wide variety of financial activities such as payments, transfers, investments, etc., and eliminate the need for actual cash or cards. The confidentiality, availability, and integrity of a user's financial information stored in an electronic wallet can be compromised by threats such as phishing, malware, and social engineering; therefore, fintech platforms employ intelligent fraud detection mechanisms to mitigate the problem. The purpose of this study is to detect fraudulent activity using cutting-edge machine learning techniques on data obtained from the leading e-wallet platform in Turkey. After a comprehensive analysis of the dataset's features via feature engineering procedures, we found that the LightGBM approach had the highest detection accuracy of fraudulent activity with 97\% in the experiments conducted. An additional key objective of reducing false alerts was accomplished, as the number of false alarms went from 13,024 to 6,249. This approach resulted in the establishment of a machine-learning model suitable for use by relatively small fraud detection teams.},
|
||||
keywords = {E-wallet,Feature extraction,fintech,Fraud,fraud detection,LightGBM,Machine learning,Machine learning algorithms,Monitoring,Online banking,Real-time systems},
|
||||
file = {/home/rkw/Zotero/storage/B2K3D8W9/Iscan et al. - 2023 - Wallet-Based Transaction Fraud Prevention Through LightGBM With the Focus on Minimizing False Alarms.pdf}
|
||||
}
|
||||
|
||||
@article{jesus_baf_,
|
||||
title = {{{BAF Dataset Suite Datasheet}}},
|
||||
author = {Jesus, S{\'e}rgio and Pombal, Jos{\'e} and Alves, Duarte and Cruz, Andr{\'e} F and Saleiro, Pedro and Ribeiro, Rita P and Gama, Jo{\~a}o and Bizarro, Pedro},
|
||||
langid = {english},
|
||||
file = {/home/rkw/Zotero/storage/6A29JS3R/Jesus et al. - BAF Dataset Suite Datasheet.pdf}
|
||||
}
|
||||
|
||||
@misc{jesus_turning_2022,
|
||||
title = {Turning the {{Tables}}: {{Biased}}, {{Imbalanced}}, {{Dynamic Tabular Datasets}} for {{ML Evaluation}}},
|
||||
shorttitle = {Turning the {{Tables}}},
|
||||
author = {Jesus, S{\'e}rgio and Pombal, Jos{\'e} and Alves, Duarte and Cruz, Andr{\'e} and Saleiro, Pedro and Ribeiro, Rita P. and Gama, Jo{\~a}o and Bizarro, Pedro},
|
||||
year = 2022,
|
||||
month = nov,
|
||||
number = {arXiv:2211.13358},
|
||||
eprint = {2211.13358},
|
||||
primaryclass = {cs},
|
||||
publisher = {arXiv},
|
||||
doi = {10.48550/arXiv.2211.13358},
|
||||
urldate = {2026-02-11},
|
||||
abstract = {Evaluating new techniques on realistic datasets plays a crucial role in the development of ML research and its broader adoption by practitioners. In recent years, there has been a significant increase of publicly available unstructured data resources for computer vision and NLP tasks. However, tabular data -- which is prevalent in many high-stakes domains -- has been lagging behind. To bridge this gap, we present Bank Account Fraud (BAF), the first publicly available privacy-preserving, large-scale, realistic suite of tabular datasets. The suite was generated by applying state-of-the-art tabular data generation techniques on an anonymized,real-world bank account opening fraud detection dataset. This setting carries a set of challenges that are commonplace in real-world applications, including temporal dynamics and significant class imbalance. Additionally, to allow practitioners to stress test both performance and fairness of ML methods, each dataset variant of BAF contains specific types of data bias. With this resource, we aim to provide the research community with a more realistic, complete, and robust test bed to evaluate novel and existing methods.},
|
||||
archiveprefix = {arXiv},
|
||||
keywords = {Computer Science - Machine Learning},
|
||||
file = {/home/rkw/Zotero/storage/FSBNDIP4/Jesus et al. - 2022 - Turning the Tables Biased, Imbalanced, Dynamic Tabular Datasets for ML Evaluation.pdf;/home/rkw/Zotero/storage/7HXKQPDC/2211.html}
|
||||
}
|
||||
|
||||
@article{johnson_deep_2019,
|
||||
title = {Deep {{Learning}} and {{Data Sampling}} with {{Imbalanced Big Data}}},
|
||||
author = {Johnson, Justin M. and Khoshgoftaar, Taghi M.},
|
||||
year = 2019,
|
||||
month = jul,
|
||||
pages = {175--183},
|
||||
doi = {10.1109/iri.2019.00038},
|
||||
abstract = {This study evaluates the use of deep learning and data sampling on a class-imbalanced Big Data problem, i.e. Medicare fraud detection. Medicare offers affordable health insurance to the elderly population and serves more than 15\% of the United States population. To increase transparency and help reduce fraud, the Centers for Medicare and Medicaid Services (CMS) have made several data sets publicly available for analysis. Our research group has conducted several studies using CMS data and traditional machine learning algorithms (non-deep learning), but challenges associated with severe class imbalance leave room for improvement. These previous studies serve as baselines as we employ deep neural networks with various data-sampling techniques to determine the efficacy of deep learning in addressing class imbalance. Random over-sampling (ROS), random under-sampling (RUS), and combinations of the two (ROS-RUS) are applied to study how varying levels of class imbalance impact model training and performance. Classwise performance is maximized by identifying optimal decision thresholds, and a strong linear relationship between minority class size and optimal threshold is observed. Results show that ROS significantly outperforms RUS, combining RUS and ROS both maximizes performance and efficiency with a 4 x speedup in training time, and the default threshold of 0.5 is never optimal when training data is imbalanced. To the best of our knowledge, this is the first study to provide statistical results comparing ROS, RUS, and ROS-RUS deep learning methods across a range of class distributions. Additional contributions include a unique analysis of thresholding as it relates to the minority class size and state-of-the-art performance on the given fraud detection task.},
|
||||
keywords = {LEIE},
|
||||
annotation = {MAG ID: 2974916584},
|
||||
file = {/home/rkw/Zotero/storage/ZQR6NJPU/Johnson and Khoshgoftaar - 2019 - Deep Learning and Data Sampling with Imbalanced Bi.pdf}
|
||||
}
|
||||
|
||||
@article{kaur_systematic_2019,
|
||||
title = {A {{Systematic Review}} on {{Imbalanced Data Challenges}} in {{Machine Learning}}: {{Applications}} and {{Solutions}}},
|
||||
shorttitle = {A {{Systematic Review}} on {{Imbalanced Data Challenges}} in {{Machine Learning}}},
|
||||
author = {Kaur, Harsurinder and Pannu, Husanbir Singh and Malhi, Avleen Kaur},
|
||||
year = 2019,
|
||||
month = aug,
|
||||
journal = {ACM Comput. Surv.},
|
||||
volume = {52},
|
||||
number = {4},
|
||||
pages = {79:1--79:36},
|
||||
issn = {0360-0300},
|
||||
doi = {10.1145/3343440},
|
||||
urldate = {2026-02-11},
|
||||
abstract = {In machine learning, the data imbalance imposes challenges to perform data analytics in almost all areas of real-world research. The raw primary data often suffers from the skewed perspective of data distribution of one class over the other as in the case of computer vision, information security, marketing, and medical science. The goal of this article is to present a comparative analysis of the approaches from the reference of data pre-processing, algorithmic and hybrid paradigms for contemporary imbalance data analysis techniques, and their comparative study in lieu of different data distribution and their application areas.},
|
||||
file = {/home/rkw/Zotero/storage/4WZYQG9W/Kaur et al. - 2019 - A Systematic Review on Imbalanced Data Challenges in Machine Learning Applications and Solutions.pdf}
|
||||
}
|
||||
|
||||
@inproceedings{zhang_leveraging_2025,
|
||||
title = {Leveraging {{LightGBM}} for {{High-Accuracy Telecom Fraud Detection}} with {{Clustering-Based Undersampling}}},
|
||||
booktitle = {2025 8th {{International Symposium}} on {{Big Data}} and {{Applied Statistics}} ({{ISBDAS}})},
|
||||
author = {Zhang, Shuo and Zhang, Bo and Hou, Shichong and Fu, Zhiyuan},
|
||||
year = 2025,
|
||||
month = feb,
|
||||
pages = {384--388},
|
||||
doi = {10.1109/ISBDAS64762.2025.11117117},
|
||||
urldate = {2026-02-21},
|
||||
abstract = {This study presents a machine learning framework designed to predict and prevent telecom fraud by analyzing 1 million transaction records. The model identifies critical fraud patterns through the analysis of key features, including PIN usage, transaction frequency, and location. To address the severe class imbalance in the dataset (initially a 10:1 ratio), a clustering-based undersampling technique was employed, balancing the dataset to a 1:1 ratio while preserving data integrity and improving model performance. The framework utilizes LightGBM, optimized through Bayesian hyperparameter tuning and five-fold cross-validation, achieving an accuracy of 98\% and a robust AUC of 0.9. Key findings highlight that transactions involving both bank cards and PIN verification exhibit a drastically reduced fraud risk (0.0001 ratio), while cardless or PIN-less transactions are significantly more susceptible to fraud. The study emphasizes the importance of multi-factor authentication and provides actionable insights for financial institutions to mitigate fraud risks. Additionally, it underscores the transformative potential of machine learning in real-time fraud detection, with future opportunities for integrating emerging technologies like blockchain to further enhance security.},
|
||||
keywords = {Accuracy,Bayes methods,Bayesian optimization,Clustering-Based Undersampling,Fraud,fraud prediction,LightGBM model,machine learning,Machine learning,Pins,Predictive models,Real-time systems,Security,Telecommunications,Tuning},
|
||||
file = {/home/rkw/Zotero/storage/8XIZ5CCT/11117117.html}
|
||||
}
|
||||
|
||||
@article{zhao_improved_2024,
|
||||
title = {Improved {{LightGBM}} for {{Extremely Imbalanced Data}} and {{Application}} to {{Credit Card Fraud Detection}}},
|
||||
author = {Zhao, Xiaosong and Liu, Yong and Zhao, Qiangfu},
|
||||
year = 2024,
|
||||
journal = {IEEE Access},
|
||||
volume = {12},
|
||||
pages = {159316--159335},
|
||||
issn = {2169-3536},
|
||||
doi = {10.1109/ACCESS.2024.3487212},
|
||||
urldate = {2026-02-21},
|
||||
abstract = {Credit card fraud (CCF) is a significant threat to cardholders and financial institutions. CCF detection against this threat is challenging due to extremely imbalanced data (EID). EID involves extremely few instances of fraud for training and an extremely high risk of overlooking fraud. While class balancing or oversampling techniques can address the former problem by punishing negative classes or augmenting the positive data, they do not mitigate the latter. In contrast, the cost-sensitive learning approach targets only the high risk of false negative errors. Therefore, existing approaches are insufficient to solve all the issues of the EID problem. Based on the LightGBM (Light Gradient Boosting Machine) framework, this study introduces two novel machine-learning methods: the class balancing cost-harmonization LightGBM (CB-CHL-LightGBM) and the oversampling cost-harmonization LightGBM (OS-CHL-LightGBM). The new approaches combine class balancing or oversampling technology with LightGBM to solve the EID problem comprehensively. They enhance the efficacy of LightGBM in CCF detection scenarios. Experimental results on three CCF datasets indicate that the two proposed methods outperform LightGBM in several crucial performance metrics. For example, compared with the original LightGBM, CB-CHL-LightGBM or OS-CHL-LightGBM can increase the F2-score from 0.77 to 0.83 for the first dataset, from 0.77 to 0.86 for the second dataset, and from 0.70 to 0.82 for the third dataset. However, adding class balancing, oversampling, and cost-harmonization loss separately to LightGBM may not obtain better results.},
|
||||
keywords = {Accuracy,Boosting,Class balancing cost-harmonization LightGBM,Classification algorithms,cost-sensitive,Costs,credit card fraud detection,Credit cards,Data models,extremely imbalanced data,Fraud,interpretability,Loss measurement,oversampling,Synthetic data,Training},
|
||||
file = {/home/rkw/Zotero/storage/KI2Y7NIA/Zhao et al. - 2024 - Improved LightGBM for Extremely Imbalanced Data and Application to Credit Card Fraud Detection.pdf}
|
||||
}
|
||||
7
renv/.gitignore
vendored
Normal file
@@ -0,0 +1,7 @@
|
||||
library/
|
||||
local/
|
||||
cellar/
|
||||
lock/
|
||||
python/
|
||||
sandbox/
|
||||
staging/
|
||||
1334
renv/activate.R
Normal file
19
renv/settings.json
Normal file
@@ -0,0 +1,19 @@
|
||||
{
|
||||
"bioconductor.version": null,
|
||||
"external.libraries": [],
|
||||
"ignored.packages": [],
|
||||
"package.dependency.fields": [
|
||||
"Imports",
|
||||
"Depends",
|
||||
"LinkingTo"
|
||||
],
|
||||
"ppm.enabled": null,
|
||||
"ppm.ignored.urls": [],
|
||||
"r.version": null,
|
||||
"snapshot.type": "implicit",
|
||||
"use.cache": true,
|
||||
"vcs.ignore.cellar": true,
|
||||
"vcs.ignore.library": true,
|
||||
"vcs.ignore.local": true,
|
||||
"vcs.manage.ignores": true
|
||||
}
|
||||
BIN
reports/figures/fig_final_conf_mat.png
Normal file
|
After Width: | Height: | Size: 56 KiB |
BIN
reports/figures/fig_final_curves.png
Normal file
|
After Width: | Height: | Size: 151 KiB |
BIN
reports/figures/fig_fraud_by_month.png
Normal file
|
After Width: | Height: | Size: 106 KiB |
BIN
reports/figures/fig_hexbin_interaction.png
Normal file
|
After Width: | Height: | Size: 375 KiB |
BIN
reports/figures/fig_missingness.png
Normal file
|
After Width: | Height: | Size: 76 KiB |
BIN
reports/figures/fig_num_cor.png
Normal file
|
After Width: | Height: | Size: 291 KiB |
BIN
reports/figures/fig_strategy_showdown.png
Normal file
|
After Width: | Height: | Size: 95 KiB |
BIN
reports/figures/fig_var_imp.png
Normal file
|
After Width: | Height: | Size: 128 KiB |
BIN
resources/images/confusion-matrix.png
Normal file
|
After Width: | Height: | Size: 72 KiB |