Initial commit: BAF Lakehouse fraud detection pipeline

End-to-end LightGBM fraud detection pipeline built as an R package,
orchestrated by targets with data stored in MinIO via Apache Arrow.
Includes 6-layer Lakehouse architecture, class imbalance tournament,
formally tuned hyperparameters (PR-AUC 0.198), and Quarto RevealJS slides.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-21 21:19:09 -05:00
commit 33d0fc31c7
56 changed files with 15596 additions and 0 deletions

View File

@@ -0,0 +1,11 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/baflakehouse-package.R
\docType{package}
\name{baflakehouse-package}
\alias{baflakehouse-package}
\title{baflakehouse: Lakehouse Workflow for the Bank Account Fraud Dataset}
\description{
Tools to ingest the Bank Account Fraud (BAF) Base dataset into a MinIO/S3-backed
lakehouse, clean encoded missing values, and produce reproducible reporting
artifacts orchestrated with targets.
}

17
man/build_baf_recipe.Rd Normal file
View File

@@ -0,0 +1,17 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/functions.R
\name{build_baf_recipe}
\alias{build_baf_recipe}
\title{Build Untrained BAF Recipe}
\usage{
build_baf_recipe(data)
}
\arguments{
\item{data}{A data frame}
}
\value{
An untrained tidymodels recipe
}
\description{
Build Untrained BAF Recipe
}

34
man/clean_baf_base.Rd Normal file
View File

@@ -0,0 +1,34 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/functions.R
\name{clean_baf_base}
\alias{clean_baf_base}
\title{Clean the BAF Base dataset and write to 03_primary}
\usage{
clean_baf_base(
in_prefix,
out_prefix = "03_primary/variant=Base",
bucket_name = "baf-fraud",
partitioning = "month",
existing_data_behavior = c("overwrite", "error", "delete_matching"),
verbose = TRUE
)
}
\arguments{
\item{in_prefix}{Character. Input dataset prefix inside bucket (e.g. "02_intermediate/variant=Base").}
\item{out_prefix}{Character. Output dataset prefix inside bucket (e.g. "03_primary/variant=Base").}
\item{bucket_name}{Character. Bucket name. Default "baf-fraud".}
\item{partitioning}{Character vector of columns to partition by. Default "month". Set NULL to disable.}
\item{existing_data_behavior}{One of "overwrite", "error", "delete_matching". Default "overwrite".}
\item{verbose}{Logical. Emit progress messages. Default TRUE.}
}
\value{
Character. out_prefix (for downstream targets).
}
\description{
Clean the BAF Base dataset and write to 03_primary
}

View File

@@ -0,0 +1,19 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/functions.R
\name{compute_fraud_by_month}
\alias{compute_fraud_by_month}
\title{Fraud prevalence by month (counts + percent)}
\usage{
compute_fraud_by_month(in_prefix, use_duckdb = TRUE)
}
\arguments{
\item{in_prefix}{Character. Dataset prefix inside the bucket, e.g. "03_primary/variant=Base".}
\item{use_duckdb}{Logical. Use DuckDB for lazy querying. Default TRUE.}
}
\value{
A tibble with Month, Fraud, Legit, Total, Pct_Fraud.
}
\description{
Computes monthly counts of Fraud/Legit, totals, and percent fraud.
}

22
man/connect_baf.Rd Normal file
View File

@@ -0,0 +1,22 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/functions.R
\name{connect_baf}
\alias{connect_baf}
\title{Connect to BAF dataset on MinIO (Arrow or DuckDB)}
\usage{
connect_baf(prefix, bucket_name = Sys.getenv("BAF_BUCKET"), use_duckdb = TRUE)
}
\arguments{
\item{prefix}{Character. Dataset prefix inside the bucket
(e.g., "02_intermediate/variant=Base").}
\item{bucket_name}{Character. Bucket name. Defaults to env var BAF_BUCKET.}
\item{use_duckdb}{Logical. If TRUE, return a DuckDB-backed lazy tbl.}
}
\value{
A DuckDB-backed lazy table when \code{use_duckdb = TRUE} (the default); otherwise an Arrow Dataset.
}
\description{
Connect to BAF dataset on MinIO (Arrow or DuckDB)
}

41
man/convert_to_parquet.Rd Normal file
View File

@@ -0,0 +1,41 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/functions.R
\name{convert_to_parquet}
\alias{convert_to_parquet}
\title{Convert BAF CSV to partitioned Parquet in MinIO (S3)}
\usage{
convert_to_parquet(from_prefix, to_prefix, bucket_name = "baf-fraud")
}
\arguments{
\item{from_prefix}{Character. Prefix/key under the bucket containing CSVs (e.g. \code{"01_raw"}).}
\item{to_prefix}{Character. Prefix/key under the bucket to write Parquet dataset (e.g. \code{"02_intermediate"}).}
\item{bucket_name}{Character. Bucket name. Default \code{"baf-fraud"}.}
}
\value{
A character string giving the destination dataset prefix (typically \code{to_prefix}).
}
\description{
Reads \code{Base.csv} from a MinIO/S3 bucket prefix (e.g., \code{"01_raw"}) and writes a
Hive-style partitioned Parquet dataset to another prefix (e.g., \code{"02_intermediate"}),
partitioned by \code{variant} (e.g., \verb{variant=Base/part-*.parquet}).
}
\details{
Connection settings are taken from environment variables:
\itemize{
\item \code{BAF_ENDPOINT} (e.g. \code{"minio:9000"} or \code{"192.168.4.xx:9000"})
\item \code{BAF_KEY} (MinIO access key)
\item \code{BAF_SECRET} (MinIO secret key)
}
}
\examples{
\dontrun{
Sys.setenv(
BAF_ENDPOINT = "minio:9000",
BAF_KEY = "YOUR_ACCESS_KEY",
BAF_SECRET = "YOUR_SECRET_KEY"
)
convert_to_parquet(from_prefix = "01_raw", to_prefix = "02_intermediate", bucket_name = "baf-fraud")
}
}

View File

@@ -0,0 +1,14 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/functions.R
\name{create_efficiency_plot}
\alias{create_efficiency_plot}
\title{Create Effectiveness vs Efficiency Plot}
\usage{
create_efficiency_plot(results_df)
}
\arguments{
\item{results_df}{Tibble from run_imbalance_tournament}
}
\description{
Create Effectiveness vs Efficiency Plot
}

37
man/engineer_features.Rd Normal file
View File

@@ -0,0 +1,37 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/functions.R
\name{engineer_features}
\alias{engineer_features}
\title{Engineer features for the BAF dataset}
\usage{
engineer_features(
in_prefix = "03_primary/variant=Base",
out_prefix = "04_feature/variant=Base",
bucket_name = "baf-fraud",
partitioning = "month",
existing_data_behavior = "delete_matching",
verbose = TRUE
)
}
\arguments{
\item{in_prefix}{Character. Input dataset prefix (e.g., "03_primary/variant=Base").}
\item{out_prefix}{Character. Output dataset prefix (e.g., "04_feature/variant=Base").}
\item{bucket_name}{Character. The S3/MinIO bucket name. Default "baf-fraud".}
\item{partitioning}{Character vector. Columns to partition by. Default "month".}
\item{existing_data_behavior}{Character. Behavior when data exists. Default "delete_matching".}
\item{verbose}{Logical. Whether to print progress messages. Default TRUE.}
}
\value{
Character. The output prefix path for downstream targets.
}
\description{
Reads the primary BAF dataset and engineers new features, such as
\code{n_missing}, which counts the number of missing values across key
tenure and financial columns. This calculation is performed out-of-memory
using Arrow compute.
}

View File

@@ -0,0 +1,27 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/functions.R
\name{evaluate_final_model}
\alias{evaluate_final_model}
\title{Final Model Evaluation (Months 6 & 7)}
\usage{
evaluate_final_model(
params,
bucket_name = "baf-fraud",
inputs_prefix = "05_model_input"
)
}
\arguments{
\item{params}{A named list of LightGBM hyperparameters with elements:
\code{trees}, \code{tree_depth}, \code{learn_rate}, \code{loss_reduction}, \code{min_n}.}
\item{bucket_name}{Character. Bucket name. Default "baf-fraud".}
\item{inputs_prefix}{Character. Model input prefix. Default "05_model_input".}
}
\value{
A tibble with columns \code{truth}, \code{prob}, and \code{pred_class}.
}
\description{
Trains the winning strategy on the full training set (Months 0-5)
and evaluates it on the unseen test set (Months 6-7).
}

View File

@@ -0,0 +1,18 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/functions.R
\name{format_class_imbalance_tourney_gt}
\alias{format_class_imbalance_tourney_gt}
\title{Format Class Imbalance Tournament Table}
\usage{
format_class_imbalance_tourney_gt(results_df)
}
\arguments{
\item{results_df}{The tibble output from \code{run_imbalance_tournament}.}
}
\value{
A formatted gt table object.
}
\description{
Aggregates results from the model tournament and performs paired t-tests
against the 'Standard' model to determine statistical significance.
}

View File

@@ -0,0 +1,17 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/functions.R
\name{format_fraud_by_month_gt}
\alias{format_fraud_by_month_gt}
\title{Format fraud-by-month table as a gt object}
\usage{
format_fraud_by_month_gt(x)
}
\arguments{
\item{x}{Tibble from compute_fraud_by_month().}
}
\value{
A gt table.
}
\description{
Format fraud-by-month table as a gt object
}

View File

@@ -0,0 +1,27 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/functions.R
\name{generate_model_inputs}
\alias{generate_model_inputs}
\title{Generate Resampled Model Inputs}
\usage{
generate_model_inputs(
feature_prefix = "04_feature/variant=Base",
out_prefix = "05_model_input",
bucket_name = "baf-fraud"
)
}
\arguments{
\item{feature_prefix}{Character. Input prefix (e.g., "04_feature/variant=Base").}
\item{out_prefix}{Character. Output prefix base (e.g., "05_model_input").}
\item{bucket_name}{Character. Bucket name. Default "baf-fraud".}
}
\value{
Character. The output prefix (for targets dependency tracking).
}
\description{
Reads the engineered feature layer, prepares a base tidymodels recipe,
and generates resampled datasets (Baseline, Under, SMOTE, Adasyn, Tomek)
across all months, saving them to the 05_model_input prefix.
}

View File

@@ -0,0 +1,21 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/functions.R
\name{plot_conf_mat_heatmap}
\alias{plot_conf_mat_heatmap}
\title{Plot Confusion Matrix Heatmap}
\usage{
plot_conf_mat_heatmap(cm, title = "")
}
\arguments{
\item{cm}{A yardstick conf_mat object.}
\item{title}{Character. The main title of the plot. Default "".}
}
\value{
A ggplot object.
}
\description{
Generates a styled 4-quadrant heatmap from a yardstick confusion matrix.
}

View File

@@ -0,0 +1,34 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/functions.R
\name{plot_fraud_by_month}
\alias{plot_fraud_by_month}
\title{Plot applications by month (Legit vs Fraud) on a log scale}
\usage{
plot_fraud_by_month(
dataset_prefix,
bucket_name = "baf-fraud",
palette = "Dark 3",
title = ""
)
}
\arguments{
\item{dataset_prefix}{Character. Prefix inside the bucket, e.g. "03_primary/variant=Base".}
\item{bucket_name}{Character. Bucket name. Default "baf-fraud".}
\item{palette}{Character. colorspace qualitative palette name. Default "Dark 3".}
\item{title}{Character. Plot title. Default "".}
}
\value{
A ggplot object.
}
\description{
Builds an exploratory chart of absolute application counts by month
split by outcome (Legit vs Fraud). Uses a log10 y-axis so rare fraud
remains visible on the same axis.
}
\details{
Data source: expects a cleaned "primary" dataset prefix (e.g. 03_primary/variant=Base)
stored in MinIO/S3, accessed via \code{connect_baf()}.
}

View File

@@ -0,0 +1,16 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/functions.R
\name{plot_hexbin_interaction}
\alias{plot_hexbin_interaction}
\title{Plot Hexbin Interaction}
\usage{
plot_hexbin_interaction(baked_data, title = "")
}
\arguments{
\item{baked_data}{Baked EDA data}
\item{title}{Character. Plot title. Default "".}
}
\description{
Plot Hexbin Interaction
}

16
man/plot_missingness.Rd Normal file
View File

@@ -0,0 +1,16 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/functions.R
\name{plot_missingness}
\alias{plot_missingness}
\title{Plot Missingness Signal}
\usage{
plot_missingness(eda_data, title = "")
}
\arguments{
\item{eda_data}{Raw EDA data}
\item{title}{Character. Plot title. Default "".}
}
\description{
Plot Missingness Signal
}

16
man/plot_num_cor.Rd Normal file
View File

@@ -0,0 +1,16 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/functions.R
\name{plot_num_cor}
\alias{plot_num_cor}
\title{Plot Numeric Correlation Matrix}
\usage{
plot_num_cor(eda_data, title = "")
}
\arguments{
\item{eda_data}{Raw EDA data}
\item{title}{Character. Plot title. Default "".}
}
\description{
Plot Numeric Correlation Matrix
}

16
man/plot_var_imp.Rd Normal file
View File

@@ -0,0 +1,16 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/functions.R
\name{plot_var_imp}
\alias{plot_var_imp}
\title{Plot Variable Importance}
\usage{
plot_var_imp(model, title = "")
}
\arguments{
\item{model}{Trained LightGBM model}
\item{title}{Character. Plot title. Default "".}
}
\description{
Plot Variable Importance
}

14
man/prepare_eda_recipe.Rd Normal file
View File

@@ -0,0 +1,14 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/functions.R
\name{prepare_eda_recipe}
\alias{prepare_eda_recipe}
\title{Prepare EDA Recipe}
\usage{
prepare_eda_recipe(eda_data)
}
\arguments{
\item{eda_data}{Raw EDA data}
}
\description{
Prepare EDA Recipe
}

21
man/render_slides.Rd Normal file
View File

@@ -0,0 +1,21 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/functions.R
\name{render_slides}
\alias{render_slides}
\title{Render Quarto revealjs slideshow after required assets exist}
\usage{
render_slides(qmd = "index.qmd", assets, output_dir = "reports/slides")
}
\arguments{
\item{qmd}{Character. Input Quarto file (e.g. "index.qmd").}
\item{assets}{Character vector. File paths that must exist before rendering.}
\item{output_dir}{Character. Output directory for rendered slides.}
}
\value{
Character path to the rendered HTML file.
}
\description{
Render Quarto revealjs slideshow after required assets exist
}

View File

@@ -0,0 +1,34 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/functions.R
\name{run_imbalance_tournament}
\alias{run_imbalance_tournament}
\title{Run Class Imbalance Tournament}
\usage{
run_imbalance_tournament(
tasks,
windows,
feature_prefix,
bucket_name = "baf-fraud",
inputs_prefix = "05_model_input"
)
}
\arguments{
\item{tasks}{A tibble containing recipe_name, data_folder, and scale_pos_weight.}
\item{windows}{A tibble containing window_id, train_months, and test_month.}
\item{feature_prefix}{Character. The upstream dependency prefix (used to force DAG execution).}
\item{bucket_name}{Character. Bucket name. Default "baf-fraud".}
\item{inputs_prefix}{Character. The folder containing the sampled data. Default "05_model_input".}
}
\value{
A tibble with the summarized tournament results.
}
\description{
Trains LightGBM models across different class imbalance strategies
(Standard, SMOTE, Adasyn, etc.) using sliding time windows. Evaluates
performance using PR-AUC and calculates statistical significance.
Includes common-sense hyperparameter defaults to prevent overfitting.
}

31
man/save_report_figure.Rd Normal file
View File

@@ -0,0 +1,31 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/functions.R
\name{save_report_figure}
\alias{save_report_figure}
\title{Save a report figure artifact}
\usage{
save_report_figure(
plot,
filename,
out_dir = "reports/figures",
width = 12,
height = 6.75,
dpi = 300
)
}
\arguments{
\item{plot}{A ggplot object.}
\item{filename}{Character. Output filename, e.g. \code{"fig_fraud_by_month.png"}.}
\item{out_dir}{Character. Output directory. Default \code{"reports/figures"}.}
\item{width, height, dpi}{Numeric. Passed to \code{ggplot2::ggsave()}.}
}
\value{
Character. Normalized path to the saved file.
}
\description{
Saves a ggplot object to \code{out_dir} (default \code{reports/figures/}).
Intended for use in \code{targets} pipelines as a file-producing target.
}

21
man/save_report_table.Rd Normal file
View File

@@ -0,0 +1,21 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/functions.R
\name{save_report_table}
\alias{save_report_table}
\title{Save a report table artifact}
\usage{
save_report_table(x, filename, out_dir = "reports/tables")
}
\arguments{
\item{x}{Object to save.}
\item{filename}{Output filename, e.g. "tbl_fraud_by_month.rds".}
\item{out_dir}{Output directory. Default "reports/tables".}
}
\value{
Character path to saved file.
}
\description{
Save a report table artifact
}

14
man/train_diag_model.Rd Normal file
View File

@@ -0,0 +1,14 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/functions.R
\name{train_diag_model}
\alias{train_diag_model}
\title{Train Diagnostic Model}
\usage{
train_diag_model(baked_data)
}
\arguments{
\item{baked_data}{Baked EDA data}
}
\description{
Train Diagnostic Model
}

View File

@@ -0,0 +1,30 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/functions.R
\name{train_production_model}
\alias{train_production_model}
\title{Train and Serialize Production LightGBM Model}
\usage{
train_production_model(
data,
recipe,
best_params,
model_filename = "lgbm_prod.txt"
)
}
\arguments{
\item{data}{A data frame containing the full BAF dataset (Months 0-7).}
\item{recipe}{A prepared tidymodels recipe.}
\item{best_params}{A list or tibble of the winning hyperparameters.}
\item{model_filename}{Character. The target filename. Defaults to "lgbm_prod.txt".}
}
\value{
Character. The MinIO URI of the uploaded model artifact.
}
\description{
Trains a LightGBM model on the complete dataset using the winning
hyperparameters, serializes it to a text file, and uploads it directly
to MinIO via the Apache Arrow S3 interface.
}