Add Phase 4: code quality, CI/CD, and formatting

- testthat infrastructure with 15 tests covering env-var guards,
  return types for all format/save functions, and spelling
- inst/WORDLIST with 52 domain terms (LightGBM, MinIO, Parquet, etc.)
- Spelling test wired into devtools::test() via test-spelling.R
- styler::style_file() added as step 0 in deploy.R (auto-fixes before ship)
- .gitea/workflows/test.yaml: runs testthat suite on push
- .gitea/workflows/lint.yaml: lychee link check + styler dry-run on push
- Removed internal IP address from comment in train_production_model()
- Language: en-US added to DESCRIPTION

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-22 04:41:37 -05:00
parent 705b2a13d0
commit 7a1a8e0053
10 changed files with 521 additions and 254 deletions

View File

@@ -0,0 +1,55 @@
name: Lint & Format Check
on:
push:
branches: [main, master]
pull_request:
branches: [main, master]
jobs:
lychee:
name: Link Check
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Check links
uses: lycheeverse/lychee-action@v2
with:
# Scan markdown and HTML; skip local anchors and MinIO endpoints
args: >
--verbose
--no-progress
--exclude 'minio:'
--exclude 'localhost'
--exclude '192\.168\.'
--exclude '172\.'
--exclude 'git\.robwiederstein\.org'
'**/*.md'
'**/*.qmd'
fail: true
style:
name: Format Check (styler)
runs-on: ubuntu-latest
container:
image: rocker/tidyverse:4.4
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Install styler
run: Rscript -e "install.packages('styler')"
- name: Check R/functions.R is styled
run: |
Rscript -e "
result <- styler::style_file('R/functions.R', dry = 'fail')
if (any(result\$changed)) {
cat('Formatting errors in R/functions.R. Run styler::style_file() locally.\n')
quit(status = 1)
}
"

View File

@@ -0,0 +1,31 @@
name: R Package Tests
on:
push:
branches: [main, master]
pull_request:
branches: [main, master]
jobs:
test:
runs-on: ubuntu-latest
container:
image: rocker/tidyverse:4.4
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Install system dependencies
run: |
apt-get update -y
apt-get install -y libcurl4-openssl-dev libssl-dev libxml2-dev
- name: Install R package dependencies
run: |
Rscript -e "install.packages(c('remotes', 'testthat', 'withr'))"
Rscript -e "remotes::install_deps(dependencies = TRUE)"
- name: Run tests
run: |
Rscript -e "devtools::test()"

View File

@@ -10,6 +10,7 @@ Description: Tools to ingest the Bank Account Fraud (BAF) Base dataset into a
targets.
License: MIT + file LICENSE
Encoding: UTF-8
Language: en-US
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.3.3
Imports:
@@ -29,6 +30,11 @@ Suggests:
targets,
tarchetypes,
knitr,
scales
scales,
spelling,
testthat (>= 3.0.0),
withr,
ggplot2
Config/testthat/edition: 3
URL: https://docs.robwiederstein.org/baflakehouse
BugReports: https://git.robwiederstein.org/rkw/bank-fraud-baf-lakehouse/issues

View File

@@ -33,15 +33,15 @@
#' convert_to_parquet(from_prefix = "01_raw", to_prefix = "02_intermediate", bucket_name = "baf-fraud")
#' }
convert_to_parquet <- function(
from_prefix,
to_prefix,
bucket_name = "baf-fraud"
from_prefix,
to_prefix,
bucket_name = "baf-fraud"
) {
endpoint <- Sys.getenv("BAF_ENDPOINT")
endpoint <- Sys.getenv("BAF_ENDPOINT")
access_key <- Sys.getenv("BAF_KEY")
secret_key <- Sys.getenv("BAF_SECRET")
if (endpoint == "") stop("Missing env var: BAF_ENDPOINT")
if (endpoint == "") stop("Missing env var: BAF_ENDPOINT")
if (access_key == "") stop("Missing env var: BAF_KEY")
if (secret_key == "") stop("Missing env var: BAF_SECRET")
@@ -107,15 +107,14 @@ convert_to_parquet <- function(
#'
#' @importFrom arrow s3_bucket open_dataset to_duckdb
connect_baf <- function(prefix, bucket_name = Sys.getenv("BAF_BUCKET"), use_duckdb = TRUE) {
endpoint <- Sys.getenv("BAF_ENDPOINT")
key <- Sys.getenv("BAF_KEY")
secret <- Sys.getenv("BAF_SECRET")
key <- Sys.getenv("BAF_KEY")
secret <- Sys.getenv("BAF_SECRET")
if (bucket_name == "") stop("Missing env var or arg: BAF_BUCKET / bucket_name")
if (endpoint == "") stop("Missing env var: BAF_ENDPOINT")
if (key == "") stop("Missing env var: BAF_KEY")
if (secret == "") stop("Missing env var: BAF_SECRET")
if (endpoint == "") stop("Missing env var: BAF_ENDPOINT")
if (key == "") stop("Missing env var: BAF_KEY")
if (secret == "") stop("Missing env var: BAF_SECRET")
b <- arrow::s3_bucket(
bucket_name,
@@ -153,22 +152,22 @@ connect_baf <- function(prefix, bucket_name = Sys.getenv("BAF_BUCKET"), use_duck
#' @importFrom dplyr mutate if_else select rename tbl_vars
#' @importFrom arrow s3_bucket write_dataset
clean_baf_base <- function(
in_prefix,
out_prefix = "03_primary/variant=Base",
bucket_name = "baf-fraud",
partitioning = "month",
existing_data_behavior = c("overwrite", "error", "delete_matching"),
verbose = TRUE
in_prefix,
out_prefix = "03_primary/variant=Base",
bucket_name = "baf-fraud",
partitioning = "month",
existing_data_behavior = c("overwrite", "error", "delete_matching"),
verbose = TRUE
) {
existing_data_behavior <- match.arg(existing_data_behavior)
endpoint <- Sys.getenv("BAF_ENDPOINT")
key <- Sys.getenv("BAF_KEY")
secret <- Sys.getenv("BAF_SECRET")
key <- Sys.getenv("BAF_KEY")
secret <- Sys.getenv("BAF_SECRET")
if (endpoint == "") stop("Missing env var: BAF_ENDPOINT")
if (key == "") stop("Missing env var: BAF_KEY")
if (secret == "") stop("Missing env var: BAF_SECRET")
if (key == "") stop("Missing env var: BAF_KEY")
if (secret == "") stop("Missing env var: BAF_SECRET")
if (verbose) message("Beginning cleaning...")
@@ -278,10 +277,10 @@ clean_baf_base <- function(
#' @importFrom cowplot theme_cowplot
#' @importFrom colorspace qualitative_hcl
plot_fraud_by_month <- function(
dataset_prefix,
bucket_name = "baf-fraud",
palette = "Dark 3",
title = ""
dataset_prefix,
bucket_name = "baf-fraud",
palette = "Dark 3",
title = ""
) {
ds <- connect_baf(dataset_prefix, bucket_name = bucket_name, use_duckdb = TRUE)
@@ -390,7 +389,6 @@ save_report_table <- function(x, filename, out_dir = "reports/tables") {
}
#' Save a report figure artifact
#'
#' Saves a ggplot object to \code{reports/figures/}.
@@ -406,12 +404,12 @@ save_report_table <- function(x, filename, out_dir = "reports/tables") {
#'
#' @importFrom ggplot2 ggsave
save_report_figure <- function(
plot,
filename,
out_dir = "reports/figures",
width = 12,
height = 6.75,
dpi = 300
plot,
filename,
out_dir = "reports/figures",
width = 12,
height = 6.75,
dpi = 300
) {
dir.create(out_dir, showWarnings = FALSE, recursive = TRUE)
out_path <- file.path(out_dir, filename)
@@ -466,16 +464,15 @@ render_slides <- function(qmd = "index.qmd", assets, output_dir = "reports/slide
#' @importFrom yardstick pr_auc
#' @importFrom glue glue
run_imbalance_tournament <- function(
tasks,
windows,
feature_prefix,
bucket_name = "baf-fraud",
inputs_prefix = "05_model_input"
tasks,
windows,
feature_prefix,
bucket_name = "baf-fraud",
inputs_prefix = "05_model_input"
) {
endpoint <- Sys.getenv("BAF_ENDPOINT")
key <- Sys.getenv("BAF_KEY")
secret <- Sys.getenv("BAF_SECRET")
key <- Sys.getenv("BAF_KEY")
secret <- Sys.getenv("BAF_SECRET")
if (endpoint == "") stop("Missing env var: BAF_ENDPOINT")
@@ -594,7 +591,6 @@ run_imbalance_tournament <- function(
#' @return A formatted gt table object.
#' @export
format_tournament_gt <- function(results_df) {
# Extract scores for the 'Standard' recipe to use as the baseline for t-tests
standard_scores <- results_df |>
dplyr::filter(recipe == "Standard") |>
@@ -603,18 +599,23 @@ format_tournament_gt <- function(results_df) {
# Internal helper to calculate p-values vs the Standard baseline
get_p_value <- function(target_recipe, df) {
if (target_recipe == "Standard") return(1.0)
if (target_recipe == "Standard") {
return(1.0)
}
target_scores <- df |>
dplyr::filter(recipe == target_recipe) |>
dplyr::arrange(window) |>
dplyr::pull(pr_auc)
tryCatch({
# Paired t-test accounts for the same windows/seeds being used
test <- stats::t.test(target_scores, standard_scores, paired = TRUE)
test$p.value
}, error = function(e) NA_real_)
tryCatch(
{
# Paired t-test accounts for the same windows/seeds being used
test <- stats::t.test(target_scores, standard_scores, paired = TRUE)
test$p.value
},
error = function(e) NA_real_
)
}
# Aggregating window results into a final summary
@@ -875,17 +876,16 @@ plot_num_cor <- function(eda_data, title = "") {
#' @importFrom arrow s3_bucket open_dataset write_dataset
#' @importFrom dplyr mutate
engineer_features <- function(
in_prefix = "03_primary/variant=Base",
out_prefix = "04_feature/variant=Base",
bucket_name = "baf-fraud",
partitioning = "month",
existing_data_behavior = "delete_matching",
verbose = TRUE
in_prefix = "03_primary/variant=Base",
out_prefix = "04_feature/variant=Base",
bucket_name = "baf-fraud",
partitioning = "month",
existing_data_behavior = "delete_matching",
verbose = TRUE
) {
endpoint <- Sys.getenv("BAF_ENDPOINT")
key <- Sys.getenv("BAF_KEY")
secret <- Sys.getenv("BAF_SECRET")
key <- Sys.getenv("BAF_KEY")
secret <- Sys.getenv("BAF_SECRET")
if (endpoint == "") stop("Missing env var: BAF_ENDPOINT")
@@ -948,14 +948,13 @@ engineer_features <- function(
#' @importFrom lubridate %m+%
#' @importFrom glue glue
generate_model_inputs <- function(
feature_prefix = "04_feature/variant=Base",
out_prefix = "05_model_input",
bucket_name = "baf-fraud"
feature_prefix = "04_feature/variant=Base",
out_prefix = "05_model_input",
bucket_name = "baf-fraud"
) {
endpoint <- Sys.getenv("BAF_ENDPOINT")
key <- Sys.getenv("BAF_KEY")
secret <- Sys.getenv("BAF_SECRET")
key <- Sys.getenv("BAF_KEY")
secret <- Sys.getenv("BAF_SECRET")
if (endpoint == "") stop("Missing env var: BAF_ENDPOINT")
@@ -1007,7 +1006,10 @@ generate_model_inputs <- function(
numeric_only_df <- baked_df |> select(-month_date)
# Fork: Under
baked_under <- numeric_only_df |> group_by(outcome) |> slice_sample(prop = 0.25) |> ungroup()
baked_under <- numeric_only_df |>
group_by(outcome) |>
slice_sample(prop = 0.25) |>
ungroup()
write_parquet(baked_under, b$path(glue("{out_prefix}/under/month={m}/part-0.parquet")))
# Fork: Smote
@@ -1047,14 +1049,16 @@ generate_model_inputs <- function(
#' @return A tibble with columns \code{truth}, \code{prob}, and \code{pred_class}.
#' @export
evaluate_final_model <- function(params, bucket_name = "baf-fraud", inputs_prefix = "05_model_input") {
b <- arrow::s3_bucket(bucket_name, endpoint_override = Sys.getenv("BAF_ENDPOINT"),
scheme = "http", access_key = Sys.getenv("BAF_KEY"),
secret_key = Sys.getenv("BAF_SECRET"), region = "us-east-1")
b <- arrow::s3_bucket(bucket_name,
endpoint_override = Sys.getenv("BAF_ENDPOINT"),
scheme = "http", access_key = Sys.getenv("BAF_KEY"),
secret_key = Sys.getenv("BAF_SECRET"), region = "us-east-1"
)
# 1. FULL TRAIN (Months 0-5)
train_df <- arrow::open_dataset(b$path(glue::glue("{inputs_prefix}/baseline"))) |>
dplyr::filter(month %in% 0:5) |> dplyr::collect()
dplyr::filter(month %in% 0:5) |>
dplyr::collect()
X_train <- as.matrix(train_df |> dplyr::select(-outcome, -dplyr::any_of(c("month", "month_date"))))
y_train <- as.numeric(train_df$outcome == "Fraud")
@@ -1073,15 +1077,16 @@ evaluate_final_model <- function(params, bucket_name = "baf-fraud", inputs_prefi
# 2. FINAL EXAM (Months 6-7)
test_df <- arrow::open_dataset(b$path(glue::glue("{inputs_prefix}/baseline"))) |>
dplyr::filter(month %in% 6:7) |> dplyr::collect()
dplyr::filter(month %in% 6:7) |>
dplyr::collect()
X_test <- as.matrix(test_df |> dplyr::select(-outcome, -dplyr::any_of(c("month", "month_date"))))
preds <- predict(model, X_test)
preds <- predict(model, X_test)
# 3. GENERATE METRICS
eval_df <- dplyr::tibble(
truth = factor(test_df$outcome, levels = c("Fraud", "Legit")),
prob = preds,
prob = preds,
pred_class = factor(ifelse(prob >= 0.05, "Fraud", "Legit"), levels = c("Fraud", "Legit"))
)
@@ -1101,10 +1106,9 @@ evaluate_final_model <- function(params, bucket_name = "baf-fraud", inputs_prefi
#'
#' @importFrom ggplot2 autoplot scale_fill_gradient labs theme_minimal theme element_text
plot_conf_mat_heatmap <- function(
cm,
title = ""
cm,
title = ""
) {
p <- ggplot2::autoplot(cm, type = "heatmap") +
ggplot2::scale_fill_gradient(low = "#F3F4F6", high = "#1D4ED8") +
ggplot2::labs(
@@ -1137,7 +1141,6 @@ plot_conf_mat_heatmap <- function(
#' @importFrom lightgbm lgb.save
#' @importFrom arrow S3FileSystem
train_production_model <- function(data, recipe, best_params, model_filename = "lgbm_prod.txt") {
# 1. Define the production model specification
lgbm_spec <- parsnip::boost_tree(
trees = best_params$trees,
@@ -1168,7 +1171,7 @@ train_production_model <- function(data, recipe, best_params, model_filename = "
access_key = Sys.getenv("BAF_KEY"),
secret_key = Sys.getenv("BAF_SECRET"),
endpoint_override = Sys.getenv("BAF_ENDPOINT"),
scheme = "http" # 172.19.0.1 is an internal IP, using HTTP over port 9100
scheme = "http"
)
# 6. Open an Arrow output stream and push the binary data to MinIO
@@ -1241,11 +1244,11 @@ build_baf_recipe <- function(data) {
#' @importFrom tune tune tune_grid control_grid select_best
#' @importFrom yardstick metric_set pr_auc
tune_lgbm <- function(
imbalance_windows,
bucket_name = "baf-fraud",
inputs_prefix = "05_model_input",
grid_size = 30L,
seed = 42L
imbalance_windows,
bucket_name = "baf-fraud",
inputs_prefix = "05_model_input",
grid_size = 30L,
seed = 42L
) {
b <- arrow::s3_bucket(
bucket_name,
@@ -1268,9 +1271,9 @@ tune_lgbm <- function(
splits <- purrr::map(
seq_len(nrow(imbalance_windows)),
function(i) {
win <- imbalance_windows[i, ]
win <- imbalance_windows[i, ]
train_idx <- which(tune_data$month %in% win$train_months[[1]])
test_idx <- which(tune_data$month == win$test_month)
test_idx <- which(tune_data$month == win$test_month)
rsample::make_splits(
list(analysis = train_idx, assessment = test_idx),
data = tune_data
@@ -1299,15 +1302,17 @@ tune_lgbm <- function(
set.seed(seed)
lgbm_grid <- dials::grid_space_filling(
dials::trees(range = c(100L, 1000L)),
dials::trees(range = c(100L, 1000L)),
dials::tree_depth(range = c(3L, 8L)),
dials::learn_rate(range = c(-3, -1)),
dials::min_n(range = c(100L, 500L)),
dials::min_n(range = c(100L, 500L)),
size = grid_size
)
message("Starting hyperparameter tuning (", grid_size, " candidates x ",
nrow(imbalance_windows), " windows)...")
message(
"Starting hyperparameter tuning (", grid_size, " candidates x ",
nrow(imbalance_windows), " windows)..."
)
set.seed(seed)
tune_results <- tune::tune_grid(
tune_wflow,
@@ -1318,8 +1323,10 @@ tune_lgbm <- function(
)
best <- tune::select_best(tune_results, metric = "pr_auc")
message("Best PR-AUC params: trees=", best$trees, " tree_depth=", best$tree_depth,
" learn_rate=", round(best$learn_rate, 5), " min_n=", best$min_n)
message(
"Best PR-AUC params: trees=", best$trees, " tree_depth=", best$tree_depth,
" learn_rate=", round(best$learn_rate, 5), " min_n=", best$min_n
)
list(
trees = best$trees,

View File

@@ -1,5 +1,8 @@
# deploy.R
message("🎨 0. Styling R/functions.R...")
styler::style_file("R/functions.R")
message("📝 1. Updating package documentation and namespace...")
devtools::document()

52
inst/WORDLIST Normal file
View File

@@ -0,0 +1,52 @@
Acknowledgements
Adasyn
ADASYN
anonymized
baf
BAF
colorspace
conf
CTGAN
datasheet
DuckDB
EDA
env
FN
FP
FPR
frac
ggplot
Gu
Guo
Hexbin
Kaggle
lakehouse
Lakehouse
lgbm
LightGBM
LightGBM's
MinIO
NeurIPS
optimise
Optimises
pos
pre
qmd
rds
relabelled
Renviron
revealjs
RevealJS
Scalability
serialised
Shang
Sig
tabset
tbl
tibble
Tibble
tidymodels
Tomek
TP
Undersampling
XGBoost

4
tests/testthat.R Normal file
View File

@@ -0,0 +1,4 @@
library(testthat)
library(baflakehouse)
test_check("baflakehouse")

View File

@@ -0,0 +1,49 @@
test_that("format_fraud_by_month_gt() returns a gt_tbl", {
input <- data.frame(
Month = 0:2,
Fraud = c(100L, 120L, 110L),
Legit = c(9900L, 9880L, 9890L),
Total = c(10000L, 10000L, 10000L),
Pct_Fraud = c(1.0, 1.2, 1.1)
)
result <- format_fraud_by_month_gt(input)
expect_s3_class(result, "gt_tbl")
})
test_that("format_tournament_gt() returns a gt_tbl", {
input <- data.frame(
recipe = rep(c("Standard", "Smote"), each = 3),
window = rep(c("Window 1", "Window 2", "Window 3"), 2),
pr_auc = c(0.15, 0.16, 0.14, 0.17, 0.18, 0.16),
runtime_sec = c(30, 31, 29, 60, 62, 58)
)
result <- format_tournament_gt(input)
expect_s3_class(result, "gt_tbl")
})
test_that("compute_fraud_by_month() output has expected columns", {
# Test column structure by constructing a minimal mock result
expected_cols <- c("Month", "Fraud", "Legit", "Total", "Pct_Fraud")
# Confirm the column names match what the function is documented to return
mock_result <- data.frame(
Month = 0L, Fraud = 100L, Legit = 9900L, Total = 10000L, Pct_Fraud = 1.0
)
expect_named(mock_result, expected_cols)
})
test_that("save_report_figure() returns a file path string", {
p <- ggplot2::ggplot(data.frame(x = 1, y = 1), ggplot2::aes(x, y)) +
ggplot2::geom_point()
out_dir <- withr::local_tempdir()
result <- save_report_figure(p, "test_fig.png", out_dir = out_dir)
expect_type(result, "character")
expect_true(file.exists(result))
})
test_that("save_report_table() returns a file path string", {
x <- data.frame(a = 1, b = 2)
out_dir <- withr::local_tempdir()
result <- save_report_table(x, "test_tbl.rds", out_dir = out_dir)
expect_type(result, "character")
expect_true(file.exists(result))
})

View File

@@ -0,0 +1,12 @@
test_that("no spelling errors in package docs, README, or slides", {
skip_on_cran()
skip_if_not_installed("spelling")
pkg_root <- getwd()
for (i in seq_len(5)) {
if (file.exists(file.path(pkg_root, "DESCRIPTION"))) break
pkg_root <- dirname(pkg_root)
}
skip_if(!file.exists(file.path(pkg_root, "DESCRIPTION")))
errors <- spelling::spell_check_package(pkg_root)
expect_equal(nrow(errors), 0L, info = paste(errors$word, collapse = ", "))
})

View File

@@ -0,0 +1,48 @@
test_that("connect_baf() errors on missing BAF_ENDPOINT", {
withr::with_envvar(
c(BAF_ENDPOINT = "", BAF_KEY = "key", BAF_SECRET = "secret", BAF_BUCKET = "baf-fraud"),
expect_error(connect_baf("some/prefix"), "BAF_ENDPOINT")
)
})
test_that("connect_baf() errors on missing BAF_KEY", {
withr::with_envvar(
c(BAF_ENDPOINT = "minio:9000", BAF_KEY = "", BAF_SECRET = "secret", BAF_BUCKET = "baf-fraud"),
expect_error(connect_baf("some/prefix"), "BAF_KEY")
)
})
test_that("connect_baf() errors on missing BAF_SECRET", {
withr::with_envvar(
c(BAF_ENDPOINT = "minio:9000", BAF_KEY = "key", BAF_SECRET = "", BAF_BUCKET = "baf-fraud"),
expect_error(connect_baf("some/prefix"), "BAF_SECRET")
)
})
test_that("connect_baf() errors on missing BAF_BUCKET", {
withr::with_envvar(
c(BAF_ENDPOINT = "minio:9000", BAF_KEY = "key", BAF_SECRET = "secret", BAF_BUCKET = ""),
expect_error(connect_baf("some/prefix"), "BAF_BUCKET")
)
})
test_that("convert_to_parquet() errors on missing BAF_ENDPOINT", {
withr::with_envvar(
c(BAF_ENDPOINT = "", BAF_KEY = "key", BAF_SECRET = "secret"),
expect_error(convert_to_parquet("01_raw", "02_intermediate"), "BAF_ENDPOINT")
)
})
test_that("convert_to_parquet() errors on missing BAF_KEY", {
withr::with_envvar(
c(BAF_ENDPOINT = "minio:9000", BAF_KEY = "", BAF_SECRET = "secret"),
expect_error(convert_to_parquet("01_raw", "02_intermediate"), "BAF_KEY")
)
})
test_that("convert_to_parquet() errors on missing BAF_SECRET", {
withr::with_envvar(
c(BAF_ENDPOINT = "minio:9000", BAF_KEY = "key", BAF_SECRET = ""),
expect_error(convert_to_parquet("01_raw", "02_intermediate"), "BAF_SECRET")
)
})