From e6c20bd221c9f62f189906eaf8e2b1c7240e387d Mon Sep 17 00:00:00 2001
From: Rob Wiederstein
Date: Sun, 22 Feb 2026 16:18:15 -0500
Subject: [PATCH] Add Gitea CI deployment workflow and update dependencies

---
Review notes (text between the three-dash line and the first "diff --git"
is not part of the commit message or of any hunk):

 * Purpose: adds a Gitea Actions workflow that builds the site with deploy.R
   and rsyncs docs/ to the server, adds a Dockerfile that restores the renv
   library and runs deploy.R, extends .dockerignore, expands the DESCRIPTION
   Imports list, and replaces the lightgbm renv.lock record with the
   repository release 4.6.0.
 * FIX (.gitea/workflows/deploy.yaml, "Install system dependencies"): the
   package list after "apt-get install -y" now uses backslash continuations.
   Inside a "run: |" literal block every line is its own shell command, so
   without them apt-get ran with no package arguments and the package names
   were executed as commands; rsync and openssh-client, which the deploy
   step calls, were never installed.
 * FIX (same file, "Deploy via Rsync"): $SERVER_IP and the rsync destination
   are double-quoted so the values are passed as single words.
 * NOTE(review): the blob id on the deploy.yaml "index" line predates the
   two fixes above; regenerate the patch if exact ids matter.
 * NOTE(review): bin/~ is a symlink to an absolute path under /home/rstudio
   and looks like an accidental addition (it is also listed in
   .dockerignore) - confirm whether it should be part of this commit.
 * NOTE(review): the workflow container is rocker/verse:4.4 while the
   Dockerfile is based on rocker/verse:4.5.2 - confirm the difference is
   intended.
 * NOTE(review): line breaks and indentation were restored from a
   whitespace-collapsed copy. Angle-bracketed text (the author address
   above, and what appears to be a reference and a maintainer address in
   the removed lightgbm record) is absent from that copy and has not been
   reinvented - verify against the original commit before applying.

 .dockerignore                  | 12 ++++++-
 .gitea/workflows/deploy.yaml   | 65 ++++++++++++++++++++++++++++++++++
 DESCRIPTION                    | 32 ++++++++++++-----
 Dockerfile                     | 45 +++++++++++++++++++++++
 bank-fraud-baf-lakehouse.Rproj | 18 ++++++++++
 bin/~                          |  1 +
 index.qmd                      |  4 +--
 renv.lock                      | 40 ++-------------------
 8 files changed, 167 insertions(+), 50 deletions(-)
 create mode 100644 .gitea/workflows/deploy.yaml
 create mode 100644 Dockerfile
 create mode 100644 bank-fraud-baf-lakehouse.Rproj
 create mode 120000 bin/~

diff --git a/.dockerignore b/.dockerignore
index 4a5e932..137f2f4 100644
--- a/.dockerignore
+++ b/.dockerignore
@@ -1 +1,11 @@
-/dev/
+.git/
+.Renviron
+dev/
+renv/library/
+renv/staging/
+_targets/
+docs/
+reports/
+scratch/
+bin/~
+*.Rproj
diff --git a/.gitea/workflows/deploy.yaml b/.gitea/workflows/deploy.yaml
new file mode 100644
index 0000000..b0ea60a
--- /dev/null
+++ b/.gitea/workflows/deploy.yaml
@@ -0,0 +1,65 @@
+name: Deploy Lakehouse Docs
+
+on:
+  push:
+    branches:
+      - main
+      - master
+
+jobs:
+  build-and-deploy:
+    runs-on: ubuntu-latest
+    container:
+      # Use the rocker/verse image as it contains R, pandoc, quarto, and many tidyverse dependencies
+      image: rocker/verse:4.4
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Install system dependencies
+        # Install system dependencies required for R packages (xml2, curl, graphics libraries) + rsync/ssh
+        run: |
+          apt-get update -y
+          apt-get install -y \
+            libcurl4-openssl-dev libssl-dev libxml2-dev libglpk-dev \
+            libfontconfig1-dev libfreetype6-dev libharfbuzz-dev libfribidi-dev \
+            libpng-dev libjpeg-dev libtiff-dev libzstd-dev cmake \
+            rsync openssh-client
+
+      - name: Install R package dependencies
+        # First install 'renv' or 'remotes', then install project dependencies
+        # Using remotes here as it's typically faster for CI if renv cache isn't available
+        run: |
+          Rscript -e "install.packages('remotes')"
+          Rscript -e "remotes::install_deps(dependencies = TRUE)"
+
+      - name: Run Build Script
+        # This executes deploy.R which runs styler, devtools::document, targets::tar_make, and pkgdown::build_site
+        # Note: If tar_make() interacts with MinIO, you MUST provide the BAF_* secrets in Gitea repository settings.
+        env:
+          BAF_KEY: ${{ secrets.BAF_KEY }}
+          BAF_SECRET: ${{ secrets.BAF_SECRET }}
+          BAF_ENDPOINT: ${{ secrets.BAF_ENDPOINT }}
+        run: |
+          Rscript deploy.R
+
+      - name: Deploy via Rsync
+        # Rsync the generated docs/ folder to the host machine.
+        # This requires setting DEPLOY_SSH_KEY, DEPLOY_SERVER_USER, and DEPLOY_SERVER_IP in Gitea Secrets.
+        env:
+          SSH_PRIVATE_KEY: ${{ secrets.DEPLOY_SSH_KEY }}
+          SERVER_IP: ${{ secrets.DEPLOY_SERVER_IP }}
+          SERVER_USER: ${{ secrets.DEPLOY_SERVER_USER }}
+          TARGET_DIR: /data/projects/bank-fraud-baf-lakehouse/docs/
+        run: |
+          # Setup SSH key
+          mkdir -p ~/.ssh
+          echo "$SSH_PRIVATE_KEY" > ~/.ssh/id_rsa
+          chmod 600 ~/.ssh/id_rsa
+
+          # Add host to known_hosts to prevent interactive prompt
+          ssh-keyscan -H "$SERVER_IP" >> ~/.ssh/known_hosts
+
+          # Sync the docs/ directory directly into the Caddy webroot
+          rsync -avz --delete docs/ "${SERVER_USER}@${SERVER_IP}:${TARGET_DIR}"
diff --git a/DESCRIPTION b/DESCRIPTION
index b517fe6..d2b6893 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -15,26 +15,40 @@ Roxygen: list(markdown = TRUE)
 RoxygenNote: 7.3.3
 Imports:
     arrow,
+    bonsai,
     colorspace,
+    corrr,
     cowplot,
+    dials,
     dplyr,
-    tidyr,
-    stringr,
-    readr,
-    gt,
-    quarto,
     ggplot2,
-    bonsai
+    ggrepel,
+    glue,
+    gt,
+    lightgbm,
+    lubridate,
+    parsnip,
+    purrr,
+    quarto,
+    readr,
+    recipes,
+    rsample,
+    scales,
+    stringr,
+    themis,
+    tidyr,
+    tidyselect,
+    tune,
+    workflows,
+    yardstick
 Suggests:
     duckdb,
     targets,
     tarchetypes,
     knitr,
-    scales,
     spelling,
     testthat (>= 3.0.0),
-    withr,
-    ggplot2
+    withr
 Config/testthat/edition: 3
 URL: https://docs.robwiederstein.org/baflakehouse
 BugReports: https://git.robwiederstein.org/rkw/bank-fraud-baf-lakehouse/issues
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..d30725b
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,45 @@
+FROM rocker/verse:4.5.2
+
+# System dependencies for arrow, lightgbm, and ggplot2 (ragg/textshaping)
+# Quarto is pre-installed in rocker/verse
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    cmake \
+    libcurl4-openssl-dev \
+    libssl-dev \
+    libxml2-dev \
+    libglpk-dev \
+    libfontconfig1-dev \
+    libfreetype6-dev \
+    libharfbuzz-dev \
+    libfribidi-dev \
+    libpng-dev \
+    libjpeg-dev \
+    libtiff-dev \
+    libzstd-dev \
+    && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /app
+
+# Copy renv infrastructure first so package restore is a cached layer.
+# Changes to source code below will not invalidate this layer.
+COPY renv.lock .Rprofile ./
+COPY renv/activate.R renv/settings.json renv/
+
+RUN Rscript -e "renv::restore()"
+
+# Copy the full package source
+COPY . .
+
+# Install the local package into the renv library
+RUN Rscript -e "renv::install('.')"
+
+# Non-secret default — override with --env at runtime if needed
+ENV BAF_BUCKET=lake
+
+# Secrets and endpoint are injected at runtime — never baked into the image:
+# docker run \
+#   --env BAF_ENDPOINT=172.19.0.1:9100 \
+#   --env BAF_KEY=... \
+#   --env BAF_SECRET=... \
+#   baflakehouse
+CMD ["Rscript", "deploy.R"]
diff --git a/bank-fraud-baf-lakehouse.Rproj b/bank-fraud-baf-lakehouse.Rproj
new file mode 100644
index 0000000..eaa6b81
--- /dev/null
+++ b/bank-fraud-baf-lakehouse.Rproj
@@ -0,0 +1,18 @@
+Version: 1.0
+
+RestoreWorkspace: Default
+SaveWorkspace: Default
+AlwaysSaveHistory: Default
+
+EnableCodeIndexing: Yes
+UseSpacesForTab: Yes
+NumSpacesForTab: 2
+Encoding: UTF-8
+
+RnwWeave: Sweave
+LaTeX: pdfLaTeX
+
+BuildType: Package
+PackageUseDevtools: Yes
+PackageInstallArgs: --no-multiarch --with-keep.source
+PackageRoxygenize: rd,collate,namespace
diff --git a/bin/~ b/bin/~
new file mode 120000
index 0000000..0901b80
--- /dev/null
+++ b/bin/~
@@ -0,0 +1 @@
+/home/rstudio/.local/share/claude/versions/2.1.50
\ No newline at end of file
diff --git a/index.qmd b/index.qmd
index 45ba317..258ac23 100644
--- a/index.qmd
+++ b/index.qmd
@@ -277,7 +277,7 @@ knitr::include_graphics("resources/images/confusion-matrix.png")
 ```
 
 :::{.notes}
-The confusion matrix is the foundation of all classification metrics. Every metric we care about is derived from these four cells.
+The confusion matrix is the foundation of all classification metrics. Every metric is derived from these four cells.
 
 In the fraud context:
 - **TN:** Legitimate application correctly approved. No harm done.
@@ -301,7 +301,7 @@ $$\text{Precision} = \frac{TP}{TP + FP}$$
 > Of all flagged cases, how many were real fraud?
 
 :::{.notes}
-**Recall** (also called **detection rate**) is the primary metric for fraud detection. Missing a fraud case (FN) is costly, so we want Recall as high as possible. A model that flags every application gets a perfect detection rate — but at the cost of Precision.
+**Recall** is the primary metric for fraud detection. Missing a fraud case (FN) is costly, so we want Recall as high as possible. A model that flags every application gets perfect Recall — but at the cost of Precision.
 
 **Precision** captures that cost: if we flag everything, every legitimate customer gets rejected. Precision measures how trustworthy our fraud flags actually are.
 
diff --git a/renv.lock b/renv.lock
index 17ff05c..e242297 100644
--- a/renv.lock
+++ b/renv.lock
@@ -4558,44 +4558,8 @@
     },
     "lightgbm": {
       "Package": "lightgbm",
-      "Version": "4.6.0.99",
-      "Source": "unknown",
-      "Type": "Package",
-      "Title": "Light Gradient Boosting Machine",
-      "Date": "2026-02-16",
-      "Authors@R": "c( person(\"Yu\", \"Shi\", email = \"yushi2@microsoft.com\", role = c(\"aut\")), person(\"Guolin\", \"Ke\", email = \"guolin.ke@outlook.com\", role = c(\"aut\")), person(\"Damien\", \"Soukhavong\", email = \"damien.soukhavong@skema.edu\", role = c(\"aut\")), person(\"James\", \"Lamb\", email=\"jaylamb20@gmail.com\", role = c(\"aut\", \"cre\")), person(\"Qi\", \"Meng\", role = c(\"aut\")), person(\"Thomas\", \"Finley\", role = c(\"aut\")), person(\"Taifeng\", \"Wang\", role = c(\"aut\")), person(\"Wei\", \"Chen\", role = c(\"aut\")), person(\"Weidong\", \"Ma\", role = c(\"aut\")), person(\"Qiwei\", \"Ye\", role = c(\"aut\")), person(\"Tie-Yan\", \"Liu\", role = c(\"aut\")), person(\"Nikita\", \"Titov\", role = c(\"aut\")), person(\"Yachen\", \"Yan\", role = c(\"ctb\")), person(\"Microsoft Corporation\", role = c(\"cph\")), person(\"Dropbox, Inc.\", role = c(\"cph\")), person(\"Alberto\", \"Ferreira\", role = c(\"ctb\")), person(\"Daniel\", \"Lemire\", role = c(\"ctb\")), person(\"Victor\", \"Zverovich\", role = c(\"cph\")), person(\"IBM Corporation\", role = c(\"ctb\")), person(\"David\", \"Cortes\", role = c(\"aut\")), person(\"Michael\", \"Mayer\", role = c(\"ctb\")) )",
-      "Description": "Tree based algorithms can be improved by introducing boosting frameworks. 'LightGBM' is one such framework, based on Ke, Guolin et al. (2017) . This package offers an R interface to work with it. It is designed to be distributed and efficient with the following advantages: 1. Faster training speed and higher efficiency. 2. Lower memory usage. 3. Better accuracy. 4. Parallel learning supported. 5. Capable of handling large-scale data. In recognition of these advantages, 'LightGBM' has been widely-used in many winning solutions of machine learning competitions. Comparison experiments on public datasets suggest that 'LightGBM' can outperform existing boosting frameworks on both efficiency and accuracy, with significantly lower memory consumption. In addition, parallel experiments suggest that in certain circumstances, 'LightGBM' can achieve a linear speed-up in training time by using multiple machines.",
-      "Encoding": "UTF-8",
-      "License": "MIT + file LICENSE",
-      "URL": "https://github.com/Microsoft/LightGBM",
-      "BugReports": "https://github.com/Microsoft/LightGBM/issues",
-      "NeedsCompilation": "yes",
-      "Biarch": "true",
-      "VignetteBuilder": "knitr",
-      "Suggests": [
-        "knitr",
-        "markdown",
-        "processx",
-        "RhpcBLASctl",
-        "testthat"
-      ],
-      "Depends": [
-        "R (>= 4.0)"
-      ],
-      "Imports": [
-        "R6 (>= 2.4.0)",
-        "data.table (>= 1.9.6)",
-        "graphics",
-        "jsonlite (>= 1.0)",
-        "Matrix (>= 1.1-0)",
-        "methods",
-        "parallel",
-        "utils"
-      ],
-      "SystemRequirements": "C++17",
-      "RoxygenNote": "7.3.3",
-      "Author": "Yu Shi [aut], Guolin Ke [aut], Damien Soukhavong [aut], James Lamb [aut, cre], Qi Meng [aut], Thomas Finley [aut], Taifeng Wang [aut], Wei Chen [aut], Weidong Ma [aut], Qiwei Ye [aut], Tie-Yan Liu [aut], Nikita Titov [aut], Yachen Yan [ctb], Microsoft Corporation [cph], Dropbox, Inc. [cph], Alberto Ferreira [ctb], Daniel Lemire [ctb], Victor Zverovich [cph], IBM Corporation [ctb], David Cortes [aut], Michael Mayer [ctb]",
-      "Maintainer": "James Lamb "
+      "Version": "4.6.0",
+      "Source": "Repository"
     },
     "listenv": {
       "Package": "listenv",