Add Gitea CI deployment workflow and update dependencies
This commit is contained in:
@@ -1 +1,11 @@
|
||||
/dev/
|
||||
.git/
|
||||
.Renviron
|
||||
dev/
|
||||
renv/library/
|
||||
renv/staging/
|
||||
_targets/
|
||||
docs/
|
||||
reports/
|
||||
scratch/
|
||||
bin/~
|
||||
*.Rproj
|
||||
|
||||
65
.gitea/workflows/deploy.yaml
Normal file
65
.gitea/workflows/deploy.yaml
Normal file
@@ -0,0 +1,65 @@
|
||||
name: Deploy Lakehouse Docs
|
||||
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
- master
|
||||
|
||||
jobs:
|
||||
build-and-deploy:
|
||||
runs-on: ubuntu-latest
|
||||
container:
|
||||
# Use the rocker/verse image as it contains R, pandoc, quarto, and many tidyverse dependencies
|
||||
image: rocker/verse:4.4
|
||||
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Install system dependencies
|
||||
# Install system dependencies required for R packages (xml2, curl, graphics libraries) + rsync/ssh
|
||||
run: |
|
||||
apt-get update -y
|
||||
# Backslash continuations keep the whole package list as arguments to a single
# apt-get invocation; without them every line runs as its own shell command.
apt-get install -y \
libcurl4-openssl-dev libssl-dev libxml2-dev libglpk-dev \
libfontconfig1-dev libfreetype6-dev libharfbuzz-dev libfribidi-dev \
libpng-dev libjpeg-dev libtiff-dev libzstd-dev cmake \
rsync openssh-client
|
||||
|
||||
- name: Install R package dependencies
|
||||
# First install 'renv' or 'remotes', then install project dependencies
|
||||
# Using remotes here as it's typically faster for CI if renv cache isn't available
|
||||
run: |
|
||||
Rscript -e "install.packages('remotes')"
|
||||
Rscript -e "remotes::install_deps(dependencies = TRUE)"
|
||||
|
||||
- name: Run Build Script
|
||||
# This executes deploy.R which runs styler, devtools::document, targets::tar_make, and pkgdown::build_site
|
||||
# Note: If tar_make() interacts with MinIO, you MUST provide the BAF_* secrets in Gitea repository settings.
|
||||
env:
|
||||
BAF_KEY: ${{ secrets.BAF_KEY }}
|
||||
BAF_SECRET: ${{ secrets.BAF_SECRET }}
|
||||
BAF_ENDPOINT: ${{ secrets.BAF_ENDPOINT }}
|
||||
run: |
|
||||
Rscript deploy.R
|
||||
|
||||
- name: Deploy via Rsync
|
||||
# Rsync the generated docs/ folder to the host machine.
|
||||
# This requires setting DEPLOY_SSH_KEY, DEPLOY_SERVER_USER, and DEPLOY_SERVER_IP in Gitea Secrets.
|
||||
env:
|
||||
SSH_PRIVATE_KEY: ${{ secrets.DEPLOY_SSH_KEY }}
|
||||
SERVER_IP: ${{ secrets.DEPLOY_SERVER_IP }}
|
||||
SERVER_USER: ${{ secrets.DEPLOY_SERVER_USER }}
|
||||
TARGET_DIR: /data/projects/bank-fraud-baf-lakehouse/docs/
|
||||
run: |
|
||||
# Setup SSH key
mkdir -p ~/.ssh
echo "$SSH_PRIVATE_KEY" > ~/.ssh/id_rsa
chmod 600 ~/.ssh/id_rsa

# Add host to known_hosts to prevent interactive prompt
# (quoted so the secret value is always passed as exactly one argument)
ssh-keyscan -H "$SERVER_IP" >> ~/.ssh/known_hosts

# Sync the docs/ directory directly into the Caddy webroot
# The destination is quoted so the user/host/path expands to a single rsync argument.
rsync -avz --delete docs/ "${SERVER_USER}@${SERVER_IP}:${TARGET_DIR}"
|
||||
32
DESCRIPTION
32
DESCRIPTION
@@ -15,26 +15,40 @@ Roxygen: list(markdown = TRUE)
|
||||
RoxygenNote: 7.3.3
|
||||
Imports:
|
||||
arrow,
|
||||
bonsai,
|
||||
colorspace,
|
||||
corrr,
|
||||
cowplot,
|
||||
dials,
|
||||
dplyr,
|
||||
tidyr,
|
||||
stringr,
|
||||
readr,
|
||||
gt,
|
||||
quarto,
|
||||
ggplot2,
|
||||
bonsai
|
||||
ggrepel,
|
||||
glue,
|
||||
gt,
|
||||
lightgbm,
|
||||
lubridate,
|
||||
parsnip,
|
||||
purrr,
|
||||
quarto,
|
||||
readr,
|
||||
recipes,
|
||||
rsample,
|
||||
scales,
|
||||
stringr,
|
||||
themis,
|
||||
tidyr,
|
||||
tidyselect,
|
||||
tune,
|
||||
workflows,
|
||||
yardstick
|
||||
Suggests:
|
||||
duckdb,
|
||||
targets,
|
||||
tarchetypes,
|
||||
knitr,
|
||||
scales,
|
||||
spelling,
|
||||
testthat (>= 3.0.0),
|
||||
withr,
|
||||
ggplot2
|
||||
withr
|
||||
Config/testthat/edition: 3
|
||||
URL: https://docs.robwiederstein.org/baflakehouse
|
||||
BugReports: https://git.robwiederstein.org/rkw/bank-fraud-baf-lakehouse/issues
|
||||
|
||||
45
Dockerfile
Normal file
45
Dockerfile
Normal file
@@ -0,0 +1,45 @@
|
||||
FROM rocker/verse:4.5.2
|
||||
|
||||
# System dependencies for arrow, lightgbm, and ggplot2 (ragg/textshaping)
|
||||
# Quarto is pre-installed in rocker/verse
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
cmake \
|
||||
libcurl4-openssl-dev \
|
||||
libssl-dev \
|
||||
libxml2-dev \
|
||||
libglpk-dev \
|
||||
libfontconfig1-dev \
|
||||
libfreetype6-dev \
|
||||
libharfbuzz-dev \
|
||||
libfribidi-dev \
|
||||
libpng-dev \
|
||||
libjpeg-dev \
|
||||
libtiff-dev \
|
||||
libzstd-dev \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
# Copy renv infrastructure first so package restore is a cached layer.
|
||||
# Changes to source code below will not invalidate this layer.
|
||||
COPY renv.lock .Rprofile ./
|
||||
COPY renv/activate.R renv/settings.json renv/
|
||||
|
||||
RUN Rscript -e "renv::restore()"
|
||||
|
||||
# Copy the full package source
|
||||
COPY . .
|
||||
|
||||
# Install the local package into the renv library
|
||||
RUN Rscript -e "renv::install('.')"
|
||||
|
||||
# Non-secret default — override with --env at runtime if needed
|
||||
ENV BAF_BUCKET=lake
|
||||
|
||||
# Secrets and endpoint are injected at runtime — never baked into the image:
|
||||
# docker run \
|
||||
# --env BAF_ENDPOINT=172.19.0.1:9100 \
|
||||
# --env BAF_KEY=... \
|
||||
# --env BAF_SECRET=... \
|
||||
# baflakehouse
|
||||
CMD ["Rscript", "deploy.R"]
|
||||
18
bank-fraud-baf-lakehouse.Rproj
Normal file
18
bank-fraud-baf-lakehouse.Rproj
Normal file
@@ -0,0 +1,18 @@
|
||||
Version: 1.0
|
||||
|
||||
RestoreWorkspace: Default
|
||||
SaveWorkspace: Default
|
||||
AlwaysSaveHistory: Default
|
||||
|
||||
EnableCodeIndexing: Yes
|
||||
UseSpacesForTab: Yes
|
||||
NumSpacesForTab: 2
|
||||
Encoding: UTF-8
|
||||
|
||||
RnwWeave: Sweave
|
||||
LaTeX: pdfLaTeX
|
||||
|
||||
BuildType: Package
|
||||
PackageUseDevtools: Yes
|
||||
PackageInstallArgs: --no-multiarch --with-keep.source
|
||||
PackageRoxygenize: rd,collate,namespace
|
||||
@@ -277,7 +277,7 @@ knitr::include_graphics("resources/images/confusion-matrix.png")
|
||||
```
|
||||
|
||||
:::{.notes}
|
||||
The confusion matrix is the foundation of all classification metrics. Every metric we care about is derived from these four cells.
|
||||
The confusion matrix is the foundation of all classification metrics. Every metric is derived from these four cells.
|
||||
|
||||
In the fraud context:
|
||||
- **TN:** Legitimate application correctly approved. No harm done.
|
||||
@@ -301,7 +301,7 @@ $$\text{Precision} = \frac{TP}{TP + FP}$$
|
||||
> Of all flagged cases, how many were real fraud?
|
||||
|
||||
:::{.notes}
|
||||
**Recall** (also called **detection rate**) is the primary metric for fraud detection. Missing a fraud case (FN) is costly, so we want Recall as high as possible. A model that flags every application gets a perfect detection rate — but at the cost of Precision.
|
||||
**Recall** is the primary metric for fraud detection. Missing a fraud case (FN) is costly, so we want Recall as high as possible. A model that flags every application gets perfect Recall — but at the cost of Precision.
|
||||
|
||||
**Precision** captures that cost: if we flag everything, every legitimate customer gets rejected. Precision measures how trustworthy our fraud flags actually are.
|
||||
|
||||
|
||||
40
renv.lock
40
renv.lock
@@ -4558,44 +4558,8 @@
|
||||
},
|
||||
"lightgbm": {
|
||||
"Package": "lightgbm",
|
||||
"Version": "4.6.0.99",
|
||||
"Source": "unknown",
|
||||
"Type": "Package",
|
||||
"Title": "Light Gradient Boosting Machine",
|
||||
"Date": "2026-02-16",
|
||||
"Authors@R": "c( person(\"Yu\", \"Shi\", email = \"yushi2@microsoft.com\", role = c(\"aut\")), person(\"Guolin\", \"Ke\", email = \"guolin.ke@outlook.com\", role = c(\"aut\")), person(\"Damien\", \"Soukhavong\", email = \"damien.soukhavong@skema.edu\", role = c(\"aut\")), person(\"James\", \"Lamb\", email=\"jaylamb20@gmail.com\", role = c(\"aut\", \"cre\")), person(\"Qi\", \"Meng\", role = c(\"aut\")), person(\"Thomas\", \"Finley\", role = c(\"aut\")), person(\"Taifeng\", \"Wang\", role = c(\"aut\")), person(\"Wei\", \"Chen\", role = c(\"aut\")), person(\"Weidong\", \"Ma\", role = c(\"aut\")), person(\"Qiwei\", \"Ye\", role = c(\"aut\")), person(\"Tie-Yan\", \"Liu\", role = c(\"aut\")), person(\"Nikita\", \"Titov\", role = c(\"aut\")), person(\"Yachen\", \"Yan\", role = c(\"ctb\")), person(\"Microsoft Corporation\", role = c(\"cph\")), person(\"Dropbox, Inc.\", role = c(\"cph\")), person(\"Alberto\", \"Ferreira\", role = c(\"ctb\")), person(\"Daniel\", \"Lemire\", role = c(\"ctb\")), person(\"Victor\", \"Zverovich\", role = c(\"cph\")), person(\"IBM Corporation\", role = c(\"ctb\")), person(\"David\", \"Cortes\", role = c(\"aut\")), person(\"Michael\", \"Mayer\", role = c(\"ctb\")) )",
|
||||
"Description": "Tree based algorithms can be improved by introducing boosting frameworks. 'LightGBM' is one such framework, based on Ke, Guolin et al. (2017) <https://proceedings.neurips.cc/paper/2017/hash/6449f44a102fde848669bdd9eb6b76fa-Abstract.html>. This package offers an R interface to work with it. It is designed to be distributed and efficient with the following advantages: 1. Faster training speed and higher efficiency. 2. Lower memory usage. 3. Better accuracy. 4. Parallel learning supported. 5. Capable of handling large-scale data. In recognition of these advantages, 'LightGBM' has been widely-used in many winning solutions of machine learning competitions. Comparison experiments on public datasets suggest that 'LightGBM' can outperform existing boosting frameworks on both efficiency and accuracy, with significantly lower memory consumption. In addition, parallel experiments suggest that in certain circumstances, 'LightGBM' can achieve a linear speed-up in training time by using multiple machines.",
|
||||
"Encoding": "UTF-8",
|
||||
"License": "MIT + file LICENSE",
|
||||
"URL": "https://github.com/Microsoft/LightGBM",
|
||||
"BugReports": "https://github.com/Microsoft/LightGBM/issues",
|
||||
"NeedsCompilation": "yes",
|
||||
"Biarch": "true",
|
||||
"VignetteBuilder": "knitr",
|
||||
"Suggests": [
|
||||
"knitr",
|
||||
"markdown",
|
||||
"processx",
|
||||
"RhpcBLASctl",
|
||||
"testthat"
|
||||
],
|
||||
"Depends": [
|
||||
"R (>= 4.0)"
|
||||
],
|
||||
"Imports": [
|
||||
"R6 (>= 2.4.0)",
|
||||
"data.table (>= 1.9.6)",
|
||||
"graphics",
|
||||
"jsonlite (>= 1.0)",
|
||||
"Matrix (>= 1.1-0)",
|
||||
"methods",
|
||||
"parallel",
|
||||
"utils"
|
||||
],
|
||||
"SystemRequirements": "C++17",
|
||||
"RoxygenNote": "7.3.3",
|
||||
"Author": "Yu Shi [aut], Guolin Ke [aut], Damien Soukhavong [aut], James Lamb [aut, cre], Qi Meng [aut], Thomas Finley [aut], Taifeng Wang [aut], Wei Chen [aut], Weidong Ma [aut], Qiwei Ye [aut], Tie-Yan Liu [aut], Nikita Titov [aut], Yachen Yan [ctb], Microsoft Corporation [cph], Dropbox, Inc. [cph], Alberto Ferreira [ctb], Daniel Lemire [ctb], Victor Zverovich [cph], IBM Corporation [ctb], David Cortes [aut], Michael Mayer [ctb]",
|
||||
"Maintainer": "James Lamb <jaylamb20@gmail.com>"
|
||||
"Version": "4.6.0",
|
||||
"Source": "Repository"
|
||||
},
|
||||
"listenv": {
|
||||
"Package": "listenv",
|
||||
|
||||
Reference in New Issue
Block a user