From 83e50d2c361e7242c85b54ac1fe2173bad6ab81a Mon Sep 17 00:00:00 2001 From: Rob Wiederstein Date: Mon, 9 Mar 2026 14:20:10 -0400 Subject: [PATCH] Initial commit: illustrative R data pipeline --- .env | 2 + README.md | 70 +++++++++++++++++++++++++++++++++++ main.sh | 25 +++++++++++++ scripts/00_paths.R | 21 +++++++++++ scripts/01_create_data.R | 26 +++++++++++++ scripts/02_validate.R | 14 +++++++ scripts/03_convert_currency.R | 20 ++++++++++ scripts/04_pivot_income.R | 20 ++++++++++ scripts/05_convert_units.R | 26 +++++++++++++ scripts/06_merge.R | 18 +++++++++ scripts/07_calc.R | 18 +++++++++ scripts/08_format.R | 17 +++++++++ 12 files changed, 277 insertions(+) create mode 100644 .env create mode 100644 README.md create mode 100755 main.sh create mode 100644 scripts/00_paths.R create mode 100644 scripts/01_create_data.R create mode 100644 scripts/02_validate.R create mode 100644 scripts/03_convert_currency.R create mode 100644 scripts/04_pivot_income.R create mode 100644 scripts/05_convert_units.R create mode 100644 scripts/06_merge.R create mode 100644 scripts/07_calc.R create mode 100644 scripts/08_format.R diff --git a/.env b/.env new file mode 100644 index 0000000..78771cc --- /dev/null +++ b/.env @@ -0,0 +1,2 @@ +euro_to_dollar_conversion_ratio=1.08 +input_cols=9 diff --git a/README.md b/README.md new file mode 100644 index 0000000..d81ab2b --- /dev/null +++ b/README.md @@ -0,0 +1,70 @@ +# powershell_example + +This example demonstrates core programming principles that apply regardless of +language — Excel, PowerShell, or R: + +- **One job per script** — each script does exactly one thing +- **Configuration over hardcoding** — constants like exchange rates live in `.env`, not buried in code +- **Immutable inputs** — raw data is never modified; the pipeline can always be rerun from scratch +- **Fail fast** — validation runs early and stops the pipeline with a clear message before bad data spreads +- **Separation of concerns** — scripts don't know or care what 
runs before or after them +- **Orchestration** — a single caller (`main.sh`) owns the sequence and can be scheduled via cron + +## Project structure + +``` +powershell_example/ +├── .env ← exchange rate and future config +├── main.sh ← pipeline caller, runs all steps in order +├── data/ +│ ├── raw/ ← original source, never modified +│ ├── interim/ ← transformed working files (steps 03–06) +│ ├── processed/ ← calculated output (step 07) +│ └── formatted/ ← presentation-ready, rounded (step 08) +└── scripts/ + ├── 00_paths.R ← paths + config, sourced by all scripts + ├── 01_create_data.R ← creates wide CSVs → raw/ + ├── 02_validate.R ← checks column counts, stops on failure + ├── 03_convert_currency.R ← EUR to USD, stays wide → interim/ + ├── 04_pivot_income.R ← wide to long → interim/ + ├── 05_convert_units.R ← thousands to persons, pivot pop to long → interim/ + ├── 06_merge.R ← join income + population → interim/ + ├── 07_calc.R ← income per person → processed/ + └── 08_format.R ← round to 2 decimals → formatted/ +``` + +## A note on what to commit + +This repo commits everything for illustration purposes. In a real project you +would typically exclude: + +- **`.env`** — may contain API keys, credentials, or proprietary constants +- **`data/`** — raw and processed data files are often too large for git and + may contain proprietary or personally identifiable information + +Both would normally be listed in `.gitignore`. + +## Usage + +```bash +bash /data/projects/r/powershell_example/main.sh +``` + +## Scheduling with cron + +Cron is the Linux/Mac equivalent of **Windows Task Scheduler** — it runs a +program automatically on a schedule with no human intervention. + +To run automatically every Monday at 8am: + +``` +0 8 * * 1 /data/projects/r/powershell_example/main.sh >> /tmp/pipeline.log 2>&1 +``` + +**A note on corporate environments:** IT departments are often protective of +who can schedule automated jobs on shared servers — and for good reason. 
Silent
+background processes can consume resources, touch shared databases, or trigger
+emails without anyone knowing they exist. On your own machine, Task Scheduler
+is fair game. On a company server, the right move is to document what the job
+does, show IT, and ask them to schedule it officially. That conversation also
+creates a paper trail, which matters in regulated industries.
diff --git a/main.sh b/main.sh
new file mode 100755
index 0000000..91b3b2a
--- /dev/null
+++ b/main.sh
@@ -0,0 +1,25 @@
+#!/usr/bin/env bash
+# main.sh — pipeline caller
+# Runs each R script in order via the rstudio Docker container.
+# Equivalent to a PowerShell script that calls macros sequentially.
+#
+# To schedule this automatically, add a cron entry:
+#   0 8 * * 1 /data/projects/r/powershell_example/main.sh >> /tmp/pipeline.log 2>&1
+# That runs the pipeline every Monday at 8am.
+
+set -euo pipefail # stop on any error, unset variable, or failed pipe stage
+
+WORKDIR="/data/projects/r/powershell_example"
+STEPS=(01_create_data 02_validate 03_convert_currency 04_pivot_income
+       05_convert_units 06_merge 07_calc 08_format)
+
+echo "=== Pipeline start: $(date) ==="
+
+# A fresh clone has no data/ tree, and write_csv() will not create directories.
+docker exec -w "$WORKDIR" rstudio mkdir -p data/{raw,interim,processed,formatted}
+
+for step in "${STEPS[@]}"; do
+  docker exec -w "$WORKDIR" rstudio Rscript "scripts/${step}.R"
+done
+
+echo "=== Pipeline complete: $(date) ==="
diff --git a/scripts/00_paths.R b/scripts/00_paths.R
new file mode 100644
index 0000000..3d460c1
--- /dev/null
+++ b/scripts/00_paths.R
@@ -0,0 +1,21 @@
+# 00_paths.R
+# Central path constants and config. Source this at the top of every script.
+# All paths are relative to the project root.
+
+dotenv::load_dot_env(".env")
+euro_to_dollar_conversion_ratio <- as.numeric(Sys.getenv("euro_to_dollar_conversion_ratio"))
+input_cols <- as.integer(Sys.getenv("input_cols"))
+
+dir_raw <- "data/raw"
+dir_interim <- "data/interim"
+dir_processed <- "data/processed"
+dir_formatted <- "data/formatted"
+
+income_raw <- file.path(dir_raw, "df_income.csv")
+population_raw <- file.path(dir_raw, "df_population.csv")
+income_usd_wide <- file.path(dir_interim, "df_income_usd_wide.csv")
+income_usd <- file.path(dir_interim, "df_income_usd.csv")
+population_full <- file.path(dir_interim, "df_population_full.csv")
+merged <- file.path(dir_interim, "df_merged.csv")
+result <- file.path(dir_processed, "df_result.csv")
+formatted <- file.path(dir_formatted, "df_formatted.csv")
diff --git a/scripts/01_create_data.R b/scripts/01_create_data.R
new file mode 100644
index 0000000..0d7a285
--- /dev/null
+++ b/scripts/01_create_data.R
@@ -0,0 +1,26 @@
+# 01_create_data.R
+# Creates two wide-format CSV files: income (euros) and population.
+# Wide format mirrors how accountants typically lay out data in Excel —
+# one row per entity, months as columns.
+
+source("scripts/00_paths.R")
+
+library(tibble)
+library(readr)
+
+df_income <- tribble(
+  ~id, ~category, ~denomination, ~jan,  ~feb,  ~mar,  ~apr,  ~may,  ~jun,
+  1,   "income",  "euro",        42000, 39500, 44200, 41800, 43100, 40600
+)
+
+df_population <- tribble(
+  ~state, ~category,    ~unit,       ~jan,  ~feb,  ~mar,  ~apr,  ~may,  ~jun,
+  "FL",   "population", "thousands", 22600, 22600, 22650, 22650, 22700, 22700
+)
+
+# write_csv() does not create missing directories; a fresh checkout has no data/raw.
+dir.create(dir_raw, recursive = TRUE, showWarnings = FALSE)
+write_csv(df_income, income_raw)
+write_csv(df_population, population_raw)
+
+cat("01_create_data.R: wrote df_income.csv and df_population.csv\n")
diff --git a/scripts/02_validate.R b/scripts/02_validate.R
new file mode 100644
index 0000000..2cdc651
--- /dev/null
+++ b/scripts/02_validate.R
@@ -0,0 +1,14 @@
+# 02_validate.R
+# Validates raw input files. Stops the pipeline if data doesn't look right.
+
+source("scripts/00_paths.R")
+
+library(readr)
+
+df_income <- read_csv(income_raw, show_col_types = FALSE)
+df_population <- read_csv(population_raw, show_col_types = FALSE)
+
+stopifnot("df_income has wrong number of columns" = ncol(df_income) == input_cols)
+stopifnot("df_population has wrong number of columns" = ncol(df_population) == input_cols)
+
+cat("02_validate.R: input data looks good\n")
diff --git a/scripts/03_convert_currency.R b/scripts/03_convert_currency.R
new file mode 100644
index 0000000..24af832
--- /dev/null
+++ b/scripts/03_convert_currency.R
@@ -0,0 +1,20 @@
+# 03_convert_currency.R
+# Reads wide-format income CSV, converts EUR to USD. Stays wide.
+# In a real pipeline the exchange rate could be fetched from an API.
+
+source("scripts/00_paths.R")
+
+library(readr)
+library(dplyr)
+
+df <- read_csv(income_raw, show_col_types = FALSE)
+
+df_usd <- df |>
+  mutate(
+    across(jan:jun, ~ round(.x * euro_to_dollar_conversion_ratio, 2)),
+    denomination = "usd"
+  )
+
+write_csv(df_usd, income_usd_wide)
+
+cat("03_convert_currency.R: EUR -> USD conversion done, wrote df_income_usd_wide.csv\n")
diff --git a/scripts/04_pivot_income.R b/scripts/04_pivot_income.R
new file mode 100644
index 0000000..b59b700
--- /dev/null
+++ b/scripts/04_pivot_income.R
@@ -0,0 +1,20 @@
+# 04_pivot_income.R
+# Reads wide-format income (USD), pivots to long format.
+
+source("scripts/00_paths.R")
+
+library(readr)
+library(tidyr)
+
+df <- read_csv(income_usd_wide, show_col_types = FALSE)
+
+df_long <- df |>
+  pivot_longer(
+    cols = jan:jun,
+    names_to = "month",
+    values_to = "value_usd"
+  )
+
+write_csv(df_long, income_usd)
+
+cat("04_pivot_income.R: wide -> long, wrote df_income_usd.csv\n")
diff --git a/scripts/05_convert_units.R b/scripts/05_convert_units.R
new file mode 100644
index 0000000..c77dbe0
--- /dev/null
+++ b/scripts/05_convert_units.R
@@ -0,0 +1,26 @@
+# 05_convert_units.R
+# Reads wide-format population CSV, pivots to long, converts thousands to
+# full unit count.
+
+source("scripts/00_paths.R")
+
+library(readr)
+library(tidyr)
+library(dplyr)
+
+df <- read_csv(population_raw, show_col_types = FALSE)
+
+df_long <- df |>
+  pivot_longer(
+    cols = jan:jun,
+    names_to = "month",
+    values_to = "value"
+  ) |>
+  mutate(
+    value = value * 1000,
+    unit = "persons"
+  )
+
+write_csv(df_long, population_full)
+
+cat("05_convert_units.R: thousands -> persons, wrote df_population_full.csv\n")
diff --git a/scripts/06_merge.R b/scripts/06_merge.R
new file mode 100644
index 0000000..e7ca602
--- /dev/null
+++ b/scripts/06_merge.R
@@ -0,0 +1,18 @@
+# 06_merge.R
+# Merges processed income (long, USD) and population (long, full units) on month.
+
+source("scripts/00_paths.R")
+
+library(readr)
+library(dplyr)
+
+df_income <- read_csv(income_usd, show_col_types = FALSE)
+df_pop <- read_csv(population_full, show_col_types = FALSE) |>
+  select(month, population = value)
+
+df_merged <- df_income |>
+  left_join(df_pop, by = "month")
+
+write_csv(df_merged, merged)
+
+cat("06_merge.R: joined income and population, wrote df_merged.csv\n")
diff --git a/scripts/07_calc.R b/scripts/07_calc.R
new file mode 100644
index 0000000..f615f43
--- /dev/null
+++ b/scripts/07_calc.R
@@ -0,0 +1,18 @@
+# 07_calc.R
+# Reads merged data and calculates income per person.
+
+source("scripts/00_paths.R")
+
+library(readr)
+library(dplyr)
+
+df_merged <- read_csv(merged, show_col_types = FALSE)
+
+df_result <- df_merged |>
+  mutate(income_per_person = round(value_usd / population, 4)) |>
+  select(month, value_usd, population, income_per_person)
+
+write_csv(df_result, result)
+
+cat("07_calc.R: calculated income per person, wrote df_result.csv\n")
+print(df_result)
diff --git a/scripts/08_format.R b/scripts/08_format.R
new file mode 100644
index 0000000..f8c962c
--- /dev/null
+++ b/scripts/08_format.R
@@ -0,0 +1,17 @@
+# 08_format.R
+# Reads final result, rounds all numeric columns to 2 decimal places.
+
+source("scripts/00_paths.R")
+
+library(readr)
+library(dplyr)
+
+df <- read_csv(result, show_col_types = FALSE)
+
+df_formatted <- df |>
+  mutate(across(where(is.numeric), ~ round(.x, 2)))
+
+write_csv(df_formatted, formatted)
+
+cat("08_format.R: rounded to 2 decimals, wrote df_formatted.csv\n")
+print(df_formatted)