Initial commit: illustrative R data pipeline

2026-03-09 14:20:10 -04:00
commit 83e50d2c36
12 changed files with 277 additions and 0 deletions
--- a/.env
+++ b/.env
@@ -0,0 +1,2 @@
+euro_to_dollar_conversion_ratio=1.08
+input_cols=9
--- a/README.md
+++ b/README.md
@@ -0,0 +1,70 @@
+# powershell_example
+
+This example demonstrates core programming principles that apply regardless of
+language — Excel, PowerShell, or R:
+
+- **One job per script** — each script does exactly one thing
+- **Configuration over hardcoding** — constants like exchange rates live in `.env`, not buried in code
+- **Immutable inputs** — raw data is never modified; the pipeline can always be rerun from scratch
+- **Fail fast** — validation runs early and stops the pipeline with a clear message before bad data spreads
+- **Separation of concerns** — scripts don't know or care what runs before or after them
+- **Orchestration** — a single caller (`main.sh`) owns the sequence and can be scheduled via cron
+
+## Project structure
+
+```
+powershell_example/
+├── .env                        ← exchange rate, expected column count
+├── main.sh                     ← pipeline caller, runs all steps in order
+├── data/
+│   ├── raw/                    ← original source, never modified
+│   ├── interim/                ← transformed working files (steps 03–06)
+│   ├── processed/              ← calculated output (step 07)
+│   └── formatted/              ← presentation-ready, rounded (step 08)
+└── scripts/
+    ├── 00_paths.R              ← paths + config, sourced by all scripts
+    ├── 01_create_data.R        ← creates wide CSVs → raw/
+    ├── 02_validate.R           ← checks column counts, stops on failure
+    ├── 03_convert_currency.R   ← EUR to USD, stays wide → interim/
+    ├── 04_pivot_income.R       ← wide to long → interim/
+    ├── 05_convert_units.R      ← thousands to persons, pivot pop to long → interim/
+    ├── 06_merge.R              ← join income + population → interim/
+    ├── 07_calc.R               ← income per person → processed/
+    └── 08_format.R             ← round to 2 decimals → formatted/
+```
+
+## A note on what to commit
+
+This repo commits everything for illustration purposes. In a real project you
+would typically exclude:
+
+- **`.env`** — may contain API keys, credentials, or proprietary constants
+- **`data/`** — raw and processed data files are often too large for git and
+  may contain proprietary or personally identifiable information
+
+Both would normally be listed in `.gitignore`.
+
+## Usage
+
+```bash
+bash /data/projects/r/powershell_example/main.sh
+```
+
+## Scheduling with cron
+
+Cron is the Linux/Mac equivalent of **Windows Task Scheduler** — it runs a
+program automatically on a schedule with no human intervention.
+
+To run automatically every Monday at 8am:
+
+```
+0 8 * * 1  bash /data/projects/r/powershell_example/main.sh >> /tmp/pipeline.log 2>&1
+```
+
+**A note on corporate environments:** IT departments are often protective of
+who can schedule automated jobs on shared servers — and for good reason. Silent
+background processes can consume resources, touch shared databases, or trigger
+emails without anyone knowing they exist. On your own machine, cron (or Task
+Scheduler) is fair game. On a company server, the right move is to document
+what the job does, show IT, and ask them to schedule it officially. That
+conversation also creates a paper trail, which matters in regulated industries.
--- a/main.sh
+++ b/main.sh
@@ -0,0 +1,25 @@
+#!/usr/bin/env bash
+# main.sh — pipeline caller
+# Runs each R script in order via the rstudio Docker container.
+# Equivalent to a PowerShell script that calls macros sequentially.
+#
+# To schedule this automatically, add a cron entry:
+#   0 8 * * 1  /data/projects/r/powershell_example/main.sh >> /tmp/pipeline.log 2>&1
+# That runs the pipeline every Monday at 8am.
+
+set -e  # stop on any error
+
+WORKDIR="/data/projects/r/powershell_example"
+
+echo "=== Pipeline start: $(date) ==="
+
+docker exec -w "$WORKDIR" rstudio Rscript scripts/01_create_data.R
+docker exec -w "$WORKDIR" rstudio Rscript scripts/02_validate.R
+docker exec -w "$WORKDIR" rstudio Rscript scripts/03_convert_currency.R
+docker exec -w "$WORKDIR" rstudio Rscript scripts/04_pivot_income.R
+docker exec -w "$WORKDIR" rstudio Rscript scripts/05_convert_units.R
+docker exec -w "$WORKDIR" rstudio Rscript scripts/06_merge.R
+docker exec -w "$WORKDIR" rstudio Rscript scripts/07_calc.R
+docker exec -w "$WORKDIR" rstudio Rscript scripts/08_format.R
+
+echo "=== Pipeline complete: $(date) ==="
--- a/scripts/00_paths.R
+++ b/scripts/00_paths.R
@@ -0,0 +1,41 @@
+# 00_paths.R
+# Central path constants and config. Source this at the top of every script.
+# All paths are relative to the project root.
+
+# ---- Config (.env) ----
+# Sys.getenv() returns "" for a key that is not set, and as.numeric() /
+# as.integer() turn "" or non-numeric text into NA. Check immediately so a
+# broken .env stops the pipeline here instead of spreading NA downstream.
+dotenv::load_dot_env(".env")
+euro_to_dollar_conversion_ratio <- as.numeric(Sys.getenv("euro_to_dollar_conversion_ratio"))
+input_cols                      <- as.integer(Sys.getenv("input_cols"))
+
+stopifnot(
+  "euro_to_dollar_conversion_ratio in .env must be a positive number" =
+    !is.na(euro_to_dollar_conversion_ratio) && euro_to_dollar_conversion_ratio > 0,
+  "input_cols in .env must be a positive integer" =
+    !is.na(input_cols) && input_cols > 0L
+)
+
+# ---- Directories ----
+dir_raw       <- "data/raw"
+dir_interim   <- "data/interim"
+dir_processed <- "data/processed"
+dir_formatted <- "data/formatted"
+
+# Git does not track empty directories, so create any that are missing;
+# otherwise the first write into them fails. Existing directories are untouched.
+invisible(lapply(
+  c(dir_raw, dir_interim, dir_processed, dir_formatted),
+  dir.create, recursive = TRUE, showWarnings = FALSE
+))
+
+# ---- Files ----
+income_raw        <- file.path(dir_raw,       "df_income.csv")
+population_raw    <- file.path(dir_raw,       "df_population.csv")
+income_usd_wide   <- file.path(dir_interim,   "df_income_usd_wide.csv")
+income_usd        <- file.path(dir_interim,   "df_income_usd.csv")
+population_full   <- file.path(dir_interim,   "df_population_full.csv")
+merged            <- file.path(dir_interim,   "df_merged.csv")
+result            <- file.path(dir_processed, "df_result.csv")
+formatted         <- file.path(dir_formatted, "df_formatted.csv")
--- a/scripts/01_create_data.R
+++ b/scripts/01_create_data.R
@@ -0,0 +1,26 @@
+# 01_create_data.R
+# Creates two wide-format CSV files: income (euros) and population.
+# Wide format mirrors how accountants typically lay out data in Excel —
+# one row per entity, months as columns.
+
+source("scripts/00_paths.R")
+
+library(tibble)
+library(readr)
+
+# Monthly income for a single entity, denominated in euros.
+df_income <- tribble(
+  ~id, ~category, ~denomination, ~jan,  ~feb,  ~mar,  ~apr,  ~may,  ~jun,
+    1,  "income",        "euro", 42000, 39500, 44200, 41800, 43100, 40600
+)
+
+# Monthly population for one state, recorded in thousands of persons.
+df_population <- tribble(
+  ~state, ~category,     ~unit,       ~jan,  ~feb,  ~mar,  ~apr,  ~may,  ~jun,
+    "FL", "population", "thousands", 22600, 22600, 22650, 22650, 22700, 22700
+)
+
+write_csv(df_income,     income_raw)
+write_csv(df_population, population_raw)
+
+cat("01_create_data.R: wrote df_income.csv and df_population.csv\n")
--- a/scripts/02_validate.R
+++ b/scripts/02_validate.R
@@ -0,0 +1,14 @@
+# 02_validate.R
+# Sanity-checks the raw input files; aborts the pipeline on the first failure.
+
+source("scripts/00_paths.R")
+
+library(readr)
+
+n_income_cols     <- ncol(read_csv(income_raw,     show_col_types = FALSE))
+n_population_cols <- ncol(read_csv(population_raw, show_col_types = FALSE))
+
+stopifnot("df_income has wrong number of columns"     = n_income_cols     == input_cols)
+stopifnot("df_population has wrong number of columns" = n_population_cols == input_cols)
+
+cat("02_validate.R: input data looks good\n")
--- a/scripts/03_convert_currency.R
+++ b/scripts/03_convert_currency.R
@@ -0,0 +1,20 @@
+# 03_convert_currency.R
+# Reads wide-format income CSV, converts EUR to USD (rate from .env). Stays wide.
+# In a real pipeline the exchange rate could be fetched from an API.
+
+source("scripts/00_paths.R")
+
+library(readr)
+library(dplyr)
+
+df <- read_csv(income_raw, show_col_types = FALSE)
+
+df_usd <- df |>
+  mutate(
+    across(jan:jun, ~ round(.x * euro_to_dollar_conversion_ratio, 2)),  # to cents
+    denomination = "usd"  # relabel so the column matches the converted values
+  )
+
+write_csv(df_usd, income_usd_wide)
+
+cat("03_convert_currency.R: EUR -> USD conversion done, wrote df_income_usd_wide.csv\n")
--- a/scripts/04_pivot_income.R
+++ b/scripts/04_pivot_income.R
@@ -0,0 +1,20 @@
+# 04_pivot_income.R
+# Reads wide-format income (USD), pivots to long format: one row per month.
+
+source("scripts/00_paths.R")
+
+library(readr)
+library(tidyr)
+
+df <- read_csv(income_usd_wide, show_col_types = FALSE)
+
+df_long <- df |>
+  pivot_longer(
+    cols      = jan:jun,      # the month columns to stack
+    names_to  = "month",      # former column names land here
+    values_to = "value_usd"   # former cell values land here
+  )
+
+write_csv(df_long, income_usd)
+
+cat("04_pivot_income.R: wide -> long, wrote df_income_usd.csv\n")
--- a/scripts/05_convert_units.R
+++ b/scripts/05_convert_units.R
@@ -0,0 +1,26 @@
+# 05_convert_units.R
+# Reads wide-format population CSV, pivots to long, converts thousands to
+# full unit count (persons).
+
+source("scripts/00_paths.R")
+
+library(readr)
+library(tidyr)
+library(dplyr)
+
+df <- read_csv(population_raw, show_col_types = FALSE)
+
+df_long <- df |>
+  pivot_longer(
+    cols      = jan:jun,
+    names_to  = "month",
+    values_to = "value"
+  ) |>
+  mutate(
+    value = value * 1000,  # raw values are in thousands of persons
+    unit  = "persons"      # relabel to match the converted values
+  )
+
+write_csv(df_long, population_full)
+
+cat("05_convert_units.R: thousands -> persons, wrote df_population_full.csv\n")
--- a/scripts/06_merge.R
+++ b/scripts/06_merge.R
@@ -0,0 +1,18 @@
+# 06_merge.R
+# Joins income (long, USD) with population (long, persons) on month; stops if months repeat.
+
+source("scripts/00_paths.R")
+
+library(readr)
+library(dplyr)
+
+df_income <- read_csv(income_usd,       show_col_types = FALSE)
+df_pop    <- read_csv(population_full,  show_col_types = FALSE) |>
+  select(month, population = value)
+
+stopifnot("df_population_full has duplicate months; join on month would fan out" = !anyDuplicated(df_pop$month))
+df_merged <- left_join(df_income, df_pop, by = "month")
+
+write_csv(df_merged, merged)
+
+cat("06_merge.R: joined income and population, wrote df_merged.csv\n")
--- a/scripts/07_calc.R
+++ b/scripts/07_calc.R
@@ -0,0 +1,18 @@
+# 07_calc.R
+# Reads merged data and calculates income per person (USD / persons).
+
+source("scripts/00_paths.R")
+
+library(readr)
+library(dplyr)
+
+df_merged <- read_csv(merged, show_col_types = FALSE)
+
+df_result <- df_merged |>
+  mutate(income_per_person = round(value_usd / population, 4)) |>
+  select(month, value_usd, population, income_per_person)  # drop label columns
+
+write_csv(df_result, result)
+
+cat("07_calc.R: calculated income per person, wrote df_result.csv\n")
+print(df_result)
--- a/scripts/08_format.R
+++ b/scripts/08_format.R
@@ -0,0 +1,17 @@
+# 08_format.R
+# Reads final result, rounds all numeric columns to 2 decimal places.
+# NOTE(review): sample income_per_person (~0.002) rounds to 0 at 2 decimals — intended?
+
+source("scripts/00_paths.R")
+
+library(readr)
+library(dplyr)
+
+df <- read_csv(result, show_col_types = FALSE)
+
+df_formatted <- mutate(df, across(where(is.numeric), ~ round(.x, 2)))
+
+write_csv(df_formatted, formatted)
+
+cat("08_format.R: rounded to 2 decimals, wrote df_formatted.csv\n")
+print(df_formatted)