Initial commit: illustrative R data pipeline

2026-03-09 14:20:10 -04:00
commit 83e50d2c36
12 changed files with 277 additions and 0 deletions
--- a/.env
+++ b/.env
@@ -0,0 +1,2 @@
 # Multiplier applied to the monthly EUR income columns to obtain USD
 # (read in scripts/00_paths.R, used by scripts/03_convert_currency.R).
 euro_to_dollar_conversion_ratio=1.08
 # Number of columns each raw CSV must have; scripts/02_validate.R stops the
 # pipeline when either raw file has a different column count.
 input_cols=9
--- a/README.md
+++ b/README.md
@@ -0,0 +1,70 @@
 # powershell_example
 This example demonstrates core programming principles that apply regardless of
 language — Excel, PowerShell, or R:
 - **One job per script** — each script does exactly one thing
 - **Configuration over hardcoding** — constants like exchange rates live in `.env`, not buried in code
 - **Immutable inputs** — raw data is never modified; the pipeline can always be rerun from scratch
 - **Fail fast** — validation runs early and stops the pipeline with a clear message before bad data spreads
 - **Separation of concerns** — scripts don't know or care what runs before or after them
 - **Orchestration** — a single caller (`main.sh`) owns the sequence and can be scheduled via cron
 ## Project structure
 ```
 powershell_example/
 ├── .env                        ← config: exchange rate, expected column count
 ├── main.sh                     ← pipeline caller, runs all steps in order
 ├── data/
 │   ├── raw/                    ← original source, never modified
 │   ├── interim/                ← transformed working files (steps 03–06)
 │   ├── processed/              ← calculated output (step 07)
 │   └── formatted/              ← presentation-ready, rounded (step 08)
 └── scripts/
    ├── 00_paths.R              ← paths + config, sourced by all scripts
    ├── 01_create_data.R        ← creates wide CSVs → raw/
    ├── 02_validate.R           ← checks column counts, stops on failure
    ├── 03_convert_currency.R   ← EUR to USD, stays wide → interim/
    ├── 04_pivot_income.R       ← wide to long → interim/
    ├── 05_convert_units.R      ← thousands to persons, pivot pop to long → interim/
    ├── 06_merge.R              ← join income + population → interim/
    ├── 07_calc.R               ← income per person → processed/
    └── 08_format.R             ← round to 2 decimals → formatted/
 ```
 ## A note on what to commit
 This repo commits everything for illustration purposes. In a real project you
 would typically exclude:
 - **`.env`** — may contain API keys, credentials, or proprietary constants
 - **`data/`** — raw and processed data files are often too large for git and
  may contain proprietary or personally identifiable information
 Both would normally be listed in `.gitignore`.
 ## Usage
 ```bash
 bash /data/projects/r/powershell_example/main.sh
 ```
 ## Scheduling with cron
 Cron is the Linux/Mac equivalent of **Windows Task Scheduler** — it runs a
 program automatically on a schedule with no human intervention.
 To run automatically every Monday at 8am:
 ```
 0 8 * * 1  /data/projects/r/powershell_example/main.sh >> /tmp/pipeline.log 2>&1
 ```
 **A note on corporate environments:** IT departments are often protective of
 who can schedule automated jobs on shared servers — and for good reason. Silent
 background processes can consume resources, touch shared databases, or trigger
 emails without anyone knowing they exist. On your own machine, Task Scheduler
 is fair game. On a company server, the right move is to document what the job
 does, show IT, and ask them to schedule it officially. That conversation also
 creates a paper trail, which matters in regulated industries.
--- a/main.sh
+++ b/main.sh
@@ -0,0 +1,25 @@
 #!/usr/bin/env bash
 # main.sh — pipeline caller
 # Executes every pipeline step, in sequence, as an Rscript call inside the
 # rstudio Docker container. Think of it as the PowerShell script that fires
 # a series of macros one after another.
 #
 # Scheduling: a cron entry such as
 #   0 8 * * 1  /data/projects/r/powershell_example/main.sh >> /tmp/pipeline.log 2>&1
 # runs the whole pipeline every Monday at 8am.
 set -e  # the first failing step aborts the run
 WORKDIR="/data/projects/r/powershell_example"
 # Steps in execution order; each name maps to scripts/<name>.R.
 STEPS=(
   01_create_data
   02_validate
   03_convert_currency
   04_pivot_income
   05_convert_units
   06_merge
   07_calc
   08_format
 )
 echo "=== Pipeline start: $(date) ==="
 for step in "${STEPS[@]}"; do
   docker exec -w "$WORKDIR" rstudio Rscript "scripts/${step}.R"
 done
 echo "=== Pipeline complete: $(date) ==="
--- a/scripts/00_paths.R
+++ b/scripts/00_paths.R
@@ -0,0 +1,21 @@
 # 00_paths.R
 # Central path constants and config. Source this at the top of every script.
 # All paths are relative to the project root, so scripts must be started
 # with the project root as the working directory.

 # Config ----
 # Load the key=value pairs from .env into environment variables.
 dotenv::load_dot_env(".env")
 euro_to_dollar_conversion_ratio <- as.numeric(Sys.getenv("euro_to_dollar_conversion_ratio"))
 input_cols                      <- as.integer(Sys.getenv("input_cols"))

 # Fail fast: Sys.getenv() returns "" for a missing key, which converts to NA
 # and would otherwise flow silently through every later calculation.
 stopifnot(
   "euro_to_dollar_conversion_ratio in .env must be a positive number" =
     is.finite(euro_to_dollar_conversion_ratio) &&
       euro_to_dollar_conversion_ratio > 0,
   "input_cols in .env must be a positive integer" =
     is.finite(input_cols) && input_cols > 0L
 )

 # Directories ----
 dir_raw       <- "data/raw"
 dir_interim   <- "data/interim"
 dir_processed <- "data/processed"
 dir_formatted <- "data/formatted"

 # git does not track empty directories, so on a fresh clone these may not
 # exist yet. Creating them here is a no-op when they are already present.
 invisible(lapply(
   c(dir_raw, dir_interim, dir_processed, dir_formatted),
   dir.create,
   recursive = TRUE,
   showWarnings = FALSE
 ))

 # Files ----
 income_raw        <- file.path(dir_raw,       "df_income.csv")
 population_raw    <- file.path(dir_raw,       "df_population.csv")
 income_usd_wide   <- file.path(dir_interim,   "df_income_usd_wide.csv")
 income_usd        <- file.path(dir_interim,   "df_income_usd.csv")
 population_full   <- file.path(dir_interim,   "df_population_full.csv")
 merged            <- file.path(dir_interim,   "df_merged.csv")
 result            <- file.path(dir_processed, "df_result.csv")
 formatted         <- file.path(dir_formatted, "df_formatted.csv")
--- a/scripts/01_create_data.R
+++ b/scripts/01_create_data.R
@@ -0,0 +1,26 @@
 # 01_create_data.R
 # Creates two wide-format CSV files: income (euros) and population.
 # Wide format mirrors how accountants typically lay out data in Excel —
 # one row per entity, months as columns.
 # Each table has 9 columns: 3 descriptive columns plus jan..jun.
 source("scripts/00_paths.R")  # provides income_raw and population_raw
 library(tibble)
 library(readr)

 # Monthly income for a single entity, denominated in euros.
 df_income <- tribble(
   ~id, ~category, ~denomination, ~jan,  ~feb,  ~mar,  ~apr,  ~may,  ~jun,
     1,  "income",        "euro", 42000, 39500, 44200, 41800, 43100, 40600
 )

 # Monthly population for a single state, expressed in thousands.
 df_population <- tribble(
   ~state, ~category,     ~unit,       ~jan,  ~feb,  ~mar,  ~apr,  ~may,  ~jun,
     "FL", "population", "thousands", 22600, 22600, 22650, 22650, 22700, 22700
 )

 write_csv(df_income,     income_raw)
 write_csv(df_population, population_raw)
 cat("01_create_data.R: wrote df_income.csv and df_population.csv\n")
--- a/scripts/02_validate.R
+++ b/scripts/02_validate.R
@@ -0,0 +1,14 @@
 # 02_validate.R
 # Sanity-checks the raw input files before anything transforms them.
 # A failed check raises an error, which halts this script.
 source("scripts/00_paths.R")
 library(readr)
 raw_income     <- read_csv(income_raw,     show_col_types = FALSE)
 raw_population <- read_csv(population_raw, show_col_types = FALSE)
 n_income     <- ncol(raw_income)
 n_population <- ncol(raw_population)
 # Conditions are checked in order; the name attached to the first one that
 # fails becomes the error message.
 stopifnot(
   "df_income has wrong number of columns"     = n_income     == input_cols,
   "df_population has wrong number of columns" = n_population == input_cols
 )
 cat("02_validate.R: input data looks good\n")
--- a/scripts/03_convert_currency.R
+++ b/scripts/03_convert_currency.R
@@ -0,0 +1,20 @@
 # 03_convert_currency.R
 # Reads wide-format income CSV, converts EUR to USD. Stays wide.
 # In a real pipeline the exchange rate could be fetched from an API.
 source("scripts/00_paths.R")  # paths + euro_to_dollar_conversion_ratio
 library(readr)
 library(dplyr)
 df <- read_csv(income_raw, show_col_types = FALSE)
 df_usd <- df |>
   mutate(
     # Multiply every month column by the configured rate, rounded to cents.
     across(jan:jun, ~ round(.x * euro_to_dollar_conversion_ratio, 2)),
     # Relabel the currency so the column matches the converted values.
     denomination = "usd"
   )
 write_csv(df_usd, income_usd_wide)
 cat("03_convert_currency.R: EUR -> USD conversion done, wrote df_income_usd_wide.csv\n")
--- a/scripts/04_pivot_income.R
+++ b/scripts/04_pivot_income.R
@@ -0,0 +1,20 @@
 # 04_pivot_income.R
 # Reads wide-format income (USD), pivots to long format:
 # one row per month instead of one column per month.
 source("scripts/00_paths.R")  # provides income_usd_wide and income_usd
 library(readr)
 library(tidyr)
 df <- read_csv(income_usd_wide, show_col_types = FALSE)
 df_long <- df |>
   pivot_longer(
     cols      = jan:jun,     # the month columns to stack
     names_to  = "month",     # former column names land here
     values_to = "value_usd"  # former cell values land here
   )
 write_csv(df_long, income_usd)
 cat("04_pivot_income.R: wide -> long, wrote df_income_usd.csv\n")
--- a/scripts/05_convert_units.R
+++ b/scripts/05_convert_units.R
@@ -0,0 +1,26 @@
 # 05_convert_units.R
 # Reads wide-format population CSV, pivots to long, converts thousands to
 # full unit count.
 source("scripts/00_paths.R")  # provides population_raw and population_full
 library(readr)
 library(tidyr)
 library(dplyr)
 df <- read_csv(population_raw, show_col_types = FALSE)
 df_long <- df |>
   pivot_longer(
     cols      = jan:jun,
     names_to  = "month",
     values_to = "value"
   ) |>
   mutate(
     # Raw figures are in thousands; scale up to individual persons and
     # update the unit label to match.
     value = value * 1000,
     unit  = "persons"
   )
 write_csv(df_long, population_full)
 cat("05_convert_units.R: thousands -> persons, wrote df_population_full.csv\n")
--- a/scripts/06_merge.R
+++ b/scripts/06_merge.R
@@ -0,0 +1,18 @@
 # 06_merge.R
 # Merges processed income (long, USD) and population (long, full units) on month.
 source("scripts/00_paths.R")  # provides income_usd, population_full, merged
 library(readr)
 library(dplyr)
 df_income <- read_csv(income_usd,       show_col_types = FALSE)
 # Keep only the join key and the value, renamed so it cannot be confused
 # with the income value after the join.
 df_pop    <- read_csv(population_full,  show_col_types = FALSE) |>
   select(month, population = value)
 # left_join keeps every income row, attaching the population for that month.
 df_merged <- df_income |>
   left_join(df_pop, by = "month")
 write_csv(df_merged, merged)
 cat("06_merge.R: joined income and population, wrote df_merged.csv\n")
--- a/scripts/07_calc.R
+++ b/scripts/07_calc.R
@@ -0,0 +1,18 @@
 # 07_calc.R
 # Reads merged data and calculates income per person.
 source("scripts/00_paths.R")  # provides merged and result
 library(readr)
 library(dplyr)
 df_merged <- read_csv(merged, show_col_types = FALSE)
 df_result <- df_merged |>
   # Income divided by head count, kept to 4 decimals at this stage.
   mutate(income_per_person = round(value_usd / population, 4)) |>
   # Keep only the columns needed for the result file.
   select(month, value_usd, population, income_per_person)
 write_csv(df_result, result)
 cat("07_calc.R: calculated income per person, wrote df_result.csv\n")
 print(df_result)
--- a/scripts/08_format.R
+++ b/scripts/08_format.R
@@ -0,0 +1,17 @@
 # 08_format.R
 # Reads final result, rounds all numeric columns to 2 decimal places.
 source("scripts/00_paths.R")  # provides result and formatted
 library(readr)
 library(dplyr)
 df <- read_csv(result, show_col_types = FALSE)
 df_formatted <- df |>
   # where(is.numeric) selects every numeric column; others pass through.
   mutate(across(where(is.numeric), ~ round(.x, 2)))
 write_csv(df_formatted, formatted)
 cat("08_format.R: rounded to 2 decimals, wrote df_formatted.csv\n")
 print(df_formatted)
		`@@ -0,0 +1,2 @@`
							`euro_to_dollar_conversion_ratio=1.08`
							`input_cols=9`