Initial commit: illustrative R data pipeline

This commit is contained in:
2026-03-09 14:20:10 -04:00
commit 83e50d2c36
12 changed files with 277 additions and 0 deletions

2
.env Normal file
View File

@@ -0,0 +1,2 @@
# Multiplier applied to euro amounts to convert them to US dollars.
euro_to_dollar_conversion_ratio=1.08
# Expected number of columns in each raw input CSV; checked during validation.
input_cols=9

70
README.md Normal file
View File

@@ -0,0 +1,70 @@
# powershell_example
This example demonstrates core programming principles that apply regardless of
language — Excel, PowerShell, or R:
- **One job per script** — each script does exactly one thing
- **Configuration over hardcoding** — constants like exchange rates live in `.env`, not buried in code
- **Immutable inputs** — raw data is never modified; the pipeline can always be rerun from scratch
- **Fail fast** — validation runs early and stops the pipeline with a clear message before bad data spreads
- **Separation of concerns** — scripts don't know or care what runs before or after them
- **Orchestration** — a single caller (`main.sh`) owns the sequence and can be scheduled via cron
## Project structure
```
powershell_example/
├── .env ← exchange rate and future config
├── main.sh ← pipeline caller, runs all steps in order
├── data/
│ ├── raw/ ← original source, never modified
│ ├── interim/ ← transformed working files (steps 03–06)
│ ├── processed/ ← calculated output (step 07)
│ └── formatted/ ← presentation-ready, rounded (step 08)
└── scripts/
├── 00_paths.R ← paths + config, sourced by all scripts
├── 01_create_data.R ← creates wide CSVs → raw/
├── 02_validate.R ← checks column counts, stops on failure
├── 03_convert_currency.R ← EUR to USD, stays wide → interim/
├── 04_pivot_income.R ← wide to long → interim/
├── 05_convert_units.R ← thousands to persons, pivot pop to long → interim/
├── 06_merge.R ← join income + population → interim/
├── 07_calc.R ← income per person → processed/
└── 08_format.R ← round to 2 decimals → formatted/
```
## A note on what to commit
This repo commits everything for illustration purposes. In a real project you
would typically exclude:
- **`.env`** — may contain API keys, credentials, or proprietary constants
- **`data/`** — raw and processed data files are often too large for git and
may contain proprietary or personally identifiable information
Both would normally be listed in `.gitignore`.
## Usage
```bash
bash /data/projects/r/powershell_example/main.sh
```
## Scheduling with cron
Cron is the Linux/Mac equivalent of **Windows Task Scheduler** — it runs a
program automatically on a schedule with no human intervention.
To run automatically every Monday at 8am:
```
0 8 * * 1 /data/projects/r/powershell_example/main.sh >> /tmp/pipeline.log 2>&1
```
**A note on corporate environments:** IT departments are often protective of
who can schedule automated jobs on shared servers — and for good reason. Silent
background processes can consume resources, touch shared databases, or trigger
emails without anyone knowing they exist. On your own machine, Task Scheduler
is fair game. On a company server, the right move is to document what the job
does, show IT, and ask them to schedule it officially. That conversation also
creates a paper trail, which matters in regulated industries.

25
main.sh Executable file
View File

@@ -0,0 +1,25 @@
#!/usr/bin/env bash
# main.sh — pipeline orchestrator
# Executes every pipeline step, in sequence, inside the rstudio Docker
# container. Think of it as a PowerShell script invoking macros one by one.
#
# Scheduling: to run this unattended every Monday at 8am, add a cron entry:
# 0 8 * * 1 /data/projects/r/powershell_example/main.sh >> /tmp/pipeline.log 2>&1

set -e # abort at the first failing step

WORKDIR="/data/projects/r/powershell_example"

# Steps in execution order; each one runs as its own Rscript process.
steps=(
  01_create_data
  02_validate
  03_convert_currency
  04_pivot_income
  05_convert_units
  06_merge
  07_calc
  08_format
)

echo "=== Pipeline start: $(date) ==="
for step in "${steps[@]}"; do
  docker exec -w "$WORKDIR" rstudio Rscript "scripts/${step}.R"
done
echo "=== Pipeline complete: $(date) ==="

21
scripts/00_paths.R Normal file
View File

@@ -0,0 +1,21 @@
# 00_paths.R
# Central path constants and config. Source this at the top of every script.
# All paths are relative to the project root, so the calling script must be
# run with the project root as its working directory.

# Load KEY=value pairs from .env into the process environment.
dotenv::load_dot_env(".env")

# Config values read from the environment.
euro_to_dollar_conversion_ratio <- as.numeric(
  Sys.getenv("euro_to_dollar_conversion_ratio")
)
input_cols <- as.integer(Sys.getenv("input_cols"))

# Fail fast on bad config: Sys.getenv() returns "" for an unset variable and
# as.numeric("") is NA, so a missing or misspelled key would otherwise flow
# through the pipeline as NA instead of stopping it.
stopifnot(
  "euro_to_dollar_conversion_ratio must be a positive number in .env" =
    !is.na(euro_to_dollar_conversion_ratio) &&
      euro_to_dollar_conversion_ratio > 0,
  "input_cols must be a positive integer in .env" =
    !is.na(input_cols) && input_cols > 0L
)

# Data directories, one per pipeline stage.
dir_raw <- "data/raw"
dir_interim <- "data/interim"
dir_processed <- "data/processed"
dir_formatted <- "data/formatted"

# Raw inputs.
income_raw <- file.path(dir_raw, "df_income.csv")
population_raw <- file.path(dir_raw, "df_population.csv")

# Intermediate working files.
income_usd_wide <- file.path(dir_interim, "df_income_usd_wide.csv")
income_usd <- file.path(dir_interim, "df_income_usd.csv")
population_full <- file.path(dir_interim, "df_population_full.csv")
merged <- file.path(dir_interim, "df_merged.csv")

# Final outputs.
result <- file.path(dir_processed, "df_result.csv")
formatted <- file.path(dir_formatted, "df_formatted.csv")

26
scripts/01_create_data.R Normal file
View File

@@ -0,0 +1,26 @@
# 01_create_data.R
# Creates two wide-format CSV files: income (euros) and population.
# Wide format mirrors how accountants typically lay out data in Excel —
# one row per entity, months as columns.
# Inputs:  none (the values below are hard-coded sample data)
# Outputs: income_raw and population_raw (paths come from 00_paths.R)

source("scripts/00_paths.R")

library(tibble)
library(readr)

# Monthly income figures, denominated in euros.
df_income <- tribble(
  ~id, ~category, ~denomination, ~jan, ~feb, ~mar, ~apr, ~may, ~jun,
  1, "income", "euro", 42000, 39500, 44200, 41800, 43100, 40600
)

# Monthly population figures, expressed in thousands.
df_population <- tribble(
  ~state, ~category, ~unit, ~jan, ~feb, ~mar, ~apr, ~may, ~jun,
  "FL", "population", "thousands", 22600, 22600, 22650, 22650, 22700, 22700
)

# write_csv() does not create missing directories, so make sure the target
# exists first; this is a no-op when the directory is already there.
dir.create(dir_raw, recursive = TRUE, showWarnings = FALSE)

write_csv(df_income, income_raw)
write_csv(df_population, population_raw)
cat("01_create_data.R: wrote df_income.csv and df_population.csv\n")

14
scripts/02_validate.R Normal file
View File

@@ -0,0 +1,14 @@
# 02_validate.R
# Fail-fast gate for the raw inputs: halts the pipeline when either file does
# not have the expected number of columns.

source("scripts/00_paths.R")

library(readr)

income_tbl <- read_csv(income_raw, show_col_types = FALSE)
population_tbl <- read_csv(population_raw, show_col_types = FALSE)

# Conditions are checked in order; the name of the first one that fails
# becomes the error message.
stopifnot(
  "df_income has wrong number of columns" =
    ncol(income_tbl) == input_cols,
  "df_population has wrong number of columns" =
    ncol(population_tbl) == input_cols
)

cat("02_validate.R: input data looks good\n")

View File

@@ -0,0 +1,20 @@
# 03_convert_currency.R
# Reads wide-format income CSV, converts EUR to USD. Stays wide.
# In a real pipeline the exchange rate could be fetched from an API.
# Inputs:  income_raw
# Outputs: income_usd_wide

source("scripts/00_paths.R")

library(readr)
library(dplyr)

df <- read_csv(income_raw, show_col_types = FALSE)

# Multiply every month column by the configured rate (rounded to cents) and
# relabel the denomination so the file describes its own contents.
df_usd <- df |>
  mutate(
    across(jan:jun, ~ round(.x * euro_to_dollar_conversion_ratio, 2)),
    denomination = "usd"
  )

# write_csv() does not create missing directories; no-op if it already exists.
dir.create(dir_interim, recursive = TRUE, showWarnings = FALSE)

write_csv(df_usd, income_usd_wide)
cat("03_convert_currency.R: EUR -> USD conversion done, wrote df_income_usd_wide.csv\n")

20
scripts/04_pivot_income.R Normal file
View File

@@ -0,0 +1,20 @@
# 04_pivot_income.R
# Reads wide-format income (USD), pivots to long format.
# Inputs:  income_usd_wide
# Outputs: income_usd

source("scripts/00_paths.R")

library(readr)
library(tidyr)

df <- read_csv(income_usd_wide, show_col_types = FALSE)

# One row per month: the column names jan..jun move into `month`, and their
# values into `value_usd`.
df_long <- df |>
  pivot_longer(
    cols = jan:jun,
    names_to = "month",
    values_to = "value_usd"
  )

# write_csv() does not create missing directories; no-op if it already exists.
dir.create(dir_interim, recursive = TRUE, showWarnings = FALSE)

write_csv(df_long, income_usd)
cat("04_pivot_income.R: wide -> long, wrote df_income_usd.csv\n")

View File

@@ -0,0 +1,26 @@
# 05_convert_units.R
# Reads wide-format population CSV, pivots to long, converts thousands to
# full unit count.
# Inputs:  population_raw
# Outputs: population_full

source("scripts/00_paths.R")

library(readr)
library(tidyr)
library(dplyr)

df <- read_csv(population_raw, show_col_types = FALSE)

# Pivot first so a single `value` column can be scaled, then relabel the unit
# to match the scaled numbers.
df_long <- df |>
  pivot_longer(
    cols = jan:jun,
    names_to = "month",
    values_to = "value"
  ) |>
  mutate(
    value = value * 1000,
    unit = "persons"
  )

# write_csv() does not create missing directories; no-op if it already exists.
dir.create(dir_interim, recursive = TRUE, showWarnings = FALSE)

write_csv(df_long, population_full)
cat("05_convert_units.R: thousands -> persons, wrote df_population_full.csv\n")

18
scripts/06_merge.R Normal file
View File

@@ -0,0 +1,18 @@
# 06_merge.R
# Merges processed income (long, USD) and population (long, full units) on month.
# Inputs:  income_usd, population_full
# Outputs: merged

source("scripts/00_paths.R")

library(readr)
library(dplyr)

df_income <- read_csv(income_usd, show_col_types = FALSE)

# Keep only the join key and the population figure (renamed from `value`), so
# the join adds exactly one column to the income table.
df_pop <- read_csv(population_full, show_col_types = FALSE) |>
  select(month, population = value)

df_merged <- df_income |>
  left_join(df_pop, by = "month")

# write_csv() does not create missing directories; no-op if it already exists.
dir.create(dir_interim, recursive = TRUE, showWarnings = FALSE)

write_csv(df_merged, merged)
cat("06_merge.R: joined income and population, wrote df_merged.csv\n")

18
scripts/07_calc.R Normal file
View File

@@ -0,0 +1,18 @@
# 07_calc.R
# Reads merged data and calculates income per person.
# Inputs:  merged
# Outputs: result

source("scripts/00_paths.R")

library(readr)
library(dplyr)

df_merged <- read_csv(merged, show_col_types = FALSE)

# Income per person is kept at 4 decimals here; only the columns needed
# downstream are retained.
df_result <- df_merged |>
  mutate(income_per_person = round(value_usd / population, 4)) |>
  select(month, value_usd, population, income_per_person)

# write_csv() does not create missing directories; no-op if it already exists.
dir.create(dir_processed, recursive = TRUE, showWarnings = FALSE)

write_csv(df_result, result)
cat("07_calc.R: calculated income per person, wrote df_result.csv\n")
print(df_result)

17
scripts/08_format.R Normal file
View File

@@ -0,0 +1,17 @@
# 08_format.R
# Reads final result, rounds all numeric columns to 2 decimal places.
# Inputs:  result
# Outputs: formatted

source("scripts/00_paths.R")

library(readr)
library(dplyr)

df <- read_csv(result, show_col_types = FALSE)

# NOTE(review): with the sample data income_per_person is roughly 0.002, so
# rounding to 2 decimals turns it into 0 — confirm the intended precision.
df_formatted <- df |>
  mutate(across(where(is.numeric), ~ round(.x, 2)))

# write_csv() does not create missing directories; no-op if it already exists.
dir.create(dir_formatted, recursive = TRUE, showWarnings = FALSE)

write_csv(df_formatted, formatted)
cat("08_format.R: rounded to 2 decimals, wrote df_formatted.csv\n")
print(df_formatted)