Initial commit: illustrative R data pipeline

2026-03-09 14:20:10 -04:00
commit 83e50d2c36
12 changed files with 277 additions and 0 deletions
--- a/scripts/00_paths.R
+++ b/scripts/00_paths.R
@@ -0,0 +1,21 @@
+# 00_paths.R
+# Central path constants and config. Source this at the top of every script.
+# All paths are relative to the project root.
+
+dotenv::load_dot_env(".env")
+euro_to_dollar_conversion_ratio <- as.numeric(Sys.getenv("euro_to_dollar_conversion_ratio"))
+input_cols                      <- as.integer(Sys.getenv("input_cols"))
+
+dir_raw       <- "data/raw"
+dir_interim   <- "data/interim"
+dir_processed <- "data/processed"
+dir_formatted <- "data/formatted"
+
+income_raw        <- file.path(dir_raw,       "df_income.csv")
+population_raw    <- file.path(dir_raw,       "df_population.csv")
+income_usd_wide   <- file.path(dir_interim,   "df_income_usd_wide.csv")
+income_usd        <- file.path(dir_interim,   "df_income_usd.csv")
+population_full   <- file.path(dir_interim,   "df_population_full.csv")
+merged            <- file.path(dir_interim,   "df_merged.csv")
+result            <- file.path(dir_processed, "df_result.csv")
+formatted         <- file.path(dir_formatted, "df_formatted.csv")
--- a/scripts/01_create_data.R
+++ b/scripts/01_create_data.R
@@ -0,0 +1,26 @@
+# 01_create_data.R
+# Creates two wide-format CSV files: income (euros) and population.
+# Wide format mirrors how accountants typically lay out data in Excel —
+# one row per entity, months as columns.
+
+source("scripts/00_paths.R")
+
+source("scripts/00_paths.R")
+
+library(tibble)
+library(readr)
+
+df_income <- tribble(
+  ~id, ~category, ~denomination, ~jan,  ~feb,  ~mar,  ~apr,  ~may,  ~jun,
+    1,  "income",        "euro", 42000, 39500, 44200, 41800, 43100, 40600
+)
+
+df_population <- tribble(
+  ~state, ~category,     ~unit,       ~jan,  ~feb,  ~mar,  ~apr,  ~may,  ~jun,
+    "FL", "population", "thousands", 22600, 22600, 22650, 22650, 22700, 22700
+)
+
+write_csv(df_income,     income_raw)
+write_csv(df_population, population_raw)
+
+cat("01_create_data.R: wrote df_income.csv and df_population.csv\n")
--- a/scripts/02_validate.R
+++ b/scripts/02_validate.R
@@ -0,0 +1,14 @@
+# 02_validate.R
+# Validates raw input files. Stops the pipeline if data doesn't look right.
+
+source("scripts/00_paths.R")
+
+library(readr)
+
+df_income     <- read_csv(income_raw,     show_col_types = FALSE)
+df_population <- read_csv(population_raw, show_col_types = FALSE)
+
+stopifnot("df_income has wrong number of columns"     = ncol(df_income)     == input_cols)
+stopifnot("df_population has wrong number of columns" = ncol(df_population) == input_cols)
+
+cat("02_validate.R: input data looks good\n")
--- a/scripts/03_convert_currency.R
+++ b/scripts/03_convert_currency.R
@@ -0,0 +1,20 @@
+# 02_convert_currency.R
+# Reads wide-format income CSV, converts EUR to USD. Stays wide.
+# In a real pipeline the exchange rate could be fetched from an API.
+
+source("scripts/00_paths.R")
+
+library(readr)
+library(dplyr)
+
+df <- read_csv(income_raw, show_col_types = FALSE)
+
+df_usd <- df |>
+  mutate(
+    across(jan:jun, ~ round(.x * euro_to_dollar_conversion_ratio, 2)),
+    denomination = "usd"
+  )
+
+write_csv(df_usd, income_usd_wide)
+
+cat("02_convert_currency.R: EUR -> USD conversion done, wrote df_income_usd_wide.csv\n")
--- a/scripts/04_pivot_income.R
+++ b/scripts/04_pivot_income.R
@@ -0,0 +1,20 @@
+# 03_pivot_income.R
+# Reads wide-format income (USD), pivots to long format.
+
+source("scripts/00_paths.R")
+
+library(readr)
+library(tidyr)
+
+df <- read_csv(income_usd_wide, show_col_types = FALSE)
+
+df_long <- df |>
+  pivot_longer(
+    cols      = jan:jun,
+    names_to  = "month",
+    values_to = "value_usd"
+  )
+
+write_csv(df_long, income_usd)
+
+cat("03_pivot_income.R: wide -> long, wrote df_income_usd.csv\n")
--- a/scripts/05_convert_units.R
+++ b/scripts/05_convert_units.R
@@ -0,0 +1,26 @@
+# 03_convert_units.R
+# Reads wide-format population CSV, pivots to long, converts thousands to
+# full unit count.
+
+source("scripts/00_paths.R")
+
+library(readr)
+library(tidyr)
+library(dplyr)
+
+df <- read_csv(population_raw, show_col_types = FALSE)
+
+df_long <- df |>
+  pivot_longer(
+    cols      = jan:jun,
+    names_to  = "month",
+    values_to = "value"
+  ) |>
+  mutate(
+    value = value * 1000,
+    unit  = "persons"
+  )
+
+write_csv(df_long, population_full)
+
+cat("04_convert_units.R: thousands -> persons, wrote df_population_full.csv\n")
--- a/scripts/06_merge.R
+++ b/scripts/06_merge.R
@@ -0,0 +1,18 @@
+# 04_merge.R
+# Merges processed income (long, USD) and population (long, full units) on month.
+
+source("scripts/00_paths.R")
+
+library(readr)
+library(dplyr)
+
+df_income <- read_csv(income_usd,       show_col_types = FALSE)
+df_pop    <- read_csv(population_full,  show_col_types = FALSE) |>
+  select(month, population = value)
+
+df_merged <- df_income |>
+  left_join(df_pop, by = "month")
+
+write_csv(df_merged, merged)
+
+cat("05_merge.R: joined income and population, wrote df_merged.csv\n")
--- a/scripts/07_calc.R
+++ b/scripts/07_calc.R
@@ -0,0 +1,18 @@
+# 05_calc.R
+# Reads merged data and calculates income per person.
+
+source("scripts/00_paths.R")
+
+library(readr)
+library(dplyr)
+
+df_merged <- read_csv(merged, show_col_types = FALSE)
+
+df_result <- df_merged |>
+  mutate(income_per_person = round(value_usd / population, 4)) |>
+  select(month, value_usd, population, income_per_person)
+
+write_csv(df_result, result)
+
+cat("05_calc.R: calculated income per person, wrote df_result.csv\n")
+print(df_result)
--- a/scripts/08_format.R
+++ b/scripts/08_format.R
@@ -0,0 +1,17 @@
+# 07_format.R
+# Reads final result, rounds all numeric columns to 2 decimal places.
+
+source("scripts/00_paths.R")
+
+library(readr)
+library(dplyr)
+
+df <- read_csv(result, show_col_types = FALSE)
+
+df_formatted <- df |>
+  mutate(across(where(is.numeric), ~ round(.x, 2)))
+
+write_csv(df_formatted, formatted)
+
+cat("07_format.R: rounded to 2 decimals, wrote df_formatted.csv\n")
+print(df_formatted)