Initial commit: illustrative R data pipeline

This commit is contained in:
2026-03-09 14:20:10 -04:00
commit 83e50d2c36
12 changed files with 277 additions and 0 deletions

21
scripts/00_paths.R Normal file
View File

@@ -0,0 +1,21 @@
# 00_paths.R
# Central path constants and config. Source this at the top of every script.
# All paths are relative to the project root, so run the scripts from there.

# Config ----
# Load the key/value pairs from .env into the process environment.
dotenv::load_dot_env(".env")

# Sys.getenv() returns "" for an unset variable, and as.numeric("") /
# as.integer("") turn that into NA without any warning. Validate right away
# so a missing or malformed setting stops the pipeline here instead of
# silently propagating NA into the downstream scripts.
euro_to_dollar_conversion_ratio <- as.numeric(
  Sys.getenv("euro_to_dollar_conversion_ratio")
)
input_cols <- as.integer(Sys.getenv("input_cols"))
stopifnot(
  "euro_to_dollar_conversion_ratio must be set to a positive number (see .env)" =
    is.finite(euro_to_dollar_conversion_ratio) &&
      euro_to_dollar_conversion_ratio > 0,
  "input_cols must be set to a positive integer (see .env)" =
    !is.na(input_cols) && input_cols > 0L
)

# Directories ----
dir_raw <- "data/raw"
dir_interim <- "data/interim"
dir_processed <- "data/processed"
dir_formatted <- "data/formatted"

# Files ----
# Raw inputs.
income_raw <- file.path(dir_raw, "df_income.csv")
population_raw <- file.path(dir_raw, "df_population.csv")

# Intermediate outputs.
income_usd_wide <- file.path(dir_interim, "df_income_usd_wide.csv")
income_usd <- file.path(dir_interim, "df_income_usd.csv")
population_full <- file.path(dir_interim, "df_population_full.csv")
merged <- file.path(dir_interim, "df_merged.csv")

# Final outputs.
result <- file.path(dir_processed, "df_result.csv")
formatted <- file.path(dir_formatted, "df_formatted.csv")

26
scripts/01_create_data.R Normal file
View File

@@ -0,0 +1,26 @@
# 01_create_data.R
# Creates two wide-format CSV files: income (euros) and population.
# Wide format mirrors how accountants typically lay out data in Excel —
# one row per entity, months as columns.
source("scripts/00_paths.R")
library(tibble)
library(readr)

# write_csv() does not create missing directories, so make sure the target
# exists; this is a no-op when the directory is already there.
dir.create(dir_raw, recursive = TRUE, showWarnings = FALSE)

# Monthly income for a single entity, denominated in euros.
df_income <- tribble(
  ~id, ~category, ~denomination, ~jan, ~feb, ~mar, ~apr, ~may, ~jun,
  1, "income", "euro", 42000, 39500, 44200, 41800, 43100, 40600
)

# Monthly population for a single state, expressed in thousands.
df_population <- tribble(
  ~state, ~category, ~unit, ~jan, ~feb, ~mar, ~apr, ~may, ~jun,
  "FL", "population", "thousands", 22600, 22600, 22650, 22650, 22700, 22700
)

write_csv(df_income, income_raw)
write_csv(df_population, population_raw)
cat("01_create_data.R: wrote df_income.csv and df_population.csv\n")

14
scripts/02_validate.R Normal file
View File

@@ -0,0 +1,14 @@
# 02_validate.R
# Sanity-checks the raw input files before the rest of the pipeline runs.
# Each file must have exactly `input_cols` columns; otherwise the script
# aborts with an error naming the offending table.
source("scripts/00_paths.R")
library(readr)

df_income <- read_csv(file = income_raw, show_col_types = FALSE)
df_population <- read_csv(file = population_raw, show_col_types = FALSE)

# Conditions are evaluated in order; the first one that fails stops the
# script and its label becomes the error message.
stopifnot(
  "df_income has wrong number of columns" =
    ncol(df_income) == input_cols,
  "df_population has wrong number of columns" =
    ncol(df_population) == input_cols
)

cat("02_validate.R: input data looks good\n")

View File

@@ -0,0 +1,20 @@
# 03_convert_currency.R
# NOTE(review): the "03_" prefix is inferred from this script's position
# between 02_validate.R and 04_pivot_income.R; the previous "02_" label
# collided with 02_validate.R — confirm against the actual file name.
#
# Reads wide-format income CSV, converts EUR to USD. Stays wide.
# In a real pipeline the exchange rate could be fetched from an API.
source("scripts/00_paths.R")
library(readr)
library(dplyr)

df <- read_csv(income_raw, show_col_types = FALSE)

df_usd <- df |>
  mutate(
    # Multiply every month column by the configured rate, keeping cents.
    across(jan:jun, ~ round(.x * euro_to_dollar_conversion_ratio, 2)),
    # Relabel the rows so the file states the currency it now holds.
    denomination = "usd"
  )

write_csv(df_usd, income_usd_wide)
cat("03_convert_currency.R: EUR -> USD conversion done, wrote df_income_usd_wide.csv\n")

20
scripts/04_pivot_income.R Normal file
View File

@@ -0,0 +1,20 @@
# 04_pivot_income.R
# Reads wide-format income (USD), pivots to long format.
source("scripts/00_paths.R")
library(readr)
library(tidyr)

df <- read_csv(income_usd_wide, show_col_types = FALSE)

# One output row per month: the column name goes to `month`, the cell value
# to `value_usd`. All other columns are carried along unchanged.
df_long <- df |>
  pivot_longer(
    cols = jan:jun,
    names_to = "month",
    values_to = "value_usd"
  )

write_csv(df_long, income_usd)
cat("04_pivot_income.R: wide -> long, wrote df_income_usd.csv\n")

View File

@@ -0,0 +1,26 @@
# 05_convert_units.R
# NOTE(review): the "05_" prefix is inferred from this script's position
# between 04_pivot_income.R and 06_merge.R; the header previously said "03_"
# while the log message said "04_" — confirm against the actual file name.
#
# Reads wide-format population CSV, pivots to long, converts thousands to
# full unit count.
source("scripts/00_paths.R")
library(readr)
library(tidyr)
library(dplyr)

df <- read_csv(population_raw, show_col_types = FALSE)

df_long <- df |>
  # One output row per month, with the month's figure in `value`.
  pivot_longer(
    cols = jan:jun,
    names_to = "month",
    values_to = "value"
  ) |>
  # Scale thousands up to individual persons and relabel the unit to match.
  mutate(
    value = value * 1000,
    unit = "persons"
  )

write_csv(df_long, population_full)
cat("05_convert_units.R: thousands -> persons, wrote df_population_full.csv\n")

18
scripts/06_merge.R Normal file
View File

@@ -0,0 +1,18 @@
# 06_merge.R
# Merges processed income (long, USD) and population (long, full units) on month.
source("scripts/00_paths.R")
library(readr)
library(dplyr)

df_income <- read_csv(income_usd, show_col_types = FALSE)

# Keep only the join key and the population figure, renamed so it cannot be
# confused with the income value after the join.
df_pop <- read_csv(population_full, show_col_types = FALSE) |>
  select(month, population = value)

# Every income row is kept; months without a population match get NA.
# NOTE(review): the join key is `month` alone, so a population file holding
# several rows per month would duplicate income rows — confirm inputs stay
# one row per month.
df_merged <- df_income |>
  left_join(df_pop, by = "month")

write_csv(df_merged, merged)
cat("06_merge.R: joined income and population, wrote df_merged.csv\n")

18
scripts/07_calc.R Normal file
View File

@@ -0,0 +1,18 @@
# 07_calc.R
# Reads merged data and calculates income per person.
source("scripts/00_paths.R")
library(readr)
library(dplyr)

df_merged <- read_csv(merged, show_col_types = FALSE)

df_result <- df_merged |>
  # USD income divided by head count, kept to 4 decimal places.
  mutate(income_per_person = round(value_usd / population, 4)) |>
  # Drop the descriptive columns; only the figures needed downstream remain.
  select(month, value_usd, population, income_per_person)

write_csv(df_result, result)
cat("07_calc.R: calculated income per person, wrote df_result.csv\n")
print(df_result)

17
scripts/08_format.R Normal file
View File

@@ -0,0 +1,17 @@
# 08_format.R
# Reads final result, rounds all numeric columns to 2 decimal places.
source("scripts/00_paths.R")
library(readr)
library(dplyr)

df <- read_csv(result, show_col_types = FALSE)

# Applies to every numeric column, including income_per_person, which the
# calc step stored with 4 decimal places.
df_formatted <- df |>
  mutate(across(where(is.numeric), ~ round(.x, 2)))

write_csv(df_formatted, formatted)
cat("08_format.R: rounded to 2 decimals, wrote df_formatted.csv\n")
print(df_formatted)