diff --git a/examples/soil-metals/.gitignore b/examples/soil-metals/.gitignore new file mode 100644 index 000000000..611527399 --- /dev/null +++ b/examples/soil-metals/.gitignore @@ -0,0 +1,10 @@ +.DS_Store +dist/ +docs/.observablehq/cache/ +node_modules/ +yarn-error.log +yarn.lock +.Rhistory +.Rapp.history +.RData +.Ruserdata \ No newline at end of file diff --git a/examples/soil-metals/README.md b/examples/soil-metals/README.md new file mode 100644 index 000000000..e567dae76 --- /dev/null +++ b/examples/soil-metals/README.md @@ -0,0 +1,15 @@ +# Soil metal analysis + +This is an example Observable Framework project that explores soil metal concentrations in soils from mining districts in Moquegua, Peru. The purpose is to show an R data loader within a Framework project. Metals selected for [principal component analysis]((https://en.wikipedia.org/wiki/Principal_component_analysis)) are random; charts should not be interpreted as research findings. + +Data source: Bedoya-Perales, N.S., Escobedo-Pacheco, E., Maus, D. et al. Dataset of metals and metalloids in food crops and soils sampled across the mining region of Moquegua in Peru. Sci Data 10, 483 (2023). https://doi.org/10.1038/s41597-023-02363-0 + +View the [live project](TODO). + +## Data loader + +A single R data loader (`soil-metals.zip.R`) reads in data from a local Excel file (`heavy-metals.xlsx`), does basic wrangling, then performs principal component analysis. Multiple outputs (as CSVs) are added to a Zip archive. + +## Charts + +All charts are created as functions in `index.md` using [Observable Plot](https://observablehq.com/plot/). \ No newline at end of file diff --git a/examples/soil-metals/docs/data/heavy-metals.xlsx b/examples/soil-metals/docs/data/heavy-metals.xlsx new file mode 100644 index 000000000..be151edc1 Binary files /dev/null and b/examples/soil-metals/docs/data/heavy-metals.xlsx differ diff --git a/examples/soil-metals/docs/data/soil-metals.zip.R b/examples/soil-metals/docs/data/soil-metals.zip.R new file mode 100644 index 000000000..715f50fdb --- /dev/null +++ b/examples/soil-metals/docs/data/soil-metals.zip.R @@ -0,0 +1,83 @@ +# Attach libraries (must be installed) +library(readxl) +library(dplyr) +library(tidyr) +library(janitor) +library(stringr) +library(readr) + +# Access and wrangle data +# Get only soil metal contentrations +hm_soils <- read_xlsx("docs/data/heavy-metals.xlsx", skip = 1, na = "-") |> + clean_names() |> + slice(-1) |> + select(-c(3, 4, 5, 14, 15, 16, 19:49)) |> + rename(altitude_masl = altitude_m_a_s_l) + +# Update column names to only metal (e.g. "aluminum") +metal_matrix <- c(sub("_.*", "", names(hm_soils)[13:35])) +colnames(hm_soils)[13:35] <- metal_matrix + +# Replace values below detection limit with LOD / 2: +detect_function <- function(vec) { + ifelse(str_detect(vec, "<"), + as.numeric(str_replace(vec, "<", "")) / 2, + as.numeric(vec) + ) +} + +hm_clean <- hm_soils |> + mutate(across(13:35, detect_function)) + +# Principal component analysis +pca_metals <- c("aluminum", "arsenic", "barium", "calcium", "chromium", "cobalt", "copper", "iron", "magnesium", "lead", "potassium", "vanadium") + +# Subset with location and sampling information, and metals above (complete) +soil_subset <- hm_clean |> + select(1:12, all_of(pca_metals)) |> + drop_na() + +# Only metal variables for PCA +pca_subset <- soil_subset |> + select(-(1:12)) + +# PCA: +soil_pca <- prcomp(pca_subset, scale = TRUE) + +# PCA results + +# Loadings +var_loadings <- as.data.frame(soil_pca$rotation) |> + tibble::rownames_to_column(var = "metal") + +# Variance explained +var_exp <- data.frame( + pc = names(var_loadings[2:length(var_loadings)]), + variance = soil_pca$sdev^2 / sum(soil_pca$sdev^2) +) + +# Scores +obs_scores <- data.frame(soil_subset, soil_pca$x) + +# Scaled loadings (as in factoextra) +scaling <- min( + (max(obs_scores["PC2"]) - min(obs_scores["PC2"]) / (max(var_loadings["PC2"]) - min(var_loadings["PC2"]))), + (max(obs_scores["PC1"]) - min(obs_scores["PC1"]) / (max(var_loadings["PC1"]) - min(var_loadings["PC1"]))) +) + +# Note: factoextra uses 0.7 * scaling +var_loadings_scaled <- var_loadings |> + mutate( + PC1_scale = 0.7 * scaling * PC1, + PC2_scale = 0.7 * scaling * PC2 + ) + +# Add to zip archive, write to stdout +setwd(tempdir()) + +write_csv(hm_clean, "soil-metals.csv") +write_csv(var_loadings_scaled, "var-loadings.csv") +write_csv(var_exp, "var-explained.csv") +write_csv(obs_scores, "obs-scores.csv") + +system("zip - -r .") diff --git a/examples/soil-metals/docs/index.md b/examples/soil-metals/docs/index.md new file mode 100644 index 000000000..30e65ab7b --- /dev/null +++ b/examples/soil-metals/docs/index.md @@ -0,0 +1,180 @@ +# Soil metal analysis +## In mining regions across Moquegua, Peru + +```js + +const soilMetals = FileAttachment("data/soil-metals/soil-metals.csv").csv({typed: true}); + +const varLoadings = FileAttachment("data/soil-metals/var-loadings.csv").csv({typed: true}); + +const varExplained = FileAttachment("data/soil-metals/var-explained.csv").csv({typed: true}); + +const obsScores = FileAttachment("data/soil-metals/obs-scores.csv").csv({typed: true}); +``` + +```js +const pickMetal = Inputs.radio(varLoadings.map(d => d.metal), {label: "Pick metal:", value: "aluminum"}); +const metal = Generators.input(pickMetal); +``` + + +```js +function boxPlot(width, height) { +return Plot.plot({ + width, + height: height - 100, + marginLeft: 100, + x: { grid: true, label: "Soil concentration (mg/kg)"}, + y: { grid: true, label: "District", tickSize: 0 }, + marks: [ + Plot.boxX(soilMetals, { + x: metal, + y: "district", + fill: "gray", + opacity: 0.3, + stroke: null + }), + Plot.dot(soilMetals, { + x: metal, + y: "district", + fill: "darkgray", + opacity: 0.7 + }), + Plot.tickX( + soilMetals, + Plot.groupY( + { x: "median" }, + { + x: metal, + y: "district", + stroke: "#ff725c", + strokeWidth: 4, + sort: { y: "x", reverse: true } + } + ) + ) + ] +}); +} +``` + +```js +const pickDistrict = Inputs.checkbox( + soilMetals.map((d) => d.district), + { + label: "Highlight district(s):", + unique: true, + value: ["Algarrobal", "Carumas", "Chojata", "Coalaque"], + sort: true + } +); + +const inputDistrict = Generators.input(pickDistrict); +``` + + +```js +function biplot(width, height) { +return Plot.plot({ + width, + height: height - 120, + x: {label: `PC1 (${d3.format(".1%")(varExplained.map(d => d.variance)[0])})`, ticks: 6}, + y: {label: `PC1 (${d3.format(".1%")(varExplained.map(d => d.variance)[1])})`, ticks: 6}, + marks: [ + Plot.frame({stroke: "#BCBCBC"}), + Plot.dot(obsScores, { + x: "PC1", + y: "PC2", + fill: "#BCBCBC", + opacity: 0.5, + r: 2.5 + }), + Plot.hull(obsScores, { + x: "PC1", + y: "PC2", + fill: d => inputDistrict.includes(d.district) ? d.district : null, + opacity: 0.4 + }), + Plot.hull(obsScores, { + x: "PC1", + y: "PC2", + stroke: d => inputDistrict.includes(d.district) ? d.district : null, + strokeWidth: 2 + }), + Plot.dot(obsScores, { + x: "PC1", + y: "PC2", + filter: d => inputDistrict.includes(d.district), + fill: "district", + r: 3, + opacity: 0.8 + }), + Plot.arrow(varLoadings, { x1: 0, y1: 0, x2: "PC1_scale", y2: "PC2_scale" }), + Plot.text(varLoadings, { + text: "metal", + x: "PC1_scale", + y: "PC2_scale", + dy: -5, + dx: 30, + fill: "black", + stroke: "white", + fontSize: 11 + }) + ] +}); +} +``` + +```js +function screeplot(width, height) { +return Plot.plot({ + marginTop: 15, + width, + height, + x: {label: null}, + y: {axis: null}, + marks: [ + Plot.barY(varExplained, { + x: "pc", + y: "variance", + fill: "gray", + sort: { x: "y", reverse: true } + }), + Plot.ruleY([0]), + Plot.text(varExplained, { + text: (d) => d3.format(".1%")(d.variance), + x: "pc", + y: "variance", + dy: -8 + }) + ] +}); +} +``` + +