observablehq · allisonhorst · Feb 11, 2024 · Feb 11, 2024 · Feb 11, 2024 · Feb 11, 2024
diff --git a/examples/soil-metals/.gitignore b/examples/soil-metals/.gitignore
@@ -0,0 +1,10 @@
+.DS_Store
+dist/
+docs/.observablehq/cache/
+node_modules/
+yarn-error.log
+yarn.lock
+.Rhistory
+.Rapp.history
+.RData
+.Ruserdata
diff --git a/examples/soil-metals/README.md b/examples/soil-metals/README.md
@@ -0,0 +1,15 @@
+# Soil metal analysis
+
+This is an example Observable Framework project that explores soil metal concentrations in soils from mining districts in Moquegua, Peru. The purpose is to show an R data loader within a Framework project. Metals selected for [principal component analysis]((https://en.wikipedia.org/wiki/Principal_component_analysis)) are random; charts should not be interpreted as research findings.
+
+Data source: Bedoya-Perales, N.S., Escobedo-Pacheco, E., Maus, D. et al. Dataset of metals and metalloids in food crops and soils sampled across the mining region of Moquegua in Peru. Sci Data 10, 483 (2023). https://doi.org/10.1038/s41597-023-02363-0
+
+View the [live project](TODO).
+
+## Data loader
+
+A single R data loader (`soil-metals.zip.R`) reads in data from a local Excel file (`heavy-metals.xlsx`), does basic wrangling, then performs principal component analysis. Multiple outputs (as CSVs) are added to a Zip archive. 
+
+## Charts
+
+All charts are created as functions in `index.md` using [Observable Plot](https://observablehq.com/plot/). 
diff --git a/examples/soil-metals/docs/data/heavy-metals.xlsx b/examples/soil-metals/docs/data/heavy-metals.xlsx
diff --git a/examples/soil-metals/docs/data/soil-metals.zip.R b/examples/soil-metals/docs/data/soil-metals.zip.R
@@ -0,0 +1,83 @@
+# Attach libraries (must be installed)
+library(readxl)
+library(dplyr)
+library(tidyr)
+library(janitor)
+library(stringr)
+library(readr)
+
+# Access and wrangle data
+# Get only soil metal contentrations
+hm_soils <- read_xlsx("docs/data/heavy-metals.xlsx", skip = 1, na = "-") |>
+    clean_names() |>
+    slice(-1) |>
+    select(-c(3, 4, 5, 14, 15, 16, 19:49)) |>
+    rename(altitude_masl = altitude_m_a_s_l)
+
+# Update column names to only metal (e.g. "aluminum")
+metal_matrix <- c(sub("_.*", "", names(hm_soils)[13:35]))
+colnames(hm_soils)[13:35] <- metal_matrix
+
+# Replace values below detection limit with LOD / 2:
+detect_function <- function(vec) {
+    ifelse(str_detect(vec, "<"),
+        as.numeric(str_replace(vec, "<", "")) / 2,
+        as.numeric(vec)
+    )
+}
+
+hm_clean <- hm_soils |>
+    mutate(across(13:35, detect_function))
+
+# Principal component analysis
+pca_metals <- c("aluminum", "arsenic", "barium", "calcium", "chromium", "cobalt", "copper", "iron", "magnesium", "lead", "potassium", "vanadium")
+
+# Subset with location and sampling information, and metals above (complete)
+soil_subset <- hm_clean |>
+    select(1:12, all_of(pca_metals)) |>
+    drop_na()
+
+# Only metal variables for PCA
+pca_subset <- soil_subset |>
+    select(-(1:12))
+
+# PCA:
+soil_pca <- prcomp(pca_subset, scale = TRUE)
+
+# PCA results
+
+# Loadings
+var_loadings <- as.data.frame(soil_pca$rotation) |>
+    tibble::rownames_to_column(var = "metal")
+
+# Variance explained
+var_exp <- data.frame(
+    pc = names(var_loadings[2:length(var_loadings)]),
+    variance = soil_pca$sdev^2 / sum(soil_pca$sdev^2)
+)
+
+# Scores
+obs_scores <- data.frame(soil_subset, soil_pca$x)
+
+# Scaled loadings (as in factoextra)
+scaling <- min(
+    (max(obs_scores["PC2"]) - min(obs_scores["PC2"]) / (max(var_loadings["PC2"]) - min(var_loadings["PC2"]))),
+    (max(obs_scores["PC1"]) - min(obs_scores["PC1"]) / (max(var_loadings["PC1"]) - min(var_loadings["PC1"])))
+)
+
+# Note: factoextra uses 0.7 * scaling
+var_loadings_scaled <- var_loadings |>
+    mutate(
+        PC1_scale = 0.7 * scaling * PC1,
+        PC2_scale = 0.7 * scaling * PC2
+    )
+
+# Add to zip archive, write to stdout
+setwd(tempdir())
+
+write_csv(hm_clean, "soil-metals.csv")
+write_csv(var_loadings_scaled, "var-loadings.csv")
+write_csv(var_exp, "var-explained.csv")
+write_csv(obs_scores, "obs-scores.csv")
+
+system("zip - -r .")
diff --git a/examples/soil-metals/docs/index.md b/examples/soil-metals/docs/index.md
@@ -0,0 +1,180 @@
+# Soil metal analysis
+## In mining regions across Moquegua, Peru
+
+```js
+
+const soilMetals = FileAttachment("data/soil-metals/soil-metals.csv").csv({typed: true});
+
+const varLoadings = FileAttachment("data/soil-metals/var-loadings.csv").csv({typed: true});
+
+const varExplained = FileAttachment("data/soil-metals/var-explained.csv").csv({typed: true});
+
+const obsScores = FileAttachment("data/soil-metals/obs-scores.csv").csv({typed: true});
+```
+
+```js
+const pickMetal = Inputs.radio(varLoadings.map(d => d.metal), {label: "Pick metal:", value: "aluminum"});
+const metal = Generators.input(pickMetal);
+```
+
+
+```js
+function boxPlot(width, height) {
+return Plot.plot({
+  width, 
+  height: height - 100,
+  marginLeft: 100,
+  x: { grid: true, label: "Soil concentration (mg/kg)"},
+  y: { grid: true, label: "District", tickSize: 0 },
+  marks: [
+    Plot.boxX(soilMetals, {
+      x: metal,
+      y: "district",
+      fill: "gray",
+      opacity: 0.3,
+      stroke: null
+    }),
+    Plot.dot(soilMetals, {
+      x: metal,
+      y: "district",
+      fill: "darkgray",
+      opacity: 0.7
+    }),
+    Plot.tickX(
+      soilMetals,
+      Plot.groupY(
+        { x: "median" },
+        {
+          x: metal,
+          y: "district",
+          stroke: "#ff725c",
+          strokeWidth: 4,
+          sort: { y: "x", reverse: true }
+        }
+      )
+    )
+  ]
+});
+}
+```
+
+```js
+const pickDistrict = Inputs.checkbox(
+  soilMetals.map((d) => d.district),
+  {
+    label: "Highlight district(s):",
+    unique: true,
+    value: ["Algarrobal", "Carumas", "Chojata", "Coalaque"],
+    sort: true
+  }
+);
+
+const inputDistrict = Generators.input(pickDistrict);
+```
+
+
+```js
+function biplot(width, height) {
+return Plot.plot({
+  width,
+  height: height - 120,
+  x: {label: `PC1 (${d3.format(".1%")(varExplained.map(d => d.variance)[0])})`, ticks: 6},
+  y: {label: `PC1 (${d3.format(".1%")(varExplained.map(d => d.variance)[1])})`, ticks: 6},
+  marks: [
+    Plot.frame({stroke: "#BCBCBC"}),
+    Plot.dot(obsScores, {
+      x: "PC1",
+      y: "PC2",
+      fill: "#BCBCBC",
+      opacity: 0.5,
+      r: 2.5
+    }),
+    Plot.hull(obsScores, {
+      x: "PC1",
+      y: "PC2",
+      fill: d => inputDistrict.includes(d.district) ? d.district : null,
+      opacity: 0.4
+    }),
+    Plot.hull(obsScores, {
+      x: "PC1",
+      y: "PC2",
+      stroke: d => inputDistrict.includes(d.district) ? d.district : null,
+      strokeWidth: 2
+    }),
+    Plot.dot(obsScores, {
+      x: "PC1",
+      y: "PC2",
+      filter: d => inputDistrict.includes(d.district),
+      fill: "district",
+      r: 3,
+      opacity: 0.8
+    }),
+    Plot.arrow(varLoadings, { x1: 0, y1: 0, x2: "PC1_scale", y2: "PC2_scale" }),
+    Plot.text(varLoadings, {
+      text: "metal",
+      x: "PC1_scale",
+      y: "PC2_scale",
+      dy: -5,
+      dx: 30,
+      fill: "black",
+      stroke: "white",
+      fontSize: 11
+    })
+  ]
+});
+}
+```
+
+```js
+function screeplot(width, height) {
+return Plot.plot({
+  marginTop: 15,
+  width,
+  height,
+  x: {label: null},
+  y: {axis: null},
+  marks: [
+    Plot.barY(varExplained, {
+      x: "pc",
+      y: "variance",
+      fill: "gray",
+      sort: { x: "y", reverse: true }
+    }),
+    Plot.ruleY([0]),
+    Plot.text(varExplained, {
+      text: (d) => d3.format(".1%")(d.variance),
+      x: "pc",
+      y: "variance",
+      dy: -8
+    })
+  ]
+});
+}
+```
+
+<div class="grid grid-cols-4" style="grid-auto-rows: 145px;">
+  <div class="card grid-colspan-2 grid-rowspan-4">
+    <h2>Principal component analysis</h2>
+    <h3>Axis scales are for observation scores. See table for loading values.</h3>
+    ${pickDistrict}
+    ${resize((width, height) => biplot(width, height))}
+  </div>
+  <div class="card grid-colspan-2 grid-rowspan-2">
+    <h2>Variance explained</h2>
+    ${resize((width, height) => screeplot(width, height))}
+  </div>
+  <div class="card grid-colspan-2 grid-rowspan-2" style="padding: 0; border-radius: 12px; overflow: hidden;">
+    ${Inputs.table(varLoadings.map(({PC1_scale, PC2_scale, ...rest}) => rest), {height: 320})}
+  </div>
+</div>
+
+<div class="grid grid-cols-4" style="grid-auto-rows: 160px;">
+  <div class="card grid-colspan-4 grid-rowspan-3">
+    <h2>Soil metal concentration by district</h2>
+    <h3>Dots are individual soil sample values; black line is the median for each district. Box is the interquartile range. All values in mg/kg.</h3>
+    ${pickMetal}
+    ${resize((width, height) => boxPlot(width, height))}
+  </div>
+</div>
+
+  <div class="note" label="Data">Bedoya-Perales, N.S., Escobedo-Pacheco, E., Maus, D. et al. Dataset of metals and metalloids in food crops and soils sampled across the mining region of Moquegua in Peru. Sci Data 10, 483 (2023). <a href="https://doi.org/10.1038/s41597-023-02363-0">https://doi.org/10.1038/s41597-023-02363-0</a></div>
diff --git a/examples/soil-metals/package.json b/examples/soil-metals/package.json
@@ -0,0 +1,14 @@
+{
+  "type": "module",
+  "scripts": {
+    "dev": "observable preview",
+    "build": "rm -rf dist && observable build",
+    "postinstall": "ln -sf ../../../../bin/observable-init.js node_modules/.bin/observable"
+  },
+  "dependencies": {
+    "@observablehq/framework": "link:../.."
+  },
+  "engines": {
+    "node": ">=20.6"
+  }
+}