From e2e6630f73e0ad5c05c89336440795f6f0b77f78 Mon Sep 17 00:00:00 2001
From: Stu Field <sfield@somalogic.com>
Date: Wed, 5 Apr 2023 13:19:59 -0600
Subject: [PATCH] Add new proper vignettes (#35)

- moved statistical examples into their own
  vignettes
- they will now be rendered by `pkgdown`
- simplifies `README`
- four new vignettes:
  - loading-and-wrangling
  - binary-classification
  - linear-regression
  - two-group-comparison
- fixes #35
---
 DESCRIPTION                         |   1 +
 _pkgdown.yml                        |  29 +++++
 vignettes/.gitignore                |   2 +
 vignettes/SomaDataIO.Rmd            |  26 ++--
 vignettes/binary-classification.Rmd | 148 +++++++++++++++++++++++
 vignettes/figures/.gitignore        |   2 +
 vignettes/linear-regression.Rmd     | 171 +++++++++++++++++++++++++++
 vignettes/loading-and-wrangling.Rmd | 176 ++++++++++++++++++++++++++++
 vignettes/two-group-comparison.Rmd  | 155 ++++++++++++++++++++++++
 9 files changed, 703 insertions(+), 7 deletions(-)
 create mode 100644 vignettes/.gitignore
 create mode 100644 vignettes/binary-classification.Rmd
 create mode 100644 vignettes/figures/.gitignore
 create mode 100644 vignettes/linear-regression.Rmd
 create mode 100644 vignettes/loading-and-wrangling.Rmd
 create mode 100644 vignettes/two-group-comparison.Rmd

diff --git a/DESCRIPTION b/DESCRIPTION
index 8c4df99..70fbe1d 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -37,6 +37,7 @@ Suggests:
     Biobase,
     ggplot2,
     knitr,
+    purrr,
     recipes,
     rmarkdown,
     spelling,
diff --git a/_pkgdown.yml b/_pkgdown.yml
index 312a1c4..7594e09 100644
--- a/_pkgdown.yml
+++ b/_pkgdown.yml
@@ -27,6 +27,35 @@ articles:
     contents:
     - SomaDataIO
 
+  - title: Loading and Wrangling
+    navbar: ~
+    desc: >
+      How to load and manipulate a 'SomaScan' flat text file into 
+      an R environment.
+    contents:
+    - loading-and-wrangling
+
+  - title: Two-group Comparison
+    navbar: ~
+    desc: >
+      Typical two-group comparison of 'SomaScan' data.
+    contents:
+    - two-group-comparison
+
+  - title: Binary Classification
+    navbar: ~
+    desc: >
+      Typical binary classification of 'SomaScan' data.
+    contents:
+    - binary-classification
+
+  - title: Linear Regression
+    navbar: ~
+    desc: >
+      Typical linear regression of continuous 'SomaScan' data.
+    contents:
+    - linear-regression
+
 reference:
   - title: Load an ADAT
     desc: >
diff --git a/vignettes/.gitignore b/vignettes/.gitignore
new file mode 100644
index 0000000..097b241
--- /dev/null
+++ b/vignettes/.gitignore
@@ -0,0 +1,2 @@
+*.html
+*.R
diff --git a/vignettes/SomaDataIO.Rmd b/vignettes/SomaDataIO.Rmd
index 42fd711..9a35ae7 100644
--- a/vignettes/SomaDataIO.Rmd
+++ b/vignettes/SomaDataIO.Rmd
@@ -1,6 +1,8 @@
 ---
 title: "Introduction to SomaDataIO"
-output: rmarkdown::html_vignette
+output:
+  rmarkdown::html_vignette:
+    fig_caption: yes
 vignette: >
   %\VignetteIndexEntry{Introduction to SomaDataIO}
   %\VignetteEngine{knitr::rmarkdown}
@@ -8,8 +10,8 @@ vignette: >
 ---
 
 ```{r setup, include = FALSE}
-library(SomaDataIO)
 knitr::opts_chunk$set(
+  echo = TRUE,
   collapse = TRUE,
   comment = "#>"
 )
@@ -20,7 +22,8 @@ knitr::opts_chunk$set(
 This document accompanies the `SomaDataIO` R package, which loads
 and exports 'SomaScan' data via the SomaLogic Operating Co., Inc.
 proprietary text file called an ADAT (`*.adat`).
-For file format see [here](https://github.com/SomaLogic/SomaLogic-Data/blob/master/README.md).
+For file format see
+[here](https://github.com/SomaLogic/SomaLogic-Data/blob/master/README.md).
 The package also exports auxiliary functions for manipulating, wrangling,
 and extracting relevant information from an ADAT object once in memory.
 Basic familiarity with the R environment is assumed, as is the ability to install
@@ -42,6 +45,7 @@ contributed packages from the Comprehensive R Archive Network (CRAN).
     + `?SeqId` analyte (feature) matching.
     + `dplyr` and `tidyr` verb S3 methods for the `soma_adat` class.
     + `?rownames` helpers that do not break `soma_adat` attributes.
+    + please see vignette `vignette("loading-and-wrangling", package = "SomaDataIO")`
 
 * Exporting data (Output)
     + write out a `soma_adat` object as a `*.adat` text file.
@@ -49,12 +53,20 @@ contributed packages from the Comprehensive R Archive Network (CRAN).
 
 ----------------------
 
-## Workflows and Analysis (TODO)
+## Workflows and Analysis
 This section will become more fleshed out in future versions of
-`SomaDataIO`
+`SomaDataIO`. In the meantime, below are 3 examples of typical
+primary statistical analyses that are commonly performed on
+'SomaScan' data:
 
-#### In the meantime please see the package
-[README](https://github.com/SomaLogic/SomaDataIO/blob/main/README.md)
+- Two-group comparison (e.g. differential expression) via *t*-test
+  + see vignette `vignette("two-group-comparison", package = "SomaDataIO")`
+
+- Binary classification
+  + see vignette `vignette("binary-classification", package = "SomaDataIO")`
+
+- Linear regression
+  + see vignette `vignette("linear-regression", package = "SomaDataIO")`
 
 
 ---------------------
diff --git a/vignettes/binary-classification.Rmd b/vignettes/binary-classification.Rmd
new file mode 100644
index 0000000..f4ad302
--- /dev/null
+++ b/vignettes/binary-classification.Rmd
@@ -0,0 +1,148 @@
+---
+title: "Binary Classification"
+author: "Stu Field, SomaLogic Operating Co., Inc."
+output:
+  rmarkdown::html_vignette:
+    fig_caption: yes
+vignette: >
+  %\VignetteIndexEntry{Binary Classification}
+  %\VignetteEngine{knitr::rmarkdown}
+  %\VignetteEncoding{UTF-8}
+---
+
+```{r setup, include = FALSE}
+library(SomaDataIO)
+library(dplyr)
+library(tidyr)
+library(purrr)
+knitr::opts_chunk$set(
+  echo = TRUE,
+  collapse = TRUE,
+  comment = "#>",
+  fig.path = "figures/classify-"
+)
+```
+
+
+----------------
+
+
+## Classification via Logistic Regression
+
+
+Although targeted statistical analyses are beyond the scope of 
+the `SomaDataIO` package, below is an example analysis
+that typical users/customers would perform on 'SomaScan' data.
+
+It is not intended to be a definitive guide to statistical
+analysis, and packages already exist in the `R` ecosystem that perform
+parts or extensions of these techniques. Many variations of the workflow
+below exist; however, the framework highlights how one could perform standard
+_preliminary_ analyses on 'SomaScan' data.
+
+
+## Data Preparation
+```{r data-prep}
+# `example_data` ships with the package
+dim(example_data)
+
+table(example_data$SampleType)
+
+# standardize a numeric vector: subtract its mean, then divide by its SD
+cs <- function(.x) {
+  centered <- .x - mean(.x)
+  centered / sd(centered)
+}
+
+# build the analysis data set
+cleanData <- example_data |>
+  filter(SampleType == "Sample") |>               # keep study samples only
+  drop_na(Sex) |>                                 # drop rows with missing Sex
+  log10() |>                                      # log10 via `soma_adat` Math generic
+  mutate(Group = as.numeric(factor(Sex)) - 1) |>  # encode Sex as 0/1
+  modify_at(getAnalytes(example_data), cs)        # standardize each analyte column
+
+table(cleanData$Sex)
+
+table(cleanData$Group)    # F = 0; M = 1
+```
+
+## Set up Train/Test Data
+
+```{r train-test}
+# idx = row indices of the training set; the remaining 50 rows are the hold-out
+# seed resulting in 50/50 class balance
+idx   <- withr::with_seed(
+  3, sample(seq_len(nrow(cleanData)), size = nrow(cleanData) - 50)
+)
+train <- cleanData[idx, ]
+test  <- cleanData[-idx, ]
+
+# assert no overlap: no row name may appear in both sets
+length(intersect(rownames(train), rownames(test))) == 0L
+```
+
+
+## Logistic Regression
+We use the `cleanData`, `train`, and `test` data objects from above.
+
+### Predict Sex
+```{r logreg-tbl}
+LR_tbl <- getAnalyteInfo(train) |>
+  select(AptName, SeqId, Target = TargetFullName, EntrezGeneSymbol, UniProt) |>
+  mutate(
+    formula  = map(AptName, \(apt) as.formula(paste("Group ~", apt))),  # Group ~ <analyte>
+    model    = map(formula, \(f) stats::glm(f, data = train, family = "binomial", model = FALSE)),  # one glm() per analyte
+    beta_hat = map_dbl(model, \(m) coef(m)[2L]),    # 2nd coef = analyte term
+    p.value  = map2_dbl(model, AptName, \(m, apt) {
+      summary(m)$coefficients[apt, "Pr(>|z|)"] }),  # p-value of the analyte term
+    fdr      = p.adjust(p.value, method = "BH")     # Benjamini-Hochberg adjustment
+  ) |>
+  arrange(p.value) |>            # smallest p-value first
+  mutate(rank = row_number())    # rank = position after sorting
+
+LR_tbl
+```
+
+
+### Fit Model | Calculate Performance
+
+Next, select features for the model fit. We have a good idea of reasonable `Sex`
+markers from prior knowledge (`CGA*`), and fortunately many of these are highly
+ranked in `LR_tbl`. Below we fit a 4-marker logistic regression model from
+cherry-picked gender-related features:
+
+```{r fit-logreg}
+# AptName is index key between `LR_tbl` and `train`
+feats <- LR_tbl$AptName[c(1L, 3L, 5L, 7L)]
+form  <- as.formula(paste("Group ~", paste(feats, collapse = "+")))
+fit   <- glm(form, data = train, family = "binomial", model = FALSE)
+pred  <- tibble(
+  true_class = factor(test$Sex, levels = c("F", "M")),           # orig class label
+  pred       = predict(fit, newdata = test, type = "response"),  # prob. 'Male'
+  pred_class = factor(ifelse(pred < 0.5, "F", "M"),              # class label; fixed levels
+                      levels = c("F", "M"))                      # keep `conf` 2x2 below
+)
+conf <- table(pred$true_class, pred$pred_class, dnn = list("Actual", "Predicted"))
+tp   <- conf[2L, 2L]    # actual M, predicted M
+tn   <- conf[1L, 1L]    # actual F, predicted F
+fp   <- conf[1L, 2L]    # actual F, predicted M
+fn   <- conf[2L, 1L]    # actual M, predicted F
+
+# Confusion matrix
+conf
+
+# Classification metrics ('M' is treated as the positive class)
+tibble(Sensitivity = tp / (tp + fn),
+       Specificity = tn / (tn + fp),
+       Accuracy    = (tp + tn) / sum(conf),
+       PPV         = tp / (tp + fp),
+       NPV         = tn / (tn + fn))
+```
+
+
+---------------------
+
+
+Created by [Rmarkdown](https://github.com/rstudio/rmarkdown)
+(v`r utils::packageVersion("rmarkdown")`) and `r R.version$version.string`.
diff --git a/vignettes/figures/.gitignore b/vignettes/figures/.gitignore
new file mode 100644
index 0000000..506eee3
--- /dev/null
+++ b/vignettes/figures/.gitignore
@@ -0,0 +1,2 @@
+*.png
+*.html
diff --git a/vignettes/linear-regression.Rmd b/vignettes/linear-regression.Rmd
new file mode 100644
index 0000000..dd38450
--- /dev/null
+++ b/vignettes/linear-regression.Rmd
@@ -0,0 +1,171 @@
+---
+title: "Linear Regression"
+author: "Stu Field, SomaLogic Operating Co., Inc."
+output:
+  rmarkdown::html_vignette:
+    fig_caption: yes
+vignette: >
+  %\VignetteIndexEntry{Linear Regression}
+  %\VignetteEngine{knitr::rmarkdown}
+  %\VignetteEncoding{UTF-8}
+---
+
+```{r setup, include = FALSE}
+library(SomaDataIO)
+library(ggplot2)
+library(dplyr)
+library(tidyr)
+library(purrr)
+knitr::opts_chunk$set(
+  echo = TRUE,
+  collapse = TRUE,
+  comment = "#>",
+  fig.path = "figures/linear-reg-"
+)
+```
+
+
+--------------
+
+
+## Regression of Continuous Variables
+
+Although targeted statistical analyses are beyond the scope of 
+the `SomaDataIO` package, below is an example analysis
+that typical users/customers would perform on 'SomaScan' data.
+
+It is not intended to be a definitive guide to statistical
+analysis, and packages already exist in the `R` ecosystem that perform
+parts or extensions of these techniques. Many variations of the workflow
+below exist; however, the framework highlights how one could perform standard
+_preliminary_ analyses on 'SomaScan' data.
+
+
+## Data Preparation
+```{r data-prep}
+# `example_data` ships with the package
+dim(example_data)
+
+table(example_data$SampleType)
+
+# standardize a numeric vector: subtract its mean, then divide by its SD
+cs <- function(.x) {
+  centered <- .x - mean(.x)
+  centered / sd(centered)
+}
+
+# build the analysis data set
+cleanData <- example_data |>
+  filter(SampleType == "Sample") |>               # keep study samples only
+  drop_na(Sex) |>                                 # drop rows with missing Sex
+  log10() |>                                      # log10 via `soma_adat` Math generic
+  mutate(Group = as.numeric(factor(Sex)) - 1) |>  # encode Sex as 0/1
+  modify_at(getAnalytes(example_data), cs)        # standardize each analyte column
+
+table(cleanData$Sex)
+
+table(cleanData$Group)    # F = 0; M = 1
+```
+
+
+## Set up Train/Test Data
+
+```{r train-test}
+# idx = row indices of the training set; the remaining 50 rows are the hold-out
+# seed resulting in 50/50 class balance
+idx   <- withr::with_seed(
+  3, sample(seq_len(nrow(cleanData)), size = nrow(cleanData) - 50)
+)
+train <- cleanData[idx, ]
+test  <- cleanData[-idx, ]
+
+# assert no overlap: no row name may appear in both sets
+length(intersect(rownames(train), rownames(test))) == 0L
+```
+
+
+## Linear Regression
+We use the `cleanData`, `train`, and `test` data objects from above.
+
+### Predict Age
+```{r linreg-tbl}
+LinR_tbl <- getAnalyteInfo(train) |>                # `train` from above
+  select(AptName, SeqId, Target = TargetFullName, EntrezGeneSymbol, UniProt) |>
+  mutate(
+    formula = map(AptName, ~ as.formula(paste("Age ~", .x))),  # one single-analyte formula each
+    model   = map(formula, ~ lm(.x, data = train, model = FALSE)),  # fit linear models
+    slope   = map_dbl(model, ~ coef(.x)[2L]),       # pull out B_1
+    p.value = map2_dbl(model, AptName, ~ {
+      summary(.x)$coefficients[.y, "Pr(>|t|)"] }),  # pull out p-values
+    fdr     = p.adjust(p.value, method = "BH")      # FDR for multiple testing
+  ) |>
+  arrange(p.value) |>            # re-order by `p-value`
+  mutate(rank = row_number())    # add numeric ranks
+
+LinR_tbl
+```
+
+
+### Fit Model | Calculate Performance
+Fit an 8-marker model with the top 8 features from `LinR_tbl`:
+
+```{r linreg-fit}
+feats <- head(LinR_tbl$AptName, 8L)
+form  <- as.formula(paste("Age ~", paste(feats, collapse = "+")))
+fit   <- lm(form, data = train, model = FALSE)
+n     <- nrow(test)
+p     <- length(feats)
+
+# Hold-out predictions and their signed errors
+res   <- tibble(
+  true_age   = test$Age,
+  pred_age   = predict(fit, newdata = test),
+  pred_error = pred_age - true_age
+)
+
+# Lin's concordance correlation coefficient of `x` and `y`;
+# sensitive to shifts in both location and scale
+linCCC <- function(x, y) {
+  stopifnot(length(x) == length(y))
+  numer <- 2 * cor(x, y) * sd(x) * sd(y)
+  denom <- var(x) + var(y) + (mean(x) - mean(y))^2
+  numer / denom
+}
+
+# Performance on the hold-out set (`n` samples, `p` predictors)
+tibble(
+  rss  = sum(res$pred_error^2),                 # sum of squared prediction errors
+  tss  = sum((test$Age - mean(test$Age))^2),    # sum of squares about the mean
+  rsq  = 1 - (rss / tss),                       # R-squared
+  rsqadj = max(0, 1 - (1 - rsq) * (n - 1) / (n - p - 1)), # adjusted R-squared, floored at 0
+  R2   = stats::cor(res$true_age, res$pred_age)^2,        # squared Pearson correlation
+  MAE  = mean(abs(res$pred_error)),             # mean absolute error
+  RMSE = sqrt(mean(res$pred_error^2)),          # root mean squared error
+  CCC  = linCCC(res$true_age, res$pred_age)     # Lin's CCC
+)
+```
+
+
+### Visualize Concordance
+```{r linreg-plot, fig.width = 7, fig.height = 7, fig.align = "center"}
+# shared range so both axes cover the same interval
+lims <- range(res$true_age, res$pred_age)
+ggplot(res, aes(x = true_age, y = pred_age)) +
+  geom_point(colour = "#24135F", alpha = 0.5, size = 4) +
+  expand_limits(x = lims, y = lims) +
+  geom_abline(slope = 1, colour = "black") +    # unit line
+  geom_rug(colour = "#286d9b", linewidth = 0.2) +
+  labs(x = "Actual Age", y = "Predicted Age",
+       title = "Concordance in Predicted vs. Actual Age") +
+  theme(plot.title   = element_text(size = 21, face = "bold"),
+        axis.title.x = element_text(size = 14),
+        axis.title.y = element_text(size = 14))
+```
+
+
+
+---------------------
+
+
+Created by [Rmarkdown](https://github.com/rstudio/rmarkdown)
+(v`r utils::packageVersion("rmarkdown")`) and `r R.version$version.string`.
diff --git a/vignettes/loading-and-wrangling.Rmd b/vignettes/loading-and-wrangling.Rmd
new file mode 100644
index 0000000..732ebd7
--- /dev/null
+++ b/vignettes/loading-and-wrangling.Rmd
@@ -0,0 +1,176 @@
+---
+title: "Loading and Wrangling 'SomaScan'"
+author: "Stu Field, SomaLogic Operating Co., Inc."
+output:
+  rmarkdown::html_vignette:
+    fig_caption: yes
+vignette: >
+  %\VignetteIndexEntry{Loading and Wrangling 'SomaScan'}
+  %\VignetteEngine{knitr::rmarkdown}
+  %\VignetteEncoding{UTF-8}
+---
+
+```{r setup, echo = FALSE, results = FALSE, message = FALSE}
+options(width = 80)
+#Sys.setlocale("LC_COLLATE", "C")
+Sys.setlocale("LC_COLLATE", "en_US.UTF-8") # ensure common sorting envir
+library(SomaDataIO)
+knitr::opts_chunk$set(
+  echo = TRUE,
+  collapse = TRUE,
+  comment = "#>",
+  fig.path = "figures/wrangling-"
+)
+```
+
+
+## Loading an ADAT
+
+```{r read-adat}
+# Path to the example ADAT file shipped with the package
+f <- system.file("extdata", "example_data10.adat",
+                 package = "SomaDataIO", mustWork = TRUE)
+my_adat <- read_adat(f)
+is.soma_adat(my_adat)
+
+# S3 print method forwards -> tibble
+my_adat
+
+print(my_adat, show_header = TRUE)  # if you simply wish to see Header info
+
+# S3 summary method
+# View Target and summary statistics for the last 3 columns
+seqs <- tail(names(my_adat), 3L)
+summary(my_adat[, seqs])
+
+# Summarize the same columns separately for each level of Sex
+my_adat[, seqs] |>
+  split(my_adat$Sex) |>
+  lapply(summary)
+```
+
+
+---------------------
+
+
+## Wrangling
+### Attributes Contain File and Feature Information
+```{r atts}
+names(attributes(my_adat))
+
+# The `Col.Meta` attribute contains
+# target annotation information
+attr(my_adat, "Col.Meta")
+```
+
+### Analyte Features (`seq.xxxx.xx`)
+```{r feats}
+getAnalytes(my_adat) |> head(20L)    # first 20 analytes; see AptName above
+getAnalytes(my_adat) |> length()     # how many analytes
+getAnalytes(my_adat, n = TRUE)       # the `n` argument; no. analytes
+```
+
+### Feature Data
+The `getAnalyteInfo()` function creates a lookup table that links
+analyte feature names in the `soma_adat` object to the annotation
+data in `?Col.Meta` via the common index-key, `AptName`, in column 1:
+
+```{r annotations}
+getAnalyteInfo(my_adat)
+```
+
+
+### Clinical Data
+```{r meta}
+getMeta(my_adat)             # clinical meta data for each sample
+getMeta(my_adat, n = TRUE)   # also an `n` argument
+```
+
+
+### ADAT structure
+
+The `soma_adat` object also contains specific structures that are useful
+to users. Please also see `?colmeta` or `?annotations` for further
+details about these fields.
+
+---------------------
+
+
+
+### Group Generics
+You may perform basic mathematical transformations on the feature data _only_
+with special `soma_adat` S3 methods (see `?groupGenerics`):
+
+```{r group-generics}
+head(my_adat$seq.2429.27)    # one analyte column before any transform
+
+logData <- log10(my_adat)    # a typical log10() transform
+head(logData$seq.2429.27)
+
+roundData <- round(my_adat)
+head(roundData$seq.2429.27)
+
+sqData <- sqrt(my_adat)
+head(sqData$seq.2429.27)
+
+antilog(1:4)  # presumably the inverse of log10(); see `?antilog` to confirm
+
+sum(my_adat < 100)  # count of values below 100 (low signalling values)
+
+all.equal(my_adat, sqrt(my_adat^2))  # checks that sqrt() undoes squaring
+
+all.equal(my_adat, antilog(log10(my_adat)))  # checks that antilog() undoes log10()
+```
+
+
+#### Math Generics
+
+```{r math}
+getGroupMembers("Math")
+
+getGroupMembers("Compare")
+
+getGroupMembers("Arith")
+
+getGroupMembers("Summary")
+```
+
+
+### Full Complement of [dplyr](https://dplyr.tidyverse.org) S3 Methods
+
+The `soma_adat` also comes with numerous class specific methods to the most
+popular [dplyr](https://dplyr.tidyverse.org) generics that make working
+with `soma_adat` objects simpler for those familiar with this standard toolkit:
+
+```{r dplyr}
+dim(my_adat)
+males <- dplyr::filter(my_adat, Sex == "M")
+dim(males)
+
+males |>
+  dplyr::select(SampleType, SampleMatrix, starts_with("NormScale"))
+```
+
+
+### Available S3 Methods `soma_adat`
+```{r methods}
+# see full complement of `soma_adat` methods
+methods(class = "soma_adat")
+```
+
+
+---------------------
+
+## Writing a `soma_adat`
+```{r write}
+is_intact_attr(my_adat)   # MUST have intact attrs
+
+write_adat(my_adat, file = tempfile("my-adat-", fileext = ".adat"))
+```
+
+
+---------------------
+
+
+Created by [Rmarkdown](https://github.com/rstudio/rmarkdown)
+(v`r utils::packageVersion("rmarkdown")`) and `r R.version$version.string`.
diff --git a/vignettes/two-group-comparison.Rmd b/vignettes/two-group-comparison.Rmd
new file mode 100644
index 0000000..f443aed
--- /dev/null
+++ b/vignettes/two-group-comparison.Rmd
@@ -0,0 +1,155 @@
+---
+title: "Two-Group Comparison"
+author: "Stu Field, SomaLogic Operating Co., Inc."
+output:
+  rmarkdown::html_vignette:
+    fig_caption: yes
+vignette: >
+  %\VignetteIndexEntry{Two-Group Comparison}
+  %\VignetteEngine{knitr::rmarkdown}
+  %\VignetteEncoding{UTF-8}
+---
+
+```{r setup, include = FALSE}
+library(SomaDataIO)
+library(ggplot2)
+library(dplyr)
+library(tidyr)
+library(purrr)
+knitr::opts_chunk$set(
+  echo = TRUE,
+  collapse = TRUE,
+  comment = "#>",
+  fig.path = "figures/two-group-"
+)
+```
+
+
+--------------
+
+
+## Differential Expression via *t*-test
+
+Although targeted statistical analyses are beyond the scope of 
+the `SomaDataIO` package, below is an example analysis
+that typical users/customers would perform on 'SomaScan' data.
+
+It is not intended to be a definitive guide to statistical
+analysis, and packages already exist in the `R` ecosystem that perform
+parts or extensions of these techniques. Many variations of the workflow
+below exist; however, the framework highlights how one could perform standard
+_preliminary_ analyses on 'SomaScan' data.
+
+
+## Data Preparation
+```{r data-prep}
+# `example_data` ships with the package
+dim(example_data)
+
+table(example_data$SampleType)
+
+# standardize a numeric vector: subtract its mean, then divide by its SD
+cs <- function(.x) {
+  centered <- .x - mean(.x)
+  centered / sd(centered)
+}
+
+# build the analysis data set
+cleanData <- example_data |>
+  filter(SampleType == "Sample") |>               # keep study samples only
+  drop_na(Sex) |>                                 # drop rows with missing Sex
+  log10() |>                                      # log10 via `soma_adat` Math generic
+  mutate(Group = as.numeric(factor(Sex)) - 1) |>  # encode Sex as 0/1
+  modify_at(getAnalytes(example_data), cs)        # standardize each analyte column
+
+table(cleanData$Sex)
+
+table(cleanData$Group)    # F = 0; M = 1
+```
+
+
+
+## Compare Two Groups (`M`/`F`)
+### Get annotations via `getAnalyteInfo()`:
+
+```{r get-anno}
+t_tests <- getAnalyteInfo(cleanData) |>
+  select(AptName, SeqId, Target = TargetFullName, EntrezGeneSymbol, UniProt)
+
+# Feature data info:
+#   Subset via dplyr::filter(t_tests, ...) here to
+#   restrict analysis to only certain analytes
+t_tests
+```
+
+
+
+### Calculate `t-tests`
+Use a "list columns" approach via a nested tibble object
+using `dplyr`, `purrr`, and `stats::t.test()`:
+
+```{r t-tests}
+t_tests <- t_tests |>
+  mutate(
+    formula = map(AptName, \(apt) as.formula(paste(apt, "~ Sex"))),   # <analyte> ~ Sex
+    t_test  = map(formula, \(f) stats::t.test(f, data = cleanData)),  # one t-test per analyte
+    t_stat  = map_dbl(t_test, "statistic"),            # extract the t-statistic
+    p.value = map_dbl(t_test, "p.value"),              # extract the p-value
+    fdr     = p.adjust(p.value, method = "BH")         # Benjamini-Hochberg adjustment
+  ) |>
+  arrange(p.value) |>            # smallest p-value first
+  mutate(rank = row_number())    # rank = position after sorting
+
+# Print the resulting analysis tibble
+t_tests
+```
+
+
+
+### Visualize with `ggplot2`
+Create a plotting tibble in the "long" format for `ggplot2`:
+
+```{r ggplot-data}
+target_map <- head(t_tests, 12L) |>     # mapping table (first 12 rows of `t_tests`)
+  select(AptName, Target)               # AptName -> Target
+
+plot_tbl <- example_data |>
+  filter(SampleType == "Sample") |>     # rm control samples
+  drop_na(Sex) |>                       # rm NAs if present
+  log10() |>                            # log10-transform for plotting
+  select(Sex, target_map$AptName) |>    # top 12 analytes
+  pivot_longer(cols = -Sex, names_to = "AptName", values_to = "RFU") |>
+  dplyr::left_join(target_map, by = "AptName") |>
+  # order factor levels by 't_tests' rank to order plots below
+  mutate(Target = factor(Target, levels = target_map$Target))
+
+plot_tbl
+```
+
+```{r seed, include = FALSE}
+# seed for ggplot2::geom_jitter() so the figure is stable and Git isn't triggered every time
+set.seed(1)
+```
+
+```{r ggplot-boxes, fig.width = 7, fig.height = 7, fig.align = "center"}
+# one panel per target; individual points are jittered over the boxplots
+ggplot(plot_tbl, aes(x = Sex, y = RFU, fill = Sex)) +
+  geom_boxplot(alpha = 0.5, outlier.shape = NA) +
+  scale_fill_manual(values = c("#24135F", "#00A499")) +
+  geom_jitter(shape = 16, width = 0.1, alpha = 0.5) +
+  facet_wrap(~ Target) +
+  labs(title = "Boxplots of Top Analytes by t-test",
+       y     = "log10(RFU)") +
+  theme(
+    plot.title      = element_text(size = 21, face = "bold"),
+    axis.title.x    = element_text(size = 14),
+    axis.title.y    = element_text(size = 14),
+    legend.position = "top")
+```
+
+
+---------------------
+
+
+Created by [Rmarkdown](https://github.com/rstudio/rmarkdown)
+(v`r utils::packageVersion("rmarkdown")`) and `r R.version$version.string`.