Merge pull request #27 from menger5/pdfparser

Pull pdf parser docker into the master branch
RTIInternational · Feb 5, 2024 · 3f549d5 · 3f549d5
2 parents b11abd8 + 5c15d94
commit 3f549d5
Show file tree

Hide file tree

Showing 3 changed files with 394 additions and 0 deletions.
diff --git a/pdf_parse_r/v1/Dockerfile b/pdf_parse_r/v1/Dockerfile
@@ -0,0 +1,64 @@
+#----------------------------------------------------------------
+# Use the official R image as a base
+#----------------------------------------------------------------
+FROM r-base:4.3.2
+
+#----------------------------------------------------------------
+# Container Metadata
+#----------------------------------------------------------------
+# base.image records the exact tag used in the FROM instruction above
+# (the tag is "4.3.2", without a "v" prefix), so keep the two in sync.
+LABEL base.image="r-base:4.3.2"
+LABEL maintainer="Mike Enger <[email protected]>"
+LABEL description="PDF parser for cell viability outputs"
+LABEL software="R, pdftools, dplyr, openxlsx, getopt"
+LABEL software-website="https://www.r-project.org/ https://dplyr.tidyverse.org/ https://cran.r-project.org/web/packages/openxlsx/index.html https://cran.r-project.org/web/packages/pdftools/pdftools.pdf https://cran.r-project.org/web/packages/getopt/index.html"
+LABEL software.version="1.0.0"
+LABEL license="GPL-2 | GPL-3"
+LABEL about.tags="RMIP"
+
+# Use a UTF-8 locale for every process started in the container
+ENV LANG=C.UTF-8 LC_ALL=C.UTF-8
+
+#----------------------------------------------------------------
+# Install required command line tools and packages
+#----------------------------------------------------------------
+# Keep apt/dpkg from prompting during the build
+ENV DEBIAN_FRONTEND=noninteractive
+
+# Development headers needed to compile the R packages installed later.
+# autoremove is given -y so it cannot abort the build waiting for a
+# confirmation, and the apt package lists are deleted in the same layer
+# so they do not add to the image size.
+RUN apt-get -qq update && apt-get -y upgrade && \
+	apt-get install -y --no-install-recommends \
+	libxml2-dev \
+	libcurl4-openssl-dev \
+	libssl-dev \
+	libjpeg-dev \
+	libicu-dev \
+	libpq-dev \
+	libmysqlclient-dev \
+	libpoppler-cpp-dev && \
+	apt-get autoremove -y && \
+	apt-get clean && \
+	rm -rf /var/lib/apt/lists/*
+
+#----------------------------------------------------------------
+# Install R Packages
+#----------------------------------------------------------------
+ENV R_VERSION=4.3.2
+
+# Setup the CRAN to get the packages (HTTPS so downloads cannot be tampered with in transit)
+RUN echo "r <- getOption('repos'); r['CRAN'] <- 'https://cloud.r-project.org'; options(repos = r);" > ~/.Rprofile
+RUN Rscript -e "install.packages('getopt', dependencies = TRUE)"
+RUN Rscript -e "install.packages(c('RMySQL', 'RPostgreSQL', 'dplyr'), dependencies = TRUE)"
+# 'dependencies' is an argument of install.packages(), not an element of the package vector
+RUN Rscript -e "install.packages(c('xml2', 'curl', 'qpdf', 'pdftools'), dependencies = TRUE)"
+RUN Rscript -e "install.packages('openxlsx', dependencies = TRUE)"
+# install.packages() only warns when a package fails to install, so load every
+# package the parser needs: library() errors on a missing package and fails the
+# build. The banner goes through message() because a bare '#' starts an R comment
+# and would swallow the sessionInfo() call that follows it.
+RUN Rscript -e "library('getopt'); library('dplyr'); library('pdftools'); library('openxlsx'); message('##### R SESSION INFORMATION #####'); sessionInfo()"
+
+#----------------------------------------------------------------
+# Copy over analysis scripts
+#----------------------------------------------------------------
+# COPY is used for a plain local file; ADD's extra behaviour
+# (remote URLs, automatic archive extraction) is not needed here.
+COPY parse_viability_pdf.R /opt/parser/parse_viability_pdf.R
+
+#----------------------------------------------------------------
+# Set working dir
+#----------------------------------------------------------------
+WORKDIR /data/
+
+#----------------------------------------------------------------
+# Set default command or entrypoint if needed
+#----------------------------------------------------------------
+# This is only the default command: any command given after the image name
+# on `docker run` replaces it entirely, so script arguments must be passed
+# as a full command (e.g. /bin/bash -c "Rscript /opt/parser/parse_viability_pdf.R -i ...").
+CMD ["Rscript", "/opt/parser/parse_viability_pdf.R"]
diff --git a/pdf_parse_r/v1/README.md b/pdf_parse_r/v1/README.md
@@ -0,0 +1,129 @@
+# Cell Viability PDF Parser
+
+This Dockerfile sets up an environment for running an [Rscript](v1/parse_viability_pdf.R) that takes the PDF output from a cell viability assay run on the [ChemoMetec NucleoCounter NC-200](https://chemometec.com/nucleocounters/nc-200/) and exports a tab delimited file (1 row per sample).
+
+## Overview
+
+**Cell Viability**
+
+Cell viability assays are used to measure the physical and physiological health of cells in response to extracellular stimuli, chemical agents, or therapeutic treatments, or when determining optimal growth conditions in cell culture.
+
+For the Regenerative Medicine Innovation Project (RMIP), the In-Depth Cell Characterization Hub uses a NucleoCounter NC-200 to count the total number of viable versus dead cells in the sample. The output of this process is a standardized PDF file. 
+
+The purpose of this docker image is to run an Rscript that takes the PDF outputs generated by the IDCCH, extracts key values from each PDF, and exports a standardized, machine-readable, tab-delimited output.
+
+<br>
+
+## Usage
+The following command can be used to run the docker: 
+```
+docker pull rtibiocloud/pdf_parse_r:<tagname>
+docker run -it rtibiocloud/pdf_parse_r:<tagname> /bin/bash -c "Rscript /opt/parser/parse_viability_pdf.R --help"
+```
+
+Example Docker run command with volume mounting:
+```bash
+docker run --rm -v ${PWD}:/data -w /data rtibiocloud/pdf_parse_r:<tagname> /bin/bash -c " Rscript /opt/parser/parse_viability_pdf.R -i /data/example.pdf -p /data -o example.tsv -v"
+```
+
+If not running the docker from the directory with the data, replace `${PWD}` with the actual path on your host system with the PDF outputs.
+
+<br>
+
+## Build
+To build this Docker image, you can use the following command:
+```
+docker build --rm -t rtibiocloud/pdf_parse_r:<tagname> -f Dockerfile .
+```
+Here's what each part of the command does:
+
+- `docker build`: This command tells Docker to build an image.
+- `--rm`: This flag removes any intermediate containers that are created during the build process, helping to keep your system clean.
+- `-t rtibiocloud/pdf_parse_r:<tagname>`: The -t flag specifies the name and tag for the image. Replace `<tagname>` with the version you are building, for example `v1.0.0`.
+- `-f Dockerfile`: This flag specifies the Dockerfile to use for building the image. You can replace Dockerfile with the actual name of your Dockerfile if it's different.
+- `.`: The dot at the end of the command indicates that the build context is the current directory, where the Dockerfile is located.
+
+Running this command with the tag `v1.0.0` will build a Docker image with the name `rtibiocloud/pdf_parse_r:v1.0.0`. Make sure you are in the directory containing the Dockerfile you want to use for building the image.
+
+## Rscript Inputs
+| Short Flag | Long Flag | Description |
+|:-----:|:--------:|--------------------------------|
+|   -i  |  --pdf       | Path to the input PDF file                 |
+|   -o  |  --outfile   | Name of the output file                    |
+|   -p  |  --outpath   | Path to the output directory               |
+|   -E  |  --excel     | Export the results as an MS Excel Workbook |
+|   -v  |  --verbose   | Display verbose logging                    |
+|   -h  |  --help      | Display the function usage statement       |
+
+## Rscript Output
+A tab-delimited file with 1 row and 14 columns
+|     Column     | Description                                                                                           |
+|:--------------:|-------------------------------------------------------------------------------------------------------|
+|   Consortium   | The consortium, derived from the linker in the file name.                                             |
+|     Project    | A 3-digit code that is assigned to the study by the IDCCH, derived from the linker in the file name.  |
+|  Participant   | A unique deidentified identifier assigned to a participant by the RMIP investigator, derived from the linker in the file name.   |
+| Discriminator  | An alphabetic character assigned to a particular biospecimen collection event, derived from the linker in the file name.         |
+|   Identifier   | A numeric character assigned to a particular biospecimen collection event, derived from the linker in the file name.             |
+|     Vial       | An alphabetic character assigned to a specific biospecimen collection aliquot, derived from the linker in the file name.         |
+|     Date       | The date that the sample was processed.                                                               |
+|   Viability    | The proportion of live cells in the sample, reported as a percentage.                                 |
+|     Live       | The number of viable cells per milliliter of sample.                                                  |
+|     Dead       | The number of non-viable cells per milliliter of sample.                                              |
+|     Total      | The total number of cells assessed per milliliter of sample.                                          |
+|  Cell Diameter | The estimated cell diameter reported in units of micrometers.                                         |
+|  Cell Diameter stdev  |The standard deviation observed in the cell diameter measurements.                              |
+| pct aggregated |  The percentage of cells that occur in aggregates with five or more cells.                            |
+
+## Perform a testrun
+`docker run -v ${PWD}/example_files/:/data -t rtibiocloud/pdf_parse_r:v1.0.0 /bin/bash  -c "Rscript /opt/parser/parse_viability_pdf.R -i /data/example.pdf -p /data -o example.tsv"`
+
+<details>
+
+```
+Loading required package: getopt
+Loading required package: dplyr
+
+Attaching package: ‘dplyr’
+
+The following objects are masked from ‘package:stats’:
+
+    filter, lag
+
+The following objects are masked from ‘package:base’:
+
+    intersect, setdiff, setequal, union
+
+Loading required package: pdftools
+Using poppler version 22.12.0
+[2024-02-03 03:27:51.597269] - main - INFO - User: root
+[2024-02-03 03:27:51.601783] - main - INFO - Running from: 7592e68cb469
+[2024-02-03 03:27:51.602141] - main - INFO - Platform: x86_64-pc-linux-gnu (64-bit)
+[2024-02-03 03:27:51.608045] - main - INFO - R version: R version 4.3.2 (2023-10-31)
+[2024-02-03 03:27:51.608333] - main - INFO - R packages loaded: pdftools, dplyr, getopt
+[2024-02-03 03:27:51.612687] - main - INFO - Rscript: /opt/parser/parse_viability_pdf.R
+[2024-02-03 03:27:51.613006] - getopt - INFO - CommandLine: -i /data/example.pdf -p /data -o example.tsv
+[2024-02-03 03:27:51.61331] - getopt - INFO - Arguments: ARGS = character(0)
+[2024-02-03 03:27:51.61331] - getopt - INFO - Arguments: pdf = /data/example.pdf
+[2024-02-03 03:27:51.61331] - getopt - INFO - Arguments: outpath = /data
+[2024-02-03 03:27:51.61331] - getopt - INFO - Arguments: outfile = example.tsv
+[2024-02-03 03:27:51.61331] - getopt - INFO - Arguments: excel = FALSE
+[2024-02-03 03:27:51.61331] - getopt - INFO - Arguments: verbose = FALSE
+[2024-02-03 03:27:51.624912] - load_pdf - INFO - Reading in the PDF file
+[2024-02-03 03:27:51.629189] - load_pdf - INFO - PDF file /data/example.pdf processing complete
+[2024-02-03 03:27:51.649153] - main - INFO - Process began at 2024-02-03 03:27:51.167343 and finished at 2024-02-03 03:27:51.649318
+[2024-02-03 03:27:51.64963] - main - INFO - Finished
+```
+
+<br>
+
+```
+Outputs:
+- Tab separated table:                      /data/example.tsv
+```
+</details>
+
+<br>
+
+## Contact
+For additional information or assistance, please contact Mike Enger ([email protected]).
+
+#################################################################
diff --git a/pdf_parse_r/v1/parse_viability_pdf.R b/pdf_parse_r/v1/parse_viability_pdf.R
@@ -0,0 +1,201 @@
+rm(list=ls())
+init <- Sys.time(); timer <- proc.time();
+
+#-----------------------------------------------------
+# If you have a PDF from a cell counter, how to 
+# extract the cell count and viability of the sample
+#
+# Developer: Jeran Stratford
+# Project: RMIP
+# Date: 27OCT2023
+#
+# Version of the PDF that we're working with %PDF-1.4
+#
+# Revisions
+# v1.0 initial commit
+#
+#-----------------------------------------------------
+
+#-----------------------------------------------------
+# Load Required Packages
+#-----------------------------------------------------
+if(!require('getopt')){install.packages('getopt', dependencies = T); library(getopt)}
+if(!require('dplyr')){install.packages('dplyr', dependencies = T); library(dplyr)}
+if(!require('pdftools')){install.packages('pdftools', dependencies = T); library(pdftools)}
+
+#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+# Setup logging
+#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+add_to_log <- function(lvl, func, message){
+  # Write one log entry to stdout in the form
+  #   [<timestamp>] - <function> - <LEVEL> - <message>
+  #
+  # lvl     : severity label; it is upper-cased before printing
+  # func    : name of the function/section emitting the entry
+  # message : text to log; a vector produces one entry per element
+  stamp <- paste0("[", Sys.time(), "]")
+  fields <- list(stamp, func, toupper(lvl), message)
+  lines <- do.call(paste, c(fields, sep = " - "))
+  cat(paste0(lines, "\n"))
+}
+
+
+#-----------------------------------------------------
+# Setup global arguments and command line use
+#-----------------------------------------------------
+
+# Raw command line arguments (everything after the script name)
+argString <- commandArgs(trailingOnly = TRUE)
+
+# Usage statement printed for --help and when a required parameter is missing
+usage <- paste("Usage: parse_viability_pdf.r
+             -- Required Parameters --
+              [-i | --pdf]          <Path to the PDF> (Required)
+             -- Optional Parameters -- 
+              [-o | --outfile]      <The output file name> (default = pdf_extract.tsv)
+              [-p | --outpath]      <Path to the directory to save the outputs> (default = path of the input file)
+             -- Optional Flags --   
+              [-E | --excel]        <Export results as a MS Excel Workbook>(default=FALSE)
+              [-v | --verbose]      <Display verbose logging>(default=FALSE)
+             -- Help Flag --  
+              [-h | --help]             <Displays this help message>
+             Example:
+             parse_viability_pdf.r -i abc.pdf -v
+              \n",sep="")
+
+# getopt specification: long flag, short flag, argument mask, data type
+#0=no-arg, 1=required-arg, 2=optional-arg
+spec <- matrix(c(
+          'excel',    'E', 0, "logical",
+          'pdf',      'i', 1, "character",
+          'outfile',  'o', 2, "character",
+          'outpath',  'p', 2, "character",
+          'verbose',  'v', 0, "logical",
+          'help',     'h', 0, "logical"
+          ), byrow=TRUE, ncol=4);
+
+
+args <- getopt(spec, argString)
+
+# Asking for help is a successful request: show the usage and exit with status 0
+if (!is.null(args$help)) {
+  add_to_log(lvl="info", func="getopt", message = usage)
+  q(save="no", status=0, runLast=FALSE)
+}
+
+# A missing input PDF is a usage error: show the usage and exit with status 1
+if (is.null(args$pdf)) {
+  add_to_log(lvl="error", func="getopt", message = "\nYou are missing a required parameter: pdf\n\n")
+  add_to_log(lvl="error", func="getopt", message = usage)
+  q(save="no", status=1, runLast=FALSE)
+}
+
+# The default output suffix follows the export format; openxlsx is only
+# loaded when an Excel export was requested.
+suffix <- '.tsv'
+if (is.null(args$excel)) {
+  args$excel <- FALSE
+} else {
+  if(!require('openxlsx')){install.packages('openxlsx', dependencies = TRUE); library(openxlsx)}
+  suffix <- '.xlsx'
+}
+
+# Fill in defaults for every optional parameter that was not supplied
+if (is.null(args$outfile)) {args$outfile <- paste0("pdf_extract", suffix)}
+if (is.null(args$outpath)) {args$outpath <- dirname(args$pdf)}
+if (is.null(args$verbose)) {args$verbose <- FALSE}
+
+#-----------------------------------------------------
+# Required Functions
+#-----------------------------------------------------
+load_pdf <- function(fname){
+  # Read the text of a PDF and split it into trimmed lines.
+  #
+  # fname : path to the PDF file
+  #
+  # Returns a list with one element per element returned by pdf_text()
+  # (presumably one per page -- see the pdftools documentation); each element
+  # is a character vector of that element's lines with surrounding whitespace
+  # removed.
+  #
+  # Warnings raised while reading are logged and the read continues.
+  # If the file cannot be read, or yields no text, the failure is logged and
+  # an error naming the file is raised, rather than passing NA/NULL on to
+  # strsplit() and failing there with an unrelated message.
+
+  pages <- tryCatch(
+    withCallingHandlers(
+      {
+        add_to_log(lvl="info", func="load_pdf", message = "Reading in the PDF file")
+        pdf_text(fname)
+      },
+      warning=function(cond) {
+        add_to_log(lvl="warn", func="load_pdf", message = paste("Warning while reading in pdf file:", basename(fname)))
+        add_to_log(lvl="warn", func="load_pdf", message = "Original warning message:")
+        add_to_log(lvl="warn", func="load_pdf", message = conditionMessage(cond))
+        # Resume the read instead of discarding the extracted text
+        invokeRestart("muffleWarning")
+      }
+    ),
+    error=function(cond) {
+      add_to_log(lvl="error", func="load_pdf", message = paste("Error reading in pdf file:", basename(fname)))
+      add_to_log(lvl="error", func="load_pdf", message = "Original error message:")
+      add_to_log(lvl="error", func="load_pdf", message = conditionMessage(cond))
+      NULL
+    },
+    finally={
+      add_to_log(lvl="info", func="load_pdf", message = paste("PDF file", fname, "processing complete"))
+    }
+  )
+
+  if (is.null(pages) || length(pages) == 0) {
+    stop("Unable to extract any text from PDF file: ", fname, call. = FALSE)
+  }
+
+  lapply(strsplit(pages, split = "\n"), trimws)
+}
+
+extract_value <- function(findme, lns = txt){
+  # Find a labelled measurement on the first page and return its value.
+  #
+  # findme : the label that precedes the value. Only its first
+  #          space-delimited word is used as the search pattern (anchored to
+  #          the start of the line, case-insensitive), so that word may carry
+  #          regex escapes such as "\\(%\\)".
+  # lns    : list of character vectors of text lines; only lns[[1]] is
+  #          searched. Defaults to the global `txt`.
+  #
+  # Returns the first whitespace-delimited token after the label as a
+  # character string, NA if no line matches, and NA if nothing follows the
+  # label on the matched line.
+  page <- lns[[1]]
+
+  token <- strsplit(trimws(findme), " ")[[1]][1]
+  idx <- grep(paste0("^", token), page, perl = TRUE, ignore.case = TRUE)
+
+  if (length(idx) == 0) {
+    # Could not find it
+    return(NA)
+  }
+
+  # Drop regex escapes to recover the label as it is printed on the line;
+  # measuring the escaped string would start the value too far to the right.
+  literal <- gsub("\\\\(.)", "\\1", findme)
+
+  # Several lines can share the same first word. Prefer a line that starts
+  # with the whole label, then keep a single line so the result is scalar.
+  whole_label <- startsWith(tolower(page[idx]), tolower(trimws(literal)))
+  if (any(whole_label)) {
+    idx <- idx[whole_label]
+  }
+  line <- page[idx[1]]
+
+  # Everything after the label, with runs of spaces collapsed
+  rest <- trimws(substr(line, start = nchar(literal) + 1, stop = nchar(line)))
+  strsplit(gsub("[ ]+", " ", rest), split = " ")[[1]][1]
+}
+
+#-----------------------------------------------------
+# Main
+#----------------------------------------------------- 
+#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+# Logging information
+#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+# Record the execution environment and the parsed arguments
+add_to_log(lvl = "info", func="main", message=paste0("User: ", Sys.info()[['effective_user']]))
+add_to_log(lvl = "info", func="main", message=paste0("Running from: ", Sys.info()[['nodename']]))
+add_to_log(lvl = "info", func="main", message=paste0("Platform: ", sessionInfo()["platform"]))
+add_to_log(lvl = "info", func="main", message=paste0("R version: ", R.version.string ))
+add_to_log(lvl = "info", func="main", message=paste0("R packages loaded: ",  paste(names(sessionInfo()$otherPkgs), collapse=", ")))
+add_to_log(lvl = "info", func="main", message=paste0("Rscript: ", gsub("--file=", "", grep(pattern = "^--file", commandArgs(trailingOnly = FALSE), value = TRUE))))
+add_to_log(lvl = "info", func="getopt", message=paste0("CommandLine: ", paste(commandArgs(trailingOnly = TRUE), collapse=" ")))
+add_to_log(lvl = "info", func="getopt", message=paste0("Arguments: ", paste(names(args), args, sep=" = ")))
+
+txt <- load_pdf(fname = args$pdf)
+
+# The sample identifiers are read from the second-to-last line of page 1,
+# so that page needs at least two lines of text.
+if (length(txt) < 1 || length(txt[[1]]) < 2) {
+  add_to_log(lvl = "error", func = "main", message = paste("Page 1 of", basename(args$pdf), "has fewer than two lines of text; cannot locate the sample identifiers"))
+  q(save = "no", status = 1, runLast = FALSE)
+}
+
+# The first space-delimited token of that line is taken, a trailing ".pdf"
+# is removed, and the remainder is split on "-": field 1 is the date and
+# field 4 is the linker (presumably the line starts with the PDF file name --
+# confirm against real instrument output).
+page_one <- txt[[1]]
+linker_line <- page_one[length(page_one) - 1]
+linker_file <- strsplit(linker_line, " ")[[1]][1]
+id <- strsplit(gsub("\\.pdf$", "", linker_file, perl = TRUE, ignore.case = TRUE), split = "-")[[1]]
+date <- id[1]
+prefix <- id[4]
+
+if (is.na(prefix)) {
+  add_to_log(lvl = "warn", func = "main", message = "Could not parse the sample linker from the PDF; the identifier columns will be NA")
+}
+
+viable <- extract_value("Viability (%)")
+live <- extract_value("Live (cells/ml)")
+dead <- extract_value("Dead (cells/ml)")
+total <- extract_value("Total (cells/ml)")
+diameter <- extract_value("Estimated cell diameter (um)")
+diameter_stdev <- extract_value("Cell diameter standard deviation (um)")
+agg <- extract_value("\\(%\\) of cells in aggregates with five or more cells")
+
+# One row per PDF. The identifier columns are fixed-width slices of the linker.
+# data.frame() is left at its default check.names = TRUE, so the column names
+# containing spaces are written with dots (e.g. cell.diameter).
+final <- data.frame("Consortium" = substr(prefix, 1, 4),
+                    "Project" = substr(prefix, 5, 7),
+                    "Participant" = substr(prefix, 8, 10),
+                    "Discriminator" = substr(prefix, 11, 11),
+                    "Identifier" = substr(prefix, 12, 14),
+                    "Vial" = substr(prefix, 15, 15),
+                    "Date" = date,
+                    "Viability" = viable,
+                    "Live" = live,
+                    "Dead" = dead,
+                    "Total" = total,
+                    "cell diameter" = diameter,
+                    "cell diameter stdev" = diameter_stdev,
+                    "pct aggregated" = agg,
+                    stringsAsFactors = FALSE)
+
+# Export as an Excel workbook when -E/--excel was given, otherwise as a tab-delimited file
+if (args$excel){
+  write.xlsx(x = final, file = file.path(args$outpath, args$outfile))
+} else {
+  write.table(x = final, file = file.path(args$outpath, args$outfile), row.names = FALSE, col.names = TRUE, sep = '\t', quote = FALSE)
+}
+
+#-----------------------------------------------------
+# Close out the script
+#-----------------------------------------------------
+add_to_log(lvl="info", func="main", message = paste0("Process began at ", init, " and finished at ", Sys.time(), "\n"))
+add_to_log(lvl="info", func="main", message = "Finished\n")