Rework design table and L1 column structure #124

Merged
merged 13 commits on Apr 4, 2024
2 changes: 2 additions & 0 deletions synoptic/L0.qmd
@@ -134,6 +134,7 @@ f <- function(fn, new_dir) {
new_fqfn <- file.path(new_dir, new_fn)
message("\tWriting ", new_fqfn)
write.csv(dat_long, new_fqfn, row.names = FALSE)
rm(dat_long)
# ...and move to 'Raw_done' folder
if(params$remove_input_files) {
message("\tArchiving raw input files")
@@ -192,3 +193,4 @@ Git commit `r GIT_COMMIT`.
```{r reproducibility}
sessionInfo()
```
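A quick note on the `rm(dat_long)` addition above: dropping the reference to a large intermediate inside a per-file loop lets R reclaim that memory before the next file is read. A minimal sketch of the pattern, with hypothetical names (`process_one` is a placeholder, not this repo's code):

```r
# Sketch: free large intermediates promptly in a per-file loop
process_files <- function(files, process_one) {
  for (f in files) {
    dat <- read.csv(f)
    out <- process_one(dat)
    write.csv(out, sub("\\.csv$", "_out.csv", f), row.names = FALSE)
    rm(dat, out)  # drop references so the garbage collector can reclaim them
  }
}
```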

57 changes: 26 additions & 31 deletions synoptic/L1.qmd
@@ -9,6 +9,8 @@ params:
L1_NORMALIZE: "L1_normalize/"
L1: "L1/"
L1_METADATA: "L1_metadata/"
METADATA_VARS_TABLE: "L1_metadata_variables.csv"
METADATA_COLUMNS_TABLE: "L1_metadata_columns.csv"
L1_VERSION: "???"
debug: false
remove_input_files: false
@@ -43,6 +45,13 @@ if(packageVersion("compasstools") < "0.2") {

source("helpers.R")

# Get the column metadata file
column_md <- read.csv(file.path(params$METADATA_ROOT,
params$L1_METADATA,
params$METADATA_COLUMNS_TABLE),
comment.char = "#",
stringsAsFactors = FALSE)

# Open the flag database (in the data, not metadata, folder)
source("flag-database.R")
fdb_open(params$DATA_ROOT, init = TRUE)
@@ -55,6 +64,8 @@ L1 <- file.path(params$DATA_ROOT, params$L1)

I see `r length(dirs_to_process)` directories to process in `r L1_NORMALIZE`.

L1 column metadata has `r nrow(column_md)` entries.

Output directory is `r L1`.

HTML outfile is "`r params$html_outfile`".
@@ -72,7 +83,7 @@ f <- function(dir_name, dirs_to_process, out_dir) {
# everything can be stacked into a single data frame
dat_raw <- read_csv_group(d,
remove_input_files = params$remove_input_files,
col_types = "cccccTdcccdii")
col_types = "cccccTdcccccccdii")

message("\tTotal data: ", nrow(dat_raw), " rows, ", ncol(dat_raw), " columns")

@@ -85,18 +96,17 @@ f <- function(dir_name, dirs_to_process, out_dir) {
Files = length(d),
Rows = nrow(dat),
NA_rows = sum(is.na(dat$value)))

# Make sure Plot (if present) and TIMESTAMP columns are on the left
lefts <- intersect(c("Plot", "TIMESTAMP"), colnames(dat))
rights <- setdiff(colnames(dat), lefts)
dat <- dat[c(lefts, rights)]
# Remove unneeded columns if present

site <- dat$Site[1]
dat <- dat[setdiff(colnames(dat),
c("Site", "Logger", "Table",
"value_raw", "units", "loggernet_variable"))]
# And finally, sort
dat <- dat[order(dat$TIMESTAMP, dat$design_link),]

# Order columns following the column metadata...
if(!all(column_md$Column %in% colnames(dat))) {
stop("Column metadata file ", params$METADATA_COLUMNS_TABLE,
" has entries not in data: ", setdiff(column_md$Column, colnames(dat)))
}
dat <- dat[column_md$Column]
# ...and sort rows
dat <- dat[order(dat$TIMESTAMP, dat$Plot),]

write_to_folders(dat,
root_dir = out_dir,
@@ -113,7 +123,8 @@ f <- function(dir_name, dirs_to_process, out_dir) {
message("Sending ", nrow(flg), " flags to database")
fdb_add_flags(site, flg)
}


rm(dat)
return(smry)
}

@@ -160,16 +171,10 @@ if(!file.exists(template_file)) {
}
L1_metadata_template <- readLines(template_file)

# Get the column metadata file
col_md <- read.csv(file.path(params$METADATA_ROOT,
params$L1_METADATA,
"L1_metadata_columns.csv"))
col_md_for_insert <- paste(sprintf("%-15s", col_md$Column), col_md$Description)
col_md_for_insert <- paste(sprintf("%-15s", column_md$Column), column_md$Description)

# Get the variable metadata
var_md <- read.csv(file.path(params$METADATA_ROOT,
params$L1_METADATA,
"L1_metadata_vars.csv"))
var_md <- read.csv(file.path(params$METADATA_ROOT, params$L1_METADATA, params$METADATA_VARS_TABLE))
var_md_for_insert <- paste(sprintf("%-20s", c("research_name", var_md$research_name)),
sprintf("%-12s", c("Sensor", var_md$sensor)),
sprintf("%-10s", c("Units", var_md$final_units)),
@@ -204,16 +209,6 @@ for(dd in data_dirs) {
file_info <- c()
# Build up information about files...
for(f in files) {
# Ensure that the file headers match our column metadata
file_headers <- colnames(read.csv(f, nrows = 0))
if(!identical(sort(col_md$Column), sort(file_headers))) {
stop("File ", basename(f),
" headers don't match column metadata.",
"\nColumns in metadata but not in data: ",
paste(setdiff(col_md$Column, file_headers), collapse = ", "),
"\nColumns in data but not in metdata: ",
paste(setdiff(file_headers, col_md$Column), collapse = ", "))
}
fdata <- readLines(f) # just for a quick line count
file_info <- c(file_info,
basename(f),
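The column-ordering change in this file replaces ad hoc reordering with a single source of truth: L1_metadata_columns.csv now drives both the output column order and a consistency check. A hedged sketch of the same idea with toy data (`order_by_metadata` is illustrative, not this repo's function):

```r
# Sketch: order a data frame's columns by a metadata table, failing
# loudly if the metadata lists a column the data doesn't have
order_by_metadata <- function(dat, column_md) {
  missing_in_data <- setdiff(column_md$Column, colnames(dat))
  if (length(missing_in_data) > 0) {
    stop("Metadata columns not in data: ",
         paste(missing_in_data, collapse = ", "))
  }
  dat[column_md$Column]  # also drops columns not listed in the metadata
}

column_md <- data.frame(Column = c("Plot", "TIMESTAMP", "value"))
dat <- data.frame(value = 1.5, TIMESTAMP = Sys.time(), Plot = "UP", ID = "x")
order_by_metadata(dat, column_md)  # columns come back as Plot, TIMESTAMP, value
```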
47 changes: 26 additions & 21 deletions synoptic/L1_normalize.qmd
@@ -9,7 +9,7 @@ params:
L0: "L0/"
L1_NORMALIZE: "L1_normalize/"
DESIGN_TABLE: "design_table.csv"
METADATA_VARS_TABLE: "L1_metadata/L1_metadata_vars.csv"
METADATA_VARS_TABLE: "L1_metadata/L1_metadata_variables.csv"
OOS: "out-of-service/"
debug: false
remove_input_files: false
@@ -57,17 +57,24 @@ options(warn = 2)

# Read the design table (everything must have an entry)
DESIGN_TABLE <- file.path(params$METADATA_ROOT, params$DESIGN_TABLE)
dt <- read_csv(DESIGN_TABLE, col_types = "cccccDcc")
dt <- read_csv(DESIGN_TABLE, col_types = "ccccccccDcc")
dt$note <- NULL
dt <- dt[!is.na(dt$Logger),] # remove empty rows

# For compactness, the design table may have expansions. For example,
# "DiffVoltA_Avg({1:8})" -> "DiffVoltA_Avg(1)", "DiffVoltA_Avg(2)", etc., with
# extra rows added as needed:
# "DiffVoltA_Avg({1:8})" -> "DiffVoltA_Avg(1)", "DiffVoltA_Avg(2)", etc.
# Expand these rows into their individual entries
dt_ex <- compasstools::expand_df(dt)
links <- na.omit(dt_ex$design_link)
if(any(duplicated(links))) {
stop("There are duplicated design links: ",
paste(unique(links[duplicated(links)]), collapse = ", "))

# The Site-Plot-Instrument-Which-Individual-research_name columns
# should be unique (for rows with non-empty research_name entries)
dt_ex$design_link <- with(dt_ex, paste(Site, Plot, Instrument, Which,
Individual, research_name, sep = "-"))
chk <- dt_ex[!is.na(dt_ex$research_name), "design_link"]
if(any(duplicated(chk))) {
stop("There are duplicate mappings in the design table!")
}
DESIGN_TABLE_ENTRIES <- with(dt_ex, paste(Logger, Table, loggernet_variable)) # save for later

# Read the variable metadata table
METADATA_VARS_TABLE <- file.path(params$METADATA_ROOT, params$METADATA_VARS_TABLE)
@@ -124,7 +131,7 @@ f <- function(fn, out_dir, design_table) {

# The row of the summary data frame, displayed at the end of processing, for this data file
smry <- data.frame(File = basename(fn),
no_design_links = NA_integer_,
no_research_name = NA_integer_,
`OOB%` = NA_real_,
`OOS%` = NA_real_,
Note = "",
@@ -137,11 +144,9 @@

# Check for missing entries in the design table
ltlv <- unique(paste(dat$Logger, dat$Table, dat$loggernet_variable))
present <- ltlv %in% paste(design_table$Logger,
design_table$Table,
design_table$loggernet_variable)
present <- ltlv %in% DESIGN_TABLE_ENTRIES
if(!all(present)) {
stop("Some entries are missing in the design table!",
stop("Some data file entries are missing in the design table!",
paste(ltlv[!present], collapse = ", "))
}

@@ -174,13 +179,14 @@ f <- function(fn, out_dir, design_table) {
}

# Summary information
smry$no_design_links <- sum(is.na(dat$design_link))
message("\tFiltering out ", smry$no_design_links, " empty design_link rows")
dat <- subset(dat, !is.na(dat$design_link))
smry$no_research_name <- sum(is.na(dat$research_name))
message("\tFiltering out ", smry$no_research_name, " empty research_name rows")
dat <- subset(dat, !is.na(dat$research_name))

# If no rows left, note this fact and move on
if(!nrow(dat)) {
smry$Note <- "No design links; nothing to process"
message("\tNo research_names left; nothing to process")
smry$Note <- "No research_names left; nothing to process"
return(smry)
}

@@ -221,13 +227,11 @@ f <- function(fn, out_dir, design_table) {

# ------------- Out-of-service flags

message("\tAdding OOS flags")
x <- separate(dat, design_link, sep = "-",
into = c("What", "Site", "Location", "Sensor"))
# It turns out that the out-of-service check is SUPER expensive (50% of driver runtime).
# Right now it's only applicable to Aquatroll data, so check for this and skip if possible
if(grepl("WaterLevel", dat$Table[1])) {
dat$F_OOS <- as.integer(oos(oos_troll, x))
dat$F_OOS <- as.integer(oos(oos_troll, dat))
message("\tAdding ", sum(dat$F_OOS), " OOS flags")
} else {
dat$F_OOS <- 0L
}
@@ -241,6 +245,7 @@ f <- function(fn, out_dir, design_table) {
site = dat$Site[1],
logger = dat$Logger[1],
table = dat$Table[1])
rm(dat)

if(params$remove_input_files) {
message("\tRemoving input files")
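Two ideas in this file's changes are worth unpacking: the design table's `{1:8}` shorthand expands one row into many, and uniqueness is now checked on the combined Site-Plot-Instrument-Which-Individual-research_name key rather than a prebuilt design_link column. A simplified sketch of both (compasstools::expand_df is the real expansion routine; this stand-in only handles a single `{a:b}` pattern):

```r
# Sketch: expand "name({1:3})" shorthand into individual rows, then
# verify the identifying key is unique
expand_rows <- function(df, col = "loggernet_variable") {
  pieces <- lapply(seq_len(nrow(df)), function(i) {
    v <- df[[col]][i]
    m <- regmatches(v, regexec("\\{(\\d+):(\\d+)\\}", v))[[1]]
    if (length(m) == 0) return(df[i, , drop = FALSE])  # nothing to expand
    reps <- seq(as.integer(m[2]), as.integer(m[3]))
    out <- df[rep(i, length(reps)), , drop = FALSE]
    out[[col]] <- vapply(reps, function(r) sub("\\{\\d+:\\d+\\}", r, v), "")
    out
  })
  do.call(rbind, pieces)
}

dt <- data.frame(loggernet_variable = "DiffVoltA_Avg({1:3})",
                 research_name = "sapflow_2.5cm")
dt_ex <- expand_rows(dt)
dt_ex$loggernet_variable  # "DiffVoltA_Avg(1)" "DiffVoltA_Avg(2)" "DiffVoltA_Avg(3)"

# Uniqueness check on a combined key, in the spirit of the new code
key <- paste(dt_ex$loggernet_variable, dt_ex$research_name, sep = "-")
stopifnot(!any(duplicated(key)))
```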
11 changes: 4 additions & 7 deletions synoptic/driver.R
@@ -49,7 +49,7 @@ driver_try <- function(...) {
}

# Construct L0 data ---------------------------------------------
# L0 data are raw but in CSV form, and with "Logger" and "Table" columns added
# L0 data are raw but in long CSV form, with Logger/Table/ID columns added

message("Running L0")
new_section("Starting L0")
@@ -67,7 +67,7 @@ copy_output("L0.html", outfile)


# 'Normalize' L0 data -------------------------------------------
# Reshaped to long form and matched with design_link info
# Matched with design_link info
# This is an intermediate step, not exposed to data users

message("Running L1_normalize.qmd")
@@ -87,11 +87,8 @@ copy_output("L1_normalize.html", outfile)


# Construct L1 data --------------------------------------------
# Unit conversion and bounds checks performed
# L1 data are long form but without any plot (experimental) info

# This step will use a 'units_bounds.csv' file or something like that
# This step also sorts data into folders based on site, year, and month;
# This step drops unneeded columns, sorts, and adds extensive metadata
# Files are written into folders based on site, year, and month;
# see write_to_folders() in helpers.R

message("Running L1.qmd")
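The updated comments above describe partitioned output; the shape of that layout is simple to sketch (illustrative only — see write_to_folders() in helpers.R for the real logic):

```r
# Sketch: derive a site/year/month output folder from a timestamp
partition_path <- function(root, site, timestamp) {
  file.path(root, site, format(timestamp, "%Y"), format(timestamp, "%m"))
}

partition_path("L1", "CRC", as.POSIXct("2024-04-04 10:30:00", tz = "EST"))
# [1] "L1/CRC/2024/04"
```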
6 changes: 3 additions & 3 deletions synoptic/helpers.R
@@ -127,7 +127,7 @@ write_to_folders <- function(x, root_dir, data_level, site,
filename <- paste0(paste(site, time_period, data_level, vversion, sep = "_"), ".csv")
na_string <- NA_STRING_L1
write_this_plot <- TRUE
p <- ggplot(x, aes(TIMESTAMP, value, group = design_link)) +
p <- ggplot(x, aes(TIMESTAMP, value, group = Individual)) +
geom_line() +
facet_wrap(~research_name, scales = "free") +
ggtitle(filename) +
@@ -148,7 +148,7 @@ }
}
}

# Before writing, convert timestamp to character to ensure that observations
# Convert timestamp to character to ensure that observations
# at midnight have seconds written correctly
if(is.POSIXct(dat$TIMESTAMP)) {
dat$TIMESTAMP <- format(dat$TIMESTAMP, "%Y-%m-%d %H:%M:%S")
@@ -286,7 +286,7 @@ list_directories <- function(dir_list, outfile = "", prefix = "",
# which rows to keep (correct design_link assignment) and which to drop.
valid_entries <- function(objects, times, valid_through) {
# Nothing to do if there are no valid_through entries
if(all(is.na(valid_through))) return(rep(TRUE, length(objects())))
if(all(is.na(valid_through))) return(rep(TRUE, length(objects)))

# Any NA valid_through entries apply into the far future
valid_through[is.na(valid_through)] <- MAX_DATE
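The timestamp-formatting comment in this file guards against a subtle R behavior: when a POSIXct value falls exactly on midnight, the default character conversion drops the time-of-day entirely, which can break downstream parsers expecting uniform timestamps. A quick demonstration:

```r
ts <- as.POSIXct("2024-01-01 00:00:00", tz = "EST")
as.character(ts)                  # "2024-01-01" -- the midnight time is dropped
format(ts, "%Y-%m-%d %H:%M:%S")   # "2024-01-01 00:00:00" -- explicit and safe
```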
4 changes: 2 additions & 2 deletions synoptic/metadata/L1_metadata/CRC.txt
@@ -2,8 +2,8 @@ Crane Creek (CRC) is one of COMPASS-FME synoptic sites within the Crane
Creek lacustuary region of southwestern Lake Erie, 9.25 miles (14.9 km)
northwest of Oak Harbor, Ohio. This site is owned by the USFWS and
managed as a part of the Ottawa National Wildlife Refuge complex. This
site consists of three plots: upland (located at 41.6153N, 83.2297W),
transition (41.6219N, 83.2381W), and wetland (41.6219N, 83.2389W). This
site consists of three plots: upland ("UP"; located at 41.6153N, 83.2297W),
transition ("TR"; 41.6219N, 83.2381W), and wetland ("W"; 41.6219N, 83.2389W). This
site was established in December 2021.

Contacts for the Crane Creek site:
4 changes: 2 additions & 2 deletions synoptic/metadata/L1_metadata/GCW.txt
@@ -1,6 +1,6 @@
GCReW (GCW) is one of COMPASS-FME synoptic sites, and has three plots:
upland (located at 38.8741N, 76.5522W), transition (38.8745, 76.5515W),
and wetland (38.8750N, 76.5500W). Note that this site overlaps spatially
upland ("UP"; located at 38.8741N, 76.5522W), transition ("TR"; 38.8745, 76.5515W),
and wetland ("W"; 38.8750N, 76.5500W). Note that this site overlaps spatially
with the TEMPEST (TMP) plots, the upland portion of the transect
overlaps with the TEMPEST experiment control plot. This transect spans a
mid- to late-successional (~80 years old) temperate, deciduous coastal
4 changes: 2 additions & 2 deletions synoptic/metadata/L1_metadata/GWI.txt
@@ -1,6 +1,6 @@
Goodwin Island (GWI) is one of COMPASS-FME synoptic sites, and has three plots:
upland (located at 37.2192N, 76.4087W), transition (37.2194N, 76.4092W),
and wetland (37.2189N, 76.4101W). The site was established in June 2022.
upland ("UP"; located at 37.2192N, 76.4087W), transition ("TR"; 37.2194N, 76.4092W),
and wetland ("W"; 37.2189N, 76.4101W). The site was established in June 2022.

Contacts for the Goodwin Island site:
Stephanie J. Wilson [email protected]
8 changes: 6 additions & 2 deletions synoptic/metadata/L1_metadata/L1_metadata_columns.csv
@@ -1,8 +1,12 @@
# L1 data will be ordered following rows below, and these descriptions inserted into metadata
Column,Description
Plot,"Plot name within site"
TIMESTAMP,Datalogger timestamp (EST) (POSIXct)
design_link,Design name that links to experimental unit (character)
research_name,Measurement name (character)
Instrument,Name of measurement instrument
Which,Which instrument within plot
Individual,"Individual sensor, tree, etc. being measured"
value,Observed value (numeric). The no-data value is '[NA_STRING_L1]'
research_name,Measurement name (character)
ID,Observation ID (character)
F_OOB,Flag: Out of instrumental bounds (logical; 1=TRUE)
F_OOS,Flag: Sensor listed as out of service (logical; 1=TRUE)
synoptic/metadata/L1_metadata/L1_metadata_variables.csv
@@ -1,5 +1,6 @@
table,variable,research_name,sensor_units,final_units,conversion,low_bound,high_bound,description
Datalogger,Voltage,voltage,V,V,x * 1,11,13,Battery voltage of the data logger
Datalogger,BattV,battery_voltage,V,V,x * 1,11,13,Battery voltage of the data logger
Datalogger,SolarV,solar_voltage,V,V,x * 1,0,30,Solar/line voltage of the data logger
Sapflow,DiffVolt_Avg,sapflow_2.5cm,mV,mV,x * 1,0,1,Raw sap flow at 2.5 cm depth
Sapflow,DiffVolt_Avg,sapflow_5cm,mV,mV,x * 1,0,1,Raw sap flow at 5 cm depth
TerosTable,VWC,soil_vwc_5cm,m3/m3,m3/m3,(3.879*10^-4) * x - 0.6956,0,0.7,Soil volumetric water content at 5 cm calibrated for mineral soil
@@ -17,22 +18,22 @@ TerosTable,EC,soil_EC_30cm,µS/cm,µS/cm,x * 1,0,20000,Soil electrical conductiv
WaterLevel,Barometric_Pressure600,gw_bar_pressure,,,x * 1,,,
WaterLevel,Temperature600,gw_temperature,degC,degC,x * 1,-5,50,Groundwater temperature
WaterLevel,Temperature_Int600,gw_temperature_int,degC,degC,x * 1,-5,50,
WaterLevel,Actual_Conductivity600,gw_act_cond,µS/cm,µS/cm,x * 1,,,
WaterLevel,Specific_Conductivity600,gw_conductivity,µS/cm,µS/cm,x * 1,0,350000,Groundwater specific conductivity
WaterLevel,Actual_Conductivity600,gw_act_cond,μS/cm,μS/cm,x * 1,,,
WaterLevel,Specific_Conductivity600,gw_conductivity,uS/cm,uS/cm,x * 1,0,350000,Groundwater specific conductivity
WaterLevel,Salinity600,gw_salinity,PSU,PSU,x * 1,0,350,Groundwater salinity
WaterLevel,TDS600,gw_tds,,,x * 1,,,
WaterLevel,Water_Density600,gw_density,g/cm3,g/cm3,x * 1,0.98,1.05,Groundwater density
WaterLevel,Resistivity600,gw_resistivity,,,x * 1,,,
WaterLevel,pH600,gw_ph,unitless,unitless,x * 1,0,14,Groundwater pH
WaterLevel,pH_mV600,gw_ph_mv,mV,mV,x * 1,,,
WaterLevel,pH_ORP600,gw_ph_orp,mV,mV,x * 1,-1400,1400,Groundwater oxidation-reduction potential
WaterLevel,RDO_concen600,gw_rdo_concentration,mg/L,mg/L,x * 1,0,20,Groundwater dissolved oxygen concentration
WaterLevel,RDO_concen600,gw_rdo_conc,mg/L,mg/L,x * 1,0,20,Groundwater dissolved oxygen concentration
WaterLevel,RDO_perc_sat600,gw_perc_sat,%,%,x * 1,0,100,
WaterLevel,RDO_part_Pressure600,gw_part_pressure,,,x * 1,,,
WaterLevel,Pressure600,gw_pressure,psi,mbar,x * 68.948,-10,910,Vented pressure corrected for barometric pressure
WaterLevel,Depth600,gw_depth,,,x * 1,,,
WaterLevel,Voltage_Ext600A,gw_voltage_ext,V,V,x * 1,,,External battery voltage coming into the Aquatroll
WaterLevel,Battery_Int600A,gw_battery_int,%,%,x * 1,0,100,Internal battery percentage
WaterLevel,Battery_Int600A,gw_battery,%,%,x * 1,0,100,Internal battery percentage
ClimaVue50_15min,SlrFD_W_Avg,wx_slr_fd15,MJ/m2,MJ/m2,x * 1,,,Average solar flux over 15 minute period
ClimaVue50_15min,SlrTF_MJ_Tot,wx_slr_tf15,W/m2,W/m2,x * 1,,,Total solar flux over 15 minute period
ClimaVue50_15min,Rain_mm_Tot,wx_rain15,mm,mm,x * 1,,,Total rain over 15 minute period
@@ -97,11 +98,11 @@ ClimaVue50_24hr,PAR_Tot_C_Tot,wx_par_tot24,mmol/m2,mmol/m2,x * 1,,,Sum of total
ClimaVue50_24hr,CVMeta,,,,,,,"Current configuration of ClimaVue sensors (serial number, etc). Format is a13CAMPBELLCLIM50xxxxxx-yyyyyyyyy, where a is the SDI-12 address, xxxxxx is the model, and yyyyyyyyy is the serial number."
ExoTable,Conductivity,sonde_conductivity,,,x * 1,,,
ExoTable,FDOM_QSU,sonde_fdom,QSU,QSU,x * 1,0,300,"Fluorescent dissolved organic matter concentration, Quinine Sulfate Units"
ExoTable,FDOM_RFU,sonde_rfu,RFU,RFU,x * 1,0,100,"Fluorescent dissolved organic matter concentration, Relative Fluorescent Units"
ExoTable,FDOM_RFU,sonde_fdom_rfu,RFU,RFU,x * 1,0,100,"Fluorescent dissolved organic matter concentration, Relative Fluorescent Units"
ExoTable,NLF_conductivity,sonde_nlf_cond,,,x * 1,,,
ExoTable,ODO_sat,sonde_odo_sat,%,%,x * 1,0,200,"Dissolved oxygen concentration, percent air saturation"
ExoTable,ODO_local,sonde_odo_local,,,x * 1,,,
ExoTable,ODO_MgL,sonde_odo,mg/L,mg/L,x * 1,0,50,"Dissolved oxygen concentration, milligrams per liter"
ExoTable,ODO_MgL,sonde_odo_mgl,mg/L,mg/L,x * 1,0,50,"Dissolved oxygen concentration, milligrams per liter"
ExoTable,Pressure_psia,sonde_pressure,psia,psia,x * 1,,,Water pressure
ExoTable,Salinity_PPT,sonde_salinity,ppt,ppt,x * 1,,,Water salinity
ExoTable,Specific_Conductivity_uScm,sonde_spcond,µS/cm,µS/cm,x * 1,0,100000,Water specific conductivity
@@ -112,7 +113,7 @@ ExoTable,Temp_C,sonde_temp,C,C,x * 1,-5,50,Water temperature
ExoTable,Depth_m,sonde_depth,m,m,x * 1,0,10,Sonde depth
ExoTable,Battery_v,sonde_battery,V,V,x * 1,,,Sonde battery voltage
ExoTable,Cable_v,sonde_cable,V,V,x * 1,,,Sonde cable power
ExoTable,Wiper_Current_ma,sonde_wiper,mA,mA,x * 1,,,Wiper brush current
ExoTable,Wiper_Current_ma,sonde_wipercur,mA,mA,x * 1,,,Wiper brush current
ExoTable,TSS_MgL,sonde_tss,mg/L,mg/L,x * 1,,,Total suspended solids concentration
ExoTable,ORP_mv,sonde_orp,mV,mV,x * 1,-999,999,Water oxidation reduction potential
ExoTable,TDS_mg_L,sonde_tds,mg/L,mg/L,x * 1,,,Total dissolved solids concentration
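The `conversion`, `low_bound`, and `high_bound` columns in this table drive unit conversion and out-of-bounds flagging downstream. A hedged sketch of how one row could be applied (`apply_conversion` is illustrative, not this repo's code):

```r
# Sketch: evaluate a conversion expression such as "x * 68.948" and
# flag converted values that fall outside the stated bounds
apply_conversion <- function(x, conversion, low_bound, high_bound) {
  converted <- eval(parse(text = conversion), envir = list(x = x))
  F_OOB <- as.integer(converted < low_bound | converted > high_bound)
  data.frame(value = converted, F_OOB = F_OOB)
}

# The gw_pressure row above: vented pressure, psi -> mbar, bounds -10..910
apply_conversion(c(1.0, 14.7), "x * 68.948", low_bound = -10, high_bound = 910)
#      value F_OOB
#   68.9480     0
# 1013.5356     1
```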