Merge pull request #3052 from DongchenZ/SDA_dongchen

Sda dongchen
PecanProject · Oct 21, 2022 · 731f77a · 731f77a
2 parents 0ff670e + a74bed0
commit 731f77a
Show file tree

Hide file tree

Showing 14 changed files with 238 additions and 46 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -58,6 +58,11 @@ convert data for a single PFT fixed (#1329, #2974, #2981)
 - `PEcAn.data.land::gSSURGO.Query` has been updated to work again after changes to the gSSURGO API.
 - `PEcAn.settings::read.settings()` now prints a warning when falling back on default `"pecan.xml"` if the named `inputfile` doesn't exist.
 - fqdn() can access hostname on Windows (#3044 fixed by #3058)
+- The model2netcdf_SIPNET function can now export full year nc files by using 
+  the cdo_setup argument in the template job file. In detail, people will need
+  to specify cdosetup = "module load cdo/2.0.6" in the host section. More details
+  are in the Create_Multi_settings.R script. (#3052)
+
 
 ### Changed
 

diff --git a/book_source/03_topical_pages/03_pecan_xml.Rmd b/book_source/03_topical_pages/03_pecan_xml.Rmd
@@ -503,6 +503,7 @@ The following provides a quick overview of XML tags related to remote execution.
 		<qsub.jobid>Your job ([0-9]+) .*</qsub.jobid>
 		<qstat>'qstat -j @JOBID@ &amp;> /dev/null || echo DONE'</qstat>
 		<prerun>module load udunits R/R-3.0.0_gnu-4.4.6</prerun>
+		<cdosetup>module load cdo/2.0.6</cdosetup>
 		<modellauncher>
       <binary>/usr/local/bin/modellauncher</binary>
       <qsub.extra>-pe omp 20</qsub.extra>
@@ -522,6 +523,7 @@ The `host` section has the following tags:
 * `qsub.jobid`: [optional] the regular expression used to find the `jobid` returned from `qsub`. If not specified (and `qsub` is) it will use the default value is `Your job ([0-9]+) .*`
 * `qstat`: [optional] the command to execute to check if a job is finished, this should return DONE if the job is finished. There is one parameter this command should take `@JOBID@` which is the ID of the job as returned by `qsub.jobid`. If not specified (and qsub is) it will use the default value is `qstat -j @JOBID@ || echo DONE`
 * `prerun`: [optional] additional options to add to the job.sh at the top.
+* `cdosetup`: [optional] additional options to add to the job.sh at the top, specifically work for the CDO Linux command line package, allowing the user to generate the full year netCDF file through model2netcdf.SIPNET function (there has been an issue while this function will iteratively overlap the previous netCDF file, resulting in partial data loss).
 * `modellauncher`: [optional] this is an experimental section that will allow you to submit all the runs as a single job to a HPC system.
 
 The `modellauncher` section if specified will group all runs together and only submit a single job to the HPC cluster. This single job will leverage of a MPI program that will execute a single run. Some HPC systems will place a limit on the number of jobs that can be executed in parallel, this will only submit a single job (using multiple nodes). In case there is no limit on the number of jobs, a single PEcAn run could potentially submit a lot of jobs resulting in the full cluster running jobs for a single PEcAn run, preventing others from executing on the cluster.

diff --git a/models/sipnet/R/read_restart.SIPNET.R b/models/sipnet/R/read_restart.SIPNET.R
@@ -91,10 +91,10 @@ read_restart.SIPNET <- function(outdir, runid, stop.time, settings, var.names, p
     names(forecast[[length(forecast)]]) <- c("litter_carbon_content")
   }
 
-
   if ("SoilMoistFrac" %in% var.names) {
     forecast[[length(forecast) + 1]] <- ens$SoilMoistFrac[last]  ## unitless
     names(forecast[[length(forecast)]]) <- c("SoilMoistFrac")
+    params$restart <- c(params$restart, forecast[[length(forecast)]])
   }
 
   # This is snow

diff --git a/models/sipnet/R/write.configs.SIPNET.R b/models/sipnet/R/write.configs.SIPNET.R
@@ -55,6 +55,13 @@ write.config.SIPNET <- function(defaults, trait.values, settings, run.id, inputs
     hostsetup <- paste(hostsetup, sep = "\n", paste(settings$host$prerun, collapse = "\n"))
   }
 
+  # create cdo specific settings
+  cdosetup <- ""
+  if (!is.null(settings$host$cdosetup)) {
+    cdosetup <- paste(cdosetup, sep = "\n", paste(settings$host$cdosetup, collapse = "\n"))
+  }
+
+
   hostteardown <- ""
   if (!is.null(settings$model$postrun)) {
     hostteardown <- paste(hostteardown, sep = "\n", paste(settings$model$postrun, collapse = "\n"))
@@ -64,6 +71,7 @@ write.config.SIPNET <- function(defaults, trait.values, settings, run.id, inputs
   }
   # create job.sh
   jobsh <- gsub("@HOST_SETUP@", hostsetup, jobsh)
+  jobsh <- gsub("@CDO_SETUP@", cdosetup, jobsh)
   jobsh <- gsub("@HOST_TEARDOWN@", hostteardown, jobsh)
 
   jobsh <- gsub("@SITE_LAT@", settings$run$site$lat, jobsh)
@@ -79,6 +87,23 @@ write.config.SIPNET <- function(defaults, trait.values, settings, run.id, inputs
   jobsh <- gsub("@BINARY@", settings$model$binary, jobsh)
   jobsh <- gsub("@REVISION@", settings$model$revision, jobsh)
 
+  if(is.null(settings$state.data.assimilation$NC.Prefix)){
+    settings$state.data.assimilation$NC.Prefix <- "sipnet.out"
+  }
+  jobsh <- gsub("@PREFIX@", settings$state.data.assimilation$NC.Prefix, jobsh)
+
+  #overwrite argument
+  if(is.null(settings$state.data.assimilation$NC.Overwrite)){
+    settings$state.data.assimilation$NC.Overwrite <- FALSE
+  }
+  jobsh <- gsub("@OVERWRITE@", settings$state.data.assimilation$NC.Overwrite, jobsh)
+
+  #allow conflict? meaning allow full year nc export.
+  if(is.null(settings$state.data.assimilation$FullYearNC)){
+    settings$state.data.assimilation$FullYearNC <- FALSE
+  }
+  jobsh <- gsub("@CONFLICT@", settings$state.data.assimilation$FullYearNC, jobsh)
+
   if (is.null(settings$model$delete.raw)) {
     settings$model$delete.raw <- FALSE
   }

diff --git a/models/sipnet/inst/template.job b/models/sipnet/inst/template.job
@@ -7,6 +7,9 @@ exec &> "@OUTDIR@/logfile.txt"
 # host specific setup
 @HOST_SETUP@
 
+# cdo setup
+@CDO_SETUP@
+
 # create output folder
 mkdir -p "@OUTDIR@"
 

diff --git a/models/template/inst/template.job b/models/template/inst/template.job
@@ -6,6 +6,7 @@ exec &> "@OUTDIR@/logfile.txt"
 
 # host specific setup
 @HOST_SETUP@
+@CDO_SETUP@
 
 # create output folder
 mkdir -p "@OUTDIR@"

diff --git a/models/template/inst/template_geo.job b/models/template/inst/template_geo.job
@@ -0,0 +1,46 @@
+#!/bin/bash -l
+
+# redirect output
+exec 3>&1
+exec &> "@OUTDIR@/logfile.txt"
+
+# host specific setup
+@HOST_SETUP@
+@CDO_SETUP@
+
+# create output folder
+mkdir -p "@OUTDIR@"
+
+# see if application needs running
+if [ ! -e "@OUTDIR@/sipnet.out" ]; then
+  cd "@RUNDIR@"
+  ln -s "@SITE_MET@" sipnet.clim
+
+  "@BINARY@"
+  STATUS=$?
+
+  # copy output
+  mv "@RUNDIR@/sipnet.out" "@OUTDIR@"
+
+  # check the status
+  if [ $STATUS -ne 0 ]; then
+  	echo -e "ERROR IN MODEL RUN\nLogfile is located at '@OUTDIR@/logfile.txt'" >&3
+  	exit $STATUS
+  fi
+
+  # convert to MsTMIP
+  echo "require (PEcAn.SIPNET)
+    model2netcdf.SIPNET('@OUTDIR@', @SITE_LAT@, @SITE_LON@, '@START_DATE@', '@END_DATE@', @DELETE.RAW@, '@REVISION@', '@PREFIX@', @OVERWRITE@, @CONFLICT@)
+    " | R --no-save
+fi
+
+# copy readme with specs to output
+cp  "@RUNDIR@/README.txt" "@OUTDIR@/README.txt"
+
+# run getdata to extract right variables
+
+# host specific teardown
+@HOST_TEARDOWN@
+
+# all done
+echo -e "MODEL FINISHED\nLogfile is located at '@OUTDIR@/logfile.txt'" >&3
diff --git a/modules/assim.sequential/R/Analysis_sda_multiSite.R b/modules/assim.sequential/R/Analysis_sda_multiSite.R
@@ -242,6 +242,7 @@ GEF.MultiSite<-function(setting, Forecast, Observed, H, extraArg,...){
               niter = 50000,
               progressBar = TRUE)
     #dat.tobit2space <- do.call(rbind, dat.tobit2space)
+    save(dat.tobit2space, file = file.path(settings$outdir, paste0('censored',t,'.Rdata')))
     ## update parameters
     mu.f <-
       colMeans(dat.tobit2space[, grep("muf", colnames(dat.tobit2space))])

diff --git a/modules/assim.sequential/inst/MultiSite-Exs/SDA/Create_Multi_settings.R b/modules/assim.sequential/inst/MultiSite-Exs/SDA/Create_Multi_settings.R
@@ -30,6 +30,9 @@ template <- Settings(list(
     process.variance = TRUE,
     adjustment = FALSE,
     censored.data = FALSE,
+    FullYearNC = TRUE,
+    NC.Overwrite = FALSE,
+    NC.Prefix = "sipnet.out",
     q.type = "SINGLE",
     Localization.FUN = "Local.support",
     scalef = 1,
@@ -38,8 +41,8 @@ template <- Settings(list(
       #you could add more state variables here
       variable = structure(list(variable.name = "AbvGrndWood", unit = "MgC/ha", min_value = 0, max_value = 9999)),
       variable = structure(list(variable.name = "LAI", unit = "", min_value = 0, max_value = 9999)),
-      variable = structure(list(variable.name = "TotSoilCarb", unit = "kg/m^2", min_value = 0, max_value = 9999)),
-      variable = structure(list(variable.name = "soilWFracInit", unit = "", min_value = 0, max_value = 1))#soilWFracInit
+      variable = structure(list(variable.name = "SoilMoistFrac", unit = "", min_value = 0, max_value = 1)),#soilWFracInit
+      variable = structure(list(variable.name = "TotSoilCarb", unit = "kg/m^2", min_value = 0, max_value = 9999))
     )),
     forecast.time.step = "year",
     start.date = start_date,
@@ -164,7 +167,8 @@ template <- Settings(list(
     name = "geo.bu.edu",
     usr = "zhangdc",
     folder = "/projectnb/dietzelab/dongchen/All_NEON_SDA/NEON42/SDA/out",
-    prerun = "module load R/4.0.5",
+    prerun = "module load R/4.1.2",
+    cdosetup = "module load cdo/2.0.6",
     qsub = "qsub -l h_rt=24:00:00 -q &apos;geo*&apos; -N @NAME@ -o @STDOUT@ -e @STDERR@ -S /bin/bash",
     qsub.jobid = "Your job ([0-9]+) .*",
     qstat = "qstat -j @JOBID@ || echo DONE",

diff --git a/modules/data.land/R/cohort2pool.R b/modules/data.land/R/cohort2pool.R
@@ -83,17 +83,21 @@ cohort2pool <- function(dat, allom_param = NULL, dbh_name="DBH") {
   tot_leaf <- sum(leaf,na.rm = TRUE)
 
   #Divide by plot area, divide by 2 to convert from kg to kgC
-  leaf_biomass = (tot_leaf/(total_area))/2 + tot_herb/1000
+  leaf_biomass = ((tot_leaf/(total_area))/2 + tot_herb/1000)*1000#convert from kg to g
 
   if(tot_biomass == 0){
-    AGB <- tot_herb/1000
+    AGB <- leaf_biomass
   }else{
-    AGB <- (tot_biomass/(total_area))/2 + tot_herb/1000
+    AGB <- ((tot_biomass/(total_area))/2 + tot_herb/1000)*1000#in gram
   }
   wood_biomass = AGB - leaf_biomass
 
   #grab soil carbon info
-  soil_carbon = dat[[3]] #conversion done in extract_NEON_veg (gC/m^2)
+  if(sum(is.na(dat[[3]]))){
+    soil_carbon <- NA
+  }else{
+    soil_carbon <- mean(dat[[3]]$SoilCarbon) #conversion done in extract_NEON_veg (gC/m^2)
+  }
 
   #Prep Arguments for pool_ic function
   dims <- list(time =1) #Time dimension may be irrelevant

diff --git a/modules/data.land/R/extract_NEON_veg.R b/modules/data.land/R/extract_NEON_veg.R
@@ -115,26 +115,67 @@ extract_NEON_veg <- function(lon, lat, start_date, end_date, store_dir, neonsite
 
   # #soil carbon
   neonstore::neon_download("DP1.00096.001", dir = store_dir, table = NA, site = sitename, start_date = as.Date("2012-01-01"), end_date = end_date, type = "basic",api = "https://data.neonscience.org/api/v0")
-  perbiogeosample <- neonstore::neon_read(table = "perbiogeosample", product = "DP1.00096.001", site = sitename, start_date = as.Date("2012-01-01"), end_date = end_date, dir = store_dir)
-  perarchivesample <- neonstore::neon_read(table = "perarchivesample", product = "DP1.00096.001", site = sitename, start_date = as.Date("2012-01-01"), end_date = end_date, dir = store_dir)
   perbulksample <- neonstore::neon_read(table = "perbulksample", product = "DP1.00096.001", site = sitename, start_date = as.Date("2012-01-01"), end_date = end_date, dir = store_dir)
-  if(is.null(perbiogeosample)){
+  if(is.null(perbulksample)){
     print("no soil carbon data found!")
+    joined.soil <- NA
+  }else{
+    #filter for regular type of bulk density measurements
+    perbulksample <- perbulksample[perbulksample$bulkDensSampleType=="Regular",]
+
+    #remove duplicated and NA measurements for bulk density
+    bulkDensBottomDepth <- perbulksample$bulkDensBottomDepth
+    bulkDensExclCoarseFrag <- perbulksample$bulkDensExclCoarseFrag
+    Ind <- (!duplicated(bulkDensBottomDepth))&(!is.na(bulkDensExclCoarseFrag))
+    bulkDensBottomDepth <- bulkDensBottomDepth[Ind]
+    bulkDensExclCoarseFrag <- bulkDensExclCoarseFrag[Ind]
+
+    #calculate bulk density (need to do: more precise depth matching)
+    bulkDensity <- mean(bulkDensExclCoarseFrag[which(bulkDensBottomDepth <= 30)])
+
+    #if there is no bulk density measurements bellow 30cm.
+    if(is.na(bulkDensity)){
+      joined.soil <- NA
+    }else{
+      #download periodic data and join tables
+      #so far we use end date of the the year 2021 
+      #because the "sls_soilCoreCollection" table will fail when we use more recent end date for some sites.
+      neonstore::neon_download("DP1.10086.001", dir = store_dir, table = NA, site = sitename, start_date = as.Date("2012-01-01"), end_date = as.Date("2021-01-01"), type = "basic",api = "https://data.neonscience.org/api/v0")
+      sls_soilChemistry <- neonstore::neon_read(table = "sls_soilChemistry", product = "DP1.10086.001", site = sitename, start_date = as.Date("2012-01-01"), end_date = as.Date("2021-01-01"), dir = store_dir)
+      sls_soilCoreCollection <- neonstore::neon_read(table = "sls_soilCoreCollection", product = "DP1.10086.001", site = sitename, start_date = as.Date("2012-01-01"), end_date = as.Date("2021-01-01"), dir = store_dir)
+
+      if(is.null(sls_soilChemistry) | is.null(sls_soilCoreCollection)){
+        print("no soil carbon data found!")
+        joined.soil <- NA
+      }else{
+        joined.soil <- dplyr::left_join(sls_soilChemistry, sls_soilCoreCollection, by = "sampleID")
+
+        #select columns
+        joined.soil <- dplyr::select(joined.soil, .data$siteID.x, .data$plotID.x, .data$plotType.x, .data$organicCPercent, .data$collectDate.x, .data$sampleTopDepth, .data$sampleBottomDepth)
+        joined.soil$year <- lubridate::year(joined.soil$collectDate.x)
+        colnames(joined.soil) <- c("site_name", "plot", "plotType", "organicCPercent", "date", "top", "bottom", "year")
+
+        joined.soil <- Grab_First_Measurements_of_Each_Plot(joined.soil)
+
+        #remove NA values for organicCPercent data
+        joined.soil <- joined.soil[which(!is.na(joined.soil$organicCPercent)),]
+
+        #calculate soil carbon
+        joined.soil$bulkDensity <- bulkDensity
+
+        #convert from g/cm2 to g/m2, note that we have to divide by 100 because of percentage
+        joined.soil$SoilCarbon <- (joined.soil$organicCPercent * joined.soil$bulkDensity)*30*100 
+      }
+    }
   }
-  joined.soil <- dplyr::left_join(perarchivesample, perbiogeosample, by = "horizonID")
-  joined.soil <- dplyr::left_join(joined.soil, perbulksample, by = "horizonID")
-
-  #remove NA from soil data
-  soilcarbon.per.m2 <- sum(joined.soil$bulkDensExclCoarseFrag * joined.soil$carbonTot * 0.001 *  (joined.soil$biogeoBottomDepth - joined.soil$biogeoTopDepth) * 10000, na.rm=T)/1000 #convert from gram to kilogram
 
   #Create veg_info object as a list
   veg_info <- list()
   #Set filter.date as veg_info[[2]]
   veg_info[[2]] <- filter.date
   #Set plot size as veg_info[[1]]
   veg_info[[1]] <- filter.herb
-  veg_info[[3]] <- soilcarbon.per.m2
-  veg_info[[4]] <- joined.soil
+  veg_info[[3]] <- joined.soil
 
   return(veg_info)
 }
diff --git a/modules/data.land/R/sample_ic.R b/modules/data.land/R/sample_ic.R
@@ -11,12 +11,13 @@
 ##' @param source string to appear in file names, e.g. "PalEON"
 ##' @param bin_var variable you would like to sample by, DEFAULT is DBH
 ##' @param bin_size bin size for sampling, DEFAULT is 10
+##' @param bin_herb_soil if we want to use bin size for both herb and soil sampling
 ##' @param ... Other inputs
 ##' 
 ##' @export
 ##' @author Istem Fer
 sample_ic <- function(in.path, in.name, start_date, end_date, outfolder,
-                      n.ensemble, machine_host, source, bin_var = "DBH", bin_size = 10, ...){
+                      n.ensemble, machine_host, source, bin_var = "DBH", bin_size = 10, bin_herb_soil = TRUE, ...){
 
 
   #--------------------------------------------------------------------------------------------------#
@@ -108,19 +109,71 @@ sample_ic <- function(in.path, in.name, start_date, end_date, outfolder,
           samples$plot <- 1
           plot.n <- 1
         }
-        if(n.plot>bin_size){
-          herb_plot_bin_size <- bin_size
+
+        #if we are using bin_size to sample
+        if(bin_herb_soil){
+          if(n.plot>bin_size){
+            plot_bin_size <- bin_size
+          }else{
+            plot_bin_size <- ceiling(n.plot/5)
+          }
         }else{
-          herb_plot_bin_size <- ceiling(n.plot/5)
+          plot_bin_size <- 1
         }
+
         sub.list <- list()
-        sample_plots <- sample(plot.n, herb_plot_bin_size)
+        sample_plots <- sample(plot.n, plot_bin_size)
         for(np in 1:length(sample_plots)){
           samples_sub <- samples[samples$plot == sample_plots[np],] 
           sub.list[[np]]  <- samples_sub
         }
         veg_ens[[1]] <- do.call("rbind", sub.list)
       }
+
+      #sample Soil Carbon
+      if("SoilCarbon" %in% bin_var){
+        bin_Var <- "SoilCarbon"
+        obs <- as.data.frame(veg_info[[3]], stringsAsFactors = FALSE)
+
+        year.start <- lubridate::year(start_date)
+        year.end <- lubridate::year(end_date)
+
+        # subset samples for the year 
+        samples <- obs[obs$year >= year.start & obs$year <= year.end, ]
+
+        # remove rows with NAs (we don't want DBH to be NA but do we want to allow missing taxa?)
+        #samples <- samples[complete.cases(samples), ]
+        samples <- samples[!is.na(samples[bin_Var]), ]
+
+        # if there are subplots, sample within each subplot instead of pooling all together, maybe pass down a flag if we want to pool anyway
+        if(!is.null(samples$plot)){
+          n.plot <- length(unique(samples$plot))
+          plot.n <- unique(samples$plot)
+        }else{
+          n.plot <- 1
+          samples$plot <- 1
+          plot.n <- 1
+        }
+
+        #if we are using bin_size to sample
+        if(bin_herb_soil){
+          if(n.plot>bin_size){
+            plot_bin_size <- bin_size
+          }else{
+            plot_bin_size <- ceiling(n.plot/5)
+          }
+        }else{
+          plot_bin_size <- 1
+        }
+
+        sub.list <- list()
+        sample_plots <- sample(plot.n, plot_bin_size)
+        for(np in 1:length(sample_plots)){
+          samples_sub <- samples[samples$plot == sample_plots[np],] 
+          sub.list[[np]]  <- samples_sub
+        }
+        veg_ens[[3]] <- do.call("rbind", sub.list)
+      }
       #--------------------------------------------------------------------------------------------------#
       # Write vegetation data as rds, return results to convert_input