From e0aef9300478f4033b0a77c2f6cf7a944118d013 Mon Sep 17 00:00:00 2001 From: Karl Broman Date: Fri, 23 Feb 2018 09:09:21 -0600 Subject: [PATCH] Rename threshold_genoprob() as clean_genoprob() + clean columns - "clean" seems potentially more informative than "threshold" - In addition to setting small values to 0, we also look at the maximum probability in a genotype column; if that's not large we set all values in that column to 0. - Related to Issue #34. --- DESCRIPTION | 2 +- NAMESPACE | 1 + NEWS.md | 11 +++++- R/RcppExports.R | 8 ++--- R/clean_genoprob.R | 69 +++++++++++++++++++++++++++++++++++ R/threshold_genoprob.R | 19 ---------- man/clean_genoprob.Rd | 61 +++++++++++++++++++++++++++++++ src/RcppExports.cpp | 27 +++++++------- src/clean_genoprob.cpp | 74 ++++++++++++++++++++++++++++++++++++++ src/clean_genoprob.h | 12 +++++++ src/threshold_genoprob.cpp | 55 ---------------------------- src/threshold_genoprob.h | 11 ------ 12 files changed, 246 insertions(+), 104 deletions(-) create mode 100644 R/clean_genoprob.R delete mode 100644 R/threshold_genoprob.R create mode 100644 man/clean_genoprob.Rd create mode 100644 src/clean_genoprob.cpp create mode 100644 src/clean_genoprob.h delete mode 100644 src/threshold_genoprob.cpp delete mode 100644 src/threshold_genoprob.h diff --git a/DESCRIPTION b/DESCRIPTION index f153117e..eccd9b1d 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: qtl2 Version: 0.13-4 -Date: 2018-02-22 +Date: 2018-02-23 Title: Quantitative Trait Locus Mapping in Experimental Crosses Description: R/qtl2 provides a set of tools to perform quantitative trait locus (QTL) analysis in experimental crosses. It is a diff --git a/NAMESPACE b/NAMESPACE index 3d891381..237ca0d3 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -45,6 +45,7 @@ export(check_cross2) export(chisq_colpairs) export(chr_lengths) export(chr_names) +export(clean_genoprob) export(compare_geno) export(compare_genoprob) export(convert2cross2) diff --git a/NEWS.md b/NEWS.md index a1417079..c7c5378b 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,4 +1,13 @@ -## qtl2 0.13-3 (2018-02-21) +## qtl2 0.13-4 (2018-02-23) + +### New features + +- Added function `clean_genoprob()` which cleans genotype + probabilities by setting small values to 0 and, for genotype columns + where the maximum value is not large, setting all values to 0. This + is intended to help with the problem of unstable estimates of + genotype effects in `scan1coef()` and `fit1()` when there's a + genotype that is largely absent. ### Minor changes diff --git a/R/RcppExports.R b/R/RcppExports.R index 34399f49..64b4ff01 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -109,6 +109,10 @@ check_handle_x_chr <- function(crosstype, any_x_chr) { .Call(`_qtl2_chisq_colpairs`, input) } +.clean_genoprob <- function(prob_array, value_threshold = 1e-6, column_threshold = 0.01) { + .Call(`_qtl2_clean_genoprob`, prob_array, value_threshold, column_threshold) +} + .compare_geno <- function(geno) { .Call(`_qtl2_compare_geno`, geno) } @@ -653,7 +657,3 @@ test_initvector <- function(crosstype, is_x_chr, is_female, cross_info) { .Call(`_qtl2_test_initvector`, crosstype, is_x_chr, is_female, cross_info) } -.threshold_genoprob <- function(prob_array, threshold = 1e-6) { - .Call(`_qtl2_threshold_genoprob`, prob_array, threshold) -} - diff --git a/R/clean_genoprob.R b/R/clean_genoprob.R new file mode 100644 index 00000000..7edb7029 --- /dev/null +++ b/R/clean_genoprob.R @@ -0,0 +1,69 @@ +#' Clean genotype probabilities +#' +#' Clean up genotype probabilities by setting small values to 0 and +#' for a genotype column where the maximum value is rather small, set +#' all values in that column to 0. +#' +#' @md +#' +#' @param probs Genotype probabilities as calculated by +#' [calc_genoprob()]. +#' @param value_threshold Probabilities below this value will be set to 0. +#' @param column_threshold For genotype columns where the maximum +#' value is below this threshold, all values will be set to 0. +#' This must be less than \eqn{1/k} where \eqn{k} is the number of genotypes. +#' @param cores Number of CPU cores to use, for parallel calculations. +#' (If `0`, use [parallel::detectCores()].) +#' Alternatively, this can be links to a set of cluster sockets, as +#' produced by [parallel::makeCluster()]. +#' +#' @return A cleaned version of the input genotype probabilities object, `probs`. +#' +#' @details +#' In cases where a particular genotype is largely absent, +#' `scan1coef()` and `fit1()` can give unstable estimates of the +#' genotype effects. Cleaning up the genotype probabilities by setting +#' small values to 0 helps to ensure that such effects get set to +#' `NA`. +#' +#' At each position and for each genotype column, we find the maximum +#' probability across individuals. If that maximum is < +#' `column_threshold`, all values in that genotype column at that +#' position are set to 0. +#' +#' In addition, any genotype probabilties that are < `value_threshold` +#' (generally < `column_threshold`) are set to 0. +#' +#' The probabilities are then re-scaled so that the probabilities for +#' each individual at each position sum to 1. +#' +#' @examples +#' iron <- read_cross2(system.file("extdata", "iron.zip", package="qtl2")) +#' \dontshow{iron <- iron[,c("19", "X")] # subset to chr 19 and X} +#' +#' # calculate genotype probabilities +#' probs <- calc_genoprob(iron, map, error_prob=0.002) +#' +#' # clean the genotype probabilities and paste over original values +#' probs <- clean_genoprob(probs) +#' +#' @export +clean_genoprob <- + function(probs, value_threshold=1e-6, column_threshold=0.01, cores=1) +{ + attrib <- attributes(probs) + + cores <- setup_cluster(cores) + + result <- cluster_lapply(cores, seq_along(probs), + function(i) { + this_result <- .clean_genoprob(probs[[i]], value_threshold, column_threshold) + dimnames(this_result) <- dimnames(probs[[i]]) + this_result }) + + for(a in names(attrib)) { + attr(result, a) <- attrib[[a]] + } + + result +} diff --git a/R/threshold_genoprob.R b/R/threshold_genoprob.R deleted file mode 100644 index 6bf73ec4..00000000 --- a/R/threshold_genoprob.R +++ /dev/null @@ -1,19 +0,0 @@ -threshold_genoprob <- - function(probs, threshold=1e-6, cores=1) -{ - attrib <- attributes(probs) - - cores <- setup_cluster(cores) - - result <- cluster_lapply(cores, seq_along(probs), - function(i) { - this_result <- aperm( .threshold_genoprob(aperm(probs[[i]], c(2,1,3)), threshold), c(2,1,3)) - dimnames(this_result) <- dimnames(probs[[i]]) - this_result }) - - for(a in names(attrib)) { - attr(result, a) <- attrib[[a]] - } - - result -} diff --git a/man/clean_genoprob.Rd b/man/clean_genoprob.Rd new file mode 100644 index 00000000..ee33074d --- /dev/null +++ b/man/clean_genoprob.Rd @@ -0,0 +1,61 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/clean_genoprob.R +\name{clean_genoprob} +\alias{clean_genoprob} +\title{Clean genotype probabilities} +\usage{ +clean_genoprob(probs, value_threshold = 0.000001, column_threshold = 0.01, + cores = 1) +} +\arguments{ +\item{probs}{Genotype probabilities as calculated by +\code{\link[=calc_genoprob]{calc_genoprob()}}.} + +\item{value_threshold}{Probabilities below this value will be set to 0.} + +\item{column_threshold}{For genotype columns where the maximum +value is below this threshold, all values will be set to 0. +This must be less than \eqn{1/k} where \eqn{k} is the number of genotypes.} + +\item{cores}{Number of CPU cores to use, for parallel calculations. +(If \code{0}, use \code{\link[parallel:detectCores]{parallel::detectCores()}}.) +Alternatively, this can be links to a set of cluster sockets, as +produced by \code{\link[parallel:makeCluster]{parallel::makeCluster()}}.} +} +\value{ +A cleaned version of the input genotype probabilities object, \code{probs}. +} +\description{ +Clean up genotype probabilities by setting small values to 0 and +for a genotype column where the maximum value is rather small, set +all values in that column to 0. +} +\details{ +In cases where a particular genotype is largely absent, +\code{scan1coef()} and \code{fit1()} can give unstable estimates of the +genotype effects. Cleaning up the genotype probabilities by setting +small values to 0 helps to ensure that such effects get set to +\code{NA}. + +At each position and for each genotype column, we find the maximum +probability across individuals. If that maximum is < +\code{column_threshold}, all values in that genotype column at that +position are set to 0. + +In addition, any genotype probabilties that are < \code{value_threshold} +(generally < \code{column_threshold}) are set to 0. + +The probabilities are then re-scaled so that the probabilities for +each individual at each position sum to 1. +} +\examples{ +iron <- read_cross2(system.file("extdata", "iron.zip", package="qtl2")) +\dontshow{iron <- iron[,c("19", "X")] # subset to chr 19 and X} + +# calculate genotype probabilities +probs <- calc_genoprob(iron, map, error_prob=0.002) + +# clean the genotype probabilities and paste over original values +probs <- clean_genoprob(probs) + +} diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index 115288dd..c054569b 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -398,6 +398,19 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } +// clean_genoprob +NumericVector clean_genoprob(const NumericVector& prob_array, double value_threshold, double column_threshold); +RcppExport SEXP _qtl2_clean_genoprob(SEXP prob_arraySEXP, SEXP value_thresholdSEXP, SEXP column_thresholdSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const NumericVector& >::type prob_array(prob_arraySEXP); + Rcpp::traits::input_parameter< double >::type value_threshold(value_thresholdSEXP); + Rcpp::traits::input_parameter< double >::type column_threshold(column_thresholdSEXP); + rcpp_result_gen = Rcpp::wrap(clean_genoprob(prob_array, value_threshold, column_threshold)); + return rcpp_result_gen; +END_RCPP +} // compare_geno IntegerMatrix compare_geno(const IntegerMatrix& geno); RcppExport SEXP _qtl2_compare_geno(SEXP genoSEXP) { @@ -2354,18 +2367,6 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } -// threshold_genoprob -NumericVector threshold_genoprob(const NumericVector& prob_array, const double threshold); -RcppExport SEXP _qtl2_threshold_genoprob(SEXP prob_arraySEXP, SEXP thresholdSEXP) { -BEGIN_RCPP - Rcpp::RObject rcpp_result_gen; - Rcpp::RNGScope rcpp_rngScope_gen; - Rcpp::traits::input_parameter< const NumericVector& >::type prob_array(prob_arraySEXP); - Rcpp::traits::input_parameter< const double >::type threshold(thresholdSEXP); - rcpp_result_gen = Rcpp::wrap(threshold_genoprob(prob_array, threshold)); - return rcpp_result_gen; -END_RCPP -} static const R_CallMethodDef CallEntries[] = { {"_qtl2_arrange_genes", (DL_FUNC) &_qtl2_arrange_genes, 2}, @@ -2395,6 +2396,7 @@ static const R_CallMethodDef CallEntries[] = { {"_qtl2_check_is_female_vector", (DL_FUNC) &_qtl2_check_is_female_vector, 3}, {"_qtl2_check_handle_x_chr", (DL_FUNC) &_qtl2_check_handle_x_chr, 2}, {"_qtl2_chisq_colpairs", (DL_FUNC) &_qtl2_chisq_colpairs, 1}, + {"_qtl2_clean_genoprob", (DL_FUNC) &_qtl2_clean_genoprob, 3}, {"_qtl2_compare_geno", (DL_FUNC) &_qtl2_compare_geno, 1}, {"_qtl2_count_xo", (DL_FUNC) &_qtl2_count_xo, 3}, {"_qtl2_count_xo_3d", (DL_FUNC) &_qtl2_count_xo_3d, 3}, @@ -2531,7 +2533,6 @@ static const R_CallMethodDef CallEntries[] = { {"_qtl2_test_emitmatrix", (DL_FUNC) &_qtl2_test_emitmatrix, 7}, {"_qtl2_test_stepmatrix", (DL_FUNC) &_qtl2_test_stepmatrix, 5}, {"_qtl2_test_initvector", (DL_FUNC) &_qtl2_test_initvector, 4}, - {"_qtl2_threshold_genoprob", (DL_FUNC) &_qtl2_threshold_genoprob, 2}, {NULL, NULL, 0} }; diff --git a/src/clean_genoprob.cpp b/src/clean_genoprob.cpp new file mode 100644 index 00000000..bb920503 --- /dev/null +++ b/src/clean_genoprob.cpp @@ -0,0 +1,74 @@ +// clean genoprobs, setting small values to 0 + +#include "clean_genoprob.h" +#include +#include + +using namespace Rcpp; + +// clean genoprobs, setting small values to 0 +// [[Rcpp::export(".clean_genoprob")]] +NumericVector clean_genoprob(const NumericVector& prob_array, // array as n_ind x n_gen x n_pos + double value_threshold=1e-6, + double column_threshold=0.01) +{ + if(Rf_isNull(prob_array.attr("dim"))) + throw std::invalid_argument("prob_array should be a 3d array but has no dimension attribute"); + const IntegerVector& dim = prob_array.attr("dim"); + if(dim.size() != 3) + throw std::invalid_argument("prob_array should be a 3d array of probabilities"); + const int n_ind = dim[0]; + const int n_gen = dim[1]; + const int n_pos = dim[2]; + + NumericVector result = clone(prob_array); + + // ensure that we don't set all values in a row to 0 + if(column_threshold > 1.0/(double)n_gen) + column_threshold = 0.5/(double)n_gen; + if(value_threshold > 1.0/(double)n_gen) + value_threshold = 0.5/(double)n_gen; + + for(int pos=0, offset=0; pos= column_threshold) { + zero_column = false; + break; + } + } + if(zero_column) { // biggest value was < column_threshold so zero the column + for(int ind=0; ind + +// clean genoprobs, setting small values to 0 +Rcpp::NumericVector clean_genoprob(const Rcpp::NumericVector& prob_array, // array as n_ind x n_gen x n_pos + double value_threshold, + double column_threshold); + +#endif // CLEAN_GENOPROB_H diff --git a/src/threshold_genoprob.cpp b/src/threshold_genoprob.cpp deleted file mode 100644 index ed765fb3..00000000 --- a/src/threshold_genoprob.cpp +++ /dev/null @@ -1,55 +0,0 @@ -// threshold genoprobs, setting small values to 0 - -#include "threshold_genoprob.h" -#include -#include - -using namespace Rcpp; - -// threshold genoprobs, setting small values to 0 -// [[Rcpp::export(".threshold_genoprob")]] -NumericVector threshold_genoprob(const NumericVector& prob_array, // array as n_gen x n_ind x n_pos - const double threshold=1e-6) -{ - if(Rf_isNull(prob_array.attr("dim"))) - throw std::invalid_argument("prob_array should be a 3d array but has no dimension attribute"); - const IntegerVector& dim = prob_array.attr("dim"); - if(dim.size() != 3) - throw std::invalid_argument("prob_array should be a 3d array of probabilities"); - const int n_gen = dim[0]; - const int n_ind = dim[1]; - const int n_pos = dim[2]; - const int ind_by_pos = n_ind*n_pos; - - NumericVector result(n_gen*n_ind*n_pos); - - for(int pos=0, offset=0; pos - -// threshold genoprobs, setting small values to 0 -Rcpp::NumericVector threshold_genoprob(const Rcpp::NumericVector& prob_array, // array as n_gen x n_ind x n_pos - const double threshold); - -#endif // THRESHOLD_GENOPROB_H