Skip to content

Commit

Permalink
more unit tests and a bug fixed
Browse files Browse the repository at this point in the history
  • Loading branch information
Zilong-Li committed Dec 17, 2023
1 parent fab276e commit dba1787
Show file tree
Hide file tree
Showing 11 changed files with 234 additions and 20 deletions.
1 change: 1 addition & 0 deletions .Rbuildignore
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,5 @@
^README.html
.*\.tar\.gz$
^.covrignore$
^codecov.yaml$
^\.github$
7 changes: 7 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,10 @@
# vcfppR 0.3.6

* add `vcfreader::line`
* remove `vcfreader::setVariant`
* bug fix `setFormatStr`
* more units tests

# vcfppR 0.3.5

* add vcfreader and vcfwriter
Expand Down
3 changes: 2 additions & 1 deletion R/RcppExports.R
Original file line number Diff line number Diff line change
Expand Up @@ -61,8 +61,9 @@ heterozygosity <- function(vcffile, region = "", samples = "-", pass = FALSE, qu
#' @field hasOTHER Test if current variant has a OTHER or not
#' @field hasOVERLAP Test if current variant has a OVERLAP or not
#' @field nsamples Return the number of samples
#' @field string Return the raw string of current variant
#' @field header Return the raw string of the vcf header
#' @field string Return the raw string of current variant including newline
#' @field line Return the raw string of current variant without newline
#' @field output Init an output object for streaming out the variants to another vcf
#' @field write Streaming out current variant the output vcf
#' @field close Close the connection to the output vcf
Expand Down
3 changes: 3 additions & 0 deletions codecov.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# sample regex patterns
ignore:
- "src/htslib-1.18" # ignore folders and all its contents
6 changes: 4 additions & 2 deletions man/vcfreader.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

54 changes: 38 additions & 16 deletions src/vcf-reader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,8 +51,9 @@ using namespace std;
//' @field hasOTHER Test if current variant has a OTHER or not
//' @field hasOVERLAP Test if current variant has a OVERLAP or not
//' @field nsamples Return the number of samples
//' @field string Return the raw string of current variant
//' @field header Return the raw string of the vcf header
//' @field string Return the raw string of current variant including newline
//' @field line Return the raw string of current variant without newline
//' @field output Init an output object for streaming out the variants to another vcf
//' @field write Streaming out current variant the output vcf
//' @field close Close the connection to the output vcf
Expand Down Expand Up @@ -243,13 +244,19 @@ class vcfreader {
auto v = genotypes(false);
return v.size() / nsamples();
}
inline std::string string() const { return var.asString(); }
inline std::string header() const { return br.header.asString(); }
inline std::string string() const { return var.asString(); }
inline std::string line() {
std::string s = var.asString();
s.pop_back();
return s;
}

// WRITE
inline void output(const std::string& vcffile) {
bw.open(vcffile);
bw.initalHeader(br.header);
writable = true;
}
inline void write() { bw.writeRecord(var); }
inline void close() { bw.close(); }
Expand All @@ -258,40 +265,54 @@ class vcfreader {
inline void setID(std::string s) { var.setID(s.c_str()); }
inline void setPOS(int pos) { var.setPOS(pos); }
inline void setRefAlt(const std::string& s) { var.setRefAlt(s.c_str()); }
inline void setInfoInt(std::string tag, int v) { var.setINFO(tag, v); }
inline void setInfoFloat(std::string tag, double v) { var.setINFO(tag, v); }
inline void setInfoStr(std::string tag, const std::string& s) { var.setINFO(tag, s); }
inline void setFormatInt(std::string tag, const vector<int>& v) { var.setFORMAT(tag, v); }
inline void setFormatFloat(std::string tag, const vector<double>& v) {
inline bool setInfoInt(std::string tag, int v) { return var.setINFO(tag, v); }
inline bool setInfoFloat(std::string tag, double v) { return var.setINFO(tag, v); }
inline bool setInfoStr(std::string tag, const std::string& s) { return var.setINFO(tag, s); }
inline bool setFormatInt(std::string tag, const vector<int>& v) {
return var.setFORMAT(tag, v);
}
inline bool setFormatFloat(std::string tag, const vector<double>& v) {
vector<float> f(v.begin(), v.end());
var.setFORMAT(tag, f);
return var.setFORMAT(tag, f);
}
inline bool setFormatStr(std::string tag, const std::string& s) {
if (s.length() % nsamples() != 0) {
Rcpp::Rcout << "the length of s must be divisable by nsamples()";
return false;
}
return var.setFORMAT(tag, s);
}
inline void setFormatStr(std::string tag, const std::string& s) { var.setINFO(tag, s); }
inline void setGenotypes(const vector<int>& v) {
inline bool setGenotypes(const vector<int>& v) {
if ((int)v.size() != nsamples() * ploidy()) {
Rcpp::Rcout << "nsamples: " << nsamples() << ", ploidy: " << ploidy() << "\n";
throw std::runtime_error(
"the size of genotype vector is not equal to nsamples * ploidy");
Rcpp::Rcout << "the size of genotype vector is not equal to nsamples * ploidy\n";
return false;
}
var.setGenotypes(v);
return var.setGenotypes(v);
}
inline void setPhasing(const vector<int>& v) {
vector<char> c(v.begin(), v.end());
var.setPhasing(c);
}

inline void rmInfoTag(std::string s) { var.removeINFO(s); }
inline void setVariant(const std::string& s) { var.addLineFromString(s); }
inline void addINFO(const std::string& id, const std::string& number, const std::string& type,
const std::string& desc) {
bw.header.addINFO(id, number, type, desc);
if (writable)
bw.header.addINFO(id, number, type, desc);
else
Rcpp::Rcout << "please call the `output(filename)` function first\n";
}
inline void addFORMAT(const std::string& id, const std::string& number, const std::string& type,
const std::string& desc) {
bw.header.addFORMAT(id, number, type, desc);
if (writable)
bw.header.addFORMAT(id, number, type, desc);
else
Rcpp::Rcout << "please call the `output(filename)` function first\n";
}

private:
bool writable = false;
vcfpp::BcfReader br;
vcfpp::BcfRecord var;
vcfpp::BcfWriter bw;
Expand Down Expand Up @@ -344,6 +365,7 @@ RCPP_MODULE(vcfreader) {
.method("nsamples", &vcfreader::nsamples)
.method("header", &vcfreader::header)
.method("string", &vcfreader::string)
.method("line", &vcfreader::line)
.method("output", &vcfreader::output)
.method("write", &vcfreader::write)
.method("close", &vcfreader::close)
Expand Down
42 changes: 42 additions & 0 deletions tests/testthat/test-modify-vcf.R
Original file line number Diff line number Diff line change
Expand Up @@ -24,3 +24,45 @@ test_that("modify the genotypes", {
expect_false(isTRUE(all.equal(g0, g2)))
expect_identical(g1, g2)
})

test_that("modify item in FORMAT", {
## creat a VCF with GP in FORMAT
outvcf <- paste0(tempfile(), ".vcf.gz")
bw <- vcfwriter$new(outvcf, "VCF4.3")
bw$addContig("chr20")
bw$addINFO("AF", "A", "Float", "Estimated allele frequency in the range (0,1)");
bw$addFORMAT("GP", "3", "Float", "Posterior genotype probability of 0/0, 0/1, and 1/1");
bw$addSample("NA12878")
bw$addSample("NA12879")
s1 <- "chr20\t2006060\trs146931526\tG\tC\t100\tPASS\tAF=0.000998403\tGP\t0.966,0.034,0\t0.003,0.872,0.125"
bw$writeline(s1)
bw$close()
## tests
br <- vcfreader$new(outvcf)
expect_true(br$variant()) ## get a variant record
br$string()
gp <- br$formatFloat("GP")
gp <- array(gp, c(3, br$nsamples()))
ds <- gp[2,] + gp[3,] * 2
## will prompt uses a message if `output` is not called
br$addFORMAT("DS", "1", "Float", "Diploid dosage")
br$addINFO("INFO", "1", "Float", "INFO score of imputation")
## now open another file for output
newvcf <- paste0(tempfile(), ".vcf.gz")
br$output(newvcf)
## add INFO, DS in header first
br$addINFO("INFO", "1", "Float", "INFO score of imputation")
br$addFORMAT("DS", "1", "Float", "Diploid dosage")
br$addFORMAT("AC", "1", "Integer", "Allele counts")
br$addFORMAT("STR", "1", "String", "Test String type")
print(br$header())
## set DS in FORMAT now
br$setFormatFloat("DS", ds)
## test if DS presents
expect_identical(br$formatFloat("DS"), ds)
## more tests
br$setFormatInt("AC", c(3L, 4L))
expect_false(br$setFormatStr("STR","HHH,JJJ")) ## length(s) %% nsamples != 0
expect_true(br$setFormatStr("STR","HHHJJJ")) ## length(s) %% nsamples == 0
print(br$string())
})
10 changes: 9 additions & 1 deletion tests/testthat/test-popgen.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ vcffile <- system.file("extdata", "raw.gt.vcf.gz", package="vcfppR")

test_that("both vcftable and vcfreader work for calculating heterzygosity", {
vcf <- vcftable(vcffile, vartype = "snps")
res1 <- colSums(vcf[["gt"]]==1, na.rm = TRUE)
res1 <- as.integer(colSums(vcf[["gt"]]==1, na.rm = TRUE))
br <- vcfreader$new(vcffile)
res2 <- rep(0L, br$nsamples())
while(br$variant()) {
Expand All @@ -14,3 +14,11 @@ test_that("both vcftable and vcfreader work for calculating heterzygosity", {
}
expect_identical(as.integer(res1), res2)
})

test_that("heterzygosity only counts snps", {
vcf <- vcftable(vcffile, vartype = "snps", pass = TRUE)
res1 <- as.integer(colSums(vcf[["gt"]]==1, na.rm = TRUE))
vcf <- vcfpopgen(vcffile, pass = TRUE, fun = "heterozygosity")
res2 <- vcf$hets
expect_identical(res1, res2)
})
114 changes: 114 additions & 0 deletions tests/testthat/test-vcf-reader.R
Original file line number Diff line number Diff line change
@@ -1,6 +1,117 @@
library(testthat)

vcffile <- system.file("extdata", "raw.gt.vcf.gz", package="vcfppR")
svfile <- system.file("extdata", "sv.vcf.gz", package="vcfppR")

test_that("vcfreader: constructor with vcfffile only", {
br <- vcfreader$new(vcffile)
br$variant()
expect_identical(br$chr(), "chr21")
expect_identical(br$pos(), 5030082L)
expect_identical(br$id(), "chr21:5030082:G:A")
expect_identical(br$ref(), "G")
expect_identical(br$alt(), "A")
})

test_that("vcfreader: constructor with vcfffile and region", {
br <- vcfreader$new(vcffile, "chr21:5030082-")
br$variant()
expect_identical(br$chr(), "chr21")
expect_identical(br$pos(), 5030082L)
expect_identical(br$id(), "chr21:5030082:G:A")
expect_identical(br$ref(), "G")
expect_identical(br$alt(), "A")
})

test_that("vcfreader: constructor with vcfffile, region and samples", {
br <- vcfreader$new(vcffile, "chr21:5030082-", "HG00097,HG00099")
br$variant()
expect_identical(br$genotypes(F), rep(0L, 4))
expect_identical(br$chr(), "chr21")
expect_identical(br$pos(), 5030082L)
expect_identical(br$id(), "chr21:5030082:G:A")
expect_identical(br$ref(), "G")
expect_identical(br$alt(), "A")
})

test_that("vcfreader: get FORMAT item with Float type", {
br <- vcfreader$new(vcffile, "chr21:5030347-", "HG00097,HG00099")
br$variant()
expect_identical(br$pos(), 5030347L)
expect_identical(all(is.na(br$formatFloat("AB"))), TRUE)
})

test_that("vcfreader: get FORMAT item with Integer type", {
br <- vcfreader$new(vcffile, "chr21:5030347-", "HG00097,HG00099")
br$variant()
expect_identical(br$pos(), 5030347L)
expect_identical(br$formatInt("AD"), c(4L, 0L, 16L, 0L))
})

test_that("vcfreader: get FORMAT item with String type", {
br <- vcfreader$new(svfile, "chr21:5114000-", "HG00096,HG00097")
expect_true(br$variant())
expect_identical(br$pos(), 5114000L)
expect_identical(br$formatStr("EV"), c("RD","RD"))
})

test_that("vcfreader: get FORMAT item for the unexpected and error", {
br <- vcfreader$new(vcffile, "chr21:5030347-", "HG00097,HG00099")
br$variant()
expect_identical(br$pos(), 5030347L)
expect_error(br$formatInt("AA")) ## AA not exists
expect_identical(br$formatInt("AD"), c(4L, 0L, 16L, 0L))
## use formatFloat will return unexpected values if AD in FORMAT if Integer
expect_error(expect_identical(br$formatFloat("AD"), c(4L, 0L, 16L, 0L))) ## AA exists but of Integer type
})

test_that("vcfreader: get variants type", {
br <- vcfreader$new(vcffile)
i <- 0
s <- 0
m <- 0
ms <- 0
while(br$variant()) {
if(br$isIndel()) i <- i + 1
if(br$isSV()) s <- s + 1
if(br$isMultiAllelics()) m <- m + 1
if(br$isMultiAllelicSNP()) ms <- ms + 1
}
expect_identical(i, 6)
expect_identical(s, 0)
expect_identical(m, 4)
expect_identical(ms, 3)
})

test_that("vcfreader: test variants type", {
br <- vcfreader$new(vcffile)
i1 <- 0
i2 <- 0
i3 <- 0
i4 <- 0
i5 <- 0
i6 <- 0
i7 <- 0
i8 <- 0
while(br$variant()) {
if(br$hasSNP()) i1 <- i1 + 1
if(br$hasINDEL()) i2 <- i2 + 1
if(br$hasINS()) i3 <- i3 + 1
if(br$hasDEL()) i4 <- i4 + 1
if(br$hasMNP()) i5 <- i5 + 1
if(br$hasBND()) i6 <- i6 + 1
if(br$hasOTHER()) i7 <- i7 + 1
if(br$hasOVERLAP()) i8 <- i8 + 1
}
expect_identical(i1, 66)
expect_identical(i2, 6)
expect_identical(i3, 3)
expect_identical(i4, 4)
expect_identical(i5, 0)
expect_identical(i6, 0)
expect_identical(i7, 0)
expect_identical(i8, 1)
})

test_that("vcfreader: reading variant only", {
br <- vcfreader$new(vcffile)
Expand All @@ -13,6 +124,9 @@ test_that("vcfreader: reading variant only", {
expect_equal(br$qual(), 70.06, tolerance = 1e-6)
expect_identical(br$filter(), "VQSRTrancheSNP99.80to100.00")
expect_identical(br$info(), "AC=2;AF=0.000616523;AN=3244;DP=2498;FS=0;MLEAC=1;MLEAF=0.0003083;MQ=17.07;MQ0=0;QD=17.52;SOR=3.258;VQSLOD=-32.6;culprit=MQ;AN_EUR=658;AN_EAS=548;AN_AMR=520;AN_SAS=624;AN_AFR=894;AF_EUR=0;AF_EAS=0;AF_AMR=0.00384615;AF_SAS=0;AF_AFR=0;AC_EUR=0;AC_EAS=0;AC_AMR=2;AC_SAS=0;AC_AFR=0;AC_Het_EUR=0;AC_Het_EAS=0;AC_Het_AMR=0;AC_Het_SAS=0;AC_Het_AFR=0;AC_Het=0;AC_Hom_EUR=0;AC_Hom_EAS=0;AC_Hom_AMR=2;AC_Hom_SAS=0;AC_Hom_AFR=0;AC_Hom=2;HWE_EUR=1;ExcHet_EUR=1;HWE_EAS=1;ExcHet_EAS=1;HWE_AMR=0.00192678;ExcHet_AMR=1;HWE_SAS=1;ExcHet_SAS=1;HWE_AFR=1;ExcHet_AFR=1;HWE=0.000308356;ExcHet=1;ME=0;AN_EUR_unrel=508;AN_EAS_unrel=448;AN_AMR_unrel=330;AN_SAS_unrel=484;AN_AFR_unrel=666;AF_EUR_unrel=0;AF_EAS_unrel=0;AF_AMR_unrel=0;AF_SAS_unrel=0;AF_AFR_unrel=0;AC_EUR_unrel=0;AC_EAS_unrel=0;AC_AMR_unrel=0;AC_SAS_unrel=0;AC_AFR_unrel=0;AC_Het_EUR_unrel=0;AC_Het_EAS_unrel=0;AC_Het_AMR_unrel=0;AC_Het_SAS_unrel=0;AC_Het_AFR_unrel=0;AC_Hom_EUR_unrel=0;AC_Hom_EAS_unrel=0;AC_Hom_AMR_unrel=0;AC_Hom_SAS_unrel=0;AC_Hom_AFR_unrel=0")
br$rmInfoTag("AC")
print(br$info())
print(br$infoInt("AC")) ## TODO: AC value still exists
af <- br$infoFloat("AF")
## expect_equal(0.00061652297, 0.00061652300, tolerance = 1e-6)
expect_equal(af, 0.000616523, tolerance = 1e-6)
Expand Down
12 changes: 12 additions & 0 deletions tests/testthat/test-vcf-table.R
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,18 @@ test_that("extract GT for variant with ID=chr21:5030516:G:A", {
expect_equal(res$alt, "A")
})

test_that("extract GT for all multisnps", {
res <- vcftable(vcffile, vartype = "multisnps")
expect_identical(res$ref, c("C", "C", "G"))
expect_identical(res$alt, c("G,T", "T,G", "T,*"))
})

test_that("extract GT for all multiallelics", {
res <- vcftable(vcffile, vartype = "multiallelics")
expect_identical(res$ref, c("C", "C", "CTTTTTT","G"))
expect_identical(res$alt, c("G,T", "T,G", "CTTTTT,CTTTTTTT,C", "T,*"))
})

test_that("extract AD (Integer, number=.) for variant with ID=chr21:5030247:G:A", {
res <- vcftable(vcffile, id = c("chr21:5030347:G:A"), pass = TRUE, format = "AD")
expect_equal(nrow(res$AD), 1)
Expand Down
2 changes: 2 additions & 0 deletions tests/testthat/test-vcf-writer.R
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ test_that("vcfwriter: writing variant works", {
s2 <- gsub("\n", "", br$string())
expect_identical(br$chr(), "chr20")
expect_identical(s1, s2)
s2 <- br$line()
expect_identical(s1, s2)
})


Expand Down

0 comments on commit dba1787

Please sign in to comment.