From caafbd773fd929ae2963a91599333d75273d85ff Mon Sep 17 00:00:00 2001 From: Zilong-Li Date: Fri, 6 Oct 2023 16:49:42 +0200 Subject: [PATCH] setRegion supports empty string --- .gitignore | 3 + doc/latex.org | 2 +- doc/paper.org | 293 +++++++++++++++++++++++++---------------- scripts/test-cyvcf2.py | 3 +- vcfpp.h | 155 +++++++++++++++------- 5 files changed, 294 insertions(+), 162 deletions(-) diff --git a/.gitignore b/.gitignore index 687813a..5329090 100644 --- a/.gitignore +++ b/.gitignore @@ -14,6 +14,9 @@ t.* a-* .cache tmp/ +test/ +*.vcf* +*.bcf* test.cpp *.bin compile_commands.json diff --git a/doc/latex.org b/doc/latex.org index ffa1293..9ab3288 100644 --- a/doc/latex.org +++ b/doc/latex.org @@ -8,6 +8,6 @@ #+latex_header: \DeclareCaptionFormat{listing}{\parbox{\textwidth}{\colorbox{gray}{\parbox{\textwidth}{#1#2#3}}\vskip-4pt}} #+latex_header: \captionsetup[lstlisting]{format=listing,labelfont=white,textfont=white} #+latex_header: \newlength\tdima \newlength\tdimb \setlength\tdima{ \fboxsep+\fboxrule} \setlength\tdimb{-\fboxsep+\fboxrule} -#+latex_header: \lstset{frame=tlrb,xleftmargin=\tdima,xrightmargin=\tdimb, rulecolor=\color{gray}, keywordstyle=\ttfamily} +#+latex_header: \lstset{breaklines=true,frame=tlrb,xleftmargin=\tdima,xrightmargin=\tdimb, rulecolor=\color{gray}, keywordstyle=\ttfamily} #+latex_header: \usepackage[backend=biber,style=authoryear-comp,date=year,sorting=nyt,sortlocale=auto,maxcitenames=1,maxbibnames=10,uniquename=init,maxitems=1,giveninits=true,terseinits=true,dashed=false,doi=true,isbn=false,url=false] {biblatex} #+latex_header: \addbibresource{ref.bib} diff --git a/doc/paper.org b/doc/paper.org index 3d8bd26..b435f54 100644 --- a/doc/paper.org +++ b/doc/paper.org @@ -2,7 +2,7 @@ #+setupfile: latex.org #+language: en #+startup: show2levels indent hidestars hideblocks -#+options: title:nil toc:nil H:4 author:nil +#+options: title:nil toc:nil H:4 author:nil num:nil #+bibliography: "ref.bib" #+begin_export latex @@ -16,20 +16,19 @@ Email: zilong.dk@gamil.com.}} methods to manipulate it in different programming languages. Static languages like C++ has the superiority of performance, and modern C++ standards strive to provide growing standard library for - developing program quickly and easily. In this work, I present - vcfpp, a C++ API of htslib in a single header file, providing an - intuitive interface to manipulate VCF/BCF files rapidly and - safely. An example demonstrates how minimal vcfpp code can - manipulate variant records rapidly. Additionally, the simplicity and - portability makes vcfpp working with scripting language like R - easily. The second example demonstrates how vcfpp can be useful to - Statisticians analyzing genomics variations in R. In the - Benchmarking, a simple R script using vcfpp API shows faster speed - than script using cyvcf2 when streaming variant analysis. And in a - two-step setting, a R script using vcfpp has faster speed of both - loading VCF contents and processing genotypes than the R scripts - using vcfR and data.table. Finally, some useful command line tools - using vcfpp are available. + developing program quickly and easily. This work presents vcfpp, a + C++ API of htslib in a single file, providing an intuitive interface + to manipulate VCF/BCF files rapidly and safely, as well as being + portable. Additionally, this work introduces the vcfppR package to + demonstrate the usage of vcfpp API in writing R script and packages + esaily for analyzing as well as visualizing genomic variations in + R. In the Benchmarking, the R script using vcfpp API shows faster + speed than the python script using cyvcf2 API when streaming variant + analysis. And in a two-step setting, the vcfppR package shows faster + speed of both loading VCF contents and processing genotypes than the + vcfR package. Finally, some useful command line tools using vcfpp + are available to demonstrate the easy-to-use of vcfpp in writing a + C++ program. \end{abstract} \end{titlepage} #+end_export @@ -43,27 +42,28 @@ representing genetic variation information with complicated specification [cite:@danecek2011]. Later on, as the ever-growing data size, the BCF format is designed to query and store big data efficiently. The C API of htslib provide the full functionalities to -manipulate VCF and BCF format for both compressed and uncompressed -files. As the C API is hard to use for in-proficient programmers, -API of other languages are invented. Existing popular libraries -include vcfR[cite:@brian2017] for R, cyvcf2 [cite:@pedersen2017a] -for Python, hts-nim [cite:@pedersen2018] for Nim and vcflib -[cite:@garrison2022] for C++. All are useful for corresponding -community, having both advantages and disadvantages. In particular, -vcflib is both an API and a large collection of command line tools, -with the main disadvantage being not supporting BCF files. It is +manipulate the VCF and BCF format for both compressed and +uncompressed files. As the C API is hard to use for in-proficient +programmers, API of other languages are invented. Existing popular +libraries include vcfR[cite:@brian2017] for R, cyvcf2 +[cite:@pedersen2017a] for Python, hts-nim [cite:@pedersen2018] for +Nim and vcflib [cite:@garrison2022] for C++. All are valuable for +corresponding community but not perfect. In particular, vcflib is +both an API and a large collection of command line tools, with the +primary disadvantage being not supporting the BCF format. It is noteworthy that many methods written in C++ designed for large -sample size can not even input the BCF file which is tailed for big -data like Syllbable-PBWT [cite:@wang2023]. The motivation of vcfpp -is to offer full functionalities as htslib and provide simple and -safe API in a single header file that can be easily integrated for -writing scripts quickly in C++ and R. +sample size can not even input the BCF file tailed for big data or +the compressed VCF file like Syllbable-PBWT [cite:@wang2023]. The +motivation of vcfpp is to offer full functionalities as htslib and +provide simple and safe API in a single header file that can be +easily integrated for writing program quickly in C++ and R. * Methods vcfpp is implemented as a single header file for being easily integrated and compiled. There are four core class for manipulating -uncompressed and compressed VCF/BCF as summarized in Table [[tb:class]]. +the uncompressed and compressed VCF/BCF files as summarized in Table +[[tb:class]]. #+caption: vcfpp capabilities and implemented C++ class #+name: tb:class @@ -78,45 +78,46 @@ uncompressed and compressed VCF/BCF as summarized in Table [[tb:class]]. * Results -To demonstrate the power and performance of vcfpp, the -following sections illustrate commonly used features of vcfpp and -highlight the portability of vcfpp in R. Other examples and further -details of vcfpp can be found at https://github.com/Zilong-Li/vcfpp. +To demonstrate the power and performance of vcfpp, the following +sections illustrate commonly used features of vcfpp and highlight +the vcfppR package as an example of vcfpp working with R. ** C++ API -In this example, we count the number of heterozygous sites for each -sample in all records. The following code first includes a single -vcfpp.h file (line 1), opens a compressed BCF file constrained to 3 -samples and region "chr21" (line 2), and creates a variant record -associated with the header information in the file (line 3). Then it -defines several types of objects to collect the results we want (line -4-6). Taking advantage of generic templates in C++, the type of field -like genotype can be very flexible regarding memory consumption. Then +In this example (Listing [[list1]]), we count the number of +heterozygous sites for each sample across all records. The +following code first includes a single vcfpp.h file (line 1), +opens a compressed BCF file constrained to 3 samples and region +"chr21" (line 2), and creates a variant record associated with the +header information in the BCF (line 3). Then it defines several +types of objects to collect the results we want (line 4-6). Taking +advantage of generic templates in C++, the type of field like +genotype can be very flexible regarding memory consumption. Then it starts iterating over the BCF file and processes each variant -record in the loop (line 7). We ignore variants of other types (INDEL, -SV), or with missing genotypes, or with /QUAL/ value smaller than 9, -while we can also retrieve more information of the variant and do -filtering (line 9-10). Finally, we count the number of heterozygous -variant for each diploid sample (line 11-13). The core is just 13 -lines. +record in the loop (line 7). We ignore variants of other types +(INDEL, SV), or with /FILTER/ not being "PASS", or with /QUAL/ value +smaller than 9, while we can also retrieve more information of the +variant and do filtering (line 9-10). Finally, we count the number +of heterozygous variant for each diploid sample (line 11-13). The +core is just 12 lines. +#+name: list1 #+caption: Counting the number of heterozygous genotypes for 3 samples on chr21 #+attr_latex: :options captionpos=t #+begin_src C++ -n #include vcfpp::BcfReader vcf("bcf.gz", "chr21", "id1,id2,id3"); vcfpp::BcfRecord var(vcf.header); // create a variant object -vector gt; // genotype can be bool, char or int type +vector gt; // genotype can be bool, char or int type vector hetsum(vcf.nsamples, 0); // store the het counts float hwe, exchet; // capture more INFO while(vcf.getNextVariant(var)){ var.getGenotypes(gt); var.getINFO("HWE",hwe), var.getINFO("ExcHet",exchet);//more - if(!var.isSNP()||!var.isNoneMissing()||var.QUAL()<9) continue; + if(!var.isSNP()||var.QUAL()<9||var.FILTER()!="PASS")continue; assert(var.ploidy()==2); // make sure it is diploidy for(int i=0; i @@ -146,37 +149,60 @@ input. using namespace Rcpp; using namespace std; // [[Rcpp::export]] -List readbcf(string vcffile) { - vcfpp::BcfReader vcf(vcffile); - vcfpp::BcfRecord var(vcf.header); - vector> GT, GQ; - vector gt, gq; // store a vector of genotype - while (vcf.getNextVariant(var)) { - var.getGenotypes(gt); - var.getFORMAT("GQ",gq); // can get all other tags - GT.push_back(gt), GQ.push_back(gq); - } - return List::create(GT, GQ); +List heterozygosity(std::string vcffile, + std::string region, + std::string samples) { + // here copy the lines 2-14 in listing 1. + return List::create(Named("samples")=vcf.header.getSamples(), + Named("hets")=hetsum); } #+end_src -#+caption: The R code compiles and calls the above C++ code +#+name: list3 +#+caption: The R code compiles and calls the vcfpp-het.cpp #+attr_latex: :options captionpos=t #+begin_src R library(Rcpp) -sourceCpp("vcfpp-r.cpp") -out <- readbcf("bcf.gz") -## next perform statistical modeling +sourceCpp("vcfpp-hets.cpp") +heterozygosity("bcf.gz", "chr21","id1,id2,id3"); #+end_src +** The vcfppR package + +The vcfppR package is developed and powered by the vcfpp API. For +parsing the VCF in R, the /vcftable/ function can read contents of +the VCF into the R data type rapidly with fine control of the +region, the samples, the variant types, the INFO / FORMAT column +and the filters. For instance, the code in Listing 4 parses the +read depth per variant (DP) in the raw called VCF by the 1000 +Genomes Project through the URL link, restricting to 3 samples +with variants in "chr21:1-10000000" region of SNP type and passing +FILTER, as well as discarding the INFO column in the returned +list. Consequently, the visual summarization is done by using +/barplot/ in R. + +#+caption: vcftable in vcfppR +#+attr_latex: :options captionpos=t +#+begin_src R +library(vcfppR) +vcffile <- "https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20201028_3202_raw_GT_with_annot/20201028_CCDG_14151_B01_GRM_WGS_2020-08-05_chr21.recalibrated_variants.vcf.gz" +vcf <- vcftable(vcffile, region="chr21:1-10000000", samples="NA12878,HG00118,HG00119", format="DP", vartype="snps", pass = TRUE, info = FALSE) +boxplot(vcf$dp, names=vcf$samples, ylab="Read Depth (DP)") +#+end_src + +Furthermore, as characterization of the variations is an essential +task in understanding the genome, I showcase the /vcfsummary/ +function in summarizing the variations in the VCF by reproducing +the statistics in the 1KGP paper [cite:@byrska-bishop2022]. + ** Benchmarking -In addition to simplicity and portability, I show how fast and -efficient scripts using vcfpp can be. In the benchmarking, we -perform the same task as the cyvcf2 paper, that is counting -heterozygous genotypes per sample using code in Listing 1. As shown -in Table [[tb:counthets]], the "test-cyvcf2.py" is $1.3\times$ slower -than the "test-vcfpp-1.R", and both use little RAM in a streaming +In addition to simplicity and portability, I showcase the +performance of vcfpp and vcfppR. In the benchmarking, I performed +the same task as the cyvcf2 paper, that is counting heterozygous +genotypes per sample using code in Listing 1. As shown in Table +[[tb:counthets]], the "test-cyvcf2.py" is $1.3\times$ slower than the +"test-vcfpp-1.R", and both use little RAM in a streaming strategy. As R packages usually load data into tables first and perform analysis later, I also write a function to load whole VCF content in R for such two-step comparison although it is not @@ -185,55 +211,55 @@ only $1.9\times$ slower compared to $70\times$, $85\times$ slower for "test-vcfR.R" and "test-fread.R" respectively. This is because genotypes made by both vcfR and data.table::fread are characters, which are very slow to parse in R despite a faster string library -was used. In contrast, with vcfpp, integer vectors of genotypes can -be returned to R directly for computation. Moreover, regarding the -best practice of data analysis in R, we usually want to inspect part -of the data table first to make further decisions. And vcfpp has the -full functionalities of htslib that is supporting reading BCF, -selecting samples and regions. A rapid query of VCF content can be -achieved by passing a region parameter. - -#+caption: Performance of counting heterozygous genotypes per sample in the 1000 Genome Project for chromosome 21. (^) used by /sourceCpp/ function. (*) used by loading data in two-step strategy. +was used. In contrast, with vcfpp, integer vectors of genotypes +can be returned to R directly for computation. Moreover, regarding +the best practice of data analysis in R, we usually want to +inspect part of the data table first to make further +decisions. And vcfpp has the full functionalities of htslib that +is supporting reading BCF, selecting samples and regions. A rapid +query of VCF content can be achieved by passing a region +parameter. + +#+caption: Performance of counting heterozygous genotypes per sample in the 1000 Genome Project for chromosome 21. (*) used by loading data in two-step strategy. #+name: tb:counthets #+attr_latex: :align lllllll :placement [H] -|-------------------+------------+-------+------------+-----------| -| API | Time (s) | Ratio | RAM (Gb) | Strategy | -|-------------------+------------+-------+------------+-----------| -| vcfpp::BcfReader | 118 | 1.0 | 0.006 | streaming | -| vcfpp::BcfReader | 119+5^ | 1.0 | 0.07+0.28^ | streaming | -| cyvcf2::VCF | 159 | 1.3 | 0.04 | streaming | -| vcfppR::tableGT | 164*+196 | 1.8 | 73 | two-step | -| vcfR::read.vcfR | 651*+1382 | 9.9 | 105 | two-step | -| data.table::fread | 272*+10275 | 85 | 77 | two-step | -|-------------------+------------+-------+------------+-----------| +|-------------+------------+-------+----------+-----------| +| API/Package | Time (s) | Ratio | RAM (Gb) | Strategy | +|-------------+------------+-------+----------+-----------| +| vcfpp | 118 | 1.0 | 0.006 | streaming | +| vcfppR | 119 | 1.0 | 0.07 | streaming | +| cyvcf2 | 159 | 1.3 | 0.04 | streaming | +| vcfppR | 164*+196 | 1.8 | 73 | two-step | +| vcfR | 651*+1382 | 9.9 | 105 | two-step | +| data.table | 272*+10275 | 85 | 77 | two-step | +|-------------+------------+-------+----------+-----------| * Discussion I have developed vcfpp, a fast and flexible C++ API for scripting high-performance genetic variant analyses. Its simplicity and portability can be very useful for both developing packages and -writing daily used scripts. Many existing packages written in C++ -using customized VCF parser can be replaced with vcfpp to offer more +writing daily used scripts. Many packages written in C++ using +customized VCF parser can be replaced with vcfpp to offer more functionalities. For instance, imputation software STITCH [cite:@davies2016] and QUILT [cite:@davies2021] are using vcfpp to parse large reference panel in VCF/BCF format. Also, there are some useful command line tools available, such as a modified -Syllable-PBWT with compressed VCF/BCF as input. +Syllable-PBWT with the compressed VCF/BCF as input. * Software and Code -The latest release of vcfpp.h can be downloaded from -https://github.com/Zilong-Li/vcfpp/releases. Scripts for -Benchmarking can be found here -https://github.com/Zilong-Li/vcfpp/tree/main/scripts. More helpful -example functions can be found at -https://github.com/Zilong-Li/vcfpp. +The latest release of vcfpp.h and documentation can be found at +https://github.com/Zilong-Li/vcfpp. Scripts for Benchmarking can be +found at https://github.com/Zilong-Li/vcfpp/tree/main/scripts. The +vcfppR package is available at https://github.com/Zilong-Li/vcfppR. * Acknowledgments -I would like to thank Anders Albrechtsen at Copenhagen University and -Robert W Davies at Oxford University for helpful comments. Also, this -project is inspired by open source projects SeqLib and cyvcf2. +I would like to thank Anders Albrechtsen at Copenhagen University +and Robert W Davies at Oxford University for helpful comments. They +are Statisticians working on genetics as well as R enthusiast, whom +I work with and learn a lot from. * Funding @@ -241,6 +267,41 @@ This work is supported by the Novo Nordisk 462 Foundation (NNF20OC0061343). #+print_bibliography: +\newpage +\appendix + +* Supplementary + +** Example + +#+caption: vcfsummary in vcfppR +#+attr_latex: :options captionpos=t +#+begin_src R +library(vcfppR) +svfile <- "https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20210124.SV_Illumina_Integration/1KGP_3202.gatksv_svtools_novelins.freeze_V3.wAF.vcf.gz" +sv <- vcfsummary(svfile, svtype = TRUE) +par(mfrow=c(1,3)) ## layout in plots +allsvs <- sv$summary[-1] +bar <- barplot(allsvs, ylim = c(0, 1.1*max(allsvs)), + main = "Variant Count (all SVs)") +text(bar, allsvs+4500, paste0("n: ", allsvs)) +boxplot(sv[c("DEL","DUP", "CNV", "INS","INV","CPX","CTX")], + main = "SVs per Genome stratified by SV types") +ped <- read.table("https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/20130606_g1k_3202_samples_ped_population.txt", h=T) +ped <- ped[order(ped$Superpopulation),] +supers <- unique(ped$Superpopulation) +o <- lapply(supers, function(pop) { + id <- subset(ped, Superpopulation == pop)[,"SampleID"] + ord <- match(id, samples) + sapply(sv[-c(1,2)], "[", ord) +}) +names(o) <- supers +s <- as.data.frame(lapply(o, colMeans)) +barplot(colSums(s), + main = "SVs per Genome stratified by populations") +#+end_src + +[[file:dp.pdf]] * Local setup :noexport: Local Variables: diff --git a/scripts/test-cyvcf2.py b/scripts/test-cyvcf2.py index c7b32e7..65e62ac 100644 --- a/scripts/test-cyvcf2.py +++ b/scripts/test-cyvcf2.py @@ -4,9 +4,10 @@ vcffile = sys.argv[1] # vcffile = "1kGP_high_coverage_Illumina.chr21.filtered.SNV_INDEL_SV_phased_panel.vcf.gz" -vcf = cyvcf2.VCF(vcffile) +vcf = cyvcf2.VCF(vcffile, strict_gt=True) sample_counts = np.zeros(len(vcf.samples), dtype=float) for var in vcf: + if var.is_snp==False: continue sample_counts[(var.gt_types == vcf.HET)] += 1 print(sample_counts) diff --git a/vcfpp.h b/vcfpp.h index b26ecc0..e19a41b 100644 --- a/vcfpp.h +++ b/vcfpp.h @@ -2,7 +2,7 @@ * @file https://github.com/Zilong-Li/vcfpp/vcfpp.h * @author Zilong Li * @email zilong.dk@gmail.com - * @version v0.2.0 + * @version v0.3.0 * @breif a single C++ file for manipulating VCF * Copyright (C) 2022-2023.The use of this code is governed by the LICENSE file. ******************************************************************************/ @@ -357,6 +357,10 @@ class BcfRecord int nploidy = 0; // the number of ploidy int nvalues = 0; // the number of values for a tag in FORMAT + public: + /// if there is "." in GT for the sample, then it's coded as missing (TRUE) + std::vector isGenoMissing; + public: /** @brief initilize a BcfRecord object using a given BcfHeader object. */ BcfRecord(const BcfHeader & h) : header(h) @@ -399,7 +403,7 @@ type as noted in the other overloading function. { ndst = 0; ret = bcf_get_genotypes(header.hdr, line, >s, &ndst); - if(ret <= 0) return false; // gt not present + if(ret <= 0) throw std::runtime_error("genotypes not present"); // if nploidy is not set manually. find the max nploidy using the first variant (eg. 2) resize v as // max(nploidy) // * nsamples (ret) NOTE: if ret == nsamples and only one sample then haploid @@ -415,6 +419,7 @@ type as noted in the other overloading function. nploidy = ret / nsamples; } // work with nploidy == 1, haploid? + isGenoMissing.assign(nsamples, 0); int i, j, nphased = 0; noneMissing = true; fmt = bcf_get_fmt(header.hdr, line, "GT"); @@ -426,6 +431,7 @@ type as noted in the other overloading function. if(typeOfGT[i] == GT_UNKN) { noneMissing = false; // set missing as het, user should check if isNoneMissing(); + isGenoMissing[i] = 1; v[i * nploidy + 0] = 1; for(j = 1; j < nploidy_cur; j++) v[i * nploidy + j] = 0; continue; @@ -460,8 +466,9 @@ type as noted in the other overloading function. { ndst = 0; ret = bcf_get_genotypes(header.hdr, line, >s, &ndst); - if(ret <= 0) return false; // gt not present + if(ret <= 0) throw std::runtime_error("genotypes not present"); v.resize(ret); + isGenoMissing.assign(nsamples, 0); nploidy = ret / nsamples; int i, j, nphased = 0; noneMissing = true; @@ -477,6 +484,7 @@ type as noted in the other overloading function. if(bcf_gt_is_missing(ptr[j])) { noneMissing = false; + isGenoMissing[i] = 1; v[i * nploidy + j] = -9; continue; } @@ -860,7 +868,7 @@ type as noted in the other overloading function. return true; } - /** @brief return boolean value indicates if current variant is INDEL or not */ + /** @brief return boolean value indicates if current variant is exclusively INDEL */ inline bool isIndel() const { // REF has multiple allels @@ -874,28 +882,22 @@ type as noted in the other overloading function. return false; } - /** @brief return boolean value indicates if current variant is INDEL and DELETION or not */ - inline bool isDeletion() const + /** @brief return boolean value indicates if current variant is multiallelic sites */ + inline bool isMultiAllelics() const { - if(ALT().length() > 1) return false; - if(!isIndel()) return false; - if(ALT().length() == 0) - return true; - else if(ALT()[0] == '.') - return true; - if(REF().length() > ALT().length()) return true; - return false; + if(line->n_allele <= 2) return false; + return true; } - /** @brief return boolean value indicates if current variant is multi allelic or not */ - inline bool isMultiAllelic() const + /** @brief return boolean value indicates if current variant is exclusively multiallelic SNP sites */ + inline bool isMultiAllelicSNP() const { - // REF has multiple allels + // skip REF with length > 1, i.e. INDEL if(REF().length() > 1 || line->n_allele <= 2) return false; for(int i = 1; i < line->n_allele; i++) { std::string snp(line->d.allele[i]); - if(!(snp == "A" || snp == "C" || snp == "G" || snp == "T")) + if(snp.length() != 1) { return false; } @@ -903,7 +905,7 @@ type as noted in the other overloading function. return true; } - /** @brief return boolean value indicates if current variant is SNP (biallelic) or not */ + /** @brief return boolean value indicates if current variant is exclusively biallelic SNP. Note ALT=* are skipped */ inline bool isSNP() const { // REF and ALT have multiple allels @@ -916,6 +918,78 @@ type as noted in the other overloading function. return true; } + /** @brief return boolean value indicates if current variant has SNP type defined in vcf.h (htslib>=1.16) */ + inline bool hasSNP() const + { + int type = bcf_has_variant_types(line, VCF_SNP, bcf_match_overlap); + if (type < 0) throw std::runtime_error("something wrong with variant type\n"); + if (type == 0) return false; + return true; + } + + /// return boolean value indicates if current variant has INDEL type defined in vcf.h (htslib>=1.16) + inline bool hasINDEL() const + { + int type = bcf_has_variant_types(line, VCF_INDEL, bcf_match_overlap); + if (type < 0) throw std::runtime_error("something wrong with variant type\n"); + if (type == 0) return false; + return true; + } + + /// return boolean value indicates if current variant has INS type defined in vcf.h (htslib>=1.16) + inline bool hasINS() const + { + int type = bcf_has_variant_types(line, VCF_INS, bcf_match_overlap); + if (type < 0) throw std::runtime_error("something wrong with variant type\n"); + if (type == 0) return false; + return true; + } + + /// return boolean value indicates if current variant has DEL type defined in vcf.h (htslib>=1.16) + inline bool hasDEL() const + { + int type = bcf_has_variant_types(line, VCF_DEL, bcf_match_overlap); + if (type < 0) throw std::runtime_error("something wrong with variant type\n"); + if (type == 0) return false; + return true; + } + + /// return boolean value indicates if current variant has MNP type defined in vcf.h (htslib>=1.16) + inline bool hasMNP() const + { + int type = bcf_has_variant_types(line, VCF_MNP, bcf_match_overlap); + if (type < 0) throw std::runtime_error("something wrong with variant type\n"); + if (type == 0) return false; + return true; + } + + /// return boolean value indicates if current variant has BND type defined in vcf.h (htslib>=1.16) + inline bool hasBND() const + { + int type = bcf_has_variant_types(line, VCF_BND, bcf_match_overlap); + if (type < 0) throw std::runtime_error("something wrong with variant type\n"); + if (type == 0) return false; + return true; + } + + /// return boolean value indicates if current variant has OTHER type defined in vcf.h (htslib>=1.16) + inline bool hasOTHER() const + { + int type = bcf_has_variant_types(line, VCF_OTHER, bcf_match_overlap); + if (type < 0) throw std::runtime_error("something wrong with variant type\n"); + if (type == 0) return false; + return true; + } + + /// return boolean value indicates if current variant has OVERLAP type defined in vcf.h (htslib>=1.16) + inline bool hasOVERLAP() const + { + int type = bcf_has_variant_types(line, VCF_OVERLAP, bcf_match_overlap); + if (type < 0) throw std::runtime_error("something wrong with variant type\n"); + if (type == 0) return false; + return true; + } + /** @brief return CHROM name */ inline std::string CHROM() const { @@ -970,10 +1044,10 @@ type as noted in the other overloading function. return std::string(line->d.allele[0]); } - /** @brief swap REF and ALT for biallelic */ + /** @brief swap REF and ALT for biallelic SNP */ inline void swap_REF_ALT() { - if(!isMultiAllelic()) std::swap(line->d.allele[0], line->d.allele[1]); + if(!isMultiAllelicSNP()) std::swap(line->d.allele[0], line->d.allele[1]); } /** @brief return raw ALT alleles as string */ @@ -1250,27 +1324,13 @@ class BcfReader return hts_set_threads(fp, n); } - /** @brief get the number of records of given region from index file */ - uint64_t getRegionIndex(const std::string & region) + /** @brief get the number of records of given region */ + uint64_t getVariantsCount(BcfRecord & r, const std::string & region) { - setRegion(region); // only one chromosome - int tid = 0, ret = 0, nseq = 0; - uint64_t records, v; - if(tidx) - { - tbx_seqnames(tidx, &nseq); - } - else - { - nseq = hts_idx_nseq(hidx); - } - for(tid = 0; tid < nseq; tid++) - { - ret = hts_idx_get_stat(tidx ? tidx->idx : hidx, tid, &records, &v); - // printf("%" PRIu64 "\n", records); - if(ret >= 0 && records > 0) return records; - } - return 0; + uint64_t c{0}; + while(getNextVariant(r)) c++; + setRegion(region); // reset the region + return c; } /** @@ -1281,20 +1341,27 @@ class BcfReader { // 1. check and load index first // 2. query iterval region + // 3. if region is empty, use "." if(isEndWith(fname, "bcf") || isEndWith(fname, "bcf.gz")) { isBcf = true; hidx = bcf_index_load(fname.c_str()); if(itr) bcf_itr_destroy(itr); // reset current region. - itr = bcf_itr_querys(hidx, header.hdr, region.c_str()); + if (region.empty()) + itr = bcf_itr_querys(hidx, header.hdr, "."); + else + itr = bcf_itr_querys(hidx, header.hdr, region.c_str()); } else { isBcf = false; tidx = tbx_index_load(fname.c_str()); assert(tidx != NULL && "error loading tabix index!"); - if(itr) tbx_itr_destroy(itr); // reset current region. - itr = tbx_itr_querys(tidx, region.c_str()); + if (itr) tbx_itr_destroy(itr); // reset current region. + if (region.empty()) + itr = tbx_itr_querys(tidx, "."); + else + itr = tbx_itr_querys(tidx, region.c_str()); assert(itr != NULL && "no interval region found.failed!"); } }