From caafbd773fd929ae2963a91599333d75273d85ff Mon Sep 17 00:00:00 2001
From: Zilong-Li <zimusen94@gmail.com>
Date: Fri, 6 Oct 2023 16:49:42 +0200
Subject: [PATCH] setRegion supports empty string

---
 .gitignore             |   3 +
 doc/latex.org          |   2 +-
 doc/paper.org          | 293 +++++++++++++++++++++++++----------------
 scripts/test-cyvcf2.py |   3 +-
 vcfpp.h                | 155 +++++++++++++++-------
 5 files changed, 294 insertions(+), 162 deletions(-)

diff --git a/.gitignore b/.gitignore
index 687813a..5329090 100644
--- a/.gitignore
+++ b/.gitignore
@@ -14,6 +14,9 @@ t.*
 a-*
 .cache
 tmp/
+test/
+*.vcf*
+*.bcf*
 test.cpp
 *.bin
 compile_commands.json
diff --git a/doc/latex.org b/doc/latex.org
index ffa1293..9ab3288 100644
--- a/doc/latex.org
+++ b/doc/latex.org
@@ -8,6 +8,6 @@
 #+latex_header: \DeclareCaptionFormat{listing}{\parbox{\textwidth}{\colorbox{gray}{\parbox{\textwidth}{#1#2#3}}\vskip-4pt}}
 #+latex_header: \captionsetup[lstlisting]{format=listing,labelfont=white,textfont=white}
 #+latex_header: \newlength\tdima \newlength\tdimb \setlength\tdima{ \fboxsep+\fboxrule} \setlength\tdimb{-\fboxsep+\fboxrule}
-#+latex_header: \lstset{frame=tlrb,xleftmargin=\tdima,xrightmargin=\tdimb, rulecolor=\color{gray}, keywordstyle=\ttfamily}
+#+latex_header: \lstset{breaklines=true,frame=tlrb,xleftmargin=\tdima,xrightmargin=\tdimb, rulecolor=\color{gray}, keywordstyle=\ttfamily}
 #+latex_header: \usepackage[backend=biber,style=authoryear-comp,date=year,sorting=nyt,sortlocale=auto,maxcitenames=1,maxbibnames=10,uniquename=init,maxitems=1,giveninits=true,terseinits=true,dashed=false,doi=true,isbn=false,url=false] {biblatex}
 #+latex_header: \addbibresource{ref.bib}
diff --git a/doc/paper.org b/doc/paper.org
index 3d8bd26..b435f54 100644
--- a/doc/paper.org
+++ b/doc/paper.org
@@ -2,7 +2,7 @@
 #+setupfile: latex.org
 #+language: en
 #+startup: show2levels indent hidestars hideblocks
-#+options: title:nil toc:nil H:4 author:nil
+#+options: title:nil toc:nil H:4 author:nil num:nil
 #+bibliography: "ref.bib"
 
 #+begin_export latex
@@ -16,20 +16,19 @@ Email: zilong.dk@gamil.com.}}
   methods to manipulate it in different programming languages. Static
   languages like C++ has the superiority of performance, and modern
   C++ standards strive to provide growing standard library for
-  developing program quickly and easily. In this work, I present
-  vcfpp, a C++ API of htslib in a single header file, providing an
-  intuitive interface to manipulate VCF/BCF files rapidly and
-  safely. An example demonstrates how minimal vcfpp code can
-  manipulate variant records rapidly. Additionally, the simplicity and
-  portability makes vcfpp working with scripting language like R
-  easily. The second example demonstrates how vcfpp can be useful to
-  Statisticians analyzing genomics variations in R. In the
-  Benchmarking, a simple R script using vcfpp API shows faster speed
-  than script using cyvcf2 when streaming variant analysis. And in a
-  two-step setting, a R script using vcfpp has faster speed of both
-  loading VCF contents and processing genotypes than the R scripts
-  using vcfR and data.table. Finally, some useful command line tools
-  using vcfpp are available.
+  developing program quickly and easily. This work presents vcfpp, a
+  C++ API of htslib in a single file, providing an intuitive interface
+  to manipulate VCF/BCF files rapidly and safely, as well as being
+  portable. Additionally, this work introduces the vcfppR package to
+  demonstrate the usage of vcfpp API in writing R script and packages
+  esaily for analyzing as well as visualizing genomic variations in
+  R. In the Benchmarking, the R script using vcfpp API shows faster
+  speed than the python script using cyvcf2 API when streaming variant
+  analysis. And in a two-step setting, the vcfppR package shows faster
+  speed of both loading VCF contents and processing genotypes than the
+  vcfR package. Finally, some useful command line tools using vcfpp
+  are available to demonstrate the easy-to-use of vcfpp in writing a
+  C++ program.
   \end{abstract}
 \end{titlepage}
 #+end_export
@@ -43,27 +42,28 @@ representing genetic variation information with complicated
 specification [cite:@danecek2011]. Later on, as the ever-growing
 data size, the BCF format is designed to query and store big data
 efficiently. The C API of htslib provide the full functionalities to
-manipulate VCF and BCF format for both compressed and uncompressed
-files. As the C API is hard to use for in-proficient programmers,
-API of other languages are invented. Existing popular libraries
-include vcfR[cite:@brian2017] for R, cyvcf2 [cite:@pedersen2017a]
-for Python, hts-nim [cite:@pedersen2018] for Nim and vcflib
-[cite:@garrison2022] for C++. All are useful for corresponding
-community, having both advantages and disadvantages. In particular,
-vcflib is both an API and a large collection of command line tools,
-with the main disadvantage being not supporting BCF files. It is
+manipulate the VCF and BCF format for both compressed and
+uncompressed files. As the C API is hard to use for in-proficient
+programmers, API of other languages are invented. Existing popular
+libraries include vcfR[cite:@brian2017] for R, cyvcf2
+[cite:@pedersen2017a] for Python, hts-nim [cite:@pedersen2018] for
+Nim and vcflib [cite:@garrison2022] for C++. All are valuable for
+corresponding community but not perfect. In particular, vcflib is
+both an API and a large collection of command line tools, with the
+primary disadvantage being not supporting the BCF format. It is
 noteworthy that many methods written in C++ designed for large
-sample size can not even input the BCF file which is tailed for big
-data like Syllbable-PBWT [cite:@wang2023]. The motivation of vcfpp
-is to offer full functionalities as htslib and provide simple and
-safe API in a single header file that can be easily integrated for
-writing scripts quickly in C++ and R.
+sample size can not even input the BCF file tailed for big data or
+the compressed VCF file like Syllbable-PBWT [cite:@wang2023]. The
+motivation of vcfpp is to offer full functionalities as htslib and
+provide simple and safe API in a single header file that can be
+easily integrated for writing program quickly in C++ and R.
 
 * Methods 
 
 vcfpp is implemented as a single header file for being easily
 integrated and compiled. There are four core class for manipulating
-uncompressed and compressed VCF/BCF as summarized in Table [[tb:class]].
+the uncompressed and compressed VCF/BCF files as summarized in Table
+[[tb:class]].
 
 #+caption: vcfpp capabilities and implemented C++ class
 #+name: tb:class
@@ -78,45 +78,46 @@ uncompressed and compressed VCF/BCF as summarized in Table [[tb:class]].
 
 * Results
 
-To demonstrate the power and performance of vcfpp, the
-following sections illustrate commonly used features of vcfpp and
-highlight the portability of vcfpp in R. Other examples and further
-details of vcfpp can be found at https://github.com/Zilong-Li/vcfpp.
+To demonstrate the power and performance of vcfpp, the following
+sections illustrate commonly used features of vcfpp and highlight
+the vcfppR package as an example of vcfpp working with R.
 
 ** C++ API
 
-In this example, we count the number of heterozygous sites for each
-sample in all records. The following code first includes a single
-vcfpp.h file (line 1), opens a compressed BCF file constrained to 3
-samples and region "chr21" (line 2), and creates a variant record
-associated with the header information in the file (line 3). Then it
-defines several types of objects to collect the results we want (line
-4-6). Taking advantage of generic templates in C++, the type of field
-like genotype can be very flexible regarding memory consumption. Then
+In this example (Listing [[list1]]), we count the number of
+heterozygous sites for each sample across all records. The
+following code first includes a single vcfpp.h file (line 1),
+opens a compressed BCF file constrained to 3 samples and region
+"chr21" (line 2), and creates a variant record associated with the
+header information in the BCF (line 3). Then it defines several
+types of objects to collect the results we want (line 4-6). Taking
+advantage of generic templates in C++, the type of field like
+genotype can be very flexible regarding memory consumption. Then
 it starts iterating over the BCF file and processes each variant
-record in the loop (line 7). We ignore variants of other types (INDEL,
-SV), or with missing genotypes, or with /QUAL/ value smaller than 9,
-while we can also retrieve more information of the variant and do
-filtering (line 9-10). Finally, we count the number of heterozygous
-variant for each diploid sample (line 11-13). The core is just 13
-lines.
+record in the loop (line 7). We ignore variants of other types
+(INDEL, SV), or with /FILTER/ not being "PASS", or with /QUAL/ value
+smaller than 9, while we can also retrieve more information of the
+variant and do filtering (line 9-10). Finally, we count the number
+of heterozygous variant for each diploid sample (line 11-13). The
+core is just 12 lines.
 
+#+name: list1
 #+caption: Counting the number of heterozygous genotypes for 3 samples on chr21
 #+attr_latex: :options captionpos=t
 #+begin_src C++ -n
 #include <vcfpp.h>
 vcfpp::BcfReader vcf("bcf.gz", "chr21", "id1,id2,id3"); 
 vcfpp::BcfRecord var(vcf.header); // create a variant object
-vector<char> gt; // genotype can be bool, char or int type
+vector<int> gt; // genotype can be bool, char or int type
 vector<int> hetsum(vcf.nsamples, 0); // store the het counts
 float hwe, exchet; // capture more INFO
 while(vcf.getNextVariant(var)){
   var.getGenotypes(gt);
   var.getINFO("HWE",hwe), var.getINFO("ExcHet",exchet);//more
-  if(!var.isSNP()||!var.isNoneMissing()||var.QUAL()<9) continue; 
+  if(!var.isSNP()||var.QUAL()<9||var.FILTER()!="PASS")continue; 
   assert(var.ploidy()==2); // make sure it is diploidy
   for(int i=0; i<gt.size()/2; i++) 
-    hetsum[i] += abs(gt[2*i+0] - gt[2*i+1]);
+    hetsum[i] += abs(gt[2*i+0] - gt[2*i+1])==1;
 }
 #+end_src
 
@@ -126,19 +127,21 @@ while(vcf.getNextVariant(var)){
 
 While vcfpp is very simple for writing a C++ program, a single C++
 header file can be easily integrated into popular script languages
-like Python and R etc. The process of finding biological insights
-from variant information always involves statistical models, and
-Statisticians prefer analyzing and visualizing data in R. In this
-example, I demonstrate how vcfpp can work with R seamlessly by
-using Rcpp [cite:@eddelbuettel2011]. With the basic knowledge of
-C++ and Rcpp, we can write such a simple C++ function to get a
-list of genotypes and its quality by only changing the output
-objects to Rcpp::List. Then we can call the /readbcf/ function to
-proceed further statistical tests in R. This exhibits the
-advantage of vcfpp for developing R packages with VCF/BCF as
-input.
-
-#+caption: vcfpp-r.cpp
+like R and Python etc. The process of finding biological insights
+from variant information always involves statistical modeling and
+visualization, which is what R designed for. Therefore, I develop
+the vcfppR package to demonstrate how vcfpp can work with R
+seamlessly by using Rcpp [cite:@eddelbuettel2011]. With the basic
+knowledge of C++ and Rcpp, we can turn the C++ code in Listing
+[[list1]] into the Rcpp function to obtain a list of the number
+heterzygous sites per sample along with the IDs (listing [[list2]]),
+which can be compiled and called in R (Listing [[list3]]). As such, we
+can further proccess and visualize the results efficiently in R,
+such as stratifying the results by populations given external
+labels of each sample and visualizing them in boxplot (supp Fig1).
+
+#+name: list2
+#+caption: vcfpp-hets.cpp
 #+attr_latex: :options captionpos=t
 #+begin_src C++
 #include <Rcpp.h>
@@ -146,37 +149,60 @@ input.
 using namespace Rcpp;
 using namespace std;
 // [[Rcpp::export]]
-List readbcf(string vcffile) {
-    vcfpp::BcfReader vcf(vcffile);
-    vcfpp::BcfRecord var(vcf.header);
-    vector<vector<int>> GT, GQ;
-    vector<int> gt, gq; // store a vector of genotype
-    while (vcf.getNextVariant(var)) {
-        var.getGenotypes(gt);
-        var.getFORMAT("GQ",gq); // can get all other tags
-        GT.push_back(gt), GQ.push_back(gq);
-    }
-    return List::create(GT, GQ); 
+List heterozygosity(std::string vcffile,
+                    std::string region,
+                    std::string samples) {
+  // here copy the lines 2-14 in listing 1.
+  return List::create(Named("samples")=vcf.header.getSamples(),
+                      Named("hets")=hetsum);
 }
 #+end_src
 
-#+caption: The R code compiles and calls the above C++ code
+#+name: list3
+#+caption: The R code compiles and calls the vcfpp-het.cpp
 #+attr_latex: :options captionpos=t
 #+begin_src R
 library(Rcpp)
-sourceCpp("vcfpp-r.cpp")
-out <- readbcf("bcf.gz")
-## next perform statistical modeling 
+sourceCpp("vcfpp-hets.cpp")
+heterozygosity("bcf.gz", "chr21","id1,id2,id3");
 #+end_src
 
+** The vcfppR package
+
+The vcfppR package is developed and powered by the vcfpp API. For
+parsing the VCF in R, the /vcftable/ function can read contents of
+the VCF into the R data type rapidly with fine control of the
+region, the samples, the variant types, the INFO / FORMAT column
+and the filters. For instance, the code in Listing 4 parses the
+read depth per variant (DP) in the raw called VCF by the 1000
+Genomes Project through the URL link, restricting to 3 samples
+with variants in "chr21:1-10000000" region of SNP type and passing
+FILTER, as well as discarding the INFO column in the returned
+list. Consequently, the visual summarization is done by using
+/barplot/ in R.
+
+#+caption: vcftable in vcfppR
+#+attr_latex: :options captionpos=t
+#+begin_src R
+library(vcfppR)
+vcffile <- "https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20201028_3202_raw_GT_with_annot/20201028_CCDG_14151_B01_GRM_WGS_2020-08-05_chr21.recalibrated_variants.vcf.gz"
+vcf <- vcftable(vcffile, region="chr21:1-10000000", samples="NA12878,HG00118,HG00119", format="DP", vartype="snps", pass = TRUE, info = FALSE)
+boxplot(vcf$dp, names=vcf$samples, ylab="Read Depth (DP)")
+#+end_src
+
+Furthermore, as characterization of the variations is an essential
+task in understanding the genome, I showcase the /vcfsummary/
+function in summarizing the variations in the VCF by reproducing
+the statistics in the 1KGP paper [cite:@byrska-bishop2022].
+
 ** Benchmarking
 
-In addition to simplicity and portability, I show how fast and
-efficient scripts using vcfpp can be. In the benchmarking, we
-perform the same task as the cyvcf2 paper, that is counting
-heterozygous genotypes per sample using code in Listing 1. As shown
-in Table [[tb:counthets]], the "test-cyvcf2.py" is $1.3\times$ slower
-than the "test-vcfpp-1.R", and both use little RAM in a streaming
+In addition to simplicity and portability, I showcase the
+performance of vcfpp and vcfppR. In the benchmarking, I performed
+the same task as the cyvcf2 paper, that is counting heterozygous
+genotypes per sample using code in Listing 1. As shown in Table
+[[tb:counthets]], the "test-cyvcf2.py" is $1.3\times$ slower than the
+"test-vcfpp-1.R", and both use little RAM in a streaming
 strategy. As R packages usually load data into tables first and
 perform analysis later, I also write a function to load whole VCF
 content in R for such two-step comparison although it is not
@@ -185,55 +211,55 @@ only $1.9\times$ slower compared to $70\times$, $85\times$ slower
 for "test-vcfR.R" and "test-fread.R" respectively. This is because
 genotypes made by both vcfR and data.table::fread are characters,
 which are very slow to parse in R despite a faster string library
-was used. In contrast, with vcfpp, integer vectors of genotypes can
-be returned to R directly for computation. Moreover, regarding the
-best practice of data analysis in R, we usually want to inspect part
-of the data table first to make further decisions. And vcfpp has the
-full functionalities of htslib that is supporting reading BCF,
-selecting samples and regions. A rapid query of VCF content can be
-achieved by passing a region parameter.
-
-#+caption: Performance of counting heterozygous genotypes per sample in the 1000 Genome Project for chromosome 21. (^) used by /sourceCpp/ function. (*) used by loading data in two-step strategy.
+was used. In contrast, with vcfpp, integer vectors of genotypes
+can be returned to R directly for computation. Moreover, regarding
+the best practice of data analysis in R, we usually want to
+inspect part of the data table first to make further
+decisions. And vcfpp has the full functionalities of htslib that
+is supporting reading BCF, selecting samples and regions. A rapid
+query of VCF content can be achieved by passing a region
+parameter.
+
+#+caption: Performance of counting heterozygous genotypes per sample in the 1000 Genome Project for chromosome 21. (*) used by loading data in two-step strategy.
 #+name: tb:counthets
 #+attr_latex: :align lllllll :placement [H]
-|-------------------+------------+-------+------------+-----------|
-| API               | Time (s)   | Ratio |   RAM (Gb) | Strategy  |
-|-------------------+------------+-------+------------+-----------|
-| vcfpp::BcfReader  | 118        |   1.0 |      0.006 | streaming |
-| vcfpp::BcfReader  | 119+5^     |   1.0 | 0.07+0.28^ | streaming |
-| cyvcf2::VCF       | 159        |   1.3 |       0.04 | streaming |
-| vcfppR::tableGT   | 164*+196   |   1.8 |         73 | two-step  |
-| vcfR::read.vcfR   | 651*+1382  |   9.9 |        105 | two-step  |
-| data.table::fread | 272*+10275 |    85 |         77 | two-step  |
-|-------------------+------------+-------+------------+-----------|
+|-------------+------------+-------+----------+-----------|
+| API/Package | Time (s)   | Ratio | RAM (Gb) | Strategy  |
+|-------------+------------+-------+----------+-----------|
+| vcfpp       | 118        |   1.0 |    0.006 | streaming |
+| vcfppR      | 119        |   1.0 |     0.07 | streaming |
+| cyvcf2      | 159        |   1.3 |     0.04 | streaming |
+| vcfppR      | 164*+196   |   1.8 |       73 | two-step  |
+| vcfR        | 651*+1382  |   9.9 |      105 | two-step  |
+| data.table  | 272*+10275 |    85 |       77 | two-step  |
+|-------------+------------+-------+----------+-----------|
 
 * Discussion
 
 I have developed vcfpp, a fast and flexible C++ API for scripting
 high-performance genetic variant analyses. Its simplicity and
 portability can be very useful for both developing packages and
-writing daily used scripts. Many existing packages written in C++
-using customized VCF parser can be replaced with vcfpp to offer more
+writing daily used scripts. Many packages written in C++ using
+customized VCF parser can be replaced with vcfpp to offer more
 functionalities. For instance, imputation software STITCH
 [cite:@davies2016] and QUILT [cite:@davies2021] are using vcfpp to
 parse large reference panel in VCF/BCF format. Also, there are some
 useful command line tools available, such as a modified
-Syllable-PBWT with compressed VCF/BCF as input.
+Syllable-PBWT with the compressed VCF/BCF as input.
 
 * Software and Code
 
-The latest release of vcfpp.h can be downloaded from
-https://github.com/Zilong-Li/vcfpp/releases. Scripts for
-Benchmarking can be found here
-https://github.com/Zilong-Li/vcfpp/tree/main/scripts. More helpful
-example functions can be found at
-https://github.com/Zilong-Li/vcfpp.
+The latest release of vcfpp.h and documentation can be found at
+https://github.com/Zilong-Li/vcfpp. Scripts for Benchmarking can be
+found at https://github.com/Zilong-Li/vcfpp/tree/main/scripts. The
+vcfppR package is available at https://github.com/Zilong-Li/vcfppR.
 
 * Acknowledgments
 
-I would like to thank Anders Albrechtsen at Copenhagen University and
-Robert W Davies at Oxford University for helpful comments. Also, this
-project is inspired by open source projects SeqLib and cyvcf2.
+I would like to thank Anders Albrechtsen at Copenhagen University
+and Robert W Davies at Oxford University for helpful comments. They
+are Statisticians working on genetics as well as R enthusiast, whom
+I work with and learn a lot from.
 
 * Funding
 
@@ -241,6 +267,41 @@ This work is supported by the Novo Nordisk 462 Foundation
 (NNF20OC0061343).
 
 #+print_bibliography:
+\newpage
+\appendix
+
+* Supplementary
+
+** Example
+
+#+caption: vcfsummary in vcfppR
+#+attr_latex: :options captionpos=t
+#+begin_src R
+library(vcfppR)
+svfile <- "https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20210124.SV_Illumina_Integration/1KGP_3202.gatksv_svtools_novelins.freeze_V3.wAF.vcf.gz"
+sv <- vcfsummary(svfile, svtype = TRUE)
+par(mfrow=c(1,3)) ## layout in plots
+allsvs <- sv$summary[-1]
+bar <- barplot(allsvs, ylim = c(0, 1.1*max(allsvs)),
+               main = "Variant Count (all SVs)")
+text(bar, allsvs+4500, paste0("n: ", allsvs))
+boxplot(sv[c("DEL","DUP", "CNV", "INS","INV","CPX","CTX")],
+        main = "SVs per Genome stratified by SV types")
+ped <- read.table("https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/20130606_g1k_3202_samples_ped_population.txt", h=T)
+ped <- ped[order(ped$Superpopulation),]
+supers <- unique(ped$Superpopulation)
+o <- lapply(supers, function(pop) {
+  id <- subset(ped, Superpopulation == pop)[,"SampleID"]
+  ord <- match(id, samples)
+  sapply(sv[-c(1,2)], "[", ord)
+})
+names(o) <- supers
+s <- as.data.frame(lapply(o, colMeans))
+barplot(colSums(s),
+        main = "SVs per Genome stratified by populations")
+#+end_src
+
+[[file:dp.pdf]]
 
 * Local setup :noexport:
 Local Variables:
diff --git a/scripts/test-cyvcf2.py b/scripts/test-cyvcf2.py
index c7b32e7..65e62ac 100644
--- a/scripts/test-cyvcf2.py
+++ b/scripts/test-cyvcf2.py
@@ -4,9 +4,10 @@
 
 vcffile = sys.argv[1]
 # vcffile = "1kGP_high_coverage_Illumina.chr21.filtered.SNV_INDEL_SV_phased_panel.vcf.gz"
-vcf = cyvcf2.VCF(vcffile)
+vcf = cyvcf2.VCF(vcffile, strict_gt=True)
 sample_counts = np.zeros(len(vcf.samples), dtype=float)
 
 for var in vcf:
+    if var.is_snp==False: continue
     sample_counts[(var.gt_types == vcf.HET)] += 1
 print(sample_counts)
diff --git a/vcfpp.h b/vcfpp.h
index b26ecc0..e19a41b 100644
--- a/vcfpp.h
+++ b/vcfpp.h
@@ -2,7 +2,7 @@
  * @file        https://github.com/Zilong-Li/vcfpp/vcfpp.h
  * @author      Zilong Li
  * @email       zilong.dk@gmail.com
- * @version     v0.2.0
+ * @version     v0.3.0
  * @breif       a single C++ file for manipulating VCF
  * Copyright (C) 2022-2023.The use of this code is governed by the LICENSE file.
  ******************************************************************************/
@@ -357,6 +357,10 @@ class BcfRecord
     int nploidy = 0; // the number of ploidy
     int nvalues = 0; // the number of values for a tag in FORMAT
 
+  public:
+    /// if there is "." in GT for the sample, then it's coded as missing (TRUE)
+    std::vector<char> isGenoMissing; 
+
   public:
     /** @brief initilize a BcfRecord object using a given BcfHeader object. */
     BcfRecord(const BcfHeader & h) : header(h)
@@ -399,7 +403,7 @@ type as noted in the other overloading function.
     {
         ndst = 0;
         ret = bcf_get_genotypes(header.hdr, line, &gts, &ndst);
-        if(ret <= 0) return false; // gt not present
+        if(ret <= 0) throw std::runtime_error("genotypes not present");
         // if nploidy is not set manually. find the max nploidy using the first variant (eg. 2) resize v as
         // max(nploidy)
         // * nsamples (ret) NOTE: if ret == nsamples and only one sample then haploid
@@ -415,6 +419,7 @@ type as noted in the other overloading function.
             nploidy = ret / nsamples;
         }
         // work with nploidy == 1, haploid?
+        isGenoMissing.assign(nsamples, 0);
         int i, j, nphased = 0;
         noneMissing = true;
         fmt = bcf_get_fmt(header.hdr, line, "GT");
@@ -426,6 +431,7 @@ type as noted in the other overloading function.
             if(typeOfGT[i] == GT_UNKN)
             {
                 noneMissing = false; // set missing as het, user should check if isNoneMissing();
+                isGenoMissing[i] = 1;
                 v[i * nploidy + 0] = 1;
                 for(j = 1; j < nploidy_cur; j++) v[i * nploidy + j] = 0;
                 continue;
@@ -460,8 +466,9 @@ type as noted in the other overloading function.
     {
         ndst = 0;
         ret = bcf_get_genotypes(header.hdr, line, &gts, &ndst);
-        if(ret <= 0) return false; // gt not present
+        if(ret <= 0) throw std::runtime_error("genotypes not present");
         v.resize(ret);
+        isGenoMissing.assign(nsamples, 0);
         nploidy = ret / nsamples;
         int i, j, nphased = 0;
         noneMissing = true;
@@ -477,6 +484,7 @@ type as noted in the other overloading function.
                 if(bcf_gt_is_missing(ptr[j]))
                 {
                     noneMissing = false;
+                    isGenoMissing[i] = 1;
                     v[i * nploidy + j] = -9;
                     continue;
                 }
@@ -860,7 +868,7 @@ type as noted in the other overloading function.
             return true;
     }
 
-    /** @brief return boolean value indicates if current variant is INDEL or not */
+    /** @brief return boolean value indicates if current variant is exclusively INDEL */
     inline bool isIndel() const
     {
         // REF has multiple allels
@@ -874,28 +882,22 @@ type as noted in the other overloading function.
         return false;
     }
 
-    /** @brief return boolean value indicates if current variant is INDEL and DELETION or not */
-    inline bool isDeletion() const
+    /** @brief return boolean value indicates if current variant is multiallelic sites */
+    inline bool isMultiAllelics() const
     {
-        if(ALT().length() > 1) return false;
-        if(!isIndel()) return false;
-        if(ALT().length() == 0)
-            return true;
-        else if(ALT()[0] == '.')
-            return true;
-        if(REF().length() > ALT().length()) return true;
-        return false;
+        if(line->n_allele <= 2) return false;
+        return true;
     }
 
-    /** @brief return boolean value indicates if current variant is multi allelic or not */
-    inline bool isMultiAllelic() const
+    /** @brief return boolean value indicates if current variant is exclusively multiallelic SNP sites */
+    inline bool isMultiAllelicSNP() const
     {
-        // REF has multiple allels
+        // skip REF with length > 1, i.e. INDEL
         if(REF().length() > 1 || line->n_allele <= 2) return false;
         for(int i = 1; i < line->n_allele; i++)
         {
             std::string snp(line->d.allele[i]);
-            if(!(snp == "A" || snp == "C" || snp == "G" || snp == "T"))
+            if(snp.length() != 1)
             {
                 return false;
             }
@@ -903,7 +905,7 @@ type as noted in the other overloading function.
         return true;
     }
 
-    /** @brief return boolean value indicates if current variant is SNP (biallelic) or not */
+    /** @brief return boolean value indicates if current variant is exclusively biallelic SNP. Note ALT=* are skipped */
     inline bool isSNP() const
     {
         // REF and ALT have multiple allels
@@ -916,6 +918,78 @@ type as noted in the other overloading function.
         return true;
     }
 
+    /** @brief return boolean value indicates if current variant has SNP type defined in vcf.h (htslib>=1.16) */
+    inline bool hasSNP() const
+    {
+        int type = bcf_has_variant_types(line, VCF_SNP, bcf_match_overlap);
+        if (type < 0) throw std::runtime_error("something wrong with variant type\n");
+        if (type == 0) return false;
+        return true;
+    }
+
+    /// return boolean value indicates if current variant has INDEL type defined in vcf.h (htslib>=1.16)
+    inline bool hasINDEL() const
+    {
+        int type = bcf_has_variant_types(line, VCF_INDEL, bcf_match_overlap);
+        if (type < 0) throw std::runtime_error("something wrong with variant type\n");
+        if (type == 0) return false;
+        return true;
+    }
+
+    /// return boolean value indicates if current variant has INS type defined in vcf.h (htslib>=1.16)
+    inline bool hasINS() const
+    {
+        int type = bcf_has_variant_types(line, VCF_INS, bcf_match_overlap);
+        if (type < 0) throw std::runtime_error("something wrong with variant type\n");
+        if (type == 0) return false;
+        return true;
+    }
+
+    /// return boolean value indicates if current variant has DEL type defined in vcf.h (htslib>=1.16)
+    inline bool hasDEL() const
+    {
+        int type = bcf_has_variant_types(line, VCF_DEL, bcf_match_overlap);
+        if (type < 0) throw std::runtime_error("something wrong with variant type\n");
+        if (type == 0) return false;
+        return true;
+    }
+
+    /// return boolean value indicates if current variant has MNP type defined in vcf.h (htslib>=1.16)
+    inline bool hasMNP() const
+    {
+        int type = bcf_has_variant_types(line, VCF_MNP, bcf_match_overlap);
+        if (type < 0) throw std::runtime_error("something wrong with variant type\n");
+        if (type == 0) return false;
+        return true;
+    }
+
+    /// return boolean value indicates if current variant has BND type defined in vcf.h (htslib>=1.16)
+    inline bool hasBND() const
+    {
+        int type = bcf_has_variant_types(line, VCF_BND, bcf_match_overlap);
+        if (type < 0) throw std::runtime_error("something wrong with variant type\n");
+        if (type == 0) return false;
+        return true;
+    }
+
+    /// return boolean value indicates if current variant has OTHER type defined in vcf.h (htslib>=1.16)
+    inline bool hasOTHER() const
+    {
+        int type = bcf_has_variant_types(line, VCF_OTHER, bcf_match_overlap);
+        if (type < 0) throw std::runtime_error("something wrong with variant type\n");
+        if (type == 0) return false;
+        return true;
+    }
+
+    /// return boolean value indicates if current variant has OVERLAP type defined in vcf.h (htslib>=1.16)
+    inline bool hasOVERLAP() const
+    {
+        int type = bcf_has_variant_types(line, VCF_OVERLAP, bcf_match_overlap);
+        if (type < 0) throw std::runtime_error("something wrong with variant type\n");
+        if (type == 0) return false;
+        return true;
+    }
+
     /** @brief return CHROM name */
     inline std::string CHROM() const
     {
@@ -970,10 +1044,10 @@ type as noted in the other overloading function.
         return std::string(line->d.allele[0]);
     }
 
-    /** @brief swap REF and ALT for biallelic */
+    /** @brief swap REF and ALT for biallelic SNP */
     inline void swap_REF_ALT()
     {
-        if(!isMultiAllelic()) std::swap(line->d.allele[0], line->d.allele[1]);
+        if(!isMultiAllelicSNP()) std::swap(line->d.allele[0], line->d.allele[1]);
     }
 
     /** @brief return raw ALT alleles as string */
@@ -1250,27 +1324,13 @@ class BcfReader
         return hts_set_threads(fp, n);
     }
 
-    /** @brief get the number of records of given region from index file */
-    uint64_t getRegionIndex(const std::string & region)
+    /** @brief get the number of records of given region */
+    uint64_t getVariantsCount(BcfRecord & r, const std::string & region)
     {
-        setRegion(region); // only one chromosome
-        int tid = 0, ret = 0, nseq = 0;
-        uint64_t records, v;
-        if(tidx)
-        {
-            tbx_seqnames(tidx, &nseq);
-        }
-        else
-        {
-            nseq = hts_idx_nseq(hidx);
-        }
-        for(tid = 0; tid < nseq; tid++)
-        {
-            ret = hts_idx_get_stat(tidx ? tidx->idx : hidx, tid, &records, &v);
-            // printf("%" PRIu64 "\n", records);
-            if(ret >= 0 && records > 0) return records;
-        }
-        return 0;
+        uint64_t c{0};
+        while(getNextVariant(r)) c++;
+        setRegion(region);  // reset the region
+        return c;
     }
 
     /**
@@ -1281,20 +1341,27 @@ class BcfReader
     {
         // 1. check and load index first
         // 2. query iterval region
+        // 3. if region is empty, use "."
         if(isEndWith(fname, "bcf") || isEndWith(fname, "bcf.gz"))
         {
             isBcf = true;
             hidx = bcf_index_load(fname.c_str());
             if(itr) bcf_itr_destroy(itr); // reset current region.
-            itr = bcf_itr_querys(hidx, header.hdr, region.c_str());
+            if (region.empty())
+                itr = bcf_itr_querys(hidx, header.hdr, ".");
+            else
+                itr = bcf_itr_querys(hidx, header.hdr, region.c_str());
         }
         else
         {
             isBcf = false;
             tidx = tbx_index_load(fname.c_str());
             assert(tidx != NULL && "error loading tabix index!");
-            if(itr) tbx_itr_destroy(itr); // reset current region.
-            itr = tbx_itr_querys(tidx, region.c_str());
+            if (itr) tbx_itr_destroy(itr);  // reset current region.
+            if (region.empty())
+                itr = tbx_itr_querys(tidx, ".");
+            else
+                itr = tbx_itr_querys(tidx, region.c_str());
             assert(itr != NULL && "no interval region found.failed!");
         }
     }