Skip to content

Commit

Permalink
update scripts and python example
Browse files Browse the repository at this point in the history
  • Loading branch information
Zilong-Li committed Dec 12, 2023
1 parent 3253ce3 commit c9f75f2
Show file tree
Hide file tree
Showing 9 changed files with 379 additions and 126 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ test.cpp
*.bin
compile_commands.json
*.o
*log
*llog*
*gz*
*dSYM
*.so
32 changes: 32 additions & 0 deletions Pybind11/py_example.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include "../vcfpp.h"

namespace py = pybind11;

using namespace vcfpp;
using namespace std;

vector<int> heterozygosity(std::string vcffile, std::string region = "", std::string samples = "")
{
BcfReader vcf(vcffile, region, samples);
BcfRecord var(vcf.header); // construct a variant record
vector<int> gt; // genotype can be bool, char or int type
vector<int> hetsum(vcf.nsamples, 0);
while(vcf.getNextVariant(var))
{
var.getGenotypes(gt);
// analyze SNP variant with no genotype missingness
if(!var.isSNP()) continue; // analyze SNPs only
assert(var.ploidy() == 2); // make sure it is diploidy
for(size_t i = 0; i < gt.size() / 2; i++) hetsum[i] += abs(gt[2 * i + 0] - gt[2 * i + 1]) == 1;
}
return hetsum;
}

PYBIND11_MODULE(py_example, m)
{
m.doc() = "pybind11 example plugin"; // optional module docstring

m.def("heterozygosity", &heterozygosity, "A function that calculates the heterozygosity for each sample in the VCF");
}
23 changes: 23 additions & 0 deletions Pybind11/readme.org
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
#+title: How to use vcfpp in Python
#+author: Zilong Li
#+language: en

* Install pybind11 and htslib in conda

#+begin_src shell
conda install -c conda-forge pybind11
conda install -c bioconda htslib
#+end_src

* Compiling the py_example.cpp

#+begin_src shell
g++ -O3 -Wall -shared -std=c++11 -fPIC $(python3 -m pybind11 --includes) example.cpp -o example$(python3-config --extension-suffix) -I${CONDA_PREFIX}/include -L${CONDA_PREFIX}/lib -lhts
#+end_src

* Call the C++ function in ~Python~

#+begin_src python
import py_example
ret = py_example.heterzygosity(vcffile, region, samples)
#+end_src
6 changes: 6 additions & 0 deletions doc/example.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
library(vcfppR)

## Listing 4
vcffile <- "https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20201028_3202_raw_GT_with_annot/20201028_CCDG_14151_B01_GRM_WGS_2020-08-05_chr21.recalibrated_variants.vcf.gz"
vcf <- vcftable(vcffile, region="chr21:1-10000000", samples="NA12878,HG00118,HG00119", format="DP", vartype="snps", pass = TRUE, info = FALSE)
boxplot(vcf$dp, names=vcf$samples, ylab="Read Depth (DP)")
238 changes: 126 additions & 112 deletions doc/paper.org

Large diffs are not rendered by default.

158 changes: 158 additions & 0 deletions doc/ref.bib
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
@article{danecek2011,
title = {The variant call format and VCFtools},
author = {Danecek, Petr and Auton, Adam and Abecasis, Goncalo and Albers, Cornelis A. and Banks, Eric and DePristo, Mark A. and Handsaker, Robert E. and Lunter, Gerton and Marth, Gabor T. and Sherry, Stephen T. and McVean, Gilean and Durbin, Richard and {1000 Genomes Project Analysis Group}},
date = {2011-08-01},
journaltitle = {Bioinformatics (Oxford, England)},
shortjournal = {Bioinformatics},
volume = {27},
number = {15},
eprint = {21653522},
eprinttype = {pmid},
pages = {2156--2158},
issn = {1367-4811},
doi = {10.1093/bioinformatics/btr330},
langid = {english},
pmcid = {PMC3137218},
file = {/Users/zilong/Library/CloudStorage/Dropbox/Zll/Datahub/Literatures/Bioinformatics/DanecekP_2011/Danecek_2011_The variant call format and VCFtools.pdf}
}

@article{brian2017,
title = {vcfr: a package to manipulate and visualize variant call format data in R},
shorttitle = {vcfr},
author = {Brian, J. Knaus and Niklaus, J. Gr\"unwald},
date = {2017},
journaltitle = {Molecular Ecology Resources},
volume = {17},
number = {1},
pages = {44--53},
issn = {1755-0998},
doi = {10.1111/1755-0998.12549},
langid = {english},
file = {/Users/zilong/Library/CloudStorage/Dropbox/Zll/Datahub/Literatures/Bioinformatics/KnausB_2017/Knaus_2017_vcfr.pdf}
}

@article{pedersen2017a,
title = {cyvcf2: fast, flexible variant analysis with Python},
shorttitle = {cyvcf2},
author = {Pedersen, Brent S and Quinlan, Aaron R},
date = {2017-06-15},
journaltitle = {Bioinformatics},
shortjournal = {Bioinformatics},
volume = {33},
number = {12},
pages = {1867--1869},
issn = {1367-4803},
doi = {10.1093/bioinformatics/btx057},
file = {/Users/zilong/Library/CloudStorage/Dropbox/Zll/Datahub/Literatures/Bioinformatics/PedersenB_2017/Pedersen_2017_cyvcf2.pdf}
}

@article{pedersen2018,
ids = {pedersen2018a},
title = {hts-nim: scripting high-performance genomic analyses},
shorttitle = {hts-nim},
author = {Pedersen, Brent S and Quinlan, Aaron R},
date = {2018-10-01},
journaltitle = {Bioinformatics},
shortjournal = {Bioinformatics},
volume = {34},
number = {19},
pages = {3387--3389},
issn = {1367-4803},
doi = {10.1093/bioinformatics/bty358},
file = {/Users/zilong/Library/CloudStorage/Dropbox/Zll/Datahub/Literatures/Bioinformatics/PedersenB_2018/Pedersen_2018_hts-nim.pdf;/Users/zilong/Library/CloudStorage/Dropbox/Zll/Datahub/Literatures/Bioinformatics/PedersenB_2018/Pedersen_2018_hts-nim2.pdf}
}

@article{garrison2022,
title = {A spectrum of free software tools for processing the VCF variant call format: vcflib, bio-vcf, cyvcf2, hts-nim and slivar},
shorttitle = {A spectrum of free software tools for processing the VCF variant call format},
author = {Garrison, Erik and Kronenberg, Zev N. and Dawson, Eric T. and Pedersen, Brent S. and Prins, Pjotr},
date = {2022-05-31},
journaltitle = {PLOS Computational Biology},
shortjournal = {PLOS Computational Biology},
volume = {18},
number = {5},
pages = {e1009123},
publisher = {Public Library of Science},
issn = {1553-7358},
doi = {10.1371/journal.pcbi.1009123},
langid = {english},
file = {/Users/zilong/Library/CloudStorage/Dropbox/Zll/Datahub/Literatures/Bioinformatics/GarrisonE_2022/Garrison_2022_A spectrum of free software tools for processing the VCF variant call format.pdf}
}

@article{wang2023,
title = {Syllable-PBWT for space-efficient haplotype long-match query},
author = {Wang, Victor and Naseri, Ardalan and Zhang, Shaojie and Zhi, Degui},
date = {2023-01-01},
journaltitle = {Bioinformatics},
shortjournal = {Bioinformatics},
volume = {39},
number = {1},
pages = {btac734},
issn = {1367-4811},
doi = {10.1093/bioinformatics/btac734},
file = {/Users/zilong/Library/CloudStorage/Dropbox/Zll/Datahub/Literatures/Bioinformatics/WangV_2023/Wang_2023_Syllable-PBWT for space-efficient haplotype long-match query.pdf}
}

@article{eddelbuettel2011,
title = {Rcpp: Seamless R and C++ Integration},
shorttitle = {Rcpp},
author = {Eddelbuettel, Dirk and Francois, Romain},
date = {2011-04-13},
journaltitle = {Journal of Statistical Software},
volume = {40},
pages = {1--18},
issn = {1548-7660},
doi = {10.18637/jss.v040.i08},
langid = {english},
file = {/Users/zilong/Library/CloudStorage/Dropbox/Zll/Datahub/Literatures/Bioinformatics/EddelbuettelD_2011/Eddelbuettel_2011_Rcpp.pdf}
}

@article{byrska-bishop2022,
title = {High-coverage whole-genome sequencing of the expanded 1000 Genomes Project cohort including 602 trios},
author = {Byrska-Bishop, Marta and Evani, Uday S. and Zhao, Xuefang and Basile, Anna O. and Abel, Haley J. and Regier, Allison A. and Corvelo, Andr\'e and Clarke, Wayne E. and Musunuri, Rajeeva and Nagulapalli, Kshithija and Fairley, Susan and Runnels, Alexi and Winterkorn, Lara and Lowy, Ernesto and Eichler, Evan E. and Korbel, Jan O. and Lee, Charles and Marschall, Tobias and Devine, Scott E. and Harvey, William T. and Zhou, Weichen and Mills, Ryan E. and Rausch, Tobias and Kumar, Sushant and Alkan, Can and Hormozdiari, Fereydoun and Chong, Zechen and Chen, Yu and Yang, Xiaofei and Lin, Jiadong and Gerstein, Mark B. and Kai, Ye and Zhu, Qihui and Yilmaz, Feyza and Xiao, Chunlin and {Paul Flicek} and Germer, Soren and Brand, Harrison and Hall, Ira M. and Talkowski, Michael E. and Narzisi, Giuseppe and Zody, Michael C.},
date = {2022-09-01},
journaltitle = {Cell},
shortjournal = {Cell},
volume = {185},
number = {18},
pages = {3426-3440.e19},
issn = {0092-8674},
doi = {10.1016/j.cell.2022.08.004},
file = {/Users/zilong/Library/CloudStorage/Dropbox/Zll/Datahub/Literatures/Imputation/Byrska-BishopM_2022/Byrska-Bishop_2022_High-coverage whole-genome sequencing of the expanded 1000 Genomes Project.pdf}
}

@article{davies2016,
ids = {Davies2016a},
title = {Rapid genotype imputation from sequence without reference panels},
author = {Davies, Robert W. and Flint, Jonathan and Myers, Simon and Mott, Richard},
date = {2016},
journaltitle = {Nature Genetics},
volume = {48},
number = {8},
eprint = {27376236},
eprinttype = {pmid},
pages = {965--969},
issn = {15461718},
doi = {10.1038/ng.3594},
isbn = {1061-4036},
keywords = {Imputation},
file = {/Users/zilong/Library/CloudStorage/Dropbox/Zll/Datahub/Literatures/Imputation/DaviesR_2016/Davies_2016_Rapid genotype imputation from sequence without reference panels.pdf;/Users/zilong/Library/CloudStorage/Dropbox/Zll/Datahub/Literatures/Imputation/DaviesR_2016/Davies_2016_Rapid genotype imputation from sequence without reference panels2.pdf}
}

@article{davies2021,
title = {Rapid genotype imputation from sequence with reference panels},
author = {Davies, Robert W. and Kucka, Marek and Su, Dingwen and Shi, Sinan and Flanagan, Maeve and Cunniff, Christopher M. and Chan, Yingguang Frank and Myers, Simon},
date = {2021-07},
journaltitle = {Nature Genetics},
shortjournal = {Nat Genet},
volume = {53},
number = {7},
pages = {1104--1111},
publisher = {Nature Publishing Group},
issn = {1546-1718},
doi = {10.1038/s41588-021-00877-0},
issue = {7},
langid = {english},
file = {/Users/zilong/Library/CloudStorage/Dropbox/Zll/Datahub/Literatures/Imputation/DaviesR_2021/Davies_2021_Rapid genotype imputation from sequence with reference panels.pdf;/Users/zilong/Library/CloudStorage/Dropbox/Zll/Datahub/Literatures/Imputation/DaviesR_2021/Davies_2021_Rapid genotype imputation from sequence with reference panels2.pdf}
}

8 changes: 4 additions & 4 deletions scripts/bench.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,6 @@ if [ ! -f $vcffile ];then
fi


# first compile a C++ script
x86_64-conda-linux-gnu-c++ test-vcfpp.cpp -o test-vcfpp -std=c++11 -O3 -Wall -I/home/rlk420/mambaforge/envs/R/include -lhts


# to be fair, make sure cyvcf2 and vcfpp are compiled against same version of htslib.
# do the benchmarking in same conda env
Expand All @@ -24,8 +21,11 @@ $gtime -vvv Rscript test-fread.R $vcffile &> test-fread.llog.1 &
$gtime -vvv Rscript test-vcfR.R $vcffile 2 &> test-vcfR.llog.2 &
$gtime -vvv Rscript test-vcfppR.R $vcffile 1 &> test-vcfppR.llog.1 &
$gtime -vvv Rscript test-vcfppR.R $vcffile 2 &> test-vcfppR.llog.2 &
$gtime -vvv Rscript test-vcfppR.R $vcffile 3 &> test-vcfppR.llog.3 &
$gtime -vvv python test-cyvcf2.py $vcffile &> test-cyvcf2.llog &
$gtime -vvv ./test-vcfpp -i $vcffile &> test-vcfpp.llog &
# compare the above scripts against a compiled C++ program
# x86_64-conda-linux-gnu-c++ test-vcfpp.cpp -o test-vcfpp -std=c++11 -O3 -Wall -I/home/rlk420/mambaforge/envs/R/include -lhts
# $gtime -vvv ./test-vcfpp -i $vcffile &> test-vcfpp.llog &
wait

echo "jobs done. god day"
Expand Down
17 changes: 15 additions & 2 deletions scripts/test-vcfppR.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,28 @@ vcffile <- args[1]
run <- as.integer(args[2])

if(run == 1) {
print(paste("run", run))
print(paste("run", run, "-- wrapper of C++ code"))
print(system.time(res1 <- heterozygosity(vcffile)))
q(save="no")
}

if(run == 2) {
print(paste("run", run))
print(paste("run", run, "-- with vcftable function in vcfppR"))
print(system.time(vcf <- vcftable(vcffile, vartype = "snps")))
print(system.time(res2 <- colSums(vcf[["gt"]]==1, na.rm = TRUE)))
q(save="no")
}

if(run == 3) {
print(paste("run", run, "-- with vcfreader API in vcfppR"))
vcf <- vcfreader$new(vcffile)
res <- rep(0L, vcf$nsamples())
while(vcf$variant()) {
if(vcf$isSNP()) {
gt <- vcf$genotypes(TRUE) == 1
gt[is.na(gt)] <- FALSE
res <- res + gt
}
}
q(save="no")
}
20 changes: 13 additions & 7 deletions vcfpp.h
Original file line number Diff line number Diff line change
Expand Up @@ -818,7 +818,7 @@ type as noted in the other overloading function.
* */
void setPhasing(const std::vector<char> & v)
{
assert(v.size() == nsamples);
assert((int)v.size() == nsamples);
gtPhase = v;
}

Expand Down Expand Up @@ -1042,12 +1042,6 @@ type as noted in the other overloading function.
return line->pos + 1;
}

/** @brief modify position given 1-based value */
inline void setAlleleStr(const char * alleles_string)
{
bcf_update_alleles_str(header.hdr, line, alleles_string);
}

/** @brief modify CHROM value */
inline void setCHR(const char * chr)
{
Expand All @@ -1060,6 +1054,18 @@ type as noted in the other overloading function.
line->pos = p - 1;
}

/** @brief update ID */
inline void setID(const char * s)
{
bcf_update_id(header.hdr, line, s);
}

/** @brief set REF and ALT alleles given a string seperated by comma */
inline void setRefAlt(const char * alleles_string)
{
bcf_update_alleles_str(header.hdr, line, alleles_string);
}

/** @brief return 0-base start of the variant (can be any type) */
inline int64_t Start() const
{
Expand Down

0 comments on commit c9f75f2

Please sign in to comment.