-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
9 changed files
with
379 additions
and
126 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -21,6 +21,7 @@ test.cpp | |
*.bin | ||
compile_commands.json | ||
*.o | ||
*log | ||
*llog* | ||
*gz* | ||
*dSYM | ||
*.so |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
#include <pybind11/pybind11.h> | ||
#include <pybind11/stl.h> | ||
#include "../vcfpp.h" | ||
|
||
namespace py = pybind11; | ||
|
||
using namespace vcfpp; | ||
using namespace std; | ||
|
||
vector<int> heterozygosity(std::string vcffile, std::string region = "", std::string samples = "") | ||
{ | ||
BcfReader vcf(vcffile, region, samples); | ||
BcfRecord var(vcf.header); // construct a variant record | ||
vector<int> gt; // genotype can be bool, char or int type | ||
vector<int> hetsum(vcf.nsamples, 0); | ||
while(vcf.getNextVariant(var)) | ||
{ | ||
var.getGenotypes(gt); | ||
// analyze SNP variant with no genotype missingness | ||
if(!var.isSNP()) continue; // analyze SNPs only | ||
assert(var.ploidy() == 2); // make sure it is diploidy | ||
for(size_t i = 0; i < gt.size() / 2; i++) hetsum[i] += abs(gt[2 * i + 0] - gt[2 * i + 1]) == 1; | ||
} | ||
return hetsum; | ||
} | ||
|
||
PYBIND11_MODULE(py_example, m) | ||
{ | ||
m.doc() = "pybind11 example plugin"; // optional module docstring | ||
|
||
m.def("heterozygosity", &heterozygosity, "A function that calculates the heterozygosity for each sample in the VCF"); | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
#+title: How to use vcfpp in Python | ||
#+author: Zilong Li | ||
#+language: en | ||
|
||
* Install pybind11 and htslib in conda | ||
|
||
#+begin_src shell | ||
conda install -c conda-forge pybind11 | ||
conda install -c bioconda htslib | ||
#+end_src | ||
|
||
* Compiling the py_example.cpp | ||
|
||
#+begin_src shell | ||
g++ -O3 -Wall -shared -std=c++11 -fPIC $(python3 -m pybind11 --includes) example.cpp -o example$(python3-config --extension-suffix) -I${CONDA_PREFIX}/include -L${CONDA_PREFIX}/lib -lhts | ||
#+end_src | ||
|
||
* Call the C++ function in ~Python~ | ||
|
||
#+begin_src python | ||
import py_example | ||
ret = py_example.heterzygosity(vcffile, region, samples) | ||
#+end_src |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
library(vcfppR) | ||
|
||
## Listing 4 | ||
vcffile <- "https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20201028_3202_raw_GT_with_annot/20201028_CCDG_14151_B01_GRM_WGS_2020-08-05_chr21.recalibrated_variants.vcf.gz" | ||
vcf <- vcftable(vcffile, region="chr21:1-10000000", samples="NA12878,HG00118,HG00119", format="DP", vartype="snps", pass = TRUE, info = FALSE) | ||
boxplot(vcf$dp, names=vcf$samples, ylab="Read Depth (DP)") |
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,158 @@ | ||
@article{danecek2011, | ||
title = {The variant call format and VCFtools}, | ||
author = {Danecek, Petr and Auton, Adam and Abecasis, Goncalo and Albers, Cornelis A. and Banks, Eric and DePristo, Mark A. and Handsaker, Robert E. and Lunter, Gerton and Marth, Gabor T. and Sherry, Stephen T. and McVean, Gilean and Durbin, Richard and {1000 Genomes Project Analysis Group}}, | ||
date = {2011-08-01}, | ||
journaltitle = {Bioinformatics (Oxford, England)}, | ||
shortjournal = {Bioinformatics}, | ||
volume = {27}, | ||
number = {15}, | ||
eprint = {21653522}, | ||
eprinttype = {pmid}, | ||
pages = {2156--2158}, | ||
issn = {1367-4811}, | ||
doi = {10.1093/bioinformatics/btr330}, | ||
langid = {english}, | ||
pmcid = {PMC3137218}, | ||
file = {/Users/zilong/Library/CloudStorage/Dropbox/Zll/Datahub/Literatures/Bioinformatics/DanecekP_2011/Danecek_2011_The variant call format and VCFtools.pdf} | ||
} | ||
|
||
@article{brian2017, | ||
title = {vcfr: a package to manipulate and visualize variant call format data in R}, | ||
shorttitle = {vcfr}, | ||
author = {Brian, J. Knaus and Niklaus, J. Gr\"unwald}, | ||
date = {2017}, | ||
journaltitle = {Molecular Ecology Resources}, | ||
volume = {17}, | ||
number = {1}, | ||
pages = {44--53}, | ||
issn = {1755-0998}, | ||
doi = {10.1111/1755-0998.12549}, | ||
langid = {english}, | ||
file = {/Users/zilong/Library/CloudStorage/Dropbox/Zll/Datahub/Literatures/Bioinformatics/KnausB_2017/Knaus_2017_vcfr.pdf} | ||
} | ||
|
||
@article{pedersen2017a, | ||
title = {cyvcf2: fast, flexible variant analysis with Python}, | ||
shorttitle = {cyvcf2}, | ||
author = {Pedersen, Brent S and Quinlan, Aaron R}, | ||
date = {2017-06-15}, | ||
journaltitle = {Bioinformatics}, | ||
shortjournal = {Bioinformatics}, | ||
volume = {33}, | ||
number = {12}, | ||
pages = {1867--1869}, | ||
issn = {1367-4803}, | ||
doi = {10.1093/bioinformatics/btx057}, | ||
file = {/Users/zilong/Library/CloudStorage/Dropbox/Zll/Datahub/Literatures/Bioinformatics/PedersenB_2017/Pedersen_2017_cyvcf2.pdf} | ||
} | ||
|
||
@article{pedersen2018, | ||
ids = {pedersen2018a}, | ||
title = {hts-nim: scripting high-performance genomic analyses}, | ||
shorttitle = {hts-nim}, | ||
author = {Pedersen, Brent S and Quinlan, Aaron R}, | ||
date = {2018-10-01}, | ||
journaltitle = {Bioinformatics}, | ||
shortjournal = {Bioinformatics}, | ||
volume = {34}, | ||
number = {19}, | ||
pages = {3387--3389}, | ||
issn = {1367-4803}, | ||
doi = {10.1093/bioinformatics/bty358}, | ||
file = {/Users/zilong/Library/CloudStorage/Dropbox/Zll/Datahub/Literatures/Bioinformatics/PedersenB_2018/Pedersen_2018_hts-nim.pdf;/Users/zilong/Library/CloudStorage/Dropbox/Zll/Datahub/Literatures/Bioinformatics/PedersenB_2018/Pedersen_2018_hts-nim2.pdf} | ||
} | ||
|
||
@article{garrison2022, | ||
title = {A spectrum of free software tools for processing the VCF variant call format: vcflib, bio-vcf, cyvcf2, hts-nim and slivar}, | ||
shorttitle = {A spectrum of free software tools for processing the VCF variant call format}, | ||
author = {Garrison, Erik and Kronenberg, Zev N. and Dawson, Eric T. and Pedersen, Brent S. and Prins, Pjotr}, | ||
date = {2022-05-31}, | ||
journaltitle = {PLOS Computational Biology}, | ||
shortjournal = {PLOS Computational Biology}, | ||
volume = {18}, | ||
number = {5}, | ||
pages = {e1009123}, | ||
publisher = {Public Library of Science}, | ||
issn = {1553-7358}, | ||
doi = {10.1371/journal.pcbi.1009123}, | ||
langid = {english}, | ||
file = {/Users/zilong/Library/CloudStorage/Dropbox/Zll/Datahub/Literatures/Bioinformatics/GarrisonE_2022/Garrison_2022_A spectrum of free software tools for processing the VCF variant call format.pdf} | ||
} | ||
|
||
@article{wang2023, | ||
title = {Syllable-PBWT for space-efficient haplotype long-match query}, | ||
author = {Wang, Victor and Naseri, Ardalan and Zhang, Shaojie and Zhi, Degui}, | ||
date = {2023-01-01}, | ||
journaltitle = {Bioinformatics}, | ||
shortjournal = {Bioinformatics}, | ||
volume = {39}, | ||
number = {1}, | ||
pages = {btac734}, | ||
issn = {1367-4811}, | ||
doi = {10.1093/bioinformatics/btac734}, | ||
file = {/Users/zilong/Library/CloudStorage/Dropbox/Zll/Datahub/Literatures/Bioinformatics/WangV_2023/Wang_2023_Syllable-PBWT for space-efficient haplotype long-match query.pdf} | ||
} | ||
|
||
@article{eddelbuettel2011, | ||
title = {Rcpp: Seamless R and C++ Integration}, | ||
shorttitle = {Rcpp}, | ||
author = {Eddelbuettel, Dirk and Francois, Romain}, | ||
date = {2011-04-13}, | ||
journaltitle = {Journal of Statistical Software}, | ||
volume = {40}, | ||
pages = {1--18}, | ||
issn = {1548-7660}, | ||
doi = {10.18637/jss.v040.i08}, | ||
langid = {english}, | ||
file = {/Users/zilong/Library/CloudStorage/Dropbox/Zll/Datahub/Literatures/Bioinformatics/EddelbuettelD_2011/Eddelbuettel_2011_Rcpp.pdf} | ||
} | ||
|
||
@article{byrska-bishop2022, | ||
title = {High-coverage whole-genome sequencing of the expanded 1000 Genomes Project cohort including 602 trios}, | ||
author = {Byrska-Bishop, Marta and Evani, Uday S. and Zhao, Xuefang and Basile, Anna O. and Abel, Haley J. and Regier, Allison A. and Corvelo, Andr\'e and Clarke, Wayne E. and Musunuri, Rajeeva and Nagulapalli, Kshithija and Fairley, Susan and Runnels, Alexi and Winterkorn, Lara and Lowy, Ernesto and Eichler, Evan E. and Korbel, Jan O. and Lee, Charles and Marschall, Tobias and Devine, Scott E. and Harvey, William T. and Zhou, Weichen and Mills, Ryan E. and Rausch, Tobias and Kumar, Sushant and Alkan, Can and Hormozdiari, Fereydoun and Chong, Zechen and Chen, Yu and Yang, Xiaofei and Lin, Jiadong and Gerstein, Mark B. and Kai, Ye and Zhu, Qihui and Yilmaz, Feyza and Xiao, Chunlin and {Paul Flicek} and Germer, Soren and Brand, Harrison and Hall, Ira M. and Talkowski, Michael E. and Narzisi, Giuseppe and Zody, Michael C.}, | ||
date = {2022-09-01}, | ||
journaltitle = {Cell}, | ||
shortjournal = {Cell}, | ||
volume = {185}, | ||
number = {18}, | ||
pages = {3426-3440.e19}, | ||
issn = {0092-8674}, | ||
doi = {10.1016/j.cell.2022.08.004}, | ||
file = {/Users/zilong/Library/CloudStorage/Dropbox/Zll/Datahub/Literatures/Imputation/Byrska-BishopM_2022/Byrska-Bishop_2022_High-coverage whole-genome sequencing of the expanded 1000 Genomes Project.pdf} | ||
} | ||
|
||
@article{davies2016, | ||
ids = {Davies2016a}, | ||
title = {Rapid genotype imputation from sequence without reference panels}, | ||
author = {Davies, Robert W. and Flint, Jonathan and Myers, Simon and Mott, Richard}, | ||
date = {2016}, | ||
journaltitle = {Nature Genetics}, | ||
volume = {48}, | ||
number = {8}, | ||
eprint = {27376236}, | ||
eprinttype = {pmid}, | ||
pages = {965--969}, | ||
issn = {15461718}, | ||
doi = {10.1038/ng.3594}, | ||
isbn = {1061-4036}, | ||
keywords = {Imputation}, | ||
file = {/Users/zilong/Library/CloudStorage/Dropbox/Zll/Datahub/Literatures/Imputation/DaviesR_2016/Davies_2016_Rapid genotype imputation from sequence without reference panels.pdf;/Users/zilong/Library/CloudStorage/Dropbox/Zll/Datahub/Literatures/Imputation/DaviesR_2016/Davies_2016_Rapid genotype imputation from sequence without reference panels2.pdf} | ||
} | ||
|
||
@article{davies2021, | ||
title = {Rapid genotype imputation from sequence with reference panels}, | ||
author = {Davies, Robert W. and Kucka, Marek and Su, Dingwen and Shi, Sinan and Flanagan, Maeve and Cunniff, Christopher M. and Chan, Yingguang Frank and Myers, Simon}, | ||
date = {2021-07}, | ||
journaltitle = {Nature Genetics}, | ||
shortjournal = {Nat Genet}, | ||
volume = {53}, | ||
number = {7}, | ||
pages = {1104--1111}, | ||
publisher = {Nature Publishing Group}, | ||
issn = {1546-1718}, | ||
doi = {10.1038/s41588-021-00877-0}, | ||
issue = {7}, | ||
langid = {english}, | ||
file = {/Users/zilong/Library/CloudStorage/Dropbox/Zll/Datahub/Literatures/Imputation/DaviesR_2021/Davies_2021_Rapid genotype imputation from sequence with reference panels.pdf;/Users/zilong/Library/CloudStorage/Dropbox/Zll/Datahub/Literatures/Imputation/DaviesR_2021/Davies_2021_Rapid genotype imputation from sequence with reference panels2.pdf} | ||
} | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters